Merge tag 'drm-next-2020-12-18' of git://anongit.freedesktop.org/drm/drm
[linux-2.6-microblaze.git] / fs / btrfs / block-group.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "misc.h"
4 #include "ctree.h"
5 #include "block-group.h"
6 #include "space-info.h"
7 #include "disk-io.h"
8 #include "free-space-cache.h"
9 #include "free-space-tree.h"
10 #include "volumes.h"
11 #include "transaction.h"
12 #include "ref-verify.h"
13 #include "sysfs.h"
14 #include "tree-log.h"
15 #include "delalloc-space.h"
16 #include "discard.h"
17 #include "raid56.h"
18
19 /*
20  * Return target flags in extended format or 0 if restripe for this chunk_type
21  * is not in progress
22  *
23  * Should be called with balance_lock held
24  */
25 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
26 {
27         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
28         u64 target = 0;
29
30         if (!bctl)
31                 return 0;
32
33         if (flags & BTRFS_BLOCK_GROUP_DATA &&
34             bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
35                 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
36         } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
37                    bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
38                 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
39         } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
40                    bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
41                 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
42         }
43
44         return target;
45 }
46
47 /*
48  * @flags: available profiles in extended format (see ctree.h)
49  *
50  * Return reduced profile in chunk format.  If profile changing is in progress
51  * (either running or paused) picks the target profile (if it's already
52  * available), otherwise falls back to plain reducing.
53  */
54 static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
55 {
56         u64 num_devices = fs_info->fs_devices->rw_devices;
57         u64 target;
58         u64 raid_type;
59         u64 allowed = 0;
60
61         /*
62          * See if restripe for this chunk_type is in progress, if so try to
63          * reduce to the target profile
64          */
65         spin_lock(&fs_info->balance_lock);
66         target = get_restripe_target(fs_info, flags);
67         if (target) {
68                 spin_unlock(&fs_info->balance_lock);
69                 return extended_to_chunk(target);
70         }
71         spin_unlock(&fs_info->balance_lock);
72
73         /* First, mask out the RAID levels which aren't possible */
74         for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
75                 if (num_devices >= btrfs_raid_array[raid_type].devs_min)
76                         allowed |= btrfs_raid_array[raid_type].bg_flag;
77         }
78         allowed &= flags;
79
80         if (allowed & BTRFS_BLOCK_GROUP_RAID6)
81                 allowed = BTRFS_BLOCK_GROUP_RAID6;
82         else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
83                 allowed = BTRFS_BLOCK_GROUP_RAID5;
84         else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
85                 allowed = BTRFS_BLOCK_GROUP_RAID10;
86         else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
87                 allowed = BTRFS_BLOCK_GROUP_RAID1;
88         else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
89                 allowed = BTRFS_BLOCK_GROUP_RAID0;
90
91         flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
92
93         return extended_to_chunk(flags | allowed);
94 }
95
96 u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
97 {
98         unsigned seq;
99         u64 flags;
100
101         do {
102                 flags = orig_flags;
103                 seq = read_seqbegin(&fs_info->profiles_lock);
104
105                 if (flags & BTRFS_BLOCK_GROUP_DATA)
106                         flags |= fs_info->avail_data_alloc_bits;
107                 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
108                         flags |= fs_info->avail_system_alloc_bits;
109                 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
110                         flags |= fs_info->avail_metadata_alloc_bits;
111         } while (read_seqretry(&fs_info->profiles_lock, seq));
112
113         return btrfs_reduce_alloc_profile(fs_info, flags);
114 }
115
116 void btrfs_get_block_group(struct btrfs_block_group *cache)
117 {
118         refcount_inc(&cache->refs);
119 }
120
121 void btrfs_put_block_group(struct btrfs_block_group *cache)
122 {
123         if (refcount_dec_and_test(&cache->refs)) {
124                 WARN_ON(cache->pinned > 0);
125                 WARN_ON(cache->reserved > 0);
126
127                 /*
128                  * A block_group shouldn't be on the discard_list anymore.
129                  * Remove the block_group from the discard_list to prevent us
130                  * from causing a panic due to NULL pointer dereference.
131                  */
132                 if (WARN_ON(!list_empty(&cache->discard_list)))
133                         btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
134                                                   cache);
135
136                 /*
137                  * If not empty, someone is still holding mutex of
138                  * full_stripe_lock, which can only be released by caller.
139                  * And it will definitely cause use-after-free when caller
140                  * tries to release full stripe lock.
141                  *
142                  * No better way to resolve, but only to warn.
143                  */
144                 WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
145                 kfree(cache->free_space_ctl);
146                 kfree(cache);
147         }
148 }
149
150 /*
151  * This adds the block group to the fs_info rb tree for the block group cache
152  */
153 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
154                                        struct btrfs_block_group *block_group)
155 {
156         struct rb_node **p;
157         struct rb_node *parent = NULL;
158         struct btrfs_block_group *cache;
159
160         ASSERT(block_group->length != 0);
161
162         spin_lock(&info->block_group_cache_lock);
163         p = &info->block_group_cache_tree.rb_node;
164
165         while (*p) {
166                 parent = *p;
167                 cache = rb_entry(parent, struct btrfs_block_group, cache_node);
168                 if (block_group->start < cache->start) {
169                         p = &(*p)->rb_left;
170                 } else if (block_group->start > cache->start) {
171                         p = &(*p)->rb_right;
172                 } else {
173                         spin_unlock(&info->block_group_cache_lock);
174                         return -EEXIST;
175                 }
176         }
177
178         rb_link_node(&block_group->cache_node, parent, p);
179         rb_insert_color(&block_group->cache_node,
180                         &info->block_group_cache_tree);
181
182         if (info->first_logical_byte > block_group->start)
183                 info->first_logical_byte = block_group->start;
184
185         spin_unlock(&info->block_group_cache_lock);
186
187         return 0;
188 }
189
190 /*
191  * This will return the block group at or after bytenr if contains is 0, else
192  * it will return the block group that contains the bytenr
193  */
194 static struct btrfs_block_group *block_group_cache_tree_search(
195                 struct btrfs_fs_info *info, u64 bytenr, int contains)
196 {
197         struct btrfs_block_group *cache, *ret = NULL;
198         struct rb_node *n;
199         u64 end, start;
200
201         spin_lock(&info->block_group_cache_lock);
202         n = info->block_group_cache_tree.rb_node;
203
204         while (n) {
205                 cache = rb_entry(n, struct btrfs_block_group, cache_node);
206                 end = cache->start + cache->length - 1;
207                 start = cache->start;
208
209                 if (bytenr < start) {
210                         if (!contains && (!ret || start < ret->start))
211                                 ret = cache;
212                         n = n->rb_left;
213                 } else if (bytenr > start) {
214                         if (contains && bytenr <= end) {
215                                 ret = cache;
216                                 break;
217                         }
218                         n = n->rb_right;
219                 } else {
220                         ret = cache;
221                         break;
222                 }
223         }
224         if (ret) {
225                 btrfs_get_block_group(ret);
226                 if (bytenr == 0 && info->first_logical_byte > ret->start)
227                         info->first_logical_byte = ret->start;
228         }
229         spin_unlock(&info->block_group_cache_lock);
230
231         return ret;
232 }
233
234 /*
235  * Return the block group that starts at or after bytenr
236  */
237 struct btrfs_block_group *btrfs_lookup_first_block_group(
238                 struct btrfs_fs_info *info, u64 bytenr)
239 {
240         return block_group_cache_tree_search(info, bytenr, 0);
241 }
242
243 /*
244  * Return the block group that contains the given bytenr
245  */
246 struct btrfs_block_group *btrfs_lookup_block_group(
247                 struct btrfs_fs_info *info, u64 bytenr)
248 {
249         return block_group_cache_tree_search(info, bytenr, 1);
250 }
251
252 struct btrfs_block_group *btrfs_next_block_group(
253                 struct btrfs_block_group *cache)
254 {
255         struct btrfs_fs_info *fs_info = cache->fs_info;
256         struct rb_node *node;
257
258         spin_lock(&fs_info->block_group_cache_lock);
259
260         /* If our block group was removed, we need a full search. */
261         if (RB_EMPTY_NODE(&cache->cache_node)) {
262                 const u64 next_bytenr = cache->start + cache->length;
263
264                 spin_unlock(&fs_info->block_group_cache_lock);
265                 btrfs_put_block_group(cache);
266                 cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache;
267         }
268         node = rb_next(&cache->cache_node);
269         btrfs_put_block_group(cache);
270         if (node) {
271                 cache = rb_entry(node, struct btrfs_block_group, cache_node);
272                 btrfs_get_block_group(cache);
273         } else
274                 cache = NULL;
275         spin_unlock(&fs_info->block_group_cache_lock);
276         return cache;
277 }
278
279 bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
280 {
281         struct btrfs_block_group *bg;
282         bool ret = true;
283
284         bg = btrfs_lookup_block_group(fs_info, bytenr);
285         if (!bg)
286                 return false;
287
288         spin_lock(&bg->lock);
289         if (bg->ro)
290                 ret = false;
291         else
292                 atomic_inc(&bg->nocow_writers);
293         spin_unlock(&bg->lock);
294
295         /* No put on block group, done by btrfs_dec_nocow_writers */
296         if (!ret)
297                 btrfs_put_block_group(bg);
298
299         return ret;
300 }
301
302 void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
303 {
304         struct btrfs_block_group *bg;
305
306         bg = btrfs_lookup_block_group(fs_info, bytenr);
307         ASSERT(bg);
308         if (atomic_dec_and_test(&bg->nocow_writers))
309                 wake_up_var(&bg->nocow_writers);
310         /*
311          * Once for our lookup and once for the lookup done by a previous call
312          * to btrfs_inc_nocow_writers()
313          */
314         btrfs_put_block_group(bg);
315         btrfs_put_block_group(bg);
316 }
317
318 void btrfs_wait_nocow_writers(struct btrfs_block_group *bg)
319 {
320         wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
321 }
322
323 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
324                                         const u64 start)
325 {
326         struct btrfs_block_group *bg;
327
328         bg = btrfs_lookup_block_group(fs_info, start);
329         ASSERT(bg);
330         if (atomic_dec_and_test(&bg->reservations))
331                 wake_up_var(&bg->reservations);
332         btrfs_put_block_group(bg);
333 }
334
335 void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg)
336 {
337         struct btrfs_space_info *space_info = bg->space_info;
338
339         ASSERT(bg->ro);
340
341         if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
342                 return;
343
344         /*
345          * Our block group is read only but before we set it to read only,
346          * some task might have had allocated an extent from it already, but it
347          * has not yet created a respective ordered extent (and added it to a
348          * root's list of ordered extents).
349          * Therefore wait for any task currently allocating extents, since the
350          * block group's reservations counter is incremented while a read lock
351          * on the groups' semaphore is held and decremented after releasing
352          * the read access on that semaphore and creating the ordered extent.
353          */
354         down_write(&space_info->groups_sem);
355         up_write(&space_info->groups_sem);
356
357         wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
358 }
359
360 struct btrfs_caching_control *btrfs_get_caching_control(
361                 struct btrfs_block_group *cache)
362 {
363         struct btrfs_caching_control *ctl;
364
365         spin_lock(&cache->lock);
366         if (!cache->caching_ctl) {
367                 spin_unlock(&cache->lock);
368                 return NULL;
369         }
370
371         ctl = cache->caching_ctl;
372         refcount_inc(&ctl->count);
373         spin_unlock(&cache->lock);
374         return ctl;
375 }
376
377 void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
378 {
379         if (refcount_dec_and_test(&ctl->count))
380                 kfree(ctl);
381 }
382
383 /*
384  * When we wait for progress in the block group caching, its because our
385  * allocation attempt failed at least once.  So, we must sleep and let some
386  * progress happen before we try again.
387  *
388  * This function will sleep at least once waiting for new free space to show
389  * up, and then it will check the block group free space numbers for our min
390  * num_bytes.  Another option is to have it go ahead and look in the rbtree for
391  * a free extent of a given size, but this is a good start.
392  *
393  * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
394  * any of the information in this block group.
395  */
396 void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
397                                            u64 num_bytes)
398 {
399         struct btrfs_caching_control *caching_ctl;
400
401         caching_ctl = btrfs_get_caching_control(cache);
402         if (!caching_ctl)
403                 return;
404
405         wait_event(caching_ctl->wait, btrfs_block_group_done(cache) ||
406                    (cache->free_space_ctl->free_space >= num_bytes));
407
408         btrfs_put_caching_control(caching_ctl);
409 }
410
411 int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
412 {
413         struct btrfs_caching_control *caching_ctl;
414         int ret = 0;
415
416         caching_ctl = btrfs_get_caching_control(cache);
417         if (!caching_ctl)
418                 return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
419
420         wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
421         if (cache->cached == BTRFS_CACHE_ERROR)
422                 ret = -EIO;
423         btrfs_put_caching_control(caching_ctl);
424         return ret;
425 }
426
427 static bool space_cache_v1_done(struct btrfs_block_group *cache)
428 {
429         bool ret;
430
431         spin_lock(&cache->lock);
432         ret = cache->cached != BTRFS_CACHE_FAST;
433         spin_unlock(&cache->lock);
434
435         return ret;
436 }
437
438 void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache,
439                                 struct btrfs_caching_control *caching_ctl)
440 {
441         wait_event(caching_ctl->wait, space_cache_v1_done(cache));
442 }
443
444 #ifdef CONFIG_BTRFS_DEBUG
445 static void fragment_free_space(struct btrfs_block_group *block_group)
446 {
447         struct btrfs_fs_info *fs_info = block_group->fs_info;
448         u64 start = block_group->start;
449         u64 len = block_group->length;
450         u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
451                 fs_info->nodesize : fs_info->sectorsize;
452         u64 step = chunk << 1;
453
454         while (len > chunk) {
455                 btrfs_remove_free_space(block_group, start, chunk);
456                 start += step;
457                 if (len < step)
458                         len = 0;
459                 else
460                         len -= step;
461         }
462 }
463 #endif
464
465 /*
466  * This is only called by btrfs_cache_block_group, since we could have freed
467  * extents we need to check the pinned_extents for any extents that can't be
468  * used yet since their free space will be released as soon as the transaction
469  * commits.
470  */
471 u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end)
472 {
473         struct btrfs_fs_info *info = block_group->fs_info;
474         u64 extent_start, extent_end, size, total_added = 0;
475         int ret;
476
477         while (start < end) {
478                 ret = find_first_extent_bit(&info->excluded_extents, start,
479                                             &extent_start, &extent_end,
480                                             EXTENT_DIRTY | EXTENT_UPTODATE,
481                                             NULL);
482                 if (ret)
483                         break;
484
485                 if (extent_start <= start) {
486                         start = extent_end + 1;
487                 } else if (extent_start > start && extent_start < end) {
488                         size = extent_start - start;
489                         total_added += size;
490                         ret = btrfs_add_free_space_async_trimmed(block_group,
491                                                                  start, size);
492                         BUG_ON(ret); /* -ENOMEM or logic error */
493                         start = extent_end + 1;
494                 } else {
495                         break;
496                 }
497         }
498
499         if (start < end) {
500                 size = end - start;
501                 total_added += size;
502                 ret = btrfs_add_free_space_async_trimmed(block_group, start,
503                                                          size);
504                 BUG_ON(ret); /* -ENOMEM or logic error */
505         }
506
507         return total_added;
508 }
509
510 static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
511 {
512         struct btrfs_block_group *block_group = caching_ctl->block_group;
513         struct btrfs_fs_info *fs_info = block_group->fs_info;
514         struct btrfs_root *extent_root = fs_info->extent_root;
515         struct btrfs_path *path;
516         struct extent_buffer *leaf;
517         struct btrfs_key key;
518         u64 total_found = 0;
519         u64 last = 0;
520         u32 nritems;
521         int ret;
522         bool wakeup = true;
523
524         path = btrfs_alloc_path();
525         if (!path)
526                 return -ENOMEM;
527
528         last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET);
529
530 #ifdef CONFIG_BTRFS_DEBUG
531         /*
532          * If we're fragmenting we don't want to make anybody think we can
533          * allocate from this block group until we've had a chance to fragment
534          * the free space.
535          */
536         if (btrfs_should_fragment_free_space(block_group))
537                 wakeup = false;
538 #endif
539         /*
540          * We don't want to deadlock with somebody trying to allocate a new
541          * extent for the extent root while also trying to search the extent
542          * root to add free space.  So we skip locking and search the commit
543          * root, since its read-only
544          */
545         path->skip_locking = 1;
546         path->search_commit_root = 1;
547         path->reada = READA_FORWARD;
548
549         key.objectid = last;
550         key.offset = 0;
551         key.type = BTRFS_EXTENT_ITEM_KEY;
552
553 next:
554         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
555         if (ret < 0)
556                 goto out;
557
558         leaf = path->nodes[0];
559         nritems = btrfs_header_nritems(leaf);
560
561         while (1) {
562                 if (btrfs_fs_closing(fs_info) > 1) {
563                         last = (u64)-1;
564                         break;
565                 }
566
567                 if (path->slots[0] < nritems) {
568                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
569                 } else {
570                         ret = btrfs_find_next_key(extent_root, path, &key, 0, 0);
571                         if (ret)
572                                 break;
573
574                         if (need_resched() ||
575                             rwsem_is_contended(&fs_info->commit_root_sem)) {
576                                 if (wakeup)
577                                         caching_ctl->progress = last;
578                                 btrfs_release_path(path);
579                                 up_read(&fs_info->commit_root_sem);
580                                 mutex_unlock(&caching_ctl->mutex);
581                                 cond_resched();
582                                 mutex_lock(&caching_ctl->mutex);
583                                 down_read(&fs_info->commit_root_sem);
584                                 goto next;
585                         }
586
587                         ret = btrfs_next_leaf(extent_root, path);
588                         if (ret < 0)
589                                 goto out;
590                         if (ret)
591                                 break;
592                         leaf = path->nodes[0];
593                         nritems = btrfs_header_nritems(leaf);
594                         continue;
595                 }
596
597                 if (key.objectid < last) {
598                         key.objectid = last;
599                         key.offset = 0;
600                         key.type = BTRFS_EXTENT_ITEM_KEY;
601
602                         if (wakeup)
603                                 caching_ctl->progress = last;
604                         btrfs_release_path(path);
605                         goto next;
606                 }
607
608                 if (key.objectid < block_group->start) {
609                         path->slots[0]++;
610                         continue;
611                 }
612
613                 if (key.objectid >= block_group->start + block_group->length)
614                         break;
615
616                 if (key.type == BTRFS_EXTENT_ITEM_KEY ||
617                     key.type == BTRFS_METADATA_ITEM_KEY) {
618                         total_found += add_new_free_space(block_group, last,
619                                                           key.objectid);
620                         if (key.type == BTRFS_METADATA_ITEM_KEY)
621                                 last = key.objectid +
622                                         fs_info->nodesize;
623                         else
624                                 last = key.objectid + key.offset;
625
626                         if (total_found > CACHING_CTL_WAKE_UP) {
627                                 total_found = 0;
628                                 if (wakeup)
629                                         wake_up(&caching_ctl->wait);
630                         }
631                 }
632                 path->slots[0]++;
633         }
634         ret = 0;
635
636         total_found += add_new_free_space(block_group, last,
637                                 block_group->start + block_group->length);
638         caching_ctl->progress = (u64)-1;
639
640 out:
641         btrfs_free_path(path);
642         return ret;
643 }
644
645 static noinline void caching_thread(struct btrfs_work *work)
646 {
647         struct btrfs_block_group *block_group;
648         struct btrfs_fs_info *fs_info;
649         struct btrfs_caching_control *caching_ctl;
650         int ret;
651
652         caching_ctl = container_of(work, struct btrfs_caching_control, work);
653         block_group = caching_ctl->block_group;
654         fs_info = block_group->fs_info;
655
656         mutex_lock(&caching_ctl->mutex);
657         down_read(&fs_info->commit_root_sem);
658
659         if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
660                 ret = load_free_space_cache(block_group);
661                 if (ret == 1) {
662                         ret = 0;
663                         goto done;
664                 }
665
666                 /*
667                  * We failed to load the space cache, set ourselves to
668                  * CACHE_STARTED and carry on.
669                  */
670                 spin_lock(&block_group->lock);
671                 block_group->cached = BTRFS_CACHE_STARTED;
672                 spin_unlock(&block_group->lock);
673                 wake_up(&caching_ctl->wait);
674         }
675
676         if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
677                 ret = load_free_space_tree(caching_ctl);
678         else
679                 ret = load_extent_tree_free(caching_ctl);
680 done:
681         spin_lock(&block_group->lock);
682         block_group->caching_ctl = NULL;
683         block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
684         spin_unlock(&block_group->lock);
685
686 #ifdef CONFIG_BTRFS_DEBUG
687         if (btrfs_should_fragment_free_space(block_group)) {
688                 u64 bytes_used;
689
690                 spin_lock(&block_group->space_info->lock);
691                 spin_lock(&block_group->lock);
692                 bytes_used = block_group->length - block_group->used;
693                 block_group->space_info->bytes_used += bytes_used >> 1;
694                 spin_unlock(&block_group->lock);
695                 spin_unlock(&block_group->space_info->lock);
696                 fragment_free_space(block_group);
697         }
698 #endif
699
700         caching_ctl->progress = (u64)-1;
701
702         up_read(&fs_info->commit_root_sem);
703         btrfs_free_excluded_extents(block_group);
704         mutex_unlock(&caching_ctl->mutex);
705
706         wake_up(&caching_ctl->wait);
707
708         btrfs_put_caching_control(caching_ctl);
709         btrfs_put_block_group(block_group);
710 }
711
712 int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only)
713 {
714         DEFINE_WAIT(wait);
715         struct btrfs_fs_info *fs_info = cache->fs_info;
716         struct btrfs_caching_control *caching_ctl = NULL;
717         int ret = 0;
718
719         caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
720         if (!caching_ctl)
721                 return -ENOMEM;
722
723         INIT_LIST_HEAD(&caching_ctl->list);
724         mutex_init(&caching_ctl->mutex);
725         init_waitqueue_head(&caching_ctl->wait);
726         caching_ctl->block_group = cache;
727         caching_ctl->progress = cache->start;
728         refcount_set(&caching_ctl->count, 2);
729         btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
730
731         spin_lock(&cache->lock);
732         if (cache->cached != BTRFS_CACHE_NO) {
733                 kfree(caching_ctl);
734
735                 caching_ctl = cache->caching_ctl;
736                 if (caching_ctl)
737                         refcount_inc(&caching_ctl->count);
738                 spin_unlock(&cache->lock);
739                 goto out;
740         }
741         WARN_ON(cache->caching_ctl);
742         cache->caching_ctl = caching_ctl;
743         if (btrfs_test_opt(fs_info, SPACE_CACHE))
744                 cache->cached = BTRFS_CACHE_FAST;
745         else
746                 cache->cached = BTRFS_CACHE_STARTED;
747         cache->has_caching_ctl = 1;
748         spin_unlock(&cache->lock);
749
750         spin_lock(&fs_info->block_group_cache_lock);
751         refcount_inc(&caching_ctl->count);
752         list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
753         spin_unlock(&fs_info->block_group_cache_lock);
754
755         btrfs_get_block_group(cache);
756
757         btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
758 out:
759         if (load_cache_only && caching_ctl)
760                 btrfs_wait_space_cache_v1_finished(cache, caching_ctl);
761         if (caching_ctl)
762                 btrfs_put_caching_control(caching_ctl);
763
764         return ret;
765 }
766
767 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
768 {
769         u64 extra_flags = chunk_to_extended(flags) &
770                                 BTRFS_EXTENDED_PROFILE_MASK;
771
772         write_seqlock(&fs_info->profiles_lock);
773         if (flags & BTRFS_BLOCK_GROUP_DATA)
774                 fs_info->avail_data_alloc_bits &= ~extra_flags;
775         if (flags & BTRFS_BLOCK_GROUP_METADATA)
776                 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
777         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
778                 fs_info->avail_system_alloc_bits &= ~extra_flags;
779         write_sequnlock(&fs_info->profiles_lock);
780 }
781
782 /*
783  * Clear incompat bits for the following feature(s):
784  *
785  * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
786  *            in the whole filesystem
787  *
788  * - RAID1C34 - same as above for RAID1C3 and RAID1C4 block groups
789  */
790 static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
791 {
792         bool found_raid56 = false;
793         bool found_raid1c34 = false;
794
795         if ((flags & BTRFS_BLOCK_GROUP_RAID56_MASK) ||
796             (flags & BTRFS_BLOCK_GROUP_RAID1C3) ||
797             (flags & BTRFS_BLOCK_GROUP_RAID1C4)) {
798                 struct list_head *head = &fs_info->space_info;
799                 struct btrfs_space_info *sinfo;
800
801                 list_for_each_entry_rcu(sinfo, head, list) {
802                         down_read(&sinfo->groups_sem);
803                         if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
804                                 found_raid56 = true;
805                         if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
806                                 found_raid56 = true;
807                         if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C3]))
808                                 found_raid1c34 = true;
809                         if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C4]))
810                                 found_raid1c34 = true;
811                         up_read(&sinfo->groups_sem);
812                 }
813                 if (!found_raid56)
814                         btrfs_clear_fs_incompat(fs_info, RAID56);
815                 if (!found_raid1c34)
816                         btrfs_clear_fs_incompat(fs_info, RAID1C34);
817         }
818 }
819
820 static int remove_block_group_item(struct btrfs_trans_handle *trans,
821                                    struct btrfs_path *path,
822                                    struct btrfs_block_group *block_group)
823 {
824         struct btrfs_fs_info *fs_info = trans->fs_info;
825         struct btrfs_root *root;
826         struct btrfs_key key;
827         int ret;
828
829         root = fs_info->extent_root;
830         key.objectid = block_group->start;
831         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
832         key.offset = block_group->length;
833
834         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
835         if (ret > 0)
836                 ret = -ENOENT;
837         if (ret < 0)
838                 return ret;
839
840         ret = btrfs_del_item(trans, root, path);
841         return ret;
842 }
843
844 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
845                              u64 group_start, struct extent_map *em)
846 {
847         struct btrfs_fs_info *fs_info = trans->fs_info;
848         struct btrfs_path *path;
849         struct btrfs_block_group *block_group;
850         struct btrfs_free_cluster *cluster;
851         struct inode *inode;
852         struct kobject *kobj = NULL;
853         int ret;
854         int index;
855         int factor;
856         struct btrfs_caching_control *caching_ctl = NULL;
857         bool remove_em;
858         bool remove_rsv = false;
859
860         block_group = btrfs_lookup_block_group(fs_info, group_start);
861         BUG_ON(!block_group);
862         BUG_ON(!block_group->ro);
863
864         trace_btrfs_remove_block_group(block_group);
865         /*
866          * Free the reserved super bytes from this block group before
867          * remove it.
868          */
869         btrfs_free_excluded_extents(block_group);
870         btrfs_free_ref_tree_range(fs_info, block_group->start,
871                                   block_group->length);
872
873         index = btrfs_bg_flags_to_raid_index(block_group->flags);
874         factor = btrfs_bg_type_to_factor(block_group->flags);
875
876         /* make sure this block group isn't part of an allocation cluster */
877         cluster = &fs_info->data_alloc_cluster;
878         spin_lock(&cluster->refill_lock);
879         btrfs_return_cluster_to_free_space(block_group, cluster);
880         spin_unlock(&cluster->refill_lock);
881
882         /*
883          * make sure this block group isn't part of a metadata
884          * allocation cluster
885          */
886         cluster = &fs_info->meta_alloc_cluster;
887         spin_lock(&cluster->refill_lock);
888         btrfs_return_cluster_to_free_space(block_group, cluster);
889         spin_unlock(&cluster->refill_lock);
890
891         path = btrfs_alloc_path();
892         if (!path) {
893                 ret = -ENOMEM;
894                 goto out;
895         }
896
897         /*
898          * get the inode first so any iput calls done for the io_list
899          * aren't the final iput (no unlinks allowed now)
900          */
901         inode = lookup_free_space_inode(block_group, path);
902
903         mutex_lock(&trans->transaction->cache_write_mutex);
904         /*
905          * Make sure our free space cache IO is done before removing the
906          * free space inode
907          */
908         spin_lock(&trans->transaction->dirty_bgs_lock);
909         if (!list_empty(&block_group->io_list)) {
910                 list_del_init(&block_group->io_list);
911
912                 WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
913
914                 spin_unlock(&trans->transaction->dirty_bgs_lock);
915                 btrfs_wait_cache_io(trans, block_group, path);
916                 btrfs_put_block_group(block_group);
917                 spin_lock(&trans->transaction->dirty_bgs_lock);
918         }
919
920         if (!list_empty(&block_group->dirty_list)) {
921                 list_del_init(&block_group->dirty_list);
922                 remove_rsv = true;
923                 btrfs_put_block_group(block_group);
924         }
925         spin_unlock(&trans->transaction->dirty_bgs_lock);
926         mutex_unlock(&trans->transaction->cache_write_mutex);
927
928         ret = btrfs_remove_free_space_inode(trans, inode, block_group);
929         if (ret)
930                 goto out;
931
932         spin_lock(&fs_info->block_group_cache_lock);
933         rb_erase(&block_group->cache_node,
934                  &fs_info->block_group_cache_tree);
935         RB_CLEAR_NODE(&block_group->cache_node);
936
937         /* Once for the block groups rbtree */
938         btrfs_put_block_group(block_group);
939
940         if (fs_info->first_logical_byte == block_group->start)
941                 fs_info->first_logical_byte = (u64)-1;
942         spin_unlock(&fs_info->block_group_cache_lock);
943
944         down_write(&block_group->space_info->groups_sem);
945         /*
946          * we must use list_del_init so people can check to see if they
947          * are still on the list after taking the semaphore
948          */
949         list_del_init(&block_group->list);
950         if (list_empty(&block_group->space_info->block_groups[index])) {
951                 kobj = block_group->space_info->block_group_kobjs[index];
952                 block_group->space_info->block_group_kobjs[index] = NULL;
953                 clear_avail_alloc_bits(fs_info, block_group->flags);
954         }
955         up_write(&block_group->space_info->groups_sem);
956         clear_incompat_bg_bits(fs_info, block_group->flags);
957         if (kobj) {
958                 kobject_del(kobj);
959                 kobject_put(kobj);
960         }
961
962         if (block_group->has_caching_ctl)
963                 caching_ctl = btrfs_get_caching_control(block_group);
964         if (block_group->cached == BTRFS_CACHE_STARTED)
965                 btrfs_wait_block_group_cache_done(block_group);
966         if (block_group->has_caching_ctl) {
967                 spin_lock(&fs_info->block_group_cache_lock);
968                 if (!caching_ctl) {
969                         struct btrfs_caching_control *ctl;
970
971                         list_for_each_entry(ctl,
972                                     &fs_info->caching_block_groups, list)
973                                 if (ctl->block_group == block_group) {
974                                         caching_ctl = ctl;
975                                         refcount_inc(&caching_ctl->count);
976                                         break;
977                                 }
978                 }
979                 if (caching_ctl)
980                         list_del_init(&caching_ctl->list);
981                 spin_unlock(&fs_info->block_group_cache_lock);
982                 if (caching_ctl) {
983                         /* Once for the caching bgs list and once for us. */
984                         btrfs_put_caching_control(caching_ctl);
985                         btrfs_put_caching_control(caching_ctl);
986                 }
987         }
988
989         spin_lock(&trans->transaction->dirty_bgs_lock);
990         WARN_ON(!list_empty(&block_group->dirty_list));
991         WARN_ON(!list_empty(&block_group->io_list));
992         spin_unlock(&trans->transaction->dirty_bgs_lock);
993
994         btrfs_remove_free_space_cache(block_group);
995
996         spin_lock(&block_group->space_info->lock);
997         list_del_init(&block_group->ro_list);
998
999         if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
1000                 WARN_ON(block_group->space_info->total_bytes
1001                         < block_group->length);
1002                 WARN_ON(block_group->space_info->bytes_readonly
1003                         < block_group->length);
1004                 WARN_ON(block_group->space_info->disk_total
1005                         < block_group->length * factor);
1006         }
1007         block_group->space_info->total_bytes -= block_group->length;
1008         block_group->space_info->bytes_readonly -= block_group->length;
1009         block_group->space_info->disk_total -= block_group->length * factor;
1010
1011         spin_unlock(&block_group->space_info->lock);
1012
1013         /*
1014          * Remove the free space for the block group from the free space tree
1015          * and the block group's item from the extent tree before marking the
1016          * block group as removed. This is to prevent races with tasks that
1017          * freeze and unfreeze a block group, this task and another task
1018          * allocating a new block group - the unfreeze task ends up removing
1019          * the block group's extent map before the task calling this function
1020          * deletes the block group item from the extent tree, allowing for
1021          * another task to attempt to create another block group with the same
1022          * item key (and failing with -EEXIST and a transaction abort).
1023          */
1024         ret = remove_block_group_free_space(trans, block_group);
1025         if (ret)
1026                 goto out;
1027
1028         ret = remove_block_group_item(trans, path, block_group);
1029         if (ret < 0)
1030                 goto out;
1031
1032         spin_lock(&block_group->lock);
1033         block_group->removed = 1;
1034         /*
1035          * At this point trimming or scrub can't start on this block group,
1036          * because we removed the block group from the rbtree
1037          * fs_info->block_group_cache_tree so no one can't find it anymore and
1038          * even if someone already got this block group before we removed it
1039          * from the rbtree, they have already incremented block_group->frozen -
1040          * if they didn't, for the trimming case they won't find any free space
1041          * entries because we already removed them all when we called
1042          * btrfs_remove_free_space_cache().
1043          *
1044          * And we must not remove the extent map from the fs_info->mapping_tree
1045          * to prevent the same logical address range and physical device space
1046          * ranges from being reused for a new block group. This is needed to
1047          * avoid races with trimming and scrub.
1048          *
1049          * An fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
1050          * completely transactionless, so while it is trimming a range the
1051          * currently running transaction might finish and a new one start,
1052          * allowing for new block groups to be created that can reuse the same
1053          * physical device locations unless we take this special care.
1054          *
1055          * There may also be an implicit trim operation if the file system
1056          * is mounted with -odiscard. The same protections must remain
1057          * in place until the extents have been discarded completely when
1058          * the transaction commit has completed.
1059          */
1060         remove_em = (atomic_read(&block_group->frozen) == 0);
1061         spin_unlock(&block_group->lock);
1062
1063         if (remove_em) {
1064                 struct extent_map_tree *em_tree;
1065
1066                 em_tree = &fs_info->mapping_tree;
1067                 write_lock(&em_tree->lock);
1068                 remove_extent_mapping(em_tree, em);
1069                 write_unlock(&em_tree->lock);
1070                 /* once for the tree */
1071                 free_extent_map(em);
1072         }
1073
1074 out:
1075         /* Once for the lookup reference */
1076         btrfs_put_block_group(block_group);
1077         if (remove_rsv)
1078                 btrfs_delayed_refs_rsv_release(fs_info, 1);
1079         btrfs_free_path(path);
1080         return ret;
1081 }
1082
1083 struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
1084                 struct btrfs_fs_info *fs_info, const u64 chunk_offset)
1085 {
1086         struct extent_map_tree *em_tree = &fs_info->mapping_tree;
1087         struct extent_map *em;
1088         struct map_lookup *map;
1089         unsigned int num_items;
1090
1091         read_lock(&em_tree->lock);
1092         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
1093         read_unlock(&em_tree->lock);
1094         ASSERT(em && em->start == chunk_offset);
1095
1096         /*
1097          * We need to reserve 3 + N units from the metadata space info in order
1098          * to remove a block group (done at btrfs_remove_chunk() and at
1099          * btrfs_remove_block_group()), which are used for:
1100          *
1101          * 1 unit for adding the free space inode's orphan (located in the tree
1102          * of tree roots).
1103          * 1 unit for deleting the block group item (located in the extent
1104          * tree).
1105          * 1 unit for deleting the free space item (located in tree of tree
1106          * roots).
1107          * N units for deleting N device extent items corresponding to each
1108          * stripe (located in the device tree).
1109          *
1110          * In order to remove a block group we also need to reserve units in the
1111          * system space info in order to update the chunk tree (update one or
1112          * more device items and remove one chunk item), but this is done at
1113          * btrfs_remove_chunk() through a call to check_system_chunk().
1114          */
1115         map = em->map_lookup;
1116         num_items = 3 + map->num_stripes;
1117         free_extent_map(em);
1118
1119         return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
1120                                                            num_items);
1121 }
1122
1123 /*
1124  * Mark block group @cache read-only, so later write won't happen to block
1125  * group @cache.
1126  *
1127  * If @force is not set, this function will only mark the block group readonly
1128  * if we have enough free space (1M) in other metadata/system block groups.
1129  * If @force is not set, this function will mark the block group readonly
1130  * without checking free space.
1131  *
1132  * NOTE: This function doesn't care if other block groups can contain all the
1133  * data in this block group. That check should be done by relocation routine,
1134  * not this function.
1135  */
1136 static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
1137 {
1138         struct btrfs_space_info *sinfo = cache->space_info;
1139         u64 num_bytes;
1140         int ret = -ENOSPC;
1141
1142         spin_lock(&sinfo->lock);
1143         spin_lock(&cache->lock);
1144
1145         if (cache->ro) {
1146                 cache->ro++;
1147                 ret = 0;
1148                 goto out;
1149         }
1150
1151         num_bytes = cache->length - cache->reserved - cache->pinned -
1152                     cache->bytes_super - cache->used;
1153
1154         /*
1155          * Data never overcommits, even in mixed mode, so do just the straight
1156          * check of left over space in how much we have allocated.
1157          */
1158         if (force) {
1159                 ret = 0;
1160         } else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) {
1161                 u64 sinfo_used = btrfs_space_info_used(sinfo, true);
1162
1163                 /*
1164                  * Here we make sure if we mark this bg RO, we still have enough
1165                  * free space as buffer.
1166                  */
1167                 if (sinfo_used + num_bytes <= sinfo->total_bytes)
1168                         ret = 0;
1169         } else {
1170                 /*
1171                  * We overcommit metadata, so we need to do the
1172                  * btrfs_can_overcommit check here, and we need to pass in
1173                  * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of
1174                  * leeway to allow us to mark this block group as read only.
1175                  */
1176                 if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes,
1177                                          BTRFS_RESERVE_NO_FLUSH))
1178                         ret = 0;
1179         }
1180
1181         if (!ret) {
1182                 sinfo->bytes_readonly += num_bytes;
1183                 cache->ro++;
1184                 list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
1185         }
1186 out:
1187         spin_unlock(&cache->lock);
1188         spin_unlock(&sinfo->lock);
1189         if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
1190                 btrfs_info(cache->fs_info,
1191                         "unable to make block group %llu ro", cache->start);
1192                 btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
1193         }
1194         return ret;
1195 }
1196
1197 static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
1198                                  struct btrfs_block_group *bg)
1199 {
1200         struct btrfs_fs_info *fs_info = bg->fs_info;
1201         struct btrfs_transaction *prev_trans = NULL;
1202         const u64 start = bg->start;
1203         const u64 end = start + bg->length - 1;
1204         int ret;
1205
1206         spin_lock(&fs_info->trans_lock);
1207         if (trans->transaction->list.prev != &fs_info->trans_list) {
1208                 prev_trans = list_last_entry(&trans->transaction->list,
1209                                              struct btrfs_transaction, list);
1210                 refcount_inc(&prev_trans->use_count);
1211         }
1212         spin_unlock(&fs_info->trans_lock);
1213
1214         /*
1215          * Hold the unused_bg_unpin_mutex lock to avoid racing with
1216          * btrfs_finish_extent_commit(). If we are at transaction N, another
1217          * task might be running finish_extent_commit() for the previous
1218          * transaction N - 1, and have seen a range belonging to the block
1219          * group in pinned_extents before we were able to clear the whole block
1220          * group range from pinned_extents. This means that task can lookup for
1221          * the block group after we unpinned it from pinned_extents and removed
1222          * it, leading to a BUG_ON() at unpin_extent_range().
1223          */
1224         mutex_lock(&fs_info->unused_bg_unpin_mutex);
1225         if (prev_trans) {
1226                 ret = clear_extent_bits(&prev_trans->pinned_extents, start, end,
1227                                         EXTENT_DIRTY);
1228                 if (ret)
1229                         goto out;
1230         }
1231
1232         ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end,
1233                                 EXTENT_DIRTY);
1234 out:
1235         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
1236         if (prev_trans)
1237                 btrfs_put_transaction(prev_trans);
1238
1239         return ret == 0;
1240 }
1241
1242 /*
1243  * Process the unused_bgs list and remove any that don't have any allocated
1244  * space inside of them.
1245  */
1246 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
1247 {
1248         struct btrfs_block_group *block_group;
1249         struct btrfs_space_info *space_info;
1250         struct btrfs_trans_handle *trans;
1251         const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC);
1252         int ret = 0;
1253
1254         if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
1255                 return;
1256
1257         spin_lock(&fs_info->unused_bgs_lock);
1258         while (!list_empty(&fs_info->unused_bgs)) {
1259                 int trimming;
1260
1261                 block_group = list_first_entry(&fs_info->unused_bgs,
1262                                                struct btrfs_block_group,
1263                                                bg_list);
1264                 list_del_init(&block_group->bg_list);
1265
1266                 space_info = block_group->space_info;
1267
1268                 if (ret || btrfs_mixed_space_info(space_info)) {
1269                         btrfs_put_block_group(block_group);
1270                         continue;
1271                 }
1272                 spin_unlock(&fs_info->unused_bgs_lock);
1273
1274                 btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
1275
1276                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
1277
1278                 /* Don't want to race with allocators so take the groups_sem */
1279                 down_write(&space_info->groups_sem);
1280
1281                 /*
1282                  * Async discard moves the final block group discard to be prior
1283                  * to the unused_bgs code path.  Therefore, if it's not fully
1284                  * trimmed, punt it back to the async discard lists.
1285                  */
1286                 if (btrfs_test_opt(fs_info, DISCARD_ASYNC) &&
1287                     !btrfs_is_free_space_trimmed(block_group)) {
1288                         trace_btrfs_skip_unused_block_group(block_group);
1289                         up_write(&space_info->groups_sem);
1290                         /* Requeue if we failed because of async discard */
1291                         btrfs_discard_queue_work(&fs_info->discard_ctl,
1292                                                  block_group);
1293                         goto next;
1294                 }
1295
1296                 spin_lock(&block_group->lock);
1297                 if (block_group->reserved || block_group->pinned ||
1298                     block_group->used || block_group->ro ||
1299                     list_is_singular(&block_group->list)) {
1300                         /*
1301                          * We want to bail if we made new allocations or have
1302                          * outstanding allocations in this block group.  We do
1303                          * the ro check in case balance is currently acting on
1304                          * this block group.
1305                          */
1306                         trace_btrfs_skip_unused_block_group(block_group);
1307                         spin_unlock(&block_group->lock);
1308                         up_write(&space_info->groups_sem);
1309                         goto next;
1310                 }
1311                 spin_unlock(&block_group->lock);
1312
1313                 /* We don't want to force the issue, only flip if it's ok. */
1314                 ret = inc_block_group_ro(block_group, 0);
1315                 up_write(&space_info->groups_sem);
1316                 if (ret < 0) {
1317                         ret = 0;
1318                         goto next;
1319                 }
1320
1321                 /*
1322                  * Want to do this before we do anything else so we can recover
1323                  * properly if we fail to join the transaction.
1324                  */
1325                 trans = btrfs_start_trans_remove_block_group(fs_info,
1326                                                      block_group->start);
1327                 if (IS_ERR(trans)) {
1328                         btrfs_dec_block_group_ro(block_group);
1329                         ret = PTR_ERR(trans);
1330                         goto next;
1331                 }
1332
1333                 /*
1334                  * We could have pending pinned extents for this block group,
1335                  * just delete them, we don't care about them anymore.
1336                  */
1337                 if (!clean_pinned_extents(trans, block_group)) {
1338                         btrfs_dec_block_group_ro(block_group);
1339                         goto end_trans;
1340                 }
1341
1342                 /*
1343                  * At this point, the block_group is read only and should fail
1344                  * new allocations.  However, btrfs_finish_extent_commit() can
1345                  * cause this block_group to be placed back on the discard
1346                  * lists because now the block_group isn't fully discarded.
1347                  * Bail here and try again later after discarding everything.
1348                  */
1349                 spin_lock(&fs_info->discard_ctl.lock);
1350                 if (!list_empty(&block_group->discard_list)) {
1351                         spin_unlock(&fs_info->discard_ctl.lock);
1352                         btrfs_dec_block_group_ro(block_group);
1353                         btrfs_discard_queue_work(&fs_info->discard_ctl,
1354                                                  block_group);
1355                         goto end_trans;
1356                 }
1357                 spin_unlock(&fs_info->discard_ctl.lock);
1358
1359                 /* Reset pinned so btrfs_put_block_group doesn't complain */
1360                 spin_lock(&space_info->lock);
1361                 spin_lock(&block_group->lock);
1362
1363                 btrfs_space_info_update_bytes_pinned(fs_info, space_info,
1364                                                      -block_group->pinned);
1365                 space_info->bytes_readonly += block_group->pinned;
1366                 percpu_counter_add_batch(&space_info->total_bytes_pinned,
1367                                    -block_group->pinned,
1368                                    BTRFS_TOTAL_BYTES_PINNED_BATCH);
1369                 block_group->pinned = 0;
1370
1371                 spin_unlock(&block_group->lock);
1372                 spin_unlock(&space_info->lock);
1373
1374                 /*
1375                  * The normal path here is an unused block group is passed here,
1376                  * then trimming is handled in the transaction commit path.
1377                  * Async discard interposes before this to do the trimming
1378                  * before coming down the unused block group path as trimming
1379                  * will no longer be done later in the transaction commit path.
1380                  */
1381                 if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
1382                         goto flip_async;
1383
1384                 /* DISCARD can flip during remount */
1385                 trimming = btrfs_test_opt(fs_info, DISCARD_SYNC);
1386
1387                 /* Implicit trim during transaction commit. */
1388                 if (trimming)
1389                         btrfs_freeze_block_group(block_group);
1390
1391                 /*
1392                  * Btrfs_remove_chunk will abort the transaction if things go
1393                  * horribly wrong.
1394                  */
1395                 ret = btrfs_remove_chunk(trans, block_group->start);
1396
1397                 if (ret) {
1398                         if (trimming)
1399                                 btrfs_unfreeze_block_group(block_group);
1400                         goto end_trans;
1401                 }
1402
1403                 /*
1404                  * If we're not mounted with -odiscard, we can just forget
1405                  * about this block group. Otherwise we'll need to wait
1406                  * until transaction commit to do the actual discard.
1407                  */
1408                 if (trimming) {
1409                         spin_lock(&fs_info->unused_bgs_lock);
1410                         /*
1411                          * A concurrent scrub might have added us to the list
1412                          * fs_info->unused_bgs, so use a list_move operation
1413                          * to add the block group to the deleted_bgs list.
1414                          */
1415                         list_move(&block_group->bg_list,
1416                                   &trans->transaction->deleted_bgs);
1417                         spin_unlock(&fs_info->unused_bgs_lock);
1418                         btrfs_get_block_group(block_group);
1419                 }
1420 end_trans:
1421                 btrfs_end_transaction(trans);
1422 next:
1423                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
1424                 btrfs_put_block_group(block_group);
1425                 spin_lock(&fs_info->unused_bgs_lock);
1426         }
1427         spin_unlock(&fs_info->unused_bgs_lock);
1428         return;
1429
1430 flip_async:
1431         btrfs_end_transaction(trans);
1432         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
1433         btrfs_put_block_group(block_group);
1434         btrfs_discard_punt_unused_bgs_list(fs_info);
1435 }
1436
1437 void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
1438 {
1439         struct btrfs_fs_info *fs_info = bg->fs_info;
1440
1441         spin_lock(&fs_info->unused_bgs_lock);
1442         if (list_empty(&bg->bg_list)) {
1443                 btrfs_get_block_group(bg);
1444                 trace_btrfs_add_unused_block_group(bg);
1445                 list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
1446         }
1447         spin_unlock(&fs_info->unused_bgs_lock);
1448 }
1449
1450 static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
1451                            struct btrfs_path *path)
1452 {
1453         struct extent_map_tree *em_tree;
1454         struct extent_map *em;
1455         struct btrfs_block_group_item bg;
1456         struct extent_buffer *leaf;
1457         int slot;
1458         u64 flags;
1459         int ret = 0;
1460
1461         slot = path->slots[0];
1462         leaf = path->nodes[0];
1463
1464         em_tree = &fs_info->mapping_tree;
1465         read_lock(&em_tree->lock);
1466         em = lookup_extent_mapping(em_tree, key->objectid, key->offset);
1467         read_unlock(&em_tree->lock);
1468         if (!em) {
1469                 btrfs_err(fs_info,
1470                           "logical %llu len %llu found bg but no related chunk",
1471                           key->objectid, key->offset);
1472                 return -ENOENT;
1473         }
1474
1475         if (em->start != key->objectid || em->len != key->offset) {
1476                 btrfs_err(fs_info,
1477                         "block group %llu len %llu mismatch with chunk %llu len %llu",
1478                         key->objectid, key->offset, em->start, em->len);
1479                 ret = -EUCLEAN;
1480                 goto out_free_em;
1481         }
1482
1483         read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot),
1484                            sizeof(bg));
1485         flags = btrfs_stack_block_group_flags(&bg) &
1486                 BTRFS_BLOCK_GROUP_TYPE_MASK;
1487
1488         if (flags != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
1489                 btrfs_err(fs_info,
1490 "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
1491                           key->objectid, key->offset, flags,
1492                           (BTRFS_BLOCK_GROUP_TYPE_MASK & em->map_lookup->type));
1493                 ret = -EUCLEAN;
1494         }
1495
1496 out_free_em:
1497         free_extent_map(em);
1498         return ret;
1499 }
1500
1501 static int find_first_block_group(struct btrfs_fs_info *fs_info,
1502                                   struct btrfs_path *path,
1503                                   struct btrfs_key *key)
1504 {
1505         struct btrfs_root *root = fs_info->extent_root;
1506         int ret;
1507         struct btrfs_key found_key;
1508         struct extent_buffer *leaf;
1509         int slot;
1510
1511         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
1512         if (ret < 0)
1513                 return ret;
1514
1515         while (1) {
1516                 slot = path->slots[0];
1517                 leaf = path->nodes[0];
1518                 if (slot >= btrfs_header_nritems(leaf)) {
1519                         ret = btrfs_next_leaf(root, path);
1520                         if (ret == 0)
1521                                 continue;
1522                         if (ret < 0)
1523                                 goto out;
1524                         break;
1525                 }
1526                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
1527
1528                 if (found_key.objectid >= key->objectid &&
1529                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
1530                         ret = read_bg_from_eb(fs_info, &found_key, path);
1531                         break;
1532                 }
1533
1534                 path->slots[0]++;
1535         }
1536 out:
1537         return ret;
1538 }
1539
1540 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
1541 {
1542         u64 extra_flags = chunk_to_extended(flags) &
1543                                 BTRFS_EXTENDED_PROFILE_MASK;
1544
1545         write_seqlock(&fs_info->profiles_lock);
1546         if (flags & BTRFS_BLOCK_GROUP_DATA)
1547                 fs_info->avail_data_alloc_bits |= extra_flags;
1548         if (flags & BTRFS_BLOCK_GROUP_METADATA)
1549                 fs_info->avail_metadata_alloc_bits |= extra_flags;
1550         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
1551                 fs_info->avail_system_alloc_bits |= extra_flags;
1552         write_sequnlock(&fs_info->profiles_lock);
1553 }
1554
1555 /**
1556  * btrfs_rmap_block - Map a physical disk address to a list of logical addresses
1557  * @chunk_start:   logical address of block group
1558  * @physical:      physical address to map to logical addresses
1559  * @logical:       return array of logical addresses which map to @physical
1560  * @naddrs:        length of @logical
1561  * @stripe_len:    size of IO stripe for the given block group
1562  *
1563  * Maps a particular @physical disk address to a list of @logical addresses.
1564  * Used primarily to exclude those portions of a block group that contain super
1565  * block copies.
1566  */
1567 EXPORT_FOR_TESTS
1568 int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
1569                      u64 physical, u64 **logical, int *naddrs, int *stripe_len)
1570 {
1571         struct extent_map *em;
1572         struct map_lookup *map;
1573         u64 *buf;
1574         u64 bytenr;
1575         u64 data_stripe_length;
1576         u64 io_stripe_size;
1577         int i, nr = 0;
1578         int ret = 0;
1579
1580         em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
1581         if (IS_ERR(em))
1582                 return -EIO;
1583
1584         map = em->map_lookup;
1585         data_stripe_length = em->orig_block_len;
1586         io_stripe_size = map->stripe_len;
1587
1588         /* For RAID5/6 adjust to a full IO stripe length */
1589         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
1590                 io_stripe_size = map->stripe_len * nr_data_stripes(map);
1591
1592         buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
1593         if (!buf) {
1594                 ret = -ENOMEM;
1595                 goto out;
1596         }
1597
1598         for (i = 0; i < map->num_stripes; i++) {
1599                 bool already_inserted = false;
1600                 u64 stripe_nr;
1601                 int j;
1602
1603                 if (!in_range(physical, map->stripes[i].physical,
1604                               data_stripe_length))
1605                         continue;
1606
1607                 stripe_nr = physical - map->stripes[i].physical;
1608                 stripe_nr = div64_u64(stripe_nr, map->stripe_len);
1609
1610                 if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
1611                         stripe_nr = stripe_nr * map->num_stripes + i;
1612                         stripe_nr = div_u64(stripe_nr, map->sub_stripes);
1613                 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
1614                         stripe_nr = stripe_nr * map->num_stripes + i;
1615                 }
1616                 /*
1617                  * The remaining case would be for RAID56, multiply by
1618                  * nr_data_stripes().  Alternatively, just use rmap_len below
1619                  * instead of map->stripe_len
1620                  */
1621
1622                 bytenr = chunk_start + stripe_nr * io_stripe_size;
1623
1624                 /* Ensure we don't add duplicate addresses */
1625                 for (j = 0; j < nr; j++) {
1626                         if (buf[j] == bytenr) {
1627                                 already_inserted = true;
1628                                 break;
1629                         }
1630                 }
1631
1632                 if (!already_inserted)
1633                         buf[nr++] = bytenr;
1634         }
1635
1636         *logical = buf;
1637         *naddrs = nr;
1638         *stripe_len = io_stripe_size;
1639 out:
1640         free_extent_map(em);
1641         return ret;
1642 }
1643
1644 static int exclude_super_stripes(struct btrfs_block_group *cache)
1645 {
1646         struct btrfs_fs_info *fs_info = cache->fs_info;
1647         const bool zoned = btrfs_is_zoned(fs_info);
1648         u64 bytenr;
1649         u64 *logical;
1650         int stripe_len;
1651         int i, nr, ret;
1652
1653         if (cache->start < BTRFS_SUPER_INFO_OFFSET) {
1654                 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start;
1655                 cache->bytes_super += stripe_len;
1656                 ret = btrfs_add_excluded_extent(fs_info, cache->start,
1657                                                 stripe_len);
1658                 if (ret)
1659                         return ret;
1660         }
1661
1662         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
1663                 bytenr = btrfs_sb_offset(i);
1664                 ret = btrfs_rmap_block(fs_info, cache->start,
1665                                        bytenr, &logical, &nr, &stripe_len);
1666                 if (ret)
1667                         return ret;
1668
1669                 /* Shouldn't have super stripes in sequential zones */
1670                 if (zoned && nr) {
1671                         btrfs_err(fs_info,
1672                         "zoned: block group %llu must not contain super block",
1673                                   cache->start);
1674                         return -EUCLEAN;
1675                 }
1676
1677                 while (nr--) {
1678                         u64 len = min_t(u64, stripe_len,
1679                                 cache->start + cache->length - logical[nr]);
1680
1681                         cache->bytes_super += len;
1682                         ret = btrfs_add_excluded_extent(fs_info, logical[nr],
1683                                                         len);
1684                         if (ret) {
1685                                 kfree(logical);
1686                                 return ret;
1687                         }
1688                 }
1689
1690                 kfree(logical);
1691         }
1692         return 0;
1693 }
1694
1695 static void link_block_group(struct btrfs_block_group *cache)
1696 {
1697         struct btrfs_space_info *space_info = cache->space_info;
1698         int index = btrfs_bg_flags_to_raid_index(cache->flags);
1699
1700         down_write(&space_info->groups_sem);
1701         list_add_tail(&cache->list, &space_info->block_groups[index]);
1702         up_write(&space_info->groups_sem);
1703 }
1704
1705 static struct btrfs_block_group *btrfs_create_block_group_cache(
1706                 struct btrfs_fs_info *fs_info, u64 start)
1707 {
1708         struct btrfs_block_group *cache;
1709
1710         cache = kzalloc(sizeof(*cache), GFP_NOFS);
1711         if (!cache)
1712                 return NULL;
1713
1714         cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
1715                                         GFP_NOFS);
1716         if (!cache->free_space_ctl) {
1717                 kfree(cache);
1718                 return NULL;
1719         }
1720
1721         cache->start = start;
1722
1723         cache->fs_info = fs_info;
1724         cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
1725
1726         cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
1727
1728         refcount_set(&cache->refs, 1);
1729         spin_lock_init(&cache->lock);
1730         init_rwsem(&cache->data_rwsem);
1731         INIT_LIST_HEAD(&cache->list);
1732         INIT_LIST_HEAD(&cache->cluster_list);
1733         INIT_LIST_HEAD(&cache->bg_list);
1734         INIT_LIST_HEAD(&cache->ro_list);
1735         INIT_LIST_HEAD(&cache->discard_list);
1736         INIT_LIST_HEAD(&cache->dirty_list);
1737         INIT_LIST_HEAD(&cache->io_list);
1738         btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
1739         atomic_set(&cache->frozen, 0);
1740         mutex_init(&cache->free_space_lock);
1741         btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
1742
1743         return cache;
1744 }
1745
1746 /*
1747  * Iterate all chunks and verify that each of them has the corresponding block
1748  * group
1749  */
1750 static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
1751 {
1752         struct extent_map_tree *map_tree = &fs_info->mapping_tree;
1753         struct extent_map *em;
1754         struct btrfs_block_group *bg;
1755         u64 start = 0;
1756         int ret = 0;
1757
1758         while (1) {
1759                 read_lock(&map_tree->lock);
1760                 /*
1761                  * lookup_extent_mapping will return the first extent map
1762                  * intersecting the range, so setting @len to 1 is enough to
1763                  * get the first chunk.
1764                  */
1765                 em = lookup_extent_mapping(map_tree, start, 1);
1766                 read_unlock(&map_tree->lock);
1767                 if (!em)
1768                         break;
1769
1770                 bg = btrfs_lookup_block_group(fs_info, em->start);
1771                 if (!bg) {
1772                         btrfs_err(fs_info,
1773         "chunk start=%llu len=%llu doesn't have corresponding block group",
1774                                      em->start, em->len);
1775                         ret = -EUCLEAN;
1776                         free_extent_map(em);
1777                         break;
1778                 }
1779                 if (bg->start != em->start || bg->length != em->len ||
1780                     (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
1781                     (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
1782                         btrfs_err(fs_info,
1783 "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
1784                                 em->start, em->len,
1785                                 em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
1786                                 bg->start, bg->length,
1787                                 bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
1788                         ret = -EUCLEAN;
1789                         free_extent_map(em);
1790                         btrfs_put_block_group(bg);
1791                         break;
1792                 }
1793                 start = em->start + em->len;
1794                 free_extent_map(em);
1795                 btrfs_put_block_group(bg);
1796         }
1797         return ret;
1798 }
1799
1800 static void read_block_group_item(struct btrfs_block_group *cache,
1801                                  struct btrfs_path *path,
1802                                  const struct btrfs_key *key)
1803 {
1804         struct extent_buffer *leaf = path->nodes[0];
1805         struct btrfs_block_group_item bgi;
1806         int slot = path->slots[0];
1807
1808         cache->length = key->offset;
1809
1810         read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
1811                            sizeof(bgi));
1812         cache->used = btrfs_stack_block_group_used(&bgi);
1813         cache->flags = btrfs_stack_block_group_flags(&bgi);
1814 }
1815
1816 static int read_one_block_group(struct btrfs_fs_info *info,
1817                                 struct btrfs_path *path,
1818                                 const struct btrfs_key *key,
1819                                 int need_clear)
1820 {
1821         struct btrfs_block_group *cache;
1822         struct btrfs_space_info *space_info;
1823         const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS);
1824         int ret;
1825
1826         ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY);
1827
1828         cache = btrfs_create_block_group_cache(info, key->objectid);
1829         if (!cache)
1830                 return -ENOMEM;
1831
1832         read_block_group_item(cache, path, key);
1833
1834         set_free_space_tree_thresholds(cache);
1835
1836         if (need_clear) {
1837                 /*
1838                  * When we mount with old space cache, we need to
1839                  * set BTRFS_DC_CLEAR and set dirty flag.
1840                  *
1841                  * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
1842                  *    truncate the old free space cache inode and
1843                  *    setup a new one.
1844                  * b) Setting 'dirty flag' makes sure that we flush
1845                  *    the new space cache info onto disk.
1846                  */
1847                 if (btrfs_test_opt(info, SPACE_CACHE))
1848                         cache->disk_cache_state = BTRFS_DC_CLEAR;
1849         }
1850         if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
1851             (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
1852                         btrfs_err(info,
1853 "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
1854                                   cache->start);
1855                         ret = -EINVAL;
1856                         goto error;
1857         }
1858
1859         /*
1860          * We need to exclude the super stripes now so that the space info has
1861          * super bytes accounted for, otherwise we'll think we have more space
1862          * than we actually do.
1863          */
1864         ret = exclude_super_stripes(cache);
1865         if (ret) {
1866                 /* We may have excluded something, so call this just in case. */
1867                 btrfs_free_excluded_extents(cache);
1868                 goto error;
1869         }
1870
1871         /*
1872          * Check for two cases, either we are full, and therefore don't need
1873          * to bother with the caching work since we won't find any space, or we
1874          * are empty, and we can just add all the space in and be done with it.
1875          * This saves us _a_lot_ of time, particularly in the full case.
1876          */
1877         if (cache->length == cache->used) {
1878                 cache->last_byte_to_unpin = (u64)-1;
1879                 cache->cached = BTRFS_CACHE_FINISHED;
1880                 btrfs_free_excluded_extents(cache);
1881         } else if (cache->used == 0) {
1882                 cache->last_byte_to_unpin = (u64)-1;
1883                 cache->cached = BTRFS_CACHE_FINISHED;
1884                 add_new_free_space(cache, cache->start,
1885                                    cache->start + cache->length);
1886                 btrfs_free_excluded_extents(cache);
1887         }
1888
1889         ret = btrfs_add_block_group_cache(info, cache);
1890         if (ret) {
1891                 btrfs_remove_free_space_cache(cache);
1892                 goto error;
1893         }
1894         trace_btrfs_add_block_group(info, cache, 0);
1895         btrfs_update_space_info(info, cache->flags, cache->length,
1896                                 cache->used, cache->bytes_super, &space_info);
1897
1898         cache->space_info = space_info;
1899
1900         link_block_group(cache);
1901
1902         set_avail_alloc_bits(info, cache->flags);
1903         if (btrfs_chunk_readonly(info, cache->start)) {
1904                 inc_block_group_ro(cache, 1);
1905         } else if (cache->used == 0) {
1906                 ASSERT(list_empty(&cache->bg_list));
1907                 if (btrfs_test_opt(info, DISCARD_ASYNC))
1908                         btrfs_discard_queue_work(&info->discard_ctl, cache);
1909                 else
1910                         btrfs_mark_bg_unused(cache);
1911         }
1912         return 0;
1913 error:
1914         btrfs_put_block_group(cache);
1915         return ret;
1916 }
1917
1918 static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
1919 {
1920         struct extent_map_tree *em_tree = &fs_info->mapping_tree;
1921         struct btrfs_space_info *space_info;
1922         struct rb_node *node;
1923         int ret = 0;
1924
1925         for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
1926                 struct extent_map *em;
1927                 struct map_lookup *map;
1928                 struct btrfs_block_group *bg;
1929
1930                 em = rb_entry(node, struct extent_map, rb_node);
1931                 map = em->map_lookup;
1932                 bg = btrfs_create_block_group_cache(fs_info, em->start);
1933                 if (!bg) {
1934                         ret = -ENOMEM;
1935                         break;
1936                 }
1937
1938                 /* Fill dummy cache as FULL */
1939                 bg->length = em->len;
1940                 bg->flags = map->type;
1941                 bg->last_byte_to_unpin = (u64)-1;
1942                 bg->cached = BTRFS_CACHE_FINISHED;
1943                 bg->used = em->len;
1944                 bg->flags = map->type;
1945                 ret = btrfs_add_block_group_cache(fs_info, bg);
1946                 if (ret) {
1947                         btrfs_remove_free_space_cache(bg);
1948                         btrfs_put_block_group(bg);
1949                         break;
1950                 }
1951                 btrfs_update_space_info(fs_info, bg->flags, em->len, em->len,
1952                                         0, &space_info);
1953                 bg->space_info = space_info;
1954                 link_block_group(bg);
1955
1956                 set_avail_alloc_bits(fs_info, bg->flags);
1957         }
1958         if (!ret)
1959                 btrfs_init_global_block_rsv(fs_info);
1960         return ret;
1961 }
1962
1963 int btrfs_read_block_groups(struct btrfs_fs_info *info)
1964 {
1965         struct btrfs_path *path;
1966         int ret;
1967         struct btrfs_block_group *cache;
1968         struct btrfs_space_info *space_info;
1969         struct btrfs_key key;
1970         int need_clear = 0;
1971         u64 cache_gen;
1972
1973         if (!info->extent_root)
1974                 return fill_dummy_bgs(info);
1975
1976         key.objectid = 0;
1977         key.offset = 0;
1978         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
1979         path = btrfs_alloc_path();
1980         if (!path)
1981                 return -ENOMEM;
1982
1983         cache_gen = btrfs_super_cache_generation(info->super_copy);
1984         if (btrfs_test_opt(info, SPACE_CACHE) &&
1985             btrfs_super_generation(info->super_copy) != cache_gen)
1986                 need_clear = 1;
1987         if (btrfs_test_opt(info, CLEAR_CACHE))
1988                 need_clear = 1;
1989
1990         while (1) {
1991                 ret = find_first_block_group(info, path, &key);
1992                 if (ret > 0)
1993                         break;
1994                 if (ret != 0)
1995                         goto error;
1996
1997                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1998                 ret = read_one_block_group(info, path, &key, need_clear);
1999                 if (ret < 0)
2000                         goto error;
2001                 key.objectid += key.offset;
2002                 key.offset = 0;
2003                 btrfs_release_path(path);
2004         }
2005         btrfs_release_path(path);
2006
2007         list_for_each_entry(space_info, &info->space_info, list) {
2008                 int i;
2009
2010                 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
2011                         if (list_empty(&space_info->block_groups[i]))
2012                                 continue;
2013                         cache = list_first_entry(&space_info->block_groups[i],
2014                                                  struct btrfs_block_group,
2015                                                  list);
2016                         btrfs_sysfs_add_block_group_type(cache);
2017                 }
2018
2019                 if (!(btrfs_get_alloc_profile(info, space_info->flags) &
2020                       (BTRFS_BLOCK_GROUP_RAID10 |
2021                        BTRFS_BLOCK_GROUP_RAID1_MASK |
2022                        BTRFS_BLOCK_GROUP_RAID56_MASK |
2023                        BTRFS_BLOCK_GROUP_DUP)))
2024                         continue;
2025                 /*
2026                  * Avoid allocating from un-mirrored block group if there are
2027                  * mirrored block groups.
2028                  */
2029                 list_for_each_entry(cache,
2030                                 &space_info->block_groups[BTRFS_RAID_RAID0],
2031                                 list)
2032                         inc_block_group_ro(cache, 1);
2033                 list_for_each_entry(cache,
2034                                 &space_info->block_groups[BTRFS_RAID_SINGLE],
2035                                 list)
2036                         inc_block_group_ro(cache, 1);
2037         }
2038
2039         btrfs_init_global_block_rsv(info);
2040         ret = check_chunk_block_group_mappings(info);
2041 error:
2042         btrfs_free_path(path);
2043         return ret;
2044 }
2045
2046 static int insert_block_group_item(struct btrfs_trans_handle *trans,
2047                                    struct btrfs_block_group *block_group)
2048 {
2049         struct btrfs_fs_info *fs_info = trans->fs_info;
2050         struct btrfs_block_group_item bgi;
2051         struct btrfs_root *root;
2052         struct btrfs_key key;
2053
2054         spin_lock(&block_group->lock);
2055         btrfs_set_stack_block_group_used(&bgi, block_group->used);
2056         btrfs_set_stack_block_group_chunk_objectid(&bgi,
2057                                 BTRFS_FIRST_CHUNK_TREE_OBJECTID);
2058         btrfs_set_stack_block_group_flags(&bgi, block_group->flags);
2059         key.objectid = block_group->start;
2060         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
2061         key.offset = block_group->length;
2062         spin_unlock(&block_group->lock);
2063
2064         root = fs_info->extent_root;
2065         return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
2066 }
2067
2068 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
2069 {
2070         struct btrfs_fs_info *fs_info = trans->fs_info;
2071         struct btrfs_block_group *block_group;
2072         int ret = 0;
2073
2074         if (!trans->can_flush_pending_bgs)
2075                 return;
2076
2077         while (!list_empty(&trans->new_bgs)) {
2078                 int index;
2079
2080                 block_group = list_first_entry(&trans->new_bgs,
2081                                                struct btrfs_block_group,
2082                                                bg_list);
2083                 if (ret)
2084                         goto next;
2085
2086                 index = btrfs_bg_flags_to_raid_index(block_group->flags);
2087
2088                 ret = insert_block_group_item(trans, block_group);
2089                 if (ret)
2090                         btrfs_abort_transaction(trans, ret);
2091                 ret = btrfs_finish_chunk_alloc(trans, block_group->start,
2092                                         block_group->length);
2093                 if (ret)
2094                         btrfs_abort_transaction(trans, ret);
2095                 add_block_group_free_space(trans, block_group);
2096
2097                 /*
2098                  * If we restriped during balance, we may have added a new raid
2099                  * type, so now add the sysfs entries when it is safe to do so.
2100                  * We don't have to worry about locking here as it's handled in
2101                  * btrfs_sysfs_add_block_group_type.
2102                  */
2103                 if (block_group->space_info->block_group_kobjs[index] == NULL)
2104                         btrfs_sysfs_add_block_group_type(block_group);
2105
2106                 /* Already aborted the transaction if it failed. */
2107 next:
2108                 btrfs_delayed_refs_rsv_release(fs_info, 1);
2109                 list_del_init(&block_group->bg_list);
2110         }
2111         btrfs_trans_release_chunk_metadata(trans);
2112 }
2113
2114 int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
2115                            u64 type, u64 chunk_offset, u64 size)
2116 {
2117         struct btrfs_fs_info *fs_info = trans->fs_info;
2118         struct btrfs_block_group *cache;
2119         int ret;
2120
2121         btrfs_set_log_full_commit(trans);
2122
2123         cache = btrfs_create_block_group_cache(fs_info, chunk_offset);
2124         if (!cache)
2125                 return -ENOMEM;
2126
2127         cache->length = size;
2128         set_free_space_tree_thresholds(cache);
2129         cache->used = bytes_used;
2130         cache->flags = type;
2131         cache->last_byte_to_unpin = (u64)-1;
2132         cache->cached = BTRFS_CACHE_FINISHED;
2133         if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
2134                 cache->needs_free_space = 1;
2135         ret = exclude_super_stripes(cache);
2136         if (ret) {
2137                 /* We may have excluded something, so call this just in case */
2138                 btrfs_free_excluded_extents(cache);
2139                 btrfs_put_block_group(cache);
2140                 return ret;
2141         }
2142
2143         add_new_free_space(cache, chunk_offset, chunk_offset + size);
2144
2145         btrfs_free_excluded_extents(cache);
2146
2147 #ifdef CONFIG_BTRFS_DEBUG
2148         if (btrfs_should_fragment_free_space(cache)) {
2149                 u64 new_bytes_used = size - bytes_used;
2150
2151                 bytes_used += new_bytes_used >> 1;
2152                 fragment_free_space(cache);
2153         }
2154 #endif
2155         /*
2156          * Ensure the corresponding space_info object is created and
2157          * assigned to our block group. We want our bg to be added to the rbtree
2158          * with its ->space_info set.
2159          */
2160         cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
2161         ASSERT(cache->space_info);
2162
2163         ret = btrfs_add_block_group_cache(fs_info, cache);
2164         if (ret) {
2165                 btrfs_remove_free_space_cache(cache);
2166                 btrfs_put_block_group(cache);
2167                 return ret;
2168         }
2169
2170         /*
2171          * Now that our block group has its ->space_info set and is inserted in
2172          * the rbtree, update the space info's counters.
2173          */
2174         trace_btrfs_add_block_group(fs_info, cache, 1);
2175         btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
2176                                 cache->bytes_super, &cache->space_info);
2177         btrfs_update_global_block_rsv(fs_info);
2178
2179         link_block_group(cache);
2180
2181         list_add_tail(&cache->bg_list, &trans->new_bgs);
2182         trans->delayed_ref_updates++;
2183         btrfs_update_delayed_refs_rsv(trans);
2184
2185         set_avail_alloc_bits(fs_info, type);
2186         return 0;
2187 }
2188
2189 /*
2190  * Mark one block group RO, can be called several times for the same block
2191  * group.
2192  *
2193  * @cache:              the destination block group
2194  * @do_chunk_alloc:     whether need to do chunk pre-allocation, this is to
2195  *                      ensure we still have some free space after marking this
2196  *                      block group RO.
2197  */
2198 int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
2199                              bool do_chunk_alloc)
2200 {
2201         struct btrfs_fs_info *fs_info = cache->fs_info;
2202         struct btrfs_trans_handle *trans;
2203         u64 alloc_flags;
2204         int ret;
2205
2206 again:
2207         trans = btrfs_join_transaction(fs_info->extent_root);
2208         if (IS_ERR(trans))
2209                 return PTR_ERR(trans);
2210
2211         /*
2212          * we're not allowed to set block groups readonly after the dirty
2213          * block groups cache has started writing.  If it already started,
2214          * back off and let this transaction commit
2215          */
2216         mutex_lock(&fs_info->ro_block_group_mutex);
2217         if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
2218                 u64 transid = trans->transid;
2219
2220                 mutex_unlock(&fs_info->ro_block_group_mutex);
2221                 btrfs_end_transaction(trans);
2222
2223                 ret = btrfs_wait_for_commit(fs_info, transid);
2224                 if (ret)
2225                         return ret;
2226                 goto again;
2227         }
2228
2229         if (do_chunk_alloc) {
2230                 /*
2231                  * If we are changing raid levels, try to allocate a
2232                  * corresponding block group with the new raid level.
2233                  */
2234                 alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
2235                 if (alloc_flags != cache->flags) {
2236                         ret = btrfs_chunk_alloc(trans, alloc_flags,
2237                                                 CHUNK_ALLOC_FORCE);
2238                         /*
2239                          * ENOSPC is allowed here, we may have enough space
2240                          * already allocated at the new raid level to carry on
2241                          */
2242                         if (ret == -ENOSPC)
2243                                 ret = 0;
2244                         if (ret < 0)
2245                                 goto out;
2246                 }
2247         }
2248
2249         ret = inc_block_group_ro(cache, 0);
2250         if (!do_chunk_alloc)
2251                 goto unlock_out;
2252         if (!ret)
2253                 goto out;
2254         alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags);
2255         ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
2256         if (ret < 0)
2257                 goto out;
2258         ret = inc_block_group_ro(cache, 0);
2259 out:
2260         if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
2261                 alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
2262                 mutex_lock(&fs_info->chunk_mutex);
2263                 check_system_chunk(trans, alloc_flags);
2264                 mutex_unlock(&fs_info->chunk_mutex);
2265         }
2266 unlock_out:
2267         mutex_unlock(&fs_info->ro_block_group_mutex);
2268
2269         btrfs_end_transaction(trans);
2270         return ret;
2271 }
2272
2273 void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
2274 {
2275         struct btrfs_space_info *sinfo = cache->space_info;
2276         u64 num_bytes;
2277
2278         BUG_ON(!cache->ro);
2279
2280         spin_lock(&sinfo->lock);
2281         spin_lock(&cache->lock);
2282         if (!--cache->ro) {
2283                 num_bytes = cache->length - cache->reserved -
2284                             cache->pinned - cache->bytes_super - cache->used;
2285                 sinfo->bytes_readonly -= num_bytes;
2286                 list_del_init(&cache->ro_list);
2287         }
2288         spin_unlock(&cache->lock);
2289         spin_unlock(&sinfo->lock);
2290 }
2291
2292 static int update_block_group_item(struct btrfs_trans_handle *trans,
2293                                    struct btrfs_path *path,
2294                                    struct btrfs_block_group *cache)
2295 {
2296         struct btrfs_fs_info *fs_info = trans->fs_info;
2297         int ret;
2298         struct btrfs_root *root = fs_info->extent_root;
2299         unsigned long bi;
2300         struct extent_buffer *leaf;
2301         struct btrfs_block_group_item bgi;
2302         struct btrfs_key key;
2303
2304         key.objectid = cache->start;
2305         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
2306         key.offset = cache->length;
2307
2308         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2309         if (ret) {
2310                 if (ret > 0)
2311                         ret = -ENOENT;
2312                 goto fail;
2313         }
2314
2315         leaf = path->nodes[0];
2316         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
2317         btrfs_set_stack_block_group_used(&bgi, cache->used);
2318         btrfs_set_stack_block_group_chunk_objectid(&bgi,
2319                         BTRFS_FIRST_CHUNK_TREE_OBJECTID);
2320         btrfs_set_stack_block_group_flags(&bgi, cache->flags);
2321         write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
2322         btrfs_mark_buffer_dirty(leaf);
2323 fail:
2324         btrfs_release_path(path);
2325         return ret;
2326
2327 }
2328
2329 static int cache_save_setup(struct btrfs_block_group *block_group,
2330                             struct btrfs_trans_handle *trans,
2331                             struct btrfs_path *path)
2332 {
2333         struct btrfs_fs_info *fs_info = block_group->fs_info;
2334         struct btrfs_root *root = fs_info->tree_root;
2335         struct inode *inode = NULL;
2336         struct extent_changeset *data_reserved = NULL;
2337         u64 alloc_hint = 0;
2338         int dcs = BTRFS_DC_ERROR;
2339         u64 num_pages = 0;
2340         int retries = 0;
2341         int ret = 0;
2342
2343         if (!btrfs_test_opt(fs_info, SPACE_CACHE))
2344                 return 0;
2345
2346         /*
2347          * If this block group is smaller than 100 megs don't bother caching the
2348          * block group.
2349          */
2350         if (block_group->length < (100 * SZ_1M)) {
2351                 spin_lock(&block_group->lock);
2352                 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
2353                 spin_unlock(&block_group->lock);
2354                 return 0;
2355         }
2356
2357         if (TRANS_ABORTED(trans))
2358                 return 0;
2359 again:
2360         inode = lookup_free_space_inode(block_group, path);
2361         if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
2362                 ret = PTR_ERR(inode);
2363                 btrfs_release_path(path);
2364                 goto out;
2365         }
2366
2367         if (IS_ERR(inode)) {
2368                 BUG_ON(retries);
2369                 retries++;
2370
2371                 if (block_group->ro)
2372                         goto out_free;
2373
2374                 ret = create_free_space_inode(trans, block_group, path);
2375                 if (ret)
2376                         goto out_free;
2377                 goto again;
2378         }
2379
2380         /*
2381          * We want to set the generation to 0, that way if anything goes wrong
2382          * from here on out we know not to trust this cache when we load up next
2383          * time.
2384          */
2385         BTRFS_I(inode)->generation = 0;
2386         ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
2387         if (ret) {
2388                 /*
2389                  * So theoretically we could recover from this, simply set the
2390                  * super cache generation to 0 so we know to invalidate the
2391                  * cache, but then we'd have to keep track of the block groups
2392                  * that fail this way so we know we _have_ to reset this cache
2393                  * before the next commit or risk reading stale cache.  So to
2394                  * limit our exposure to horrible edge cases lets just abort the
2395                  * transaction, this only happens in really bad situations
2396                  * anyway.
2397                  */
2398                 btrfs_abort_transaction(trans, ret);
2399                 goto out_put;
2400         }
2401         WARN_ON(ret);
2402
2403         /* We've already setup this transaction, go ahead and exit */
2404         if (block_group->cache_generation == trans->transid &&
2405             i_size_read(inode)) {
2406                 dcs = BTRFS_DC_SETUP;
2407                 goto out_put;
2408         }
2409
2410         if (i_size_read(inode) > 0) {
2411                 ret = btrfs_check_trunc_cache_free_space(fs_info,
2412                                         &fs_info->global_block_rsv);
2413                 if (ret)
2414                         goto out_put;
2415
2416                 ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
2417                 if (ret)
2418                         goto out_put;
2419         }
2420
2421         spin_lock(&block_group->lock);
2422         if (block_group->cached != BTRFS_CACHE_FINISHED ||
2423             !btrfs_test_opt(fs_info, SPACE_CACHE)) {
2424                 /*
2425                  * don't bother trying to write stuff out _if_
2426                  * a) we're not cached,
2427                  * b) we're with nospace_cache mount option,
2428                  * c) we're with v2 space_cache (FREE_SPACE_TREE).
2429                  */
2430                 dcs = BTRFS_DC_WRITTEN;
2431                 spin_unlock(&block_group->lock);
2432                 goto out_put;
2433         }
2434         spin_unlock(&block_group->lock);
2435
2436         /*
2437          * We hit an ENOSPC when setting up the cache in this transaction, just
2438          * skip doing the setup, we've already cleared the cache so we're safe.
2439          */
2440         if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
2441                 ret = -ENOSPC;
2442                 goto out_put;
2443         }
2444
2445         /*
2446          * Try to preallocate enough space based on how big the block group is.
2447          * Keep in mind this has to include any pinned space which could end up
2448          * taking up quite a bit since it's not folded into the other space
2449          * cache.
2450          */
2451         num_pages = div_u64(block_group->length, SZ_256M);
2452         if (!num_pages)
2453                 num_pages = 1;
2454
2455         num_pages *= 16;
2456         num_pages *= PAGE_SIZE;
2457
2458         ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0,
2459                                           num_pages);
2460         if (ret)
2461                 goto out_put;
2462
2463         ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
2464                                               num_pages, num_pages,
2465                                               &alloc_hint);
2466         /*
2467          * Our cache requires contiguous chunks so that we don't modify a bunch
2468          * of metadata or split extents when writing the cache out, which means
2469          * we can enospc if we are heavily fragmented in addition to just normal
2470          * out of space conditions.  So if we hit this just skip setting up any
2471          * other block groups for this transaction, maybe we'll unpin enough
2472          * space the next time around.
2473          */
2474         if (!ret)
2475                 dcs = BTRFS_DC_SETUP;
2476         else if (ret == -ENOSPC)
2477                 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
2478
2479 out_put:
2480         iput(inode);
2481 out_free:
2482         btrfs_release_path(path);
2483 out:
2484         spin_lock(&block_group->lock);
2485         if (!ret && dcs == BTRFS_DC_SETUP)
2486                 block_group->cache_generation = trans->transid;
2487         block_group->disk_cache_state = dcs;
2488         spin_unlock(&block_group->lock);
2489
2490         extent_changeset_free(data_reserved);
2491         return ret;
2492 }
2493
2494 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
2495 {
2496         struct btrfs_fs_info *fs_info = trans->fs_info;
2497         struct btrfs_block_group *cache, *tmp;
2498         struct btrfs_transaction *cur_trans = trans->transaction;
2499         struct btrfs_path *path;
2500
2501         if (list_empty(&cur_trans->dirty_bgs) ||
2502             !btrfs_test_opt(fs_info, SPACE_CACHE))
2503                 return 0;
2504
2505         path = btrfs_alloc_path();
2506         if (!path)
2507                 return -ENOMEM;
2508
2509         /* Could add new block groups, use _safe just in case */
2510         list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
2511                                  dirty_list) {
2512                 if (cache->disk_cache_state == BTRFS_DC_CLEAR)
2513                         cache_save_setup(cache, trans, path);
2514         }
2515
2516         btrfs_free_path(path);
2517         return 0;
2518 }
2519
2520 /*
2521  * Transaction commit does final block group cache writeback during a critical
2522  * section where nothing is allowed to change the FS.  This is required in
2523  * order for the cache to actually match the block group, but can introduce a
2524  * lot of latency into the commit.
2525  *
2526  * So, btrfs_start_dirty_block_groups is here to kick off block group cache IO.
2527  * There's a chance we'll have to redo some of it if the block group changes
2528  * again during the commit, but it greatly reduces the commit latency by
2529  * getting rid of the easy block groups while we're still allowing others to
2530  * join the commit.
2531  */
2532 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
2533 {
2534         struct btrfs_fs_info *fs_info = trans->fs_info;
2535         struct btrfs_block_group *cache;
2536         struct btrfs_transaction *cur_trans = trans->transaction;
2537         int ret = 0;
2538         int should_put;
2539         struct btrfs_path *path = NULL;
2540         LIST_HEAD(dirty);
2541         struct list_head *io = &cur_trans->io_bgs;
2542         int num_started = 0;
2543         int loops = 0;
2544
2545         spin_lock(&cur_trans->dirty_bgs_lock);
2546         if (list_empty(&cur_trans->dirty_bgs)) {
2547                 spin_unlock(&cur_trans->dirty_bgs_lock);
2548                 return 0;
2549         }
2550         list_splice_init(&cur_trans->dirty_bgs, &dirty);
2551         spin_unlock(&cur_trans->dirty_bgs_lock);
2552
2553 again:
2554         /* Make sure all the block groups on our dirty list actually exist */
2555         btrfs_create_pending_block_groups(trans);
2556
2557         if (!path) {
2558                 path = btrfs_alloc_path();
2559                 if (!path)
2560                         return -ENOMEM;
2561         }
2562
2563         /*
2564          * cache_write_mutex is here only to save us from balance or automatic
2565          * removal of empty block groups deleting this block group while we are
2566          * writing out the cache
2567          */
2568         mutex_lock(&trans->transaction->cache_write_mutex);
2569         while (!list_empty(&dirty)) {
2570                 bool drop_reserve = true;
2571
2572                 cache = list_first_entry(&dirty, struct btrfs_block_group,
2573                                          dirty_list);
2574                 /*
2575                  * This can happen if something re-dirties a block group that
2576                  * is already under IO.  Just wait for it to finish and then do
2577                  * it all again
2578                  */
2579                 if (!list_empty(&cache->io_list)) {
2580                         list_del_init(&cache->io_list);
2581                         btrfs_wait_cache_io(trans, cache, path);
2582                         btrfs_put_block_group(cache);
2583                 }
2584
2585
2586                 /*
2587                  * btrfs_wait_cache_io uses the cache->dirty_list to decide if
2588                  * it should update the cache_state.  Don't delete until after
2589                  * we wait.
2590                  *
2591                  * Since we're not running in the commit critical section
2592                  * we need the dirty_bgs_lock to protect from update_block_group
2593                  */
2594                 spin_lock(&cur_trans->dirty_bgs_lock);
2595                 list_del_init(&cache->dirty_list);
2596                 spin_unlock(&cur_trans->dirty_bgs_lock);
2597
2598                 should_put = 1;
2599
2600                 cache_save_setup(cache, trans, path);
2601
2602                 if (cache->disk_cache_state == BTRFS_DC_SETUP) {
2603                         cache->io_ctl.inode = NULL;
2604                         ret = btrfs_write_out_cache(trans, cache, path);
2605                         if (ret == 0 && cache->io_ctl.inode) {
2606                                 num_started++;
2607                                 should_put = 0;
2608
2609                                 /*
2610                                  * The cache_write_mutex is protecting the
2611                                  * io_list, also refer to the definition of
2612                                  * btrfs_transaction::io_bgs for more details
2613                                  */
2614                                 list_add_tail(&cache->io_list, io);
2615                         } else {
2616                                 /*
2617                                  * If we failed to write the cache, the
2618                                  * generation will be bad and life goes on
2619                                  */
2620                                 ret = 0;
2621                         }
2622                 }
2623                 if (!ret) {
2624                         ret = update_block_group_item(trans, path, cache);
2625                         /*
2626                          * Our block group might still be attached to the list
2627                          * of new block groups in the transaction handle of some
2628                          * other task (struct btrfs_trans_handle->new_bgs). This
2629                          * means its block group item isn't yet in the extent
2630                          * tree. If this happens ignore the error, as we will
2631                          * try again later in the critical section of the
2632                          * transaction commit.
2633                          */
2634                         if (ret == -ENOENT) {
2635                                 ret = 0;
2636                                 spin_lock(&cur_trans->dirty_bgs_lock);
2637                                 if (list_empty(&cache->dirty_list)) {
2638                                         list_add_tail(&cache->dirty_list,
2639                                                       &cur_trans->dirty_bgs);
2640                                         btrfs_get_block_group(cache);
2641                                         drop_reserve = false;
2642                                 }
2643                                 spin_unlock(&cur_trans->dirty_bgs_lock);
2644                         } else if (ret) {
2645                                 btrfs_abort_transaction(trans, ret);
2646                         }
2647                 }
2648
2649                 /* If it's not on the io list, we need to put the block group */
2650                 if (should_put)
2651                         btrfs_put_block_group(cache);
2652                 if (drop_reserve)
2653                         btrfs_delayed_refs_rsv_release(fs_info, 1);
2654
2655                 if (ret)
2656                         break;
2657
2658                 /*
2659                  * Avoid blocking other tasks for too long. It might even save
2660                  * us from writing caches for block groups that are going to be
2661                  * removed.
2662                  */
2663                 mutex_unlock(&trans->transaction->cache_write_mutex);
2664                 mutex_lock(&trans->transaction->cache_write_mutex);
2665         }
2666         mutex_unlock(&trans->transaction->cache_write_mutex);
2667
2668         /*
2669          * Go through delayed refs for all the stuff we've just kicked off
2670          * and then loop back (just once)
2671          */
2672         ret = btrfs_run_delayed_refs(trans, 0);
2673         if (!ret && loops == 0) {
2674                 loops++;
2675                 spin_lock(&cur_trans->dirty_bgs_lock);
2676                 list_splice_init(&cur_trans->dirty_bgs, &dirty);
2677                 /*
2678                  * dirty_bgs_lock protects us from concurrent block group
2679                  * deletes too (not just cache_write_mutex).
2680                  */
2681                 if (!list_empty(&dirty)) {
2682                         spin_unlock(&cur_trans->dirty_bgs_lock);
2683                         goto again;
2684                 }
2685                 spin_unlock(&cur_trans->dirty_bgs_lock);
2686         } else if (ret < 0) {
2687                 btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
2688         }
2689
2690         btrfs_free_path(path);
2691         return ret;
2692 }
2693
2694 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
2695 {
2696         struct btrfs_fs_info *fs_info = trans->fs_info;
2697         struct btrfs_block_group *cache;
2698         struct btrfs_transaction *cur_trans = trans->transaction;
2699         int ret = 0;
2700         int should_put;
2701         struct btrfs_path *path;
2702         struct list_head *io = &cur_trans->io_bgs;
2703         int num_started = 0;
2704
2705         path = btrfs_alloc_path();
2706         if (!path)
2707                 return -ENOMEM;
2708
2709         /*
2710          * Even though we are in the critical section of the transaction commit,
2711          * we can still have concurrent tasks adding elements to this
2712          * transaction's list of dirty block groups. These tasks correspond to
2713          * endio free space workers started when writeback finishes for a
2714          * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
2715          * allocate new block groups as a result of COWing nodes of the root
2716          * tree when updating the free space inode. The writeback for the space
2717          * caches is triggered by an earlier call to
2718          * btrfs_start_dirty_block_groups() and iterations of the following
2719          * loop.
2720          * Also we want to do the cache_save_setup first and then run the
2721          * delayed refs to make sure we have the best chance at doing this all
2722          * in one shot.
2723          */
2724         spin_lock(&cur_trans->dirty_bgs_lock);
2725         while (!list_empty(&cur_trans->dirty_bgs)) {
2726                 cache = list_first_entry(&cur_trans->dirty_bgs,
2727                                          struct btrfs_block_group,
2728                                          dirty_list);
2729
2730                 /*
2731                  * This can happen if cache_save_setup re-dirties a block group
2732                  * that is already under IO.  Just wait for it to finish and
2733                  * then do it all again
2734                  */
2735                 if (!list_empty(&cache->io_list)) {
2736                         spin_unlock(&cur_trans->dirty_bgs_lock);
2737                         list_del_init(&cache->io_list);
2738                         btrfs_wait_cache_io(trans, cache, path);
2739                         btrfs_put_block_group(cache);
2740                         spin_lock(&cur_trans->dirty_bgs_lock);
2741                 }
2742
2743                 /*
2744                  * Don't remove from the dirty list until after we've waited on
2745                  * any pending IO
2746                  */
2747                 list_del_init(&cache->dirty_list);
2748                 spin_unlock(&cur_trans->dirty_bgs_lock);
2749                 should_put = 1;
2750
2751                 cache_save_setup(cache, trans, path);
2752
2753                 if (!ret)
2754                         ret = btrfs_run_delayed_refs(trans,
2755                                                      (unsigned long) -1);
2756
2757                 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
2758                         cache->io_ctl.inode = NULL;
2759                         ret = btrfs_write_out_cache(trans, cache, path);
2760                         if (ret == 0 && cache->io_ctl.inode) {
2761                                 num_started++;
2762                                 should_put = 0;
2763                                 list_add_tail(&cache->io_list, io);
2764                         } else {
2765                                 /*
2766                                  * If we failed to write the cache, the
2767                                  * generation will be bad and life goes on
2768                                  */
2769                                 ret = 0;
2770                         }
2771                 }
2772                 if (!ret) {
2773                         ret = update_block_group_item(trans, path, cache);
2774                         /*
2775                          * One of the free space endio workers might have
2776                          * created a new block group while updating a free space
2777                          * cache's inode (at inode.c:btrfs_finish_ordered_io())
2778                          * and hasn't released its transaction handle yet, in
2779                          * which case the new block group is still attached to
2780                          * its transaction handle and its creation has not
2781                          * finished yet (no block group item in the extent tree
2782                          * yet, etc). If this is the case, wait for all free
2783                          * space endio workers to finish and retry. This is a
2784                          * very rare case so no need for a more efficient and
2785                          * complex approach.
2786                          */
2787                         if (ret == -ENOENT) {
2788                                 wait_event(cur_trans->writer_wait,
2789                                    atomic_read(&cur_trans->num_writers) == 1);
2790                                 ret = update_block_group_item(trans, path, cache);
2791                         }
2792                         if (ret)
2793                                 btrfs_abort_transaction(trans, ret);
2794                 }
2795
2796                 /* If its not on the io list, we need to put the block group */
2797                 if (should_put)
2798                         btrfs_put_block_group(cache);
2799                 btrfs_delayed_refs_rsv_release(fs_info, 1);
2800                 spin_lock(&cur_trans->dirty_bgs_lock);
2801         }
2802         spin_unlock(&cur_trans->dirty_bgs_lock);
2803
2804         /*
2805          * Refer to the definition of io_bgs member for details why it's safe
2806          * to use it without any locking
2807          */
2808         while (!list_empty(io)) {
2809                 cache = list_first_entry(io, struct btrfs_block_group,
2810                                          io_list);
2811                 list_del_init(&cache->io_list);
2812                 btrfs_wait_cache_io(trans, cache, path);
2813                 btrfs_put_block_group(cache);
2814         }
2815
2816         btrfs_free_path(path);
2817         return ret;
2818 }
2819
2820 int btrfs_update_block_group(struct btrfs_trans_handle *trans,
2821                              u64 bytenr, u64 num_bytes, int alloc)
2822 {
2823         struct btrfs_fs_info *info = trans->fs_info;
2824         struct btrfs_block_group *cache = NULL;
2825         u64 total = num_bytes;
2826         u64 old_val;
2827         u64 byte_in_group;
2828         int factor;
2829         int ret = 0;
2830
2831         /* Block accounting for super block */
2832         spin_lock(&info->delalloc_root_lock);
2833         old_val = btrfs_super_bytes_used(info->super_copy);
2834         if (alloc)
2835                 old_val += num_bytes;
2836         else
2837                 old_val -= num_bytes;
2838         btrfs_set_super_bytes_used(info->super_copy, old_val);
2839         spin_unlock(&info->delalloc_root_lock);
2840
2841         while (total) {
2842                 cache = btrfs_lookup_block_group(info, bytenr);
2843                 if (!cache) {
2844                         ret = -ENOENT;
2845                         break;
2846                 }
2847                 factor = btrfs_bg_type_to_factor(cache->flags);
2848
2849                 /*
2850                  * If this block group has free space cache written out, we
2851                  * need to make sure to load it if we are removing space.  This
2852                  * is because we need the unpinning stage to actually add the
2853                  * space back to the block group, otherwise we will leak space.
2854                  */
2855                 if (!alloc && !btrfs_block_group_done(cache))
2856                         btrfs_cache_block_group(cache, 1);
2857
2858                 byte_in_group = bytenr - cache->start;
2859                 WARN_ON(byte_in_group > cache->length);
2860
2861                 spin_lock(&cache->space_info->lock);
2862                 spin_lock(&cache->lock);
2863
2864                 if (btrfs_test_opt(info, SPACE_CACHE) &&
2865                     cache->disk_cache_state < BTRFS_DC_CLEAR)
2866                         cache->disk_cache_state = BTRFS_DC_CLEAR;
2867
2868                 old_val = cache->used;
2869                 num_bytes = min(total, cache->length - byte_in_group);
2870                 if (alloc) {
2871                         old_val += num_bytes;
2872                         cache->used = old_val;
2873                         cache->reserved -= num_bytes;
2874                         cache->space_info->bytes_reserved -= num_bytes;
2875                         cache->space_info->bytes_used += num_bytes;
2876                         cache->space_info->disk_used += num_bytes * factor;
2877                         spin_unlock(&cache->lock);
2878                         spin_unlock(&cache->space_info->lock);
2879                 } else {
2880                         old_val -= num_bytes;
2881                         cache->used = old_val;
2882                         cache->pinned += num_bytes;
2883                         btrfs_space_info_update_bytes_pinned(info,
2884                                         cache->space_info, num_bytes);
2885                         cache->space_info->bytes_used -= num_bytes;
2886                         cache->space_info->disk_used -= num_bytes * factor;
2887                         spin_unlock(&cache->lock);
2888                         spin_unlock(&cache->space_info->lock);
2889
2890                         percpu_counter_add_batch(
2891                                         &cache->space_info->total_bytes_pinned,
2892                                         num_bytes,
2893                                         BTRFS_TOTAL_BYTES_PINNED_BATCH);
2894                         set_extent_dirty(&trans->transaction->pinned_extents,
2895                                          bytenr, bytenr + num_bytes - 1,
2896                                          GFP_NOFS | __GFP_NOFAIL);
2897                 }
2898
2899                 spin_lock(&trans->transaction->dirty_bgs_lock);
2900                 if (list_empty(&cache->dirty_list)) {
2901                         list_add_tail(&cache->dirty_list,
2902                                       &trans->transaction->dirty_bgs);
2903                         trans->delayed_ref_updates++;
2904                         btrfs_get_block_group(cache);
2905                 }
2906                 spin_unlock(&trans->transaction->dirty_bgs_lock);
2907
2908                 /*
2909                  * No longer have used bytes in this block group, queue it for
2910                  * deletion. We do this after adding the block group to the
2911                  * dirty list to avoid races between cleaner kthread and space
2912                  * cache writeout.
2913                  */
2914                 if (!alloc && old_val == 0) {
2915                         if (!btrfs_test_opt(info, DISCARD_ASYNC))
2916                                 btrfs_mark_bg_unused(cache);
2917                 }
2918
2919                 btrfs_put_block_group(cache);
2920                 total -= num_bytes;
2921                 bytenr += num_bytes;
2922         }
2923
2924         /* Modified block groups are accounted for in the delayed_refs_rsv. */
2925         btrfs_update_delayed_refs_rsv(trans);
2926         return ret;
2927 }
2928
2929 /**
2930  * btrfs_add_reserved_bytes - update the block_group and space info counters
2931  * @cache:      The cache we are manipulating
2932  * @ram_bytes:  The number of bytes of file content, and will be same to
2933  *              @num_bytes except for the compress path.
2934  * @num_bytes:  The number of bytes in question
2935  * @delalloc:   The blocks are allocated for the delalloc write
2936  *
2937  * This is called by the allocator when it reserves space. If this is a
2938  * reservation and the block group has become read only we cannot make the
2939  * reservation and return -EAGAIN, otherwise this function always succeeds.
2940  */
2941 int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
2942                              u64 ram_bytes, u64 num_bytes, int delalloc)
2943 {
2944         struct btrfs_space_info *space_info = cache->space_info;
2945         int ret = 0;
2946
2947         spin_lock(&space_info->lock);
2948         spin_lock(&cache->lock);
2949         if (cache->ro) {
2950                 ret = -EAGAIN;
2951         } else {
2952                 cache->reserved += num_bytes;
2953                 space_info->bytes_reserved += num_bytes;
2954                 trace_btrfs_space_reservation(cache->fs_info, "space_info",
2955                                               space_info->flags, num_bytes, 1);
2956                 btrfs_space_info_update_bytes_may_use(cache->fs_info,
2957                                                       space_info, -ram_bytes);
2958                 if (delalloc)
2959                         cache->delalloc_bytes += num_bytes;
2960
2961                 /*
2962                  * Compression can use less space than we reserved, so wake
2963                  * tickets if that happens
2964                  */
2965                 if (num_bytes < ram_bytes)
2966                         btrfs_try_granting_tickets(cache->fs_info, space_info);
2967         }
2968         spin_unlock(&cache->lock);
2969         spin_unlock(&space_info->lock);
2970         return ret;
2971 }
2972
2973 /**
2974  * btrfs_free_reserved_bytes - update the block_group and space info counters
2975  * @cache:      The cache we are manipulating
2976  * @num_bytes:  The number of bytes in question
2977  * @delalloc:   The blocks are allocated for the delalloc write
2978  *
2979  * This is called by somebody who is freeing space that was never actually used
2980  * on disk.  For example if you reserve some space for a new leaf in transaction
2981  * A and before transaction A commits you free that leaf, you call this with
2982  * reserve set to 0 in order to clear the reservation.
2983  */
2984 void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
2985                                u64 num_bytes, int delalloc)
2986 {
2987         struct btrfs_space_info *space_info = cache->space_info;
2988
2989         spin_lock(&space_info->lock);
2990         spin_lock(&cache->lock);
2991         if (cache->ro)
2992                 space_info->bytes_readonly += num_bytes;
2993         cache->reserved -= num_bytes;
2994         space_info->bytes_reserved -= num_bytes;
2995         space_info->max_extent_size = 0;
2996
2997         if (delalloc)
2998                 cache->delalloc_bytes -= num_bytes;
2999         spin_unlock(&cache->lock);
3000
3001         btrfs_try_granting_tickets(cache->fs_info, space_info);
3002         spin_unlock(&space_info->lock);
3003 }
3004
3005 static void force_metadata_allocation(struct btrfs_fs_info *info)
3006 {
3007         struct list_head *head = &info->space_info;
3008         struct btrfs_space_info *found;
3009
3010         list_for_each_entry(found, head, list) {
3011                 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3012                         found->force_alloc = CHUNK_ALLOC_FORCE;
3013         }
3014 }
3015
3016 static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
3017                               struct btrfs_space_info *sinfo, int force)
3018 {
3019         u64 bytes_used = btrfs_space_info_used(sinfo, false);
3020         u64 thresh;
3021
3022         if (force == CHUNK_ALLOC_FORCE)
3023                 return 1;
3024
3025         /*
3026          * in limited mode, we want to have some free space up to
3027          * about 1% of the FS size.
3028          */
3029         if (force == CHUNK_ALLOC_LIMITED) {
3030                 thresh = btrfs_super_total_bytes(fs_info->super_copy);
3031                 thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
3032
3033                 if (sinfo->total_bytes - bytes_used < thresh)
3034                         return 1;
3035         }
3036
3037         if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
3038                 return 0;
3039         return 1;
3040 }
3041
3042 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
3043 {
3044         u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type);
3045
3046         return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
3047 }
3048
3049 /*
3050  * If force is CHUNK_ALLOC_FORCE:
3051  *    - return 1 if it successfully allocates a chunk,
3052  *    - return errors including -ENOSPC otherwise.
3053  * If force is NOT CHUNK_ALLOC_FORCE:
3054  *    - return 0 if it doesn't need to allocate a new chunk,
3055  *    - return 1 if it successfully allocates a chunk,
3056  *    - return errors including -ENOSPC otherwise.
3057  */
3058 int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
3059                       enum btrfs_chunk_alloc_enum force)
3060 {
3061         struct btrfs_fs_info *fs_info = trans->fs_info;
3062         struct btrfs_space_info *space_info;
3063         bool wait_for_alloc = false;
3064         bool should_alloc = false;
3065         int ret = 0;
3066
3067         /* Don't re-enter if we're already allocating a chunk */
3068         if (trans->allocating_chunk)
3069                 return -ENOSPC;
3070
3071         space_info = btrfs_find_space_info(fs_info, flags);
3072         ASSERT(space_info);
3073
3074         do {
3075                 spin_lock(&space_info->lock);
3076                 if (force < space_info->force_alloc)
3077                         force = space_info->force_alloc;
3078                 should_alloc = should_alloc_chunk(fs_info, space_info, force);
3079                 if (space_info->full) {
3080                         /* No more free physical space */
3081                         if (should_alloc)
3082                                 ret = -ENOSPC;
3083                         else
3084                                 ret = 0;
3085                         spin_unlock(&space_info->lock);
3086                         return ret;
3087                 } else if (!should_alloc) {
3088                         spin_unlock(&space_info->lock);
3089                         return 0;
3090                 } else if (space_info->chunk_alloc) {
3091                         /*
3092                          * Someone is already allocating, so we need to block
3093                          * until this someone is finished and then loop to
3094                          * recheck if we should continue with our allocation
3095                          * attempt.
3096                          */
3097                         wait_for_alloc = true;
3098                         spin_unlock(&space_info->lock);
3099                         mutex_lock(&fs_info->chunk_mutex);
3100                         mutex_unlock(&fs_info->chunk_mutex);
3101                 } else {
3102                         /* Proceed with allocation */
3103                         space_info->chunk_alloc = 1;
3104                         wait_for_alloc = false;
3105                         spin_unlock(&space_info->lock);
3106                 }
3107
3108                 cond_resched();
3109         } while (wait_for_alloc);
3110
3111         mutex_lock(&fs_info->chunk_mutex);
3112         trans->allocating_chunk = true;
3113
3114         /*
3115          * If we have mixed data/metadata chunks we want to make sure we keep
3116          * allocating mixed chunks instead of individual chunks.
3117          */
3118         if (btrfs_mixed_space_info(space_info))
3119                 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
3120
3121         /*
3122          * if we're doing a data chunk, go ahead and make sure that
3123          * we keep a reasonable number of metadata chunks allocated in the
3124          * FS as well.
3125          */
3126         if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3127                 fs_info->data_chunk_allocations++;
3128                 if (!(fs_info->data_chunk_allocations %
3129                       fs_info->metadata_ratio))
3130                         force_metadata_allocation(fs_info);
3131         }
3132
3133         /*
3134          * Check if we have enough space in SYSTEM chunk because we may need
3135          * to update devices.
3136          */
3137         check_system_chunk(trans, flags);
3138
3139         ret = btrfs_alloc_chunk(trans, flags);
3140         trans->allocating_chunk = false;
3141
3142         spin_lock(&space_info->lock);
3143         if (ret < 0) {
3144                 if (ret == -ENOSPC)
3145                         space_info->full = 1;
3146                 else
3147                         goto out;
3148         } else {
3149                 ret = 1;
3150                 space_info->max_extent_size = 0;
3151         }
3152
3153         space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3154 out:
3155         space_info->chunk_alloc = 0;
3156         spin_unlock(&space_info->lock);
3157         mutex_unlock(&fs_info->chunk_mutex);
3158         /*
3159          * When we allocate a new chunk we reserve space in the chunk block
3160          * reserve to make sure we can COW nodes/leafs in the chunk tree or
3161          * add new nodes/leafs to it if we end up needing to do it when
3162          * inserting the chunk item and updating device items as part of the
3163          * second phase of chunk allocation, performed by
3164          * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
3165          * large number of new block groups to create in our transaction
3166          * handle's new_bgs list to avoid exhausting the chunk block reserve
3167          * in extreme cases - like having a single transaction create many new
3168          * block groups when starting to write out the free space caches of all
3169          * the block groups that were made dirty during the lifetime of the
3170          * transaction.
3171          */
3172         if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
3173                 btrfs_create_pending_block_groups(trans);
3174
3175         return ret;
3176 }
3177
3178 static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
3179 {
3180         u64 num_dev;
3181
3182         num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;
3183         if (!num_dev)
3184                 num_dev = fs_info->fs_devices->rw_devices;
3185
3186         return num_dev;
3187 }
3188
3189 /*
3190  * Reserve space in the system space for allocating or removing a chunk
3191  */
3192 void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
3193 {
3194         struct btrfs_fs_info *fs_info = trans->fs_info;
3195         struct btrfs_space_info *info;
3196         u64 left;
3197         u64 thresh;
3198         int ret = 0;
3199         u64 num_devs;
3200
3201         /*
3202          * Needed because we can end up allocating a system chunk and for an
3203          * atomic and race free space reservation in the chunk block reserve.
3204          */
3205         lockdep_assert_held(&fs_info->chunk_mutex);
3206
3207         info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3208         spin_lock(&info->lock);
3209         left = info->total_bytes - btrfs_space_info_used(info, true);
3210         spin_unlock(&info->lock);
3211
3212         num_devs = get_profile_num_devs(fs_info, type);
3213
3214         /* num_devs device items to update and 1 chunk item to add or remove */
3215         thresh = btrfs_calc_metadata_size(fs_info, num_devs) +
3216                 btrfs_calc_insert_metadata_size(fs_info, 1);
3217
3218         if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
3219                 btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
3220                            left, thresh, type);
3221                 btrfs_dump_space_info(fs_info, info, 0, 0);
3222         }
3223
3224         if (left < thresh) {
3225                 u64 flags = btrfs_system_alloc_profile(fs_info);
3226
3227                 /*
3228                  * Ignore failure to create system chunk. We might end up not
3229                  * needing it, as we might not need to COW all nodes/leafs from
3230                  * the paths we visit in the chunk tree (they were already COWed
3231                  * or created in the current transaction for example).
3232                  */
3233                 ret = btrfs_alloc_chunk(trans, flags);
3234         }
3235
3236         if (!ret) {
3237                 ret = btrfs_block_rsv_add(fs_info->chunk_root,
3238                                           &fs_info->chunk_block_rsv,
3239                                           thresh, BTRFS_RESERVE_NO_FLUSH);
3240                 if (!ret)
3241                         trans->chunk_bytes_reserved += thresh;
3242         }
3243 }
3244
3245 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
3246 {
3247         struct btrfs_block_group *block_group;
3248         u64 last = 0;
3249
3250         while (1) {
3251                 struct inode *inode;
3252
3253                 block_group = btrfs_lookup_first_block_group(info, last);
3254                 while (block_group) {
3255                         btrfs_wait_block_group_cache_done(block_group);
3256                         spin_lock(&block_group->lock);
3257                         if (block_group->iref)
3258                                 break;
3259                         spin_unlock(&block_group->lock);
3260                         block_group = btrfs_next_block_group(block_group);
3261                 }
3262                 if (!block_group) {
3263                         if (last == 0)
3264                                 break;
3265                         last = 0;
3266                         continue;
3267                 }
3268
3269                 inode = block_group->inode;
3270                 block_group->iref = 0;
3271                 block_group->inode = NULL;
3272                 spin_unlock(&block_group->lock);
3273                 ASSERT(block_group->io_ctl.inode == NULL);
3274                 iput(inode);
3275                 last = block_group->start + block_group->length;
3276                 btrfs_put_block_group(block_group);
3277         }
3278 }
3279
3280 /*
3281  * Must be called only after stopping all workers, since we could have block
3282  * group caching kthreads running, and therefore they could race with us if we
3283  * freed the block groups before stopping them.
3284  */
3285 int btrfs_free_block_groups(struct btrfs_fs_info *info)
3286 {
3287         struct btrfs_block_group *block_group;
3288         struct btrfs_space_info *space_info;
3289         struct btrfs_caching_control *caching_ctl;
3290         struct rb_node *n;
3291
3292         spin_lock(&info->block_group_cache_lock);
3293         while (!list_empty(&info->caching_block_groups)) {
3294                 caching_ctl = list_entry(info->caching_block_groups.next,
3295                                          struct btrfs_caching_control, list);
3296                 list_del(&caching_ctl->list);
3297                 btrfs_put_caching_control(caching_ctl);
3298         }
3299         spin_unlock(&info->block_group_cache_lock);
3300
3301         spin_lock(&info->unused_bgs_lock);
3302         while (!list_empty(&info->unused_bgs)) {
3303                 block_group = list_first_entry(&info->unused_bgs,
3304                                                struct btrfs_block_group,
3305                                                bg_list);
3306                 list_del_init(&block_group->bg_list);
3307                 btrfs_put_block_group(block_group);
3308         }
3309         spin_unlock(&info->unused_bgs_lock);
3310
3311         spin_lock(&info->block_group_cache_lock);
3312         while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
3313                 block_group = rb_entry(n, struct btrfs_block_group,
3314                                        cache_node);
3315                 rb_erase(&block_group->cache_node,
3316                          &info->block_group_cache_tree);
3317                 RB_CLEAR_NODE(&block_group->cache_node);
3318                 spin_unlock(&info->block_group_cache_lock);
3319
3320                 down_write(&block_group->space_info->groups_sem);
3321                 list_del(&block_group->list);
3322                 up_write(&block_group->space_info->groups_sem);
3323
3324                 /*
3325                  * We haven't cached this block group, which means we could
3326                  * possibly have excluded extents on this block group.
3327                  */
3328                 if (block_group->cached == BTRFS_CACHE_NO ||
3329                     block_group->cached == BTRFS_CACHE_ERROR)
3330                         btrfs_free_excluded_extents(block_group);
3331
3332                 btrfs_remove_free_space_cache(block_group);
3333                 ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
3334                 ASSERT(list_empty(&block_group->dirty_list));
3335                 ASSERT(list_empty(&block_group->io_list));
3336                 ASSERT(list_empty(&block_group->bg_list));
3337                 ASSERT(refcount_read(&block_group->refs) == 1);
3338                 btrfs_put_block_group(block_group);
3339
3340                 spin_lock(&info->block_group_cache_lock);
3341         }
3342         spin_unlock(&info->block_group_cache_lock);
3343
3344         btrfs_release_global_block_rsv(info);
3345
3346         while (!list_empty(&info->space_info)) {
3347                 space_info = list_entry(info->space_info.next,
3348                                         struct btrfs_space_info,
3349                                         list);
3350
3351                 /*
3352                  * Do not hide this behind enospc_debug, this is actually
3353                  * important and indicates a real bug if this happens.
3354                  */
3355                 if (WARN_ON(space_info->bytes_pinned > 0 ||
3356                             space_info->bytes_reserved > 0 ||
3357                             space_info->bytes_may_use > 0))
3358                         btrfs_dump_space_info(info, space_info, 0, 0);
3359                 WARN_ON(space_info->reclaim_size > 0);
3360                 list_del(&space_info->list);
3361                 btrfs_sysfs_remove_space_info(space_info);
3362         }
3363         return 0;
3364 }
3365
3366 void btrfs_freeze_block_group(struct btrfs_block_group *cache)
3367 {
3368         atomic_inc(&cache->frozen);
3369 }
3370
3371 void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
3372 {
3373         struct btrfs_fs_info *fs_info = block_group->fs_info;
3374         struct extent_map_tree *em_tree;
3375         struct extent_map *em;
3376         bool cleanup;
3377
3378         spin_lock(&block_group->lock);
3379         cleanup = (atomic_dec_and_test(&block_group->frozen) &&
3380                    block_group->removed);
3381         spin_unlock(&block_group->lock);
3382
3383         if (cleanup) {
3384                 em_tree = &fs_info->mapping_tree;
3385                 write_lock(&em_tree->lock);
3386                 em = lookup_extent_mapping(em_tree, block_group->start,
3387                                            1);
3388                 BUG_ON(!em); /* logic error, can't happen */
3389                 remove_extent_mapping(em_tree, em);
3390                 write_unlock(&em_tree->lock);
3391
3392                 /* once for us and once for the tree */
3393                 free_extent_map(em);
3394                 free_extent_map(em);
3395
3396                 /*
3397                  * We may have left one free space entry and other possible
3398                  * tasks trimming this block group have left 1 entry each one.
3399                  * Free them if any.
3400                  */
3401                 __btrfs_remove_free_space_cache(block_group->free_space_ctl);
3402         }
3403 }