btrfs: use a single variable for return value at lookup_inline_extent_backref()
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007 Oracle.  All rights reserved.
4  */
5
6 #include <linux/sched.h>
7 #include <linux/sched/signal.h>
8 #include <linux/pagemap.h>
9 #include <linux/writeback.h>
10 #include <linux/blkdev.h>
11 #include <linux/sort.h>
12 #include <linux/rcupdate.h>
13 #include <linux/kthread.h>
14 #include <linux/slab.h>
15 #include <linux/ratelimit.h>
16 #include <linux/percpu_counter.h>
17 #include <linux/lockdep.h>
18 #include <linux/crc32c.h>
19 #include "ctree.h"
20 #include "extent-tree.h"
21 #include "tree-log.h"
22 #include "disk-io.h"
23 #include "print-tree.h"
24 #include "volumes.h"
25 #include "raid56.h"
26 #include "locking.h"
27 #include "free-space-cache.h"
28 #include "free-space-tree.h"
29 #include "sysfs.h"
30 #include "qgroup.h"
31 #include "ref-verify.h"
32 #include "space-info.h"
33 #include "block-rsv.h"
34 #include "delalloc-space.h"
35 #include "discard.h"
36 #include "rcu-string.h"
37 #include "zoned.h"
38 #include "dev-replace.h"
39 #include "fs.h"
40 #include "accessors.h"
41 #include "root-tree.h"
42 #include "file-item.h"
43 #include "orphan.h"
44 #include "tree-checker.h"
45
46 #undef SCRAMBLE_DELAYED_REFS
47
48
49 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
50                                struct btrfs_delayed_ref_node *node, u64 parent,
51                                u64 root_objectid, u64 owner_objectid,
52                                u64 owner_offset,
53                                struct btrfs_delayed_extent_op *extra_op);
54 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
55                                     struct extent_buffer *leaf,
56                                     struct btrfs_extent_item *ei);
57 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
58                                       u64 parent, u64 root_objectid,
59                                       u64 flags, u64 owner, u64 offset,
60                                       struct btrfs_key *ins, int ref_mod);
61 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
62                                      struct btrfs_delayed_ref_node *node,
63                                      struct btrfs_delayed_extent_op *extent_op);
64 static int find_next_key(struct btrfs_path *path, int level,
65                          struct btrfs_key *key);
66
67 static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
68 {
69         return (cache->flags & bits) == bits;
70 }
71
72 /* simple helper to search for an existing data extent at a given offset */
73 int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
74 {
75         struct btrfs_root *root = btrfs_extent_root(fs_info, start);
76         int ret;
77         struct btrfs_key key;
78         struct btrfs_path *path;
79
80         path = btrfs_alloc_path();
81         if (!path)
82                 return -ENOMEM;
83
84         key.objectid = start;
85         key.offset = len;
86         key.type = BTRFS_EXTENT_ITEM_KEY;
87         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
88         btrfs_free_path(path);
89         return ret;
90 }
91
92 /*
93  * helper function to lookup reference count and flags of a tree block.
94  *
95  * the head node for delayed refs is used to store the sum of all the
96  * reference count modifications queued up in the rbtree. the head
97  * node may also store the extent flags to set. This way you can check
98  * to see what the reference count and extent flags would be once all
99  * of the queued delayed refs are processed.
100  */
101 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
102                              struct btrfs_fs_info *fs_info, u64 bytenr,
103                              u64 offset, int metadata, u64 *refs, u64 *flags)
104 {
105         struct btrfs_root *extent_root;
106         struct btrfs_delayed_ref_head *head;
107         struct btrfs_delayed_ref_root *delayed_refs;
108         struct btrfs_path *path;
109         struct btrfs_extent_item *ei;
110         struct extent_buffer *leaf;
111         struct btrfs_key key;
112         u32 item_size;
113         u64 num_refs;
114         u64 extent_flags;
115         int ret;
116
117         /*
118          * If we don't have skinny metadata, don't bother doing anything
119          * different
120          */
121         if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
122                 offset = fs_info->nodesize;
123                 metadata = 0;
124         }
125
126         path = btrfs_alloc_path();
127         if (!path)
128                 return -ENOMEM;
129
130         if (!trans) {
131                 path->skip_locking = 1;
132                 path->search_commit_root = 1;
133         }
134
135 search_again:
136         key.objectid = bytenr;
137         key.offset = offset;
138         if (metadata)
139                 key.type = BTRFS_METADATA_ITEM_KEY;
140         else
141                 key.type = BTRFS_EXTENT_ITEM_KEY;
142
143         extent_root = btrfs_extent_root(fs_info, bytenr);
144         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
145         if (ret < 0)
146                 goto out_free;
147
148         if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
149                 if (path->slots[0]) {
150                         path->slots[0]--;
151                         btrfs_item_key_to_cpu(path->nodes[0], &key,
152                                               path->slots[0]);
153                         if (key.objectid == bytenr &&
154                             key.type == BTRFS_EXTENT_ITEM_KEY &&
155                             key.offset == fs_info->nodesize)
156                                 ret = 0;
157                 }
158         }
159
160         if (ret == 0) {
161                 leaf = path->nodes[0];
162                 item_size = btrfs_item_size(leaf, path->slots[0]);
163                 if (item_size >= sizeof(*ei)) {
164                         ei = btrfs_item_ptr(leaf, path->slots[0],
165                                             struct btrfs_extent_item);
166                         num_refs = btrfs_extent_refs(leaf, ei);
167                         extent_flags = btrfs_extent_flags(leaf, ei);
168                 } else {
169                         ret = -EUCLEAN;
170                         btrfs_err(fs_info,
171                         "unexpected extent item size, has %u expect >= %zu",
172                                   item_size, sizeof(*ei));
173                         if (trans)
174                                 btrfs_abort_transaction(trans, ret);
175                         else
176                                 btrfs_handle_fs_error(fs_info, ret, NULL);
177
178                         goto out_free;
179                 }
180
181                 BUG_ON(num_refs == 0);
182         } else {
183                 num_refs = 0;
184                 extent_flags = 0;
185                 ret = 0;
186         }
187
188         if (!trans)
189                 goto out;
190
191         delayed_refs = &trans->transaction->delayed_refs;
192         spin_lock(&delayed_refs->lock);
193         head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
194         if (head) {
195                 if (!mutex_trylock(&head->mutex)) {
196                         refcount_inc(&head->refs);
197                         spin_unlock(&delayed_refs->lock);
198
199                         btrfs_release_path(path);
200
201                         /*
202                          * Mutex was contended, block until it's released and try
203                          * again
204                          */
205                         mutex_lock(&head->mutex);
206                         mutex_unlock(&head->mutex);
207                         btrfs_put_delayed_ref_head(head);
208                         goto search_again;
209                 }
210                 spin_lock(&head->lock);
211                 if (head->extent_op && head->extent_op->update_flags)
212                         extent_flags |= head->extent_op->flags_to_set;
213                 else
214                         BUG_ON(num_refs == 0);
215
216                 num_refs += head->ref_mod;
217                 spin_unlock(&head->lock);
218                 mutex_unlock(&head->mutex);
219         }
220         spin_unlock(&delayed_refs->lock);
221 out:
222         WARN_ON(num_refs == 0);
223         if (refs)
224                 *refs = num_refs;
225         if (flags)
226                 *flags = extent_flags;
227 out_free:
228         btrfs_free_path(path);
229         return ret;
230 }
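
/*
 * Usage sketch (illustrative only, not part of the original file): look
 * up the current reference count of a data extent, including any
 * modifications still queued as delayed refs when a transaction handle
 * is supplied.  The helper name and the "shared" interpretation are
 * assumptions made for this example.
 */
#if 0
static int example_extent_is_shared(struct btrfs_trans_handle *trans,
				    struct btrfs_fs_info *fs_info,
				    u64 bytenr, u64 num_bytes)
{
	u64 refs = 0;
	int ret;

	/* metadata == 0: bytenr/num_bytes describe a data extent */
	ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, num_bytes,
				       0, &refs, NULL);
	if (ret)
		return ret;

	/* More than one reference means the extent is shared. */
	return refs > 1;
}
#endif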
231
232 /*
233  * Back reference rules.  Back refs have three main goals:
234  *
235  * 1) Differentiate between all holders of references to an extent so that
236  *    when a reference is dropped we can make sure it was a valid reference
237  *    before freeing the extent.
238  *
239  * 2) Provide enough information to quickly find the holders of an extent
240  *    if we notice a given block is corrupted or bad.
241  *
242  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
243  *    maintenance.  This is actually the same as #2, but with a slightly
244  *    different use case.
245  *
246  * There are two kinds of back refs. Implicit back refs are optimized
247  * for pointers in non-shared tree blocks. For a given pointer in a block,
248  * back refs of this kind provide information about the block's owner tree
249  * and the pointer's key. This information allows us to find the block by
250  * b-tree searching. Full back refs are for pointers in tree blocks not
251  * referenced by their owner trees. The location of the tree block is
252  * recorded in the back refs. Full back refs are actually generic and can
253  * be used in all cases where implicit back refs are used. The major
254  * shortcoming of full back refs is their overhead. Every time a tree block
255  * gets COWed, we have to update the back ref entries for all pointers in it.
256  *
257  * For a newly allocated tree block, we use implicit back refs for
258  * pointers in it. This means most tree related operations only involve
259  * implicit back refs. For a tree block created in an old transaction, the
260  * only way to drop a reference to it is to COW it. So we can detect the
261  * event that a tree block loses its owner tree's reference and do the
262  * back ref conversion.
263  *
264  * When a tree block is COWed through a tree, there are four cases:
265  *
266  * The reference count of the block is one and the tree is the block's
267  * owner tree. Nothing to do in this case.
268  *
269  * The reference count of the block is one and the tree is not the
270  * block's owner tree. In this case, full back refs are used for pointers
271  * in the block. Remove these full back refs and add implicit back refs for
272  * every pointer in the new block.
273  *
274  * The reference count of the block is greater than one and the tree is
275  * the block's owner tree. In this case, implicit back refs are used for
276  * pointers in the block. Add full back refs for every pointer in the
277  * block and increase the lower level extents' reference counts. The
278  * original implicit back refs are carried over to the new block.
279  *
280  * The reference count of the block is greater than one and the tree is
281  * not the block's owner tree. Add implicit back refs for every pointer in
282  * the new block and increase the lower level extents' reference counts.
283  *
284  * Back reference key composition:
285  *
286  * The key objectid corresponds to the first byte in the extent, and
287  * the key type is used to differentiate between types of back refs.
288  * The meaning of the key offset differs for the different types
289  * of back refs.
290  *
291  * File extents can be referenced by:
292  *
293  * - multiple snapshots, subvolumes, or different generations in one subvol
294  * - different files inside a single subvolume
295  * - different offsets inside a file (bookend extents in file.c)
296  *
297  * The extent ref structure for the implicit back refs has fields for:
298  *
299  * - objectid of the subvolume root
300  * - objectid of the file holding the reference
301  * - original offset in the file
302  * - how many bookend extents
303  *
304  * The key offset for the implicit back refs is a hash of the first
305  * three fields.
306  *
307  * The extent ref structure for the full back refs has a field for:
308  *
309  * - number of pointers in the tree leaf
310  *
311  * The key offset for the full back refs is the first byte of
312  * the tree leaf.
313  *
314  * When a file extent is allocated, implicit back refs are used and
315  * the fields are filled in:
316  *
317  *     (root_key.objectid, inode objectid, offset in file, 1)
318  *
319  * When a file extent is removed by file truncation, we find the
320  * corresponding implicit back refs and check the following fields:
321  *
322  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
323  *
324  * Btree extents can be referenced by:
325  *
326  * - Different subvolumes
327  *
328  * Both the implicit back refs and the full back refs for tree blocks
329  * only consist of a key. The key offset for the implicit back refs is
330  * the objectid of the block's owner tree. The key offset for the full
331  * back refs is the first byte of the parent block.
332  *
333  * When implicit back refs are used, information about the lowest key
334  * and level of the tree block is required. This information is stored
335  * in the tree block info structure.
336  */
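
/*
 * Illustrative example (added for clarity, not part of the original
 * comment; the byte numbers are made up):
 *
 * A file extent at bytenr 13631488, referenced by inode 257 at file
 * offset 0 in subvolume 5, gets an implicit back ref keyed as
 *
 *     (13631488, BTRFS_EXTENT_DATA_REF_KEY, hash(5, 257, 0))
 *
 * If the same extent is instead referenced through a full (shared)
 * back ref from a leaf that starts at bytenr 30408704, the key is
 *
 *     (13631488, BTRFS_SHARED_DATA_REF_KEY, 30408704)
 */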
337
338 /*
339  * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
340  * is_data == BTRFS_REF_TYPE_DATA, data type is required,
341  * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
342  */
343 int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
344                                      struct btrfs_extent_inline_ref *iref,
345                                      enum btrfs_inline_ref_type is_data)
346 {
347         int type = btrfs_extent_inline_ref_type(eb, iref);
348         u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
349
350         if (type == BTRFS_TREE_BLOCK_REF_KEY ||
351             type == BTRFS_SHARED_BLOCK_REF_KEY ||
352             type == BTRFS_SHARED_DATA_REF_KEY ||
353             type == BTRFS_EXTENT_DATA_REF_KEY) {
354                 if (is_data == BTRFS_REF_TYPE_BLOCK) {
355                         if (type == BTRFS_TREE_BLOCK_REF_KEY)
356                                 return type;
357                         if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
358                                 ASSERT(eb->fs_info);
359                                 /*
360                                  * Every shared ref has a parent tree block,
361                                  * which must be aligned to the sector size.
362                                  */
363                                 if (offset &&
364                                     IS_ALIGNED(offset, eb->fs_info->sectorsize))
365                                         return type;
366                         }
367                 } else if (is_data == BTRFS_REF_TYPE_DATA) {
368                         if (type == BTRFS_EXTENT_DATA_REF_KEY)
369                                 return type;
370                         if (type == BTRFS_SHARED_DATA_REF_KEY) {
371                                 ASSERT(eb->fs_info);
372                                 /*
373                                  * Every shared ref has a parent tree block,
374                                  * which must be aligned to the sector size.
375                                  */
376                                 if (offset &&
377                                     IS_ALIGNED(offset, eb->fs_info->sectorsize))
378                                         return type;
379                         }
380                 } else {
381                         ASSERT(is_data == BTRFS_REF_TYPE_ANY);
382                         return type;
383                 }
384         }
385
386         WARN_ON(1);
387         btrfs_print_leaf(eb);
388         btrfs_err(eb->fs_info,
389                   "eb %llu iref 0x%lx invalid extent inline ref type %d",
390                   eb->start, (unsigned long)iref, type);
391
392         return BTRFS_REF_TYPE_INVALID;
393 }
394
395 u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
396 {
397         u32 high_crc = ~(u32)0;
398         u32 low_crc = ~(u32)0;
399         __le64 lenum;
400
401         lenum = cpu_to_le64(root_objectid);
402         high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
403         lenum = cpu_to_le64(owner);
404         low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
405         lenum = cpu_to_le64(offset);
406         low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
407
408         return ((u64)high_crc << 31) ^ (u64)low_crc;
409 }
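
/*
 * Minimal sketch (illustrative only, not part of the original file):
 * compose the key of an implicit data back ref the same way the lookup
 * and insert helpers below do, using hash_extent_data_ref() for the key
 * offset.  The helper name is an assumption made for this example.
 */
#if 0
static void example_data_ref_key(struct btrfs_key *key, u64 bytenr,
				 u64 root_objectid, u64 owner, u64 offset)
{
	key->objectid = bytenr;				/* first byte of the extent */
	key->type = BTRFS_EXTENT_DATA_REF_KEY;		/* implicit data back ref */
	key->offset = hash_extent_data_ref(root_objectid, owner, offset);
}
#endif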
410
411 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
412                                      struct btrfs_extent_data_ref *ref)
413 {
414         return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
415                                     btrfs_extent_data_ref_objectid(leaf, ref),
416                                     btrfs_extent_data_ref_offset(leaf, ref));
417 }
418
419 static int match_extent_data_ref(struct extent_buffer *leaf,
420                                  struct btrfs_extent_data_ref *ref,
421                                  u64 root_objectid, u64 owner, u64 offset)
422 {
423         if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
424             btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
425             btrfs_extent_data_ref_offset(leaf, ref) != offset)
426                 return 0;
427         return 1;
428 }
429
430 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
431                                            struct btrfs_path *path,
432                                            u64 bytenr, u64 parent,
433                                            u64 root_objectid,
434                                            u64 owner, u64 offset)
435 {
436         struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
437         struct btrfs_key key;
438         struct btrfs_extent_data_ref *ref;
439         struct extent_buffer *leaf;
440         u32 nritems;
441         int ret;
442         int recow;
443         int err = -ENOENT;
444
445         key.objectid = bytenr;
446         if (parent) {
447                 key.type = BTRFS_SHARED_DATA_REF_KEY;
448                 key.offset = parent;
449         } else {
450                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
451                 key.offset = hash_extent_data_ref(root_objectid,
452                                                   owner, offset);
453         }
454 again:
455         recow = 0;
456         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
457         if (ret < 0) {
458                 err = ret;
459                 goto fail;
460         }
461
462         if (parent) {
463                 if (!ret)
464                         return 0;
465                 goto fail;
466         }
467
468         leaf = path->nodes[0];
469         nritems = btrfs_header_nritems(leaf);
470         while (1) {
471                 if (path->slots[0] >= nritems) {
472                         ret = btrfs_next_leaf(root, path);
473                         if (ret < 0)
474                                 err = ret;
475                         if (ret)
476                                 goto fail;
477
478                         leaf = path->nodes[0];
479                         nritems = btrfs_header_nritems(leaf);
480                         recow = 1;
481                 }
482
483                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
484                 if (key.objectid != bytenr ||
485                     key.type != BTRFS_EXTENT_DATA_REF_KEY)
486                         goto fail;
487
488                 ref = btrfs_item_ptr(leaf, path->slots[0],
489                                      struct btrfs_extent_data_ref);
490
491                 if (match_extent_data_ref(leaf, ref, root_objectid,
492                                           owner, offset)) {
493                         if (recow) {
494                                 btrfs_release_path(path);
495                                 goto again;
496                         }
497                         err = 0;
498                         break;
499                 }
500                 path->slots[0]++;
501         }
502 fail:
503         return err;
504 }
505
506 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
507                                            struct btrfs_path *path,
508                                            u64 bytenr, u64 parent,
509                                            u64 root_objectid, u64 owner,
510                                            u64 offset, int refs_to_add)
511 {
512         struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
513         struct btrfs_key key;
514         struct extent_buffer *leaf;
515         u32 size;
516         u32 num_refs;
517         int ret;
518
519         key.objectid = bytenr;
520         if (parent) {
521                 key.type = BTRFS_SHARED_DATA_REF_KEY;
522                 key.offset = parent;
523                 size = sizeof(struct btrfs_shared_data_ref);
524         } else {
525                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
526                 key.offset = hash_extent_data_ref(root_objectid,
527                                                   owner, offset);
528                 size = sizeof(struct btrfs_extent_data_ref);
529         }
530
531         ret = btrfs_insert_empty_item(trans, root, path, &key, size);
532         if (ret && ret != -EEXIST)
533                 goto fail;
534
535         leaf = path->nodes[0];
536         if (parent) {
537                 struct btrfs_shared_data_ref *ref;
538                 ref = btrfs_item_ptr(leaf, path->slots[0],
539                                      struct btrfs_shared_data_ref);
540                 if (ret == 0) {
541                         btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
542                 } else {
543                         num_refs = btrfs_shared_data_ref_count(leaf, ref);
544                         num_refs += refs_to_add;
545                         btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
546                 }
547         } else {
548                 struct btrfs_extent_data_ref *ref;
549                 while (ret == -EEXIST) {
550                         ref = btrfs_item_ptr(leaf, path->slots[0],
551                                              struct btrfs_extent_data_ref);
552                         if (match_extent_data_ref(leaf, ref, root_objectid,
553                                                   owner, offset))
554                                 break;
555                         btrfs_release_path(path);
556                         key.offset++;
557                         ret = btrfs_insert_empty_item(trans, root, path, &key,
558                                                       size);
559                         if (ret && ret != -EEXIST)
560                                 goto fail;
561
562                         leaf = path->nodes[0];
563                 }
564                 ref = btrfs_item_ptr(leaf, path->slots[0],
565                                      struct btrfs_extent_data_ref);
566                 if (ret == 0) {
567                         btrfs_set_extent_data_ref_root(leaf, ref,
568                                                        root_objectid);
569                         btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
570                         btrfs_set_extent_data_ref_offset(leaf, ref, offset);
571                         btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
572                 } else {
573                         num_refs = btrfs_extent_data_ref_count(leaf, ref);
574                         num_refs += refs_to_add;
575                         btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
576                 }
577         }
578         btrfs_mark_buffer_dirty(leaf);
579         ret = 0;
580 fail:
581         btrfs_release_path(path);
582         return ret;
583 }
584
585 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
586                                            struct btrfs_root *root,
587                                            struct btrfs_path *path,
588                                            int refs_to_drop)
589 {
590         struct btrfs_key key;
591         struct btrfs_extent_data_ref *ref1 = NULL;
592         struct btrfs_shared_data_ref *ref2 = NULL;
593         struct extent_buffer *leaf;
594         u32 num_refs = 0;
595         int ret = 0;
596
597         leaf = path->nodes[0];
598         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
599
600         if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
601                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
602                                       struct btrfs_extent_data_ref);
603                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
604         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
605                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
606                                       struct btrfs_shared_data_ref);
607                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
608         } else {
609                 btrfs_err(trans->fs_info,
610                           "unrecognized backref key (%llu %u %llu)",
611                           key.objectid, key.type, key.offset);
612                 btrfs_abort_transaction(trans, -EUCLEAN);
613                 return -EUCLEAN;
614         }
615
616         BUG_ON(num_refs < refs_to_drop);
617         num_refs -= refs_to_drop;
618
619         if (num_refs == 0) {
620                 ret = btrfs_del_item(trans, root, path);
621         } else {
622                 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
623                         btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
624                 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
625                         btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
626                 btrfs_mark_buffer_dirty(leaf);
627         }
628         return ret;
629 }
630
631 static noinline u32 extent_data_ref_count(struct btrfs_path *path,
632                                           struct btrfs_extent_inline_ref *iref)
633 {
634         struct btrfs_key key;
635         struct extent_buffer *leaf;
636         struct btrfs_extent_data_ref *ref1;
637         struct btrfs_shared_data_ref *ref2;
638         u32 num_refs = 0;
639         int type;
640
641         leaf = path->nodes[0];
642         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
643
644         if (iref) {
645                 /*
646                  * If type is invalid, we should have bailed out earlier than
647                  * this call.
648                  */
649                 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
650                 ASSERT(type != BTRFS_REF_TYPE_INVALID);
651                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
652                         ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
653                         num_refs = btrfs_extent_data_ref_count(leaf, ref1);
654                 } else {
655                         ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
656                         num_refs = btrfs_shared_data_ref_count(leaf, ref2);
657                 }
658         } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
659                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
660                                       struct btrfs_extent_data_ref);
661                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
662         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
663                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
664                                       struct btrfs_shared_data_ref);
665                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
666         } else {
667                 WARN_ON(1);
668         }
669         return num_refs;
670 }
671
672 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
673                                           struct btrfs_path *path,
674                                           u64 bytenr, u64 parent,
675                                           u64 root_objectid)
676 {
677         struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
678         struct btrfs_key key;
679         int ret;
680
681         key.objectid = bytenr;
682         if (parent) {
683                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
684                 key.offset = parent;
685         } else {
686                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
687                 key.offset = root_objectid;
688         }
689
690         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
691         if (ret > 0)
692                 ret = -ENOENT;
693         return ret;
694 }
695
696 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
697                                           struct btrfs_path *path,
698                                           u64 bytenr, u64 parent,
699                                           u64 root_objectid)
700 {
701         struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
702         struct btrfs_key key;
703         int ret;
704
705         key.objectid = bytenr;
706         if (parent) {
707                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
708                 key.offset = parent;
709         } else {
710                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
711                 key.offset = root_objectid;
712         }
713
714         ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
715         btrfs_release_path(path);
716         return ret;
717 }
718
719 static inline int extent_ref_type(u64 parent, u64 owner)
720 {
721         int type;
722         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
723                 if (parent > 0)
724                         type = BTRFS_SHARED_BLOCK_REF_KEY;
725                 else
726                         type = BTRFS_TREE_BLOCK_REF_KEY;
727         } else {
728                 if (parent > 0)
729                         type = BTRFS_SHARED_DATA_REF_KEY;
730                 else
731                         type = BTRFS_EXTENT_DATA_REF_KEY;
732         }
733         return type;
734 }
735
736 static int find_next_key(struct btrfs_path *path, int level,
737                          struct btrfs_key *key)
738
739 {
740         for (; level < BTRFS_MAX_LEVEL; level++) {
741                 if (!path->nodes[level])
742                         break;
743                 if (path->slots[level] + 1 >=
744                     btrfs_header_nritems(path->nodes[level]))
745                         continue;
746                 if (level == 0)
747                         btrfs_item_key_to_cpu(path->nodes[level], key,
748                                               path->slots[level] + 1);
749                 else
750                         btrfs_node_key_to_cpu(path->nodes[level], key,
751                                               path->slots[level] + 1);
752                 return 0;
753         }
754         return 1;
755 }
756
757 /*
758  * look for an inline back ref. if the back ref is found, *ref_ret is set
759  * to the address of the inline back ref, and 0 is returned.
760  *
761  * if the back ref isn't found, *ref_ret is set to the address where it
762  * should be inserted, and -ENOENT is returned.
763  *
764  * if insert is true and there are too many inline back refs, the path
765  * points to the extent item, and -EAGAIN is returned.
766  *
767  * NOTE: inline back refs are ordered in the same way that back ref
768  *       items in the tree are ordered.
769  */
770 static noinline_for_stack
771 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
772                                  struct btrfs_path *path,
773                                  struct btrfs_extent_inline_ref **ref_ret,
774                                  u64 bytenr, u64 num_bytes,
775                                  u64 parent, u64 root_objectid,
776                                  u64 owner, u64 offset, int insert)
777 {
778         struct btrfs_fs_info *fs_info = trans->fs_info;
779         struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr);
780         struct btrfs_key key;
781         struct extent_buffer *leaf;
782         struct btrfs_extent_item *ei;
783         struct btrfs_extent_inline_ref *iref;
784         u64 flags;
785         u64 item_size;
786         unsigned long ptr;
787         unsigned long end;
788         int extra_size;
789         int type;
790         int want;
791         int ret;
792         bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
793         int needed;
794
795         key.objectid = bytenr;
796         key.type = BTRFS_EXTENT_ITEM_KEY;
797         key.offset = num_bytes;
798
799         want = extent_ref_type(parent, owner);
800         if (insert) {
801                 extra_size = btrfs_extent_inline_ref_size(want);
802                 path->search_for_extension = 1;
803                 path->keep_locks = 1;
804         } else
805                 extra_size = -1;
806
807         /*
808          * Owner is our level, so we can just add one to get the level for the
809          * block we are interested in.
810          */
811         if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
812                 key.type = BTRFS_METADATA_ITEM_KEY;
813                 key.offset = owner;
814         }
815
816 again:
817         ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
818         if (ret < 0)
819                 goto out;
820
821         /*
822          * We may be a newly converted file system which still has the old fat
823          * extent entries for metadata, so try and see if we have one of those.
824          */
825         if (ret > 0 && skinny_metadata) {
826                 skinny_metadata = false;
827                 if (path->slots[0]) {
828                         path->slots[0]--;
829                         btrfs_item_key_to_cpu(path->nodes[0], &key,
830                                               path->slots[0]);
831                         if (key.objectid == bytenr &&
832                             key.type == BTRFS_EXTENT_ITEM_KEY &&
833                             key.offset == num_bytes)
834                                 ret = 0;
835                 }
836                 if (ret) {
837                         key.objectid = bytenr;
838                         key.type = BTRFS_EXTENT_ITEM_KEY;
839                         key.offset = num_bytes;
840                         btrfs_release_path(path);
841                         goto again;
842                 }
843         }
844
845         if (ret && !insert) {
846                 ret = -ENOENT;
847                 goto out;
848         } else if (WARN_ON(ret)) {
849                 btrfs_print_leaf(path->nodes[0]);
850                 btrfs_err(fs_info,
851 "extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu",
852                           bytenr, num_bytes, parent, root_objectid, owner,
853                           offset);
854                 ret = -EIO;
855                 goto out;
856         }
857
858         leaf = path->nodes[0];
859         item_size = btrfs_item_size(leaf, path->slots[0]);
860         if (unlikely(item_size < sizeof(*ei))) {
861                 ret = -EUCLEAN;
862                 btrfs_err(fs_info,
863                           "unexpected extent item size, has %llu expect >= %zu",
864                           item_size, sizeof(*ei));
865                 btrfs_abort_transaction(trans, ret);
866                 goto out;
867         }
868
869         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
870         flags = btrfs_extent_flags(leaf, ei);
871
872         ptr = (unsigned long)(ei + 1);
873         end = (unsigned long)ei + item_size;
874
875         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
876                 ptr += sizeof(struct btrfs_tree_block_info);
877                 BUG_ON(ptr > end);
878         }
879
880         if (owner >= BTRFS_FIRST_FREE_OBJECTID)
881                 needed = BTRFS_REF_TYPE_DATA;
882         else
883                 needed = BTRFS_REF_TYPE_BLOCK;
884
885         ret = -ENOENT;
886         while (1) {
887                 if (ptr >= end) {
888                         if (ptr > end) {
889                                 ret = -EUCLEAN;
890                                 btrfs_print_leaf(path->nodes[0]);
891                                 btrfs_crit(fs_info,
892 "overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu",
893                                         path->slots[0], root_objectid, owner, offset, parent);
894                         }
895                         break;
896                 }
897                 iref = (struct btrfs_extent_inline_ref *)ptr;
898                 type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
899                 if (type == BTRFS_REF_TYPE_INVALID) {
900                         ret = -EUCLEAN;
901                         goto out;
902                 }
903
904                 if (want < type)
905                         break;
906                 if (want > type) {
907                         ptr += btrfs_extent_inline_ref_size(type);
908                         continue;
909                 }
910
911                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
912                         struct btrfs_extent_data_ref *dref;
913                         dref = (struct btrfs_extent_data_ref *)(&iref->offset);
914                         if (match_extent_data_ref(leaf, dref, root_objectid,
915                                                   owner, offset)) {
916                                 ret = 0;
917                                 break;
918                         }
919                         if (hash_extent_data_ref_item(leaf, dref) <
920                             hash_extent_data_ref(root_objectid, owner, offset))
921                                 break;
922                 } else {
923                         u64 ref_offset;
924                         ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
925                         if (parent > 0) {
926                                 if (parent == ref_offset) {
927                                         ret = 0;
928                                         break;
929                                 }
930                                 if (ref_offset < parent)
931                                         break;
932                         } else {
933                                 if (root_objectid == ref_offset) {
934                                         ret = 0;
935                                         break;
936                                 }
937                                 if (ref_offset < root_objectid)
938                                         break;
939                         }
940                 }
941                 ptr += btrfs_extent_inline_ref_size(type);
942         }
943         if (ret == -ENOENT && insert) {
944                 if (item_size + extra_size >=
945                     BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
946                         ret = -EAGAIN;
947                         goto out;
948                 }
949                 /*
950                  * To add a new inline back ref, we have to make sure
951                  * there is no corresponding back ref item.
952                  * For simplicity, we just do not add a new inline back
953                  * ref if there is any kind of item for this block.
954                  */
955                 if (find_next_key(path, 0, &key) == 0 &&
956                     key.objectid == bytenr &&
957                     key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
958                         ret = -EAGAIN;
959                         goto out;
960                 }
961         }
962         *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
963 out:
964         if (insert) {
965                 path->keep_locks = 0;
966                 path->search_for_extension = 0;
967                 btrfs_unlock_up_safe(path, 1);
968         }
969         return ret;
970 }
971
972 /*
973  * helper to add a new inline back ref
974  */
975 static noinline_for_stack
976 void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
977                                  struct btrfs_path *path,
978                                  struct btrfs_extent_inline_ref *iref,
979                                  u64 parent, u64 root_objectid,
980                                  u64 owner, u64 offset, int refs_to_add,
981                                  struct btrfs_delayed_extent_op *extent_op)
982 {
983         struct extent_buffer *leaf;
984         struct btrfs_extent_item *ei;
985         unsigned long ptr;
986         unsigned long end;
987         unsigned long item_offset;
988         u64 refs;
989         int size;
990         int type;
991
992         leaf = path->nodes[0];
993         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
994         item_offset = (unsigned long)iref - (unsigned long)ei;
995
996         type = extent_ref_type(parent, owner);
997         size = btrfs_extent_inline_ref_size(type);
998
999         btrfs_extend_item(path, size);
1000
1001         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1002         refs = btrfs_extent_refs(leaf, ei);
1003         refs += refs_to_add;
1004         btrfs_set_extent_refs(leaf, ei, refs);
1005         if (extent_op)
1006                 __run_delayed_extent_op(extent_op, leaf, ei);
1007
1008         ptr = (unsigned long)ei + item_offset;
1009         end = (unsigned long)ei + btrfs_item_size(leaf, path->slots[0]);
1010         if (ptr < end - size)
1011                 memmove_extent_buffer(leaf, ptr + size, ptr,
1012                                       end - size - ptr);
1013
1014         iref = (struct btrfs_extent_inline_ref *)ptr;
1015         btrfs_set_extent_inline_ref_type(leaf, iref, type);
1016         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1017                 struct btrfs_extent_data_ref *dref;
1018                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1019                 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1020                 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1021                 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1022                 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1023         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1024                 struct btrfs_shared_data_ref *sref;
1025                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1026                 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1027                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1028         } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1029                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1030         } else {
1031                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1032         }
1033         btrfs_mark_buffer_dirty(leaf);
1034 }
1035
1036 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1037                                  struct btrfs_path *path,
1038                                  struct btrfs_extent_inline_ref **ref_ret,
1039                                  u64 bytenr, u64 num_bytes, u64 parent,
1040                                  u64 root_objectid, u64 owner, u64 offset)
1041 {
1042         int ret;
1043
1044         ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
1045                                            num_bytes, parent, root_objectid,
1046                                            owner, offset, 0);
1047         if (ret != -ENOENT)
1048                 return ret;
1049
1050         btrfs_release_path(path);
1051         *ref_ret = NULL;
1052
1053         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1054                 ret = lookup_tree_block_ref(trans, path, bytenr, parent,
1055                                             root_objectid);
1056         } else {
1057                 ret = lookup_extent_data_ref(trans, path, bytenr, parent,
1058                                              root_objectid, owner, offset);
1059         }
1060         return ret;
1061 }
1062
1063 /*
1064  * helper to update/remove inline back ref
1065  */
1066 static noinline_for_stack int update_inline_extent_backref(struct btrfs_path *path,
1067                                   struct btrfs_extent_inline_ref *iref,
1068                                   int refs_to_mod,
1069                                   struct btrfs_delayed_extent_op *extent_op)
1070 {
1071         struct extent_buffer *leaf = path->nodes[0];
1072         struct btrfs_fs_info *fs_info = leaf->fs_info;
1073         struct btrfs_extent_item *ei;
1074         struct btrfs_extent_data_ref *dref = NULL;
1075         struct btrfs_shared_data_ref *sref = NULL;
1076         unsigned long ptr;
1077         unsigned long end;
1078         u32 item_size;
1079         int size;
1080         int type;
1081         u64 refs;
1082
1083         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1084         refs = btrfs_extent_refs(leaf, ei);
1085         if (unlikely(refs_to_mod < 0 && refs + refs_to_mod <= 0)) {
1086                 struct btrfs_key key;
1087                 u32 extent_size;
1088
1089                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1090                 if (key.type == BTRFS_METADATA_ITEM_KEY)
1091                         extent_size = fs_info->nodesize;
1092                 else
1093                         extent_size = key.offset;
1094                 btrfs_print_leaf(leaf);
1095                 btrfs_err(fs_info,
1096         "invalid refs_to_mod for extent %llu num_bytes %u, has %d expect >= -%llu",
1097                           key.objectid, extent_size, refs_to_mod, refs);
1098                 return -EUCLEAN;
1099         }
1100         refs += refs_to_mod;
1101         btrfs_set_extent_refs(leaf, ei, refs);
1102         if (extent_op)
1103                 __run_delayed_extent_op(extent_op, leaf, ei);
1104
1105         type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
1106         /*
1107          * Function btrfs_get_extent_inline_ref_type() has already printed
1108          * error messages.
1109          */
1110         if (unlikely(type == BTRFS_REF_TYPE_INVALID))
1111                 return -EUCLEAN;
1112
1113         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1114                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1115                 refs = btrfs_extent_data_ref_count(leaf, dref);
1116         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1117                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1118                 refs = btrfs_shared_data_ref_count(leaf, sref);
1119         } else {
1120                 refs = 1;
1121                 /*
1122                  * For a tree block we can only drop one ref at a time, and
1123                  * tree blocks should not have refs > 1.
1124                  *
1125                  * Furthermore if we're inserting a new inline backref, we
1126                  * won't reach this path either. That would be
1127                  * setup_inline_extent_backref().
1128                  */
1129                 if (unlikely(refs_to_mod != -1)) {
1130                         struct btrfs_key key;
1131
1132                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1133
1134                         btrfs_print_leaf(leaf);
1135                         btrfs_err(fs_info,
1136                         "invalid refs_to_mod for tree block %llu, has %d expect -1",
1137                                   key.objectid, refs_to_mod);
1138                         return -EUCLEAN;
1139                 }
1140         }
1141
1142         if (unlikely(refs_to_mod < 0 && refs < -refs_to_mod)) {
1143                 struct btrfs_key key;
1144                 u32 extent_size;
1145
1146                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1147                 if (key.type == BTRFS_METADATA_ITEM_KEY)
1148                         extent_size = fs_info->nodesize;
1149                 else
1150                         extent_size = key.offset;
1151                 btrfs_print_leaf(leaf);
1152                 btrfs_err(fs_info,
1153 "invalid refs_to_mod for backref entry, iref %lu extent %llu num_bytes %u, has %d expect >= -%llu",
1154                           (unsigned long)iref, key.objectid, extent_size,
1155                           refs_to_mod, refs);
1156                 return -EUCLEAN;
1157         }
1158         refs += refs_to_mod;
1159
1160         if (refs > 0) {
1161                 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1162                         btrfs_set_extent_data_ref_count(leaf, dref, refs);
1163                 else
1164                         btrfs_set_shared_data_ref_count(leaf, sref, refs);
1165         } else {
1166                 size =  btrfs_extent_inline_ref_size(type);
1167                 item_size = btrfs_item_size(leaf, path->slots[0]);
1168                 ptr = (unsigned long)iref;
1169                 end = (unsigned long)ei + item_size;
1170                 if (ptr + size < end)
1171                         memmove_extent_buffer(leaf, ptr, ptr + size,
1172                                               end - ptr - size);
1173                 item_size -= size;
1174                 btrfs_truncate_item(path, item_size, 1);
1175         }
1176         btrfs_mark_buffer_dirty(leaf);
1177         return 0;
1178 }
1179
1180 static noinline_for_stack
1181 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1182                                  struct btrfs_path *path,
1183                                  u64 bytenr, u64 num_bytes, u64 parent,
1184                                  u64 root_objectid, u64 owner,
1185                                  u64 offset, int refs_to_add,
1186                                  struct btrfs_delayed_extent_op *extent_op)
1187 {
1188         struct btrfs_extent_inline_ref *iref;
1189         int ret;
1190
1191         ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
1192                                            num_bytes, parent, root_objectid,
1193                                            owner, offset, 1);
1194         if (ret == 0) {
1195                 /*
1196                  * We're adding refs to a tree block we already own, this
1197                  * should not happen at all.
1198                  */
1199                 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1200                         btrfs_print_leaf(path->nodes[0]);
1201                         btrfs_crit(trans->fs_info,
1202 "adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu slot %u",
1203                                    bytenr, num_bytes, root_objectid, path->slots[0]);
1204                         return -EUCLEAN;
1205                 }
1206                 ret = update_inline_extent_backref(path, iref, refs_to_add, extent_op);
1207         } else if (ret == -ENOENT) {
1208                 setup_inline_extent_backref(trans->fs_info, path, iref, parent,
1209                                             root_objectid, owner, offset,
1210                                             refs_to_add, extent_op);
1211                 ret = 0;
1212         }
1213         return ret;
1214 }
1215
1216 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1217                                  struct btrfs_root *root,
1218                                  struct btrfs_path *path,
1219                                  struct btrfs_extent_inline_ref *iref,
1220                                  int refs_to_drop, int is_data)
1221 {
1222         int ret = 0;
1223
1224         BUG_ON(!is_data && refs_to_drop != 1);
1225         if (iref)
1226                 ret = update_inline_extent_backref(path, iref, -refs_to_drop, NULL);
1227         else if (is_data)
1228                 ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1229         else
1230                 ret = btrfs_del_item(trans, root, path);
1231         return ret;
1232 }
1233
1234 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
1235                                u64 *discarded_bytes)
1236 {
1237         int j, ret = 0;
1238         u64 bytes_left, end;
1239         u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT);
1240
1241         if (WARN_ON(start != aligned_start)) {
1242                 len -= aligned_start - start;
1243                 len = round_down(len, 1 << SECTOR_SHIFT);
1244                 start = aligned_start;
1245         }
1246
1247         *discarded_bytes = 0;
1248
1249         if (!len)
1250                 return 0;
1251
1252         end = start + len;
1253         bytes_left = len;
1254
1255         /* Skip any superblocks on this device. */
1256         for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
1257                 u64 sb_start = btrfs_sb_offset(j);
1258                 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
1259                 u64 size = sb_start - start;
1260
1261                 if (!in_range(sb_start, start, bytes_left) &&
1262                     !in_range(sb_end, start, bytes_left) &&
1263                     !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
1264                         continue;
1265
1266                 /*
1267                  * Superblock spans beginning of range.  Adjust start and
1268                  * try again.
1269                  */
1270                 if (sb_start <= start) {
1271                         start += sb_end - start;
1272                         if (start > end) {
1273                                 bytes_left = 0;
1274                                 break;
1275                         }
1276                         bytes_left = end - start;
1277                         continue;
1278                 }
1279
1280                 if (size) {
1281                         ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
1282                                                    size >> SECTOR_SHIFT,
1283                                                    GFP_NOFS);
1284                         if (!ret)
1285                                 *discarded_bytes += size;
1286                         else if (ret != -EOPNOTSUPP)
1287                                 return ret;
1288                 }
1289
1290                 start = sb_end;
1291                 if (start > end) {
1292                         bytes_left = 0;
1293                         break;
1294                 }
1295                 bytes_left = end - start;
1296         }
1297
1298         if (bytes_left) {
1299                 ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
1300                                            bytes_left >> SECTOR_SHIFT,
1301                                            GFP_NOFS);
1302                 if (!ret)
1303                         *discarded_bytes += bytes_left;
1304         }
1305         return ret;
1306 }
1307
1308 static int do_discard_extent(struct btrfs_discard_stripe *stripe, u64 *bytes)
1309 {
1310         struct btrfs_device *dev = stripe->dev;
1311         struct btrfs_fs_info *fs_info = dev->fs_info;
1312         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
1313         u64 phys = stripe->physical;
1314         u64 len = stripe->length;
1315         u64 discarded = 0;
1316         int ret = 0;
1317
1318         /* Zone reset on a zoned filesystem */
1319         if (btrfs_can_zone_reset(dev, phys, len)) {
1320                 u64 src_disc;
1321
1322                 ret = btrfs_reset_device_zone(dev, phys, len, &discarded);
1323                 if (ret)
1324                         goto out;
1325
1326                 if (!btrfs_dev_replace_is_ongoing(dev_replace) ||
1327                     dev != dev_replace->srcdev)
1328                         goto out;
1329
1330                 src_disc = discarded;
1331
1332                 /* Send to replace target as well */
1333                 ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len,
1334                                               &discarded);
1335                 discarded += src_disc;
1336         } else if (bdev_max_discard_sectors(stripe->dev->bdev)) {
1337                 ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded);
1338         } else {
1339                 ret = 0;
1340                 *bytes = 0;
1341         }
1342
1343 out:
1344         *bytes = discarded;
1345         return ret;
1346 }
1347
1348 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
1349                          u64 num_bytes, u64 *actual_bytes)
1350 {
1351         int ret = 0;
1352         u64 discarded_bytes = 0;
1353         u64 end = bytenr + num_bytes;
1354         u64 cur = bytenr;
1355
1356         /*
1357          * Avoid races with device replace and make sure the devices in the
1358          * stripes don't go away while we are discarding.
1359          */
1360         btrfs_bio_counter_inc_blocked(fs_info);
1361         while (cur < end) {
1362                 struct btrfs_discard_stripe *stripes;
1363                 unsigned int num_stripes;
1364                 int i;
1365
1366                 num_bytes = end - cur;
1367                 stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes);
1368                 if (IS_ERR(stripes)) {
1369                         ret = PTR_ERR(stripes);
1370                         if (ret == -EOPNOTSUPP)
1371                                 ret = 0;
1372                         break;
1373                 }
1374
1375                 for (i = 0; i < num_stripes; i++) {
1376                         struct btrfs_discard_stripe *stripe = stripes + i;
1377                         u64 bytes;
1378
1379                         if (!stripe->dev->bdev) {
1380                                 ASSERT(btrfs_test_opt(fs_info, DEGRADED));
1381                                 continue;
1382                         }
1383
1384                         if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
1385                                         &stripe->dev->dev_state))
1386                                 continue;
1387
1388                         ret = do_discard_extent(stripe, &bytes);
1389                         if (ret) {
1390                                 /*
1391                                  * Keep going if discard is not supported by the
1392                                  * device.
1393                                  */
1394                                 if (ret != -EOPNOTSUPP)
1395                                         break;
1396                                 ret = 0;
1397                         } else {
1398                                 discarded_bytes += bytes;
1399                         }
1400                 }
1401                 kfree(stripes);
1402                 if (ret)
1403                         break;
1404                 cur += num_bytes;
1405         }
1406         btrfs_bio_counter_dec(fs_info);
1407         if (actual_bytes)
1408                 *actual_bytes = discarded_bytes;
1409         return ret;
1410 }
1411
1412 /* Can return -ENOMEM */
1413 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1414                          struct btrfs_ref *generic_ref)
1415 {
1416         struct btrfs_fs_info *fs_info = trans->fs_info;
1417         int ret;
1418
1419         ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
1420                generic_ref->action);
1421         BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
1422                generic_ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID);
1423
1424         if (generic_ref->type == BTRFS_REF_METADATA)
1425                 ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
1426         else
1427                 ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0);
1428
1429         btrfs_ref_tree_mod(fs_info, generic_ref);
1430
1431         return ret;
1432 }
1433
1434 /*
1435  * Insert backreference for a given extent.
1436  *
1437  * The counterpart is in __btrfs_free_extent(), with examples and more details
1438  * on how it works.
1439  *
1440  * @trans:          Handle of transaction
1441  *
1442  * @node:           The delayed ref node used to get the bytenr/length for
1443  *                  the extent whose references are incremented.
1444  *
1445  * @parent:         If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
1446  *                  BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
1447  *                  bytenr of the parent block. Since new extents are always
1448  *                  created with indirect references, this will only be the case
1449  *                  when relocating a shared extent. In that case, root_objectid
1450  *                  will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must
1451  *                  be 0
1452  *
1453  * @root_objectid:  The id of the root where this modification has originated,
1454  *                  this can be either one of the well-known metadata trees or
1455  *                  the subvolume id which references this extent.
1456  *
1457  * @owner:          For data extents it is the inode number of the owning file.
1458  *                  For metadata extents this parameter holds the level in the
1459  *                  tree of the extent.
1460  *
1461  * @offset:         For metadata extents the offset is ignored and is currently
1462  *                  always passed as 0. For data extents it is the file offset
1463  *                  this extent belongs to.
1464  *
1465  * @extent_op:      Pointer to a structure holding information necessary when
1466  *                  updating a tree block's flags
1467  *
1468  */
1469 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1470                                   struct btrfs_delayed_ref_node *node,
1471                                   u64 parent, u64 root_objectid,
1472                                   u64 owner, u64 offset,
1473                                   struct btrfs_delayed_extent_op *extent_op)
1474 {
1475         struct btrfs_path *path;
1476         struct extent_buffer *leaf;
1477         struct btrfs_extent_item *item;
1478         struct btrfs_key key;
1479         u64 bytenr = node->bytenr;
1480         u64 num_bytes = node->num_bytes;
1481         u64 refs;
1482         int refs_to_add = node->ref_mod;
1483         int ret;
1484
1485         path = btrfs_alloc_path();
1486         if (!path)
1487                 return -ENOMEM;
1488
1489         /* this will set up the path even if it fails to insert the back ref */
1490         ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
1491                                            parent, root_objectid, owner,
1492                                            offset, refs_to_add, extent_op);
1493         if ((ret < 0 && ret != -EAGAIN) || !ret)
1494                 goto out;
1495
1496         /*
1497          * Ok we had -EAGAIN which means we didn't have space to insert an
1498          * inline extent ref, so just update the reference count and add a
1499          * normal backref.
1500          */
1501         leaf = path->nodes[0];
1502         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1503         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1504         refs = btrfs_extent_refs(leaf, item);
1505         btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1506         if (extent_op)
1507                 __run_delayed_extent_op(extent_op, leaf, item);
1508
1509         btrfs_mark_buffer_dirty(leaf);
1510         btrfs_release_path(path);
1511
1512         /* now insert the actual backref */
1513         if (owner < BTRFS_FIRST_FREE_OBJECTID)
1514                 ret = insert_tree_block_ref(trans, path, bytenr, parent,
1515                                             root_objectid);
1516         else
1517                 ret = insert_extent_data_ref(trans, path, bytenr, parent,
1518                                              root_objectid, owner, offset,
1519                                              refs_to_add);
1520
1521         if (ret)
1522                 btrfs_abort_transaction(trans, ret);
1523 out:
1524         btrfs_free_path(path);
1525         return ret;
1526 }
1527
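     /*
      * Run a single delayed data ref: insert the reserved extent item for a
      * newly allocated extent, or add/drop a backref of an existing one,
      * depending on the node's action.
      */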
1528 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1529                                 struct btrfs_delayed_ref_node *node,
1530                                 struct btrfs_delayed_extent_op *extent_op,
1531                                 bool insert_reserved)
1532 {
1533         int ret = 0;
1534         struct btrfs_delayed_data_ref *ref;
1535         u64 parent = 0;
1536         u64 flags = 0;
1537
1538         ref = btrfs_delayed_node_to_data_ref(node);
1539         trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
1540
1541         if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1542                 parent = ref->parent;
1543
1544         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1545                 struct btrfs_key key;
1546
1547                 if (extent_op)
1548                         flags |= extent_op->flags_to_set;
1549
1550                 key.objectid = node->bytenr;
1551                 key.type = BTRFS_EXTENT_ITEM_KEY;
1552                 key.offset = node->num_bytes;
1553
1554                 ret = alloc_reserved_file_extent(trans, parent, ref->root,
1555                                                  flags, ref->objectid,
1556                                                  ref->offset, &key,
1557                                                  node->ref_mod);
1558         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1559                 ret = __btrfs_inc_extent_ref(trans, node, parent, ref->root,
1560                                              ref->objectid, ref->offset,
1561                                              extent_op);
1562         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1563                 ret = __btrfs_free_extent(trans, node, parent,
1564                                           ref->root, ref->objectid,
1565                                           ref->offset, extent_op);
1566         } else {
1567                 BUG();
1568         }
1569         return ret;
1570 }
1571
1572 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
1573                                     struct extent_buffer *leaf,
1574                                     struct btrfs_extent_item *ei)
1575 {
1576         u64 flags = btrfs_extent_flags(leaf, ei);
1577         if (extent_op->update_flags) {
1578                 flags |= extent_op->flags_to_set;
1579                 btrfs_set_extent_flags(leaf, ei, flags);
1580         }
1581
1582         if (extent_op->update_key) {
1583                 struct btrfs_tree_block_info *bi;
1584                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
1585                 bi = (struct btrfs_tree_block_info *)(ei + 1);
1586                 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
1587         }
1588 }
1589
1590 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
1591                                  struct btrfs_delayed_ref_head *head,
1592                                  struct btrfs_delayed_extent_op *extent_op)
1593 {
1594         struct btrfs_fs_info *fs_info = trans->fs_info;
1595         struct btrfs_root *root;
1596         struct btrfs_key key;
1597         struct btrfs_path *path;
1598         struct btrfs_extent_item *ei;
1599         struct extent_buffer *leaf;
1600         u32 item_size;
1601         int ret;
1602         int metadata = 1;
1603
1604         if (TRANS_ABORTED(trans))
1605                 return 0;
1606
1607         if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
1608                 metadata = 0;
1609
1610         path = btrfs_alloc_path();
1611         if (!path)
1612                 return -ENOMEM;
1613
1614         key.objectid = head->bytenr;
1615
1616         if (metadata) {
1617                 key.type = BTRFS_METADATA_ITEM_KEY;
1618                 key.offset = extent_op->level;
1619         } else {
1620                 key.type = BTRFS_EXTENT_ITEM_KEY;
1621                 key.offset = head->num_bytes;
1622         }
1623
1624         root = btrfs_extent_root(fs_info, key.objectid);
1625 again:
1626         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1627         if (ret < 0) {
1628                 goto out;
1629         } else if (ret > 0) {
1630                 if (metadata) {
1631                         if (path->slots[0] > 0) {
1632                                 path->slots[0]--;
1633                                 btrfs_item_key_to_cpu(path->nodes[0], &key,
1634                                                       path->slots[0]);
1635                                 if (key.objectid == head->bytenr &&
1636                                     key.type == BTRFS_EXTENT_ITEM_KEY &&
1637                                     key.offset == head->num_bytes)
1638                                         ret = 0;
1639                         }
1640                         if (ret > 0) {
1641                                 btrfs_release_path(path);
1642                                 metadata = 0;
1643
1644                                 key.objectid = head->bytenr;
1645                                 key.offset = head->num_bytes;
1646                                 key.type = BTRFS_EXTENT_ITEM_KEY;
1647                                 goto again;
1648                         }
1649                 } else {
1650                         ret = -EUCLEAN;
1651                         btrfs_err(fs_info,
1652                   "missing extent item for extent %llu num_bytes %llu level %d",
1653                                   head->bytenr, head->num_bytes, extent_op->level);
1654                         goto out;
1655                 }
1656         }
1657
1658         leaf = path->nodes[0];
1659         item_size = btrfs_item_size(leaf, path->slots[0]);
1660
1661         if (unlikely(item_size < sizeof(*ei))) {
1662                 ret = -EUCLEAN;
1663                 btrfs_err(fs_info,
1664                           "unexpected extent item size, has %u expect >= %zu",
1665                           item_size, sizeof(*ei));
1666                 btrfs_abort_transaction(trans, ret);
1667                 goto out;
1668         }
1669
1670         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1671         __run_delayed_extent_op(extent_op, leaf, ei);
1672
1673         btrfs_mark_buffer_dirty(leaf);
1674 out:
1675         btrfs_free_path(path);
1676         return ret;
1677 }
1678
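     /*
      * Run a single delayed tree block ref.  Tree blocks are always modified
      * one reference at a time, so a ref_mod other than 1 is treated as
      * corruption.
      */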
1679 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
1680                                 struct btrfs_delayed_ref_node *node,
1681                                 struct btrfs_delayed_extent_op *extent_op,
1682                                 bool insert_reserved)
1683 {
1684         int ret = 0;
1685         struct btrfs_delayed_tree_ref *ref;
1686         u64 parent = 0;
1687         u64 ref_root = 0;
1688
1689         ref = btrfs_delayed_node_to_tree_ref(node);
1690         trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);
1691
1692         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1693                 parent = ref->parent;
1694         ref_root = ref->root;
1695
1696         if (unlikely(node->ref_mod != 1)) {
1697                 btrfs_err(trans->fs_info,
1698         "btree block %llu has %d references rather than 1: action %d ref_root %llu parent %llu",
1699                           node->bytenr, node->ref_mod, node->action, ref_root,
1700                           parent);
1701                 return -EUCLEAN;
1702         }
1703         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1704                 BUG_ON(!extent_op || !extent_op->update_flags);
1705                 ret = alloc_reserved_tree_block(trans, node, extent_op);
1706         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1707                 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
1708                                              ref->level, 0, extent_op);
1709         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1710                 ret = __btrfs_free_extent(trans, node, parent, ref_root,
1711                                           ref->level, 0, extent_op);
1712         } else {
1713                 BUG();
1714         }
1715         return ret;
1716 }
1717
1718 /* helper function to actually process a single delayed ref entry */
1719 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1720                                struct btrfs_delayed_ref_node *node,
1721                                struct btrfs_delayed_extent_op *extent_op,
1722                                bool insert_reserved)
1723 {
1724         int ret = 0;
1725
1726         if (TRANS_ABORTED(trans)) {
1727                 if (insert_reserved)
1728                         btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
1729                 return 0;
1730         }
1731
1732         if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
1733             node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1734                 ret = run_delayed_tree_ref(trans, node, extent_op,
1735                                            insert_reserved);
1736         else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
1737                  node->type == BTRFS_SHARED_DATA_REF_KEY)
1738                 ret = run_delayed_data_ref(trans, node, extent_op,
1739                                            insert_reserved);
1740         else
1741                 BUG();
1742         if (ret && insert_reserved)
1743                 btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
1744         if (ret < 0)
1745                 btrfs_err(trans->fs_info,
1746 "failed to run delayed ref for logical %llu num_bytes %llu type %u action %u ref_mod %d: %d",
1747                           node->bytenr, node->num_bytes, node->type,
1748                           node->action, node->ref_mod, ret);
1749         return ret;
1750 }
1751
1752 static inline struct btrfs_delayed_ref_node *
1753 select_delayed_ref(struct btrfs_delayed_ref_head *head)
1754 {
1755         struct btrfs_delayed_ref_node *ref;
1756
1757         if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
1758                 return NULL;
1759
1760         /*
1761          * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
1762          * This is to prevent a ref count from going down to zero, which deletes
1763          * the extent item from the extent tree, when there still are references
1764          * to add, which would fail because they would not find the extent item.
1765          */
1766         if (!list_empty(&head->ref_add_list))
1767                 return list_first_entry(&head->ref_add_list,
1768                                 struct btrfs_delayed_ref_node, add_list);
1769
1770         ref = rb_entry(rb_first_cached(&head->ref_tree),
1771                        struct btrfs_delayed_ref_node, ref_node);
1772         ASSERT(list_empty(&ref->add_list));
1773         return ref;
1774 }
1775
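     /*
      * Give up on processing a delayed ref head: mark it as ready to be
      * picked up again and drop the lock taken by btrfs_delayed_ref_lock().
      */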
1776 static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
1777                                       struct btrfs_delayed_ref_head *head)
1778 {
1779         spin_lock(&delayed_refs->lock);
1780         head->processing = false;
1781         delayed_refs->num_heads_ready++;
1782         spin_unlock(&delayed_refs->lock);
1783         btrfs_delayed_ref_unlock(head);
1784 }
1785
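     /*
      * Return the head's extent op if it still needs to be run, or free it
      * and return NULL when the head must insert a reserved extent.
      */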
1786 static struct btrfs_delayed_extent_op *cleanup_extent_op(
1787                                 struct btrfs_delayed_ref_head *head)
1788 {
1789         struct btrfs_delayed_extent_op *extent_op = head->extent_op;
1790
1791         if (!extent_op)
1792                 return NULL;
1793
1794         if (head->must_insert_reserved) {
1795                 head->extent_op = NULL;
1796                 btrfs_free_delayed_extent_op(extent_op);
1797                 return NULL;
1798         }
1799         return extent_op;
1800 }
1801
1802 static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
1803                                      struct btrfs_delayed_ref_head *head)
1804 {
1805         struct btrfs_delayed_extent_op *extent_op;
1806         int ret;
1807
1808         extent_op = cleanup_extent_op(head);
1809         if (!extent_op)
1810                 return 0;
1811         head->extent_op = NULL;
1812         spin_unlock(&head->lock);
1813         ret = run_delayed_extent_op(trans, head, extent_op);
1814         btrfs_free_delayed_extent_op(extent_op);
1815         return ret ? ret : 1;
1816 }
1817
1818 void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
1819                                   struct btrfs_delayed_ref_root *delayed_refs,
1820                                   struct btrfs_delayed_ref_head *head)
1821 {
1822         int nr_items = 1;       /* Dropping this ref head update. */
1823
1824         /*
1825          * We had csum deletions accounted for in our delayed refs rsv, so we
1826          * need to drop the csum leaves for this update from our delayed_refs_rsv.
1827          */
1828         if (head->total_ref_mod < 0 && head->is_data) {
1829                 spin_lock(&delayed_refs->lock);
1830                 delayed_refs->pending_csums -= head->num_bytes;
1831                 spin_unlock(&delayed_refs->lock);
1832                 nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
1833         }
1834
1835         btrfs_delayed_refs_rsv_release(fs_info, nr_items);
1836 }
1837
1838 static int cleanup_ref_head(struct btrfs_trans_handle *trans,
1839                             struct btrfs_delayed_ref_head *head)
1840 {
1841
1842         struct btrfs_fs_info *fs_info = trans->fs_info;
1843         struct btrfs_delayed_ref_root *delayed_refs;
1844         int ret;
1845
1846         delayed_refs = &trans->transaction->delayed_refs;
1847
1848         ret = run_and_cleanup_extent_op(trans, head);
1849         if (ret < 0) {
1850                 unselect_delayed_ref_head(delayed_refs, head);
1851                 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
1852                 return ret;
1853         } else if (ret) {
1854                 return ret;
1855         }
1856
1857         /*
1858          * Need to drop our head ref lock and re-acquire the delayed ref lock
1859          * and then re-check to make sure nobody got added.
1860          */
1861         spin_unlock(&head->lock);
1862         spin_lock(&delayed_refs->lock);
1863         spin_lock(&head->lock);
1864         if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
1865                 spin_unlock(&head->lock);
1866                 spin_unlock(&delayed_refs->lock);
1867                 return 1;
1868         }
1869         btrfs_delete_ref_head(delayed_refs, head);
1870         spin_unlock(&head->lock);
1871         spin_unlock(&delayed_refs->lock);
1872
1873         if (head->must_insert_reserved) {
1874                 btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1);
1875                 if (head->is_data) {
1876                         struct btrfs_root *csum_root;
1877
1878                         csum_root = btrfs_csum_root(fs_info, head->bytenr);
1879                         ret = btrfs_del_csums(trans, csum_root, head->bytenr,
1880                                               head->num_bytes);
1881                 }
1882         }
1883
1884         btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
1885
1886         trace_run_delayed_ref_head(fs_info, head, 0);
1887         btrfs_delayed_ref_unlock(head);
1888         btrfs_put_delayed_ref_head(head);
1889         return ret;
1890 }
1891
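     /*
      * Select the next delayed ref head and lock it for processing.  Returns
      * NULL when no head is ready, or ERR_PTR(-EAGAIN) if the head went away
      * while we waited for its mutex and the caller should pick another one.
      */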
1892 static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
1893                                         struct btrfs_trans_handle *trans)
1894 {
1895         struct btrfs_delayed_ref_root *delayed_refs =
1896                 &trans->transaction->delayed_refs;
1897         struct btrfs_delayed_ref_head *head = NULL;
1898         int ret;
1899
1900         spin_lock(&delayed_refs->lock);
1901         head = btrfs_select_ref_head(delayed_refs);
1902         if (!head) {
1903                 spin_unlock(&delayed_refs->lock);
1904                 return head;
1905         }
1906
1907         /*
1908          * Grab the lock that says we are going to process all the refs for
1909          * this head
1910          */
1911         ret = btrfs_delayed_ref_lock(delayed_refs, head);
1912         spin_unlock(&delayed_refs->lock);
1913
1914         /*
1915          * We may have dropped the spin lock to get the head mutex lock, and
1916          * that might have given someone else time to free the head.  If that's
1917          * true, it has been removed from our list and we can move on.
1918          */
1919         if (ret == -EAGAIN)
1920                 head = ERR_PTR(-EAGAIN);
1921
1922         return head;
1923 }
1924
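     /*
      * Run all refs queued on an already locked head, re-merging add/drop
      * pairs between iterations.  Returns -EAGAIN (and unselects the head)
      * if a ref is still blocked by a tree mod log sequence number.
      */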
1925 static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
1926                                            struct btrfs_delayed_ref_head *locked_ref)
1927 {
1928         struct btrfs_fs_info *fs_info = trans->fs_info;
1929         struct btrfs_delayed_ref_root *delayed_refs;
1930         struct btrfs_delayed_extent_op *extent_op;
1931         struct btrfs_delayed_ref_node *ref;
1932         bool must_insert_reserved;
1933         int ret;
1934
1935         delayed_refs = &trans->transaction->delayed_refs;
1936
1937         lockdep_assert_held(&locked_ref->mutex);
1938         lockdep_assert_held(&locked_ref->lock);
1939
1940         while ((ref = select_delayed_ref(locked_ref))) {
1941                 if (ref->seq &&
1942                     btrfs_check_delayed_seq(fs_info, ref->seq)) {
1943                         spin_unlock(&locked_ref->lock);
1944                         unselect_delayed_ref_head(delayed_refs, locked_ref);
1945                         return -EAGAIN;
1946                 }
1947
1948                 rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
1949                 RB_CLEAR_NODE(&ref->ref_node);
1950                 if (!list_empty(&ref->add_list))
1951                         list_del(&ref->add_list);
1952                 /*
1953                  * When we play the delayed ref, also correct the ref_mod on
1954                  * the head.
1955                  */
1956                 switch (ref->action) {
1957                 case BTRFS_ADD_DELAYED_REF:
1958                 case BTRFS_ADD_DELAYED_EXTENT:
1959                         locked_ref->ref_mod -= ref->ref_mod;
1960                         break;
1961                 case BTRFS_DROP_DELAYED_REF:
1962                         locked_ref->ref_mod += ref->ref_mod;
1963                         break;
1964                 default:
1965                         WARN_ON(1);
1966                 }
1967                 atomic_dec(&delayed_refs->num_entries);
1968
1969                 /*
1970                  * Record the must_insert_reserved flag before we drop the
1971                  * spin lock.
1972                  */
1973                 must_insert_reserved = locked_ref->must_insert_reserved;
1974                 locked_ref->must_insert_reserved = false;
1975
1976                 extent_op = locked_ref->extent_op;
1977                 locked_ref->extent_op = NULL;
1978                 spin_unlock(&locked_ref->lock);
1979
1980                 ret = run_one_delayed_ref(trans, ref, extent_op,
1981                                           must_insert_reserved);
1982
1983                 btrfs_free_delayed_extent_op(extent_op);
1984                 if (ret) {
1985                         unselect_delayed_ref_head(delayed_refs, locked_ref);
1986                         btrfs_put_delayed_ref(ref);
1987                         return ret;
1988                 }
1989
1990                 btrfs_put_delayed_ref(ref);
1991                 cond_resched();
1992
1993                 spin_lock(&locked_ref->lock);
1994                 btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref);
1995         }
1996
1997         return 0;
1998 }
1999
2000 /*
2001  * Returns 0 on success or if called with an already aborted transaction.
2002  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2003  */
2004 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2005                                              unsigned long nr)
2006 {
2007         struct btrfs_fs_info *fs_info = trans->fs_info;
2008         struct btrfs_delayed_ref_root *delayed_refs;
2009         struct btrfs_delayed_ref_head *locked_ref = NULL;
2010         int ret;
2011         unsigned long count = 0;
2012
2013         delayed_refs = &trans->transaction->delayed_refs;
2014         do {
2015                 if (!locked_ref) {
2016                         locked_ref = btrfs_obtain_ref_head(trans);
2017                         if (IS_ERR_OR_NULL(locked_ref)) {
2018                                 if (PTR_ERR(locked_ref) == -EAGAIN) {
2019                                         continue;
2020                                 } else {
2021                                         break;
2022                                 }
2023                         }
2024                         count++;
2025                 }
2026                 /*
2027                  * We need to try and merge add/drops of the same ref since we
2028                  * can run into issues with relocate dropping the implicit ref
2029                  * and then it being added back again before the drop can
2030                  * finish.  If we merged anything we need to re-loop so we can
2031                  * get a good ref.
2032                  * Or we can get node references of the same type that weren't
2033                  * merged when created due to bumps in the tree mod seq, and
2034                  * we need to merge them to prevent adding an inline extent
2035                  * backref before dropping it (triggering a BUG_ON at
2036                  * insert_inline_extent_backref()).
2037                  */
2038                 spin_lock(&locked_ref->lock);
2039                 btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref);
2040
2041                 ret = btrfs_run_delayed_refs_for_head(trans, locked_ref);
2042                 if (ret < 0 && ret != -EAGAIN) {
2043                         /*
2044                          * Error, btrfs_run_delayed_refs_for_head already
2045                          * unlocked everything so just bail out
2046                          */
2047                         return ret;
2048                 } else if (!ret) {
2049                         /*
2050                          * Success, perform the usual cleanup of a processed
2051                          * head
2052                          */
2053                         ret = cleanup_ref_head(trans, locked_ref);
2054                         if (ret > 0) {
2055                                 /* We dropped our lock, we need to loop. */
2056                                 ret = 0;
2057                                 continue;
2058                         } else if (ret) {
2059                                 return ret;
2060                         }
2061                 }
2062
2063                 /*
2064                  * Either success case or btrfs_run_delayed_refs_for_head
2065                  * returned -EAGAIN, meaning we need to select another head
2066                  */
2067
2068                 locked_ref = NULL;
2069                 cond_resched();
2070         } while ((nr != -1 && count < nr) || locked_ref);
2071
2072         return 0;
2073 }
2074
2075 #ifdef SCRAMBLE_DELAYED_REFS
2076 /*
2077  * Normally delayed refs get processed in ascending bytenr order. This
2078  * correlates in most cases to the order added. To expose dependencies on this
2079  * order, we start to process the tree in the middle instead of the beginning
2080  */
2081 static u64 find_middle(struct rb_root *root)
2082 {
2083         struct rb_node *n = root->rb_node;
2084         struct btrfs_delayed_ref_node *entry;
2085         int alt = 1;
2086         u64 middle;
2087         u64 first = 0, last = 0;
2088
2089         n = rb_first(root);
2090         if (n) {
2091                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2092                 first = entry->bytenr;
2093         }
2094         n = rb_last(root);
2095         if (n) {
2096                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2097                 last = entry->bytenr;
2098         }
2099         n = root->rb_node;
2100
2101         while (n) {
2102                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2103                 WARN_ON(!entry->in_tree);
2104
2105                 middle = entry->bytenr;
2106
2107                 if (alt)
2108                         n = n->rb_left;
2109                 else
2110                         n = n->rb_right;
2111
2112                 alt = 1 - alt;
2113         }
2114         return middle;
2115 }
2116 #endif
2117
2118 /*
2119  * this starts processing the delayed reference count updates and
2120  * extent insertions we have queued up so far.  count can be
2121  * 0, which means to process everything in the tree at the start
2122  * of the run (but not newly added entries), or it can be some target
2123  * number you'd like to process.
2124  *
2125  * Returns 0 on success or if called with an aborted transaction
2126  * Returns <0 on error and aborts the transaction
2127  */
2128 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2129                            unsigned long count)
2130 {
2131         struct btrfs_fs_info *fs_info = trans->fs_info;
2132         struct btrfs_delayed_ref_root *delayed_refs;
2133         int ret;
2134         int run_all = count == (unsigned long)-1;
2135
2136         /* We'll clean this up in btrfs_cleanup_transaction */
2137         if (TRANS_ABORTED(trans))
2138                 return 0;
2139
2140         if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
2141                 return 0;
2142
2143         delayed_refs = &trans->transaction->delayed_refs;
2144         if (count == 0)
2145                 count = delayed_refs->num_heads_ready;
2146
2147 again:
2148 #ifdef SCRAMBLE_DELAYED_REFS
2149         delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2150 #endif
2151         ret = __btrfs_run_delayed_refs(trans, count);
2152         if (ret < 0) {
2153                 btrfs_abort_transaction(trans, ret);
2154                 return ret;
2155         }
2156
2157         if (run_all) {
2158                 btrfs_create_pending_block_groups(trans);
2159
2160                 spin_lock(&delayed_refs->lock);
2161                 if (RB_EMPTY_ROOT(&delayed_refs->href_root.rb_root)) {
2162                         spin_unlock(&delayed_refs->lock);
2163                         return 0;
2164                 }
2165                 spin_unlock(&delayed_refs->lock);
2166
2167                 cond_resched();
2168                 goto again;
2169         }
2170
2171         return 0;
2172 }
2173
2174 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2175                                 struct extent_buffer *eb, u64 flags)
2176 {
2177         struct btrfs_delayed_extent_op *extent_op;
2178         int level = btrfs_header_level(eb);
2179         int ret;
2180
2181         extent_op = btrfs_alloc_delayed_extent_op();
2182         if (!extent_op)
2183                 return -ENOMEM;
2184
2185         extent_op->flags_to_set = flags;
2186         extent_op->update_flags = true;
2187         extent_op->update_key = false;
2188         extent_op->level = level;
2189
2190         ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op);
2191         if (ret)
2192                 btrfs_free_delayed_extent_op(extent_op);
2193         return ret;
2194 }
2195
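     /*
      * Check the current transaction's delayed refs for the data extent at
      * @bytenr.  Returns 1 if a queued ref is shared or belongs to another
      * root, inode or offset (i.e. a cross reference exists), 0 if not, and
      * -EAGAIN if the head's mutex is contended and the caller must retry.
      */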
2196 static noinline int check_delayed_ref(struct btrfs_root *root,
2197                                       struct btrfs_path *path,
2198                                       u64 objectid, u64 offset, u64 bytenr)
2199 {
2200         struct btrfs_delayed_ref_head *head;
2201         struct btrfs_delayed_ref_node *ref;
2202         struct btrfs_delayed_data_ref *data_ref;
2203         struct btrfs_delayed_ref_root *delayed_refs;
2204         struct btrfs_transaction *cur_trans;
2205         struct rb_node *node;
2206         int ret = 0;
2207
2208         spin_lock(&root->fs_info->trans_lock);
2209         cur_trans = root->fs_info->running_transaction;
2210         if (cur_trans)
2211                 refcount_inc(&cur_trans->use_count);
2212         spin_unlock(&root->fs_info->trans_lock);
2213         if (!cur_trans)
2214                 return 0;
2215
2216         delayed_refs = &cur_trans->delayed_refs;
2217         spin_lock(&delayed_refs->lock);
2218         head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
2219         if (!head) {
2220                 spin_unlock(&delayed_refs->lock);
2221                 btrfs_put_transaction(cur_trans);
2222                 return 0;
2223         }
2224
2225         if (!mutex_trylock(&head->mutex)) {
2226                 if (path->nowait) {
2227                         spin_unlock(&delayed_refs->lock);
2228                         btrfs_put_transaction(cur_trans);
2229                         return -EAGAIN;
2230                 }
2231
2232                 refcount_inc(&head->refs);
2233                 spin_unlock(&delayed_refs->lock);
2234
2235                 btrfs_release_path(path);
2236
2237                 /*
2238                  * Mutex was contended, block until it's released and let
2239                  * caller try again
2240                  */
2241                 mutex_lock(&head->mutex);
2242                 mutex_unlock(&head->mutex);
2243                 btrfs_put_delayed_ref_head(head);
2244                 btrfs_put_transaction(cur_trans);
2245                 return -EAGAIN;
2246         }
2247         spin_unlock(&delayed_refs->lock);
2248
2249         spin_lock(&head->lock);
2250         /*
2251          * XXX: We should replace this with a proper search function in the
2252          * future.
2253          */
2254         for (node = rb_first_cached(&head->ref_tree); node;
2255              node = rb_next(node)) {
2256                 ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
2257                 /* If it's a shared ref we know a cross reference exists */
2258                 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
2259                         ret = 1;
2260                         break;
2261                 }
2262
2263                 data_ref = btrfs_delayed_node_to_data_ref(ref);
2264
2265                 /*
2266                  * If our ref doesn't match the one we're currently looking at
2267                  * then we have a cross reference.
2268                  */
2269                 if (data_ref->root != root->root_key.objectid ||
2270                     data_ref->objectid != objectid ||
2271                     data_ref->offset != offset) {
2272                         ret = 1;
2273                         break;
2274                 }
2275         }
2276         spin_unlock(&head->lock);
2277         mutex_unlock(&head->mutex);
2278         btrfs_put_transaction(cur_trans);
2279         return ret;
2280 }
2281
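     /*
      * Check the committed extent tree for cross references.  Returns 0 only
      * if the extent item has a single inline EXTENT_DATA_REF matching this
      * root, inode and offset with a matching ref count, 1 if the extent may
      * be shared, and -ENOENT if no extent item was found.
      */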
2282 static noinline int check_committed_ref(struct btrfs_root *root,
2283                                         struct btrfs_path *path,
2284                                         u64 objectid, u64 offset, u64 bytenr,
2285                                         bool strict)
2286 {
2287         struct btrfs_fs_info *fs_info = root->fs_info;
2288         struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr);
2289         struct extent_buffer *leaf;
2290         struct btrfs_extent_data_ref *ref;
2291         struct btrfs_extent_inline_ref *iref;
2292         struct btrfs_extent_item *ei;
2293         struct btrfs_key key;
2294         u32 item_size;
2295         int type;
2296         int ret;
2297
2298         key.objectid = bytenr;
2299         key.offset = (u64)-1;
2300         key.type = BTRFS_EXTENT_ITEM_KEY;
2301
2302         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2303         if (ret < 0)
2304                 goto out;
2305         BUG_ON(ret == 0); /* Corruption */
2306
2307         ret = -ENOENT;
2308         if (path->slots[0] == 0)
2309                 goto out;
2310
2311         path->slots[0]--;
2312         leaf = path->nodes[0];
2313         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2314
2315         if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2316                 goto out;
2317
2318         ret = 1;
2319         item_size = btrfs_item_size(leaf, path->slots[0]);
2320         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2321
2322         /* If extent item has more than 1 inline ref then it's shared */
2323         if (item_size != sizeof(*ei) +
2324             btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2325                 goto out;
2326
2327         /*
2328          * If the extent was created before the last snapshot => it's shared unless
2329          * the snapshot has been deleted. Use this heuristic only if strict is false.
2330          */
2331         if (!strict &&
2332             (btrfs_extent_generation(leaf, ei) <=
2333              btrfs_root_last_snapshot(&root->root_item)))
2334                 goto out;
2335
2336         iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2337
2338         /* If this extent has SHARED_DATA_REF then it's shared */
2339         type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
2340         if (type != BTRFS_EXTENT_DATA_REF_KEY)
2341                 goto out;
2342
2343         ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2344         if (btrfs_extent_refs(leaf, ei) !=
2345             btrfs_extent_data_ref_count(leaf, ref) ||
2346             btrfs_extent_data_ref_root(leaf, ref) !=
2347             root->root_key.objectid ||
2348             btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2349             btrfs_extent_data_ref_offset(leaf, ref) != offset)
2350                 goto out;
2351
2352         ret = 0;
2353 out:
2354         return ret;
2355 }
2356
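     /*
      * Check whether the data extent at @bytenr is referenced from anywhere
      * other than this root/inode/offset, consulting both the committed
      * extent tree and the current transaction's delayed refs.  Returns 0 if
      * no cross reference was found, > 0 if one exists and < 0 on error.
      */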
2357 int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
2358                           u64 bytenr, bool strict, struct btrfs_path *path)
2359 {
2360         int ret;
2361
2362         do {
2363                 ret = check_committed_ref(root, path, objectid,
2364                                           offset, bytenr, strict);
2365                 if (ret && ret != -ENOENT)
2366                         goto out;
2367
2368                 ret = check_delayed_ref(root, path, objectid, offset, bytenr);
2369         } while (ret == -EAGAIN);
2370
2371 out:
2372         btrfs_release_path(path);
2373         if (btrfs_is_data_reloc_root(root))
2374                 WARN_ON(ret > 0);
2375         return ret;
2376 }
2377
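     /*
      * Add (inc == 1) or drop (inc == 0) one reference for every data extent
      * referenced by a leaf, or for every child block pointed to by a node.
      * Helper for btrfs_inc_ref() and btrfs_dec_ref().
      */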
2378 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2379                            struct btrfs_root *root,
2380                            struct extent_buffer *buf,
2381                            int full_backref, int inc)
2382 {
2383         struct btrfs_fs_info *fs_info = root->fs_info;
2384         u64 bytenr;
2385         u64 num_bytes;
2386         u64 parent;
2387         u64 ref_root;
2388         u32 nritems;
2389         struct btrfs_key key;
2390         struct btrfs_file_extent_item *fi;
2391         struct btrfs_ref generic_ref = { 0 };
2392         bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
2393         int i;
2394         int action;
2395         int level;
2396         int ret = 0;
2397
2398         if (btrfs_is_testing(fs_info))
2399                 return 0;
2400
2401         ref_root = btrfs_header_owner(buf);
2402         nritems = btrfs_header_nritems(buf);
2403         level = btrfs_header_level(buf);
2404
2405         if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0)
2406                 return 0;
2407
2408         if (full_backref)
2409                 parent = buf->start;
2410         else
2411                 parent = 0;
2412         if (inc)
2413                 action = BTRFS_ADD_DELAYED_REF;
2414         else
2415                 action = BTRFS_DROP_DELAYED_REF;
2416
2417         for (i = 0; i < nritems; i++) {
2418                 if (level == 0) {
2419                         btrfs_item_key_to_cpu(buf, &key, i);
2420                         if (key.type != BTRFS_EXTENT_DATA_KEY)
2421                                 continue;
2422                         fi = btrfs_item_ptr(buf, i,
2423                                             struct btrfs_file_extent_item);
2424                         if (btrfs_file_extent_type(buf, fi) ==
2425                             BTRFS_FILE_EXTENT_INLINE)
2426                                 continue;
2427                         bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2428                         if (bytenr == 0)
2429                                 continue;
2430
2431                         num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2432                         key.offset -= btrfs_file_extent_offset(buf, fi);
2433                         btrfs_init_generic_ref(&generic_ref, action, bytenr,
2434                                                num_bytes, parent);
2435                         btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
2436                                             key.offset, root->root_key.objectid,
2437                                             for_reloc);
2438                         if (inc)
2439                                 ret = btrfs_inc_extent_ref(trans, &generic_ref);
2440                         else
2441                                 ret = btrfs_free_extent(trans, &generic_ref);
2442                         if (ret)
2443                                 goto fail;
2444                 } else {
2445                         bytenr = btrfs_node_blockptr(buf, i);
2446                         num_bytes = fs_info->nodesize;
2447                         btrfs_init_generic_ref(&generic_ref, action, bytenr,
2448                                                num_bytes, parent);
2449                         btrfs_init_tree_ref(&generic_ref, level - 1, ref_root,
2450                                             root->root_key.objectid, for_reloc);
2451                         if (inc)
2452                                 ret = btrfs_inc_extent_ref(trans, &generic_ref);
2453                         else
2454                                 ret = btrfs_free_extent(trans, &generic_ref);
2455                         if (ret)
2456                                 goto fail;
2457                 }
2458         }
2459         return 0;
2460 fail:
2461         return ret;
2462 }
2463
2464 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2465                   struct extent_buffer *buf, int full_backref)
2466 {
2467         return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
2468 }
2469
2470 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2471                   struct extent_buffer *buf, int full_backref)
2472 {
2473         return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
2474 }
2475
2476 static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
2477 {
2478         struct btrfs_fs_info *fs_info = root->fs_info;
2479         u64 flags;
2480         u64 ret;
2481
2482         if (data)
2483                 flags = BTRFS_BLOCK_GROUP_DATA;
2484         else if (root == fs_info->chunk_root)
2485                 flags = BTRFS_BLOCK_GROUP_SYSTEM;
2486         else
2487                 flags = BTRFS_BLOCK_GROUP_METADATA;
2488
2489         ret = btrfs_get_alloc_profile(fs_info, flags);
2490         return ret;
2491 }
2492
2493 static u64 first_logical_byte(struct btrfs_fs_info *fs_info)
2494 {
2495         struct rb_node *leftmost;
2496         u64 bytenr = 0;
2497
2498         read_lock(&fs_info->block_group_cache_lock);
2499         /* Get the block group with the lowest logical start address. */
2500         leftmost = rb_first_cached(&fs_info->block_group_cache_tree);
2501         if (leftmost) {
2502                 struct btrfs_block_group *bg;
2503
2504                 bg = rb_entry(leftmost, struct btrfs_block_group, cache_node);
2505                 bytenr = bg->start;
2506         }
2507         read_unlock(&fs_info->block_group_cache_lock);
2508
2509         return bytenr;
2510 }
2511
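     /*
      * Account an extent as pinned: bump the pinned counters of the block
      * group and its space_info (moving the bytes out of reserved when
      * @reserved is set) and mark the range in the transaction's
      * pinned_extents tree so it gets unpinned at commit time.
      */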
2512 static int pin_down_extent(struct btrfs_trans_handle *trans,
2513                            struct btrfs_block_group *cache,
2514                            u64 bytenr, u64 num_bytes, int reserved)
2515 {
2516         struct btrfs_fs_info *fs_info = cache->fs_info;
2517
2518         spin_lock(&cache->space_info->lock);
2519         spin_lock(&cache->lock);
2520         cache->pinned += num_bytes;
2521         btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
2522                                              num_bytes);
2523         if (reserved) {
2524                 cache->reserved -= num_bytes;
2525                 cache->space_info->bytes_reserved -= num_bytes;
2526         }
2527         spin_unlock(&cache->lock);
2528         spin_unlock(&cache->space_info->lock);
2529
2530         set_extent_bit(&trans->transaction->pinned_extents, bytenr,
2531                        bytenr + num_bytes - 1, EXTENT_DIRTY, NULL);
2532         return 0;
2533 }
2534
2535 int btrfs_pin_extent(struct btrfs_trans_handle *trans,
2536                      u64 bytenr, u64 num_bytes, int reserved)
2537 {
2538         struct btrfs_block_group *cache;
2539
2540         cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
2541         BUG_ON(!cache); /* Logic error */
2542
2543         pin_down_extent(trans, cache, bytenr, num_bytes, reserved);
2544
2545         btrfs_put_block_group(cache);
2546         return 0;
2547 }
2548
2549 /*
2550  * this function must be called within a transaction
2551  */
2552 int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
2553                                     const struct extent_buffer *eb)
2554 {
2555         struct btrfs_block_group *cache;
2556         int ret;
2557
2558         cache = btrfs_lookup_block_group(trans->fs_info, eb->start);
2559         if (!cache)
2560                 return -EINVAL;
2561
2562         /*
2563          * Fully cache the free space first so that our pin removes the free space
2564          * from the cache.
2565          */
2566         ret = btrfs_cache_block_group(cache, true);
2567         if (ret)
2568                 goto out;
2569
2570         pin_down_extent(trans, cache, eb->start, eb->len, 0);
2571
2572         /* remove us from the free space cache (if we're there at all) */
2573         ret = btrfs_remove_free_space(cache, eb->start, eb->len);
2574 out:
2575         btrfs_put_block_group(cache);
2576         return ret;
2577 }
2578
2579 static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
2580                                    u64 start, u64 num_bytes)
2581 {
2582         int ret;
2583         struct btrfs_block_group *block_group;
2584
2585         block_group = btrfs_lookup_block_group(fs_info, start);
2586         if (!block_group)
2587                 return -EINVAL;
2588
2589         ret = btrfs_cache_block_group(block_group, true);
2590         if (ret)
2591                 goto out;
2592
2593         ret = btrfs_remove_free_space(block_group, start, num_bytes);
2594 out:
2595         btrfs_put_block_group(block_group);
2596         return ret;
2597 }
2598
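     /*
      * On filesystems with mixed block groups, remove the ranges of all data
      * extents referenced by a logged tree block from the free space cache,
      * so they are not handed out again during log replay.
      */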
2599 int btrfs_exclude_logged_extents(struct extent_buffer *eb)
2600 {
2601         struct btrfs_fs_info *fs_info = eb->fs_info;
2602         struct btrfs_file_extent_item *item;
2603         struct btrfs_key key;
2604         int found_type;
2605         int i;
2606         int ret = 0;
2607
2608         if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
2609                 return 0;
2610
2611         for (i = 0; i < btrfs_header_nritems(eb); i++) {
2612                 btrfs_item_key_to_cpu(eb, &key, i);
2613                 if (key.type != BTRFS_EXTENT_DATA_KEY)
2614                         continue;
2615                 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
2616                 found_type = btrfs_file_extent_type(eb, item);
2617                 if (found_type == BTRFS_FILE_EXTENT_INLINE)
2618                         continue;
2619                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
2620                         continue;
2621                 key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
2622                 key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
2623                 ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
2624                 if (ret)
2625                         break;
2626         }
2627
2628         return ret;
2629 }
2630
2631 static void
2632 btrfs_inc_block_group_reservations(struct btrfs_block_group *bg)
2633 {
2634         atomic_inc(&bg->reservations);
2635 }
2636
2637 /*
2638  * Returns the free cluster for the given space info and sets empty_cluster to
2639  * what it should be based on the mount options.
2640  */
2641 static struct btrfs_free_cluster *
2642 fetch_cluster_info(struct btrfs_fs_info *fs_info,
2643                    struct btrfs_space_info *space_info, u64 *empty_cluster)
2644 {
2645         struct btrfs_free_cluster *ret = NULL;
2646
2647         *empty_cluster = 0;
2648         if (btrfs_mixed_space_info(space_info))
2649                 return ret;
2650
2651         if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
2652                 ret = &fs_info->meta_alloc_cluster;
2653                 if (btrfs_test_opt(fs_info, SSD))
2654                         *empty_cluster = SZ_2M;
2655                 else
2656                         *empty_cluster = SZ_64K;
2657         } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
2658                    btrfs_test_opt(fs_info, SSD_SPREAD)) {
2659                 *empty_cluster = SZ_2M;
2660                 ret = &fs_info->data_alloc_cluster;
2661         }
2662
2663         return ret;
2664 }
2665
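     /*
      * Unpin the range [start, end], block group by block group: return the
      * space to the free space cache (when @return_free_space), drop the
      * pinned counters, top up the global block reserve and wake any space
      * tickets that can now be satisfied.
      */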
2666 static int unpin_extent_range(struct btrfs_fs_info *fs_info,
2667                               u64 start, u64 end,
2668                               const bool return_free_space)
2669 {
2670         struct btrfs_block_group *cache = NULL;
2671         struct btrfs_space_info *space_info;
2672         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
2673         struct btrfs_free_cluster *cluster = NULL;
2674         u64 len;
2675         u64 total_unpinned = 0;
2676         u64 empty_cluster = 0;
2677         bool readonly;
2678
2679         while (start <= end) {
2680                 readonly = false;
2681                 if (!cache ||
2682                     start >= cache->start + cache->length) {
2683                         if (cache)
2684                                 btrfs_put_block_group(cache);
2685                         total_unpinned = 0;
2686                         cache = btrfs_lookup_block_group(fs_info, start);
2687                         BUG_ON(!cache); /* Logic error */
2688
2689                         cluster = fetch_cluster_info(fs_info,
2690                                                      cache->space_info,
2691                                                      &empty_cluster);
2692                         empty_cluster <<= 1;
2693                 }
2694
2695                 len = cache->start + cache->length - start;
2696                 len = min(len, end + 1 - start);
2697
2698                 if (return_free_space)
2699                         btrfs_add_free_space(cache, start, len);
2700
2701                 start += len;
2702                 total_unpinned += len;
2703                 space_info = cache->space_info;
2704
2705                 /*
2706                  * If this space cluster has been marked as fragmented and we've
2707                  * unpinned enough in this block group to potentially allow a
2708                  * cluster to be created inside of it, go ahead and clear the
2709                  * fragmented flag.
2710                  */
2711                 if (cluster && cluster->fragmented &&
2712                     total_unpinned > empty_cluster) {
2713                         spin_lock(&cluster->lock);
2714                         cluster->fragmented = 0;
2715                         spin_unlock(&cluster->lock);
2716                 }
2717
2718                 spin_lock(&space_info->lock);
2719                 spin_lock(&cache->lock);
2720                 cache->pinned -= len;
2721                 btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
2722                 space_info->max_extent_size = 0;
2723                 if (cache->ro) {
2724                         space_info->bytes_readonly += len;
2725                         readonly = true;
2726                 } else if (btrfs_is_zoned(fs_info)) {
2727                         /* Need reset before reusing in a zoned block group */
2728                         space_info->bytes_zone_unusable += len;
2729                         readonly = true;
2730                 }
2731                 spin_unlock(&cache->lock);
2732                 if (!readonly && return_free_space &&
2733                     global_rsv->space_info == space_info) {
2734                         spin_lock(&global_rsv->lock);
2735                         if (!global_rsv->full) {
2736                                 u64 to_add = min(len, global_rsv->size -
2737                                                       global_rsv->reserved);
2738
2739                                 global_rsv->reserved += to_add;
2740                                 btrfs_space_info_update_bytes_may_use(fs_info,
2741                                                 space_info, to_add);
2742                                 if (global_rsv->reserved >= global_rsv->size)
2743                                         global_rsv->full = 1;
2744                                 len -= to_add;
2745                         }
2746                         spin_unlock(&global_rsv->lock);
2747                 }
2748                 /* Add to any tickets we may have */
2749                 if (!readonly && return_free_space && len)
2750                         btrfs_try_granting_tickets(fs_info, space_info);
2751                 spin_unlock(&space_info->lock);
2752         }
2753
2754         if (cache)
2755                 btrfs_put_block_group(cache);
2756         return 0;
2757 }
2758
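/*
 * Called at transaction commit time: unpin every extent recorded in the
 * transaction's pinned_extents tree (discarding it first when DISCARD_SYNC is
 * enabled) and discard the block groups that were deleted during this
 * transaction.
 */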
2759 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
2760 {
2761         struct btrfs_fs_info *fs_info = trans->fs_info;
2762         struct btrfs_block_group *block_group, *tmp;
2763         struct list_head *deleted_bgs;
2764         struct extent_io_tree *unpin;
2765         u64 start;
2766         u64 end;
2767         int ret;
2768
2769         unpin = &trans->transaction->pinned_extents;
2770
2771         while (!TRANS_ABORTED(trans)) {
2772                 struct extent_state *cached_state = NULL;
2773
2774                 mutex_lock(&fs_info->unused_bg_unpin_mutex);
2775                 if (!find_first_extent_bit(unpin, 0, &start, &end,
2776                                            EXTENT_DIRTY, &cached_state)) {
2777                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2778                         break;
2779                 }
2780
2781                 if (btrfs_test_opt(fs_info, DISCARD_SYNC))
2782                         ret = btrfs_discard_extent(fs_info, start,
2783                                                    end + 1 - start, NULL);
2784
2785                 clear_extent_dirty(unpin, start, end, &cached_state);
2786                 unpin_extent_range(fs_info, start, end, true);
2787                 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2788                 free_extent_state(cached_state);
2789                 cond_resched();
2790         }
2791
2792         if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
2793                 btrfs_discard_calc_delay(&fs_info->discard_ctl);
2794                 btrfs_discard_schedule_work(&fs_info->discard_ctl, true);
2795         }
2796
2797         /*
2798          * Transaction is finished.  We don't need the lock anymore.  We
2799          * do need to clean up the block groups in case of a transaction
2800          * abort.
2801          */
2802         deleted_bgs = &trans->transaction->deleted_bgs;
2803         list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
2804                 u64 trimmed = 0;
2805
2806                 ret = -EROFS;
2807                 if (!TRANS_ABORTED(trans))
2808                         ret = btrfs_discard_extent(fs_info,
2809                                                    block_group->start,
2810                                                    block_group->length,
2811                                                    &trimmed);
2812
2813                 list_del_init(&block_group->bg_list);
2814                 btrfs_unfreeze_block_group(block_group);
2815                 btrfs_put_block_group(block_group);
2816
2817                 if (ret) {
2818                         const char *errstr = btrfs_decode_error(ret);
2819                         btrfs_warn(fs_info,
2820                            "discard failed while removing blockgroup: errno=%d %s",
2821                                    ret, errstr);
2822                 }
2823         }
2824
2825         return 0;
2826 }
2827
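/*
 * Final accounting once the last reference to an extent is gone: delete the
 * checksum items for data extents, return the range to the free space tree
 * and update the owning block group's used bytes.
 */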
2828 static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
2829                                      u64 bytenr, u64 num_bytes, bool is_data)
2830 {
2831         int ret;
2832
2833         if (is_data) {
2834                 struct btrfs_root *csum_root;
2835
2836                 csum_root = btrfs_csum_root(trans->fs_info, bytenr);
2837                 ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes);
2838                 if (ret) {
2839                         btrfs_abort_transaction(trans, ret);
2840                         return ret;
2841                 }
2842         }
2843
2844         ret = add_to_free_space_tree(trans, bytenr, num_bytes);
2845         if (ret) {
2846                 btrfs_abort_transaction(trans, ret);
2847                 return ret;
2848         }
2849
2850         ret = btrfs_update_block_group(trans, bytenr, num_bytes, false);
2851         if (ret)
2852                 btrfs_abort_transaction(trans, ret);
2853
2854         return ret;
2855 }
2856
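/*
 * Abort the transaction with -EUCLEAN, dump the current leaf and log the
 * given message. Used below when the on-disk backref data turns out to be
 * inconsistent.
 */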
2857 #define abort_and_dump(trans, path, fmt, args...)       \
2858 ({                                                      \
2859         btrfs_abort_transaction(trans, -EUCLEAN);       \
2860         btrfs_print_leaf(path->nodes[0]);               \
2861         btrfs_crit(trans->fs_info, fmt, ##args);        \
2862 })
2863
2864 /*
2865  * Drop one or more refs of @node.
2866  *
2867  * 1. Locate the extent refs.
2868  *    It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item.
2869  *    Locate it, then reduce the refs number or remove the ref line completely.
2870  *
2871  * 2. Update the refs count in EXTENT/METADATA_ITEM
2872  *
2873  * Inline backref case:
2874  *
2875  * in extent tree we have:
2876  *
2877  *      item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
2878  *              refs 2 gen 6 flags DATA
2879  *              extent data backref root FS_TREE objectid 258 offset 0 count 1
2880  *              extent data backref root FS_TREE objectid 257 offset 0 count 1
2881  *
2882  * This function gets called with:
2883  *
2884  *    node->bytenr = 13631488
2885  *    node->num_bytes = 1048576
2886  *    root_objectid = FS_TREE
2887  *    owner_objectid = 257
2888  *    owner_offset = 0
2889  *    refs_to_drop = 1
2890  *
2891  * Then we should get something like:
2892  *
2893  *      item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
2894  *              refs 1 gen 6 flags DATA
2895  *              extent data backref root FS_TREE objectid 258 offset 0 count 1
2896  *
2897  * Keyed backref case:
2898  *
2899  * in extent tree we have:
2900  *
2901  *      item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
2902  *              refs 754 gen 6 flags DATA
2903  *      [...]
2904  *      item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28
2905  *              extent data backref root FS_TREE objectid 866 offset 0 count 1
2906  *
2907  * This function gets called with:
2908  *
2909  *    node->bytenr = 13631488
2910  *    node->num_bytes = 1048576
2911  *    root_objectid = FS_TREE
2912  *    owner_objectid = 866
2913  *    owner_offset = 0
2914  *    refs_to_drop = 1
2915  *
2916  * Then we should get something like:
2917  *
2918  *      item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
2919  *              refs 753 gen 6 flags DATA
2920  *
2921  * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed.
2922  */
2923 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2924                                struct btrfs_delayed_ref_node *node, u64 parent,
2925                                u64 root_objectid, u64 owner_objectid,
2926                                u64 owner_offset,
2927                                struct btrfs_delayed_extent_op *extent_op)
2928 {
2929         struct btrfs_fs_info *info = trans->fs_info;
2930         struct btrfs_key key;
2931         struct btrfs_path *path;
2932         struct btrfs_root *extent_root;
2933         struct extent_buffer *leaf;
2934         struct btrfs_extent_item *ei;
2935         struct btrfs_extent_inline_ref *iref;
2936         int ret;
2937         int is_data;
2938         int extent_slot = 0;
2939         int found_extent = 0;
2940         int num_to_del = 1;
2941         int refs_to_drop = node->ref_mod;
2942         u32 item_size;
2943         u64 refs;
2944         u64 bytenr = node->bytenr;
2945         u64 num_bytes = node->num_bytes;
2946         bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
2947
2948         extent_root = btrfs_extent_root(info, bytenr);
2949         ASSERT(extent_root);
2950
2951         path = btrfs_alloc_path();
2952         if (!path)
2953                 return -ENOMEM;
2954
2955         is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
2956
2957         if (!is_data && refs_to_drop != 1) {
2958                 btrfs_crit(info,
2959 "invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u",
2960                            node->bytenr, refs_to_drop);
2961                 ret = -EINVAL;
2962                 btrfs_abort_transaction(trans, ret);
2963                 goto out;
2964         }
2965
2966         if (is_data)
2967                 skinny_metadata = false;
2968
2969         ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
2970                                     parent, root_objectid, owner_objectid,
2971                                     owner_offset);
2972         if (ret == 0) {
2973                 /*
2974                  * Either the inline backref or the SHARED_DATA_REF/
2975                  * SHARED_BLOCK_REF has been found.
2976                  *
2977                  * Here is a quick path to locate EXTENT/METADATA_ITEM.
2978                  * It's possible the EXTENT/METADATA_ITEM is near the current slot.
2979                  */
2980                 extent_slot = path->slots[0];
2981                 while (extent_slot >= 0) {
2982                         btrfs_item_key_to_cpu(path->nodes[0], &key,
2983                                               extent_slot);
2984                         if (key.objectid != bytenr)
2985                                 break;
2986                         if (key.type == BTRFS_EXTENT_ITEM_KEY &&
2987                             key.offset == num_bytes) {
2988                                 found_extent = 1;
2989                                 break;
2990                         }
2991                         if (key.type == BTRFS_METADATA_ITEM_KEY &&
2992                             key.offset == owner_objectid) {
2993                                 found_extent = 1;
2994                                 break;
2995                         }
2996
2997                         /* Quick path didn't find the EXTENT/METADATA_ITEM */
2998                         if (path->slots[0] - extent_slot > 5)
2999                                 break;
3000                         extent_slot--;
3001                 }
3002
3003                 if (!found_extent) {
3004                         if (iref) {
3005                                 abort_and_dump(trans, path,
3006 "invalid iref slot %u, no EXTENT/METADATA_ITEM found but has inline extent ref",
3007                                            path->slots[0]);
3008                                 ret = -EUCLEAN;
3009                                 goto out;
3010                         }
3011                         /* Must be SHARED_* item, remove the backref first */
3012                         ret = remove_extent_backref(trans, extent_root, path,
3013                                                     NULL, refs_to_drop, is_data);
3014                         if (ret) {
3015                                 btrfs_abort_transaction(trans, ret);
3016                                 goto out;
3017                         }
3018                         btrfs_release_path(path);
3019
3020                         /* Slow path to locate EXTENT/METADATA_ITEM */
3021                         key.objectid = bytenr;
3022                         key.type = BTRFS_EXTENT_ITEM_KEY;
3023                         key.offset = num_bytes;
3024
3025                         if (!is_data && skinny_metadata) {
3026                                 key.type = BTRFS_METADATA_ITEM_KEY;
3027                                 key.offset = owner_objectid;
3028                         }
3029
3030                         ret = btrfs_search_slot(trans, extent_root,
3031                                                 &key, path, -1, 1);
3032                         if (ret > 0 && skinny_metadata && path->slots[0]) {
3033                                 /*
3034                                  * Couldn't find our skinny metadata item,
3035                                  * see if we have ye olde extent item.
3036                                  */
3037                                 path->slots[0]--;
3038                                 btrfs_item_key_to_cpu(path->nodes[0], &key,
3039                                                       path->slots[0]);
3040                                 if (key.objectid == bytenr &&
3041                                     key.type == BTRFS_EXTENT_ITEM_KEY &&
3042                                     key.offset == num_bytes)
3043                                         ret = 0;
3044                         }
3045
3046                         if (ret > 0 && skinny_metadata) {
3047                                 skinny_metadata = false;
3048                                 key.objectid = bytenr;
3049                                 key.type = BTRFS_EXTENT_ITEM_KEY;
3050                                 key.offset = num_bytes;
3051                                 btrfs_release_path(path);
3052                                 ret = btrfs_search_slot(trans, extent_root,
3053                                                         &key, path, -1, 1);
3054                         }
3055
3056                         if (ret) {
3057                                 if (ret > 0)
3058                                         btrfs_print_leaf(path->nodes[0]);
3059                                 btrfs_err(info,
3060                         "umm, got %d back from search, was looking for %llu, slot %d",
3061                                           ret, bytenr, path->slots[0]);
3062                         }
3063                         if (ret < 0) {
3064                                 btrfs_abort_transaction(trans, ret);
3065                                 goto out;
3066                         }
3067                         extent_slot = path->slots[0];
3068                 }
3069         } else if (WARN_ON(ret == -ENOENT)) {
3070                 abort_and_dump(trans, path,
3071 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu slot %d",
3072                                bytenr, parent, root_objectid, owner_objectid,
3073                                owner_offset, path->slots[0]);
3074                 goto out;
3075         } else {
3076                 btrfs_abort_transaction(trans, ret);
3077                 goto out;
3078         }
3079
3080         leaf = path->nodes[0];
3081         item_size = btrfs_item_size(leaf, extent_slot);
3082         if (unlikely(item_size < sizeof(*ei))) {
3083                 ret = -EUCLEAN;
3084                 btrfs_err(trans->fs_info,
3085                           "unexpected extent item size, has %u expect >= %zu",
3086                           item_size, sizeof(*ei));
3087                 btrfs_abort_transaction(trans, ret);
3088                 goto out;
3089         }
3090         ei = btrfs_item_ptr(leaf, extent_slot,
3091                             struct btrfs_extent_item);
3092         if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
3093             key.type == BTRFS_EXTENT_ITEM_KEY) {
3094                 struct btrfs_tree_block_info *bi;
3095
3096                 if (item_size < sizeof(*ei) + sizeof(*bi)) {
3097                         abort_and_dump(trans, path,
3098 "invalid extent item size for key (%llu, %u, %llu) slot %u owner %llu, has %u expect >= %zu",
3099                                        key.objectid, key.type, key.offset,
3100                                        path->slots[0], owner_objectid, item_size,
3101                                        sizeof(*ei) + sizeof(*bi));
3102                         ret = -EUCLEAN;
3103                         goto out;
3104                 }
3105                 bi = (struct btrfs_tree_block_info *)(ei + 1);
3106                 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
3107         }
3108
3109         refs = btrfs_extent_refs(leaf, ei);
3110         if (refs < refs_to_drop) {
3111                 abort_and_dump(trans, path,
3112                 "trying to drop %d refs but we only have %llu for bytenr %llu slot %u",
3113                                refs_to_drop, refs, bytenr, path->slots[0]);
3114                 ret = -EUCLEAN;
3115                 goto out;
3116         }
3117         refs -= refs_to_drop;
3118
3119         if (refs > 0) {
3120                 if (extent_op)
3121                         __run_delayed_extent_op(extent_op, leaf, ei);
3122                 /*
3123                  * In the case of inline back ref, reference count will
3124                  * be updated by remove_extent_backref
3125                  */
3126                 if (iref) {
3127                         if (!found_extent) {
3128                                 abort_and_dump(trans, path,
3129 "invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found, slot %u",
3130                                                path->slots[0]);
3131                                 ret = -EUCLEAN;
3132                                 goto out;
3133                         }
3134                 } else {
3135                         btrfs_set_extent_refs(leaf, ei, refs);
3136                         btrfs_mark_buffer_dirty(leaf);
3137                 }
3138                 if (found_extent) {
3139                         ret = remove_extent_backref(trans, extent_root, path,
3140                                                     iref, refs_to_drop, is_data);
3141                         if (ret) {
3142                                 btrfs_abort_transaction(trans, ret);
3143                                 goto out;
3144                         }
3145                 }
3146         } else {
3147                 /* In this branch refs == 1 */
3148                 if (found_extent) {
3149                         if (is_data && refs_to_drop !=
3150                             extent_data_ref_count(path, iref)) {
3151                                 abort_and_dump(trans, path,
3152                 "invalid refs_to_drop, current refs %u refs_to_drop %u slot %u",
3153                                                extent_data_ref_count(path, iref),
3154                                                refs_to_drop, path->slots[0]);
3155                                 ret = -EUCLEAN;
3156                                 goto out;
3157                         }
3158                         if (iref) {
3159                                 if (path->slots[0] != extent_slot) {
3160                                         abort_and_dump(trans, path,
3161 "invalid iref, extent item key (%llu %u %llu) slot %u doesn't have wanted iref",
3162                                                        key.objectid, key.type,
3163                                                        key.offset, path->slots[0]);
3164                                         ret = -EUCLEAN;
3165                                         goto out;
3166                                 }
3167                         } else {
3168                                 /*
3169                                  * No inline ref, so we must be at a SHARED_* item,
3170                                  * and since it's a single ref, the layout must be:
3171                                  * |    extent_slot       ||extent_slot + 1|
3172                                  * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ]
3173                                  */
3174                                 if (path->slots[0] != extent_slot + 1) {
3175                                         abort_and_dump(trans, path,
3176         "invalid SHARED_* item slot %u, previous item is not EXTENT/METADATA_ITEM",
3177                                                        path->slots[0]);
3178                                         ret = -EUCLEAN;
3179                                         goto out;
3180                                 }
3181                                 path->slots[0] = extent_slot;
3182                                 num_to_del = 2;
3183                         }
3184                 }
3185
3186                 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
3187                                       num_to_del);
3188                 if (ret) {
3189                         btrfs_abort_transaction(trans, ret);
3190                         goto out;
3191                 }
3192                 btrfs_release_path(path);
3193
3194                 ret = do_free_extent_accounting(trans, bytenr, num_bytes, is_data);
3195         }
3196         btrfs_release_path(path);
3197
3198 out:
3199         btrfs_free_path(path);
3200         return ret;
3201 }
3202
3203 /*
3204  * When we free a block, it is possible (and likely) that we free the last
3205  * delayed ref for that extent as well.  This searches the delayed ref tree for
3206  * a given extent, and if there are no other delayed refs to be processed, it
3207  * removes it from the tree.
3208  */
3209 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
3210                                       u64 bytenr)
3211 {
3212         struct btrfs_delayed_ref_head *head;
3213         struct btrfs_delayed_ref_root *delayed_refs;
3214         int ret = 0;
3215
3216         delayed_refs = &trans->transaction->delayed_refs;
3217         spin_lock(&delayed_refs->lock);
3218         head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
3219         if (!head)
3220                 goto out_delayed_unlock;
3221
3222         spin_lock(&head->lock);
3223         if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
3224                 goto out;
3225
3226         if (cleanup_extent_op(head) != NULL)
3227                 goto out;
3228
3229         /*
3230          * Waiting for the lock here would deadlock.  If someone else has it
3231          * locked, they are already in the process of dropping it anyway.
3232          */
3233         if (!mutex_trylock(&head->mutex))
3234                 goto out;
3235
3236         btrfs_delete_ref_head(delayed_refs, head);
3237         head->processing = false;
3238
3239         spin_unlock(&head->lock);
3240         spin_unlock(&delayed_refs->lock);
3241
3242         BUG_ON(head->extent_op);
3243         if (head->must_insert_reserved)
3244                 ret = 1;
3245
3246         btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
3247         mutex_unlock(&head->mutex);
3248         btrfs_put_delayed_ref_head(head);
3249         return ret;
3250 out:
3251         spin_unlock(&head->lock);
3252
3253 out_delayed_unlock:
3254         spin_unlock(&delayed_refs->lock);
3255         return 0;
3256 }
3257
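/*
 * Drop a reference on a tree block: queue a delayed ref for non-log trees,
 * and if this was the last reference to a block allocated in the current
 * transaction, either return it directly to the free space cache or pin it
 * (when it was already written, tree mod log users exist, or the filesystem
 * is zoned).
 */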
3258 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
3259                            u64 root_id,
3260                            struct extent_buffer *buf,
3261                            u64 parent, int last_ref)
3262 {
3263         struct btrfs_fs_info *fs_info = trans->fs_info;
3264         struct btrfs_ref generic_ref = { 0 };
3265         int ret;
3266
3267         btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
3268                                buf->start, buf->len, parent);
3269         btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
3270                             root_id, 0, false);
3271
3272         if (root_id != BTRFS_TREE_LOG_OBJECTID) {
3273                 btrfs_ref_tree_mod(fs_info, &generic_ref);
3274                 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL);
3275                 BUG_ON(ret); /* -ENOMEM */
3276         }
3277
3278         if (last_ref && btrfs_header_generation(buf) == trans->transid) {
3279                 struct btrfs_block_group *cache;
3280                 bool must_pin = false;
3281
3282                 if (root_id != BTRFS_TREE_LOG_OBJECTID) {
3283                         ret = check_ref_cleanup(trans, buf->start);
3284                         if (!ret) {
3285                                 btrfs_redirty_list_add(trans->transaction, buf);
3286                                 goto out;
3287                         }
3288                 }
3289
3290                 cache = btrfs_lookup_block_group(fs_info, buf->start);
3291
3292                 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
3293                         pin_down_extent(trans, cache, buf->start, buf->len, 1);
3294                         btrfs_put_block_group(cache);
3295                         goto out;
3296                 }
3297
3298                 /*
3299                  * If there are tree mod log users we may have recorded mod log
3300                  * operations for this node.  If we re-allocate this node we
3301                  * could replay operations on this node that happened when it
3302                  * existed in a completely different root.  For example if it
3303                  * was part of root A, then was reallocated to root B, and we
3304                  * are doing a btrfs_old_search_slot(root b), we could replay
3305                  * operations that happened when the block was part of root A,
3306                  * giving us an inconsistent view of the btree.
3307                  *
3308                  * We are safe from races here because at this point no other
3309                  * node or root points to this extent buffer, so if after this
3310                  * check a new tree mod log user joins we will not have an
3311                  * existing log of operations on this node that we have to
3312                  * contend with.
3313                  */
3314                 if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
3315                         must_pin = true;
3316
3317                 if (must_pin || btrfs_is_zoned(fs_info)) {
3318                         btrfs_redirty_list_add(trans->transaction, buf);
3319                         pin_down_extent(trans, cache, buf->start, buf->len, 1);
3320                         btrfs_put_block_group(cache);
3321                         goto out;
3322                 }
3323
3324                 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
3325
3326                 btrfs_add_free_space(cache, buf->start, buf->len);
3327                 btrfs_free_reserved_bytes(cache, buf->len, 0);
3328                 btrfs_put_block_group(cache);
3329                 trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
3330         }
3331 out:
3332         if (last_ref) {
3333                 /*
3334                  * Deleting the buffer, clear the corrupt flag since it doesn't
3335                  * matter anymore.
3336                  */
3337                 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
3338         }
3339 }
3340
3341 /* Can return -ENOMEM */
3342 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
3343 {
3344         struct btrfs_fs_info *fs_info = trans->fs_info;
3345         int ret;
3346
3347         if (btrfs_is_testing(fs_info))
3348                 return 0;
3349
3350         /*
3351          * tree log blocks never actually go into the extent allocation
3352          * tree, just update pinning info and exit early.
3353          */
3354         if ((ref->type == BTRFS_REF_METADATA &&
3355              ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
3356             (ref->type == BTRFS_REF_DATA &&
3357              ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)) {
3358                 /* unlocks the pinned mutex */
3359                 btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);
3360                 ret = 0;
3361         } else if (ref->type == BTRFS_REF_METADATA) {
3362                 ret = btrfs_add_delayed_tree_ref(trans, ref, NULL);
3363         } else {
3364                 ret = btrfs_add_delayed_data_ref(trans, ref, 0);
3365         }
3366
3367         if (!((ref->type == BTRFS_REF_METADATA &&
3368                ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
3369               (ref->type == BTRFS_REF_DATA &&
3370                ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)))
3371                 btrfs_ref_tree_mod(fs_info, ref);
3372
3373         return ret;
3374 }
3375
3376 enum btrfs_loop_type {
3377         /*
3378          * Start caching block groups but do not wait for progress or for them
3379          * to be done.
3380          */
3381         LOOP_CACHING_NOWAIT,
3382
3383         /*
3384          * If the block group isn't cached, wait until its cached free_space is
3385          * at least as large as the space we're looking for.
3386          */
3387         LOOP_CACHING_WAIT,
3388
3389         /*
3390          * Allow allocations to happen from block groups that do not yet have a
3391          * size classification.
3392          */
3393         LOOP_UNSET_SIZE_CLASS,
3394
3395         /*
3396          * Allocate a chunk and then retry the allocation.
3397          */
3398         LOOP_ALLOC_CHUNK,
3399
3400         /*
3401          * Ignore the size class restrictions for this allocation.
3402          */
3403         LOOP_WRONG_SIZE_CLASS,
3404
3405         /*
3406          * Ignore the empty size, only try to allocate the number of bytes
3407          * needed for this allocation.
3408          */
3409         LOOP_NO_EMPTY_SIZE,
3410 };
3411
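/*
 * find_free_extent() helpers: for delalloc allocations the block group's
 * data_rwsem is held for reading while the group is being searched, so the
 * lock and the reference must be dropped together via
 * btrfs_release_block_group().
 */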
3412 static inline void
3413 btrfs_lock_block_group(struct btrfs_block_group *cache,
3414                        int delalloc)
3415 {
3416         if (delalloc)
3417                 down_read(&cache->data_rwsem);
3418 }
3419
3420 static inline void btrfs_grab_block_group(struct btrfs_block_group *cache,
3421                        int delalloc)
3422 {
3423         btrfs_get_block_group(cache);
3424         if (delalloc)
3425                 down_read(&cache->data_rwsem);
3426 }
3427
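/*
 * Grab the block group currently backing @cluster, returning with the
 * cluster's refill_lock held.  Returns NULL if the cluster has no block
 * group, the passed-in block group itself, or another block group with an
 * extra reference taken (and, for delalloc, its data_rwsem held for reading).
 */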
3428 static struct btrfs_block_group *btrfs_lock_cluster(
3429                    struct btrfs_block_group *block_group,
3430                    struct btrfs_free_cluster *cluster,
3431                    int delalloc)
3432         __acquires(&cluster->refill_lock)
3433 {
3434         struct btrfs_block_group *used_bg = NULL;
3435
3436         spin_lock(&cluster->refill_lock);
3437         while (1) {
3438                 used_bg = cluster->block_group;
3439                 if (!used_bg)
3440                         return NULL;
3441
3442                 if (used_bg == block_group)
3443                         return used_bg;
3444
3445                 btrfs_get_block_group(used_bg);
3446
3447                 if (!delalloc)
3448                         return used_bg;
3449
3450                 if (down_read_trylock(&used_bg->data_rwsem))
3451                         return used_bg;
3452
3453                 spin_unlock(&cluster->refill_lock);
3454
3455                 /* We should only have one level of nesting. */
3456                 down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
3457
3458                 spin_lock(&cluster->refill_lock);
3459                 if (used_bg == cluster->block_group)
3460                         return used_bg;
3461
3462                 up_read(&used_bg->data_rwsem);
3463                 btrfs_put_block_group(used_bg);
3464         }
3465 }
3466
3467 static inline void
3468 btrfs_release_block_group(struct btrfs_block_group *cache,
3469                          int delalloc)
3470 {
3471         if (delalloc)
3472                 up_read(&cache->data_rwsem);
3473         btrfs_put_block_group(cache);
3474 }
3475
3476 /*
3477  * Helper function for find_free_extent().
3478  *
3479  * Return -ENOENT to inform the caller that we need to fall back to unclustered mode.
3480  * Return >0 to inform the caller that we found nothing.
3481  * Return 0 when we have found a location and set ffe_ctl->found_offset.
3482  */
3483 static int find_free_extent_clustered(struct btrfs_block_group *bg,
3484                                       struct find_free_extent_ctl *ffe_ctl,
3485                                       struct btrfs_block_group **cluster_bg_ret)
3486 {
3487         struct btrfs_block_group *cluster_bg;
3488         struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3489         u64 aligned_cluster;
3490         u64 offset;
3491         int ret;
3492
3493         cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
3494         if (!cluster_bg)
3495                 goto refill_cluster;
3496         if (cluster_bg != bg && (cluster_bg->ro ||
3497             !block_group_bits(cluster_bg, ffe_ctl->flags)))
3498                 goto release_cluster;
3499
3500         offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
3501                         ffe_ctl->num_bytes, cluster_bg->start,
3502                         &ffe_ctl->max_extent_size);
3503         if (offset) {
3504                 /* We have a block, we're done */
3505                 spin_unlock(&last_ptr->refill_lock);
3506                 trace_btrfs_reserve_extent_cluster(cluster_bg, ffe_ctl);
3507                 *cluster_bg_ret = cluster_bg;
3508                 ffe_ctl->found_offset = offset;
3509                 return 0;
3510         }
3511         WARN_ON(last_ptr->block_group != cluster_bg);
3512
3513 release_cluster:
3514         /*
3515          * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new cluster, so
3516          * let's just skip it and let the allocator find whatever block it can
3517          * find. If we reach this point, we will have tried the cluster
3518          * allocator plenty of times and not have found anything, so we are
3519          * likely way too fragmented for the clustering stuff to find anything.
3520          *
3521          * However, if the cluster is taken from the current block group,
3522          * release the cluster first, so that we stand a better chance of
3523          * succeeding in the unclustered allocation.
3524          */
3525         if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
3526                 spin_unlock(&last_ptr->refill_lock);
3527                 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
3528                 return -ENOENT;
3529         }
3530
3531         /* This cluster didn't work out, free it and start over */
3532         btrfs_return_cluster_to_free_space(NULL, last_ptr);
3533
3534         if (cluster_bg != bg)
3535                 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
3536
3537 refill_cluster:
3538         if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
3539                 spin_unlock(&last_ptr->refill_lock);
3540                 return -ENOENT;
3541         }
3542
3543         aligned_cluster = max_t(u64,
3544                         ffe_ctl->empty_cluster + ffe_ctl->empty_size,
3545                         bg->full_stripe_len);
3546         ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
3547                         ffe_ctl->num_bytes, aligned_cluster);
3548         if (ret == 0) {
3549                 /* Now pull our allocation out of this cluster */
3550                 offset = btrfs_alloc_from_cluster(bg, last_ptr,
3551                                 ffe_ctl->num_bytes, ffe_ctl->search_start,
3552                                 &ffe_ctl->max_extent_size);
3553                 if (offset) {
3554                         /* We found one, proceed */
3555                         spin_unlock(&last_ptr->refill_lock);
3556                         ffe_ctl->found_offset = offset;
3557                         trace_btrfs_reserve_extent_cluster(bg, ffe_ctl);
3558                         return 0;
3559                 }
3560         }
3561         /*
3562          * At this point we either didn't find a cluster or we weren't able to
3563          * allocate a block from our cluster.  Free the cluster we've been
3564          * trying to use, and go to the next block group.
3565          */
3566         btrfs_return_cluster_to_free_space(NULL, last_ptr);
3567         spin_unlock(&last_ptr->refill_lock);
3568         return 1;
3569 }
3570
3571 /*
3572  * Return >0 to inform the caller that we found nothing.
3573  * Return 0 when we found a free extent and set ffe_ctl->found_offset.
3574  */
3575 static int find_free_extent_unclustered(struct btrfs_block_group *bg,
3576                                         struct find_free_extent_ctl *ffe_ctl)
3577 {
3578         struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3579         u64 offset;
3580
3581         /*
3582          * We are doing an unclustered allocation, set the fragmented flag so
3583          * we don't bother trying to set up a cluster again until we get more
3584          * space.
3585          */
3586         if (unlikely(last_ptr)) {
3587                 spin_lock(&last_ptr->lock);
3588                 last_ptr->fragmented = 1;
3589                 spin_unlock(&last_ptr->lock);
3590         }
3591         if (ffe_ctl->cached) {
3592                 struct btrfs_free_space_ctl *free_space_ctl;
3593
3594                 free_space_ctl = bg->free_space_ctl;
3595                 spin_lock(&free_space_ctl->tree_lock);
3596                 if (free_space_ctl->free_space <
3597                     ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
3598                     ffe_ctl->empty_size) {
3599                         ffe_ctl->total_free_space = max_t(u64,
3600                                         ffe_ctl->total_free_space,
3601                                         free_space_ctl->free_space);
3602                         spin_unlock(&free_space_ctl->tree_lock);
3603                         return 1;
3604                 }
3605                 spin_unlock(&free_space_ctl->tree_lock);
3606         }
3607
3608         offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
3609                         ffe_ctl->num_bytes, ffe_ctl->empty_size,
3610                         &ffe_ctl->max_extent_size);
3611         if (!offset)
3612                 return 1;
3613         ffe_ctl->found_offset = offset;
3614         return 0;
3615 }
3616
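/*
 * Clustered allocation policy: try the free cluster first and fall back to an
 * unclustered search of the block group's free space when that fails.
 */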
3617 static int do_allocation_clustered(struct btrfs_block_group *block_group,
3618                                    struct find_free_extent_ctl *ffe_ctl,
3619                                    struct btrfs_block_group **bg_ret)
3620 {
3621         int ret;
3622
3623         /* We want to try and use the cluster allocator, so let's look there */
3624         if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) {
3625                 ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret);
3626                 if (ret >= 0)
3627                         return ret;
3628                 /* ret == -ENOENT case falls through */
3629         }
3630
3631         return find_free_extent_unclustered(block_group, ffe_ctl);
3632 }
3633
3634 /*
3635  * Tree-log block group locking
3636  * ============================
3637  *
3638  * fs_info::treelog_bg_lock protects the fs_info::treelog_bg which
3639  * indicates the starting address of a block group, which is reserved only
3640  * for tree-log metadata.
3641  *
3642  * Lock nesting
3643  * ============
3644  *
3645  * space_info::lock
3646  *   block_group::lock
3647  *     fs_info::treelog_bg_lock
3648  */
3649
3650 /*
3651  * Simple allocator for sequential-only block group. It only allows sequential
3652  * allocation. No need to play with trees. This function also reserves the
3653  * bytes as in btrfs_add_reserved_bytes.
3654  */
3655 static int do_allocation_zoned(struct btrfs_block_group *block_group,
3656                                struct find_free_extent_ctl *ffe_ctl,
3657                                struct btrfs_block_group **bg_ret)
3658 {
3659         struct btrfs_fs_info *fs_info = block_group->fs_info;
3660         struct btrfs_space_info *space_info = block_group->space_info;
3661         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3662         u64 start = block_group->start;
3663         u64 num_bytes = ffe_ctl->num_bytes;
3664         u64 avail;
3665         u64 bytenr = block_group->start;
3666         u64 log_bytenr;
3667         u64 data_reloc_bytenr;
3668         int ret = 0;
3669         bool skip = false;
3670
3671         ASSERT(btrfs_is_zoned(block_group->fs_info));
3672
3673         /*
3674          * Do not allow non-tree-log blocks in the dedicated tree-log block
3675          * group, and vice versa.
3676          */
3677         spin_lock(&fs_info->treelog_bg_lock);
3678         log_bytenr = fs_info->treelog_bg;
3679         if (log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) ||
3680                            (!ffe_ctl->for_treelog && bytenr == log_bytenr)))
3681                 skip = true;
3682         spin_unlock(&fs_info->treelog_bg_lock);
3683         if (skip)
3684                 return 1;
3685
3686         /*
3687          * Do not allow non-relocation blocks in the dedicated relocation block
3688          * group, and vice versa.
3689          */
3690         spin_lock(&fs_info->relocation_bg_lock);
3691         data_reloc_bytenr = fs_info->data_reloc_bg;
3692         if (data_reloc_bytenr &&
3693             ((ffe_ctl->for_data_reloc && bytenr != data_reloc_bytenr) ||
3694              (!ffe_ctl->for_data_reloc && bytenr == data_reloc_bytenr)))
3695                 skip = true;
3696         spin_unlock(&fs_info->relocation_bg_lock);
3697         if (skip)
3698                 return 1;
3699
3700         /* Check RO and no space case before trying to activate it */
3701         spin_lock(&block_group->lock);
3702         if (block_group->ro || btrfs_zoned_bg_is_full(block_group)) {
3703                 ret = 1;
3704                 /*
3705                  * May need to clear fs_info->{treelog,data_reloc}_bg.
3706                  * Return the error after taking the locks.
3707                  */
3708         }
3709         spin_unlock(&block_group->lock);
3710
3711         /* Metadata block group is activated at write time. */
3712         if (!ret && (block_group->flags & BTRFS_BLOCK_GROUP_DATA) &&
3713             !btrfs_zone_activate(block_group)) {
3714                 ret = 1;
3715                 /*
3716                  * May need to clear fs_info->{treelog,data_reloc}_bg.
3717                  * Return the error after taking the locks.
3718                  */
3719         }
3720
3721         spin_lock(&space_info->lock);
3722         spin_lock(&block_group->lock);
3723         spin_lock(&fs_info->treelog_bg_lock);
3724         spin_lock(&fs_info->relocation_bg_lock);
3725
3726         if (ret)
3727                 goto out;
3728
3729         ASSERT(!ffe_ctl->for_treelog ||
3730                block_group->start == fs_info->treelog_bg ||
3731                fs_info->treelog_bg == 0);
3732         ASSERT(!ffe_ctl->for_data_reloc ||
3733                block_group->start == fs_info->data_reloc_bg ||
3734                fs_info->data_reloc_bg == 0);
3735
3736         if (block_group->ro ||
3737             (!ffe_ctl->for_data_reloc &&
3738              test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))) {
3739                 ret = 1;
3740                 goto out;
3741         }
3742
3743         /*
3744          * Do not allow a block group that is currently in use to become the
3745          * dedicated tree-log block group.
3746          */
3747         if (ffe_ctl->for_treelog && !fs_info->treelog_bg &&
3748             (block_group->used || block_group->reserved)) {
3749                 ret = 1;
3750                 goto out;
3751         }
3752
3753         /*
3754          * Do not allow a block group that is currently in use to become the
3755          * dedicated data relocation block group.
3756          */
3757         if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg &&
3758             (block_group->used || block_group->reserved)) {
3759                 ret = 1;
3760                 goto out;
3761         }
3762
3763         WARN_ON_ONCE(block_group->alloc_offset > block_group->zone_capacity);
3764         avail = block_group->zone_capacity - block_group->alloc_offset;
3765         if (avail < num_bytes) {
3766                 if (ffe_ctl->max_extent_size < avail) {
3767                         /*
3768                          * With the sequential allocator, free space is always
3769                          * contiguous.
3770                          */
3771                         ffe_ctl->max_extent_size = avail;
3772                         ffe_ctl->total_free_space = avail;
3773                 }
3774                 ret = 1;
3775                 goto out;
3776         }
3777
3778         if (ffe_ctl->for_treelog && !fs_info->treelog_bg)
3779                 fs_info->treelog_bg = block_group->start;
3780
3781         if (ffe_ctl->for_data_reloc) {
3782                 if (!fs_info->data_reloc_bg)
3783                         fs_info->data_reloc_bg = block_group->start;
3784                 /*
3785                  * Do not allow allocations from this block group, unless it is
3786                  * for data relocation. Compared to increasing the ->ro, setting
3787                  * the BLOCK_GROUP_FLAG_ZONED_DATA_RELOC flag still allows nocow
3788                  * writers to come in. See btrfs_inc_nocow_writers().
3789                  *
3790                  * We need to disable allocations here to avoid allocating a
3791                  * regular (non-relocation) data extent. With a mix of relocation
3792                  * extents and regular extents, we could dispatch WRITE commands
3793                  * (for relocation extents) and ZONE APPEND commands (for
3794                  * regular extents) at the same time to the same zone, which
3795                  * easily breaks the write pointer.
3796                  *
3797                  * Also, this flag prevents this block group from being zone finished.
3798                  */
3799                 set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags);
3800         }
3801
3802         ffe_ctl->found_offset = start + block_group->alloc_offset;
3803         block_group->alloc_offset += num_bytes;
3804         spin_lock(&ctl->tree_lock);
3805         ctl->free_space -= num_bytes;
3806         spin_unlock(&ctl->tree_lock);
3807
3808         /*
3809          * We do not check if found_offset is aligned to stripesize. The
3810          * address is rewritten anyway when using zone append writes.
3811          */
3812
3813         ffe_ctl->search_start = ffe_ctl->found_offset;
3814
3815 out:
3816         if (ret && ffe_ctl->for_treelog)
3817                 fs_info->treelog_bg = 0;
3818         if (ret && ffe_ctl->for_data_reloc)
3819                 fs_info->data_reloc_bg = 0;
3820         spin_unlock(&fs_info->relocation_bg_lock);
3821         spin_unlock(&fs_info->treelog_bg_lock);
3822         spin_unlock(&block_group->lock);
3823         spin_unlock(&space_info->lock);
3824         return ret;
3825 }
3826
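/* Dispatch the per-block-group allocation attempt based on the policy. */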
3827 static int do_allocation(struct btrfs_block_group *block_group,
3828                          struct find_free_extent_ctl *ffe_ctl,
3829                          struct btrfs_block_group **bg_ret)
3830 {
3831         switch (ffe_ctl->policy) {
3832         case BTRFS_EXTENT_ALLOC_CLUSTERED:
3833                 return do_allocation_clustered(block_group, ffe_ctl, bg_ret);
3834         case BTRFS_EXTENT_ALLOC_ZONED:
3835                 return do_allocation_zoned(block_group, ffe_ctl, bg_ret);
3836         default:
3837                 BUG();
3838         }
3839 }
3840
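/*
 * Done with this block group for the current search round: reset any
 * per-policy retry state and drop the locks and reference taken when the
 * group was grabbed.
 */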
3841 static void release_block_group(struct btrfs_block_group *block_group,
3842                                 struct find_free_extent_ctl *ffe_ctl,
3843                                 int delalloc)
3844 {
3845         switch (ffe_ctl->policy) {
3846         case BTRFS_EXTENT_ALLOC_CLUSTERED:
3847                 ffe_ctl->retry_uncached = false;
3848                 break;
3849         case BTRFS_EXTENT_ALLOC_ZONED:
3850                 /* Nothing to do */
3851                 break;
3852         default:
3853                 BUG();
3854         }
3855
3856         BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
3857                ffe_ctl->index);
3858         btrfs_release_block_group(block_group, delalloc);
3859 }
3860
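/*
 * Record policy specific state for a successful allocation.  For the
 * clustered policy, remember where an unclustered hit landed so the next
 * cluster setup can start its search there.
 */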
3861 static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl,
3862                                    struct btrfs_key *ins)
3863 {
3864         struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3865
3866         if (!ffe_ctl->use_cluster && last_ptr) {
3867                 spin_lock(&last_ptr->lock);
3868                 last_ptr->window_start = ins->objectid;
3869                 spin_unlock(&last_ptr->lock);
3870         }
3871 }
3872
3873 static void found_extent(struct find_free_extent_ctl *ffe_ctl,
3874                          struct btrfs_key *ins)
3875 {
3876         switch (ffe_ctl->policy) {
3877         case BTRFS_EXTENT_ALLOC_CLUSTERED:
3878                 found_extent_clustered(ffe_ctl, ins);
3879                 break;
3880         case BTRFS_EXTENT_ALLOC_ZONED:
3881                 /* Nothing to do */
3882                 break;
3883         default:
3884                 BUG();
3885         }
3886 }
3887
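/*
 * On a zoned filesystem, decide whether allocating a new chunk can help:
 * return 0 to allow the chunk allocation, or an error (-ENOSPC / -EAGAIN)
 * when activating another zone is not possible and a new chunk would not
 * improve the situation.
 */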
3888 static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info,
3889                                     struct find_free_extent_ctl *ffe_ctl)
3890 {
3891         /* Being active is not a requirement for METADATA block groups. */
3892         if (!(ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA))
3893                 return 0;
3894
3895         /* If we can activate new zone, just allocate a chunk and use it */
3896         if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags))
3897                 return 0;
3898
3899         /*
3900          * We already reached the max active zones. Try to finish one block
3901          * group to make room for a new block group. This is only possible
3902          * for a data block group because btrfs_zone_finish() may need to wait
3903          * for a running transaction, which can cause a deadlock for metadata
3904          * allocation.
3905          */
3906         if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
3907                 int ret = btrfs_zone_finish_one_bg(fs_info);
3908
3909                 if (ret == 1)
3910                         return 0;
3911                 else if (ret < 0)
3912                         return ret;
3913         }
3914
3915         /*
3916          * If we have enough free space left in an already active block group
3917          * and we can't activate any other zone now, do not allow allocating a
3918          * new chunk and let find_free_extent() retry with a smaller size.
3919          */
3920         if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size)
3921                 return -ENOSPC;
3922
3923         /*
3924          * Not even min_alloc_size is left in any block group. Since we cannot
3925          * activate a new block group, allocating one may not help. Let's tell
3926          * the caller to try again and hope it makes progress by writing some
3927          * parts of the region. That is only possible for data block groups,
3928          * where a part of the region can be written.
3929          */
3930         if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA)
3931                 return -EAGAIN;
3932
3933         /*
3934          * We cannot activate a new block group and there is not enough space
3935          * left in any block group, so allocating a new one may not help. But
3936          * there is nothing else to do anyway, so let's go with it.
3937          */
3938         return 0;
3939 }
3940
3941 static int can_allocate_chunk(struct btrfs_fs_info *fs_info,
3942                               struct find_free_extent_ctl *ffe_ctl)
3943 {
3944         switch (ffe_ctl->policy) {
3945         case BTRFS_EXTENT_ALLOC_CLUSTERED:
3946                 return 0;
3947         case BTRFS_EXTENT_ALLOC_ZONED:
3948                 return can_allocate_chunk_zoned(fs_info, ffe_ctl);
3949         default:
3950                 BUG();
3951         }
3952 }
3953
3954 /*
3955  * Return >0 means the caller needs to re-search for a free extent.
3956  * Return 0 means we have the needed free extent.
3957  * Return <0 means we failed to locate any free extent.
3958  */
3959 static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
3960                                         struct btrfs_key *ins,
3961                                         struct find_free_extent_ctl *ffe_ctl,
3962                                         bool full_search)
3963 {
3964         struct btrfs_root *root = fs_info->chunk_root;
3965         int ret;
3966
3967         if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
3968             ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
3969                 ffe_ctl->orig_have_caching_bg = true;
3970
3971         if (ins->objectid) {
3972                 found_extent(ffe_ctl, ins);
3973                 return 0;
3974         }
3975
3976         if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
3977                 return 1;
3978
3979         ffe_ctl->index++;
3980         if (ffe_ctl->index < BTRFS_NR_RAID_TYPES)
3981                 return 1;
3982
3983         /* See the comments for btrfs_loop_type for an explanation of the phases. */
3984         if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
3985                 ffe_ctl->index = 0;
3986                 /*
3987                  * We want to skip the LOOP_CACHING_WAIT step if we don't have
3988                  * any uncached bgs and we've already done a full search
3989                  * through.
3990                  */
3991                 if (ffe_ctl->loop == LOOP_CACHING_NOWAIT &&
3992                     (!ffe_ctl->orig_have_caching_bg && full_search))
3993                         ffe_ctl->loop++;
3994                 ffe_ctl->loop++;
3995
3996                 if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
3997                         struct btrfs_trans_handle *trans;
3998                         int exist = 0;
3999
4000                         /* Check if allocation policy allows to create a new chunk */
4001                         ret = can_allocate_chunk(fs_info, ffe_ctl);
4002                         if (ret)
4003                                 return ret;
4004
4005                         trans = current->journal_info;
4006                         if (trans)
4007                                 exist = 1;
4008                         else
4009                                 trans = btrfs_join_transaction(root);
4010
4011                         if (IS_ERR(trans)) {
4012                                 ret = PTR_ERR(trans);
4013                                 return ret;
4014                         }
4015
4016                         ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
4017                                                 CHUNK_ALLOC_FORCE_FOR_EXTENT);
4018
4019                         /* Do not bail out on ENOSPC since we can do more. */
4020                         if (ret == -ENOSPC) {
4021                                 ret = 0;
4022                                 ffe_ctl->loop++;
4023                         } else if (ret < 0)
4025                                 btrfs_abort_transaction(trans, ret);
4026                         else
4027                                 ret = 0;
4028                         if (!exist)
4029                                 btrfs_end_transaction(trans);
4030                         if (ret)
4031                                 return ret;
4032                 }
4033
4034                 if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
4035                         if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED)
4036                                 return -ENOSPC;
4037
4038                         /*
4039                          * Don't loop again if we already have no empty_size and
4040                          * no empty_cluster.
4041                          */
4042                         if (ffe_ctl->empty_size == 0 &&
4043                             ffe_ctl->empty_cluster == 0)
4044                                 return -ENOSPC;
4045                         ffe_ctl->empty_size = 0;
4046                         ffe_ctl->empty_cluster = 0;
4047                 }
4048                 return 1;
4049         }
4050         return -ENOSPC;
4051 }
4052
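/*
 * Check whether a block group's size class is acceptable for the current
 * allocation.  Zoned allocations and block groups that don't use size
 * classes always match; later loop stages progressively relax the check.
 */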
4053 static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl,
4054                                               struct btrfs_block_group *bg)
4055 {
4056         if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED)
4057                 return true;
4058         if (!btrfs_block_group_should_use_size_class(bg))
4059                 return true;
4060         if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS)
4061                 return true;
4062         if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS &&
4063             bg->size_class == BTRFS_BG_SZ_NONE)
4064                 return true;
4065         return ffe_ctl->size_class == bg->size_class;
4066 }
4067
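/*
 * Set up a clustered allocation: return -ENOSPC early if the free space is
 * too fragmented for the requested size, otherwise fetch the free cluster
 * (if any) and use it to seed the allocation hint.
 */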
4068 static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
4069                                         struct find_free_extent_ctl *ffe_ctl,
4070                                         struct btrfs_space_info *space_info,
4071                                         struct btrfs_key *ins)
4072 {
4073         /*
4074          * If our free space is heavily fragmented we may not be able to make
4075          * big contiguous allocations, so instead of doing the expensive search
4076          * for free space, simply return ENOSPC with our max_extent_size so we
4077          * can go ahead and search for a more manageable chunk.
4078          *
4079          * If our max_extent_size is large enough for our allocation, simply
4080          * disable clustering since we will likely not be able to find enough
4081          * space to create a cluster and would only induce latency trying.
4082          */
4083         if (space_info->max_extent_size) {
4084                 spin_lock(&space_info->lock);
4085                 if (space_info->max_extent_size &&
4086                     ffe_ctl->num_bytes > space_info->max_extent_size) {
4087                         ins->offset = space_info->max_extent_size;
4088                         spin_unlock(&space_info->lock);
4089                         return -ENOSPC;
4090                 } else if (space_info->max_extent_size) {
4091                         ffe_ctl->use_cluster = false;
4092                 }
4093                 spin_unlock(&space_info->lock);
4094         }
4095
4096         ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info,
4097                                                &ffe_ctl->empty_cluster);
4098         if (ffe_ctl->last_ptr) {
4099                 struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
4100
4101                 spin_lock(&last_ptr->lock);
4102                 if (last_ptr->block_group)
4103                         ffe_ctl->hint_byte = last_ptr->window_start;
4104                 if (last_ptr->fragmented) {
4105                         /*
4106                          * We still set window_start so we can keep track of the
4107                          * last place we found an allocation to try and save
4108                          * some time.
4109                          */
4110                         ffe_ctl->hint_byte = last_ptr->window_start;
4111                         ffe_ctl->use_cluster = false;
4112                 }
4113                 spin_unlock(&last_ptr->lock);
4114         }
4115
4116         return 0;
4117 }
4118
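/*
 * Policy specific setup before searching the block groups: clustered
 * allocations set up the free cluster, zoned allocations seed the hint byte
 * from the dedicated tree-log or data relocation block group.
 */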
4119 static int prepare_allocation(struct btrfs_fs_info *fs_info,
4120                               struct find_free_extent_ctl *ffe_ctl,
4121                               struct btrfs_space_info *space_info,
4122                               struct btrfs_key *ins)
4123 {
4124         switch (ffe_ctl->policy) {
4125         case BTRFS_EXTENT_ALLOC_CLUSTERED:
4126                 return prepare_allocation_clustered(fs_info, ffe_ctl,
4127                                                     space_info, ins);
4128         case BTRFS_EXTENT_ALLOC_ZONED:
4129                 if (ffe_ctl->for_treelog) {
4130                         spin_lock(&fs_info->treelog_bg_lock);
4131                         if (fs_info->treelog_bg)
4132                                 ffe_ctl->hint_byte = fs_info->treelog_bg;
4133                         spin_unlock(&fs_info->treelog_bg_lock);
4134                 }
4135                 if (ffe_ctl->for_data_reloc) {
4136                         spin_lock(&fs_info->relocation_bg_lock);
4137                         if (fs_info->data_reloc_bg)
4138                                 ffe_ctl->hint_byte = fs_info->data_reloc_bg;
4139                         spin_unlock(&fs_info->relocation_bg_lock);
4140                 }
4141                 return 0;
4142         default:
4143                 BUG();
4144         }
4145 }
4146
4147 /*
4148  * walks the btree of allocated extents and finds a hole of a given size.
4149  * The key ins is changed to record the hole:
4150  * ins->objectid == start position
4151  * ins->type == BTRFS_EXTENT_ITEM_KEY
4152  * ins->offset == the size of the hole.
4153  * Any available blocks before search_start are skipped.
4154  *
4155  * If there is no suitable free space, we record the maximum size of the
4156  * free space extent found during the search.
4157  *
4158  * The overall logic and call chain:
4159  *
4160  * find_free_extent()
4161  * |- Iterate through all block groups
4162  * |  |- Get a valid block group
4163  * |  |- Try to do clustered allocation in that block group
4164  * |  |- Try to do unclustered allocation in that block group
4165  * |  |- Check if the result is valid
4166  * |  |  |- If valid, then exit
4167  * |  |- Jump to next block group
4168  * |
4169  * |- Push harder to find free extents
4170  *    |- If not found, re-iterate all block groups
4171  */
4172 static noinline int find_free_extent(struct btrfs_root *root,
4173                                      struct btrfs_key *ins,
4174                                      struct find_free_extent_ctl *ffe_ctl)
4175 {
4176         struct btrfs_fs_info *fs_info = root->fs_info;
4177         int ret = 0;
4178         int cache_block_group_error = 0;
4179         struct btrfs_block_group *block_group = NULL;
4180         struct btrfs_space_info *space_info;
4181         bool full_search = false;
4182
4183         WARN_ON(ffe_ctl->num_bytes < fs_info->sectorsize);
4184
4185         ffe_ctl->search_start = 0;
4186         /* For clustered allocation */
4187         ffe_ctl->empty_cluster = 0;
4188         ffe_ctl->last_ptr = NULL;
4189         ffe_ctl->use_cluster = true;
4190         ffe_ctl->have_caching_bg = false;
4191         ffe_ctl->orig_have_caching_bg = false;
4192         ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags);
4193         ffe_ctl->loop = 0;
4194         ffe_ctl->retry_uncached = false;
4195         ffe_ctl->cached = 0;
4196         ffe_ctl->max_extent_size = 0;
4197         ffe_ctl->total_free_space = 0;
4198         ffe_ctl->found_offset = 0;
4199         ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
4200         ffe_ctl->size_class = btrfs_calc_block_group_size_class(ffe_ctl->num_bytes);
4201
4202         if (btrfs_is_zoned(fs_info))
4203                 ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED;
4204
4205         ins->type = BTRFS_EXTENT_ITEM_KEY;
4206         ins->objectid = 0;
4207         ins->offset = 0;
4208
4209         trace_find_free_extent(root, ffe_ctl);
4210
4211         space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags);
4212         if (!space_info) {
4213                 btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags);
4214                 return -ENOSPC;
4215         }
4216
4217         ret = prepare_allocation(fs_info, ffe_ctl, space_info, ins);
4218         if (ret < 0)
4219                 return ret;
4220
4221         ffe_ctl->search_start = max(ffe_ctl->search_start,
4222                                     first_logical_byte(fs_info));
4223         ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte);
4224         if (ffe_ctl->search_start == ffe_ctl->hint_byte) {
4225                 block_group = btrfs_lookup_block_group(fs_info,
4226                                                        ffe_ctl->search_start);
4227                 /*
4228                  * we don't want to use the block group if it doesn't match our
4229                  * allocation bits, or if it's not cached.
4230                  *
4231                  * However if we are re-searching with an ideal block group
4232                  * picked out then we don't care that the block group is cached.
4233                  */
4234                 if (block_group && block_group_bits(block_group, ffe_ctl->flags) &&
4235                     block_group->cached != BTRFS_CACHE_NO) {
4236                         down_read(&space_info->groups_sem);
4237                         if (list_empty(&block_group->list) ||
4238                             block_group->ro) {
4239                                 /*
4240                                  * someone is removing this block group,
4241                                  * we can't jump into the have_block_group
4242                                  * target because our list pointers are not
4243                                  * valid
4244                                  */
4245                                 btrfs_put_block_group(block_group);
4246                                 up_read(&space_info->groups_sem);
4247                         } else {
4248                                 ffe_ctl->index = btrfs_bg_flags_to_raid_index(
4249                                                         block_group->flags);
4250                                 btrfs_lock_block_group(block_group,
4251                                                        ffe_ctl->delalloc);
4252                                 ffe_ctl->hinted = true;
4253                                 goto have_block_group;
4254                         }
4255                 } else if (block_group) {
4256                         btrfs_put_block_group(block_group);
4257                 }
4258         }
4259 search:
4260         trace_find_free_extent_search_loop(root, ffe_ctl);
4261         ffe_ctl->have_caching_bg = false;
4262         if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) ||
4263             ffe_ctl->index == 0)
4264                 full_search = true;
4265         down_read(&space_info->groups_sem);
4266         list_for_each_entry(block_group,
4267                             &space_info->block_groups[ffe_ctl->index], list) {
4268                 struct btrfs_block_group *bg_ret;
4269
4270                 ffe_ctl->hinted = false;
4271                 /* If the block group is read-only, we can skip it entirely. */
4272                 if (unlikely(block_group->ro)) {
4273                         if (ffe_ctl->for_treelog)
4274                                 btrfs_clear_treelog_bg(block_group);
4275                         if (ffe_ctl->for_data_reloc)
4276                                 btrfs_clear_data_reloc_bg(block_group);
4277                         continue;
4278                 }
4279
4280                 btrfs_grab_block_group(block_group, ffe_ctl->delalloc);
4281                 ffe_ctl->search_start = block_group->start;
4282
4283                 /*
4284                  * this can happen if we end up cycling through all the
4285                  * raid types, but we want to make sure we only allocate
4286                  * for the proper type.
4287                  */
4288                 if (!block_group_bits(block_group, ffe_ctl->flags)) {
4289                         u64 extra = BTRFS_BLOCK_GROUP_DUP |
4290                                 BTRFS_BLOCK_GROUP_RAID1_MASK |
4291                                 BTRFS_BLOCK_GROUP_RAID56_MASK |
4292                                 BTRFS_BLOCK_GROUP_RAID10;
4293
4294                         /*
4295                          * if they asked for extra copies and this block group
4296                          * doesn't provide them, bail.  This does allow us to
4297                          * fill raid0 from raid1.
4298                          */
4299                         if ((ffe_ctl->flags & extra) && !(block_group->flags & extra))
4300                                 goto loop;
4301
4302                         /*
4303                          * This block group has different flags than we want.
4304                          * It's possible that we have MIXED_GROUP flag but no
4305                          * block group is mixed.  Just skip such block group.
4306                          * block group is mixed.  Just skip such a block group.
4307                         btrfs_release_block_group(block_group, ffe_ctl->delalloc);
4308                         continue;
4309                 }
4310
4311 have_block_group:
4312                 trace_find_free_extent_have_block_group(root, ffe_ctl, block_group);
4313                 ffe_ctl->cached = btrfs_block_group_done(block_group);
4314                 if (unlikely(!ffe_ctl->cached)) {
4315                         ffe_ctl->have_caching_bg = true;
4316                         ret = btrfs_cache_block_group(block_group, false);
4317
4318                         /*
4319                          * If we get ENOMEM here or something else we want to
4320                          * try other block groups, because it may not be fatal.
4321                          * However if we can't find anything else we need to
4322                          * save our return here so that we return the actual
4323                          * error that caused problems, not ENOSPC.
4324                          */
4325                         if (ret < 0) {
4326                                 if (!cache_block_group_error)
4327                                         cache_block_group_error = ret;
4328                                 ret = 0;
4329                                 goto loop;
4330                         }
4331                         ret = 0;
4332                 }
4333
4334                 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) {
4335                         if (!cache_block_group_error)
4336                                 cache_block_group_error = -EIO;
4337                         goto loop;
4338                 }
4339
4340                 if (!find_free_extent_check_size_class(ffe_ctl, block_group))
4341                         goto loop;
4342
4343                 bg_ret = NULL;
4344                 ret = do_allocation(block_group, ffe_ctl, &bg_ret);
4345                 if (ret > 0)
4346                         goto loop;
4347
4348                 if (bg_ret && bg_ret != block_group) {
4349                         btrfs_release_block_group(block_group, ffe_ctl->delalloc);
4350                         block_group = bg_ret;
4351                 }
4352
4353                 /* Checks */
4354                 ffe_ctl->search_start = round_up(ffe_ctl->found_offset,
4355                                                  fs_info->stripesize);
4356
4357                 /* move on to the next group */
4358                 if (ffe_ctl->search_start + ffe_ctl->num_bytes >
4359                     block_group->start + block_group->length) {
4360                         btrfs_add_free_space_unused(block_group,
4361                                             ffe_ctl->found_offset,
4362                                             ffe_ctl->num_bytes);
4363                         goto loop;
4364                 }
4365
4366                 if (ffe_ctl->found_offset < ffe_ctl->search_start)
4367                         btrfs_add_free_space_unused(block_group,
4368                                         ffe_ctl->found_offset,
4369                                         ffe_ctl->search_start - ffe_ctl->found_offset);
4370
4371                 ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes,
4372                                                ffe_ctl->num_bytes,
4373                                                ffe_ctl->delalloc,
4374                                                ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS);
4375                 if (ret == -EAGAIN) {
4376                         btrfs_add_free_space_unused(block_group,
4377                                         ffe_ctl->found_offset,
4378                                         ffe_ctl->num_bytes);
4379                         goto loop;
4380                 }
4381                 btrfs_inc_block_group_reservations(block_group);
4382
4383                 /* we are all good, let's return */
4384                 ins->objectid = ffe_ctl->search_start;
4385                 ins->offset = ffe_ctl->num_bytes;
4386
4387                 trace_btrfs_reserve_extent(block_group, ffe_ctl);
4388                 btrfs_release_block_group(block_group, ffe_ctl->delalloc);
4389                 break;
4390 loop:
4391                 if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
4392                     !ffe_ctl->retry_uncached) {
4393                         ffe_ctl->retry_uncached = true;
4394                         btrfs_wait_block_group_cache_progress(block_group,
4395                                                 ffe_ctl->num_bytes +
4396                                                 ffe_ctl->empty_cluster +
4397                                                 ffe_ctl->empty_size);
4398                         goto have_block_group;
4399                 }
4400                 release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc);
4401                 cond_resched();
4402         }
4403         up_read(&space_info->groups_sem);
4404
4405         ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search);
4406         if (ret > 0)
4407                 goto search;
4408
4409         if (ret == -ENOSPC && !cache_block_group_error) {
4410                 /*
4411                  * Use ffe_ctl->total_free_space as fallback if we can't find
4412                  * any contiguous hole.
4413                  */
4414                 if (!ffe_ctl->max_extent_size)
4415                         ffe_ctl->max_extent_size = ffe_ctl->total_free_space;
4416                 spin_lock(&space_info->lock);
4417                 space_info->max_extent_size = ffe_ctl->max_extent_size;
4418                 spin_unlock(&space_info->lock);
4419                 ins->offset = ffe_ctl->max_extent_size;
4420         } else if (ret == -ENOSPC) {
4421                 ret = cache_block_group_error;
4422         }
4423         return ret;
4424 }
4425
4426 /*
4427  * Entry point to the extent allocator. Tries to find a hole that is at least
4428  * as big as @num_bytes.
4429  *
4430  * @root           -    The root that will contain this extent
4431  *
4432  * @ram_bytes      -    The amount of space in RAM that @num_bytes takes. This
4433  *                      is used for accounting purposes. This value differs
4434  *                      from @num_bytes only in the case of compressed extents.
4435  *
4436  * @num_bytes      -    Number of bytes to allocate on-disk.
4437  *
4438  * @min_alloc_size -    Indicates the minimum amount of space that the
4439  *                      allocator should try to satisfy. In some cases
4440  *                      @num_bytes may be larger than what is required and, if
4441  *                      the filesystem is fragmented, the allocation fails.
4442  *                      However, the presence of @min_alloc_size gives a
4443  *                      chance to try and satisfy the smaller allocation.
4444  *
4445  * @empty_size     -    A hint that you plan on doing more COW. This is the
4446  *                      size in bytes of free space the allocator should try to
4447  *                      find next to the block it returns.  This is just a hint and
4448  *                      may be ignored by the allocator.
4449  *
4450  * @hint_byte      -    Hint to the allocator to start searching above the byte
4451  *                      address passed. It might be ignored.
4452  *
4453  * @ins            -    This key is modified to record the found hole. It will
4454  *                      have the following values:
4455  *                      ins->objectid == start position
4456  *                      ins->flags = BTRFS_EXTENT_ITEM_KEY
4457  *                      ins->type == BTRFS_EXTENT_ITEM_KEY
4458  *
4459  * @is_data        -    Boolean flag indicating whether an extent is
4460  *                      allocated for data (true) or metadata (false)
4461  *
4462  * @delalloc       -    Boolean flag indicating whether this allocation is for
4463  *                      delalloc or not. If 'true' data_rwsem of block groups
4464  *                      is going to be acquired.
4465  *
4466  *
4467  * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
4468  * case -ENOSPC is returned then @ins->offset will contain the size of the
4469  * largest available hole the allocator managed to find.
4470  */
4471 int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
4472                          u64 num_bytes, u64 min_alloc_size,
4473                          u64 empty_size, u64 hint_byte,
4474                          struct btrfs_key *ins, int is_data, int delalloc)
4475 {
4476         struct btrfs_fs_info *fs_info = root->fs_info;
4477         struct find_free_extent_ctl ffe_ctl = {};
4478         bool final_tried = num_bytes == min_alloc_size;
4479         u64 flags;
4480         int ret;
4481         bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
4482         bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data);
4483
4484         flags = get_alloc_profile_by_root(root, is_data);
4485 again:
4486         WARN_ON(num_bytes < fs_info->sectorsize);
4487
4488         ffe_ctl.ram_bytes = ram_bytes;
4489         ffe_ctl.num_bytes = num_bytes;
4490         ffe_ctl.min_alloc_size = min_alloc_size;
4491         ffe_ctl.empty_size = empty_size;
4492         ffe_ctl.flags = flags;
4493         ffe_ctl.delalloc = delalloc;
4494         ffe_ctl.hint_byte = hint_byte;
4495         ffe_ctl.for_treelog = for_treelog;
4496         ffe_ctl.for_data_reloc = for_data_reloc;
4497
4498         ret = find_free_extent(root, ins, &ffe_ctl);
4499         if (!ret && !is_data) {
4500                 btrfs_dec_block_group_reservations(fs_info, ins->objectid);
4501         } else if (ret == -ENOSPC) {
4502                 if (!final_tried && ins->offset) {
4503                         num_bytes = min(num_bytes >> 1, ins->offset);
4504                         num_bytes = round_down(num_bytes,
4505                                                fs_info->sectorsize);
4506                         num_bytes = max(num_bytes, min_alloc_size);
4507                         ram_bytes = num_bytes;
4508                         if (num_bytes == min_alloc_size)
4509                                 final_tried = true;
4510                         goto again;
4511                 } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
4512                         struct btrfs_space_info *sinfo;
4513
4514                         sinfo = btrfs_find_space_info(fs_info, flags);
4515                         btrfs_err(fs_info,
4516         "allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d",
4517                                   flags, num_bytes, for_treelog, for_data_reloc);
4518                         if (sinfo)
4519                                 btrfs_dump_space_info(fs_info, sinfo,
4520                                                       num_bytes, 1);
4521                 }
4522         }
4523
4524         return ret;
4525 }
4526
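/*
 * Return a reserved but unused extent to the free space of its block group
 * and release the reservation.
 */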
4527 int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
4528                                u64 start, u64 len, int delalloc)
4529 {
4530         struct btrfs_block_group *cache;
4531
4532         cache = btrfs_lookup_block_group(fs_info, start);
4533         if (!cache) {
4534                 btrfs_err(fs_info, "Unable to find block group for %llu",
4535                           start);
4536                 return -ENOSPC;
4537         }
4538
4539         btrfs_add_free_space(cache, start, len);
4540         btrfs_free_reserved_bytes(cache, len, delalloc);
4541         trace_btrfs_reserved_extent_free(fs_info, start, len);
4542
4543         btrfs_put_block_group(cache);
4544         return 0;
4545 }
4546
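/*
 * Pin the extent covered by the given extent buffer so its space is not
 * reused before the current transaction commits.
 */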
4547 int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans,
4548                               const struct extent_buffer *eb)
4549 {
4550         struct btrfs_block_group *cache;
4551         int ret = 0;
4552
4553         cache = btrfs_lookup_block_group(trans->fs_info, eb->start);
4554         if (!cache) {
4555                 btrfs_err(trans->fs_info, "unable to find block group for %llu",
4556                           eb->start);
4557                 return -ENOSPC;
4558         }
4559
4560         ret = pin_down_extent(trans, cache, eb->start, eb->len, 1);
4561         btrfs_put_block_group(cache);
4562         return ret;
4563 }
4564
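/*
 * Common tail of the reserved extent insertion paths: remove the range from
 * the free space tree and update the block group accounting.
 */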
4565 static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr,
4566                                  u64 num_bytes)
4567 {
4568         struct btrfs_fs_info *fs_info = trans->fs_info;
4569         int ret;
4570
4571         ret = remove_from_free_space_tree(trans, bytenr, num_bytes);
4572         if (ret)
4573                 return ret;
4574
4575         ret = btrfs_update_block_group(trans, bytenr, num_bytes, true);
4576         if (ret) {
4577                 ASSERT(!ret);
4578                 btrfs_err(fs_info, "update block group failed for %llu %llu",
4579                           bytenr, num_bytes);
4580                 return ret;
4581         }
4582
4583         trace_btrfs_reserved_extent_alloc(fs_info, bytenr, num_bytes);
4584         return 0;
4585 }
4586
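/*
 * Insert a data extent item with a single inline backref (a shared data ref
 * if @parent is set, otherwise a keyed extent data ref).
 */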
4587 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4588                                       u64 parent, u64 root_objectid,
4589                                       u64 flags, u64 owner, u64 offset,
4590                                       struct btrfs_key *ins, int ref_mod)
4591 {
4592         struct btrfs_fs_info *fs_info = trans->fs_info;
4593         struct btrfs_root *extent_root;
4594         int ret;
4595         struct btrfs_extent_item *extent_item;
4596         struct btrfs_extent_inline_ref *iref;
4597         struct btrfs_path *path;
4598         struct extent_buffer *leaf;
4599         int type;
4600         u32 size;
4601
4602         if (parent > 0)
4603                 type = BTRFS_SHARED_DATA_REF_KEY;
4604         else
4605                 type = BTRFS_EXTENT_DATA_REF_KEY;
4606
4607         size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
4608
4609         path = btrfs_alloc_path();
4610         if (!path)
4611                 return -ENOMEM;
4612
4613         extent_root = btrfs_extent_root(fs_info, ins->objectid);
4614         ret = btrfs_insert_empty_item(trans, extent_root, path, ins, size);
4615         if (ret) {
4616                 btrfs_free_path(path);
4617                 return ret;
4618         }
4619
4620         leaf = path->nodes[0];
4621         extent_item = btrfs_item_ptr(leaf, path->slots[0],
4622                                      struct btrfs_extent_item);
4623         btrfs_set_extent_refs(leaf, extent_item, ref_mod);
4624         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4625         btrfs_set_extent_flags(leaf, extent_item,
4626                                flags | BTRFS_EXTENT_FLAG_DATA);
4627
4628         iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
4629         btrfs_set_extent_inline_ref_type(leaf, iref, type);
4630         if (parent > 0) {
4631                 struct btrfs_shared_data_ref *ref;
4632                 ref = (struct btrfs_shared_data_ref *)(iref + 1);
4633                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
4634                 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
4635         } else {
4636                 struct btrfs_extent_data_ref *ref;
4637                 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
4638                 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
4639                 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
4640                 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
4641                 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
4642         }
4643
4644         btrfs_mark_buffer_dirty(path->nodes[0]);
4645         btrfs_free_path(path);
4646
4647         return alloc_reserved_extent(trans, ins->objectid, ins->offset);
4648 }
4649
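/*
 * Insert the extent item (or skinny metadata item) and the inline tree block
 * backref for a newly allocated tree block.
 */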
4650 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
4651                                      struct btrfs_delayed_ref_node *node,
4652                                      struct btrfs_delayed_extent_op *extent_op)
4653 {
4654         struct btrfs_fs_info *fs_info = trans->fs_info;
4655         struct btrfs_root *extent_root;
4656         int ret;
4657         struct btrfs_extent_item *extent_item;
4658         struct btrfs_key extent_key;
4659         struct btrfs_tree_block_info *block_info;
4660         struct btrfs_extent_inline_ref *iref;
4661         struct btrfs_path *path;
4662         struct extent_buffer *leaf;
4663         struct btrfs_delayed_tree_ref *ref;
4664         u32 size = sizeof(*extent_item) + sizeof(*iref);
4665         u64 flags = extent_op->flags_to_set;
4666         bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
4667
4668         ref = btrfs_delayed_node_to_tree_ref(node);
4669
4670         extent_key.objectid = node->bytenr;
4671         if (skinny_metadata) {
4672                 extent_key.offset = ref->level;
4673                 extent_key.type = BTRFS_METADATA_ITEM_KEY;
4674         } else {
4675                 extent_key.offset = node->num_bytes;
4676                 extent_key.type = BTRFS_EXTENT_ITEM_KEY;
4677                 size += sizeof(*block_info);
4678         }
4679
4680         path = btrfs_alloc_path();
4681         if (!path)
4682                 return -ENOMEM;
4683
4684         extent_root = btrfs_extent_root(fs_info, extent_key.objectid);
4685         ret = btrfs_insert_empty_item(trans, extent_root, path, &extent_key,
4686                                       size);
4687         if (ret) {
4688                 btrfs_free_path(path);
4689                 return ret;
4690         }
4691
4692         leaf = path->nodes[0];
4693         extent_item = btrfs_item_ptr(leaf, path->slots[0],
4694                                      struct btrfs_extent_item);
4695         btrfs_set_extent_refs(leaf, extent_item, 1);
4696         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4697         btrfs_set_extent_flags(leaf, extent_item,
4698                                flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
4699
4700         if (skinny_metadata) {
4701                 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
4702         } else {
4703                 block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
4704                 btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
4705                 btrfs_set_tree_block_level(leaf, block_info, ref->level);
4706                 iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
4707         }
4708
4709         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
4710                 btrfs_set_extent_inline_ref_type(leaf, iref,
4711                                                  BTRFS_SHARED_BLOCK_REF_KEY);
4712                 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
4713         } else {
4714                 btrfs_set_extent_inline_ref_type(leaf, iref,
4715                                                  BTRFS_TREE_BLOCK_REF_KEY);
4716                 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
4717         }
4718
4719         btrfs_mark_buffer_dirty(leaf);
4720         btrfs_free_path(path);
4721
4722         return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize);
4723 }
4724
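/*
 * Queue a delayed ref that adds the initial data backref for a newly
 * reserved file extent.
 */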
4725 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4726                                      struct btrfs_root *root, u64 owner,
4727                                      u64 offset, u64 ram_bytes,
4728                                      struct btrfs_key *ins)
4729 {
4730         struct btrfs_ref generic_ref = { 0 };
4731
4732         BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
4733
4734         btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
4735                                ins->objectid, ins->offset, 0);
4736         btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner,
4737                             offset, 0, false);
4738         btrfs_ref_tree_mod(root->fs_info, &generic_ref);
4739
4740         return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
4741 }
4742
4743 /*
4744  * this is used by the tree logging recovery code.  It records that
4745  * an extent has been allocated and makes sure to clear the free
4746  * space cache bits as well
4747  */
4748 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
4749                                    u64 root_objectid, u64 owner, u64 offset,
4750                                    struct btrfs_key *ins)
4751 {
4752         struct btrfs_fs_info *fs_info = trans->fs_info;
4753         int ret;
4754         struct btrfs_block_group *block_group;
4755         struct btrfs_space_info *space_info;
4756
4757         /*
4758          * Mixed block groups will exclude before processing the log so we only
4759          * need to do the exclude dance if this fs isn't mixed.
4760          */
4761         if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
4762                 ret = __exclude_logged_extent(fs_info, ins->objectid,
4763                                               ins->offset);
4764                 if (ret)
4765                         return ret;
4766         }
4767
4768         block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
4769         if (!block_group)
4770                 return -EINVAL;
4771
4772         space_info = block_group->space_info;
4773         spin_lock(&space_info->lock);
4774         spin_lock(&block_group->lock);
4775         space_info->bytes_reserved += ins->offset;
4776         block_group->reserved += ins->offset;
4777         spin_unlock(&block_group->lock);
4778         spin_unlock(&space_info->lock);
4779
4780         ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
4781                                          offset, ins, 1);
4782         if (ret)
4783                 btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);
4784         btrfs_put_block_group(block_group);
4785         return ret;
4786 }
4787
4788 #ifdef CONFIG_BTRFS_DEBUG
4789 /*
4790  * Extra safety check in case the extent tree is corrupted and extent allocator
4791  * chooses to use a tree block which is already used and locked.
4792  */
4793 static bool check_eb_lock_owner(const struct extent_buffer *eb)
4794 {
4795         if (eb->lock_owner == current->pid) {
4796                 btrfs_err_rl(eb->fs_info,
4797 "tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
4798                              eb->start, btrfs_header_owner(eb), current->pid);
4799                 return true;
4800         }
4801         return false;
4802 }
4803 #else
4804 static bool check_eb_lock_owner(struct extent_buffer *eb)
4805 {
4806         return false;
4807 }
4808 #endif
4809
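/*
 * Initialize a newly allocated tree block: look up or create the extent
 * buffer, set its lockdep class, lock it, clear stale state and fill in the
 * header fields for the new owner.
 */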
4810 static struct extent_buffer *
4811 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4812                       u64 bytenr, int level, u64 owner,
4813                       enum btrfs_lock_nesting nest)
4814 {
4815         struct btrfs_fs_info *fs_info = root->fs_info;
4816         struct extent_buffer *buf;
4817         u64 lockdep_owner = owner;
4818
4819         buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level);
4820         if (IS_ERR(buf))
4821                 return buf;
4822
4823         if (check_eb_lock_owner(buf)) {
4824                 free_extent_buffer(buf);
4825                 return ERR_PTR(-EUCLEAN);
4826         }
4827
4828         /*
4829          * The reloc trees are just snapshots, so we need them to appear to be
4830          * just like any other fs tree WRT lockdep.
4831          *
4832          * The exception however is in replace_path() in relocation, where we
4833          * hold the lock on the original fs root and then search for the reloc
4834          * root.  At that point we need to make sure any reloc root buffers are
4835          * set to the BTRFS_TREE_RELOC_OBJECTID lockdep class in order to make
4836          * lockdep happy.
4837          */
4838         if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID &&
4839             !test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
4840                 lockdep_owner = BTRFS_FS_TREE_OBJECTID;
4841
4842         /* btrfs_clear_buffer_dirty() accesses generation field. */
4843         btrfs_set_header_generation(buf, trans->transid);
4844
4845         /*
4846          * This needs to stay, because we could allocate a freed block from an
4847          * old tree into a new tree, so we need to make sure this new block is
4848          * set to the appropriate level and owner.
4849          */
4850         btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level);
4851
4852         __btrfs_tree_lock(buf, nest);
4853         btrfs_clear_buffer_dirty(trans, buf);
4854         clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
4855         clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags);
4856
4857         set_extent_buffer_uptodate(buf);
4858
4859         memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
4860         btrfs_set_header_level(buf, level);
4861         btrfs_set_header_bytenr(buf, buf->start);
4862         btrfs_set_header_generation(buf, trans->transid);
4863         btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
4864         btrfs_set_header_owner(buf, owner);
4865         write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
4866         write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
4867         if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
4868                 buf->log_index = root->log_transid % 2;
4869                 /*
4870                  * We allow two log transactions at a time, so use different
4871                  * EXTENT bits to differentiate their dirty pages.
4872                  */
4873                 if (buf->log_index == 0)
4874                         set_extent_bit(&root->dirty_log_pages, buf->start,
4875                                        buf->start + buf->len - 1,
4876                                        EXTENT_DIRTY, NULL);
4877                 else
4878                         set_extent_bit(&root->dirty_log_pages, buf->start,
4879                                        buf->start + buf->len - 1,
4880                                        EXTENT_NEW, NULL);
4881         } else {
4882                 buf->log_index = -1;
4883                 set_extent_bit(&trans->transaction->dirty_pages, buf->start,
4884                                buf->start + buf->len - 1, EXTENT_DIRTY, NULL);
4885         }
4886         /* this returns a buffer locked for blocking */
4887         return buf;
4888 }
4889
4890 /*
4891  * finds a free extent and does all the dirty work required for allocation.
4892  * Returns the tree buffer or an ERR_PTR on error.
4893  */
4894 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
4895                                              struct btrfs_root *root,
4896                                              u64 parent, u64 root_objectid,
4897                                              const struct btrfs_disk_key *key,
4898                                              int level, u64 hint,
4899                                              u64 empty_size,
4900                                              enum btrfs_lock_nesting nest)
4901 {
4902         struct btrfs_fs_info *fs_info = root->fs_info;
4903         struct btrfs_key ins;
4904         struct btrfs_block_rsv *block_rsv;
4905         struct extent_buffer *buf;
4906         struct btrfs_delayed_extent_op *extent_op;
4907         struct btrfs_ref generic_ref = { 0 };
4908         u64 flags = 0;
4909         int ret;
4910         u32 blocksize = fs_info->nodesize;
4911         bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
4912
4913 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4914         if (btrfs_is_testing(fs_info)) {
4915                 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
4916                                             level, root_objectid, nest);
4917                 if (!IS_ERR(buf))
4918                         root->alloc_bytenr += blocksize;
4919                 return buf;
4920         }
4921 #endif
4922
4923         block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
4924         if (IS_ERR(block_rsv))
4925                 return ERR_CAST(block_rsv);
4926
4927         ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
4928                                    empty_size, hint, &ins, 0, 0);
4929         if (ret)
4930                 goto out_unuse;
4931
4932         buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
4933                                     root_objectid, nest);
4934         if (IS_ERR(buf)) {
4935                 ret = PTR_ERR(buf);
4936                 goto out_free_reserved;
4937         }
4938
4939         if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
4940                 if (parent == 0)
4941                         parent = ins.objectid;
4942                 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
4943         } else
4944                 BUG_ON(parent > 0);
4945
4946         if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
4947                 extent_op = btrfs_alloc_delayed_extent_op();
4948                 if (!extent_op) {
4949                         ret = -ENOMEM;
4950                         goto out_free_buf;
4951                 }
4952                 if (key)
4953                         memcpy(&extent_op->key, key, sizeof(extent_op->key));
4954                 else
4955                         memset(&extent_op->key, 0, sizeof(extent_op->key));
4956                 extent_op->flags_to_set = flags;
4957                 extent_op->update_key = skinny_metadata ? false : true;
4958                 extent_op->update_flags = true;
4959                 extent_op->level = level;
4960
4961                 btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
4962                                        ins.objectid, ins.offset, parent);
4963                 btrfs_init_tree_ref(&generic_ref, level, root_objectid,
4964                                     root->root_key.objectid, false);
4965                 btrfs_ref_tree_mod(fs_info, &generic_ref);
4966                 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
4967                 if (ret)
4968                         goto out_free_delayed;
4969         }
4970         return buf;
4971
4972 out_free_delayed:
4973         btrfs_free_delayed_extent_op(extent_op);
4974 out_free_buf:
4975         btrfs_tree_unlock(buf);
4976         free_extent_buffer(buf);
4977 out_free_reserved:
4978         btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
4979 out_unuse:
4980         btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
4981         return ERR_PTR(ret);
4982 }
4983
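/*
 * State used while walking a subtree up and down to drop it: per-level
 * reference counts and flags, the current stage (DROP_REFERENCE or
 * UPDATE_BACKREF) and progress/readahead bookkeeping.
 */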
4984 struct walk_control {
4985         u64 refs[BTRFS_MAX_LEVEL];
4986         u64 flags[BTRFS_MAX_LEVEL];
4987         struct btrfs_key update_progress;
4988         struct btrfs_key drop_progress;
4989         int drop_level;
4990         int stage;
4991         int level;
4992         int shared_level;
4993         int update_ref;
4994         int keep_locks;
4995         int reada_slot;
4996         int reada_count;
4997         int restarted;
4998 };
4999
5000 #define DROP_REFERENCE  1
5001 #define UPDATE_BACKREF  2
5002
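/*
 * Read ahead the children of the node at the current walk level that are
 * likely to be visited next, scaling the readahead window based on how far
 * we got through the previous batch.
 */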
5003 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
5004                                      struct btrfs_root *root,
5005                                      struct walk_control *wc,
5006                                      struct btrfs_path *path)
5007 {
5008         struct btrfs_fs_info *fs_info = root->fs_info;
5009         u64 bytenr;
5010         u64 generation;
5011         u64 refs;
5012         u64 flags;
5013         u32 nritems;
5014         struct btrfs_key key;
5015         struct extent_buffer *eb;
5016         int ret;
5017         int slot;
5018         int nread = 0;
5019
5020         if (path->slots[wc->level] < wc->reada_slot) {
5021                 wc->reada_count = wc->reada_count * 2 / 3;
5022                 wc->reada_count = max(wc->reada_count, 2);
5023         } else {
5024                 wc->reada_count = wc->reada_count * 3 / 2;
5025                 wc->reada_count = min_t(int, wc->reada_count,
5026                                         BTRFS_NODEPTRS_PER_BLOCK(fs_info));
5027         }
5028
5029         eb = path->nodes[wc->level];
5030         nritems = btrfs_header_nritems(eb);
5031
5032         for (slot = path->slots[wc->level]; slot < nritems; slot++) {
5033                 if (nread >= wc->reada_count)
5034                         break;
5035
5036                 cond_resched();
5037                 bytenr = btrfs_node_blockptr(eb, slot);
5038                 generation = btrfs_node_ptr_generation(eb, slot);
5039
5040                 if (slot == path->slots[wc->level])
5041                         goto reada;
5042
5043                 if (wc->stage == UPDATE_BACKREF &&
5044                     generation <= root->root_key.offset)
5045                         continue;
5046
5047                 /* We don't lock the tree block, it's OK to be racy here */
5048                 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
5049                                                wc->level - 1, 1, &refs,
5050                                                &flags);
5051                 /* We don't care about errors in readahead. */
5052                 if (ret < 0)
5053                         continue;
5054                 BUG_ON(refs == 0);
5055
5056                 if (wc->stage == DROP_REFERENCE) {
5057                         if (refs == 1)
5058                                 goto reada;
5059
5060                         if (wc->level == 1 &&
5061                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5062                                 continue;
5063                         if (!wc->update_ref ||
5064                             generation <= root->root_key.offset)
5065                                 continue;
5066                         btrfs_node_key_to_cpu(eb, &key, slot);
5067                         ret = btrfs_comp_cpu_keys(&key,
5068                                                   &wc->update_progress);
5069                         if (ret < 0)
5070                                 continue;
5071                 } else {
5072                         if (wc->level == 1 &&
5073                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5074                                 continue;
5075                 }
5076 reada:
5077                 btrfs_readahead_node_child(eb, slot);
5078                 nread++;
5079         }
5080         wc->reada_slot = slot;
5081 }
5082
5083 /*
5084  * helper to process tree block while walking down the tree.
5085  *
5086  * when wc->stage == UPDATE_BACKREF, this function updates
5087  * back refs for pointers in the block.
5088  *
5089  * NOTE: return value 1 means we should stop walking down.
5090  */
5091 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
5092                                    struct btrfs_root *root,
5093                                    struct btrfs_path *path,
5094                                    struct walk_control *wc, int lookup_info)
5095 {
5096         struct btrfs_fs_info *fs_info = root->fs_info;
5097         int level = wc->level;
5098         struct extent_buffer *eb = path->nodes[level];
5099         u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
5100         int ret;
5101
5102         if (wc->stage == UPDATE_BACKREF &&
5103             btrfs_header_owner(eb) != root->root_key.objectid)
5104                 return 1;
5105
5106         /*
5107          * When the reference count of a tree block is 1, it won't increase
5108          * again. Once the full backref flag is set, we never clear it.
5109          */
5110         if (lookup_info &&
5111             ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
5112              (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
5113                 BUG_ON(!path->locks[level]);
5114                 ret = btrfs_lookup_extent_info(trans, fs_info,
5115                                                eb->start, level, 1,
5116                                                &wc->refs[level],
5117                                                &wc->flags[level]);
5118                 BUG_ON(ret == -ENOMEM);
5119                 if (ret)
5120                         return ret;
5121                 BUG_ON(wc->refs[level] == 0);
5122         }
5123
5124         if (wc->stage == DROP_REFERENCE) {
5125                 if (wc->refs[level] > 1)
5126                         return 1;
5127
5128                 if (path->locks[level] && !wc->keep_locks) {
5129                         btrfs_tree_unlock_rw(eb, path->locks[level]);
5130                         path->locks[level] = 0;
5131                 }
5132                 return 0;
5133         }
5134
5135         /* wc->stage == UPDATE_BACKREF */
5136         if (!(wc->flags[level] & flag)) {
5137                 BUG_ON(!path->locks[level]);
5138                 ret = btrfs_inc_ref(trans, root, eb, 1);
5139                 BUG_ON(ret); /* -ENOMEM */
5140                 ret = btrfs_dec_ref(trans, root, eb, 0);
5141                 BUG_ON(ret); /* -ENOMEM */
5142                 ret = btrfs_set_disk_extent_flags(trans, eb, flag);
5143                 BUG_ON(ret); /* -ENOMEM */
5144                 wc->flags[level] |= flag;
5145         }
5146
5147         /*
5148          * the block is shared by multiple trees, so it's not good to
5149          * keep the tree lock
5150          */
5151         if (path->locks[level] && level > 0) {
5152                 btrfs_tree_unlock_rw(eb, path->locks[level]);
5153                 path->locks[level] = 0;
5154         }
5155         return 0;
5156 }
5157
5158 /*
5159  * This is used to verify a ref exists for this root to deal with a bug where we
5160  * would have a drop_progress key that hadn't been updated properly.
5161  */
5162 static int check_ref_exists(struct btrfs_trans_handle *trans,
5163                             struct btrfs_root *root, u64 bytenr, u64 parent,
5164                             int level)
5165 {
5166         struct btrfs_path *path;
5167         struct btrfs_extent_inline_ref *iref;
5168         int ret;
5169
5170         path = btrfs_alloc_path();
5171         if (!path)
5172                 return -ENOMEM;
5173
5174         ret = lookup_extent_backref(trans, path, &iref, bytenr,
5175                                     root->fs_info->nodesize, parent,
5176                                     root->root_key.objectid, level, 0);
5177         btrfs_free_path(path);
5178         if (ret == -ENOENT)
5179                 return 0;
5180         if (ret < 0)
5181                 return ret;
5182         return 1;
5183 }
5184
5185 /*
5186  * helper to process tree block pointer.
5187  *
5188  * when wc->stage == DROP_REFERENCE, this function checks the
5189  * reference count of the block pointed to. If the block
5190  * is shared and we need to update back refs for the subtree
5191  * rooted at the block, this function changes wc->stage to
5192  * UPDATE_BACKREF. If the block is shared and there is no
5193  * need to update back refs, this function drops the reference
5194  * to the block.
5195  *
5196  * NOTE: return value 1 means we should stop walking down.
5197  */
5198 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
5199                                  struct btrfs_root *root,
5200                                  struct btrfs_path *path,
5201                                  struct walk_control *wc, int *lookup_info)
5202 {
5203         struct btrfs_fs_info *fs_info = root->fs_info;
5204         u64 bytenr;
5205         u64 generation;
5206         u64 parent;
5207         struct btrfs_tree_parent_check check = { 0 };
5208         struct btrfs_key key;
5209         struct btrfs_ref ref = { 0 };
5210         struct extent_buffer *next;
5211         int level = wc->level;
5212         int reada = 0;
5213         int ret = 0;
5214         bool need_account = false;
5215
5216         generation = btrfs_node_ptr_generation(path->nodes[level],
5217                                                path->slots[level]);
5218         /*
5219          * if the lower level block was created before the snapshot
5220          * was created, we know there is no need to update back refs
5221          * for the subtree
5222          */
5223         if (wc->stage == UPDATE_BACKREF &&
5224             generation <= root->root_key.offset) {
5225                 *lookup_info = 1;
5226                 return 1;
5227         }
5228
5229         bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
5230
5231         check.level = level - 1;
5232         check.transid = generation;
5233         check.owner_root = root->root_key.objectid;
5234         check.has_first_key = true;
5235         btrfs_node_key_to_cpu(path->nodes[level], &check.first_key,
5236                               path->slots[level]);
5237
5238         next = find_extent_buffer(fs_info, bytenr);
5239         if (!next) {
5240                 next = btrfs_find_create_tree_block(fs_info, bytenr,
5241                                 root->root_key.objectid, level - 1);
5242                 if (IS_ERR(next))
5243                         return PTR_ERR(next);
5244                 reada = 1;
5245         }
5246         btrfs_tree_lock(next);
5247
5248         ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
5249                                        &wc->refs[level - 1],
5250                                        &wc->flags[level - 1]);
5251         if (ret < 0)
5252                 goto out_unlock;
5253
5254         if (unlikely(wc->refs[level - 1] == 0)) {
5255                 btrfs_err(fs_info, "Missing references.");
5256                 ret = -EIO;
5257                 goto out_unlock;
5258         }
5259         *lookup_info = 0;
5260
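        /*
         * For a shared block (refs > 1) in the DROP_REFERENCE stage we either
         * skip it, dropping only our reference at the skip label below, or
         * switch to UPDATE_BACKREF if back refs for its subtree need updating.
         */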
5261         if (wc->stage == DROP_REFERENCE) {
5262                 if (wc->refs[level - 1] > 1) {
5263                         need_account = true;
5264                         if (level == 1 &&
5265                             (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5266                                 goto skip;
5267
5268                         if (!wc->update_ref ||
5269                             generation <= root->root_key.offset)
5270                                 goto skip;
5271
5272                         btrfs_node_key_to_cpu(path->nodes[level], &key,
5273                                               path->slots[level]);
5274                         ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
5275                         if (ret < 0)
5276                                 goto skip;
5277
5278                         wc->stage = UPDATE_BACKREF;
5279                         wc->shared_level = level - 1;
5280                 }
5281         } else {
5282                 if (level == 1 &&
5283                     (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5284                         goto skip;
5285         }
5286
5287         if (!btrfs_buffer_uptodate(next, generation, 0)) {
5288                 btrfs_tree_unlock(next);
5289                 free_extent_buffer(next);
5290                 next = NULL;
5291                 *lookup_info = 1;
5292         }
5293
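        /* Read the block from disk if it wasn't cached or wasn't uptodate. */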
5294         if (!next) {
5295                 if (reada && level == 1)
5296                         reada_walk_down(trans, root, wc, path);
5297                 next = read_tree_block(fs_info, bytenr, &check);
5298                 if (IS_ERR(next)) {
5299                         return PTR_ERR(next);
5300                 } else if (!extent_buffer_uptodate(next)) {
5301                         free_extent_buffer(next);
5302                         return -EIO;
5303                 }
5304                 btrfs_tree_lock(next);
5305         }
5306
5307         level--;
5308         ASSERT(level == btrfs_header_level(next));
5309         if (level != btrfs_header_level(next)) {
5310                 btrfs_err(root->fs_info, "mismatched level");
5311                 ret = -EIO;
5312                 goto out_unlock;
5313         }
5314         path->nodes[level] = next;
5315         path->slots[level] = 0;
5316         path->locks[level] = BTRFS_WRITE_LOCK;
5317         wc->level = level;
5318         if (wc->level == 1)
5319                 wc->reada_slot = 0;
5320         return 0;
5321 skip:
5322         wc->refs[level - 1] = 0;
5323         wc->flags[level - 1] = 0;
5324         if (wc->stage == DROP_REFERENCE) {
5325                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
5326                         parent = path->nodes[level]->start;
5327                 } else {
5328                         ASSERT(root->root_key.objectid ==
5329                                btrfs_header_owner(path->nodes[level]));
5330                         if (root->root_key.objectid !=
5331                             btrfs_header_owner(path->nodes[level])) {
5332                                 btrfs_err(root->fs_info,
5333                                                 "mismatched block owner");
5334                                 ret = -EIO;
5335                                 goto out_unlock;
5336                         }
5337                         parent = 0;
5338                 }
5339
5340                 /*
5341                  * If we had a drop_progress we need to verify the refs are set
5342                  * as expected.  If we find our ref then we know that from here
5343                  * on out everything should be correct, and we can clear the
5344                  * ->restarted flag.
5345                  */
5346                 if (wc->restarted) {
5347                         ret = check_ref_exists(trans, root, bytenr, parent,
5348                                                level - 1);
5349                         if (ret < 0)
5350                                 goto out_unlock;
5351                         if (ret == 0)
5352                                 goto no_delete;
5353                         ret = 0;
5354                         wc->restarted = 0;
5355                 }
5356
5357                 /*
5358                  * The reloc tree doesn't contribute to qgroup numbers, and we
5359                  * have already accounted them at merge time (replace_path),
5360                  * thus we can skip the expensive subtree trace here.
5361                  */
5362                 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
5363                     need_account) {
5364                         ret = btrfs_qgroup_trace_subtree(trans, next,
5365                                                          generation, level - 1);
5366                         if (ret) {
5367                                 btrfs_err_rl(fs_info,
5368                                              "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
5369                                              ret);
5370                         }
5371                 }
5372
5373                 /*
5374                  * We need to update the next key in our walk control so we can
5375                  * update the drop_progress key accordingly.  We don't care if
5376                  * find_next_key doesn't find a key because that means we're at
5377                  * the end and are going to clean up now.
5378                  */
5379                 wc->drop_level = level;
5380                 find_next_key(path, level, &wc->drop_progress);
5381
5382                 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
5383                                        fs_info->nodesize, parent);
5384                 btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid,
5385                                     0, false);
5386                 ret = btrfs_free_extent(trans, &ref);
5387                 if (ret)
5388                         goto out_unlock;
5389         }
5390 no_delete:
5391         *lookup_info = 1;
5392         ret = 1;
5393
5394 out_unlock:
5395         btrfs_tree_unlock(next);
5396         free_extent_buffer(next);
5397
5398         return ret;
5399 }
5400
5401 /*
5402  * helper to process tree block while walking up the tree.
5403  *
5404  * when wc->stage == DROP_REFERENCE, this function drops the
5405  * reference count on the block.
5406  *
5407  * when wc->stage == UPDATE_BACKREF, this function changes
5408  * wc->stage back to DROP_REFERENCE if we changed wc->stage
5409  * to UPDATE_BACKREF previously while processing the block.
5410  *
5411  * NOTE: return value 1 means we should stop walking up.
5412  */
5413 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5414                                  struct btrfs_root *root,
5415                                  struct btrfs_path *path,
5416                                  struct walk_control *wc)
5417 {
5418         struct btrfs_fs_info *fs_info = root->fs_info;
5419         int ret;
5420         int level = wc->level;
5421         struct extent_buffer *eb = path->nodes[level];
5422         u64 parent = 0;
5423
5424         if (wc->stage == UPDATE_BACKREF) {
5425                 BUG_ON(wc->shared_level < level);
5426                 if (level < wc->shared_level)
5427                         goto out;
5428
5429                 ret = find_next_key(path, level + 1, &wc->update_progress);
5430                 if (ret > 0)
5431                         wc->update_ref = 0;
5432
5433                 wc->stage = DROP_REFERENCE;
5434                 wc->shared_level = -1;
5435                 path->slots[level] = 0;
5436
5437                 /*
5438                  * check reference count again if the block isn't locked.
5439                  * we should start walking down the tree again if reference
5440                  * count is one.
5441                  */
5442                 if (!path->locks[level]) {
5443                         BUG_ON(level == 0);
5444                         btrfs_tree_lock(eb);
5445                         path->locks[level] = BTRFS_WRITE_LOCK;
5446
5447                         ret = btrfs_lookup_extent_info(trans, fs_info,
5448                                                        eb->start, level, 1,
5449                                                        &wc->refs[level],
5450                                                        &wc->flags[level]);
5451                         if (ret < 0) {
5452                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
5453                                 path->locks[level] = 0;
5454                                 return ret;
5455                         }
5456                         BUG_ON(wc->refs[level] == 0);
5457                         if (wc->refs[level] == 1) {
5458                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
5459                                 path->locks[level] = 0;
5460                                 return 1;
5461                         }
5462                 }
5463         }
5464
5465         /* wc->stage == DROP_REFERENCE */
5466         BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
5467
5468         if (wc->refs[level] == 1) {
5469                 if (level == 0) {
5470                         if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5471                                 ret = btrfs_dec_ref(trans, root, eb, 1);
5472                         else
5473                                 ret = btrfs_dec_ref(trans, root, eb, 0);
5474                         BUG_ON(ret); /* -ENOMEM */
5475                         if (is_fstree(root->root_key.objectid)) {
5476                                 ret = btrfs_qgroup_trace_leaf_items(trans, eb);
5477                                 if (ret) {
5478                                         btrfs_err_rl(fs_info,
5479         "error %d accounting leaf items, quota is out of sync, rescan required",
5480                                              ret);
5481                                 }
5482                         }
5483                 }
5484                 /* Make block locked assertion in btrfs_clear_buffer_dirty happy. */
5485                 if (!path->locks[level]) {
5486                         btrfs_tree_lock(eb);
5487                         path->locks[level] = BTRFS_WRITE_LOCK;
5488                 }
5489                 btrfs_clear_buffer_dirty(trans, eb);
5490         }
5491
5492         if (eb == root->node) {
5493                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5494                         parent = eb->start;
5495                 else if (root->root_key.objectid != btrfs_header_owner(eb))
5496                         goto owner_mismatch;
5497         } else {
5498                 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5499                         parent = path->nodes[level + 1]->start;
5500                 else if (root->root_key.objectid !=
5501                          btrfs_header_owner(path->nodes[level + 1]))
5502                         goto owner_mismatch;
5503         }
5504
5505         btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent,
5506                               wc->refs[level] == 1);
5507 out:
5508         wc->refs[level] = 0;
5509         wc->flags[level] = 0;
5510         return 0;
5511
5512 owner_mismatch:
5513         btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
5514                      btrfs_header_owner(eb), root->root_key.objectid);
5515         return -EUCLEAN;
5516 }
5517
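/*
 * Walk down the tree starting at wc->level, processing each block with
 * walk_down_proc() and descending into child blocks with do_walk_down()
 * until we reach a leaf or a block we should not descend into.
 */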
5518 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
5519                                    struct btrfs_root *root,
5520                                    struct btrfs_path *path,
5521                                    struct walk_control *wc)
5522 {
5523         int level = wc->level;
5524         int lookup_info = 1;
5525         int ret = 0;
5526
5527         while (level >= 0) {
5528                 ret = walk_down_proc(trans, root, path, wc, lookup_info);
5529                 if (ret)
5530                         break;
5531
5532                 if (level == 0)
5533                         break;
5534
5535                 if (path->slots[level] >=
5536                     btrfs_header_nritems(path->nodes[level]))
5537                         break;
5538
5539                 ret = do_walk_down(trans, root, path, wc, &lookup_info);
5540                 if (ret > 0) {
5541                         path->slots[level]++;
5542                         continue;
5543                 } else if (ret < 0)
5544                         break;
5545                 level = wc->level;
5546         }
5547         return (ret == 1) ? 0 : ret;
5548 }
5549
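/*
 * Walk back up the tree from wc->level towards max_level, freeing fully
 * processed blocks with walk_up_proc().  Returns 0 when there is another
 * slot to walk down into, a negative errno on error, and 1 once the walk
 * has finished at max_level.
 */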
5550 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
5551                                  struct btrfs_root *root,
5552                                  struct btrfs_path *path,
5553                                  struct walk_control *wc, int max_level)
5554 {
5555         int level = wc->level;
5556         int ret;
5557
5558         path->slots[level] = btrfs_header_nritems(path->nodes[level]);
5559         while (level < max_level && path->nodes[level]) {
5560                 wc->level = level;
5561                 if (path->slots[level] + 1 <
5562                     btrfs_header_nritems(path->nodes[level])) {
5563                         path->slots[level]++;
5564                         return 0;
5565                 } else {
5566                         ret = walk_up_proc(trans, root, path, wc);
5567                         if (ret > 0)
5568                                 return 0;
5569                         if (ret < 0)
5570                                 return ret;
5571
5572                         if (path->locks[level]) {
5573                                 btrfs_tree_unlock_rw(path->nodes[level],
5574                                                      path->locks[level]);
5575                                 path->locks[level] = 0;
5576                         }
5577                         free_extent_buffer(path->nodes[level]);
5578                         path->nodes[level] = NULL;
5579                         level++;
5580                 }
5581         }
5582         return 1;
5583 }
5584
5585 /*
5586  * drop a subvolume tree.
5587  *
5588  * this function traverses the tree freeing any blocks that are only
5589  * referenced by the tree.
5590  *
5591  * when a shared tree block is found, this function decreases its
5592  * reference count by one. if update_ref is true, this function
5593  * also makes sure backrefs for the shared block and all lower level
5594  * blocks are properly updated.
5595  *
5596  * If called with for_reloc == 0, may exit early with -EAGAIN
5597  */
5598 int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
5599 {
5600         const bool is_reloc_root = (root->root_key.objectid ==
5601                                     BTRFS_TREE_RELOC_OBJECTID);
5602         struct btrfs_fs_info *fs_info = root->fs_info;
5603         struct btrfs_path *path;
5604         struct btrfs_trans_handle *trans;
5605         struct btrfs_root *tree_root = fs_info->tree_root;
5606         struct btrfs_root_item *root_item = &root->root_item;
5607         struct walk_control *wc;
5608         struct btrfs_key key;
5609         int err = 0;
5610         int ret;
5611         int level;
5612         bool root_dropped = false;
5613         bool unfinished_drop = false;
5614
5615         btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
5616
5617         path = btrfs_alloc_path();
5618         if (!path) {
5619                 err = -ENOMEM;
5620                 goto out;
5621         }
5622
5623         wc = kzalloc(sizeof(*wc), GFP_NOFS);
5624         if (!wc) {
5625                 btrfs_free_path(path);
5626                 err = -ENOMEM;
5627                 goto out;
5628         }
5629
5630         /*
5631          * Use join to avoid potential EINTR from transaction start. See
5632          * wait_reserve_ticket and the whole reservation callchain.
5633          */
5634         if (for_reloc)
5635                 trans = btrfs_join_transaction(tree_root);
5636         else
5637                 trans = btrfs_start_transaction(tree_root, 0);
5638         if (IS_ERR(trans)) {
5639                 err = PTR_ERR(trans);
5640                 goto out_free;
5641         }
5642
5643         err = btrfs_run_delayed_items(trans);
5644         if (err)
5645                 goto out_end_trans;
5646
5647         /*
5648          * This will help us catch people modifying the fs tree while we're
5649          * dropping it.  It is unsafe to mess with the fs tree while it's being
5650          * dropped because we unlock the root node and parent nodes as we walk down
5651          * the tree, assuming nothing will change.  If something does change
5652          * then we'll have stale information and drop references to blocks we've
5653          * already dropped.
5654          */
5655         set_bit(BTRFS_ROOT_DELETING, &root->state);
5656         unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
5657
5658         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
5659                 level = btrfs_header_level(root->node);
5660                 path->nodes[level] = btrfs_lock_root_node(root);
5661                 path->slots[level] = 0;
5662                 path->locks[level] = BTRFS_WRITE_LOCK;
5663                 memset(&wc->update_progress, 0,
5664                        sizeof(wc->update_progress));
5665         } else {
5666                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
5667                 memcpy(&wc->update_progress, &key,
5668                        sizeof(wc->update_progress));
5669
5670                 level = btrfs_root_drop_level(root_item);
5671                 BUG_ON(level == 0);
5672                 path->lowest_level = level;
5673                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5674                 path->lowest_level = 0;
5675                 if (ret < 0) {
5676                         err = ret;
5677                         goto out_end_trans;
5678                 }
5679                 WARN_ON(ret > 0);
5680
5681                 /*
5682                  * unlock our path; this is safe because only this
5683                  * function is allowed to delete this snapshot
5684                  */
5685                 btrfs_unlock_up_safe(path, 0);
5686
5687                 level = btrfs_header_level(root->node);
5688                 while (1) {
5689                         btrfs_tree_lock(path->nodes[level]);
5690                         path->locks[level] = BTRFS_WRITE_LOCK;
5691
5692                         ret = btrfs_lookup_extent_info(trans, fs_info,
5693                                                 path->nodes[level]->start,
5694                                                 level, 1, &wc->refs[level],
5695                                                 &wc->flags[level]);
5696                         if (ret < 0) {
5697                                 err = ret;
5698                                 goto out_end_trans;
5699                         }
5700                         BUG_ON(wc->refs[level] == 0);
5701
5702                         if (level == btrfs_root_drop_level(root_item))
5703                                 break;
5704
5705                         btrfs_tree_unlock(path->nodes[level]);
5706                         path->locks[level] = 0;
5707                         WARN_ON(wc->refs[level] != 1);
5708                         level--;
5709                 }
5710         }
5711
5712         wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
5713         wc->level = level;
5714         wc->shared_level = -1;
5715         wc->stage = DROP_REFERENCE;
5716         wc->update_ref = update_ref;
5717         wc->keep_locks = 0;
5718         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
5719
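        /*
         * Alternate walking down and up the tree.  After each pass record
         * drop_progress and the drop level in the root item so an interrupted
         * drop can be resumed later.
         */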
5720         while (1) {
5721
5722                 ret = walk_down_tree(trans, root, path, wc);
5723                 if (ret < 0) {
5724                         btrfs_abort_transaction(trans, ret);
5725                         err = ret;
5726                         break;
5727                 }
5728
5729                 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
5730                 if (ret < 0) {
5731                         btrfs_abort_transaction(trans, ret);
5732                         err = ret;
5733                         break;
5734                 }
5735
5736                 if (ret > 0) {
5737                         BUG_ON(wc->stage != DROP_REFERENCE);
5738                         break;
5739                 }
5740
5741                 if (wc->stage == DROP_REFERENCE) {
5742                         wc->drop_level = wc->level;
5743                         btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
5744                                               &wc->drop_progress,
5745                                               path->slots[wc->drop_level]);
5746                 }
5747                 btrfs_cpu_key_to_disk(&root_item->drop_progress,
5748                                       &wc->drop_progress);
5749                 btrfs_set_root_drop_level(root_item, wc->drop_level);
5750
5751                 BUG_ON(wc->level == 0);
5752                 if (btrfs_should_end_transaction(trans) ||
5753                     (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
5754                         ret = btrfs_update_root(trans, tree_root,
5755                                                 &root->root_key,
5756                                                 root_item);
5757                         if (ret) {
5758                                 btrfs_abort_transaction(trans, ret);
5759                                 err = ret;
5760                                 goto out_end_trans;
5761                         }
5762
5763                         if (!is_reloc_root)
5764                                 btrfs_set_last_root_drop_gen(fs_info, trans->transid);
5765
5766                         btrfs_end_transaction_throttle(trans);
5767                         if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
5768                                 btrfs_debug(fs_info,
5769                                             "drop snapshot early exit");
5770                                 err = -EAGAIN;
5771                                 goto out_free;
5772                         }
5773
5774                         /*
5775                          * Use join to avoid potential EINTR from transaction
5776                          * start. See wait_reserve_ticket and the whole
5777                          * reservation callchain.
5778                          */
5779                         if (for_reloc)
5780                                 trans = btrfs_join_transaction(tree_root);
5781                         else
5782                                 trans = btrfs_start_transaction(tree_root, 0);
5783                         if (IS_ERR(trans)) {
5784                                 err = PTR_ERR(trans);
5785                                 goto out_free;
5786                         }
5787                 }
5788         }
5789         btrfs_release_path(path);
5790         if (err)
5791                 goto out_end_trans;
5792
5793         ret = btrfs_del_root(trans, &root->root_key);
5794         if (ret) {
5795                 btrfs_abort_transaction(trans, ret);
5796                 err = ret;
5797                 goto out_end_trans;
5798         }
5799
5800         if (!is_reloc_root) {
5801                 ret = btrfs_find_root(tree_root, &root->root_key, path,
5802                                       NULL, NULL);
5803                 if (ret < 0) {
5804                         btrfs_abort_transaction(trans, ret);
5805                         err = ret;
5806                         goto out_end_trans;
5807                 } else if (ret > 0) {
5808                         /* if we fail to delete the orphan item this time
5809                          * around, it'll get picked up the next time.
5810                          *
5811                          * The most common failure here is just -ENOENT.
5812                          */
5813                         btrfs_del_orphan_item(trans, tree_root,
5814                                               root->root_key.objectid);
5815                 }
5816         }
5817
5818         /*
5819          * This subvolume is going to be completely dropped, and won't be
5820          * recorded as a dirty root, thus its pertrans meta rsv will not be
5821          * freed at commit transaction time.  So free it here manually.
5822          */
5823         btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
5824         btrfs_qgroup_free_meta_all_pertrans(root);
5825
5826         if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
5827                 btrfs_add_dropped_root(trans, root);
5828         else
5829                 btrfs_put_root(root);
5830         root_dropped = true;
5831 out_end_trans:
5832         if (!is_reloc_root)
5833                 btrfs_set_last_root_drop_gen(fs_info, trans->transid);
5834
5835         btrfs_end_transaction_throttle(trans);
5836 out_free:
5837         kfree(wc);
5838         btrfs_free_path(path);
5839 out:
5840         /*
5841          * We were an unfinished drop root; check to see if there are any
5842          * still pending, and if not, clear the state and wake up any waiters.
5843          */
5844         if (!err && unfinished_drop)
5845                 btrfs_maybe_wake_unfinished_drop(fs_info);
5846
5847         /*
5848          * So if we need to stop dropping the snapshot for whatever reason we
5849          * need to make sure to add it back to the dead root list so that we
5850          * keep trying to do the work later.  This also cleans up the root if we
5851          * don't have it in the radix tree (like when we recover after a power
5852          * fail or unmount) so we don't leak memory.
5853          */
5854         if (!for_reloc && !root_dropped)
5855                 btrfs_add_dead_root(root);
5856         return err;
5857 }
5858
5859 /*
5860  * drop subtree rooted at tree block 'node'.
5861  *
5862  * NOTE: this function will unlock and release tree block 'node'
5863  * NOTE: this function will unlock and release tree block 'node'.
5864  * it is only used by relocation code.
5865 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
5866                         struct btrfs_root *root,
5867                         struct extent_buffer *node,
5868                         struct extent_buffer *parent)
5869 {
5870         struct btrfs_fs_info *fs_info = root->fs_info;
5871         struct btrfs_path *path;
5872         struct walk_control *wc;
5873         int level;
5874         int parent_level;
5875         int ret = 0;
5876         int wret;
5877
5878         BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
5879
5880         path = btrfs_alloc_path();
5881         if (!path)
5882                 return -ENOMEM;
5883
5884         wc = kzalloc(sizeof(*wc), GFP_NOFS);
5885         if (!wc) {
5886                 btrfs_free_path(path);
5887                 return -ENOMEM;
5888         }
5889
5890         btrfs_assert_tree_write_locked(parent);
5891         parent_level = btrfs_header_level(parent);
5892         atomic_inc(&parent->refs);
5893         path->nodes[parent_level] = parent;
5894         path->slots[parent_level] = btrfs_header_nritems(parent);
5895
5896         btrfs_assert_tree_write_locked(node);
5897         level = btrfs_header_level(node);
5898         path->nodes[level] = node;
5899         path->slots[level] = 0;
5900         path->locks[level] = BTRFS_WRITE_LOCK;
5901
5902         wc->refs[parent_level] = 1;
5903         wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
5904         wc->level = level;
5905         wc->shared_level = -1;
5906         wc->stage = DROP_REFERENCE;
5907         wc->update_ref = 0;
5908         wc->keep_locks = 1;
5909         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
5910
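        /*
         * Same walk down/up loop as btrfs_drop_snapshot(), but bounded at
         * parent_level.
         */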
5911         while (1) {
5912                 wret = walk_down_tree(trans, root, path, wc);
5913                 if (wret < 0) {
5914                         ret = wret;
5915                         break;
5916                 }
5917
5918                 wret = walk_up_tree(trans, root, path, wc, parent_level);
5919                 if (wret < 0)
5920                         ret = wret;
5921                 if (wret != 0)
5922                         break;
5923         }
5924
5925         kfree(wc);
5926         btrfs_free_path(path);
5927         return ret;
5928 }
5929
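/*
 * Unpin an extent range during error/abort cleanup.  The final false argument
 * tells unpin_extent_range() not to return the space to the free space cache.
 */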
5930 int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
5931                                    u64 start, u64 end)
5932 {
5933         return unpin_extent_range(fs_info, start, end, false);
5934 }
5935
5936 /*
5937  * It used to be that old block groups would be left around forever.
5938  * Iterating over them would be enough to trim unused space.  Since we
5939  * now automatically remove them, we also need to iterate over unallocated
5940  * space.
5941  *
5942  * We don't want a transaction for this since the discard may take a
5943  * substantial amount of time.  We don't require that a transaction be
5944  * running, but we do need to take a running transaction into account
5945  * to ensure that we're not discarding chunks that were released or
5946  * allocated in the current transaction.
5947  *
5948  * Holding the chunks lock will prevent other threads from allocating
5949  * or releasing chunks, but it won't prevent a running transaction
5950  * from committing and releasing the memory that the pending chunks
5951  * list head uses.  For that, we need to take a reference to the
5952  * transaction and hold the commit root sem.  We only need to hold
5953  * it while performing the free space search since we have already
5954  * held back allocations.
5955  */
5956 static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
5957 {
5958         u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0;
5959         int ret;
5960
5961         *trimmed = 0;
5962
5963         /* Discard not supported = nothing to do. */
5964         if (!bdev_max_discard_sectors(device->bdev))
5965                 return 0;
5966
5967         /* Not writable = nothing to do. */
5968         if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
5969                 return 0;
5970
5971         /* No free space = nothing to do. */
5972         if (device->total_bytes <= device->bytes_used)
5973                 return 0;
5974
5975         ret = 0;
5976
5977         while (1) {
5978                 struct btrfs_fs_info *fs_info = device->fs_info;
5979                 u64 bytes;
5980
5981                 ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
5982                 if (ret)
5983                         break;
5984
5985                 find_first_clear_extent_bit(&device->alloc_state, start,
5986                                             &start, &end,
5987                                             CHUNK_TRIMMED | CHUNK_ALLOCATED);
5988
5989                 /* Check if there are any CHUNK_* bits left */
5990                 if (start > device->total_bytes) {
5991                         WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
5992                         btrfs_warn_in_rcu(fs_info,
5993 "ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
5994                                           start, end - start + 1,
5995                                           btrfs_dev_name(device),
5996                                           device->total_bytes);
5997                         mutex_unlock(&fs_info->chunk_mutex);
5998                         ret = 0;
5999                         break;
6000                 }
6001
6002                 /* Ensure we skip the reserved space on each device. */
6003                 start = max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED);
6004
6005                 /*
6006                  * If find_first_clear_extent_bit finds a range that spans the
6007                  * end of the device it will set end to -1. In this case it's up
6008                  * to the caller to trim the value to the size of the device.
6009                  */
6010                 end = min(end, device->total_bytes - 1);
6011
6012                 len = end - start + 1;
6013
6014                 /* We didn't find any extents */
6015                 if (!len) {
6016                         mutex_unlock(&fs_info->chunk_mutex);
6017                         ret = 0;
6018                         break;
6019                 }
6020
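                /*
                 * Discard the free range and, on success, mark it
                 * CHUNK_TRIMMED so later passes don't re-discard it.
                 */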
6021                 ret = btrfs_issue_discard(device->bdev, start, len,
6022                                           &bytes);
6023                 if (!ret)
6024                         set_extent_bit(&device->alloc_state, start,
6025                                        start + bytes - 1, CHUNK_TRIMMED, NULL);
6026                 mutex_unlock(&fs_info->chunk_mutex);
6027
6028                 if (ret)
6029                         break;
6030
6031                 start += len;
6032                 *trimmed += bytes;
6033
6034                 if (fatal_signal_pending(current)) {
6035                         ret = -ERESTARTSYS;
6036                         break;
6037                 }
6038
6039                 cond_resched();
6040         }
6041
6042         return ret;
6043 }
6044
6045 /*
6046  * Trim the whole filesystem by:
6047  * 1) trimming the free space in each block group
6048  * 2) trimming the unallocated space on each device
6049  *
6050  * This will also continue trimming even if a block group or device encounters
6051  * an error.  The return value will be the last error, or 0 if nothing bad
6052  * happens.
6053  */
6054 int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
6055 {
6056         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6057         struct btrfs_block_group *cache = NULL;
6058         struct btrfs_device *device;
6059         u64 group_trimmed;
6060         u64 range_end = U64_MAX;
6061         u64 start;
6062         u64 end;
6063         u64 trimmed = 0;
6064         u64 bg_failed = 0;
6065         u64 dev_failed = 0;
6066         int bg_ret = 0;
6067         int dev_ret = 0;
6068         int ret = 0;
6069
6070         if (range->start == U64_MAX)
6071                 return -EINVAL;
6072
6073         /*
6074          * Check range overflow if range->len is set.
6075          * The default range->len is U64_MAX.
6076          */
6077         if (range->len != U64_MAX &&
6078             check_add_overflow(range->start, range->len, &range_end))
6079                 return -EINVAL;
6080
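        /* First pass: trim free space inside each block group in the range. */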
6081         cache = btrfs_lookup_first_block_group(fs_info, range->start);
6082         for (; cache; cache = btrfs_next_block_group(cache)) {
6083                 if (cache->start >= range_end) {
6084                         btrfs_put_block_group(cache);
6085                         break;
6086                 }
6087
6088                 start = max(range->start, cache->start);
6089                 end = min(range_end, cache->start + cache->length);
6090
6091                 if (end - start >= range->minlen) {
6092                         if (!btrfs_block_group_done(cache)) {
6093                                 ret = btrfs_cache_block_group(cache, true);
6094                                 if (ret) {
6095                                         bg_failed++;
6096                                         bg_ret = ret;
6097                                         continue;
6098                                 }
6099                         }
6100                         ret = btrfs_trim_block_group(cache,
6101                                                      &group_trimmed,
6102                                                      start,
6103                                                      end,
6104                                                      range->minlen);
6105
6106                         trimmed += group_trimmed;
6107                         if (ret) {
6108                                 bg_failed++;
6109                                 bg_ret = ret;
6110                                 continue;
6111                         }
6112                 }
6113         }
6114
6115         if (bg_failed)
6116                 btrfs_warn(fs_info,
6117                         "failed to trim %llu block group(s), last error %d",
6118                         bg_failed, bg_ret);
6119
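        /* Second pass: trim unallocated space on each non-missing device. */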
6120         mutex_lock(&fs_devices->device_list_mutex);
6121         list_for_each_entry(device, &fs_devices->devices, dev_list) {
6122                 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
6123                         continue;
6124
6125                 ret = btrfs_trim_free_extents(device, &group_trimmed);
6126                 if (ret) {
6127                         dev_failed++;
6128                         dev_ret = ret;
6129                         break;
6130                 }
6131
6132                 trimmed += group_trimmed;
6133         }
6134         mutex_unlock(&fs_devices->device_list_mutex);
6135
6136         if (dev_failed)
6137                 btrfs_warn(fs_info,
6138                         "failed to trim %llu device(s), last error %d",
6139                         dev_failed, dev_ret);
6140         range->len = trimmed;
6141         if (bg_ret)
6142                 return bg_ret;
6143         return dev_ret;
6144 }