// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/percpu_counter.h>
#include <linux/lockdep.h>
#include <linux/crc32c.h>
#include "misc.h"
#include "tree-log.h"
#include "disk-io.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "sysfs.h"
#include "qgroup.h"
#include "ref-verify.h"
#include "space-info.h"
#include "block-rsv.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "discard.h"
#include "rcu-string.h"

#undef SCRAMBLE_DELAYED_REFS

static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                               struct btrfs_delayed_ref_node *node, u64 parent,
                               u64 root_objectid, u64 owner_objectid,
                               u64 owner_offset, int refs_to_drop,
                               struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
                                    struct extent_buffer *leaf,
                                    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
                                      u64 parent, u64 root_objectid,
                                      u64 flags, u64 owner, u64 offset,
                                      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
                                     struct btrfs_delayed_ref_node *node,
                                     struct btrfs_delayed_extent_op *extent_op);
static int find_next_key(struct btrfs_path *path, int level,
                         struct btrfs_key *key);

static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
{
        return (cache->flags & bits) == bits;
}

int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
                              u64 start, u64 num_bytes)
{
        u64 end = start + num_bytes - 1;
        set_extent_bits(&fs_info->excluded_extents, start, end,
                        EXTENT_UPTODATE);
        return 0;
}

void btrfs_free_excluded_extents(struct btrfs_block_group *cache)
{
        struct btrfs_fs_info *fs_info = cache->fs_info;
        u64 start, end;

        start = cache->start;
        end = start + cache->length - 1;

        clear_extent_bits(&fs_info->excluded_extents, start, end,
                          EXTENT_UPTODATE);
}

static u64 generic_ref_to_space_flags(struct btrfs_ref *ref)
{
        if (ref->type == BTRFS_REF_METADATA) {
                if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID)
                        return BTRFS_BLOCK_GROUP_SYSTEM;
                else
                        return BTRFS_BLOCK_GROUP_METADATA;
        }
        return BTRFS_BLOCK_GROUP_DATA;
}

static void add_pinned_bytes(struct btrfs_fs_info *fs_info,
                             struct btrfs_ref *ref)
{
        struct btrfs_space_info *space_info;
        u64 flags = generic_ref_to_space_flags(ref);

        space_info = btrfs_find_space_info(fs_info, flags);
        ASSERT(space_info);
        percpu_counter_add_batch(&space_info->total_bytes_pinned, ref->len,
                    BTRFS_TOTAL_BYTES_PINNED_BATCH);
}

static void sub_pinned_bytes(struct btrfs_fs_info *fs_info,
                             struct btrfs_ref *ref)
{
        struct btrfs_space_info *space_info;
        u64 flags = generic_ref_to_space_flags(ref);

        space_info = btrfs_find_space_info(fs_info, flags);
        ASSERT(space_info);
        percpu_counter_add_batch(&space_info->total_bytes_pinned, -ref->len,
                    BTRFS_TOTAL_BYTES_PINNED_BATCH);
}
/*
 * Simple helper to search for an existing data extent at a given offset.
 * Returns 0 if found, > 0 if not found and < 0 on error, matching the
 * btrfs_search_slot() convention.
 */
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
{
        int ret;
        struct btrfs_key key;
        struct btrfs_path *path;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = start;
        key.offset = len;
        key.type = BTRFS_EXTENT_ITEM_KEY;
        ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
        btrfs_free_path(path);
        return ret;
}

/*
 * Helper function to look up the reference count and flags of a tree block.
 *
 * The head node of a delayed ref is used to store the sum of all the
 * reference count modifications queued up in the rbtree.  The head node may
 * also store the extent flags to set.  This way you can check what the
 * reference count and extent flags would be once all of the delayed refs
 * have been processed.  (A usage sketch follows the function below.)
 */
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
                             struct btrfs_fs_info *fs_info, u64 bytenr,
                             u64 offset, int metadata, u64 *refs, u64 *flags)
{
        struct btrfs_delayed_ref_head *head;
        struct btrfs_delayed_ref_root *delayed_refs;
        struct btrfs_path *path;
        struct btrfs_extent_item *ei;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        u32 item_size;
        u64 num_refs;
        u64 extent_flags;
        int ret;

        /*
         * If we don't have skinny metadata, don't bother doing anything
         * different
         */
        if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
                offset = fs_info->nodesize;
                metadata = 0;
        }

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        if (!trans) {
                path->skip_locking = 1;
                path->search_commit_root = 1;
        }

search_again:
        key.objectid = bytenr;
        key.offset = offset;
        if (metadata)
                key.type = BTRFS_METADATA_ITEM_KEY;
        else
                key.type = BTRFS_EXTENT_ITEM_KEY;

        ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
        if (ret < 0)
                goto out_free;

        if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
                if (path->slots[0]) {
                        path->slots[0]--;
                        btrfs_item_key_to_cpu(path->nodes[0], &key,
                                              path->slots[0]);
                        if (key.objectid == bytenr &&
                            key.type == BTRFS_EXTENT_ITEM_KEY &&
                            key.offset == fs_info->nodesize)
                                ret = 0;
                }
        }

        if (ret == 0) {
                leaf = path->nodes[0];
                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
                if (item_size >= sizeof(*ei)) {
                        ei = btrfs_item_ptr(leaf, path->slots[0],
                                            struct btrfs_extent_item);
                        num_refs = btrfs_extent_refs(leaf, ei);
                        extent_flags = btrfs_extent_flags(leaf, ei);
                } else {
                        ret = -EINVAL;
                        btrfs_print_v0_err(fs_info);
                        if (trans)
                                btrfs_abort_transaction(trans, ret);
                        else
                                btrfs_handle_fs_error(fs_info, ret, NULL);

                        goto out_free;
                }

                BUG_ON(num_refs == 0);
        } else {
                num_refs = 0;
                extent_flags = 0;
                ret = 0;
        }

        if (!trans)
                goto out;

        delayed_refs = &trans->transaction->delayed_refs;
        spin_lock(&delayed_refs->lock);
        head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
        if (head) {
                if (!mutex_trylock(&head->mutex)) {
                        refcount_inc(&head->refs);
                        spin_unlock(&delayed_refs->lock);

                        btrfs_release_path(path);

                        /*
                         * Mutex was contended, block until it's released and try
                         * again
                         */
                        mutex_lock(&head->mutex);
                        mutex_unlock(&head->mutex);
                        btrfs_put_delayed_ref_head(head);
                        goto search_again;
                }
                spin_lock(&head->lock);
                if (head->extent_op && head->extent_op->update_flags)
                        extent_flags |= head->extent_op->flags_to_set;
                else
                        BUG_ON(num_refs == 0);

                num_refs += head->ref_mod;
                spin_unlock(&head->lock);
                mutex_unlock(&head->mutex);
        }
        spin_unlock(&delayed_refs->lock);
out:
        WARN_ON(num_refs == 0);
        if (refs)
                *refs = num_refs;
        if (flags)
                *flags = extent_flags;
out_free:
        btrfs_free_path(path);
        return ret;
}
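
/*
 * Usage sketch (hypothetical caller, not part of this file's logic): check
 * whether a tree block is shared, taking pending delayed ref modifications
 * into account when a transaction handle is supplied.
 */
static int __maybe_unused example_tree_block_is_shared(
                struct btrfs_trans_handle *trans,
                struct btrfs_fs_info *fs_info, u64 bytenr, int level)
{
        u64 refs = 0;
        u64 flags = 0;
        int ret;

        /*
         * metadata == 1 selects tree blocks; for skinny metadata the offset
         * argument carries the block's level.
         */
        ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level, 1,
                                       &refs, &flags);
        if (ret)
                return ret;
        /* more than one reference means the block is shared with another tree */
        return refs > 1;
}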

/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs.  Implicit back refs are optimized
 * for pointers in non-shared tree blocks.  For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key.  This information allows us to find the block by
 * b-tree searching.  Full back refs are for pointers in tree blocks not
 * referenced by their owner trees; the location of the tree block is
 * recorded in the back ref.  Full back refs are actually generic and could
 * be used in all the cases where implicit back refs are used.  Their major
 * shortcoming is the overhead: every time a tree block gets COWed, we have
 * to update the back ref entries for all pointers in it.
 *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it.  This means most tree related operations only involve
 * implicit back refs.  For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it.  So we can detect the
 * event that a tree block loses its owner tree's reference and do the
 * back refs conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * 1) The reference count of the block is one and the tree is the block's
 *    owner tree.  Nothing to do in this case.
 *
 * 2) The reference count of the block is one and the tree is not the
 *    block's owner tree.  In this case, full back refs are used for the
 *    pointers in the block.  Remove these full back refs and add implicit
 *    back refs for every pointer in the new block.
 *
 * 3) The reference count of the block is greater than one and the tree is
 *    the block's owner tree.  In this case, implicit back refs are used
 *    for the pointers in the block.  Add full back refs for every pointer
 *    in the block and increase the lower level extents' reference counts.
 *    The original implicit back refs are carried over to the new block.
 *
 * 4) The reference count of the block is greater than one and the tree is
 *    not the block's owner tree.  Add implicit back refs for every pointer
 *    in the new block and increase the lower level extents' reference
 *    counts.
 *
 * Back reference key composition:
 *
 * The key objectid corresponds to the first byte in the extent, and the
 * key type is used to differentiate between types of back refs.  The
 * meaning of the key offset depends on the type of back ref.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is a hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf.
 *
 * When a file extent is allocated, the implicit back refs are used and
 * the fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed during file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * consist of a key only.  The key offset for the implicit back refs is
 * the objectid of the block's owner tree.  The key offset for the full
 * back refs is the first byte of the parent block.
 *
 * When implicit back refs are used, information about the lowest key and
 * level of the tree block is required.  This information is stored in
 * the tree block info structure.  (A worked example of the key layouts
 * follows this comment.)
 */
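
/*
 * Worked example (hypothetical values, for illustration only).  Suppose a
 * data extent starting at logical byte 13631488 is referenced by inode 266
 * of the subvolume whose root objectid is 257, at file offset 0:
 *
 *   implicit back ref item key:
 *       (13631488, BTRFS_EXTENT_DATA_REF_KEY, hash(257, 266, 0))
 *
 *   full (shared) back ref item key, where 30408704 would be the start of
 *   the leaf holding the file extent item:
 *       (13631488, BTRFS_SHARED_DATA_REF_KEY, 30408704)
 *
 * For a tree block at logical byte 30408704 owned by tree 257, the
 * corresponding keys would be:
 *
 *   implicit: (30408704, BTRFS_TREE_BLOCK_REF_KEY, 257)
 *   full:     (30408704, BTRFS_SHARED_BLOCK_REF_KEY, <parent bytenr>)
 */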

/*
 * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
 * is_data == BTRFS_REF_TYPE_DATA, data type is required,
 * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
 */
int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
                                     struct btrfs_extent_inline_ref *iref,
                                     enum btrfs_inline_ref_type is_data)
{
        int type = btrfs_extent_inline_ref_type(eb, iref);
        u64 offset = btrfs_extent_inline_ref_offset(eb, iref);

        if (type == BTRFS_TREE_BLOCK_REF_KEY ||
            type == BTRFS_SHARED_BLOCK_REF_KEY ||
            type == BTRFS_SHARED_DATA_REF_KEY ||
            type == BTRFS_EXTENT_DATA_REF_KEY) {
                if (is_data == BTRFS_REF_TYPE_BLOCK) {
                        if (type == BTRFS_TREE_BLOCK_REF_KEY)
                                return type;
                        if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
                                ASSERT(eb->fs_info);
                                /*
                                 * Every shared ref has a parent tree block,
                                 * which must be aligned to the sector size.
                                 */
                                if (offset &&
                                    IS_ALIGNED(offset, eb->fs_info->sectorsize))
                                        return type;
                        }
                } else if (is_data == BTRFS_REF_TYPE_DATA) {
                        if (type == BTRFS_EXTENT_DATA_REF_KEY)
                                return type;
                        if (type == BTRFS_SHARED_DATA_REF_KEY) {
                                ASSERT(eb->fs_info);
                                /*
                                 * Every shared ref has a parent tree block,
                                 * which must be aligned to the sector size.
                                 */
                                if (offset &&
                                    IS_ALIGNED(offset, eb->fs_info->sectorsize))
                                        return type;
                        }
                } else {
                        ASSERT(is_data == BTRFS_REF_TYPE_ANY);
                        return type;
                }
        }

        btrfs_print_leaf((struct extent_buffer *)eb);
        btrfs_err(eb->fs_info,
                  "eb %llu iref 0x%lx invalid extent inline ref type %d",
                  eb->start, (unsigned long)iref, type);
        WARN_ON(1);

        return BTRFS_REF_TYPE_INVALID;
}
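
/*
 * Usage sketch (hypothetical, for illustration): validate the type of an
 * inline ref before trusting its payload.  @eb and @iref would come from
 * walking the refs of an extent item, as lookup_inline_extent_backref()
 * does below.
 */
static int __maybe_unused example_inline_ref_is_data(
                const struct extent_buffer *eb,
                struct btrfs_extent_inline_ref *iref)
{
        int type = btrfs_get_extent_inline_ref_type(eb, iref,
                                                    BTRFS_REF_TYPE_DATA);

        if (type == BTRFS_REF_TYPE_INVALID)
                return -EUCLEAN;
        /* either BTRFS_EXTENT_DATA_REF_KEY or BTRFS_SHARED_DATA_REF_KEY */
        return type == BTRFS_EXTENT_DATA_REF_KEY;
}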

u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
{
        u32 high_crc = ~(u32)0;
        u32 low_crc = ~(u32)0;
        __le64 lenum;

        lenum = cpu_to_le64(root_objectid);
        high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
        lenum = cpu_to_le64(owner);
        low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
        lenum = cpu_to_le64(offset);
        low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));

        return ((u64)high_crc << 31) ^ (u64)low_crc;
}
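
/*
 * Sketch (hypothetical helper, unused below): compose the item key for an
 * implicit data back ref.  The hash above packs (root, owner, offset) into
 * the 64 bit key offset; note the high crc is shifted by 31, not 32, a
 * quirk presumably kept for on-disk format compatibility.
 */
static void __maybe_unused example_data_ref_key(struct btrfs_key *key,
                                                u64 extent_start,
                                                u64 root_objectid,
                                                u64 owner, u64 offset)
{
        key->objectid = extent_start;
        key->type = BTRFS_EXTENT_DATA_REF_KEY;
        key->offset = hash_extent_data_ref(root_objectid, owner, offset);
}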

static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
                                     struct btrfs_extent_data_ref *ref)
{
        return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
                                    btrfs_extent_data_ref_objectid(leaf, ref),
                                    btrfs_extent_data_ref_offset(leaf, ref));
}

static int match_extent_data_ref(struct extent_buffer *leaf,
                                 struct btrfs_extent_data_ref *ref,
                                 u64 root_objectid, u64 owner, u64 offset)
{
        if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
            btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
            btrfs_extent_data_ref_offset(leaf, ref) != offset)
                return 0;
        return 1;
}

static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
                                           struct btrfs_path *path,
                                           u64 bytenr, u64 parent,
                                           u64 root_objectid,
                                           u64 owner, u64 offset)
{
        struct btrfs_root *root = trans->fs_info->extent_root;
        struct btrfs_key key;
        struct btrfs_extent_data_ref *ref;
        struct extent_buffer *leaf;
        u32 nritems;
        int ret;
        int recow;
        int err = -ENOENT;

        key.objectid = bytenr;
        if (parent) {
                key.type = BTRFS_SHARED_DATA_REF_KEY;
                key.offset = parent;
        } else {
                key.type = BTRFS_EXTENT_DATA_REF_KEY;
                key.offset = hash_extent_data_ref(root_objectid,
                                                  owner, offset);
        }
again:
        recow = 0;
        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret < 0) {
                err = ret;
                goto fail;
        }

        if (parent) {
                if (!ret)
                        return 0;
                goto fail;
        }

        leaf = path->nodes[0];
        nritems = btrfs_header_nritems(leaf);
        while (1) {
                if (path->slots[0] >= nritems) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret < 0)
                                err = ret;
                        if (ret)
                                goto fail;

                        leaf = path->nodes[0];
                        nritems = btrfs_header_nritems(leaf);
                        recow = 1;
                }

                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                if (key.objectid != bytenr ||
                    key.type != BTRFS_EXTENT_DATA_REF_KEY)
                        goto fail;

                ref = btrfs_item_ptr(leaf, path->slots[0],
                                     struct btrfs_extent_data_ref);

                if (match_extent_data_ref(leaf, ref, root_objectid,
                                          owner, offset)) {
                        if (recow) {
                                btrfs_release_path(path);
                                goto again;
                        }
                        err = 0;
                        break;
                }
                path->slots[0]++;
        }
fail:
        return err;
}

static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
                                           struct btrfs_path *path,
                                           u64 bytenr, u64 parent,
                                           u64 root_objectid, u64 owner,
                                           u64 offset, int refs_to_add)
{
        struct btrfs_root *root = trans->fs_info->extent_root;
        struct btrfs_key key;
        struct extent_buffer *leaf;
        u32 size;
        u32 num_refs;
        int ret;

        key.objectid = bytenr;
        if (parent) {
                key.type = BTRFS_SHARED_DATA_REF_KEY;
                key.offset = parent;
                size = sizeof(struct btrfs_shared_data_ref);
        } else {
                key.type = BTRFS_EXTENT_DATA_REF_KEY;
                key.offset = hash_extent_data_ref(root_objectid,
                                                  owner, offset);
                size = sizeof(struct btrfs_extent_data_ref);
        }

        ret = btrfs_insert_empty_item(trans, root, path, &key, size);
        if (ret && ret != -EEXIST)
                goto fail;

        leaf = path->nodes[0];
        if (parent) {
                struct btrfs_shared_data_ref *ref;
                ref = btrfs_item_ptr(leaf, path->slots[0],
                                     struct btrfs_shared_data_ref);
                if (ret == 0) {
                        btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
                } else {
                        num_refs = btrfs_shared_data_ref_count(leaf, ref);
                        num_refs += refs_to_add;
                        btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
                }
        } else {
                struct btrfs_extent_data_ref *ref;
                while (ret == -EEXIST) {
                        ref = btrfs_item_ptr(leaf, path->slots[0],
                                             struct btrfs_extent_data_ref);
                        if (match_extent_data_ref(leaf, ref, root_objectid,
                                                  owner, offset))
                                break;
                        btrfs_release_path(path);
                        key.offset++;
                        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                                      size);
                        if (ret && ret != -EEXIST)
                                goto fail;

                        leaf = path->nodes[0];
                }
                ref = btrfs_item_ptr(leaf, path->slots[0],
                                     struct btrfs_extent_data_ref);
                if (ret == 0) {
                        btrfs_set_extent_data_ref_root(leaf, ref,
                                                       root_objectid);
                        btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
                        btrfs_set_extent_data_ref_offset(leaf, ref, offset);
                        btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
                } else {
                        num_refs = btrfs_extent_data_ref_count(leaf, ref);
                        num_refs += refs_to_add;
                        btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
                }
        }
        btrfs_mark_buffer_dirty(leaf);
        ret = 0;
fail:
        btrfs_release_path(path);
        return ret;
}

static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
                                           struct btrfs_path *path,
                                           int refs_to_drop, int *last_ref)
{
        struct btrfs_key key;
        struct btrfs_extent_data_ref *ref1 = NULL;
        struct btrfs_shared_data_ref *ref2 = NULL;
        struct extent_buffer *leaf;
        u32 num_refs = 0;
        int ret = 0;

        leaf = path->nodes[0];
        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

        if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
                ref1 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_extent_data_ref);
                num_refs = btrfs_extent_data_ref_count(leaf, ref1);
        } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
                ref2 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_shared_data_ref);
                num_refs = btrfs_shared_data_ref_count(leaf, ref2);
        } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
                btrfs_print_v0_err(trans->fs_info);
                btrfs_abort_transaction(trans, -EINVAL);
                return -EINVAL;
        } else {
                BUG();
        }

        BUG_ON(num_refs < refs_to_drop);
        num_refs -= refs_to_drop;

        if (num_refs == 0) {
                ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
                *last_ref = 1;
        } else {
                if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
                        btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
                else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
                        btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
                btrfs_mark_buffer_dirty(leaf);
        }
        return ret;
}

static noinline u32 extent_data_ref_count(struct btrfs_path *path,
                                          struct btrfs_extent_inline_ref *iref)
{
        struct btrfs_key key;
        struct extent_buffer *leaf;
        struct btrfs_extent_data_ref *ref1;
        struct btrfs_shared_data_ref *ref2;
        u32 num_refs = 0;
        int type;

        leaf = path->nodes[0];
        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

        BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
        if (iref) {
                /*
                 * If type is invalid, we should have bailed out earlier than
                 * this call.
                 */
                type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
                ASSERT(type != BTRFS_REF_TYPE_INVALID);
                if (type == BTRFS_EXTENT_DATA_REF_KEY) {
                        ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
                        num_refs = btrfs_extent_data_ref_count(leaf, ref1);
                } else {
                        ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
                        num_refs = btrfs_shared_data_ref_count(leaf, ref2);
                }
        } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
                ref1 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_extent_data_ref);
                num_refs = btrfs_extent_data_ref_count(leaf, ref1);
        } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
                ref2 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_shared_data_ref);
                num_refs = btrfs_shared_data_ref_count(leaf, ref2);
        } else {
                WARN_ON(1);
        }
        return num_refs;
}

static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
                                          struct btrfs_path *path,
                                          u64 bytenr, u64 parent,
                                          u64 root_objectid)
{
        struct btrfs_root *root = trans->fs_info->extent_root;
        struct btrfs_key key;
        int ret;

        key.objectid = bytenr;
        if (parent) {
                key.type = BTRFS_SHARED_BLOCK_REF_KEY;
                key.offset = parent;
        } else {
                key.type = BTRFS_TREE_BLOCK_REF_KEY;
                key.offset = root_objectid;
        }

        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret > 0)
                ret = -ENOENT;
        return ret;
}

static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
                                          struct btrfs_path *path,
                                          u64 bytenr, u64 parent,
                                          u64 root_objectid)
{
        struct btrfs_key key;
        int ret;

        key.objectid = bytenr;
        if (parent) {
                key.type = BTRFS_SHARED_BLOCK_REF_KEY;
                key.offset = parent;
        } else {
                key.type = BTRFS_TREE_BLOCK_REF_KEY;
                key.offset = root_objectid;
        }

        ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root,
                                      path, &key, 0);
        btrfs_release_path(path);
        return ret;
}

static inline int extent_ref_type(u64 parent, u64 owner)
{
        int type;
        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
                if (parent > 0)
                        type = BTRFS_SHARED_BLOCK_REF_KEY;
                else
                        type = BTRFS_TREE_BLOCK_REF_KEY;
        } else {
                if (parent > 0)
                        type = BTRFS_SHARED_DATA_REF_KEY;
                else
                        type = BTRFS_EXTENT_DATA_REF_KEY;
        }
        return type;
}
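
/*
 * Summary of extent_ref_type() for reference (illustrative, mirrors the
 * logic above):
 *
 *   owner < BTRFS_FIRST_FREE_OBJECTID (tree block), parent set
 *       -> BTRFS_SHARED_BLOCK_REF_KEY (full back ref)
 *   owner < BTRFS_FIRST_FREE_OBJECTID (tree block), no parent
 *       -> BTRFS_TREE_BLOCK_REF_KEY (implicit back ref)
 *   owner >= BTRFS_FIRST_FREE_OBJECTID (data), parent set
 *       -> BTRFS_SHARED_DATA_REF_KEY (full back ref)
 *   owner >= BTRFS_FIRST_FREE_OBJECTID (data), no parent
 *       -> BTRFS_EXTENT_DATA_REF_KEY (implicit back ref)
 */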

static int find_next_key(struct btrfs_path *path, int level,
                         struct btrfs_key *key)
{
        for (; level < BTRFS_MAX_LEVEL; level++) {
                if (!path->nodes[level])
                        break;
                if (path->slots[level] + 1 >=
                    btrfs_header_nritems(path->nodes[level]))
                        continue;
                if (level == 0)
                        btrfs_item_key_to_cpu(path->nodes[level], key,
                                              path->slots[level] + 1);
                else
                        btrfs_node_key_to_cpu(path->nodes[level], key,
                                              path->slots[level] + 1);
                return 0;
        }
        return 1;
}

/*
 * Look for an inline back ref.  If the back ref is found, *ref_ret is set
 * to the address of the inline back ref, and 0 is returned.
 *
 * If the back ref isn't found, *ref_ret is set to the address where it
 * should be inserted, and -ENOENT is returned.
 *
 * If insert is true and there are too many inline back refs, the path
 * points to the extent item, and -EAGAIN is returned.
 *
 * NOTE: inline back refs are ordered in the same way that back ref
 *       items in the tree are ordered.
 */
static noinline_for_stack
int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
                                 struct btrfs_path *path,
                                 struct btrfs_extent_inline_ref **ref_ret,
                                 u64 bytenr, u64 num_bytes,
                                 u64 parent, u64 root_objectid,
                                 u64 owner, u64 offset, int insert)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *root = fs_info->extent_root;
        struct btrfs_key key;
        struct extent_buffer *leaf;
        struct btrfs_extent_item *ei;
        struct btrfs_extent_inline_ref *iref;
        u64 flags;
        u64 item_size;
        unsigned long ptr;
        unsigned long end;
        int extra_size;
        int type;
        int want;
        int ret;
        int err = 0;
        bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
        int needed;

        key.objectid = bytenr;
        key.type = BTRFS_EXTENT_ITEM_KEY;
        key.offset = num_bytes;

        want = extent_ref_type(parent, owner);
        if (insert) {
                extra_size = btrfs_extent_inline_ref_size(want);
                path->keep_locks = 1;
        } else
                extra_size = -1;

        /*
         * Owner is our level, so we can just add one to get the level for the
         * block we are interested in.
         */
        if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
                key.type = BTRFS_METADATA_ITEM_KEY;
                key.offset = owner;
        }

again:
        ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
        if (ret < 0) {
                err = ret;
                goto out;
        }

        /*
         * We may be a newly converted file system which still has the old fat
         * extent entries for metadata, so try and see if we have one of those.
         */
        if (ret > 0 && skinny_metadata) {
                skinny_metadata = false;
                if (path->slots[0]) {
                        path->slots[0]--;
                        btrfs_item_key_to_cpu(path->nodes[0], &key,
                                              path->slots[0]);
                        if (key.objectid == bytenr &&
                            key.type == BTRFS_EXTENT_ITEM_KEY &&
                            key.offset == num_bytes)
                                ret = 0;
                }
                if (ret) {
                        key.objectid = bytenr;
                        key.type = BTRFS_EXTENT_ITEM_KEY;
                        key.offset = num_bytes;
                        btrfs_release_path(path);
                        goto again;
                }
        }

        if (ret && !insert) {
                err = -ENOENT;
                goto out;
        } else if (WARN_ON(ret)) {
                err = -EIO;
                goto out;
        }

        leaf = path->nodes[0];
        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
        if (unlikely(item_size < sizeof(*ei))) {
                err = -EINVAL;
                btrfs_print_v0_err(fs_info);
                btrfs_abort_transaction(trans, err);
                goto out;
        }

        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
        flags = btrfs_extent_flags(leaf, ei);

        ptr = (unsigned long)(ei + 1);
        end = (unsigned long)ei + item_size;

        if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
                ptr += sizeof(struct btrfs_tree_block_info);
                BUG_ON(ptr > end);
        }

        if (owner >= BTRFS_FIRST_FREE_OBJECTID)
                needed = BTRFS_REF_TYPE_DATA;
        else
                needed = BTRFS_REF_TYPE_BLOCK;

        err = -ENOENT;
        while (1) {
                if (ptr >= end) {
                        WARN_ON(ptr > end);
                        break;
                }
                iref = (struct btrfs_extent_inline_ref *)ptr;
                type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
                if (type == BTRFS_REF_TYPE_INVALID) {
                        err = -EUCLEAN;
                        goto out;
                }

                if (want < type)
                        break;
                if (want > type) {
                        ptr += btrfs_extent_inline_ref_size(type);
                        continue;
                }

                if (type == BTRFS_EXTENT_DATA_REF_KEY) {
                        struct btrfs_extent_data_ref *dref;
                        dref = (struct btrfs_extent_data_ref *)(&iref->offset);
                        if (match_extent_data_ref(leaf, dref, root_objectid,
                                                  owner, offset)) {
                                err = 0;
                                break;
                        }
                        if (hash_extent_data_ref_item(leaf, dref) <
                            hash_extent_data_ref(root_objectid, owner, offset))
                                break;
                } else {
                        u64 ref_offset;
                        ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
                        if (parent > 0) {
                                if (parent == ref_offset) {
                                        err = 0;
                                        break;
                                }
                                if (ref_offset < parent)
                                        break;
                        } else {
                                if (root_objectid == ref_offset) {
                                        err = 0;
                                        break;
                                }
                                if (ref_offset < root_objectid)
                                        break;
                        }
                }
                ptr += btrfs_extent_inline_ref_size(type);
        }
        if (err == -ENOENT && insert) {
                if (item_size + extra_size >=
                    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
                        err = -EAGAIN;
                        goto out;
                }
                /*
                 * To add a new inline back ref, we have to make sure there is
                 * no corresponding back ref item.  For simplicity, we just do
                 * not add a new inline back ref if there is any kind of item
                 * for this block.
                 */
                if (find_next_key(path, 0, &key) == 0 &&
                    key.objectid == bytenr &&
                    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
                        err = -EAGAIN;
                        goto out;
                }
        }
        *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
out:
        if (insert) {
                path->keep_locks = 0;
                btrfs_unlock_up_safe(path, 1);
        }
        return err;
}

/*
 * helper to add new inline back ref
 */
static noinline_for_stack
void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
                                 struct btrfs_path *path,
                                 struct btrfs_extent_inline_ref *iref,
                                 u64 parent, u64 root_objectid,
                                 u64 owner, u64 offset, int refs_to_add,
                                 struct btrfs_delayed_extent_op *extent_op)
{
        struct extent_buffer *leaf;
        struct btrfs_extent_item *ei;
        unsigned long ptr;
        unsigned long end;
        unsigned long item_offset;
        u64 refs;
        int size;
        int type;

        leaf = path->nodes[0];
        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
        item_offset = (unsigned long)iref - (unsigned long)ei;

        type = extent_ref_type(parent, owner);
        size = btrfs_extent_inline_ref_size(type);

        btrfs_extend_item(path, size);

        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
        refs = btrfs_extent_refs(leaf, ei);
        refs += refs_to_add;
        btrfs_set_extent_refs(leaf, ei, refs);
        if (extent_op)
                __run_delayed_extent_op(extent_op, leaf, ei);

        ptr = (unsigned long)ei + item_offset;
        end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
        if (ptr < end - size)
                memmove_extent_buffer(leaf, ptr + size, ptr,
                                      end - size - ptr);

        iref = (struct btrfs_extent_inline_ref *)ptr;
        btrfs_set_extent_inline_ref_type(leaf, iref, type);
        if (type == BTRFS_EXTENT_DATA_REF_KEY) {
                struct btrfs_extent_data_ref *dref;
                dref = (struct btrfs_extent_data_ref *)(&iref->offset);
                btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
                btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
                btrfs_set_extent_data_ref_offset(leaf, dref, offset);
                btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
        } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
                struct btrfs_shared_data_ref *sref;
                sref = (struct btrfs_shared_data_ref *)(iref + 1);
                btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
        } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
        } else {
                btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
        }
        btrfs_mark_buffer_dirty(leaf);
}

static int lookup_extent_backref(struct btrfs_trans_handle *trans,
                                 struct btrfs_path *path,
                                 struct btrfs_extent_inline_ref **ref_ret,
                                 u64 bytenr, u64 num_bytes, u64 parent,
                                 u64 root_objectid, u64 owner, u64 offset)
{
        int ret;

        ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
                                           num_bytes, parent, root_objectid,
                                           owner, offset, 0);
        if (ret != -ENOENT)
                return ret;

        btrfs_release_path(path);
        *ref_ret = NULL;

        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
                ret = lookup_tree_block_ref(trans, path, bytenr, parent,
                                            root_objectid);
        } else {
                ret = lookup_extent_data_ref(trans, path, bytenr, parent,
                                             root_objectid, owner, offset);
        }
        return ret;
}

/*
 * helper to update/remove inline back ref
 */
static noinline_for_stack
void update_inline_extent_backref(struct btrfs_path *path,
                                  struct btrfs_extent_inline_ref *iref,
                                  int refs_to_mod,
                                  struct btrfs_delayed_extent_op *extent_op,
                                  int *last_ref)
{
        struct extent_buffer *leaf = path->nodes[0];
        struct btrfs_extent_item *ei;
        struct btrfs_extent_data_ref *dref = NULL;
        struct btrfs_shared_data_ref *sref = NULL;
        unsigned long ptr;
        unsigned long end;
        u32 item_size;
        int size;
        int type;
        u64 refs;

        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
        refs = btrfs_extent_refs(leaf, ei);
        WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
        refs += refs_to_mod;
        btrfs_set_extent_refs(leaf, ei, refs);
        if (extent_op)
                __run_delayed_extent_op(extent_op, leaf, ei);

        /*
         * If type is invalid, we should have bailed out after
         * lookup_inline_extent_backref().
         */
        type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
        ASSERT(type != BTRFS_REF_TYPE_INVALID);

        if (type == BTRFS_EXTENT_DATA_REF_KEY) {
                dref = (struct btrfs_extent_data_ref *)(&iref->offset);
                refs = btrfs_extent_data_ref_count(leaf, dref);
        } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
                sref = (struct btrfs_shared_data_ref *)(iref + 1);
                refs = btrfs_shared_data_ref_count(leaf, sref);
        } else {
                refs = 1;
                BUG_ON(refs_to_mod != -1);
        }

        BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
        refs += refs_to_mod;

        if (refs > 0) {
                if (type == BTRFS_EXTENT_DATA_REF_KEY)
                        btrfs_set_extent_data_ref_count(leaf, dref, refs);
                else
                        btrfs_set_shared_data_ref_count(leaf, sref, refs);
        } else {
                *last_ref = 1;
                size = btrfs_extent_inline_ref_size(type);
                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
                ptr = (unsigned long)iref;
                end = (unsigned long)ei + item_size;
                if (ptr + size < end)
                        memmove_extent_buffer(leaf, ptr, ptr + size,
                                              end - ptr - size);
                item_size -= size;
                btrfs_truncate_item(path, item_size, 1);
        }
        btrfs_mark_buffer_dirty(leaf);
}

static noinline_for_stack
int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
                                 struct btrfs_path *path,
                                 u64 bytenr, u64 num_bytes, u64 parent,
                                 u64 root_objectid, u64 owner,
                                 u64 offset, int refs_to_add,
                                 struct btrfs_delayed_extent_op *extent_op)
{
        struct btrfs_extent_inline_ref *iref;
        int ret;

        ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
                                           num_bytes, parent, root_objectid,
                                           owner, offset, 1);
        if (ret == 0) {
                /*
                 * We're adding refs to a tree block we already own; this
                 * should not happen at all.
                 */
                if (owner < BTRFS_FIRST_FREE_OBJECTID) {
                        btrfs_crit(trans->fs_info,
"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu",
                                   bytenr, num_bytes, root_objectid);
                        if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
                                WARN_ON(1);
                                btrfs_crit(trans->fs_info,
                        "path->slots[0]=%d path->nodes[0]:", path->slots[0]);
                                btrfs_print_leaf(path->nodes[0]);
                        }
                        return -EUCLEAN;
                }
                update_inline_extent_backref(path, iref, refs_to_add,
                                             extent_op, NULL);
        } else if (ret == -ENOENT) {
                setup_inline_extent_backref(trans->fs_info, path, iref, parent,
                                            root_objectid, owner, offset,
                                            refs_to_add, extent_op);
                ret = 0;
        }
        return ret;
}

static int remove_extent_backref(struct btrfs_trans_handle *trans,
                                 struct btrfs_path *path,
                                 struct btrfs_extent_inline_ref *iref,
                                 int refs_to_drop, int is_data, int *last_ref)
{
        int ret = 0;

        BUG_ON(!is_data && refs_to_drop != 1);
        if (iref) {
                update_inline_extent_backref(path, iref, -refs_to_drop, NULL,
                                             last_ref);
        } else if (is_data) {
                ret = remove_extent_data_ref(trans, path, refs_to_drop,
                                             last_ref);
        } else {
                *last_ref = 1;
                ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
        }
        return ret;
}

static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
                               u64 *discarded_bytes)
{
        int j, ret = 0;
        u64 bytes_left, end;
        u64 aligned_start = ALIGN(start, 1 << 9);

        if (WARN_ON(start != aligned_start)) {
                len -= aligned_start - start;
                len = round_down(len, 1 << 9);
                start = aligned_start;
        }

        *discarded_bytes = 0;

        if (!len)
                return 0;

        end = start + len;
        bytes_left = len;

        /* Skip any superblocks on this device. */
        for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
                u64 sb_start = btrfs_sb_offset(j);
                u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
                u64 size = sb_start - start;

                if (!in_range(sb_start, start, bytes_left) &&
                    !in_range(sb_end, start, bytes_left) &&
                    !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
                        continue;

                /*
                 * Superblock spans beginning of range.  Adjust start and
                 * try again.
                 */
                if (sb_start <= start) {
                        start += sb_end - start;
                        if (start > end) {
                                bytes_left = 0;
                                break;
                        }
                        bytes_left = end - start;
                        continue;
                }

                if (size) {
                        ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
                                                   GFP_NOFS, 0);
                        if (!ret)
                                *discarded_bytes += size;
                        else if (ret != -EOPNOTSUPP)
                                return ret;
                }

                start = sb_end;
                if (start > end) {
                        bytes_left = 0;
                        break;
                }
                bytes_left = end - start;
        }

        if (bytes_left) {
                ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
                                           GFP_NOFS, 0);
                if (!ret)
                        *discarded_bytes += bytes_left;
        }
        return ret;
}
1299
1300 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
1301                          u64 num_bytes, u64 *actual_bytes)
1302 {
1303         int ret = 0;
1304         u64 discarded_bytes = 0;
1305         u64 end = bytenr + num_bytes;
1306         u64 cur = bytenr;
1307         struct btrfs_bio *bbio = NULL;
1308
1309
1310         /*
1311          * Avoid races with device replace and make sure our bbio has devices
1312          * associated to its stripes that don't go away while we are discarding.
1313          */
1314         btrfs_bio_counter_inc_blocked(fs_info);
1315         while (cur < end) {
1316                 struct btrfs_bio_stripe *stripe;
1317                 int i;
1318
1319                 num_bytes = end - cur;
1320                 /* Tell the block device(s) that the sectors can be discarded */
1321                 ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur,
1322                                       &num_bytes, &bbio, 0);
1323                 /*
1324                  * Error can be -ENOMEM, -ENOENT (no such chunk mapping) or
1325                  * -EOPNOTSUPP. For any such error, @num_bytes is not updated,
1326                  * thus we can't continue anyway.
1327                  */
1328                 if (ret < 0)
1329                         goto out;
1330
1331                 stripe = bbio->stripes;
1332                 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1333                         u64 bytes;
1334                         struct request_queue *req_q;
1335
1336                         if (!stripe->dev->bdev) {
1337                                 ASSERT(btrfs_test_opt(fs_info, DEGRADED));
1338                                 continue;
1339                         }
1340                         req_q = bdev_get_queue(stripe->dev->bdev);
1341                         if (!blk_queue_discard(req_q))
1342                                 continue;
1343
1344                         ret = btrfs_issue_discard(stripe->dev->bdev,
1345                                                   stripe->physical,
1346                                                   stripe->length,
1347                                                   &bytes);
1348                         if (!ret) {
1349                                 discarded_bytes += bytes;
1350                         } else if (ret != -EOPNOTSUPP) {
1351                                 /*
1352                                  * Logic errors, -ENOMEM or -EIO, all of them
1353                                  * unlikely to happen.
1354                                  *
1355                                  * Since there are two nested loops, explicitly
1356                                  * go to out to avoid confusion.
1357                                  */
1358                                 btrfs_put_bbio(bbio);
1359                                 goto out;
1360                         }
1361
1362                         /*
1363                          * In case we got back -EOPNOTSUPP for some reason,
1364                          * clear the return value so we don't report a bogus
1365                          * error to the callers of discard_extent.
1366                          */
1367                         ret = 0;
1368                 }
1369                 btrfs_put_bbio(bbio);
1370                 cur += num_bytes;
1371         }
1372 out:
1373         btrfs_bio_counter_dec(fs_info);
1374
1375         if (actual_bytes)
1376                 *actual_bytes = discarded_bytes;
1377
1379         if (ret == -EOPNOTSUPP)
1380                 ret = 0;
1381         return ret;
1382 }
1383
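/*
 * Illustrative sketch of a typical caller (variable names are placeholders;
 * see __btrfs_mod_ref() below for a real one): fill in a struct btrfs_ref,
 * then queue the increment as a delayed ref:
 *
 *	struct btrfs_ref ref = { 0 };
 *
 *	btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
 *			       num_bytes, parent);
 *	btrfs_init_data_ref(&ref, root_objectid, ino, file_offset);
 *	ret = btrfs_inc_extent_ref(trans, &ref);
 */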
1384 /* Can return -ENOMEM */
1385 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1386                          struct btrfs_ref *generic_ref)
1387 {
1388         struct btrfs_fs_info *fs_info = trans->fs_info;
1389         int old_ref_mod, new_ref_mod;
1390         int ret;
1391
1392         ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
1393                generic_ref->action);
1394         BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
1395                generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID);
1396
1397         if (generic_ref->type == BTRFS_REF_METADATA)
1398                 ret = btrfs_add_delayed_tree_ref(trans, generic_ref,
1399                                 NULL, &old_ref_mod, &new_ref_mod);
1400         else
1401                 ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0,
1402                                                  &old_ref_mod, &new_ref_mod);
1403
1404         btrfs_ref_tree_mod(fs_info, generic_ref);
1405
1406         if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
1407                 sub_pinned_bytes(fs_info, generic_ref);
1408
1409         return ret;
1410 }
1411
1412 /*
1413  * __btrfs_inc_extent_ref - insert backreference for a given extent
1414  *
1415  * The counterpart is in __btrfs_free_extent(), with examples and more
1416  * details on how it works.
1417  *
1418  * @trans:          Handle of transaction
1419  *
1420  * @node:           The delayed ref node used to get the bytenr/length for
1421  *                  extent whose references are incremented.
1422  *
1423  * @parent:         If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
1424  *                  BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
1425  *                  bytenr of the parent block. Since new extents are always
1426  *                  created with indirect references, this will only be the case
1427  *                  when relocating a shared extent. In that case, root_objectid
1428  *                  will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must
1429  *                  be 0.
1430  *
1431  * @root_objectid:  The id of the root where this modification has originated,
1432  *                  this can be either one of the well-known metadata trees or
1433  *                  the subvolume id which references this extent.
1434  *
1435  * @owner:          For data extents it is the inode number of the owning file.
1436  *                  For metadata extents this parameter holds the level in the
1437  *                  tree of the extent.
1438  *
1439  * @offset:         For metadata extents the offset is ignored and is currently
1440  *                  always passed as 0. For data extents it is the file offset
1441  *                  this extent belongs to.
1442  *
1443  * @refs_to_add:    Number of references to add
1444  *
1445  * @extent_op:      Pointer to a structure holding information necessary when
1446  *                  updating a tree block's flags
1447  *
1448  */
1449 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1450                                   struct btrfs_delayed_ref_node *node,
1451                                   u64 parent, u64 root_objectid,
1452                                   u64 owner, u64 offset, int refs_to_add,
1453                                   struct btrfs_delayed_extent_op *extent_op)
1454 {
1455         struct btrfs_path *path;
1456         struct extent_buffer *leaf;
1457         struct btrfs_extent_item *item;
1458         struct btrfs_key key;
1459         u64 bytenr = node->bytenr;
1460         u64 num_bytes = node->num_bytes;
1461         u64 refs;
1462         int ret;
1463
1464         path = btrfs_alloc_path();
1465         if (!path)
1466                 return -ENOMEM;
1467
1468         /* this will set up the path even if it fails to insert the back ref */
1469         ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
1470                                            parent, root_objectid, owner,
1471                                            offset, refs_to_add, extent_op);
1472         if ((ret < 0 && ret != -EAGAIN) || !ret)
1473                 goto out;
1474
1475         /*
1476          * Ok we had -EAGAIN which means we didn't have space to insert an
1477          * inline extent ref, so just update the reference count and add a
1478          * normal backref.
1479          */
1480         leaf = path->nodes[0];
1481         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1482         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1483         refs = btrfs_extent_refs(leaf, item);
1484         btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1485         if (extent_op)
1486                 __run_delayed_extent_op(extent_op, leaf, item);
1487
1488         btrfs_mark_buffer_dirty(leaf);
1489         btrfs_release_path(path);
1490
1491         /* now insert the actual backref */
1492         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1493                 BUG_ON(refs_to_add != 1);
1494                 ret = insert_tree_block_ref(trans, path, bytenr, parent,
1495                                             root_objectid);
1496         } else {
1497                 ret = insert_extent_data_ref(trans, path, bytenr, parent,
1498                                              root_objectid, owner, offset,
1499                                              refs_to_add);
1500         }
1501         if (ret)
1502                 btrfs_abort_transaction(trans, ret);
1503 out:
1504         btrfs_free_path(path);
1505         return ret;
1506 }
1507
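/*
 * Process a single delayed data ref.  Depending on the action this either
 * inserts the reserved file extent (first ADD for a newly allocated extent),
 * adds one more backref to an existing extent, or drops a backref.
 */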
1508 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1509                                 struct btrfs_delayed_ref_node *node,
1510                                 struct btrfs_delayed_extent_op *extent_op,
1511                                 int insert_reserved)
1512 {
1513         int ret = 0;
1514         struct btrfs_delayed_data_ref *ref;
1515         struct btrfs_key ins;
1516         u64 parent = 0;
1517         u64 ref_root = 0;
1518         u64 flags = 0;
1519
1520         ins.objectid = node->bytenr;
1521         ins.offset = node->num_bytes;
1522         ins.type = BTRFS_EXTENT_ITEM_KEY;
1523
1524         ref = btrfs_delayed_node_to_data_ref(node);
1525         trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
1526
1527         if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1528                 parent = ref->parent;
1529         ref_root = ref->root;
1530
1531         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1532                 if (extent_op)
1533                         flags |= extent_op->flags_to_set;
1534                 ret = alloc_reserved_file_extent(trans, parent, ref_root,
1535                                                  flags, ref->objectid,
1536                                                  ref->offset, &ins,
1537                                                  node->ref_mod);
1538         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1539                 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
1540                                              ref->objectid, ref->offset,
1541                                              node->ref_mod, extent_op);
1542         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1543                 ret = __btrfs_free_extent(trans, node, parent,
1544                                           ref_root, ref->objectid,
1545                                           ref->offset, node->ref_mod,
1546                                           extent_op);
1547         } else {
1548                 BUG();
1549         }
1550         return ret;
1551 }
1552
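/*
 * Apply a delayed extent op to the extent item in @leaf: OR in the new
 * flags and, for tree blocks, optionally update the first key stored in
 * the btrfs_tree_block_info that follows the extent item.
 */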
1553 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
1554                                     struct extent_buffer *leaf,
1555                                     struct btrfs_extent_item *ei)
1556 {
1557         u64 flags = btrfs_extent_flags(leaf, ei);
1558         if (extent_op->update_flags) {
1559                 flags |= extent_op->flags_to_set;
1560                 btrfs_set_extent_flags(leaf, ei, flags);
1561         }
1562
1563         if (extent_op->update_key) {
1564                 struct btrfs_tree_block_info *bi;
1565                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
1566                 bi = (struct btrfs_tree_block_info *)(ei + 1);
1567                 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
1568         }
1569 }
1570
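/*
 * Run a delayed extent op against the extent tree: look up the extent item
 * for @head (trying the skinny metadata key first and falling back to the
 * full extent item), apply the op and mark the leaf dirty.
 */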
1571 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
1572                                  struct btrfs_delayed_ref_head *head,
1573                                  struct btrfs_delayed_extent_op *extent_op)
1574 {
1575         struct btrfs_fs_info *fs_info = trans->fs_info;
1576         struct btrfs_key key;
1577         struct btrfs_path *path;
1578         struct btrfs_extent_item *ei;
1579         struct extent_buffer *leaf;
1580         u32 item_size;
1581         int ret;
1582         int err = 0;
1583         int metadata = !extent_op->is_data;
1584
1585         if (TRANS_ABORTED(trans))
1586                 return 0;
1587
1588         if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
1589                 metadata = 0;
1590
1591         path = btrfs_alloc_path();
1592         if (!path)
1593                 return -ENOMEM;
1594
1595         key.objectid = head->bytenr;
1596
1597         if (metadata) {
1598                 key.type = BTRFS_METADATA_ITEM_KEY;
1599                 key.offset = extent_op->level;
1600         } else {
1601                 key.type = BTRFS_EXTENT_ITEM_KEY;
1602                 key.offset = head->num_bytes;
1603         }
1604
1605 again:
1606         ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
1607         if (ret < 0) {
1608                 err = ret;
1609                 goto out;
1610         }
1611         if (ret > 0) {
1612                 if (metadata) {
1613                         if (path->slots[0] > 0) {
1614                                 path->slots[0]--;
1615                                 btrfs_item_key_to_cpu(path->nodes[0], &key,
1616                                                       path->slots[0]);
1617                                 if (key.objectid == head->bytenr &&
1618                                     key.type == BTRFS_EXTENT_ITEM_KEY &&
1619                                     key.offset == head->num_bytes)
1620                                         ret = 0;
1621                         }
1622                         if (ret > 0) {
1623                                 btrfs_release_path(path);
1624                                 metadata = 0;
1625
1626                                 key.objectid = head->bytenr;
1627                                 key.offset = head->num_bytes;
1628                                 key.type = BTRFS_EXTENT_ITEM_KEY;
1629                                 goto again;
1630                         }
1631                 } else {
1632                         err = -EIO;
1633                         goto out;
1634                 }
1635         }
1636
1637         leaf = path->nodes[0];
1638         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1639
1640         if (unlikely(item_size < sizeof(*ei))) {
1641                 err = -EINVAL;
1642                 btrfs_print_v0_err(fs_info);
1643                 btrfs_abort_transaction(trans, err);
1644                 goto out;
1645         }
1646
1647         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1648         __run_delayed_extent_op(extent_op, leaf, ei);
1649
1650         btrfs_mark_buffer_dirty(leaf);
1651 out:
1652         btrfs_free_path(path);
1653         return err;
1654 }
1655
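/*
 * Process a single delayed tree block ref.  Tree blocks always carry a
 * ref_mod of exactly 1 per delayed ref node; anything else indicates
 * corruption and fails with -EIO.
 */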
1656 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
1657                                 struct btrfs_delayed_ref_node *node,
1658                                 struct btrfs_delayed_extent_op *extent_op,
1659                                 int insert_reserved)
1660 {
1661         int ret = 0;
1662         struct btrfs_delayed_tree_ref *ref;
1663         u64 parent = 0;
1664         u64 ref_root = 0;
1665
1666         ref = btrfs_delayed_node_to_tree_ref(node);
1667         trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);
1668
1669         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1670                 parent = ref->parent;
1671         ref_root = ref->root;
1672
1673         if (node->ref_mod != 1) {
1674                 btrfs_err(trans->fs_info,
1675         "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
1676                           node->bytenr, node->ref_mod, node->action, ref_root,
1677                           parent);
1678                 return -EIO;
1679         }
1680         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1681                 BUG_ON(!extent_op || !extent_op->update_flags);
1682                 ret = alloc_reserved_tree_block(trans, node, extent_op);
1683         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1684                 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
1685                                              ref->level, 0, 1, extent_op);
1686         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1687                 ret = __btrfs_free_extent(trans, node, parent, ref_root,
1688                                           ref->level, 0, 1, extent_op);
1689         } else {
1690                 BUG();
1691         }
1692         return ret;
1693 }
1694
1695 /* helper function to actually process a single delayed ref entry */
1696 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1697                                struct btrfs_delayed_ref_node *node,
1698                                struct btrfs_delayed_extent_op *extent_op,
1699                                int insert_reserved)
1700 {
1701         int ret = 0;
1702
1703         if (TRANS_ABORTED(trans)) {
1704                 if (insert_reserved)
1705                         btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
1706                 return 0;
1707         }
1708
1709         if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
1710             node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1711                 ret = run_delayed_tree_ref(trans, node, extent_op,
1712                                            insert_reserved);
1713         else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
1714                  node->type == BTRFS_SHARED_DATA_REF_KEY)
1715                 ret = run_delayed_data_ref(trans, node, extent_op,
1716                                            insert_reserved);
1717         else
1718                 BUG();
1719         if (ret && insert_reserved)
1720                 btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
1721         return ret;
1722 }
1723
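/*
 * Pick the next delayed ref node of @head to run, preferring additions over
 * drops (see the comment below on why).
 */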
1724 static inline struct btrfs_delayed_ref_node *
1725 select_delayed_ref(struct btrfs_delayed_ref_head *head)
1726 {
1727         struct btrfs_delayed_ref_node *ref;
1728
1729         if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
1730                 return NULL;
1731
1732         /*
1733          * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
1734          * This prevents the ref count from going down to zero (which would
1735          * delete the extent item from the extent tree) while there still are
1736          * references to add; those additions would fail to find the extent item.
1737          */
1738         if (!list_empty(&head->ref_add_list))
1739                 return list_first_entry(&head->ref_add_list,
1740                                 struct btrfs_delayed_ref_node, add_list);
1741
1742         ref = rb_entry(rb_first_cached(&head->ref_tree),
1743                        struct btrfs_delayed_ref_node, ref_node);
1744         ASSERT(list_empty(&ref->add_list));
1745         return ref;
1746 }
1747
1748 static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
1749                                       struct btrfs_delayed_ref_head *head)
1750 {
1751         spin_lock(&delayed_refs->lock);
1752         head->processing = 0;
1753         delayed_refs->num_heads_ready++;
1754         spin_unlock(&delayed_refs->lock);
1755         btrfs_delayed_ref_unlock(head);
1756 }
1757
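/*
 * Return the head's extent op if it still needs to run, or NULL.  When the
 * head must insert the reserved extent, the op is freed here; in that case
 * any flag update is expected to be applied when the reserved extent itself
 * is inserted.
 */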
1758 static struct btrfs_delayed_extent_op *cleanup_extent_op(
1759                                 struct btrfs_delayed_ref_head *head)
1760 {
1761         struct btrfs_delayed_extent_op *extent_op = head->extent_op;
1762
1763         if (!extent_op)
1764                 return NULL;
1765
1766         if (head->must_insert_reserved) {
1767                 head->extent_op = NULL;
1768                 btrfs_free_delayed_extent_op(extent_op);
1769                 return NULL;
1770         }
1771         return extent_op;
1772 }
1773
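/*
 * Run the head's extent op if there is one to run.  Returns 0 if there was
 * nothing to do, <0 on error, and 1 if an op was run (in which case
 * head->lock was dropped and the caller must restart its processing).
 */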
1774 static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
1775                                      struct btrfs_delayed_ref_head *head)
1776 {
1777         struct btrfs_delayed_extent_op *extent_op;
1778         int ret;
1779
1780         extent_op = cleanup_extent_op(head);
1781         if (!extent_op)
1782                 return 0;
1783         head->extent_op = NULL;
1784         spin_unlock(&head->lock);
1785         ret = run_delayed_extent_op(trans, head, extent_op);
1786         btrfs_free_delayed_extent_op(extent_op);
1787         return ret ? ret : 1;
1788 }
1789
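/*
 * Release the accounting held for a ref head that is going away.  If the
 * head ended up with a negative total_ref_mod, drop the corresponding
 * total_bytes_pinned counter, and for data heads also release the csum
 * leaves reserved for the deletion from the delayed refs rsv.
 */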
1790 void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
1791                                   struct btrfs_delayed_ref_root *delayed_refs,
1792                                   struct btrfs_delayed_ref_head *head)
1793 {
1794         int nr_items = 1;       /* Dropping this ref head update. */
1795
1796         if (head->total_ref_mod < 0) {
1797                 struct btrfs_space_info *space_info;
1798                 u64 flags;
1799
1800                 if (head->is_data)
1801                         flags = BTRFS_BLOCK_GROUP_DATA;
1802                 else if (head->is_system)
1803                         flags = BTRFS_BLOCK_GROUP_SYSTEM;
1804                 else
1805                         flags = BTRFS_BLOCK_GROUP_METADATA;
1806                 space_info = btrfs_find_space_info(fs_info, flags);
1807                 ASSERT(space_info);
1808                 percpu_counter_add_batch(&space_info->total_bytes_pinned,
1809                                    -head->num_bytes,
1810                                    BTRFS_TOTAL_BYTES_PINNED_BATCH);
1811
1812                 /*
1813                  * We had csum deletions accounted for in our delayed refs
1814                  * rsv, so we need to drop the csum leaves for this update
1815                  * from our delayed_refs_rsv.
1816                  */
1817                 if (head->is_data) {
1818                         spin_lock(&delayed_refs->lock);
1819                         delayed_refs->pending_csums -= head->num_bytes;
1820                         spin_unlock(&delayed_refs->lock);
1821                         nr_items += btrfs_csum_bytes_to_leaves(fs_info,
1822                                 head->num_bytes);
1823                 }
1824         }
1825
1826         btrfs_delayed_refs_rsv_release(fs_info, nr_items);
1827 }
1828
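/*
 * Final processing of a ref head whose ref list has been emptied: run any
 * pending extent op, re-check under the delayed refs lock that no new refs
 * were added (returning 1 to retry if they were), then remove the head,
 * pin reserved space / delete csums if needed and release its accounting.
 */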
1829 static int cleanup_ref_head(struct btrfs_trans_handle *trans,
1830                             struct btrfs_delayed_ref_head *head)
1831 {
1833         struct btrfs_fs_info *fs_info = trans->fs_info;
1834         struct btrfs_delayed_ref_root *delayed_refs;
1835         int ret;
1836
1837         delayed_refs = &trans->transaction->delayed_refs;
1838
1839         ret = run_and_cleanup_extent_op(trans, head);
1840         if (ret < 0) {
1841                 unselect_delayed_ref_head(delayed_refs, head);
1842                 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
1843                 return ret;
1844         } else if (ret) {
1845                 return ret;
1846         }
1847
1848         /*
1849          * Need to drop our head ref lock and re-acquire the delayed ref lock
1850          * and then re-check to make sure no new refs got added.
1851          */
1852         spin_unlock(&head->lock);
1853         spin_lock(&delayed_refs->lock);
1854         spin_lock(&head->lock);
1855         if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
1856                 spin_unlock(&head->lock);
1857                 spin_unlock(&delayed_refs->lock);
1858                 return 1;
1859         }
1860         btrfs_delete_ref_head(delayed_refs, head);
1861         spin_unlock(&head->lock);
1862         spin_unlock(&delayed_refs->lock);
1863
1864         if (head->must_insert_reserved) {
1865                 btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1);
1866                 if (head->is_data) {
1867                         ret = btrfs_del_csums(trans, fs_info->csum_root,
1868                                               head->bytenr, head->num_bytes);
1869                 }
1870         }
1871
1872         btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
1873
1874         trace_run_delayed_ref_head(fs_info, head, 0);
1875         btrfs_delayed_ref_unlock(head);
1876         btrfs_put_delayed_ref_head(head);
1877         return 0;
1878 }
1879
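/*
 * Select and lock the next delayed ref head to process.  Returns NULL when
 * no head is available, or ERR_PTR(-EAGAIN) if the chosen head went away
 * while we waited for its mutex and the caller should pick another one.
 */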
1880 static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
1881                                         struct btrfs_trans_handle *trans)
1882 {
1883         struct btrfs_delayed_ref_root *delayed_refs =
1884                 &trans->transaction->delayed_refs;
1885         struct btrfs_delayed_ref_head *head = NULL;
1886         int ret;
1887
1888         spin_lock(&delayed_refs->lock);
1889         head = btrfs_select_ref_head(delayed_refs);
1890         if (!head) {
1891                 spin_unlock(&delayed_refs->lock);
1892                 return head;
1893         }
1894
1895         /*
1896          * Grab the lock that says we are going to process all the refs for
1897          * this head.
1898          */
1899         ret = btrfs_delayed_ref_lock(delayed_refs, head);
1900         spin_unlock(&delayed_refs->lock);
1901
1902         /*
1903          * We may have dropped the spin lock to get the head mutex lock, and
1904          * that might have given someone else time to free the head.  If that's
1905          * true, it has been removed from our list and we can move on.
1906          */
1907         if (ret == -EAGAIN)
1908                 head = ERR_PTR(-EAGAIN);
1909
1910         return head;
1911 }
1912
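/*
 * Run all the delayed refs queued on @locked_ref, merging add/drop pairs as
 * we go.  Called with the head's mutex and spinlock held; returns 0 with
 * locked_ref->lock re-taken once the ref tree is empty, -EAGAIN if a ref is
 * still blocked by a pending tree mod log user, or a negative error from
 * processing an individual ref.
 */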
1913 static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
1914                                     struct btrfs_delayed_ref_head *locked_ref,
1915                                     unsigned long *run_refs)
1916 {
1917         struct btrfs_fs_info *fs_info = trans->fs_info;
1918         struct btrfs_delayed_ref_root *delayed_refs;
1919         struct btrfs_delayed_extent_op *extent_op;
1920         struct btrfs_delayed_ref_node *ref;
1921         int must_insert_reserved = 0;
1922         int ret;
1923
1924         delayed_refs = &trans->transaction->delayed_refs;
1925
1926         lockdep_assert_held(&locked_ref->mutex);
1927         lockdep_assert_held(&locked_ref->lock);
1928
1929         while ((ref = select_delayed_ref(locked_ref))) {
1930                 if (ref->seq &&
1931                     btrfs_check_delayed_seq(fs_info, ref->seq)) {
1932                         spin_unlock(&locked_ref->lock);
1933                         unselect_delayed_ref_head(delayed_refs, locked_ref);
1934                         return -EAGAIN;
1935                 }
1936
1937                 (*run_refs)++;
1938                 ref->in_tree = 0;
1939                 rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
1940                 RB_CLEAR_NODE(&ref->ref_node);
1941                 if (!list_empty(&ref->add_list))
1942                         list_del(&ref->add_list);
1943                 /*
1944                  * When we play the delayed ref, also correct the ref_mod on
1945                  * the head.
1946                  */
1947                 switch (ref->action) {
1948                 case BTRFS_ADD_DELAYED_REF:
1949                 case BTRFS_ADD_DELAYED_EXTENT:
1950                         locked_ref->ref_mod -= ref->ref_mod;
1951                         break;
1952                 case BTRFS_DROP_DELAYED_REF:
1953                         locked_ref->ref_mod += ref->ref_mod;
1954                         break;
1955                 default:
1956                         WARN_ON(1);
1957                 }
1958                 atomic_dec(&delayed_refs->num_entries);
1959
1960                 /*
1961                  * Record the must_insert_reserved flag before we drop the
1962                  * spin lock.
1963                  */
1964                 must_insert_reserved = locked_ref->must_insert_reserved;
1965                 locked_ref->must_insert_reserved = 0;
1966
1967                 extent_op = locked_ref->extent_op;
1968                 locked_ref->extent_op = NULL;
1969                 spin_unlock(&locked_ref->lock);
1970
1971                 ret = run_one_delayed_ref(trans, ref, extent_op,
1972                                           must_insert_reserved);
1973
1974                 btrfs_free_delayed_extent_op(extent_op);
1975                 if (ret) {
1976                         unselect_delayed_ref_head(delayed_refs, locked_ref);
1977                         btrfs_put_delayed_ref(ref);
1978                         btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
1979                                     ret);
1980                         return ret;
1981                 }
1982
1983                 btrfs_put_delayed_ref(ref);
1984                 cond_resched();
1985
1986                 spin_lock(&locked_ref->lock);
1987                 btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
1988         }
1989
1990         return 0;
1991 }
1992
1993 /*
1994  * Returns 0 on success or if called with an already aborted transaction.
1995  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
1996  */
1997 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
1998                                              unsigned long nr)
1999 {
2000         struct btrfs_fs_info *fs_info = trans->fs_info;
2001         struct btrfs_delayed_ref_root *delayed_refs;
2002         struct btrfs_delayed_ref_head *locked_ref = NULL;
2003         ktime_t start = ktime_get();
2004         int ret;
2005         unsigned long count = 0;
2006         unsigned long actual_count = 0;
2007
2008         delayed_refs = &trans->transaction->delayed_refs;
2009         do {
2010                 if (!locked_ref) {
2011                         locked_ref = btrfs_obtain_ref_head(trans);
2012                         if (IS_ERR_OR_NULL(locked_ref)) {
2013                                 if (PTR_ERR(locked_ref) == -EAGAIN) {
2014                                         continue;
2015                                 } else {
2016                                         break;
2017                                 }
2018                         }
2019                         count++;
2020                 }
2021                 /*
2022                  * We need to try and merge add/drops of the same ref since we
2023                  * can run into issues with relocate dropping the implicit ref
2024                  * and then it being added back again before the drop can
2025                  * finish.  If we merged anything we need to re-loop so we can
2026                  * get a good ref.
2027                  * Or we can get node references of the same type that weren't
2028                  * merged when created due to bumps in the tree mod seq, and
2029                  * we need to merge them to prevent adding an inline extent
2030                  * backref before dropping it (triggering a BUG_ON at
2031                  * insert_inline_extent_backref()).
2032                  */
2033                 spin_lock(&locked_ref->lock);
2034                 btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
2035
2036                 ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
2037                                                       &actual_count);
2038                 if (ret < 0 && ret != -EAGAIN) {
2039                         /*
2040                          * Error, btrfs_run_delayed_refs_for_head already
2041                          * unlocked everything so just bail out
2042                          */
2043                         return ret;
2044                 } else if (!ret) {
2045                         /*
2046                          * Success, perform the usual cleanup of a processed
2047                          * head
2048                          */
2049                         ret = cleanup_ref_head(trans, locked_ref);
2050                         if (ret > 0) {
2051                                 /* We dropped our lock, we need to loop. */
2052                                 ret = 0;
2053                                 continue;
2054                         } else if (ret) {
2055                                 return ret;
2056                         }
2057                 }
2058
2059                 /*
2060                  * Either success case or btrfs_run_delayed_refs_for_head
2061                  * returned -EAGAIN, meaning we need to select another head.
2062                  */
2064                 locked_ref = NULL;
2065                 cond_resched();
2066         } while ((nr != -1 && count < nr) || locked_ref);
2067
2068         /*
2069          * We don't want to include ref heads since we can have empty ref
2070          * heads, and those would drastically skew our runtime down because
2071          * they only do accounting, no actual extent tree updates.
2072          */
2073         if (actual_count > 0) {
2074                 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
2075                 u64 avg;
2076
2077                 /*
2078                  * We weigh the current average higher than our current runtime
2079                  * to avoid large swings in the average.
2080                  */
2081                 spin_lock(&delayed_refs->lock);
2082                 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
2083                 fs_info->avg_delayed_ref_runtime = avg >> 2;    /* div by 4 */
2084                 spin_unlock(&delayed_refs->lock);
2085         }
2086         return 0;
2087 }
2088
2089 #ifdef SCRAMBLE_DELAYED_REFS
2090 /*
2091  * Normally delayed refs get processed in ascending bytenr order. This
2092  * correlates in most cases to the order added. To expose dependencies on this
2093  * order, we start to process the tree in the middle instead of the beginning.
2094  */
2095 static u64 find_middle(struct rb_root *root)
2096 {
2097         struct rb_node *n = root->rb_node;
2098         struct btrfs_delayed_ref_node *entry;
2099         int alt = 1;
2100         u64 middle;
2101         u64 first = 0, last = 0;
2102
2103         n = rb_first(root);
2104         if (n) {
2105                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2106                 first = entry->bytenr;
2107         }
2108         n = rb_last(root);
2109         if (n) {
2110                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2111                 last = entry->bytenr;
2112         }
2113         n = root->rb_node;
2114
2115         while (n) {
2116                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2117                 WARN_ON(!entry->in_tree);
2118
2119                 middle = entry->bytenr;
2120
2121                 if (alt)
2122                         n = n->rb_left;
2123                 else
2124                         n = n->rb_right;
2125
2126                 alt = 1 - alt;
2127         }
2128         return middle;
2129 }
2130 #endif
2131
2132 /*
2133  * This starts processing the delayed reference count updates and
2134  * extent insertions we have queued up so far.  count can be
2135  * 0, which means to process everything in the tree at the start
2136  * of the run (but not newly added entries), or it can be some target
2137  * number you'd like to process.
2138  *
2139  * Returns 0 on success or if called with an aborted transaction.
2140  * Returns <0 on error and aborts the transaction.
2141  */
2142 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2143                            unsigned long count)
2144 {
2145         struct btrfs_fs_info *fs_info = trans->fs_info;
2146         struct rb_node *node;
2147         struct btrfs_delayed_ref_root *delayed_refs;
2148         struct btrfs_delayed_ref_head *head;
2149         int ret;
2150         int run_all = count == (unsigned long)-1;
2151
2152         /* We'll clean this up in btrfs_cleanup_transaction */
2153         if (TRANS_ABORTED(trans))
2154                 return 0;
2155
2156         if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
2157                 return 0;
2158
2159         delayed_refs = &trans->transaction->delayed_refs;
2160         if (count == 0)
2161                 count = atomic_read(&delayed_refs->num_entries) * 2;
2162
2163 again:
2164 #ifdef SCRAMBLE_DELAYED_REFS
2165         delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2166 #endif
2167         ret = __btrfs_run_delayed_refs(trans, count);
2168         if (ret < 0) {
2169                 btrfs_abort_transaction(trans, ret);
2170                 return ret;
2171         }
2172
2173         if (run_all) {
2174                 btrfs_create_pending_block_groups(trans);
2175
2176                 spin_lock(&delayed_refs->lock);
2177                 node = rb_first_cached(&delayed_refs->href_root);
2178                 if (!node) {
2179                         spin_unlock(&delayed_refs->lock);
2180                         goto out;
2181                 }
2182                 head = rb_entry(node, struct btrfs_delayed_ref_head,
2183                                 href_node);
2184                 refcount_inc(&head->refs);
2185                 spin_unlock(&delayed_refs->lock);
2186
2187                 /* Mutex was contended, block until it's released and retry. */
2188                 mutex_lock(&head->mutex);
2189                 mutex_unlock(&head->mutex);
2190
2191                 btrfs_put_delayed_ref_head(head);
2192                 cond_resched();
2193                 goto again;
2194         }
2195 out:
2196         return 0;
2197 }
2198
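/*
 * Queue a delayed extent op that ORs @flags into the flags of the extent
 * item backing @eb.  The update is applied when delayed refs are run.
 */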
2199 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2200                                 struct extent_buffer *eb, u64 flags,
2201                                 int level, int is_data)
2202 {
2203         struct btrfs_delayed_extent_op *extent_op;
2204         int ret;
2205
2206         extent_op = btrfs_alloc_delayed_extent_op();
2207         if (!extent_op)
2208                 return -ENOMEM;
2209
2210         extent_op->flags_to_set = flags;
2211         extent_op->update_flags = true;
2212         extent_op->update_key = false;
2213         extent_op->is_data = is_data ? true : false;
2214         extent_op->level = level;
2215
2216         ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op);
2217         if (ret)
2218                 btrfs_free_delayed_extent_op(extent_op);
2219         return ret;
2220 }
2221
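/*
 * Check the delayed refs (not yet committed to the extent tree) of the
 * currently running transaction for a reference to @bytenr from anyone
 * other than (@root, @objectid, @offset).  Returns 1 if such a cross
 * reference exists, 0 if not, and -EAGAIN if the head's mutex was
 * contended and the caller should retry.
 */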
2222 static noinline int check_delayed_ref(struct btrfs_root *root,
2223                                       struct btrfs_path *path,
2224                                       u64 objectid, u64 offset, u64 bytenr)
2225 {
2226         struct btrfs_delayed_ref_head *head;
2227         struct btrfs_delayed_ref_node *ref;
2228         struct btrfs_delayed_data_ref *data_ref;
2229         struct btrfs_delayed_ref_root *delayed_refs;
2230         struct btrfs_transaction *cur_trans;
2231         struct rb_node *node;
2232         int ret = 0;
2233
2234         spin_lock(&root->fs_info->trans_lock);
2235         cur_trans = root->fs_info->running_transaction;
2236         if (cur_trans)
2237                 refcount_inc(&cur_trans->use_count);
2238         spin_unlock(&root->fs_info->trans_lock);
2239         if (!cur_trans)
2240                 return 0;
2241
2242         delayed_refs = &cur_trans->delayed_refs;
2243         spin_lock(&delayed_refs->lock);
2244         head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
2245         if (!head) {
2246                 spin_unlock(&delayed_refs->lock);
2247                 btrfs_put_transaction(cur_trans);
2248                 return 0;
2249         }
2250
2251         if (!mutex_trylock(&head->mutex)) {
2252                 refcount_inc(&head->refs);
2253                 spin_unlock(&delayed_refs->lock);
2254
2255                 btrfs_release_path(path);
2256
2257                 /*
2258                  * Mutex was contended, block until it's released and let
2259                  * caller try again
2260                  * the caller try again.
2261                 mutex_lock(&head->mutex);
2262                 mutex_unlock(&head->mutex);
2263                 btrfs_put_delayed_ref_head(head);
2264                 btrfs_put_transaction(cur_trans);
2265                 return -EAGAIN;
2266         }
2267         spin_unlock(&delayed_refs->lock);
2268
2269         spin_lock(&head->lock);
2270         /*
2271          * XXX: We should replace this with a proper search function in the
2272          * future.
2273          */
2274         for (node = rb_first_cached(&head->ref_tree); node;
2275              node = rb_next(node)) {
2276                 ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
2277                 /* If it's a shared ref we know a cross reference exists */
2278                 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
2279                         ret = 1;
2280                         break;
2281                 }
2282
2283                 data_ref = btrfs_delayed_node_to_data_ref(ref);
2284
2285                 /*
2286                  * If our ref doesn't match the one we're currently looking at
2287                  * then we have a cross reference.
2288                  */
2289                 if (data_ref->root != root->root_key.objectid ||
2290                     data_ref->objectid != objectid ||
2291                     data_ref->offset != offset) {
2292                         ret = 1;
2293                         break;
2294                 }
2295         }
2296         spin_unlock(&head->lock);
2297         mutex_unlock(&head->mutex);
2298         btrfs_put_transaction(cur_trans);
2299         return ret;
2300 }
2301
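/*
 * The same check against the committed extent tree: returns 0 only if the
 * extent item carries exactly one inline data ref matching (@root,
 * @objectid, @offset), 1 if it is (or may be) shared, and -ENOENT if no
 * matching extent item was found.
 */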
2302 static noinline int check_committed_ref(struct btrfs_root *root,
2303                                         struct btrfs_path *path,
2304                                         u64 objectid, u64 offset, u64 bytenr,
2305                                         bool strict)
2306 {
2307         struct btrfs_fs_info *fs_info = root->fs_info;
2308         struct btrfs_root *extent_root = fs_info->extent_root;
2309         struct extent_buffer *leaf;
2310         struct btrfs_extent_data_ref *ref;
2311         struct btrfs_extent_inline_ref *iref;
2312         struct btrfs_extent_item *ei;
2313         struct btrfs_key key;
2314         u32 item_size;
2315         int type;
2316         int ret;
2317
2318         key.objectid = bytenr;
2319         key.offset = (u64)-1;
2320         key.type = BTRFS_EXTENT_ITEM_KEY;
2321
2322         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2323         if (ret < 0)
2324                 goto out;
2325         BUG_ON(ret == 0); /* Corruption */
2326
2327         ret = -ENOENT;
2328         if (path->slots[0] == 0)
2329                 goto out;
2330
2331         path->slots[0]--;
2332         leaf = path->nodes[0];
2333         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2334
2335         if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2336                 goto out;
2337
2338         ret = 1;
2339         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2340         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2341
2342         /* If extent item has more than 1 inline ref then it's shared */
2343         if (item_size != sizeof(*ei) +
2344             btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2345                 goto out;
2346
2347         /*
2348          * If the extent was created before the last snapshot, treat it as
2349          * shared unless that snapshot has been deleted. Only use this
2350          * heuristic when strict is false.
2350          */
2351         if (!strict &&
2352             (btrfs_extent_generation(leaf, ei) <=
2353              btrfs_root_last_snapshot(&root->root_item)))
2354                 goto out;
2355
2356         iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2357
2358         /* If this extent has SHARED_DATA_REF then it's shared */
2359         type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
2360         if (type != BTRFS_EXTENT_DATA_REF_KEY)
2361                 goto out;
2362
2363         ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2364         if (btrfs_extent_refs(leaf, ei) !=
2365             btrfs_extent_data_ref_count(leaf, ref) ||
2366             btrfs_extent_data_ref_root(leaf, ref) !=
2367             root->root_key.objectid ||
2368             btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2369             btrfs_extent_data_ref_offset(leaf, ref) != offset)
2370                 goto out;
2371
2372         ret = 0;
2373 out:
2374         return ret;
2375 }
2376
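/*
 * btrfs_cross_ref_exist - does anyone else reference this data extent?
 *
 * Combines check_committed_ref() and check_delayed_ref(), retrying while
 * the delayed check reports contention.  Returns > 0 if a cross reference
 * exists (the extent is shared), 0 if not, and < 0 on error.
 */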
2377 int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
2378                           u64 bytenr, bool strict)
2379 {
2380         struct btrfs_path *path;
2381         int ret;
2382
2383         path = btrfs_alloc_path();
2384         if (!path)
2385                 return -ENOMEM;
2386
2387         do {
2388                 ret = check_committed_ref(root, path, objectid,
2389                                           offset, bytenr, strict);
2390                 if (ret && ret != -ENOENT)
2391                         goto out;
2392
2393                 ret = check_delayed_ref(root, path, objectid, offset, bytenr);
2394         } while (ret == -EAGAIN);
2395
2396 out:
2397         btrfs_free_path(path);
2398         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2399                 WARN_ON(ret > 0);
2400         return ret;
2401 }
2402
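/*
 * Walk every pointer in @buf and queue a delayed ref update for it: file
 * extent backrefs for leaves, child tree block backrefs for nodes.  @inc
 * selects between adding and dropping references, @full_backref between
 * shared (parent = buf->start) and indirect backrefs.
 */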
2403 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2404                            struct btrfs_root *root,
2405                            struct extent_buffer *buf,
2406                            int full_backref, int inc)
2407 {
2408         struct btrfs_fs_info *fs_info = root->fs_info;
2409         u64 bytenr;
2410         u64 num_bytes;
2411         u64 parent;
2412         u64 ref_root;
2413         u32 nritems;
2414         struct btrfs_key key;
2415         struct btrfs_file_extent_item *fi;
2416         struct btrfs_ref generic_ref = { 0 };
2417         bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
2418         int i;
2419         int action;
2420         int level;
2421         int ret = 0;
2422
2423         if (btrfs_is_testing(fs_info))
2424                 return 0;
2425
2426         ref_root = btrfs_header_owner(buf);
2427         nritems = btrfs_header_nritems(buf);
2428         level = btrfs_header_level(buf);
2429
2430         if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0)
2431                 return 0;
2432
2433         if (full_backref)
2434                 parent = buf->start;
2435         else
2436                 parent = 0;
2437         if (inc)
2438                 action = BTRFS_ADD_DELAYED_REF;
2439         else
2440                 action = BTRFS_DROP_DELAYED_REF;
2441
2442         for (i = 0; i < nritems; i++) {
2443                 if (level == 0) {
2444                         btrfs_item_key_to_cpu(buf, &key, i);
2445                         if (key.type != BTRFS_EXTENT_DATA_KEY)
2446                                 continue;
2447                         fi = btrfs_item_ptr(buf, i,
2448                                             struct btrfs_file_extent_item);
2449                         if (btrfs_file_extent_type(buf, fi) ==
2450                             BTRFS_FILE_EXTENT_INLINE)
2451                                 continue;
2452                         bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2453                         if (bytenr == 0)
2454                                 continue;
2455
2456                         num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2457                         key.offset -= btrfs_file_extent_offset(buf, fi);
2458                         btrfs_init_generic_ref(&generic_ref, action, bytenr,
2459                                                num_bytes, parent);
2460                         generic_ref.real_root = root->root_key.objectid;
2461                         btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
2462                                             key.offset);
2463                         generic_ref.skip_qgroup = for_reloc;
2464                         if (inc)
2465                                 ret = btrfs_inc_extent_ref(trans, &generic_ref);
2466                         else
2467                                 ret = btrfs_free_extent(trans, &generic_ref);
2468                         if (ret)
2469                                 goto fail;
2470                 } else {
2471                         bytenr = btrfs_node_blockptr(buf, i);
2472                         num_bytes = fs_info->nodesize;
2473                         btrfs_init_generic_ref(&generic_ref, action, bytenr,
2474                                                num_bytes, parent);
2475                         generic_ref.real_root = root->root_key.objectid;
2476                         btrfs_init_tree_ref(&generic_ref, level - 1, ref_root);
2477                         generic_ref.skip_qgroup = for_reloc;
2478                         if (inc)
2479                                 ret = btrfs_inc_extent_ref(trans, &generic_ref);
2480                         else
2481                                 ret = btrfs_free_extent(trans, &generic_ref);
2482                         if (ret)
2483                                 goto fail;
2484                 }
2485         }
2486         return 0;
2487 fail:
2488         return ret;
2489 }
2490
2491 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2492                   struct extent_buffer *buf, int full_backref)
2493 {
2494         return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
2495 }
2496
2497 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2498                   struct extent_buffer *buf, int full_backref)
2499 {
2500         return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
2501 }
2502
2503 int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
2504 {
2505         struct btrfs_block_group *block_group;
2506         int readonly = 0;
2507
2508         block_group = btrfs_lookup_block_group(fs_info, bytenr);
2509         if (!block_group || block_group->ro)
2510                 readonly = 1;
2511         if (block_group)
2512                 btrfs_put_block_group(block_group);
2513         return readonly;
2514 }
2515
2516 static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
2517 {
2518         struct btrfs_fs_info *fs_info = root->fs_info;
2519         u64 flags;
2520         u64 ret;
2521
2522         if (data)
2523                 flags = BTRFS_BLOCK_GROUP_DATA;
2524         else if (root == fs_info->chunk_root)
2525                 flags = BTRFS_BLOCK_GROUP_SYSTEM;
2526         else
2527                 flags = BTRFS_BLOCK_GROUP_METADATA;
2528
2529         ret = btrfs_get_alloc_profile(fs_info, flags);
2530         return ret;
2531 }
2532
2533 static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
2534 {
2535         struct btrfs_block_group *cache;
2536         u64 bytenr;
2537
2538         spin_lock(&fs_info->block_group_cache_lock);
2539         bytenr = fs_info->first_logical_byte;
2540         spin_unlock(&fs_info->block_group_cache_lock);
2541
2542         if (bytenr < (u64)-1)
2543                 return bytenr;
2544
2545         cache = btrfs_lookup_first_block_group(fs_info, search_start);
2546         if (!cache)
2547                 return 0;
2548
2549         bytenr = cache->start;
2550         btrfs_put_block_group(cache);
2551
2552         return bytenr;
2553 }
2554
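/*
 * Account @num_bytes of @cache as pinned (moving it out of the reserved
 * counters when @reserved is set) and mark the range dirty in the
 * transaction's pinned_extents tree so it is unpinned after commit.
 */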
2555 static int pin_down_extent(struct btrfs_trans_handle *trans,
2556                            struct btrfs_block_group *cache,
2557                            u64 bytenr, u64 num_bytes, int reserved)
2558 {
2559         struct btrfs_fs_info *fs_info = cache->fs_info;
2560
2561         spin_lock(&cache->space_info->lock);
2562         spin_lock(&cache->lock);
2563         cache->pinned += num_bytes;
2564         btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
2565                                              num_bytes);
2566         if (reserved) {
2567                 cache->reserved -= num_bytes;
2568                 cache->space_info->bytes_reserved -= num_bytes;
2569         }
2570         spin_unlock(&cache->lock);
2571         spin_unlock(&cache->space_info->lock);
2572
2573         percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
2574                     num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
2575         set_extent_dirty(&trans->transaction->pinned_extents, bytenr,
2576                          bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
2577         return 0;
2578 }
2579
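/*
 * Pin an extent by its logical address.  The block group lookup must
 * succeed here, since callers only pin extents they know are allocated.
 */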
2580 int btrfs_pin_extent(struct btrfs_trans_handle *trans,
2581                      u64 bytenr, u64 num_bytes, int reserved)
2582 {
2583         struct btrfs_block_group *cache;
2584
2585         cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
2586         BUG_ON(!cache); /* Logic error */
2587
2588         pin_down_extent(trans, cache, bytenr, num_bytes, reserved);
2589
2590         btrfs_put_block_group(cache);
2591         return 0;
2592 }
2593
2594 /*
2595  * this function must be called within a transaction
2596  */
2597 int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
2598                                     u64 bytenr, u64 num_bytes)
2599 {
2600         struct btrfs_block_group *cache;
2601         int ret;
2602
2603         btrfs_add_excluded_extent(trans->fs_info, bytenr, num_bytes);
2604
2605         cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
2606         if (!cache)
2607                 return -EINVAL;
2608
2609         /*
2610          * pull in the free space cache (if any) so that our pin
2611          * removes the free space from the cache.  We have load_only set
2612          * to one because the slow code that reads in the free extents does check
2613          * the pinned extents.
2614          */
2615         btrfs_cache_block_group(cache, 1);
2616
2617         pin_down_extent(trans, cache, bytenr, num_bytes, 0);
2618
2619         /* remove us from the free space cache (if we're there at all) */
2620         ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
2621         btrfs_put_block_group(cache);
2622         return ret;
2623 }
2624
2625 static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
2626                                    u64 start, u64 num_bytes)
2627 {
2628         int ret;
2629         struct btrfs_block_group *block_group;
2630         struct btrfs_caching_control *caching_ctl;
2631
2632         block_group = btrfs_lookup_block_group(fs_info, start);
2633         if (!block_group)
2634                 return -EINVAL;
2635
2636         btrfs_cache_block_group(block_group, 0);
2637         caching_ctl = btrfs_get_caching_control(block_group);
2638
2639         if (!caching_ctl) {
2640                 /* Logic error */
2641                 BUG_ON(!btrfs_block_group_done(block_group));
2642                 ret = btrfs_remove_free_space(block_group, start, num_bytes);
2643         } else {
2644                 /*
2645                  * We must wait for v1 caching to finish, otherwise we may not
2646                  * remove our space.
2647                  */
2648                 btrfs_wait_space_cache_v1_finished(block_group, caching_ctl);
2649                 mutex_lock(&caching_ctl->mutex);
2650
2651                 if (start >= caching_ctl->progress) {
2652                         ret = btrfs_add_excluded_extent(fs_info, start,
2653                                                         num_bytes);
2654                 } else if (start + num_bytes <= caching_ctl->progress) {
2655                         ret = btrfs_remove_free_space(block_group,
2656                                                       start, num_bytes);
2657                 } else {
2658                         num_bytes = caching_ctl->progress - start;
2659                         ret = btrfs_remove_free_space(block_group,
2660                                                       start, num_bytes);
2661                         if (ret)
2662                                 goto out_lock;
2663
2664                         num_bytes = (start + num_bytes) -
2665                                 caching_ctl->progress;
2666                         start = caching_ctl->progress;
2667                         ret = btrfs_add_excluded_extent(fs_info, start,
2668                                                         num_bytes);
2669                 }
2670 out_lock:
2671                 mutex_unlock(&caching_ctl->mutex);
2672                 btrfs_put_caching_control(caching_ctl);
2673         }
2674         btrfs_put_block_group(block_group);
2675         return ret;
2676 }
2677
2678 int btrfs_exclude_logged_extents(struct extent_buffer *eb)
2679 {
2680         struct btrfs_fs_info *fs_info = eb->fs_info;
2681         struct btrfs_file_extent_item *item;
2682         struct btrfs_key key;
2683         int found_type;
2684         int i;
2685         int ret = 0;
2686
2687         if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
2688                 return 0;
2689
2690         for (i = 0; i < btrfs_header_nritems(eb); i++) {
2691                 btrfs_item_key_to_cpu(eb, &key, i);
2692                 if (key.type != BTRFS_EXTENT_DATA_KEY)
2693                         continue;
2694                 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
2695                 found_type = btrfs_file_extent_type(eb, item);
2696                 if (found_type == BTRFS_FILE_EXTENT_INLINE)
2697                         continue;
2698                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
2699                         continue;
2700                 key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
2701                 key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
2702                 ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
2703                 if (ret)
2704                         break;
2705         }
2706
2707         return ret;
2708 }
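
/*
 * Note the two skips above: inline extents store their data in the leaf
 * itself and holes have a zero disk_bytenr, so neither owns a disk extent
 * that could need excluding.
 */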
2709
2710 static void
2711 btrfs_inc_block_group_reservations(struct btrfs_block_group *bg)
2712 {
2713         atomic_inc(&bg->reservations);
2714 }
2715
2716 /*
2717  * Returns the free cluster for the given space info and sets empty_cluster to
2718  * what it should be based on the mount options.
2719  */
2720 static struct btrfs_free_cluster *
2721 fetch_cluster_info(struct btrfs_fs_info *fs_info,
2722                    struct btrfs_space_info *space_info, u64 *empty_cluster)
2723 {
2724         struct btrfs_free_cluster *ret = NULL;
2725
2726         *empty_cluster = 0;
2727         if (btrfs_mixed_space_info(space_info))
2728                 return ret;
2729
2730         if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
2731                 ret = &fs_info->meta_alloc_cluster;
2732                 if (btrfs_test_opt(fs_info, SSD))
2733                         *empty_cluster = SZ_2M;
2734                 else
2735                         *empty_cluster = SZ_64K;
2736         } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
2737                    btrfs_test_opt(fs_info, SSD_SPREAD)) {
2738                 *empty_cluster = SZ_2M;
2739                 ret = &fs_info->data_alloc_cluster;
2740         }
2741
2742         return ret;
2743 }
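
/*
 * A minimal usage sketch (hypothetical caller, metadata space info on an
 * SSD mount):
 *
 *	u64 empty_cluster;
 *	struct btrfs_free_cluster *cluster;
 *
 *	cluster = fetch_cluster_info(fs_info, space_info, &empty_cluster);
 *
 * yields cluster == &fs_info->meta_alloc_cluster and empty_cluster == SZ_2M.
 * Mixed space infos never get a cluster, and data space infos only get one
 * when mounted with ssd_spread.
 */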
2744
2745 static int unpin_extent_range(struct btrfs_fs_info *fs_info,
2746                               u64 start, u64 end,
2747                               const bool return_free_space)
2748 {
2749         struct btrfs_block_group *cache = NULL;
2750         struct btrfs_space_info *space_info;
2751         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
2752         struct btrfs_free_cluster *cluster = NULL;
2753         u64 len;
2754         u64 total_unpinned = 0;
2755         u64 empty_cluster = 0;
2756         bool readonly;
2757
2758         while (start <= end) {
2759                 readonly = false;
2760                 if (!cache ||
2761                     start >= cache->start + cache->length) {
2762                         if (cache)
2763                                 btrfs_put_block_group(cache);
2764                         total_unpinned = 0;
2765                         cache = btrfs_lookup_block_group(fs_info, start);
2766                         BUG_ON(!cache); /* Logic error */
2767
2768                         cluster = fetch_cluster_info(fs_info,
2769                                                      cache->space_info,
2770                                                      &empty_cluster);
2771                         empty_cluster <<= 1;
2772                 }
2773
2774                 len = cache->start + cache->length - start;
2775                 len = min(len, end + 1 - start);
2776
2777                 down_read(&fs_info->commit_root_sem);
2778                 if (start < cache->last_byte_to_unpin && return_free_space) {
2779                         u64 add_len = min(len, cache->last_byte_to_unpin - start);
2780
2781                         btrfs_add_free_space(cache, start, add_len);
2782                 }
2783                 up_read(&fs_info->commit_root_sem);
2784
2785                 start += len;
2786                 total_unpinned += len;
2787                 space_info = cache->space_info;
2788
2789                 /*
2790                  * If this space cluster has been marked as fragmented and we've
2791                  * unpinned enough in this block group to potentially allow a
2792                  * cluster to be created inside of it, go ahead and clear the
2793                  * fragmented flag.
2794                  */
2795                 if (cluster && cluster->fragmented &&
2796                     total_unpinned > empty_cluster) {
2797                         spin_lock(&cluster->lock);
2798                         cluster->fragmented = 0;
2799                         spin_unlock(&cluster->lock);
2800                 }
2801
2802                 spin_lock(&space_info->lock);
2803                 spin_lock(&cache->lock);
2804                 cache->pinned -= len;
2805                 btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
2806                 space_info->max_extent_size = 0;
2807                 percpu_counter_add_batch(&space_info->total_bytes_pinned,
2808                             -len, BTRFS_TOTAL_BYTES_PINNED_BATCH);
2809                 if (cache->ro) {
2810                         space_info->bytes_readonly += len;
2811                         readonly = true;
2812                 }
2813                 spin_unlock(&cache->lock);
2814                 if (!readonly && return_free_space &&
2815                     global_rsv->space_info == space_info) {
2816                         u64 to_add = len;
2817
2818                         spin_lock(&global_rsv->lock);
2819                         if (!global_rsv->full) {
2820                                 to_add = min(len, global_rsv->size -
2821                                              global_rsv->reserved);
2822                                 global_rsv->reserved += to_add;
2823                                 btrfs_space_info_update_bytes_may_use(fs_info,
2824                                                 space_info, to_add);
2825                                 if (global_rsv->reserved >= global_rsv->size)
2826                                         global_rsv->full = 1;
2827                                 len -= to_add;
2828                         }
2829                         spin_unlock(&global_rsv->lock);
2830                 }
2831                 /* Add to any tickets we may have */
2832                 if (!readonly && return_free_space && len)
2833                         btrfs_try_granting_tickets(fs_info, space_info);
2834                 spin_unlock(&space_info->lock);
2835         }
2836
2837         if (cache)
2838                 btrfs_put_block_group(cache);
2839         return 0;
2840 }
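
/*
 * Note the ordering in the loop above: freed space goes back to the free
 * space cache first (under commit_root_sem and only up to
 * last_byte_to_unpin), then the pinned byte counters are dropped, then the
 * global block reserve gets first claim on the space before whatever is
 * left is used to satisfy waiting reservation tickets.
 */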
2841
2842 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
2843 {
2844         struct btrfs_fs_info *fs_info = trans->fs_info;
2845         struct btrfs_block_group *block_group, *tmp;
2846         struct list_head *deleted_bgs;
2847         struct extent_io_tree *unpin;
2848         u64 start;
2849         u64 end;
2850         int ret;
2851
2852         unpin = &trans->transaction->pinned_extents;
2853
2854         while (!TRANS_ABORTED(trans)) {
2855                 struct extent_state *cached_state = NULL;
2856
2857                 mutex_lock(&fs_info->unused_bg_unpin_mutex);
2858                 ret = find_first_extent_bit(unpin, 0, &start, &end,
2859                                             EXTENT_DIRTY, &cached_state);
2860                 if (ret) {
2861                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2862                         break;
2863                 }
2864                 if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
2865                         clear_extent_bits(&fs_info->excluded_extents, start,
2866                                           end, EXTENT_UPTODATE);
2867
2868                 if (btrfs_test_opt(fs_info, DISCARD_SYNC))
2869                         ret = btrfs_discard_extent(fs_info, start,
2870                                                    end + 1 - start, NULL);
2871
2872                 clear_extent_dirty(unpin, start, end, &cached_state);
2873                 unpin_extent_range(fs_info, start, end, true);
2874                 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2875                 free_extent_state(cached_state);
2876                 cond_resched();
2877         }
2878
2879         if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
2880                 btrfs_discard_calc_delay(&fs_info->discard_ctl);
2881                 btrfs_discard_schedule_work(&fs_info->discard_ctl, true);
2882         }
2883
2884         /*
2885          * Transaction is finished.  We don't need the lock anymore.  We
2886          * do need to clean up the block groups in case of a transaction
2887          * abort.
2888          */
2889         deleted_bgs = &trans->transaction->deleted_bgs;
2890         list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
2891                 u64 trimmed = 0;
2892
2893                 ret = -EROFS;
2894                 if (!TRANS_ABORTED(trans))
2895                         ret = btrfs_discard_extent(fs_info,
2896                                                    block_group->start,
2897                                                    block_group->length,
2898                                                    &trimmed);
2899
2900                 list_del_init(&block_group->bg_list);
2901                 btrfs_unfreeze_block_group(block_group);
2902                 btrfs_put_block_group(block_group);
2903
2904                 if (ret) {
2905                         const char *errstr = btrfs_decode_error(ret);
2906                         btrfs_warn(fs_info,
2907                            "discard failed while removing blockgroup: errno=%d %s",
2908                                    ret, errstr);
2909                 }
2910         }
2911
2912         return 0;
2913 }
2914
2915 /*
2916  * Drop one or more refs of @node.
2917  *
2918  * 1. Locate the extent refs.
2919  *    It's either inline in the EXTENT/METADATA_ITEM or in a keyed SHARED_* item.
2920  *    Locate it, then reduce the refs number or remove the ref line completely.
2921  *
2922  * 2. Update the refs count in EXTENT/METADATA_ITEM
2923  *
2924  * Inline backref case:
2925  *
2926  * in extent tree we have:
2927  *
2928  *      item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
2929  *              refs 2 gen 6 flags DATA
2930  *              extent data backref root FS_TREE objectid 258 offset 0 count 1
2931  *              extent data backref root FS_TREE objectid 257 offset 0 count 1
2932  *
2933  * This function gets called with:
2934  *
2935  *    node->bytenr = 13631488
2936  *    node->num_bytes = 1048576
2937  *    root_objectid = FS_TREE
2938  *    owner_objectid = 257
2939  *    owner_offset = 0
2940  *    refs_to_drop = 1
2941  *
2942  * Then we should get something like:
2943  *
2944  *      item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
2945  *              refs 1 gen 6 flags DATA
2946  *              extent data backref root FS_TREE objectid 258 offset 0 count 1
2947  *
2948  * Keyed backref case:
2949  *
2950  * in extent tree we have:
2951  *
2952  *      item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
2953  *              refs 754 gen 6 flags DATA
2954  *      [...]
2955  *      item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28
2956  *              extent data backref root FS_TREE objectid 866 offset 0 count 1
2957  *
2958  * This function gets called with:
2959  *
2960  *    node->bytenr = 13631488
2961  *    node->num_bytes = 1048576
2962  *    root_objectid = FS_TREE
2963  *    owner_objectid = 866
2964  *    owner_offset = 0
2965  *    refs_to_drop = 1
2966  *
2967  * Then we should get something like:
2968  *
2969  *      item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
2970  *              refs 753 gen 6 flags DATA
2971  *
2972  * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed.
2973  */
2974 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2975                                struct btrfs_delayed_ref_node *node, u64 parent,
2976                                u64 root_objectid, u64 owner_objectid,
2977                                u64 owner_offset, int refs_to_drop,
2978                                struct btrfs_delayed_extent_op *extent_op)
2979 {
2980         struct btrfs_fs_info *info = trans->fs_info;
2981         struct btrfs_key key;
2982         struct btrfs_path *path;
2983         struct btrfs_root *extent_root = info->extent_root;
2984         struct extent_buffer *leaf;
2985         struct btrfs_extent_item *ei;
2986         struct btrfs_extent_inline_ref *iref;
2987         int ret;
2988         int is_data;
2989         int extent_slot = 0;
2990         int found_extent = 0;
2991         int num_to_del = 1;
2992         u32 item_size;
2993         u64 refs;
2994         u64 bytenr = node->bytenr;
2995         u64 num_bytes = node->num_bytes;
2996         int last_ref = 0;
2997         bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
2998
2999         path = btrfs_alloc_path();
3000         if (!path)
3001                 return -ENOMEM;
3002
3003         is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
3004
3005         if (!is_data && refs_to_drop != 1) {
3006                 btrfs_crit(info,
3007 "invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u",
3008                            node->bytenr, refs_to_drop);
3009                 ret = -EINVAL;
3010                 btrfs_abort_transaction(trans, ret);
3011                 goto out;
3012         }
3013
3014         if (is_data)
3015                 skinny_metadata = false;
3016
3017         ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
3018                                     parent, root_objectid, owner_objectid,
3019                                     owner_offset);
3020         if (ret == 0) {
3021                 /*
3022                  * Either the inline backref or the SHARED_DATA_REF/
3023                  * SHARED_BLOCK_REF was found.
3024                  *
3025                  * Here is a quick path to locate the EXTENT/METADATA_ITEM.
3026                  * It's possible the EXTENT/METADATA_ITEM is near the current slot.
3027                  */
3028                 extent_slot = path->slots[0];
3029                 while (extent_slot >= 0) {
3030                         btrfs_item_key_to_cpu(path->nodes[0], &key,
3031                                               extent_slot);
3032                         if (key.objectid != bytenr)
3033                                 break;
3034                         if (key.type == BTRFS_EXTENT_ITEM_KEY &&
3035                             key.offset == num_bytes) {
3036                                 found_extent = 1;
3037                                 break;
3038                         }
3039                         if (key.type == BTRFS_METADATA_ITEM_KEY &&
3040                             key.offset == owner_objectid) {
3041                                 found_extent = 1;
3042                                 break;
3043                         }
3044
3045                         /* Quick path didn't find the EXTENT/METADATA_ITEM */
3046                         if (path->slots[0] - extent_slot > 5)
3047                                 break;
3048                         extent_slot--;
3049                 }
3050
3051                 if (!found_extent) {
3052                         if (iref) {
3053                                 btrfs_crit(info,
3054 "invalid iref, no EXTENT/METADATA_ITEM found but has inline extent ref");
3055                                 btrfs_abort_transaction(trans, -EUCLEAN);
3056                                 goto err_dump;
3057                         }
3058                         /* Must be SHARED_* item, remove the backref first */
3059                         ret = remove_extent_backref(trans, path, NULL,
3060                                                     refs_to_drop,
3061                                                     is_data, &last_ref);
3062                         if (ret) {
3063                                 btrfs_abort_transaction(trans, ret);
3064                                 goto out;
3065                         }
3066                         btrfs_release_path(path);
3067
3068                         /* Slow path to locate EXTENT/METADATA_ITEM */
3069                         key.objectid = bytenr;
3070                         key.type = BTRFS_EXTENT_ITEM_KEY;
3071                         key.offset = num_bytes;
3072
3073                         if (!is_data && skinny_metadata) {
3074                                 key.type = BTRFS_METADATA_ITEM_KEY;
3075                                 key.offset = owner_objectid;
3076                         }
3077
3078                         ret = btrfs_search_slot(trans, extent_root,
3079                                                 &key, path, -1, 1);
3080                         if (ret > 0 && skinny_metadata && path->slots[0]) {
3081                                 /*
3082                                  * Couldn't find our skinny metadata item,
3083                                  * see if we have ye olde extent item.
3084                                  */
3085                                 path->slots[0]--;
3086                                 btrfs_item_key_to_cpu(path->nodes[0], &key,
3087                                                       path->slots[0]);
3088                                 if (key.objectid == bytenr &&
3089                                     key.type == BTRFS_EXTENT_ITEM_KEY &&
3090                                     key.offset == num_bytes)
3091                                         ret = 0;
3092                         }
3093
3094                         if (ret > 0 && skinny_metadata) {
3095                                 skinny_metadata = false;
3096                                 key.objectid = bytenr;
3097                                 key.type = BTRFS_EXTENT_ITEM_KEY;
3098                                 key.offset = num_bytes;
3099                                 btrfs_release_path(path);
3100                                 ret = btrfs_search_slot(trans, extent_root,
3101                                                         &key, path, -1, 1);
3102                         }
3103
3104                         if (ret) {
3105                                 btrfs_err(info,
3106                                           "umm, got %d back from search, was looking for %llu",
3107                                           ret, bytenr);
3108                                 if (ret > 0)
3109                                         btrfs_print_leaf(path->nodes[0]);
3110                         }
3111                         if (ret < 0) {
3112                                 btrfs_abort_transaction(trans, ret);
3113                                 goto out;
3114                         }
3115                         extent_slot = path->slots[0];
3116                 }
3117         } else if (WARN_ON(ret == -ENOENT)) {
3118                 btrfs_print_leaf(path->nodes[0]);
3119                 btrfs_err(info,
3120                         "unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
3121                         bytenr, parent, root_objectid, owner_objectid,
3122                         owner_offset);
3123                 btrfs_abort_transaction(trans, ret);
3124                 goto out;
3125         } else {
3126                 btrfs_abort_transaction(trans, ret);
3127                 goto out;
3128         }
3129
3130         leaf = path->nodes[0];
3131         item_size = btrfs_item_size_nr(leaf, extent_slot);
3132         if (unlikely(item_size < sizeof(*ei))) {
3133                 ret = -EINVAL;
3134                 btrfs_print_v0_err(info);
3135                 btrfs_abort_transaction(trans, ret);
3136                 goto out;
3137         }
3138         ei = btrfs_item_ptr(leaf, extent_slot,
3139                             struct btrfs_extent_item);
3140         if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
3141             key.type == BTRFS_EXTENT_ITEM_KEY) {
3142                 struct btrfs_tree_block_info *bi;
3143                 if (item_size < sizeof(*ei) + sizeof(*bi)) {
3144                         btrfs_crit(info,
3145 "invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %zu",
3146                                    key.objectid, key.type, key.offset,
3147                                    owner_objectid, item_size,
3148                                    sizeof(*ei) + sizeof(*bi));
3149                         btrfs_abort_transaction(trans, -EUCLEAN);
3150                         goto err_dump;
3151                 }
3152                 bi = (struct btrfs_tree_block_info *)(ei + 1);
3153                 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
3154         }
3155
3156         refs = btrfs_extent_refs(leaf, ei);
3157         if (refs < refs_to_drop) {
3158                 btrfs_crit(info,
3159                 "trying to drop %d refs but we only have %llu for bytenr %llu",
3160                           refs_to_drop, refs, bytenr);
3161                 btrfs_abort_transaction(trans, -EUCLEAN);
3162                 goto err_dump;
3163         }
3164         refs -= refs_to_drop;
3165
3166         if (refs > 0) {
3167                 if (extent_op)
3168                         __run_delayed_extent_op(extent_op, leaf, ei);
3169                 /*
3170                  * In the case of an inline backref, the reference count will
3171                  * be updated by remove_extent_backref().
3172                  */
3173                 if (iref) {
3174                         if (!found_extent) {
3175                                 btrfs_crit(info,
3176 "invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found");
3177                                 btrfs_abort_transaction(trans, -EUCLEAN);
3178                                 goto err_dump;
3179                         }
3180                 } else {
3181                         btrfs_set_extent_refs(leaf, ei, refs);
3182                         btrfs_mark_buffer_dirty(leaf);
3183                 }
3184                 if (found_extent) {
3185                         ret = remove_extent_backref(trans, path, iref,
3186                                                     refs_to_drop, is_data,
3187                                                     &last_ref);
3188                         if (ret) {
3189                                 btrfs_abort_transaction(trans, ret);
3190                                 goto out;
3191                         }
3192                 }
3193         } else {
3194                 /* In this branch refs == 1 */
3195                 if (found_extent) {
3196                         if (is_data && refs_to_drop !=
3197                             extent_data_ref_count(path, iref)) {
3198                                 btrfs_crit(info,
3199                 "invalid refs_to_drop, current refs %u refs_to_drop %u",
3200                                            extent_data_ref_count(path, iref),
3201                                            refs_to_drop);
3202                                 btrfs_abort_transaction(trans, -EUCLEAN);
3203                                 goto err_dump;
3204                         }
3205                         if (iref) {
3206                                 if (path->slots[0] != extent_slot) {
3207                                         btrfs_crit(info,
3208 "invalid iref, extent item key (%llu %u %llu) doesn't have wanted iref",
3209                                                    key.objectid, key.type,
3210                                                    key.offset);
3211                                         btrfs_abort_transaction(trans, -EUCLEAN);
3212                                         goto err_dump;
3213                                 }
3214                         } else {
3215                                 /*
3216                                 /*
3217                                  * No inline ref; we must be at a SHARED_* item,
3217                                  * and since it's a single ref the layout must be:
3218                                  * |    extent_slot       ||extent_slot + 1|
3219                                  * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ]
3220                                  */
3221                                 if (path->slots[0] != extent_slot + 1) {
3222                                         btrfs_crit(info,
3223         "invalid SHARED_* item, previous item is not EXTENT/METADATA_ITEM");
3224                                         btrfs_abort_transaction(trans, -EUCLEAN);
3225                                         goto err_dump;
3226                                 }
3227                                 path->slots[0] = extent_slot;
3228                                 num_to_del = 2;
3229                         }
3230                 }
3231
3232                 last_ref = 1;
3233                 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
3234                                       num_to_del);
3235                 if (ret) {
3236                         btrfs_abort_transaction(trans, ret);
3237                         goto out;
3238                 }
3239                 btrfs_release_path(path);
3240
3241                 if (is_data) {
3242                         ret = btrfs_del_csums(trans, info->csum_root, bytenr,
3243                                               num_bytes);
3244                         if (ret) {
3245                                 btrfs_abort_transaction(trans, ret);
3246                                 goto out;
3247                         }
3248                 }
3249
3250                 ret = add_to_free_space_tree(trans, bytenr, num_bytes);
3251                 if (ret) {
3252                         btrfs_abort_transaction(trans, ret);
3253                         goto out;
3254                 }
3255
3256                 ret = btrfs_update_block_group(trans, bytenr, num_bytes, 0);
3257                 if (ret) {
3258                         btrfs_abort_transaction(trans, ret);
3259                         goto out;
3260                 }
3261         }
3262         btrfs_release_path(path);
3263
3264 out:
3265         btrfs_free_path(path);
3266         return ret;
3267 err_dump:
3268         /*
3269          * A leaf dump can take up a lot of log buffer, so we only do a full
3270          * leaf dump for debug builds.
3271          */
3272         if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
3273                 btrfs_crit(info, "path->slots[0]=%d extent_slot=%d",
3274                            path->slots[0], extent_slot);
3275                 btrfs_print_leaf(path->nodes[0]);
3276         }
3277
3278         btrfs_free_path(path);
3279         return -EUCLEAN;
3280 }
3281
3282 /*
3283  * When we free a block, it is possible (and likely) that we free the last
3284  * delayed ref for that extent as well.  This searches the delayed ref tree for
3285  * a given extent, and if there are no other delayed refs to be processed, it
3286  * removes it from the tree.
3287  */
3288 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
3289                                       u64 bytenr)
3290 {
3291         struct btrfs_delayed_ref_head *head;
3292         struct btrfs_delayed_ref_root *delayed_refs;
3293         int ret = 0;
3294
3295         delayed_refs = &trans->transaction->delayed_refs;
3296         spin_lock(&delayed_refs->lock);
3297         head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
3298         if (!head)
3299                 goto out_delayed_unlock;
3300
3301         spin_lock(&head->lock);
3302         if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
3303                 goto out;
3304
3305         if (cleanup_extent_op(head) != NULL)
3306                 goto out;
3307
3308         /*
3309          * Waiting for the lock here would deadlock.  If someone else has it
3310          * locked, they are already in the process of dropping it anyway.
3311          */
3312         if (!mutex_trylock(&head->mutex))
3313                 goto out;
3314
3315         btrfs_delete_ref_head(delayed_refs, head);
3316         head->processing = 0;
3317
3318         spin_unlock(&head->lock);
3319         spin_unlock(&delayed_refs->lock);
3320
3321         BUG_ON(head->extent_op);
3322         if (head->must_insert_reserved)
3323                 ret = 1;
3324
3325         btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
3326         mutex_unlock(&head->mutex);
3327         btrfs_put_delayed_ref_head(head);
3328         return ret;
3329 out:
3330         spin_unlock(&head->lock);
3331
3332 out_delayed_unlock:
3333         spin_unlock(&delayed_refs->lock);
3334         return 0;
3335 }
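
/*
 * The return value matters to btrfs_free_tree_block() below: 1 means the
 * delayed ref head was dropped and the extent was never inserted into the
 * extent tree (must_insert_reserved was set), so, provided the block was
 * never written out, its space can go straight back to the free space
 * cache instead of being pinned.
 */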
3336
3337 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
3338                            struct btrfs_root *root,
3339                            struct extent_buffer *buf,
3340                            u64 parent, int last_ref)
3341 {
3342         struct btrfs_fs_info *fs_info = root->fs_info;
3343         struct btrfs_ref generic_ref = { 0 };
3344         int pin = 1;
3345         int ret;
3346
3347         btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
3348                                buf->start, buf->len, parent);
3349         btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
3350                             root->root_key.objectid);
3351
3352         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
3353                 int old_ref_mod, new_ref_mod;
3354
3355                 btrfs_ref_tree_mod(fs_info, &generic_ref);
3356                 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL,
3357                                                  &old_ref_mod, &new_ref_mod);
3358                 BUG_ON(ret); /* -ENOMEM */
3359                 pin = old_ref_mod >= 0 && new_ref_mod < 0;
3360         }
3361
3362         if (last_ref && btrfs_header_generation(buf) == trans->transid) {
3363                 struct btrfs_block_group *cache;
3364
3365                 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
3366                         ret = check_ref_cleanup(trans, buf->start);
3367                         if (!ret)
3368                                 goto out;
3369                 }
3370
3371                 pin = 0;
3372                 cache = btrfs_lookup_block_group(fs_info, buf->start);
3373
3374                 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
3375                         pin_down_extent(trans, cache, buf->start, buf->len, 1);
3376                         btrfs_put_block_group(cache);
3377                         goto out;
3378                 }
3379
3380                 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
3381
3382                 btrfs_add_free_space(cache, buf->start, buf->len);
3383                 btrfs_free_reserved_bytes(cache, buf->len, 0);
3384                 btrfs_put_block_group(cache);
3385                 trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
3386         }
3387 out:
3388         if (pin)
3389                 add_pinned_bytes(fs_info, &generic_ref);
3390
3391         if (last_ref) {
3392                 /*
3393                  * Deleting the buffer, clear the corrupt flag since it doesn't
3394                  * matter anymore.
3395                  */
3396                 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
3397         }
3398 }
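
/*
 * Note the BTRFS_HEADER_FLAG_WRITTEN check above: a freed block whose
 * contents already hit disk in this transaction is pinned rather than
 * reused right away, so nothing can scribble over data that might still
 * be read before the transaction fully commits.  Only blocks that lived
 * and died purely in memory skip pinning.
 */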
3399
3400 /* Can return -ENOMEM */
3401 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
3402 {
3403         struct btrfs_fs_info *fs_info = trans->fs_info;
3404         int old_ref_mod, new_ref_mod;
3405         int ret;
3406
3407         if (btrfs_is_testing(fs_info))
3408                 return 0;
3409
3410         /*
3411          * Tree log blocks never actually go into the extent allocation
3412          * tree; just update the pinning info and exit early.
3413          */
3414         if ((ref->type == BTRFS_REF_METADATA &&
3415              ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
3416             (ref->type == BTRFS_REF_DATA &&
3417              ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
3418                 /* unlocks the pinned mutex */
3419                 btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);
3420                 old_ref_mod = new_ref_mod = 0;
3421                 ret = 0;
3422         } else if (ref->type == BTRFS_REF_METADATA) {
3423                 ret = btrfs_add_delayed_tree_ref(trans, ref, NULL,
3424                                                  &old_ref_mod, &new_ref_mod);
3425         } else {
3426                 ret = btrfs_add_delayed_data_ref(trans, ref, 0,
3427                                                  &old_ref_mod, &new_ref_mod);
3428         }
3429
3430         if (!((ref->type == BTRFS_REF_METADATA &&
3431                ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
3432               (ref->type == BTRFS_REF_DATA &&
3433                ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
3434                 btrfs_ref_tree_mod(fs_info, ref);
3435
3436         if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0)
3437                 add_pinned_bytes(fs_info, ref);
3438
3439         return ret;
3440 }
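
/*
 * A minimal caller sketch (hypothetical values; mirrors how callers build
 * a ref before dropping a data extent):
 *
 *	struct btrfs_ref ref = { 0 };
 *
 *	btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
 *			       num_bytes, parent);
 *	btrfs_init_data_ref(&ref, root_objectid, ino, file_offset);
 *	ret = btrfs_free_extent(trans, &ref);
 *
 * Nothing is freed synchronously here; the extent item itself is only
 * updated when the delayed ref is run through __btrfs_free_extent().
 */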
3441
3442 enum btrfs_loop_type {
3443         LOOP_CACHING_NOWAIT,
3444         LOOP_CACHING_WAIT,
3445         LOOP_ALLOC_CHUNK,
3446         LOOP_NO_EMPTY_SIZE,
3447 };
3448
3449 static inline void
3450 btrfs_lock_block_group(struct btrfs_block_group *cache,
3451                        int delalloc)
3452 {
3453         if (delalloc)
3454                 down_read(&cache->data_rwsem);
3455 }
3456
3457 static inline void btrfs_grab_block_group(struct btrfs_block_group *cache,
3458                        int delalloc)
3459 {
3460         btrfs_get_block_group(cache);
3461         if (delalloc)
3462                 down_read(&cache->data_rwsem);
3463 }
3464
3465 static struct btrfs_block_group *btrfs_lock_cluster(
3466                    struct btrfs_block_group *block_group,
3467                    struct btrfs_free_cluster *cluster,
3468                    int delalloc)
3469         __acquires(&cluster->refill_lock)
3470 {
3471         struct btrfs_block_group *used_bg = NULL;
3472
3473         spin_lock(&cluster->refill_lock);
3474         while (1) {
3475                 used_bg = cluster->block_group;
3476                 if (!used_bg)
3477                         return NULL;
3478
3479                 if (used_bg == block_group)
3480                         return used_bg;
3481
3482                 btrfs_get_block_group(used_bg);
3483
3484                 if (!delalloc)
3485                         return used_bg;
3486
3487                 if (down_read_trylock(&used_bg->data_rwsem))
3488                         return used_bg;
3489
3490                 spin_unlock(&cluster->refill_lock);
3491
3492                 /* We should only have one level of nesting. */
3493                 down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
3494
3495                 spin_lock(&cluster->refill_lock);
3496                 if (used_bg == cluster->block_group)
3497                         return used_bg;
3498
3499                 up_read(&used_bg->data_rwsem);
3500                 btrfs_put_block_group(used_bg);
3501         }
3502 }
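
/*
 * The loop above is careful about lock context: refill_lock is a spinlock,
 * so only a trylock on data_rwsem is allowed while holding it.  When the
 * trylock fails we drop refill_lock, sleep on data_rwsem, and must then
 * re-check that the block group we grabbed is still the cluster's block
 * group; if it changed in the meantime we drop our references and retry.
 */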
3503
3504 static inline void
3505 btrfs_release_block_group(struct btrfs_block_group *cache,
3506                          int delalloc)
3507 {
3508         if (delalloc)
3509                 up_read(&cache->data_rwsem);
3510         btrfs_put_block_group(cache);
3511 }
3512
3513 enum btrfs_extent_allocation_policy {
3514         BTRFS_EXTENT_ALLOC_CLUSTERED,
3515 };
3516
3517 /*
3518  * Structure used internally by the find_free_extent() function.  Wraps the
3519  * needed parameters.
3520  */
3521 struct find_free_extent_ctl {
3522         /* Basic allocation info */
3523         u64 num_bytes;
3524         u64 empty_size;
3525         u64 flags;
3526         int delalloc;
3527
3528         /* Where to start the search inside the bg */
3529         u64 search_start;
3530
3531         /* For clustered allocation */
3532         u64 empty_cluster;
3533         struct btrfs_free_cluster *last_ptr;
3534         bool use_cluster;
3535
3536         bool have_caching_bg;
3537         bool orig_have_caching_bg;
3538
3539         /* RAID index, converted from flags */
3540         int index;
3541
3542         /*
3543          * Current loop number; see find_free_extent_update_loop() for details.
3544          */
3545         int loop;
3546
3547         /*
3548          * Whether we're refilling a cluster; if true, re-search the current
3549          * block group but don't try to refill the cluster again.
3550          */
3551         bool retry_clustered;
3552
3553         /*
3554          * Whether we're updating the free space cache; if true, re-search the
3555          * current block group but don't update the free space cache again.
3556          */
3557         bool retry_unclustered;
3558
3559         /* If current block group is cached */
3560         int cached;
3561
3562         /* Max contiguous hole found */
3563         u64 max_extent_size;
3564
3565         /* Total free space from free space cache, not always contiguous */
3566         u64 total_free_space;
3567
3568         /* Found result */
3569         u64 found_offset;
3570
3571         /* Hint where to start looking for an empty space */
3572         u64 hint_byte;
3573
3574         /* Allocation policy */
3575         enum btrfs_extent_allocation_policy policy;
3576 };
3577
3579 /*
3580  * Helper function for find_free_extent().
3581  *
3582  * Return -ENOENT to tell the caller to fall back to unclustered mode.
3583  * Return -EAGAIN to tell the caller to re-search this block group.
3584  * Return >0 to tell the caller that we found nothing.
3585  * Return 0 when we have found a location and set ffe_ctl->found_offset.
3586  */
3587 static int find_free_extent_clustered(struct btrfs_block_group *bg,
3588                                       struct find_free_extent_ctl *ffe_ctl,
3589                                       struct btrfs_block_group **cluster_bg_ret)
3590 {
3591         struct btrfs_block_group *cluster_bg;
3592         struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3593         u64 aligned_cluster;
3594         u64 offset;
3595         int ret;
3596
3597         cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
3598         if (!cluster_bg)
3599                 goto refill_cluster;
3600         if (cluster_bg != bg && (cluster_bg->ro ||
3601             !block_group_bits(cluster_bg, ffe_ctl->flags)))
3602                 goto release_cluster;
3603
3604         offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
3605                         ffe_ctl->num_bytes, cluster_bg->start,
3606                         &ffe_ctl->max_extent_size);
3607         if (offset) {
3608                 /* We have a block, we're done */
3609                 spin_unlock(&last_ptr->refill_lock);
3610                 trace_btrfs_reserve_extent_cluster(cluster_bg,
3611                                 ffe_ctl->search_start, ffe_ctl->num_bytes);
3612                 *cluster_bg_ret = cluster_bg;
3613                 ffe_ctl->found_offset = offset;
3614                 return 0;
3615         }
3616         WARN_ON(last_ptr->block_group != cluster_bg);
3617
3618 release_cluster:
3619         /*
3620          * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new cluster, so
3621          * let's just skip it and let the allocator find whatever block it can
3622          * find. If we reach this point, we will have tried the cluster
3623          * allocator plenty of times and not have found anything, so we are
3624          * likely way too fragmented for the clustering stuff to find anything.
3625          *
3626          * However, if the cluster is taken from the current block group,
3627          * release the cluster first, so that we stand a better chance of
3628          * succeeding in the unclustered allocation.
3629          */
3630         if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
3631                 spin_unlock(&last_ptr->refill_lock);
3632                 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
3633                 return -ENOENT;
3634         }
3635
3636         /* This cluster didn't work out, free it and start over */
3637         btrfs_return_cluster_to_free_space(NULL, last_ptr);
3638
3639         if (cluster_bg != bg)
3640                 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
3641
3642 refill_cluster:
3643         if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
3644                 spin_unlock(&last_ptr->refill_lock);
3645                 return -ENOENT;
3646         }
3647
3648         aligned_cluster = max_t(u64,
3649                         ffe_ctl->empty_cluster + ffe_ctl->empty_size,
3650                         bg->full_stripe_len);
3651         ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
3652                         ffe_ctl->num_bytes, aligned_cluster);
3653         if (ret == 0) {
3654                 /* Now pull our allocation out of this cluster */
3655                 offset = btrfs_alloc_from_cluster(bg, last_ptr,
3656                                 ffe_ctl->num_bytes, ffe_ctl->search_start,
3657                                 &ffe_ctl->max_extent_size);
3658                 if (offset) {
3659                         /* We found one, proceed */
3660                         spin_unlock(&last_ptr->refill_lock);
3661                         trace_btrfs_reserve_extent_cluster(bg,
3662                                         ffe_ctl->search_start,
3663                                         ffe_ctl->num_bytes);
3664                         ffe_ctl->found_offset = offset;
3665                         return 0;
3666                 }
3667         } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
3668                    !ffe_ctl->retry_clustered) {
3669                 spin_unlock(&last_ptr->refill_lock);
3670
3671                 ffe_ctl->retry_clustered = true;
3672                 btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
3673                                 ffe_ctl->empty_cluster + ffe_ctl->empty_size);
3674                 return -EAGAIN;
3675         }
3676         /*
3677          * At this point we either didn't find a cluster or we weren't able to
3678          * allocate a block from our cluster.  Free the cluster we've been
3679          * trying to use, and go to the next block group.
3680          */
3681         btrfs_return_cluster_to_free_space(NULL, last_ptr);
3682         spin_unlock(&last_ptr->refill_lock);
3683         return 1;
3684 }
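
/*
 * Note on the refill sizing above: aligned_cluster is the larger of
 * empty_cluster + empty_size and the block group's full stripe length,
 * which keeps cluster refills from splitting a RAID5/6 full stripe.
 */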
3685
3686 /*
3687  * Return >0 to tell the caller that we found nothing.
3688  * Return 0 when we found a free extent and set ffe_ctl->found_offset.
3689  * Return -EAGAIN to tell the caller to re-search this block group.
3690  */
3691 static int find_free_extent_unclustered(struct btrfs_block_group *bg,
3692                                         struct find_free_extent_ctl *ffe_ctl)
3693 {
3694         struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3695         u64 offset;
3696
3697         /*
3698          * We are doing an unclustered allocation, set the fragmented flag so
3699          * we don't bother trying to set up a cluster again until we get more
3700          * space.
3701          */
3702         if (unlikely(last_ptr)) {
3703                 spin_lock(&last_ptr->lock);
3704                 last_ptr->fragmented = 1;
3705                 spin_unlock(&last_ptr->lock);
3706         }
3707         if (ffe_ctl->cached) {
3708                 struct btrfs_free_space_ctl *free_space_ctl;
3709
3710                 free_space_ctl = bg->free_space_ctl;
3711                 spin_lock(&free_space_ctl->tree_lock);
3712                 if (free_space_ctl->free_space <
3713                     ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
3714                     ffe_ctl->empty_size) {
3715                         ffe_ctl->total_free_space = max_t(u64,
3716                                         ffe_ctl->total_free_space,
3717                                         free_space_ctl->free_space);
3718                         spin_unlock(&free_space_ctl->tree_lock);
3719                         return 1;
3720                 }
3721                 spin_unlock(&free_space_ctl->tree_lock);
3722         }
3723
3724         offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
3725                         ffe_ctl->num_bytes, ffe_ctl->empty_size,
3726                         &ffe_ctl->max_extent_size);
3727
3728         /*
3729          * If we didn't find a chunk, and we haven't failed on this block group
3730          * before, and this block group is in the middle of caching and we are
3731          * ok with waiting, then go ahead and wait for progress to be made, and
3732          * set @retry_unclustered to true.
3733          *
3734          * If @retry_unclustered is true then we've already waited on this
3735          * block group once and should move on to the next block group.
3736          */
3737         if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
3738             ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
3739                 btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
3740                                                       ffe_ctl->empty_size);
3741                 ffe_ctl->retry_unclustered = true;
3742                 return -EAGAIN;
3743         } else if (!offset) {
3744                 return 1;
3745         }
3746         ffe_ctl->found_offset = offset;
3747         return 0;
3748 }
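
/*
 * The ffe_ctl->cached check above is a cheap bail-out: if the block group's
 * own counters say it cannot possibly hold num_bytes + empty_cluster +
 * empty_size, we just record the free space we saw in total_free_space
 * (a fallback for max_extent_size reporting) and move on without searching.
 */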
3749
3750 static int do_allocation_clustered(struct btrfs_block_group *block_group,
3751                                    struct find_free_extent_ctl *ffe_ctl,
3752                                    struct btrfs_block_group **bg_ret)
3753 {
3754         int ret;
3755
3756         /* We want to try and use the cluster allocator, so let's look there */
3757         if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) {
3758                 ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret);
3759                 if (ret >= 0 || ret == -EAGAIN)
3760                         return ret;
3761                 /* ret == -ENOENT case falls through */
3762         }
3763
3764         return find_free_extent_unclustered(block_group, ffe_ctl);
3765 }
3766
3767 static int do_allocation(struct btrfs_block_group *block_group,
3768                          struct find_free_extent_ctl *ffe_ctl,
3769                          struct btrfs_block_group **bg_ret)
3770 {
3771         switch (ffe_ctl->policy) {
3772         case BTRFS_EXTENT_ALLOC_CLUSTERED:
3773                 return do_allocation_clustered(block_group, ffe_ctl, bg_ret);
3774         default:
3775                 BUG();
3776         }
3777 }
3778
3779 static void release_block_group(struct btrfs_block_group *block_group,
3780                                 struct find_free_extent_ctl *ffe_ctl,
3781                                 int delalloc)
3782 {
3783         switch (ffe_ctl->policy) {
3784         case BTRFS_EXTENT_ALLOC_CLUSTERED:
3785                 ffe_ctl->retry_clustered = false;
3786                 ffe_ctl->retry_unclustered = false;
3787                 break;
3788         default:
3789                 BUG();
3790         }
3791
3792         BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
3793                ffe_ctl->index);
3794         btrfs_release_block_group(block_group, delalloc);
3795 }
3796
3797 static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl,
3798                                    struct btrfs_key *ins)
3799 {
3800         struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3801
3802         if (!ffe_ctl->use_cluster && last_ptr) {
3803                 spin_lock(&last_ptr->lock);
3804                 last_ptr->window_start = ins->objectid;
3805                 spin_unlock(&last_ptr->lock);
3806         }
3807 }
3808
3809 static void found_extent(struct find_free_extent_ctl *ffe_ctl,
3810                          struct btrfs_key *ins)
3811 {
3812         switch (ffe_ctl->policy) {
3813         case BTRFS_EXTENT_ALLOC_CLUSTERED:
3814                 found_extent_clustered(ffe_ctl, ins);
3815                 break;
3816         default:
3817                 BUG();
3818         }
3819 }
3820
3821 static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl)
3822 {
3823         switch (ffe_ctl->policy) {
3824         case BTRFS_EXTENT_ALLOC_CLUSTERED:
3825                 /*
3826                  * If we can't allocate a new chunk we've already looped through
3827                  * at least once, so move on to the NO_EMPTY_SIZE case.
3828                  */
3829                 ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
3830                 return 0;
3831         default:
3832                 BUG();
3833         }
3834 }
3835
3836 /*
3837  * Return >0 means the caller needs to re-search for a free extent.
3838  * Return 0 means we have the needed free extent.
3839  * Return <0 means we failed to locate any free extent.
3840  */
3841 static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
3842                                         struct btrfs_key *ins,
3843                                         struct find_free_extent_ctl *ffe_ctl,
3844                                         bool full_search)
3845 {
3846         struct btrfs_root *root = fs_info->extent_root;
3847         int ret;
3848
3849         if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
3850             ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
3851                 ffe_ctl->orig_have_caching_bg = true;
3852
3853         if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT &&
3854             ffe_ctl->have_caching_bg)
3855                 return 1;
3856
3857         if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES)
3858                 return 1;
3859
3860         if (ins->objectid) {
3861                 found_extent(ffe_ctl, ins);
3862                 return 0;
3863         }
3864
3865         /*
3866          * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
3867          *                      caching kthreads as we move along
3868          * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
3869          * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
3870          * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
3871          *                     again
3872          */
3873         if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
3874                 ffe_ctl->index = 0;
3875                 if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) {
3876                         /*
3877                          * We want to skip the LOOP_CACHING_WAIT step if we
3878                          * don't have any uncached bgs and we've already done a
3879                          * full search through them all.
3880                          */
3881                         if (ffe_ctl->orig_have_caching_bg || !full_search)
3882                                 ffe_ctl->loop = LOOP_CACHING_WAIT;
3883                         else
3884                                 ffe_ctl->loop = LOOP_ALLOC_CHUNK;
3885                 } else {
3886                         ffe_ctl->loop++;
3887                 }
3888
3889                 if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
3890                         struct btrfs_trans_handle *trans;
3891                         int exist = 0;
3892
3893                         trans = current->journal_info;
3894                         if (trans)
3895                                 exist = 1;
3896                         else
3897                                 trans = btrfs_join_transaction(root);
3898
3899                         if (IS_ERR(trans)) {
3900                                 ret = PTR_ERR(trans);
3901                                 return ret;
3902                         }
3903
3904                         ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
3905                                                 CHUNK_ALLOC_FORCE);
3906
3907                         /* Do not bail out on ENOSPC since we can do more. */
3908                         if (ret == -ENOSPC)
3909                                 ret = chunk_allocation_failed(ffe_ctl);
3910                         else if (ret < 0)
3911                                 btrfs_abort_transaction(trans, ret);
3912                         else
3913                                 ret = 0;
3914                         if (!exist)
3915                                 btrfs_end_transaction(trans);
3916                         if (ret)
3917                                 return ret;
3918                 }
3919
3920                 if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
3921                         if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED)
3922                                 return -ENOSPC;
3923
3924                         /*
3925                          * Don't loop again if we already have no empty_size and
3926                          * no empty_cluster.
3927                          */
3928                         if (ffe_ctl->empty_size == 0 &&
3929                             ffe_ctl->empty_cluster == 0)
3930                                 return -ENOSPC;
3931                         ffe_ctl->empty_size = 0;
3932                         ffe_ctl->empty_cluster = 0;
3933                 }
3934                 return 1;
3935         }
3936         return -ENOSPC;
3937 }
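
/*
 * The transaction juggling in LOOP_ALLOC_CHUNK above deserves a note: if
 * the caller already runs inside a transaction (current->journal_info is
 * set) we must reuse it, otherwise we join a new one just for the chunk
 * allocation and end it before returning, so the allocator itself never
 * holds a transaction open across a retry.
 */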
3938
3939 static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
3940                                         struct find_free_extent_ctl *ffe_ctl,
3941                                         struct btrfs_space_info *space_info,
3942                                         struct btrfs_key *ins)
3943 {
3944         /*
3945          * If our free space is heavily fragmented we may not be able to make
3946          * big contiguous allocations, so instead of doing the expensive search
3947          * for free space, simply return ENOSPC with our max_extent_size so we
3948          * can go ahead and search for a more manageable chunk.
3949          *
3950          * If our max_extent_size is large enough for our allocation simply
3951          * disable clustering, since we will likely not be able to find enough
3952          * space to create a cluster and would only induce latency trying.
3953          */
3954         if (space_info->max_extent_size) {
3955                 spin_lock(&space_info->lock);
3956                 if (space_info->max_extent_size &&
3957                     ffe_ctl->num_bytes > space_info->max_extent_size) {
3958                         ins->offset = space_info->max_extent_size;
3959                         spin_unlock(&space_info->lock);
3960                         return -ENOSPC;
3961                 } else if (space_info->max_extent_size) {
3962                         ffe_ctl->use_cluster = false;
3963                 }
3964                 spin_unlock(&space_info->lock);
3965         }
3966
3967         ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info,
3968                                                &ffe_ctl->empty_cluster);
3969         if (ffe_ctl->last_ptr) {
3970                 struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3971
3972                 spin_lock(&last_ptr->lock);
3973                 if (last_ptr->block_group)
3974                         ffe_ctl->hint_byte = last_ptr->window_start;
3975                 if (last_ptr->fragmented) {
3976                         /*
3977                          * We still set window_start so we can keep track of the
3978                          * last place we found an allocation to try and save
3979                          * some time.
3980                          */
3981                         ffe_ctl->hint_byte = last_ptr->window_start;
3982                         ffe_ctl->use_cluster = false;
3983                 }
3984                 spin_unlock(&last_ptr->lock);
3985         }
3986
3987         return 0;
3988 }
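
/*
 * The max_extent_size handling above is the usual lockless check /
 * locked re-check pattern: the first, unlocked read only avoids taking
 * space_info->lock in the common case, and the value is read again
 * under the lock before it is trusted.  A minimal sketch of the same
 * pattern (the names "obj", "hint" and "want" are hypothetical):
 *
 *	if (READ_ONCE(obj->hint)) {		// cheap, racy early out
 *		spin_lock(&obj->lock);
 *		if (obj->hint && want > obj->hint) {	// recheck under lock
 *			spin_unlock(&obj->lock);
 *			return -ENOSPC;
 *		}
 *		spin_unlock(&obj->lock);
 *	}
 */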
3989
3990 static int prepare_allocation(struct btrfs_fs_info *fs_info,
3991                               struct find_free_extent_ctl *ffe_ctl,
3992                               struct btrfs_space_info *space_info,
3993                               struct btrfs_key *ins)
3994 {
3995         switch (ffe_ctl->policy) {
3996         case BTRFS_EXTENT_ALLOC_CLUSTERED:
3997                 return prepare_allocation_clustered(fs_info, ffe_ctl,
3998                                                     space_info, ins);
3999         default:
4000                 BUG();
4001         }
4002 }
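
/*
 * prepare_allocation() is a plain dispatch on ffe_ctl->policy.  Only
 * BTRFS_EXTENT_ALLOC_CLUSTERED exists here, and an unknown policy is a
 * programming error, hence the BUG().  A hypothetical second policy
 * would just become another case (the names below are made up):
 *
 *	case BTRFS_EXTENT_ALLOC_FOO:
 *		return prepare_allocation_foo(fs_info, ffe_ctl,
 *					      space_info, ins);
 */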
4003
4004 /*
4005  * Walks the btree of allocated extents and finds a hole of a given size.
4006  * The key ins is changed to record the hole:
4007  * ins->objectid == start position
4008  * ins->type == BTRFS_EXTENT_ITEM_KEY
4009  * ins->offset == the size of the hole.
4010  * Any available blocks before search_start are skipped.
4011  *
4012  * If there is no suitable free space, the size of the largest free
4013  * extent seen during the search is recorded instead.
4014  *
4015  * The overall logic and call chain:
4016  *
4017  * find_free_extent()
4018  * |- Iterate through all block groups
4019  * |  |- Get a valid block group
4020  * |  |- Try to do clustered allocation in that block group
4021  * |  |- Try to do unclustered allocation in that block group
4022  * |  |- Check if the result is valid
4023  * |  |  |- If valid, then exit
4024  * |  |- Jump to next block group
4025  * |
4026  * |- Push harder to find free extents
4027  *    |- If not found, re-iterate all block groups
4028  */
4029 static noinline int find_free_extent(struct btrfs_root *root,
4030                                 u64 ram_bytes, u64 num_bytes, u64 empty_size,
4031                                 u64 hint_byte_orig, struct btrfs_key *ins,
4032                                 u64 flags, int delalloc)
4033 {
4034         struct btrfs_fs_info *fs_info = root->fs_info;
4035         int ret = 0;
4036         int cache_block_group_error = 0;
4037         struct btrfs_block_group *block_group = NULL;
4038         struct find_free_extent_ctl ffe_ctl = {0};
4039         struct btrfs_space_info *space_info;
4040         bool full_search = false;
4041
4042         WARN_ON(num_bytes < fs_info->sectorsize);
4043
4044         ffe_ctl.num_bytes = num_bytes;
4045         ffe_ctl.empty_size = empty_size;
4046         ffe_ctl.flags = flags;
4047         ffe_ctl.search_start = 0;
4048         ffe_ctl.delalloc = delalloc;
4049         ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
4050         ffe_ctl.have_caching_bg = false;
4051         ffe_ctl.orig_have_caching_bg = false;
4052         ffe_ctl.found_offset = 0;
4053         ffe_ctl.hint_byte = hint_byte_orig;
4054         ffe_ctl.policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
4055
4056         /* For clustered allocation */
4057         ffe_ctl.retry_clustered = false;
4058         ffe_ctl.retry_unclustered = false;
4059         ffe_ctl.last_ptr = NULL;
4060         ffe_ctl.use_cluster = true;
4061
4062         ins->type = BTRFS_EXTENT_ITEM_KEY;
4063         ins->objectid = 0;
4064         ins->offset = 0;
4065
4066         trace_find_free_extent(root, num_bytes, empty_size, flags);
4067
4068         space_info = btrfs_find_space_info(fs_info, flags);
4069         if (!space_info) {
4070                 btrfs_err(fs_info, "No space info for %llu", flags);
4071                 return -ENOSPC;
4072         }
4073
4074         ret = prepare_allocation(fs_info, &ffe_ctl, space_info, ins);
4075         if (ret < 0)
4076                 return ret;
4077
4078         ffe_ctl.search_start = max(ffe_ctl.search_start,
4079                                    first_logical_byte(fs_info, 0));
4080         ffe_ctl.search_start = max(ffe_ctl.search_start, ffe_ctl.hint_byte);
4081         if (ffe_ctl.search_start == ffe_ctl.hint_byte) {
4082                 block_group = btrfs_lookup_block_group(fs_info,
4083                                                        ffe_ctl.search_start);
4084                 /*
4085                  * We don't want to use the block group if it doesn't match our
4086                  * allocation bits, or if it's not cached.
4087                  *
4088                  * However, if we are re-searching with an ideal block group
4089                  * picked out, then we don't care whether the block group is cached.
4090                  */
4091                 if (block_group && block_group_bits(block_group, flags) &&
4092                     block_group->cached != BTRFS_CACHE_NO) {
4093                         down_read(&space_info->groups_sem);
4094                         if (list_empty(&block_group->list) ||
4095                             block_group->ro) {
4096                                 /*
4097                                  * Someone is removing this block group;
4098                                  * we can't jump to the have_block_group
4099                                  * label because our list pointers are
4100                                  * not valid.
4101                                  */
4102                                 btrfs_put_block_group(block_group);
4103                                 up_read(&space_info->groups_sem);
4104                         } else {
4105                                 ffe_ctl.index = btrfs_bg_flags_to_raid_index(
4106                                                 block_group->flags);
4107                                 btrfs_lock_block_group(block_group, delalloc);
4108                                 goto have_block_group;
4109                         }
4110                 } else if (block_group) {
4111                         btrfs_put_block_group(block_group);
4112                 }
4113         }
4114 search:
4115         ffe_ctl.have_caching_bg = false;
4116         if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) ||
4117             ffe_ctl.index == 0)
4118                 full_search = true;
4119         down_read(&space_info->groups_sem);
4120         list_for_each_entry(block_group,
4121                             &space_info->block_groups[ffe_ctl.index], list) {
4122                 struct btrfs_block_group *bg_ret;
4123
4124                 /* If the block group is read-only, we can skip it entirely. */
4125                 if (unlikely(block_group->ro))
4126                         continue;
4127
4128                 btrfs_grab_block_group(block_group, delalloc);
4129                 ffe_ctl.search_start = block_group->start;
4130
4131                 /*
4132                  * this can happen if we end up cycling through all the
4133                  * raid types, but we want to make sure we only allocate
4134                  * for the proper type.
4135                  */
4136                 if (!block_group_bits(block_group, flags)) {
4137                         u64 extra = BTRFS_BLOCK_GROUP_DUP |
4138                                 BTRFS_BLOCK_GROUP_RAID1_MASK |
4139                                 BTRFS_BLOCK_GROUP_RAID56_MASK |
4140                                 BTRFS_BLOCK_GROUP_RAID10;
4141
4142                         /*
4143                          * if they asked for extra copies and this block group
4144                          * doesn't provide them, bail.  This does allow us to
4145                          * fill raid0 from raid1.
4146                          */
4147                         if ((flags & extra) && !(block_group->flags & extra))
4148                                 goto loop;
4149
4150                         /*
4151                          * This block group has different flags than we want.
4152                          * It's possible that we have MIXED_GROUP flag but no
4153                          * block group is mixed.  Just skip such block group.
4154                          */
4155                         btrfs_release_block_group(block_group, delalloc);
4156                         continue;
4157                 }
4158
4159 have_block_group:
4160                 ffe_ctl.cached = btrfs_block_group_done(block_group);
4161                 if (unlikely(!ffe_ctl.cached)) {
4162                         ffe_ctl.have_caching_bg = true;
4163                         ret = btrfs_cache_block_group(block_group, 0);
4164
4165                         /*
4166                          * If we get ENOMEM here or something else we want to
4167                          * try other block groups, because it may not be fatal.
4168                          * However if we can't find anything else we need to
4169                          * save our return here so that we return the actual
4170                          * error that caused problems, not ENOSPC.
4171                          */
4172                         if (ret < 0) {
4173                                 if (!cache_block_group_error)
4174                                         cache_block_group_error = ret;
4175                                 ret = 0;
4176                                 goto loop;
4177                         }
4178                         ret = 0;
4179                 }
4180
4181                 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
4182                         goto loop;
4183
4184                 bg_ret = NULL;
4185                 ret = do_allocation(block_group, &ffe_ctl, &bg_ret);
4186                 if (ret == 0) {
4187                         if (bg_ret && bg_ret != block_group) {
4188                                 btrfs_release_block_group(block_group, delalloc);
4189                                 block_group = bg_ret;
4190                         }
4191                 } else if (ret == -EAGAIN) {
4192                         goto have_block_group;
4193                 } else if (ret > 0) {
4194                         goto loop;
4195                 }
4196
4197                 /* Sanity check and adjust the allocation we found */
4198                 ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
4199                                              fs_info->stripesize);
4200
4201                 /* move on to the next group */
4202                 if (ffe_ctl.search_start + num_bytes >
4203                     block_group->start + block_group->length) {
4204                         btrfs_add_free_space(block_group, ffe_ctl.found_offset,
4205                                              num_bytes);
4206                         goto loop;
4207                 }
4208
4209                 if (ffe_ctl.found_offset < ffe_ctl.search_start)
4210                         btrfs_add_free_space(block_group, ffe_ctl.found_offset,
4211                                 ffe_ctl.search_start - ffe_ctl.found_offset);
4212
4213                 ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
4214                                 num_bytes, delalloc);
4215                 if (ret == -EAGAIN) {
4216                         btrfs_add_free_space(block_group, ffe_ctl.found_offset,
4217                                              num_bytes);
4218                         goto loop;
4219                 }
4220                 btrfs_inc_block_group_reservations(block_group);
4221
4222                 /* We are all good, let's return */
4223                 ins->objectid = ffe_ctl.search_start;
4224                 ins->offset = num_bytes;
4225
4226                 trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start,
4227                                            num_bytes);
4228                 btrfs_release_block_group(block_group, delalloc);
4229                 break;
4230 loop:
4231                 release_block_group(block_group, &ffe_ctl, delalloc);
4232                 cond_resched();
4233         }
4234         up_read(&space_info->groups_sem);
4235
4236         ret = find_free_extent_update_loop(fs_info, ins, &ffe_ctl, full_search);
4237         if (ret > 0)
4238                 goto search;
4239
4240         if (ret == -ENOSPC && !cache_block_group_error) {
4241                 /*
4242                  * Use ffe_ctl.total_free_space as a fallback if we can't find
4243                  * any contiguous hole.
4244                  */
4245                 if (!ffe_ctl.max_extent_size)
4246                         ffe_ctl.max_extent_size = ffe_ctl.total_free_space;
4247                 spin_lock(&space_info->lock);
4248                 space_info->max_extent_size = ffe_ctl.max_extent_size;
4249                 spin_unlock(&space_info->lock);
4250                 ins->offset = ffe_ctl.max_extent_size;
4251         } else if (ret == -ENOSPC) {
4252                 ret = cache_block_group_error;
4253         }
4254         return ret;
4255 }
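
/*
 * Worked example of the post-allocation fixup above: with
 * fs_info->stripesize == 4096 and ffe_ctl.found_offset == 12289,
 * round_up() gives ffe_ctl.search_start == 16384, the 12289..16383 gap
 * is handed back to the free space cache via btrfs_add_free_space(),
 * and on success ins is filled as ins->objectid = 16384 and
 * ins->offset = num_bytes.
 */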
4256
4257 /*
4258  * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
4259  *                        hole that is at least as big as @num_bytes.
4260  *
4261  * @root           -    The root that will contain this extent
4262  *
4263  * @ram_bytes      -    The amount of space in RAM that @num_bytes takes. This
4264  *                      is used for accounting purposes. This value differs
4265  *                      from @num_bytes only in the case of compressed extents.
4266  *
4267  * @num_bytes      -    Number of bytes to allocate on-disk.
4268  *
4269  * @min_alloc_size -    Indicates the minimum amount of space that the
4270  *                      allocator should try to satisfy. In some cases
4271  *                      @num_bytes may be larger than what is required and if
4272  *                      the filesystem is fragmented then allocation fails.
4273  *                      However, the presence of @min_alloc_size gives a
4274  *                      chance to try and satisfy the smaller allocation.
4275  *
4276  * @empty_size     -    A hint that you plan on doing more COW. This is the
4277  *                      size in bytes the allocator should try to find free
4278  *                      next to the block it returns.  This is just a hint and
4279  *                      may be ignored by the allocator.
4280  *
4281  * @hint_byte      -    Hint to the allocator to start searching above the byte
4282  *                      address passed. It might be ignored.
4283  *
4284  * @ins            -    This key is modified to record the found hole. It will
4285  *                      have the following values:
4286  *                      ins->objectid == start position
4287  *                      ins->type == BTRFS_EXTENT_ITEM_KEY
4288  *                      ins->offset == the size of the hole.
4289  *
4290  * @is_data        -    Boolean flag indicating whether an extent is
4291  *                      allocated for data (true) or metadata (false)
4292  *
4293  * @delalloc       -    Boolean flag indicating whether this allocation is for
4294  *                      delalloc or not. If 'true' data_rwsem of block groups
4295  *                      is going to be acquired.
4296  *
4297  *
4298  * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
4299  * case -ENOSPC is returned then @ins->offset will contain the size of the
4300  * largest available hole the allocator managed to find.
4301  */
4302 int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
4303                          u64 num_bytes, u64 min_alloc_size,
4304                          u64 empty_size, u64 hint_byte,
4305                          struct btrfs_key *ins, int is_data, int delalloc)
4306 {
4307         struct btrfs_fs_info *fs_info = root->fs_info;
4308         bool final_tried = num_bytes == min_alloc_size;
4309         u64 flags;
4310         int ret;
4311
4312         flags = get_alloc_profile_by_root(root, is_data);
4313 again:
4314         WARN_ON(num_bytes < fs_info->sectorsize);
4315         ret = find_free_extent(root, ram_bytes, num_bytes, empty_size,
4316                                hint_byte, ins, flags, delalloc);
4317         if (!ret && !is_data) {
4318                 btrfs_dec_block_group_reservations(fs_info, ins->objectid);
4319         } else if (ret == -ENOSPC) {
4320                 if (!final_tried && ins->offset) {
4321                         num_bytes = min(num_bytes >> 1, ins->offset);
4322                         num_bytes = round_down(num_bytes,
4323                                                fs_info->sectorsize);
4324                         num_bytes = max(num_bytes, min_alloc_size);
4325                         ram_bytes = num_bytes;
4326                         if (num_bytes == min_alloc_size)
4327                                 final_tried = true;
4328                         goto again;
4329                 } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
4330                         struct btrfs_space_info *sinfo;
4331
4332                         sinfo = btrfs_find_space_info(fs_info, flags);
4333                         btrfs_err(fs_info,
4334                                   "allocation failed flags %llu, wanted %llu",
4335                                   flags, num_bytes);
4336                         if (sinfo)
4337                                 btrfs_dump_space_info(fs_info, sinfo,
4338                                                       num_bytes, 1);
4339                 }
4340         }
4341
4342         return ret;
4343 }
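
/*
 * Sketch of a typical call (a made-up caller; error handling trimmed):
 * ask for up to 1MiB but let the allocator fall back, halving down to
 * a single sector:
 *
 *	struct btrfs_key ins = { 0 };
 *	u64 max_hole;
 *	int ret;
 *
 *	ret = btrfs_reserve_extent(root, SZ_1M, SZ_1M,
 *				   fs_info->sectorsize, 0, 0, &ins, 1, 0);
 *	if (ret == -ENOSPC)
 *		max_hole = ins.offset;	// largest hole the search saw
 *
 * On the first -ENOSPC with a non-zero ins->offset, num_bytes is halved
 * (clamped by ins->offset, rounded down to the sectorsize and bounded
 * below by min_alloc_size) and the search is retried, so one call can
 * mean "give me up to 1MiB, but at least one sector".
 */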
4344
4345 int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
4346                                u64 start, u64 len, int delalloc)
4347 {
4348         struct btrfs_block_group *cache;
4349
4350         cache = btrfs_lookup_block_group(fs_info, start);
4351         if (!cache) {
4352                 btrfs_err(fs_info, "Unable to find block group for %llu",
4353                           start);
4354                 return -ENOSPC;
4355         }
4356
4357         btrfs_add_free_space(cache, start, len);
4358         btrfs_free_reserved_bytes(cache, len, delalloc);
4359         trace_btrfs_reserved_extent_free(fs_info, start, len);
4360
4361         btrfs_put_block_group(cache);
4362         return 0;
4363 }
4364
4365 int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,
4366                               u64 len)
4367 {
4368         struct btrfs_block_group *cache;
4369         int ret = 0;
4370
4371         cache = btrfs_lookup_block_group(trans->fs_info, start);
4372         if (!cache) {
4373                 btrfs_err(trans->fs_info, "unable to find block group for %llu",
4374                           start);
4375                 return -ENOSPC;
4376         }
4377
4378         ret = pin_down_extent(trans, cache, start, len, 1);
4379         btrfs_put_block_group(cache);
4380         return ret;
4381 }
4382
4383 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4384                                       u64 parent, u64 root_objectid,
4385                                       u64 flags, u64 owner, u64 offset,
4386                                       struct btrfs_key *ins, int ref_mod)
4387 {
4388         struct btrfs_fs_info *fs_info = trans->fs_info;
4389         int ret;
4390         struct btrfs_extent_item *extent_item;
4391         struct btrfs_extent_inline_ref *iref;
4392         struct btrfs_path *path;
4393         struct extent_buffer *leaf;
4394         int type;
4395         u32 size;
4396
4397         if (parent > 0)
4398                 type = BTRFS_SHARED_DATA_REF_KEY;
4399         else
4400                 type = BTRFS_EXTENT_DATA_REF_KEY;
4401
4402         size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
4403
4404         path = btrfs_alloc_path();
4405         if (!path)
4406                 return -ENOMEM;
4407
4408         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
4409                                       ins, size);
4410         if (ret) {
4411                 btrfs_free_path(path);
4412                 return ret;
4413         }
4414
4415         leaf = path->nodes[0];
4416         extent_item = btrfs_item_ptr(leaf, path->slots[0],
4417                                      struct btrfs_extent_item);
4418         btrfs_set_extent_refs(leaf, extent_item, ref_mod);
4419         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4420         btrfs_set_extent_flags(leaf, extent_item,
4421                                flags | BTRFS_EXTENT_FLAG_DATA);
4422
4423         iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
4424         btrfs_set_extent_inline_ref_type(leaf, iref, type);
4425         if (parent > 0) {
4426                 struct btrfs_shared_data_ref *ref;
4427                 ref = (struct btrfs_shared_data_ref *)(iref + 1);
4428                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
4429                 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
4430         } else {
4431                 struct btrfs_extent_data_ref *ref;
4432                 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
4433                 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
4434                 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
4435                 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
4436                 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
4437         }
4438
4439         btrfs_mark_buffer_dirty(leaf);
4440         btrfs_free_path(path);
4441
4442         ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset);
4443         if (ret)
4444                 return ret;
4445
4446         ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, 1);
4447         if (ret) { /* -ENOENT, logic error */
4448                 btrfs_err(fs_info, "update block group failed for %llu %llu",
4449                         ins->objectid, ins->offset);
4450                 BUG();
4451         }
4452         trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
4453         return ret;
4454 }
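
/*
 * The extent item created above is packed inline; for the keyed
 * (parent == 0) case the item body is, in order:
 *
 *	struct btrfs_extent_item	refs / generation / flags|DATA
 *	struct btrfs_extent_inline_ref	type = BTRFS_EXTENT_DATA_REF_KEY
 *	struct btrfs_extent_data_ref	root / objectid / offset / count
 *
 * For the shared (parent > 0) case the data ref is replaced by a
 * struct btrfs_shared_data_ref and the parent bytenr is stored in the
 * inline ref offset instead.
 */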
4455
4456 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
4457                                      struct btrfs_delayed_ref_node *node,
4458                                      struct btrfs_delayed_extent_op *extent_op)
4459 {
4460         struct btrfs_fs_info *fs_info = trans->fs_info;
4461         int ret;
4462         struct btrfs_extent_item *extent_item;
4463         struct btrfs_key extent_key;
4464         struct btrfs_tree_block_info *block_info;
4465         struct btrfs_extent_inline_ref *iref;
4466         struct btrfs_path *path;
4467         struct extent_buffer *leaf;
4468         struct btrfs_delayed_tree_ref *ref;
4469         u32 size = sizeof(*extent_item) + sizeof(*iref);
4470         u64 num_bytes;
4471         u64 flags = extent_op->flags_to_set;
4472         bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
4473
4474         ref = btrfs_delayed_node_to_tree_ref(node);
4475
4476         extent_key.objectid = node->bytenr;
4477         if (skinny_metadata) {
4478                 extent_key.offset = ref->level;
4479                 extent_key.type = BTRFS_METADATA_ITEM_KEY;
4480                 num_bytes = fs_info->nodesize;
4481         } else {
4482                 extent_key.offset = node->num_bytes;
4483                 extent_key.type = BTRFS_EXTENT_ITEM_KEY;
4484                 size += sizeof(*block_info);
4485                 num_bytes = node->num_bytes;
4486         }
4487
4488         path = btrfs_alloc_path();
4489         if (!path)
4490                 return -ENOMEM;
4491
4492         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
4493                                       &extent_key, size);
4494         if (ret) {
4495                 btrfs_free_path(path);
4496                 return ret;
4497         }
4498
4499         leaf = path->nodes[0];
4500         extent_item = btrfs_item_ptr(leaf, path->slots[0],
4501                                      struct btrfs_extent_item);
4502         btrfs_set_extent_refs(leaf, extent_item, 1);
4503         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4504         btrfs_set_extent_flags(leaf, extent_item,
4505                                flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
4506
4507         if (skinny_metadata) {
4508                 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
4509         } else {
4510                 block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
4511                 btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
4512                 btrfs_set_tree_block_level(leaf, block_info, ref->level);
4513                 iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
4514         }
4515
4516         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
4517                 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
4518                 btrfs_set_extent_inline_ref_type(leaf, iref,
4519                                                  BTRFS_SHARED_BLOCK_REF_KEY);
4520                 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
4521         } else {
4522                 btrfs_set_extent_inline_ref_type(leaf, iref,
4523                                                  BTRFS_TREE_BLOCK_REF_KEY);
4524                 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
4525         }
4526
4527         btrfs_mark_buffer_dirty(leaf);
4528         btrfs_free_path(path);
4529
4530         ret = remove_from_free_space_tree(trans, extent_key.objectid,
4531                                           num_bytes);
4532         if (ret)
4533                 return ret;
4534
4535         ret = btrfs_update_block_group(trans, extent_key.objectid,
4536                                        fs_info->nodesize, 1);
4537         if (ret) { /* -ENOENT, logic error */
4538                 btrfs_err(fs_info, "update block group failed for %llu %llu",
4539                         extent_key.objectid, extent_key.offset);
4540                 BUG();
4541         }
4542
4543         trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid,
4544                                           fs_info->nodesize);
4545         return ret;
4546 }
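
/*
 * Example of the two key shapes produced above, for a tree block at
 * bytenr 30408704, level 1, with a 16KiB nodesize:
 *
 *	skinny: (30408704, BTRFS_METADATA_ITEM_KEY, 1)
 *		body: extent_item + inline ref
 *	legacy: (30408704, BTRFS_EXTENT_ITEM_KEY, 16384)
 *		body: extent_item + tree_block_info + inline ref
 *
 * With SKINNY_METADATA the block size is implied by fs_info->nodesize,
 * which frees the key offset to carry the level instead.
 */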
4547
4548 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4549                                      struct btrfs_root *root, u64 owner,
4550                                      u64 offset, u64 ram_bytes,
4551                                      struct btrfs_key *ins)
4552 {
4553         struct btrfs_ref generic_ref = { 0 };
4554         int ret;
4555
4556         BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
4557
4558         btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
4559                                ins->objectid, ins->offset, 0);
4560         btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset);
4561         btrfs_ref_tree_mod(root->fs_info, &generic_ref);
4562         ret = btrfs_add_delayed_data_ref(trans, &generic_ref,
4563                                          ram_bytes, NULL, NULL);
4564         return ret;
4565 }
4566
4567 /*
4568  * This is used by the tree logging recovery code.  It records that
4569  * an extent has been allocated and makes sure to clear the free
4570  * space cache bits as well.
4571  */
4572 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
4573                                    u64 root_objectid, u64 owner, u64 offset,
4574                                    struct btrfs_key *ins)
4575 {
4576         struct btrfs_fs_info *fs_info = trans->fs_info;
4577         int ret;
4578         struct btrfs_block_group *block_group;
4579         struct btrfs_space_info *space_info;
4580
4581         /*
4582          * Mixed block groups will exclude before processing the log so we only
4583          * need to do the exclude dance if this fs isn't mixed.
4584          */
4585         if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
4586                 ret = __exclude_logged_extent(fs_info, ins->objectid,
4587                                               ins->offset);
4588                 if (ret)
4589                         return ret;
4590         }
4591
4592         block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
4593         if (!block_group)
4594                 return -EINVAL;
4595
4596         space_info = block_group->space_info;
4597         spin_lock(&space_info->lock);
4598         spin_lock(&block_group->lock);
4599         space_info->bytes_reserved += ins->offset;
4600         block_group->reserved += ins->offset;
4601         spin_unlock(&block_group->lock);
4602         spin_unlock(&space_info->lock);
4603
4604         ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
4605                                          offset, ins, 1);
4606         if (ret)
4607                 btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);
4608         btrfs_put_block_group(block_group);
4609         return ret;
4610 }
4611
4612 static struct extent_buffer *
4613 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4614                       u64 bytenr, int level, u64 owner,
4615                       enum btrfs_lock_nesting nest)
4616 {
4617         struct btrfs_fs_info *fs_info = root->fs_info;
4618         struct extent_buffer *buf;
4619
4620         buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level);
4621         if (IS_ERR(buf))
4622                 return buf;
4623
4624         /*
4625          * Extra safety check in case the extent tree is corrupted and extent
4626          * allocator chooses to use a tree block which is already used and
4627          * locked.
4628          */
4629         if (buf->lock_owner == current->pid) {
4630                 btrfs_err_rl(fs_info,
4631 "tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
4632                         buf->start, btrfs_header_owner(buf), current->pid);
4633                 free_extent_buffer(buf);
4634                 return ERR_PTR(-EUCLEAN);
4635         }
4636
4637         /*
4638          * This needs to stay, because we could allocate a freed block from an
4639          * old tree into a new tree, so we need to make sure this new block is
4640          * set to the appropriate level and owner.
4641          */
4642         btrfs_set_buffer_lockdep_class(owner, buf, level);
4643         __btrfs_tree_lock(buf, nest);
4644         btrfs_clean_tree_block(buf);
4645         clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
4646
4647         set_extent_buffer_uptodate(buf);
4648
4649         memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
4650         btrfs_set_header_level(buf, level);
4651         btrfs_set_header_bytenr(buf, buf->start);
4652         btrfs_set_header_generation(buf, trans->transid);
4653         btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
4654         btrfs_set_header_owner(buf, owner);
4655         write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
4656         write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
4657         if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
4658                 buf->log_index = root->log_transid % 2;
4659                 /*
4660                  * We allow two log transactions at a time; use different
4661                  * EXTENT bits to differentiate dirty pages.
4662                  */
4663                 if (buf->log_index == 0)
4664                         set_extent_dirty(&root->dirty_log_pages, buf->start,
4665                                         buf->start + buf->len - 1, GFP_NOFS);
4666                 else
4667                         set_extent_new(&root->dirty_log_pages, buf->start,
4668                                         buf->start + buf->len - 1);
4669         } else {
4670                 buf->log_index = -1;
4671                 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
4672                          buf->start + buf->len - 1, GFP_NOFS);
4673         }
4674         trans->dirty = true;
4675         /* this returns a buffer locked for blocking */
4676         return buf;
4677 }
4678
4679 /*
4680  * Finds a free extent and does all the dirty work required for allocation.
4681  * Returns the tree buffer or an ERR_PTR on error.
4682  */
4683 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
4684                                              struct btrfs_root *root,
4685                                              u64 parent, u64 root_objectid,
4686                                              const struct btrfs_disk_key *key,
4687                                              int level, u64 hint,
4688                                              u64 empty_size,
4689                                              enum btrfs_lock_nesting nest)
4690 {
4691         struct btrfs_fs_info *fs_info = root->fs_info;
4692         struct btrfs_key ins;
4693         struct btrfs_block_rsv *block_rsv;
4694         struct extent_buffer *buf;
4695         struct btrfs_delayed_extent_op *extent_op;
4696         struct btrfs_ref generic_ref = { 0 };
4697         u64 flags = 0;
4698         int ret;
4699         u32 blocksize = fs_info->nodesize;
4700         bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
4701
4702 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4703         if (btrfs_is_testing(fs_info)) {
4704                 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
4705                                             level, root_objectid, nest);
4706                 if (!IS_ERR(buf))
4707                         root->alloc_bytenr += blocksize;
4708                 return buf;
4709         }
4710 #endif
4711
4712         block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
4713         if (IS_ERR(block_rsv))
4714                 return ERR_CAST(block_rsv);
4715
4716         ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
4717                                    empty_size, hint, &ins, 0, 0);
4718         if (ret)
4719                 goto out_unuse;
4720
4721         buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
4722                                     root_objectid, nest);
4723         if (IS_ERR(buf)) {
4724                 ret = PTR_ERR(buf);
4725                 goto out_free_reserved;
4726         }
4727
4728         if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
4729                 if (parent == 0)
4730                         parent = ins.objectid;
4731                 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
4732         } else
4733                 BUG_ON(parent > 0);
4734
4735         if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
4736                 extent_op = btrfs_alloc_delayed_extent_op();
4737                 if (!extent_op) {
4738                         ret = -ENOMEM;
4739                         goto out_free_buf;
4740                 }
4741                 if (key)
4742                         memcpy(&extent_op->key, key, sizeof(extent_op->key));
4743                 else
4744                         memset(&extent_op->key, 0, sizeof(extent_op->key));
4745                 extent_op->flags_to_set = flags;
4746                 extent_op->update_key = !skinny_metadata;
4747                 extent_op->update_flags = true;
4748                 extent_op->is_data = false;
4749                 extent_op->level = level;
4750
4751                 btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
4752                                        ins.objectid, ins.offset, parent);
4753                 generic_ref.real_root = root->root_key.objectid;
4754                 btrfs_init_tree_ref(&generic_ref, level, root_objectid);
4755                 btrfs_ref_tree_mod(fs_info, &generic_ref);
4756                 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref,
4757                                                  extent_op, NULL, NULL);
4758                 if (ret)
4759                         goto out_free_delayed;
4760         }
4761         return buf;
4762
4763 out_free_delayed:
4764         btrfs_free_delayed_extent_op(extent_op);
4765 out_free_buf:
4766         free_extent_buffer(buf);
4767 out_free_reserved:
4768         btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
4769 out_unuse:
4770         btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
4771         return ERR_PTR(ret);
4772 }
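
/*
 * Minimal COW-style usage sketch (variable names illustrative, error
 * handling trimmed):
 *
 *	struct extent_buffer *cow;
 *
 *	cow = btrfs_alloc_tree_block(trans, root, 0,
 *				     root->root_key.objectid, &disk_key,
 *				     level, buf->start, 0,
 *				     BTRFS_NESTING_NORMAL);
 *	if (IS_ERR(cow))
 *		return PTR_ERR(cow);
 *	// cow comes back write locked with its header initialized
 */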
4773
4774 struct walk_control {
4775         u64 refs[BTRFS_MAX_LEVEL];
4776         u64 flags[BTRFS_MAX_LEVEL];
4777         struct btrfs_key update_progress;
4778         struct btrfs_key drop_progress;
4779         int drop_level;
4780         int stage;
4781         int level;
4782         int shared_level;
4783         int update_ref;
4784         int keep_locks;
4785         int reada_slot;
4786         int reada_count;
4787         int restarted;
4788 };
4789
4790 #define DROP_REFERENCE  1
4791 #define UPDATE_BACKREF  2
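
/*
 * A walk always starts in DROP_REFERENCE and only switches to
 * UPDATE_BACKREF when do_walk_down() hits a shared subtree whose back
 * refs still need rewriting; walk_up_proc() switches back afterwards.
 * Sketch of the setup a snapshot-deletion style caller performs
 * (illustrative, not a complete initialization):
 *
 *	struct walk_control *wc = kzalloc(sizeof(*wc), GFP_NOFS);
 *
 *	wc->stage = DROP_REFERENCE;
 *	wc->level = btrfs_header_level(root->node);
 *	wc->shared_level = -1;
 *	wc->update_ref = update_ref;
 *	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
 */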
4792
4793 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
4794                                      struct btrfs_root *root,
4795                                      struct walk_control *wc,
4796                                      struct btrfs_path *path)
4797 {
4798         struct btrfs_fs_info *fs_info = root->fs_info;
4799         u64 bytenr;
4800         u64 generation;
4801         u64 refs;
4802         u64 flags;
4803         u32 nritems;
4804         struct btrfs_key key;
4805         struct extent_buffer *eb;
4806         int ret;
4807         int slot;
4808         int nread = 0;
4809
4810         if (path->slots[wc->level] < wc->reada_slot) {
4811                 wc->reada_count = wc->reada_count * 2 / 3;
4812                 wc->reada_count = max(wc->reada_count, 2);
4813         } else {
4814                 wc->reada_count = wc->reada_count * 3 / 2;
4815                 wc->reada_count = min_t(int, wc->reada_count,
4816                                         BTRFS_NODEPTRS_PER_BLOCK(fs_info));
4817         }
4818
4819         eb = path->nodes[wc->level];
4820         nritems = btrfs_header_nritems(eb);
4821
4822         for (slot = path->slots[wc->level]; slot < nritems; slot++) {
4823                 if (nread >= wc->reada_count)
4824                         break;
4825
4826                 cond_resched();
4827                 bytenr = btrfs_node_blockptr(eb, slot);
4828                 generation = btrfs_node_ptr_generation(eb, slot);
4829
4830                 if (slot == path->slots[wc->level])
4831                         goto reada;
4832
4833                 if (wc->stage == UPDATE_BACKREF &&
4834                     generation <= root->root_key.offset)
4835                         continue;
4836
4837                 /* We don't lock the tree block, it's OK to be racy here */
4838                 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
4839                                                wc->level - 1, 1, &refs,
4840                                                &flags);
4841                 /* We don't care about errors in readahead. */
4842                 if (ret < 0)
4843                         continue;
4844                 BUG_ON(refs == 0);
4845
4846                 if (wc->stage == DROP_REFERENCE) {
4847                         if (refs == 1)
4848                                 goto reada;
4849
4850                         if (wc->level == 1 &&
4851                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
4852                                 continue;
4853                         if (!wc->update_ref ||
4854                             generation <= root->root_key.offset)
4855                                 continue;
4856                         btrfs_node_key_to_cpu(eb, &key, slot);
4857                         ret = btrfs_comp_cpu_keys(&key,
4858                                                   &wc->update_progress);
4859                         if (ret < 0)
4860                                 continue;
4861                 } else {
4862                         if (wc->level == 1 &&
4863                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
4864                                 continue;
4865                 }
4866 reada:
4867                 btrfs_readahead_node_child(eb, slot);
4868                 nread++;
4869         }
4870         wc->reada_slot = slot;
4871 }
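
/*
 * The readahead window above adapts geometrically: starting from, say,
 * reada_count == 12, repeatedly advancing past the previous window
 * grows it 12 -> 18 -> 27 -> ... (capped at
 * BTRFS_NODEPTRS_PER_BLOCK()), while landing before wc->reada_slot
 * shrinks it 12 -> 8 -> 5 -> ... (never below 2).
 */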
4872
4873 /*
4874  * Helper to process a tree block while walking down the tree.
4875  *
4876  * When wc->stage == UPDATE_BACKREF, this function updates
4877  * back refs for pointers in the block.
4878  *
4879  * NOTE: return value 1 means we should stop walking down.
4880  */
4881 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
4882                                    struct btrfs_root *root,
4883                                    struct btrfs_path *path,
4884                                    struct walk_control *wc, int lookup_info)
4885 {
4886         struct btrfs_fs_info *fs_info = root->fs_info;
4887         int level = wc->level;
4888         struct extent_buffer *eb = path->nodes[level];
4889         u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
4890         int ret;
4891
4892         if (wc->stage == UPDATE_BACKREF &&
4893             btrfs_header_owner(eb) != root->root_key.objectid)
4894                 return 1;
4895
4896         /*
4897          * When the reference count of a tree block is 1, it won't increase
4898          * again.  Once the full backref flag is set, we never clear it.
4899          */
4900         if (lookup_info &&
4901             ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
4902              (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
4903                 BUG_ON(!path->locks[level]);
4904                 ret = btrfs_lookup_extent_info(trans, fs_info,
4905                                                eb->start, level, 1,
4906                                                &wc->refs[level],
4907                                                &wc->flags[level]);
4908                 BUG_ON(ret == -ENOMEM);
4909                 if (ret)
4910                         return ret;
4911                 BUG_ON(wc->refs[level] == 0);
4912         }
4913
4914         if (wc->stage == DROP_REFERENCE) {
4915                 if (wc->refs[level] > 1)
4916                         return 1;
4917
4918                 if (path->locks[level] && !wc->keep_locks) {
4919                         btrfs_tree_unlock_rw(eb, path->locks[level]);
4920                         path->locks[level] = 0;
4921                 }
4922                 return 0;
4923         }
4924
4925         /* wc->stage == UPDATE_BACKREF */
4926         if (!(wc->flags[level] & flag)) {
4927                 BUG_ON(!path->locks[level]);
4928                 ret = btrfs_inc_ref(trans, root, eb, 1);
4929                 BUG_ON(ret); /* -ENOMEM */
4930                 ret = btrfs_dec_ref(trans, root, eb, 0);
4931                 BUG_ON(ret); /* -ENOMEM */
4932                 ret = btrfs_set_disk_extent_flags(trans, eb, flag,
4933                                                   btrfs_header_level(eb), 0);
4934                 BUG_ON(ret); /* -ENOMEM */
4935                 wc->flags[level] |= flag;
4936         }
4937
4938         /*
4939          * The block is shared by multiple trees, so it's not good to
4940          * keep the tree lock.
4941          */
4942         if (path->locks[level] && level > 0) {
4943                 btrfs_tree_unlock_rw(eb, path->locks[level]);
4944                 path->locks[level] = 0;
4945         }
4946         return 0;
4947 }
4948
4949 /*
4950  * This is used to verify a ref exists for this root to deal with a bug where we
4951  * would have a drop_progress key that hadn't been updated properly.
4952  */
4953 static int check_ref_exists(struct btrfs_trans_handle *trans,
4954                             struct btrfs_root *root, u64 bytenr, u64 parent,
4955                             int level)
4956 {
4957         struct btrfs_path *path;
4958         struct btrfs_extent_inline_ref *iref;
4959         int ret;
4960
4961         path = btrfs_alloc_path();
4962         if (!path)
4963                 return -ENOMEM;
4964
4965         ret = lookup_extent_backref(trans, path, &iref, bytenr,
4966                                     root->fs_info->nodesize, parent,
4967                                     root->root_key.objectid, level, 0);
4968         btrfs_free_path(path);
4969         if (ret == -ENOENT)
4970                 return 0;
4971         if (ret < 0)
4972                 return ret;
4973         return 1;
4974 }
4975
4976 /*
4977  * Helper to process a tree block pointer.
4978  *
4979  * When wc->stage == DROP_REFERENCE, this function checks the
4980  * reference count of the block pointed to.  If the block is
4981  * shared and we need to update back refs for the subtree rooted
4982  * at the block, this function changes wc->stage to
4983  * UPDATE_BACKREF.  If the block is shared and there is no need
4984  * to update back refs, this function drops the reference to
4985  * the block.
4986  *
4987  * NOTE: return value 1 means we should stop walking down.
4988  */
4989 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
4990                                  struct btrfs_root *root,
4991                                  struct btrfs_path *path,
4992                                  struct walk_control *wc, int *lookup_info)
4993 {
4994         struct btrfs_fs_info *fs_info = root->fs_info;
4995         u64 bytenr;
4996         u64 generation;
4997         u64 parent;
4998         struct btrfs_key key;
4999         struct btrfs_key first_key;
5000         struct btrfs_ref ref = { 0 };
5001         struct extent_buffer *next;
5002         int level = wc->level;
5003         int reada = 0;
5004         int ret = 0;
5005         bool need_account = false;
5006
5007         generation = btrfs_node_ptr_generation(path->nodes[level],
5008                                                path->slots[level]);
5009         /*
5010          * If the lower level block was created before the snapshot
5011          * was created, we know there is no need to update back refs
5012          * for the subtree.
5013          */
5014         if (wc->stage == UPDATE_BACKREF &&
5015             generation <= root->root_key.offset) {
5016                 *lookup_info = 1;
5017                 return 1;
5018         }
5019
5020         bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
5021         btrfs_node_key_to_cpu(path->nodes[level], &first_key,
5022                               path->slots[level]);
5023
5024         next = find_extent_buffer(fs_info, bytenr);
5025         if (!next) {
5026                 next = btrfs_find_create_tree_block(fs_info, bytenr,
5027                                 root->root_key.objectid, level - 1);
5028                 if (IS_ERR(next))
5029                         return PTR_ERR(next);
5030                 reada = 1;
5031         }
5032         btrfs_tree_lock(next);
5033
5034         ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
5035                                        &wc->refs[level - 1],
5036                                        &wc->flags[level - 1]);
5037         if (ret < 0)
5038                 goto out_unlock;
5039
5040         if (unlikely(wc->refs[level - 1] == 0)) {
5041                 btrfs_err(fs_info, "Missing references.");
5042                 ret = -EIO;
5043                 goto out_unlock;
5044         }
5045         *lookup_info = 0;
5046
5047         if (wc->stage == DROP_REFERENCE) {
5048                 if (wc->refs[level - 1] > 1) {
5049                         need_account = true;
5050                         if (level == 1 &&
5051                             (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5052                                 goto skip;
5053
5054                         if (!wc->update_ref ||
5055                             generation <= root->root_key.offset)
5056                                 goto skip;
5057
5058                         btrfs_node_key_to_cpu(path->nodes[level], &key,
5059                                               path->slots[level]);
5060                         ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
5061                         if (ret < 0)
5062                                 goto skip;
5063
5064                         wc->stage = UPDATE_BACKREF;
5065                         wc->shared_level = level - 1;
5066                 }
5067         } else {
5068                 if (level == 1 &&
5069                     (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5070                         goto skip;
5071         }
5072
5073         if (!btrfs_buffer_uptodate(next, generation, 0)) {
5074                 btrfs_tree_unlock(next);
5075                 free_extent_buffer(next);
5076                 next = NULL;
5077                 *lookup_info = 1;
5078         }
5079
5080         if (!next) {
5081                 if (reada && level == 1)
5082                         reada_walk_down(trans, root, wc, path);
5083                 next = read_tree_block(fs_info, bytenr, root->root_key.objectid,
5084                                        generation, level - 1, &first_key);
5085                 if (IS_ERR(next)) {
5086                         return PTR_ERR(next);
5087                 } else if (!extent_buffer_uptodate(next)) {
5088                         free_extent_buffer(next);
5089                         return -EIO;
5090                 }
5091                 btrfs_tree_lock(next);
5092         }
5093
5094         level--;
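        /* ASSERT() trips on debug builds; production falls back to -EIO. */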
5095         ASSERT(level == btrfs_header_level(next));
5096         if (level != btrfs_header_level(next)) {
5097                 btrfs_err(root->fs_info, "mismatched level");
5098                 ret = -EIO;
5099                 goto out_unlock;
5100         }
5101         path->nodes[level] = next;
5102         path->slots[level] = 0;
5103         path->locks[level] = BTRFS_WRITE_LOCK;
5104         wc->level = level;
5105         if (wc->level == 1)
5106                 wc->reada_slot = 0;
5107         return 0;
5108 skip:
5109         wc->refs[level - 1] = 0;
5110         wc->flags[level - 1] = 0;
5111         if (wc->stage == DROP_REFERENCE) {
5112                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
5113                         parent = path->nodes[level]->start;
5114                 } else {
5115                         ASSERT(root->root_key.objectid ==
5116                                btrfs_header_owner(path->nodes[level]));
5117                         if (root->root_key.objectid !=
5118                             btrfs_header_owner(path->nodes[level])) {
5119                                 btrfs_err(root->fs_info,
5120                                                 "mismatched block owner");
5121                                 ret = -EIO;
5122                                 goto out_unlock;
5123                         }
5124                         parent = 0;
5125                 }
5126
5127                 /*
5128                  * If we had a drop_progress we need to verify the refs are set
5129                  * as expected.  If we find our ref then we know that from here
5130                  * on out everything should be correct, and we can clear the
5131                  * ->restarted flag.
5132                  */
5133                 if (wc->restarted) {
5134                         ret = check_ref_exists(trans, root, bytenr, parent,
5135                                                level - 1);
5136                         if (ret < 0)
5137                                 goto out_unlock;
5138                         if (ret == 0)
5139                                 goto no_delete;
5140                         ret = 0;
5141                         wc->restarted = 0;
5142                 }
5143
5144                 /*
5145                  * Reloc tree doesn't contribute to qgroup numbers, and we have
5146                  * already accounted them at merge time (replace_path),
5147                  * thus we can skip the expensive subtree trace here.
5148                  */
5149                 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
5150                     need_account) {
5151                         ret = btrfs_qgroup_trace_subtree(trans, next,
5152                                                          generation, level - 1);
5153                         if (ret) {
5154                                 btrfs_err_rl(fs_info,
5155                                              "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
5156                                              ret);
5157                         }
5158                 }
5159
5160                 /*
5161                  * We need to update the next key in our walk control so we can
5162                  * update the drop_progress key accordingly.  We don't care if
5163                  * find_next_key doesn't find a key because that means we're at
5164                  * the end and are going to clean up now.
5165                  */
5166                 wc->drop_level = level;
5167                 find_next_key(path, level, &wc->drop_progress);
5168
5169                 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
5170                                        fs_info->nodesize, parent);
5171                 btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid);
5172                 ret = btrfs_free_extent(trans, &ref);
5173                 if (ret)
5174                         goto out_unlock;
5175         }
5176 no_delete:
5177         *lookup_info = 1;
5178         ret = 1;
5179
5180 out_unlock:
5181         btrfs_tree_unlock(next);
5182         free_extent_buffer(next);
5183
5184         return ret;
5185 }
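
/*
 * Concrete example of the DROP_REFERENCE -> UPDATE_BACKREF hand-off
 * above: if the child at level - 1 reports refs == 2, wc->update_ref
 * is set and its first key compares >= wc->update_progress, the stage
 * flips to UPDATE_BACKREF with wc->shared_level = level - 1, the
 * subtree is walked again to rewrite back refs, and walk_up_proc()
 * restores DROP_REFERENCE once the walk climbs back up to that level.
 */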
5186
5187 /*
5188  * Helper to process a tree block while walking up the tree.
5189  *
5190  * When wc->stage == DROP_REFERENCE, this function drops the
5191  * reference count on the block.
5192  *
5193  * When wc->stage == UPDATE_BACKREF, this function changes
5194  * wc->stage back to DROP_REFERENCE if we changed wc->stage
5195  * to UPDATE_BACKREF previously while processing the block.
5196  *
5197  * NOTE: return value 1 means we should stop walking up.
5198  */
5199 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5200                                  struct btrfs_root *root,
5201                                  struct btrfs_path *path,
5202                                  struct walk_control *wc)
5203 {
5204         struct btrfs_fs_info *fs_info = root->fs_info;
5205         int ret;
5206         int level = wc->level;
5207         struct extent_buffer *eb = path->nodes[level];
5208         u64 parent = 0;
5209
5210         if (wc->stage == UPDATE_BACKREF) {
5211                 BUG_ON(wc->shared_level < level);
5212                 if (level < wc->shared_level)
5213                         goto out;
5214
5215                 ret = find_next_key(path, level + 1, &wc->update_progress);
5216                 if (ret > 0)
5217                         wc->update_ref = 0;
5218
5219                 wc->stage = DROP_REFERENCE;
5220                 wc->shared_level = -1;
5221                 path->slots[level] = 0;
5222
5223                 /*
5224                  * Check the reference count again if the block isn't locked.
5225                  * We should start walking down the tree again if the
5226                  * reference count is one.
5227                  */
5228                 if (!path->locks[level]) {
5229                         BUG_ON(level == 0);
5230                         btrfs_tree_lock(eb);
5231                         path->locks[level] = BTRFS_WRITE_LOCK;
5232
5233                         ret = btrfs_lookup_extent_info(trans, fs_info,
5234                                                        eb->start, level, 1,
5235                                                        &wc->refs[level],
5236                                                        &wc->flags[level]);
5237                         if (ret < 0) {
5238                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
5239                                 path->locks[level] = 0;
5240                                 return ret;
5241                         }
5242                         BUG_ON(wc->refs[level] == 0);
5243                         if (wc->refs[level] == 1) {
5244                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
5245                                 path->locks[level] = 0;
5246                                 return 1;
5247                         }
5248                 }
5249         }
5250
5251         /* wc->stage == DROP_REFERENCE */
5252         BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
5253
5254         if (wc->refs[level] == 1) {
5255                 if (level == 0) {
5256                         if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5257                                 ret = btrfs_dec_ref(trans, root, eb, 1);
5258                         else
5259                                 ret = btrfs_dec_ref(trans, root, eb, 0);
5260                         BUG_ON(ret); /* -ENOMEM */
5261                         if (is_fstree(root->root_key.objectid)) {
5262                                 ret = btrfs_qgroup_trace_leaf_items(trans, eb);
5263                                 if (ret) {
5264                                         btrfs_err_rl(fs_info,
5265         "error %d accounting leaf items, quota is out of sync, rescan required",
5266                                              ret);
5267                                 }
5268                         }
5269                 }
5270                 /* make block locked assertion in btrfs_clean_tree_block happy */
5271                 if (!path->locks[level] &&
5272                     btrfs_header_generation(eb) == trans->transid) {
5273                         btrfs_tree_lock(eb);
5274                         path->locks[level] = BTRFS_WRITE_LOCK;
5275                 }
5276                 btrfs_clean_tree_block(eb);
5277         }
5278
5279         if (eb == root->node) {
5280                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5281                         parent = eb->start;
5282                 else if (root->root_key.objectid != btrfs_header_owner(eb))
5283                         goto owner_mismatch;
5284         } else {
5285                 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5286                         parent = path->nodes[level + 1]->start;
5287                 else if (root->root_key.objectid !=
5288                          btrfs_header_owner(path->nodes[level + 1]))
5289                         goto owner_mismatch;
5290         }
5291
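	/*
	 * If refs[level] == 1 this tree held the last reference, so pass
	 * last_ref to actually free the block; otherwise we only dropped
	 * our own reference above.
	 */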
	btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
out:
	wc->refs[level] = 0;
	wc->flags[level] = 0;
	return 0;

owner_mismatch:
	btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
		     btrfs_header_owner(eb), root->root_key.objectid);
	return -EUCLEAN;
}

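/*
 * walk down the tree starting at wc->level, processing each block with
 * walk_down_proc() and descending through do_walk_down() until we hit a
 * leaf, a subtree that can be skipped, or the end of the current node.
 */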
static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct walk_control *wc)
{
	int level = wc->level;
	int lookup_info = 1;
	int ret;

	while (level >= 0) {
		ret = walk_down_proc(trans, root, path, wc, lookup_info);
		if (ret > 0)
			break;

		if (level == 0)
			break;

		if (path->slots[level] >=
		    btrfs_header_nritems(path->nodes[level]))
			break;

		ret = do_walk_down(trans, root, path, wc, &lookup_info);
		if (ret > 0) {
			path->slots[level]++;
			continue;
		} else if (ret < 0)
			return ret;
		level = wc->level;
	}
	return 0;
}

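/*
 * walk back up the tree, freeing fully processed nodes via walk_up_proc().
 * returns 0 if the walk should continue back down at some level, 1 when
 * everything below max_level has been processed.
 */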
static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct walk_control *wc, int max_level)
{
	int level = wc->level;
	int ret;

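	/*
	 * Mark the starting node as fully visited so the first loop
	 * iteration goes straight to walk_up_proc() for it.
	 */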
	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
	while (level < max_level && path->nodes[level]) {
		wc->level = level;
		if (path->slots[level] + 1 <
		    btrfs_header_nritems(path->nodes[level])) {
			path->slots[level]++;
			return 0;
		} else {
			ret = walk_up_proc(trans, root, path, wc);
			if (ret > 0)
				return 0;
			if (ret < 0)
				return ret;

			if (path->locks[level]) {
				btrfs_tree_unlock_rw(path->nodes[level],
						     path->locks[level]);
				path->locks[level] = 0;
			}
			free_extent_buffer(path->nodes[level]);
			path->nodes[level] = NULL;
			level++;
		}
	}
	return 1;
}

/*
 * drop a subvolume tree.
 *
 * this function traverses the tree freeing any blocks that are only
 * referenced by the tree.
 *
 * when a shared tree block is found, this function decreases its
 * reference count by one. if update_ref is true, this function
 * also makes sure backrefs for the shared block and all lower level
 * blocks are properly updated.
 *
 * If called with for_reloc == 0, may exit early with -EAGAIN
 */
int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_path *path;
	struct btrfs_trans_handle *trans;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root_item *root_item = &root->root_item;
	struct walk_control *wc;
	struct btrfs_key key;
	int err = 0;
	int ret;
	int level;
	bool root_dropped = false;

	btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);

	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out;
	}

	wc = kzalloc(sizeof(*wc), GFP_NOFS);
	if (!wc) {
		btrfs_free_path(path);
		err = -ENOMEM;
		goto out;
	}

	/*
	 * Use join to avoid potential EINTR from transaction start. See
	 * wait_reserve_ticket and the whole reservation callchain.
	 */
	if (for_reloc)
		trans = btrfs_join_transaction(tree_root);
	else
		trans = btrfs_start_transaction(tree_root, 0);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_free;
	}

	err = btrfs_run_delayed_items(trans);
	if (err)
		goto out_end_trans;

	/*
	 * This will help us catch people modifying the fs tree while we're
	 * dropping it.  It is unsafe to mess with the fs tree while it's being
	 * dropped as we unlock the root node and parent nodes as we walk down
	 * the tree, assuming nothing will change.  If something does change
	 * then we'll have stale information and drop references to blocks we've
	 * already dropped.
	 */
	set_bit(BTRFS_ROOT_DELETING, &root->state);
	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
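		/* No saved drop progress: start the walk from the root node. */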
		level = btrfs_header_level(root->node);
		path->nodes[level] = btrfs_lock_root_node(root);
		path->slots[level] = 0;
		path->locks[level] = BTRFS_WRITE_LOCK;
		memset(&wc->update_progress, 0,
		       sizeof(wc->update_progress));
	} else {
		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
		memcpy(&wc->update_progress, &key,
		       sizeof(wc->update_progress));

		level = btrfs_root_drop_level(root_item);
		BUG_ON(level == 0);
		path->lowest_level = level;
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		path->lowest_level = 0;
		if (ret < 0) {
			err = ret;
			goto out_end_trans;
		}
		WARN_ON(ret > 0);

		/*
		 * unlock our path, this is safe because only this
		 * function is allowed to delete this snapshot
		 */
		btrfs_unlock_up_safe(path, 0);

		level = btrfs_header_level(root->node);
		while (1) {
			btrfs_tree_lock(path->nodes[level]);
			path->locks[level] = BTRFS_WRITE_LOCK;

			ret = btrfs_lookup_extent_info(trans, fs_info,
						path->nodes[level]->start,
						level, 1, &wc->refs[level],
						&wc->flags[level]);
			if (ret < 0) {
				err = ret;
				goto out_end_trans;
			}
			BUG_ON(wc->refs[level] == 0);

			if (level == btrfs_root_drop_level(root_item))
				break;

			btrfs_tree_unlock(path->nodes[level]);
			path->locks[level] = 0;
			WARN_ON(wc->refs[level] != 1);
			level--;
		}
	}

	wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
	wc->level = level;
	wc->shared_level = -1;
	wc->stage = DROP_REFERENCE;
	wc->update_ref = update_ref;
	wc->keep_locks = 0;
	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);

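	/*
	 * Main drop loop: alternate between walking down (dropping refs and
	 * freeing blocks) and walking back up, periodically recording
	 * drop_progress in the root item and restarting the transaction so
	 * a large tree can be dropped across many commits.
	 */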
	while (1) {
		ret = walk_down_tree(trans, root, path, wc);
		if (ret < 0) {
			err = ret;
			break;
		}

		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
		if (ret < 0) {
			err = ret;
			break;
		}

		if (ret > 0) {
			BUG_ON(wc->stage != DROP_REFERENCE);
			break;
		}

		if (wc->stage == DROP_REFERENCE) {
			wc->drop_level = wc->level;
			btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
					      &wc->drop_progress,
					      path->slots[wc->drop_level]);
		}
		btrfs_cpu_key_to_disk(&root_item->drop_progress,
				      &wc->drop_progress);
		btrfs_set_root_drop_level(root_item, wc->drop_level);

		BUG_ON(wc->level == 0);
		if (btrfs_should_end_transaction(trans) ||
		    (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
			ret = btrfs_update_root(trans, tree_root,
						&root->root_key,
						root_item);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				err = ret;
				goto out_end_trans;
			}

			btrfs_end_transaction_throttle(trans);
			if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
				btrfs_debug(fs_info,
					    "drop snapshot early exit");
				err = -EAGAIN;
				goto out_free;
			}

			trans = btrfs_start_transaction(tree_root, 0);
			if (IS_ERR(trans)) {
				err = PTR_ERR(trans);
				goto out_free;
			}
		}
	}
	btrfs_release_path(path);
	if (err)
		goto out_end_trans;

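	/* All blocks are freed; remove the root item from the root tree. */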
	ret = btrfs_del_root(trans, &root->root_key);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		err = ret;
		goto out_end_trans;
	}

	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
		ret = btrfs_find_root(tree_root, &root->root_key, path,
				      NULL, NULL);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			err = ret;
			goto out_end_trans;
		} else if (ret > 0) {
			/*
			 * if we fail to delete the orphan item this time
			 * around, it'll get picked up the next time.
			 *
			 * The most common failure here is just -ENOENT.
			 */
			btrfs_del_orphan_item(trans, tree_root,
					      root->root_key.objectid);
		}
	}

	/*
	 * This subvolume is going to be completely dropped, and won't be
	 * recorded as dirty roots, thus pertrans meta rsv will not be freed at
	 * commit transaction time.  So free it here manually.
	 */
	btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
	btrfs_qgroup_free_meta_all_pertrans(root);

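	/*
	 * A root still tracked in the fs_roots radix tree is handed to the
	 * transaction to be cleaned up at commit time; otherwise drop our
	 * reference now.
	 */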
	if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
		btrfs_add_dropped_root(trans, root);
	else
		btrfs_put_root(root);
	root_dropped = true;
out_end_trans:
	btrfs_end_transaction_throttle(trans);
out_free:
	kfree(wc);
	btrfs_free_path(path);
out:
	/*
	 * So if we need to stop dropping the snapshot for whatever reason we
	 * need to make sure to add it back to the dead root list so that we
	 * keep trying to do the work later.  This also cleans up a root if it
	 * isn't in the radix (like when we recover after a power failure or
	 * unmount) so we don't leak memory.
	 */
	if (!for_reloc && !root_dropped)
		btrfs_add_dead_root(root);
	return err;
}

/*
 * drop a subtree rooted at tree block 'node'.
 *
 * NOTE: this function will unlock and release tree block 'node'.
 * only used by relocation code.
 */
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
			struct btrfs_root *root,
			struct extent_buffer *node,
			struct extent_buffer *parent)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_path *path;
	struct walk_control *wc;
	int level;
	int parent_level;
	int ret = 0;
	int wret;

	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	wc = kzalloc(sizeof(*wc), GFP_NOFS);
	if (!wc) {
		btrfs_free_path(path);
		return -ENOMEM;
	}

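	/*
	 * The caller holds locks on both 'parent' and 'node'; take an extra
	 * reference on the parent and record both in the path so the walk
	 * starts at 'node' and stops below 'parent'.
	 */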
	btrfs_assert_tree_locked(parent);
	parent_level = btrfs_header_level(parent);
	atomic_inc(&parent->refs);
	path->nodes[parent_level] = parent;
	path->slots[parent_level] = btrfs_header_nritems(parent);

	btrfs_assert_tree_locked(node);
	level = btrfs_header_level(node);
	path->nodes[level] = node;
	path->slots[level] = 0;
	path->locks[level] = BTRFS_WRITE_LOCK;

	wc->refs[parent_level] = 1;
	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
	wc->level = level;
	wc->shared_level = -1;
	wc->stage = DROP_REFERENCE;
	wc->update_ref = 0;
	wc->keep_locks = 1;
	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);

	while (1) {
		wret = walk_down_tree(trans, root, path, wc);
		if (wret < 0) {
			ret = wret;
			break;
		}

		wret = walk_up_tree(trans, root, path, wc, parent_level);
		if (wret < 0)
			ret = wret;
		if (wret != 0)
			break;
	}

	kfree(wc);
	btrfs_free_path(path);
	return ret;
}

/*
 * helper to account the unused space of all the read-only block groups in
 * the space_info. takes mirrors into account.
 */
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
{
	struct btrfs_block_group *block_group;
	u64 free_bytes = 0;
	int factor;

	/* It's df, we don't care if it's racy */
	if (list_empty(&sinfo->ro_bgs))
		return 0;

	spin_lock(&sinfo->lock);
	list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
		spin_lock(&block_group->lock);

		if (!block_group->ro) {
			spin_unlock(&block_group->lock);
			continue;
		}

		factor = btrfs_bg_type_to_factor(block_group->flags);
		free_bytes += (block_group->length -
			       block_group->used) * factor;

		spin_unlock(&block_group->lock);
	}
	spin_unlock(&sinfo->lock);

	return free_bytes;
}

int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
				   u64 start, u64 end)
{
	return unpin_extent_range(fs_info, start, end, false);
}

/*
 * It used to be that old block groups would be left around forever.
 * Iterating over them would be enough to trim unused space.  Since we
 * now automatically remove them, we also need to iterate over unallocated
 * space.
 *
 * We don't want a transaction for this since the discard may take a
 * substantial amount of time.  We don't require that a transaction be
 * running, but we do need to take a running transaction into account
 * to ensure that we're not discarding chunks that were released or
 * allocated in the current transaction.
 *
 * Holding the chunks lock will prevent other threads from allocating
 * or releasing chunks, but it won't prevent a running transaction
 * from committing and releasing the memory that the pending chunks
 * list head uses.  For that, we need to take a reference to the
 * transaction and hold the commit root sem.  We only need to hold
 * it while performing the free space search since we have already
 * held back allocations.
 */
static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
{
	u64 start = SZ_1M, len = 0, end = 0;
	int ret;

	*trimmed = 0;

	/* Discard not supported = nothing to do. */
	if (!blk_queue_discard(bdev_get_queue(device->bdev)))
		return 0;

	/* Not writable = nothing to do. */
	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
		return 0;

	/* No free space = nothing to do. */
	if (device->total_bytes <= device->bytes_used)
		return 0;

	ret = 0;

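	/*
	 * Walk device->alloc_state for ranges with neither CHUNK_TRIMMED nor
	 * CHUNK_ALLOCATED set, discard them, and mark them CHUNK_TRIMMED,
	 * taking chunk_mutex around each step.
	 */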
	while (1) {
		struct btrfs_fs_info *fs_info = device->fs_info;
		u64 bytes;

		ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
		if (ret)
			break;

		find_first_clear_extent_bit(&device->alloc_state, start,
					    &start, &end,
					    CHUNK_TRIMMED | CHUNK_ALLOCATED);

		/* Check if there are any CHUNK_* bits left */
		if (start > device->total_bytes) {
			WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
			btrfs_warn_in_rcu(fs_info,
"ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
					  start, end - start + 1,
					  rcu_str_deref(device->name),
					  device->total_bytes);
			mutex_unlock(&fs_info->chunk_mutex);
			ret = 0;
			break;
		}

		/* Ensure we skip the reserved area in the first 1M */
		start = max_t(u64, start, SZ_1M);

		/*
		 * If find_first_clear_extent_bit finds a range that spans the
		 * end of the device it will set end to -1; in that case it's
		 * up to the caller to trim the value to the size of the
		 * device.
		 */
		end = min(end, device->total_bytes - 1);

		len = end - start + 1;

		/* We didn't find any extents */
		if (!len) {
			mutex_unlock(&fs_info->chunk_mutex);
			ret = 0;
			break;
		}

		ret = btrfs_issue_discard(device->bdev, start, len,
					  &bytes);
		if (!ret)
			set_extent_bits(&device->alloc_state, start,
					start + bytes - 1,
					CHUNK_TRIMMED);
		mutex_unlock(&fs_info->chunk_mutex);

		if (ret)
			break;

		start += len;
		*trimmed += bytes;

		if (fatal_signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}

		cond_resched();
	}

	return ret;
}

/*
 * Trim the whole filesystem by:
 * 1) trimming the free space in each block group
 * 2) trimming the unallocated space on each device
 *
 * This will also continue trimming even if a block group or device encounters
 * an error.  The return value will be the last error, or 0 if nothing bad
 * happens.
 */
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
{
	struct btrfs_block_group *cache = NULL;
	struct btrfs_device *device;
	struct list_head *devices;
	u64 group_trimmed;
	u64 range_end = U64_MAX;
	u64 start;
	u64 end;
	u64 trimmed = 0;
	u64 bg_failed = 0;
	u64 dev_failed = 0;
	int bg_ret = 0;
	int dev_ret = 0;
	int ret = 0;

	/*
	 * Check range overflow if range->len is set.
	 * The default range->len is U64_MAX.
	 */
	if (range->len != U64_MAX &&
	    check_add_overflow(range->start, range->len, &range_end))
		return -EINVAL;

	cache = btrfs_lookup_first_block_group(fs_info, range->start);
	for (; cache; cache = btrfs_next_block_group(cache)) {
		if (cache->start >= range_end) {
			btrfs_put_block_group(cache);
			break;
		}

		start = max(range->start, cache->start);
		end = min(range_end, cache->start + cache->length);

		if (end - start >= range->minlen) {
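			/*
			 * Make sure the block group's free space is fully
			 * cached before trimming it.
			 */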
			if (!btrfs_block_group_done(cache)) {
				ret = btrfs_cache_block_group(cache, 0);
				if (ret) {
					bg_failed++;
					bg_ret = ret;
					continue;
				}
				ret = btrfs_wait_block_group_cache_done(cache);
				if (ret) {
					bg_failed++;
					bg_ret = ret;
					continue;
				}
			}
			ret = btrfs_trim_block_group(cache,
						     &group_trimmed,
						     start,
						     end,
						     range->minlen);

			trimmed += group_trimmed;
			if (ret) {
				bg_failed++;
				bg_ret = ret;
				continue;
			}
		}
	}

	if (bg_failed)
		btrfs_warn(fs_info,
			"failed to trim %llu block group(s), last error %d",
			bg_failed, bg_ret);
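
	/* Pass 2: trim the unallocated space on each device. */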
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	devices = &fs_info->fs_devices->devices;
	list_for_each_entry(device, devices, dev_list) {
		ret = btrfs_trim_free_extents(device, &group_trimmed);
		if (ret) {
			dev_failed++;
			dev_ret = ret;
			break;
		}

		trimmed += group_trimmed;
	}
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	if (dev_failed)
		btrfs_warn(fs_info,
			"failed to trim %llu device(s), last error %d",
			dev_failed, dev_ret);
	range->len = trimmed;
	if (bg_ret)
		return bg_ret;
	return dev_ret;
}