Merge tag 'thermal-5.20-rc1-2' of git://git.kernel.org/pub/scm/linux/kernel/git/rafae...
[linux-2.6-microblaze.git] / fs / btrfs / disk-io.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007 Oracle.  All rights reserved.
4  */
5
6 #include <linux/fs.h>
7 #include <linux/blkdev.h>
8 #include <linux/radix-tree.h>
9 #include <linux/writeback.h>
10 #include <linux/workqueue.h>
11 #include <linux/kthread.h>
12 #include <linux/slab.h>
13 #include <linux/migrate.h>
14 #include <linux/ratelimit.h>
15 #include <linux/uuid.h>
16 #include <linux/semaphore.h>
17 #include <linux/error-injection.h>
18 #include <linux/crc32c.h>
19 #include <linux/sched/mm.h>
20 #include <asm/unaligned.h>
21 #include <crypto/hash.h>
22 #include "ctree.h"
23 #include "disk-io.h"
24 #include "transaction.h"
25 #include "btrfs_inode.h"
26 #include "volumes.h"
27 #include "print-tree.h"
28 #include "locking.h"
29 #include "tree-log.h"
30 #include "free-space-cache.h"
31 #include "free-space-tree.h"
32 #include "check-integrity.h"
33 #include "rcu-string.h"
34 #include "dev-replace.h"
35 #include "raid56.h"
36 #include "sysfs.h"
37 #include "qgroup.h"
38 #include "compression.h"
39 #include "tree-checker.h"
40 #include "ref-verify.h"
41 #include "block-group.h"
42 #include "discard.h"
43 #include "space-info.h"
44 #include "zoned.h"
45 #include "subpage.h"
46
/*
 * Mask built by OR-ing the flag bits listed below.
 * NOTE(review): presumably the set of superblock flags this code accepts;
 * the users of the mask are outside the visible part of the file - confirm.
 */
#define BTRFS_SUPER_FLAG_SUPP	(BTRFS_HEADER_FLAG_WRITTEN |\
				 BTRFS_HEADER_FLAG_RELOC |\
				 BTRFS_SUPER_FLAG_ERROR |\
				 BTRFS_SUPER_FLAG_SEEDING |\
				 BTRFS_SUPER_FLAG_METADUMP |\
				 BTRFS_SUPER_FLAG_METADUMP_V2)
53
/*
 * Forward declarations of file-local helpers.  Their definitions are not
 * part of the code that precedes their first use.
 */
static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
				      struct btrfs_fs_info *fs_info);
static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
					struct extent_io_tree *dirty_pages,
					int mark);
static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
				       struct extent_io_tree *pinned_extents);
static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info);
static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info);
65
66 static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
67 {
68         if (fs_info->csum_shash)
69                 crypto_free_shash(fs_info->csum_shash);
70 }
71
72 /*
73  * async submit bios are used to offload expensive checksumming
74  * onto the worker threads.  They checksum file and metadata bios
75  * just before they are sent down the IO stack.
76  */
struct async_submit_bio {
	/* Inode handed to submit_bio_start; also used to look up the fs_info */
	struct inode *inode;
	/* The bio that is passed to submit_bio_start and then submitted */
	struct bio *bio;
	/* Hook called from run_one_async_start() before the bio is submitted */
	extent_submit_bio_start_t *submit_bio_start;
	/* Passed through unchanged to btrfs_submit_bio() */
	int mirror_num;

	/* Optional parameter for submit_bio_start used by direct io */
	u64 dio_file_offset;
	/* Work item queued on one of the fs_info worker queues */
	struct btrfs_work work;
	/*
	 * Non-zero status returned by submit_bio_start.  When set, the bio is
	 * ended with this status instead of being submitted.
	 */
	blk_status_t status;
};
88
89 /*
90  * Lockdep class keys for extent_buffer->lock's in this root.  For a given
91  * eb, the lockdep key is determined by the btrfs_root it belongs to and
92  * the level the eb occupies in the tree.
93  *
94  * Different roots are used for different purposes and may nest inside each
95  * other and they require separate keysets.  As lockdep keys should be
96  * static, assign keysets according to the purpose of the root as indicated
97  * by btrfs_root->root_key.objectid.  This ensures that all special purpose
98  * roots have separate keysets.
99  *
100  * Lock-nesting across peer nodes is always done with the immediate parent
101  * node locked thus preventing deadlock.  As lockdep doesn't know this, use
102  * subclass to avoid triggering lockdep warning in such cases.
103  *
104  * The key is set by the readpage_end_io_hook after the buffer has passed
105  * csum validation but before the pages are unlocked.  It is also set by
106  * btrfs_init_new_buffer on freshly allocated blocks.
107  *
108  * We also add a check to make sure the highest level of the tree is the
109  * same as our lockdep setup here.  If BTRFS_MAX_LEVEL changes, this code
110  * needs update as well.
111  */
112 #ifdef CONFIG_DEBUG_LOCK_ALLOC
113 # if BTRFS_MAX_LEVEL != 8
114 #  error
115 # endif
116
/*
 * Designated initializer for one level name of a keyset.  The resulting
 * string has the form "btrfs-<stem>-0<level>", e.g. "btrfs-root-00".
 */
#define DEFINE_LEVEL(stem, level)					\
	.names[level] = "btrfs-" stem "-0" #level,

/* Initialize the names of all eight levels (0..7) for @stem. */
#define DEFINE_NAME(stem)						\
	DEFINE_LEVEL(stem, 0)						\
	DEFINE_LEVEL(stem, 1)						\
	DEFINE_LEVEL(stem, 2)						\
	DEFINE_LEVEL(stem, 3)						\
	DEFINE_LEVEL(stem, 4)						\
	DEFINE_LEVEL(stem, 5)						\
	DEFINE_LEVEL(stem, 6)						\
	DEFINE_LEVEL(stem, 7)

/*
 * One keyset per special purpose root, holding a lockdep key and a name for
 * every tree level.  The last entry has id 0: it terminates the lookup loop
 * in btrfs_set_buffer_lockdep_class() and serves as the default keyset.
 */
static struct btrfs_lockdep_keyset {
	u64			id;		/* root objectid */
	/* Longest entry: btrfs-free-space-00 */
	char			names[BTRFS_MAX_LEVEL][20];
	struct lock_class_key	keys[BTRFS_MAX_LEVEL];
} btrfs_lockdep_keysets[] = {
	{ .id = BTRFS_ROOT_TREE_OBJECTID,	DEFINE_NAME("root")	},
	{ .id = BTRFS_EXTENT_TREE_OBJECTID,	DEFINE_NAME("extent")	},
	{ .id = BTRFS_CHUNK_TREE_OBJECTID,	DEFINE_NAME("chunk")	},
	{ .id = BTRFS_DEV_TREE_OBJECTID,	DEFINE_NAME("dev")	},
	{ .id = BTRFS_CSUM_TREE_OBJECTID,	DEFINE_NAME("csum")	},
	{ .id = BTRFS_QUOTA_TREE_OBJECTID,	DEFINE_NAME("quota")	},
	{ .id = BTRFS_TREE_LOG_OBJECTID,	DEFINE_NAME("log")	},
	{ .id = BTRFS_TREE_RELOC_OBJECTID,	DEFINE_NAME("treloc")	},
	{ .id = BTRFS_DATA_RELOC_TREE_OBJECTID,	DEFINE_NAME("dreloc")	},
	{ .id = BTRFS_UUID_TREE_OBJECTID,	DEFINE_NAME("uuid")	},
	{ .id = BTRFS_FREE_SPACE_TREE_OBJECTID,	DEFINE_NAME("free-space") },
	{ .id = 0,				DEFINE_NAME("tree")	},
};

/* The helper macros are only needed for the table above. */
#undef DEFINE_LEVEL
#undef DEFINE_NAME
152
153 void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
154                                     int level)
155 {
156         struct btrfs_lockdep_keyset *ks;
157
158         BUG_ON(level >= ARRAY_SIZE(ks->keys));
159
160         /* find the matching keyset, id 0 is the default entry */
161         for (ks = btrfs_lockdep_keysets; ks->id; ks++)
162                 if (ks->id == objectid)
163                         break;
164
165         lockdep_set_class_and_name(&eb->lock,
166                                    &ks->keys[level], ks->names[level]);
167 }
168
169 #endif
170
171 /*
172  * Compute the csum of a btree block and store the result to provided buffer.
173  */
174 static void csum_tree_block(struct extent_buffer *buf, u8 *result)
175 {
176         struct btrfs_fs_info *fs_info = buf->fs_info;
177         const int num_pages = num_extent_pages(buf);
178         const int first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
179         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
180         char *kaddr;
181         int i;
182
183         shash->tfm = fs_info->csum_shash;
184         crypto_shash_init(shash);
185         kaddr = page_address(buf->pages[0]) + offset_in_page(buf->start);
186         crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
187                             first_page_part - BTRFS_CSUM_SIZE);
188
189         for (i = 1; i < num_pages; i++) {
190                 kaddr = page_address(buf->pages[i]);
191                 crypto_shash_update(shash, kaddr, PAGE_SIZE);
192         }
193         memset(result, 0, BTRFS_CSUM_SIZE);
194         crypto_shash_final(shash, result);
195 }
196
197 /*
198  * we can't consider a given block up to date unless the transid of the
199  * block matches the transid in the parent node's pointer.  This is how we
200  * detect blocks that either didn't get written at all or got written
201  * in the wrong place.
202  */
203 static int verify_parent_transid(struct extent_io_tree *io_tree,
204                                  struct extent_buffer *eb, u64 parent_transid,
205                                  int atomic)
206 {
207         struct extent_state *cached_state = NULL;
208         int ret;
209
210         if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
211                 return 0;
212
213         if (atomic)
214                 return -EAGAIN;
215
216         lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
217                          &cached_state);
218         if (extent_buffer_uptodate(eb) &&
219             btrfs_header_generation(eb) == parent_transid) {
220                 ret = 0;
221                 goto out;
222         }
223         btrfs_err_rl(eb->fs_info,
224 "parent transid verify failed on logical %llu mirror %u wanted %llu found %llu",
225                         eb->start, eb->read_mirror,
226                         parent_transid, btrfs_header_generation(eb));
227         ret = 1;
228         clear_extent_buffer_uptodate(eb);
229 out:
230         unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
231                              &cached_state);
232         return ret;
233 }
234
235 static bool btrfs_supported_super_csum(u16 csum_type)
236 {
237         switch (csum_type) {
238         case BTRFS_CSUM_TYPE_CRC32:
239         case BTRFS_CSUM_TYPE_XXHASH:
240         case BTRFS_CSUM_TYPE_SHA256:
241         case BTRFS_CSUM_TYPE_BLAKE2:
242                 return true;
243         default:
244                 return false;
245         }
246 }
247
248 /*
249  * Return 0 if the superblock checksum type matches the checksum value of that
250  * algorithm. Pass the raw disk superblock data.
251  */
252 static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
253                                   char *raw_disk_sb)
254 {
255         struct btrfs_super_block *disk_sb =
256                 (struct btrfs_super_block *)raw_disk_sb;
257         char result[BTRFS_CSUM_SIZE];
258         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
259
260         shash->tfm = fs_info->csum_shash;
261
262         /*
263          * The super_block structure does not span the whole
264          * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is
265          * filled with zeros and is included in the checksum.
266          */
267         crypto_shash_digest(shash, raw_disk_sb + BTRFS_CSUM_SIZE,
268                             BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result);
269
270         if (memcmp(disk_sb->csum, result, fs_info->csum_size))
271                 return 1;
272
273         return 0;
274 }
275
276 int btrfs_verify_level_key(struct extent_buffer *eb, int level,
277                            struct btrfs_key *first_key, u64 parent_transid)
278 {
279         struct btrfs_fs_info *fs_info = eb->fs_info;
280         int found_level;
281         struct btrfs_key found_key;
282         int ret;
283
284         found_level = btrfs_header_level(eb);
285         if (found_level != level) {
286                 WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
287                      KERN_ERR "BTRFS: tree level check failed\n");
288                 btrfs_err(fs_info,
289 "tree level mismatch detected, bytenr=%llu level expected=%u has=%u",
290                           eb->start, level, found_level);
291                 return -EIO;
292         }
293
294         if (!first_key)
295                 return 0;
296
297         /*
298          * For live tree block (new tree blocks in current transaction),
299          * we need proper lock context to avoid race, which is impossible here.
300          * So we only checks tree blocks which is read from disk, whose
301          * generation <= fs_info->last_trans_committed.
302          */
303         if (btrfs_header_generation(eb) > fs_info->last_trans_committed)
304                 return 0;
305
306         /* We have @first_key, so this @eb must have at least one item */
307         if (btrfs_header_nritems(eb) == 0) {
308                 btrfs_err(fs_info,
309                 "invalid tree nritems, bytenr=%llu nritems=0 expect >0",
310                           eb->start);
311                 WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
312                 return -EUCLEAN;
313         }
314
315         if (found_level)
316                 btrfs_node_key_to_cpu(eb, &found_key, 0);
317         else
318                 btrfs_item_key_to_cpu(eb, &found_key, 0);
319         ret = btrfs_comp_cpu_keys(first_key, &found_key);
320
321         if (ret) {
322                 WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
323                      KERN_ERR "BTRFS: tree first key check failed\n");
324                 btrfs_err(fs_info,
325 "tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
326                           eb->start, parent_transid, first_key->objectid,
327                           first_key->type, first_key->offset,
328                           found_key.objectid, found_key.type,
329                           found_key.offset);
330         }
331         return ret;
332 }
333
334 /*
335  * helper to read a given tree block, doing retries as required when
336  * the checksums don't match and we have alternate mirrors to try.
337  *
338  * @parent_transid:     expected transid, skip check if 0
339  * @level:              expected level, mandatory check
340  * @first_key:          expected key of first slot, skip check if NULL
341  */
342 int btrfs_read_extent_buffer(struct extent_buffer *eb,
343                              u64 parent_transid, int level,
344                              struct btrfs_key *first_key)
345 {
346         struct btrfs_fs_info *fs_info = eb->fs_info;
347         struct extent_io_tree *io_tree;
348         int failed = 0;
349         int ret;
350         int num_copies = 0;
351         int mirror_num = 0;
352         int failed_mirror = 0;
353
354         io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
355         while (1) {
356                 clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
357                 ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num);
358                 if (!ret) {
359                         if (verify_parent_transid(io_tree, eb,
360                                                    parent_transid, 0))
361                                 ret = -EIO;
362                         else if (btrfs_verify_level_key(eb, level,
363                                                 first_key, parent_transid))
364                                 ret = -EUCLEAN;
365                         else
366                                 break;
367                 }
368
369                 num_copies = btrfs_num_copies(fs_info,
370                                               eb->start, eb->len);
371                 if (num_copies == 1)
372                         break;
373
374                 if (!failed_mirror) {
375                         failed = 1;
376                         failed_mirror = eb->read_mirror;
377                 }
378
379                 mirror_num++;
380                 if (mirror_num == failed_mirror)
381                         mirror_num++;
382
383                 if (mirror_num > num_copies)
384                         break;
385         }
386
387         if (failed && !ret && failed_mirror)
388                 btrfs_repair_eb_io_failure(eb, failed_mirror);
389
390         return ret;
391 }
392
393 static int csum_one_extent_buffer(struct extent_buffer *eb)
394 {
395         struct btrfs_fs_info *fs_info = eb->fs_info;
396         u8 result[BTRFS_CSUM_SIZE];
397         int ret;
398
399         ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
400                                     offsetof(struct btrfs_header, fsid),
401                                     BTRFS_FSID_SIZE) == 0);
402         csum_tree_block(eb, result);
403
404         if (btrfs_header_level(eb))
405                 ret = btrfs_check_node(eb);
406         else
407                 ret = btrfs_check_leaf_full(eb);
408
409         if (ret < 0)
410                 goto error;
411
412         /*
413          * Also check the generation, the eb reached here must be newer than
414          * last committed. Or something seriously wrong happened.
415          */
416         if (unlikely(btrfs_header_generation(eb) <= fs_info->last_trans_committed)) {
417                 ret = -EUCLEAN;
418                 btrfs_err(fs_info,
419                         "block=%llu bad generation, have %llu expect > %llu",
420                           eb->start, btrfs_header_generation(eb),
421                           fs_info->last_trans_committed);
422                 goto error;
423         }
424         write_extent_buffer(eb, result, 0, fs_info->csum_size);
425
426         return 0;
427
428 error:
429         btrfs_print_tree(eb, 0);
430         btrfs_err(fs_info, "block=%llu write time tree block corruption detected",
431                   eb->start);
432         WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
433         return ret;
434 }
435
436 /* Checksum all dirty extent buffers in one bio_vec */
437 static int csum_dirty_subpage_buffers(struct btrfs_fs_info *fs_info,
438                                       struct bio_vec *bvec)
439 {
440         struct page *page = bvec->bv_page;
441         u64 bvec_start = page_offset(page) + bvec->bv_offset;
442         u64 cur;
443         int ret = 0;
444
445         for (cur = bvec_start; cur < bvec_start + bvec->bv_len;
446              cur += fs_info->nodesize) {
447                 struct extent_buffer *eb;
448                 bool uptodate;
449
450                 eb = find_extent_buffer(fs_info, cur);
451                 uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur,
452                                                        fs_info->nodesize);
453
454                 /* A dirty eb shouldn't disappear from buffer_radix */
455                 if (WARN_ON(!eb))
456                         return -EUCLEAN;
457
458                 if (WARN_ON(cur != btrfs_header_bytenr(eb))) {
459                         free_extent_buffer(eb);
460                         return -EUCLEAN;
461                 }
462                 if (WARN_ON(!uptodate)) {
463                         free_extent_buffer(eb);
464                         return -EUCLEAN;
465                 }
466
467                 ret = csum_one_extent_buffer(eb);
468                 free_extent_buffer(eb);
469                 if (ret < 0)
470                         return ret;
471         }
472         return ret;
473 }
474
475 /*
476  * Checksum a dirty tree block before IO.  This has extra checks to make sure
477  * we only fill in the checksum field in the first page of a multi-page block.
478  * For subpage extent buffers we need bvec to also read the offset in the page.
479  */
480 static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec)
481 {
482         struct page *page = bvec->bv_page;
483         u64 start = page_offset(page);
484         u64 found_start;
485         struct extent_buffer *eb;
486
487         if (fs_info->nodesize < PAGE_SIZE)
488                 return csum_dirty_subpage_buffers(fs_info, bvec);
489
490         eb = (struct extent_buffer *)page->private;
491         if (page != eb->pages[0])
492                 return 0;
493
494         found_start = btrfs_header_bytenr(eb);
495
496         if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) {
497                 WARN_ON(found_start != 0);
498                 return 0;
499         }
500
501         /*
502          * Please do not consolidate these warnings into a single if.
503          * It is useful to know what went wrong.
504          */
505         if (WARN_ON(found_start != start))
506                 return -EUCLEAN;
507         if (WARN_ON(!PageUptodate(page)))
508                 return -EUCLEAN;
509
510         return csum_one_extent_buffer(eb);
511 }
512
513 static int check_tree_block_fsid(struct extent_buffer *eb)
514 {
515         struct btrfs_fs_info *fs_info = eb->fs_info;
516         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
517         u8 fsid[BTRFS_FSID_SIZE];
518         u8 *metadata_uuid;
519
520         read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid),
521                            BTRFS_FSID_SIZE);
522         /*
523          * Checking the incompat flag is only valid for the current fs. For
524          * seed devices it's forbidden to have their uuid changed so reading
525          * ->fsid in this case is fine
526          */
527         if (btrfs_fs_incompat(fs_info, METADATA_UUID))
528                 metadata_uuid = fs_devices->metadata_uuid;
529         else
530                 metadata_uuid = fs_devices->fsid;
531
532         if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE))
533                 return 0;
534
535         list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
536                 if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE))
537                         return 0;
538
539         return 1;
540 }
541
542 /* Do basic extent buffer checks at read time */
543 static int validate_extent_buffer(struct extent_buffer *eb)
544 {
545         struct btrfs_fs_info *fs_info = eb->fs_info;
546         u64 found_start;
547         const u32 csum_size = fs_info->csum_size;
548         u8 found_level;
549         u8 result[BTRFS_CSUM_SIZE];
550         const u8 *header_csum;
551         int ret = 0;
552
553         found_start = btrfs_header_bytenr(eb);
554         if (found_start != eb->start) {
555                 btrfs_err_rl(fs_info,
556                         "bad tree block start, mirror %u want %llu have %llu",
557                              eb->read_mirror, eb->start, found_start);
558                 ret = -EIO;
559                 goto out;
560         }
561         if (check_tree_block_fsid(eb)) {
562                 btrfs_err_rl(fs_info, "bad fsid on logical %llu mirror %u",
563                              eb->start, eb->read_mirror);
564                 ret = -EIO;
565                 goto out;
566         }
567         found_level = btrfs_header_level(eb);
568         if (found_level >= BTRFS_MAX_LEVEL) {
569                 btrfs_err(fs_info,
570                         "bad tree block level, mirror %u level %d on logical %llu",
571                         eb->read_mirror, btrfs_header_level(eb), eb->start);
572                 ret = -EIO;
573                 goto out;
574         }
575
576         csum_tree_block(eb, result);
577         header_csum = page_address(eb->pages[0]) +
578                 get_eb_offset_in_page(eb, offsetof(struct btrfs_header, csum));
579
580         if (memcmp(result, header_csum, csum_size) != 0) {
581                 btrfs_warn_rl(fs_info,
582 "checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d",
583                               eb->start, eb->read_mirror,
584                               CSUM_FMT_VALUE(csum_size, header_csum),
585                               CSUM_FMT_VALUE(csum_size, result),
586                               btrfs_header_level(eb));
587                 ret = -EUCLEAN;
588                 goto out;
589         }
590
591         /*
592          * If this is a leaf block and it is corrupt, set the corrupt bit so
593          * that we don't try and read the other copies of this block, just
594          * return -EIO.
595          */
596         if (found_level == 0 && btrfs_check_leaf_full(eb)) {
597                 set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
598                 ret = -EIO;
599         }
600
601         if (found_level > 0 && btrfs_check_node(eb))
602                 ret = -EIO;
603
604         if (!ret)
605                 set_extent_buffer_uptodate(eb);
606         else
607                 btrfs_err(fs_info,
608                 "read time tree block corruption detected on logical %llu mirror %u",
609                           eb->start, eb->read_mirror);
610 out:
611         return ret;
612 }
613
614 static int validate_subpage_buffer(struct page *page, u64 start, u64 end,
615                                    int mirror)
616 {
617         struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
618         struct extent_buffer *eb;
619         bool reads_done;
620         int ret = 0;
621
622         /*
623          * We don't allow bio merge for subpage metadata read, so we should
624          * only get one eb for each endio hook.
625          */
626         ASSERT(end == start + fs_info->nodesize - 1);
627         ASSERT(PagePrivate(page));
628
629         eb = find_extent_buffer(fs_info, start);
630         /*
631          * When we are reading one tree block, eb must have been inserted into
632          * the radix tree. If not, something is wrong.
633          */
634         ASSERT(eb);
635
636         reads_done = atomic_dec_and_test(&eb->io_pages);
637         /* Subpage read must finish in page read */
638         ASSERT(reads_done);
639
640         eb->read_mirror = mirror;
641         if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
642                 ret = -EIO;
643                 goto err;
644         }
645         ret = validate_extent_buffer(eb);
646         if (ret < 0)
647                 goto err;
648
649         set_extent_buffer_uptodate(eb);
650
651         free_extent_buffer(eb);
652         return ret;
653 err:
654         /*
655          * end_bio_extent_readpage decrements io_pages in case of error,
656          * make sure it has something to decrement.
657          */
658         atomic_inc(&eb->io_pages);
659         clear_extent_buffer_uptodate(eb);
660         free_extent_buffer(eb);
661         return ret;
662 }
663
664 int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
665                                    struct page *page, u64 start, u64 end,
666                                    int mirror)
667 {
668         struct extent_buffer *eb;
669         int ret = 0;
670         int reads_done;
671
672         ASSERT(page->private);
673
674         if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
675                 return validate_subpage_buffer(page, start, end, mirror);
676
677         eb = (struct extent_buffer *)page->private;
678
679         /*
680          * The pending IO might have been the only thing that kept this buffer
681          * in memory.  Make sure we have a ref for all this other checks
682          */
683         atomic_inc(&eb->refs);
684
685         reads_done = atomic_dec_and_test(&eb->io_pages);
686         if (!reads_done)
687                 goto err;
688
689         eb->read_mirror = mirror;
690         if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
691                 ret = -EIO;
692                 goto err;
693         }
694         ret = validate_extent_buffer(eb);
695 err:
696         if (ret) {
697                 /*
698                  * our io error hook is going to dec the io pages
699                  * again, we have to make sure it has something
700                  * to decrement
701                  */
702                 atomic_inc(&eb->io_pages);
703                 clear_extent_buffer_uptodate(eb);
704         }
705         free_extent_buffer(eb);
706
707         return ret;
708 }
709
710 static void run_one_async_start(struct btrfs_work *work)
711 {
712         struct async_submit_bio *async;
713         blk_status_t ret;
714
715         async = container_of(work, struct  async_submit_bio, work);
716         ret = async->submit_bio_start(async->inode, async->bio,
717                                       async->dio_file_offset);
718         if (ret)
719                 async->status = ret;
720 }
721
722 /*
723  * In order to insert checksums into the metadata in large chunks, we wait
724  * until bio submission time.   All the pages in the bio are checksummed and
725  * sums are attached onto the ordered extent record.
726  *
727  * At IO completion time the csums attached on the ordered extent record are
728  * inserted into the tree.
729  */
730 static void run_one_async_done(struct btrfs_work *work)
731 {
732         struct async_submit_bio *async;
733         struct inode *inode;
734
735         async = container_of(work, struct  async_submit_bio, work);
736         inode = async->inode;
737
738         /* If an error occurred we just want to clean up the bio and move on */
739         if (async->status) {
740                 async->bio->bi_status = async->status;
741                 bio_endio(async->bio);
742                 return;
743         }
744
745         /*
746          * All of the bios that pass through here are from async helpers.
747          * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context.
748          * This changes nothing when cgroups aren't in use.
749          */
750         async->bio->bi_opf |= REQ_CGROUP_PUNT;
751         btrfs_submit_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num);
752 }
753
754 static void run_one_async_free(struct btrfs_work *work)
755 {
756         struct async_submit_bio *async;
757
758         async = container_of(work, struct  async_submit_bio, work);
759         kfree(async);
760 }
761
762 /*
763  * Submit bio to an async queue.
764  *
765  * Retrun:
766  * - true if the work has been succesfuly submitted
767  * - false in case of error
768  */
769 bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num,
770                          u64 dio_file_offset,
771                          extent_submit_bio_start_t *submit_bio_start)
772 {
773         struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
774         struct async_submit_bio *async;
775
776         async = kmalloc(sizeof(*async), GFP_NOFS);
777         if (!async)
778                 return false;
779
780         async->inode = inode;
781         async->bio = bio;
782         async->mirror_num = mirror_num;
783         async->submit_bio_start = submit_bio_start;
784
785         btrfs_init_work(&async->work, run_one_async_start, run_one_async_done,
786                         run_one_async_free);
787
788         async->dio_file_offset = dio_file_offset;
789
790         async->status = 0;
791
792         if (op_is_sync(bio->bi_opf))
793                 btrfs_queue_work(fs_info->hipri_workers, &async->work);
794         else
795                 btrfs_queue_work(fs_info->workers, &async->work);
796         return true;
797 }
798
799 static blk_status_t btree_csum_one_bio(struct bio *bio)
800 {
801         struct bio_vec *bvec;
802         struct btrfs_root *root;
803         int ret = 0;
804         struct bvec_iter_all iter_all;
805
806         ASSERT(!bio_flagged(bio, BIO_CLONED));
807         bio_for_each_segment_all(bvec, bio, iter_all) {
808                 root = BTRFS_I(bvec->bv_page->mapping->host)->root;
809                 ret = csum_dirty_buffer(root->fs_info, bvec);
810                 if (ret)
811                         break;
812         }
813
814         return errno_to_blk_status(ret);
815 }
816
/*
 * submit_bio_start hook passed to btrfs_wq_submit_bio() for metadata
 * writes: checksum the tree blocks in @bio.  @inode and @dio_file_offset
 * are not used here.
 */
static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio,
					   u64 dio_file_offset)
{
	/*
	 * when we're called for a write, we're already in the async
	 * submission context.  Only the checksumming happens here; the bio
	 * itself is submitted afterwards by run_one_async_done().
	 */
	return btree_csum_one_bio(bio);
}
826
827 static bool should_async_write(struct btrfs_fs_info *fs_info,
828                              struct btrfs_inode *bi)
829 {
830         if (btrfs_is_zoned(fs_info))
831                 return false;
832         if (atomic_read(&bi->sync_writers))
833                 return false;
834         if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
835                 return false;
836         return true;
837 }
838
839 void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num)
840 {
841         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
842         blk_status_t ret;
843
844         bio->bi_opf |= REQ_META;
845
846         if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
847                 btrfs_submit_bio(fs_info, bio, mirror_num);
848                 return;
849         }
850
851         /*
852          * Kthread helpers are used to submit writes so that checksumming can
853          * happen in parallel across all CPUs.
854          */
855         if (should_async_write(fs_info, BTRFS_I(inode)) &&
856             btrfs_wq_submit_bio(inode, bio, mirror_num, 0, btree_submit_bio_start))
857                 return;
858
859         ret = btree_csum_one_bio(bio);
860         if (ret) {
861                 bio->bi_status = ret;
862                 bio_endio(bio);
863                 return;
864         }
865
866         btrfs_submit_bio(fs_info, bio, mirror_num);
867 }
868
#ifdef CONFIG_MIGRATION
/*
 * Migrate a btree folio from @src to @dst.
 *
 * Returns -EAGAIN when @src is dirty or when its private data cannot be
 * released; otherwise returns whatever migrate_folio() returns.
 */
static int btree_migrate_folio(struct address_space *mapping,
		struct folio *dst, struct folio *src, enum migrate_mode mode)
{
	/*
	 * We can't safely write a btree page from here, we haven't done the
	 * locking hook.
	 */
	if (folio_test_dirty(src))
		return -EAGAIN;

	/*
	 * Buffers may be managed in a filesystem specific way.  We must have
	 * no buffers or drop them.
	 */
	if (folio_get_private(src)) {
		if (!filemap_release_folio(src, GFP_KERNEL))
			return -EAGAIN;
	}

	return migrate_folio(mapping, dst, src, mode);
}
#else
#define btree_migrate_folio NULL
#endif
891
892 static int btree_writepages(struct address_space *mapping,
893                             struct writeback_control *wbc)
894 {
895         struct btrfs_fs_info *fs_info;
896         int ret;
897
898         if (wbc->sync_mode == WB_SYNC_NONE) {
899
900                 if (wbc->for_kupdate)
901                         return 0;
902
903                 fs_info = BTRFS_I(mapping->host)->root->fs_info;
904                 /* this is a bit racy, but that's ok */
905                 ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
906                                              BTRFS_DIRTY_METADATA_THRESH,
907                                              fs_info->dirty_metadata_batch);
908                 if (ret < 0)
909                         return 0;
910         }
911         return btree_write_cache_pages(mapping, wbc);
912 }
913
914 static bool btree_release_folio(struct folio *folio, gfp_t gfp_flags)
915 {
916         if (folio_test_writeback(folio) || folio_test_dirty(folio))
917                 return false;
918
919         return try_release_extent_buffer(&folio->page);
920 }
921
922 static void btree_invalidate_folio(struct folio *folio, size_t offset,
923                                  size_t length)
924 {
925         struct extent_io_tree *tree;
926         tree = &BTRFS_I(folio->mapping->host)->io_tree;
927         extent_invalidate_folio(tree, folio, offset);
928         btree_release_folio(folio, GFP_NOFS);
929         if (folio_get_private(folio)) {
930                 btrfs_warn(BTRFS_I(folio->mapping->host)->root->fs_info,
931                            "folio private not zero on folio %llu",
932                            (unsigned long long)folio_pos(folio));
933                 folio_detach_private(folio);
934         }
935 }
936
#ifdef DEBUG
/*
 * Debug-only dirty_folio callback: sanity check the extent buffer(s) backing
 * @folio before handing over to filemap_dirty_folio().
 *
 * Without DEBUG, btree_dirty_folio is simply filemap_dirty_folio.
 */
static bool btree_dirty_folio(struct address_space *mapping,
		struct folio *folio)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb);
	u64 page_start = folio_pos(folio);
	struct btrfs_subpage *subpage;
	struct extent_buffer *eb;
	int cur_bit;

	/* sectorsize == PAGE_SIZE: folio private is the extent buffer itself. */
	if (fs_info->sectorsize == PAGE_SIZE) {
		eb = folio_get_private(folio);
		BUG_ON(!eb);
		BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
		BUG_ON(!atomic_read(&eb->refs));
		btrfs_assert_tree_write_locked(eb);
		return filemap_dirty_folio(mapping, folio);
	}

	/* Otherwise folio private is the subpage structure. */
	subpage = folio_get_private(folio);
	ASSERT(subpage->dirty_bitmap);

	for (cur_bit = 0; cur_bit < BTRFS_SUBPAGE_BITMAP_SIZE; ) {
		unsigned long flags;
		bool is_dirty;
		u64 cur;
		u16 mask = (1 << cur_bit);

		/* Sample the dirty bit under the subpage lock. */
		spin_lock_irqsave(&subpage->lock, flags);
		is_dirty = mask & subpage->dirty_bitmap;
		spin_unlock_irqrestore(&subpage->lock, flags);

		if (!is_dirty) {
			cur_bit++;
			continue;
		}

		cur = page_start + cur_bit * fs_info->sectorsize;

		eb = find_extent_buffer(fs_info, cur);
		ASSERT(eb);
		ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
		ASSERT(atomic_read(&eb->refs));
		btrfs_assert_tree_write_locked(eb);
		free_extent_buffer(eb);

		/* Step over all the sectors covered by one node. */
		cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits);
	}
	return filemap_dirty_folio(mapping, folio);
}
#else
#define btree_dirty_folio filemap_dirty_folio
#endif
986
987 static const struct address_space_operations btree_aops = {
988         .writepages     = btree_writepages,
989         .release_folio  = btree_release_folio,
990         .invalidate_folio = btree_invalidate_folio,
991         .migrate_folio  = btree_migrate_folio,
992         .dirty_folio    = btree_dirty_folio,
993 };
994
995 struct extent_buffer *btrfs_find_create_tree_block(
996                                                 struct btrfs_fs_info *fs_info,
997                                                 u64 bytenr, u64 owner_root,
998                                                 int level)
999 {
1000         if (btrfs_is_testing(fs_info))
1001                 return alloc_test_extent_buffer(fs_info, bytenr);
1002         return alloc_extent_buffer(fs_info, bytenr, owner_root, level);
1003 }
1004
1005 /*
1006  * Read tree block at logical address @bytenr and do variant basic but critical
1007  * verification.
1008  *
1009  * @owner_root:         the objectid of the root owner for this block.
1010  * @parent_transid:     expected transid of this tree block, skip check if 0
1011  * @level:              expected level, mandatory check
1012  * @first_key:          expected key in slot 0, skip check if NULL
1013  */
1014 struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
1015                                       u64 owner_root, u64 parent_transid,
1016                                       int level, struct btrfs_key *first_key)
1017 {
1018         struct extent_buffer *buf = NULL;
1019         int ret;
1020
1021         buf = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
1022         if (IS_ERR(buf))
1023                 return buf;
1024
1025         ret = btrfs_read_extent_buffer(buf, parent_transid, level, first_key);
1026         if (ret) {
1027                 free_extent_buffer_stale(buf);
1028                 return ERR_PTR(ret);
1029         }
1030         if (btrfs_check_eb_owner(buf, owner_root)) {
1031                 free_extent_buffer_stale(buf);
1032                 return ERR_PTR(-EUCLEAN);
1033         }
1034         return buf;
1035
1036 }
1037
1038 void btrfs_clean_tree_block(struct extent_buffer *buf)
1039 {
1040         struct btrfs_fs_info *fs_info = buf->fs_info;
1041         if (btrfs_header_generation(buf) ==
1042             fs_info->running_transaction->transid) {
1043                 btrfs_assert_tree_write_locked(buf);
1044
1045                 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
1046                         percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
1047                                                  -buf->len,
1048                                                  fs_info->dirty_metadata_batch);
1049                         clear_extent_buffer_dirty(buf);
1050                 }
1051         }
1052 }
1053
1054 static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
1055                          u64 objectid)
1056 {
1057         bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
1058
1059         memset(&root->root_key, 0, sizeof(root->root_key));
1060         memset(&root->root_item, 0, sizeof(root->root_item));
1061         memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
1062         root->fs_info = fs_info;
1063         root->root_key.objectid = objectid;
1064         root->node = NULL;
1065         root->commit_root = NULL;
1066         root->state = 0;
1067         RB_CLEAR_NODE(&root->rb_node);
1068
1069         root->last_trans = 0;
1070         root->free_objectid = 0;
1071         root->nr_delalloc_inodes = 0;
1072         root->nr_ordered_extents = 0;
1073         root->inode_tree = RB_ROOT;
1074         INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
1075
1076         btrfs_init_root_block_rsv(root);
1077
1078         INIT_LIST_HEAD(&root->dirty_list);
1079         INIT_LIST_HEAD(&root->root_list);
1080         INIT_LIST_HEAD(&root->delalloc_inodes);
1081         INIT_LIST_HEAD(&root->delalloc_root);
1082         INIT_LIST_HEAD(&root->ordered_extents);
1083         INIT_LIST_HEAD(&root->ordered_root);
1084         INIT_LIST_HEAD(&root->reloc_dirty_list);
1085         INIT_LIST_HEAD(&root->logged_list[0]);
1086         INIT_LIST_HEAD(&root->logged_list[1]);
1087         spin_lock_init(&root->inode_lock);
1088         spin_lock_init(&root->delalloc_lock);
1089         spin_lock_init(&root->ordered_extent_lock);
1090         spin_lock_init(&root->accounting_lock);
1091         spin_lock_init(&root->log_extents_lock[0]);
1092         spin_lock_init(&root->log_extents_lock[1]);
1093         spin_lock_init(&root->qgroup_meta_rsv_lock);
1094         mutex_init(&root->objectid_mutex);
1095         mutex_init(&root->log_mutex);
1096         mutex_init(&root->ordered_extent_mutex);
1097         mutex_init(&root->delalloc_mutex);
1098         init_waitqueue_head(&root->qgroup_flush_wait);
1099         init_waitqueue_head(&root->log_writer_wait);
1100         init_waitqueue_head(&root->log_commit_wait[0]);
1101         init_waitqueue_head(&root->log_commit_wait[1]);
1102         INIT_LIST_HEAD(&root->log_ctxs[0]);
1103         INIT_LIST_HEAD(&root->log_ctxs[1]);
1104         atomic_set(&root->log_commit[0], 0);
1105         atomic_set(&root->log_commit[1], 0);
1106         atomic_set(&root->log_writers, 0);
1107         atomic_set(&root->log_batch, 0);
1108         refcount_set(&root->refs, 1);
1109         atomic_set(&root->snapshot_force_cow, 0);
1110         atomic_set(&root->nr_swapfiles, 0);
1111         root->log_transid = 0;
1112         root->log_transid_committed = -1;
1113         root->last_log_commit = 0;
1114         root->anon_dev = 0;
1115         if (!dummy) {
1116                 extent_io_tree_init(fs_info, &root->dirty_log_pages,
1117                                     IO_TREE_ROOT_DIRTY_LOG_PAGES, NULL);
1118                 extent_io_tree_init(fs_info, &root->log_csum_range,
1119                                     IO_TREE_LOG_CSUM_RANGE, NULL);
1120         }
1121
1122         spin_lock_init(&root->root_item_lock);
1123         btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
1124 #ifdef CONFIG_BTRFS_DEBUG
1125         INIT_LIST_HEAD(&root->leak_list);
1126         spin_lock(&fs_info->fs_roots_radix_lock);
1127         list_add_tail(&root->leak_list, &fs_info->allocated_roots);
1128         spin_unlock(&fs_info->fs_roots_radix_lock);
1129 #endif
1130 }
1131
1132 static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
1133                                            u64 objectid, gfp_t flags)
1134 {
1135         struct btrfs_root *root = kzalloc(sizeof(*root), flags);
1136         if (root)
1137                 __setup_root(root, fs_info, objectid);
1138         return root;
1139 }
1140
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
 * Should only be used by the testing infrastructure.
 *
 * Return: a root with objectid BTRFS_ROOT_TREE_OBJECTID, ERR_PTR(-EINVAL)
 * when @fs_info is NULL, or ERR_PTR(-ENOMEM) when allocation fails.
 */
struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *dummy_root;

	if (!fs_info)
		return ERR_PTR(-EINVAL);

	dummy_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID,
				      GFP_KERNEL);
	if (!dummy_root)
		return ERR_PTR(-ENOMEM);

	/* Start the dummy root's alloc_bytenr at 0. */
	dummy_root->alloc_bytenr = 0;

	return dummy_root;
}
#endif
1160
1161 static int global_root_cmp(struct rb_node *a_node, const struct rb_node *b_node)
1162 {
1163         const struct btrfs_root *a = rb_entry(a_node, struct btrfs_root, rb_node);
1164         const struct btrfs_root *b = rb_entry(b_node, struct btrfs_root, rb_node);
1165
1166         return btrfs_comp_cpu_keys(&a->root_key, &b->root_key);
1167 }
1168
1169 static int global_root_key_cmp(const void *k, const struct rb_node *node)
1170 {
1171         const struct btrfs_key *key = k;
1172         const struct btrfs_root *root = rb_entry(node, struct btrfs_root, rb_node);
1173
1174         return btrfs_comp_cpu_keys(key, &root->root_key);
1175 }
1176
1177 int btrfs_global_root_insert(struct btrfs_root *root)
1178 {
1179         struct btrfs_fs_info *fs_info = root->fs_info;
1180         struct rb_node *tmp;
1181
1182         write_lock(&fs_info->global_root_lock);
1183         tmp = rb_find_add(&root->rb_node, &fs_info->global_root_tree, global_root_cmp);
1184         write_unlock(&fs_info->global_root_lock);
1185         ASSERT(!tmp);
1186
1187         return tmp ? -EEXIST : 0;
1188 }
1189
1190 void btrfs_global_root_delete(struct btrfs_root *root)
1191 {
1192         struct btrfs_fs_info *fs_info = root->fs_info;
1193
1194         write_lock(&fs_info->global_root_lock);
1195         rb_erase(&root->rb_node, &fs_info->global_root_tree);
1196         write_unlock(&fs_info->global_root_lock);
1197 }
1198
1199 struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info,
1200                                      struct btrfs_key *key)
1201 {
1202         struct rb_node *node;
1203         struct btrfs_root *root = NULL;
1204
1205         read_lock(&fs_info->global_root_lock);
1206         node = rb_find(key, &fs_info->global_root_tree, global_root_key_cmp);
1207         if (node)
1208                 root = container_of(node, struct btrfs_root, rb_node);
1209         read_unlock(&fs_info->global_root_lock);
1210
1211         return root;
1212 }
1213
1214 static u64 btrfs_global_root_id(struct btrfs_fs_info *fs_info, u64 bytenr)
1215 {
1216         struct btrfs_block_group *block_group;
1217         u64 ret;
1218
1219         if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
1220                 return 0;
1221
1222         if (bytenr)
1223                 block_group = btrfs_lookup_block_group(fs_info, bytenr);
1224         else
1225                 block_group = btrfs_lookup_first_block_group(fs_info, bytenr);
1226         ASSERT(block_group);
1227         if (!block_group)
1228                 return 0;
1229         ret = block_group->global_root_id;
1230         btrfs_put_block_group(block_group);
1231
1232         return ret;
1233 }
1234
1235 struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr)
1236 {
1237         struct btrfs_key key = {
1238                 .objectid = BTRFS_CSUM_TREE_OBJECTID,
1239                 .type = BTRFS_ROOT_ITEM_KEY,
1240                 .offset = btrfs_global_root_id(fs_info, bytenr),
1241         };
1242
1243         return btrfs_global_root(fs_info, &key);
1244 }
1245
1246 struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr)
1247 {
1248         struct btrfs_key key = {
1249                 .objectid = BTRFS_EXTENT_TREE_OBJECTID,
1250                 .type = BTRFS_ROOT_ITEM_KEY,
1251                 .offset = btrfs_global_root_id(fs_info, bytenr),
1252         };
1253
1254         return btrfs_global_root(fs_info, &key);
1255 }
1256
1257 struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
1258                                      u64 objectid)
1259 {
1260         struct btrfs_fs_info *fs_info = trans->fs_info;
1261         struct extent_buffer *leaf;
1262         struct btrfs_root *tree_root = fs_info->tree_root;
1263         struct btrfs_root *root;
1264         struct btrfs_key key;
1265         unsigned int nofs_flag;
1266         int ret = 0;
1267
1268         /*
1269          * We're holding a transaction handle, so use a NOFS memory allocation
1270          * context to avoid deadlock if reclaim happens.
1271          */
1272         nofs_flag = memalloc_nofs_save();
1273         root = btrfs_alloc_root(fs_info, objectid, GFP_KERNEL);
1274         memalloc_nofs_restore(nofs_flag);
1275         if (!root)
1276                 return ERR_PTR(-ENOMEM);
1277
1278         root->root_key.objectid = objectid;
1279         root->root_key.type = BTRFS_ROOT_ITEM_KEY;
1280         root->root_key.offset = 0;
1281
1282         leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
1283                                       BTRFS_NESTING_NORMAL);
1284         if (IS_ERR(leaf)) {
1285                 ret = PTR_ERR(leaf);
1286                 leaf = NULL;
1287                 goto fail_unlock;
1288         }
1289
1290         root->node = leaf;
1291         btrfs_mark_buffer_dirty(leaf);
1292
1293         root->commit_root = btrfs_root_node(root);
1294         set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
1295
1296         btrfs_set_root_flags(&root->root_item, 0);
1297         btrfs_set_root_limit(&root->root_item, 0);
1298         btrfs_set_root_bytenr(&root->root_item, leaf->start);
1299         btrfs_set_root_generation(&root->root_item, trans->transid);
1300         btrfs_set_root_level(&root->root_item, 0);
1301         btrfs_set_root_refs(&root->root_item, 1);
1302         btrfs_set_root_used(&root->root_item, leaf->len);
1303         btrfs_set_root_last_snapshot(&root->root_item, 0);
1304         btrfs_set_root_dirid(&root->root_item, 0);
1305         if (is_fstree(objectid))
1306                 generate_random_guid(root->root_item.uuid);
1307         else
1308                 export_guid(root->root_item.uuid, &guid_null);
1309         btrfs_set_root_drop_level(&root->root_item, 0);
1310
1311         btrfs_tree_unlock(leaf);
1312
1313         key.objectid = objectid;
1314         key.type = BTRFS_ROOT_ITEM_KEY;
1315         key.offset = 0;
1316         ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item);
1317         if (ret)
1318                 goto fail;
1319
1320         return root;
1321
1322 fail_unlock:
1323         if (leaf)
1324                 btrfs_tree_unlock(leaf);
1325 fail:
1326         btrfs_put_root(root);
1327
1328         return ERR_PTR(ret);
1329 }
1330
1331 static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1332                                          struct btrfs_fs_info *fs_info)
1333 {
1334         struct btrfs_root *root;
1335
1336         root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS);
1337         if (!root)
1338                 return ERR_PTR(-ENOMEM);
1339
1340         root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
1341         root->root_key.type = BTRFS_ROOT_ITEM_KEY;
1342         root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
1343
1344         return root;
1345 }
1346
1347 int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
1348                               struct btrfs_root *root)
1349 {
1350         struct extent_buffer *leaf;
1351
1352         /*
1353          * DON'T set SHAREABLE bit for log trees.
1354          *
1355          * Log trees are not exposed to user space thus can't be snapshotted,
1356          * and they go away before a real commit is actually done.
1357          *
1358          * They do store pointers to file data extents, and those reference
1359          * counts still get updated (along with back refs to the log tree).
1360          */
1361
1362         leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
1363                         NULL, 0, 0, 0, BTRFS_NESTING_NORMAL);
1364         if (IS_ERR(leaf))
1365                 return PTR_ERR(leaf);
1366
1367         root->node = leaf;
1368
1369         btrfs_mark_buffer_dirty(root->node);
1370         btrfs_tree_unlock(root->node);
1371
1372         return 0;
1373 }
1374
1375 int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
1376                              struct btrfs_fs_info *fs_info)
1377 {
1378         struct btrfs_root *log_root;
1379
1380         log_root = alloc_log_tree(trans, fs_info);
1381         if (IS_ERR(log_root))
1382                 return PTR_ERR(log_root);
1383
1384         if (!btrfs_is_zoned(fs_info)) {
1385                 int ret = btrfs_alloc_log_tree_node(trans, log_root);
1386
1387                 if (ret) {
1388                         btrfs_put_root(log_root);
1389                         return ret;
1390                 }
1391         }
1392
1393         WARN_ON(fs_info->log_root_tree);
1394         fs_info->log_root_tree = log_root;
1395         return 0;
1396 }
1397
1398 int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
1399                        struct btrfs_root *root)
1400 {
1401         struct btrfs_fs_info *fs_info = root->fs_info;
1402         struct btrfs_root *log_root;
1403         struct btrfs_inode_item *inode_item;
1404         int ret;
1405
1406         log_root = alloc_log_tree(trans, fs_info);
1407         if (IS_ERR(log_root))
1408                 return PTR_ERR(log_root);
1409
1410         ret = btrfs_alloc_log_tree_node(trans, log_root);
1411         if (ret) {
1412                 btrfs_put_root(log_root);
1413                 return ret;
1414         }
1415
1416         log_root->last_trans = trans->transid;
1417         log_root->root_key.offset = root->root_key.objectid;
1418
1419         inode_item = &log_root->root_item.inode;
1420         btrfs_set_stack_inode_generation(inode_item, 1);
1421         btrfs_set_stack_inode_size(inode_item, 3);
1422         btrfs_set_stack_inode_nlink(inode_item, 1);
1423         btrfs_set_stack_inode_nbytes(inode_item,
1424                                      fs_info->nodesize);
1425         btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
1426
1427         btrfs_set_root_node(&log_root->root_item, log_root->node);
1428
1429         WARN_ON(root->log_root);
1430         root->log_root = log_root;
1431         root->log_transid = 0;
1432         root->log_transid_committed = -1;
1433         root->last_log_commit = 0;
1434         return 0;
1435 }
1436
1437 static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
1438                                               struct btrfs_path *path,
1439                                               struct btrfs_key *key)
1440 {
1441         struct btrfs_root *root;
1442         struct btrfs_fs_info *fs_info = tree_root->fs_info;
1443         u64 generation;
1444         int ret;
1445         int level;
1446
1447         root = btrfs_alloc_root(fs_info, key->objectid, GFP_NOFS);
1448         if (!root)
1449                 return ERR_PTR(-ENOMEM);
1450
1451         ret = btrfs_find_root(tree_root, key, path,
1452                               &root->root_item, &root->root_key);
1453         if (ret) {
1454                 if (ret > 0)
1455                         ret = -ENOENT;
1456                 goto fail;
1457         }
1458
1459         generation = btrfs_root_generation(&root->root_item);
1460         level = btrfs_root_level(&root->root_item);
1461         root->node = read_tree_block(fs_info,
1462                                      btrfs_root_bytenr(&root->root_item),
1463                                      key->objectid, generation, level, NULL);
1464         if (IS_ERR(root->node)) {
1465                 ret = PTR_ERR(root->node);
1466                 root->node = NULL;
1467                 goto fail;
1468         }
1469         if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
1470                 ret = -EIO;
1471                 goto fail;
1472         }
1473
1474         /*
1475          * For real fs, and not log/reloc trees, root owner must
1476          * match its root node owner
1477          */
1478         if (!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state) &&
1479             root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
1480             root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1481             root->root_key.objectid != btrfs_header_owner(root->node)) {
1482                 btrfs_crit(fs_info,
1483 "root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu",
1484                            root->root_key.objectid, root->node->start,
1485                            btrfs_header_owner(root->node),
1486                            root->root_key.objectid);
1487                 ret = -EUCLEAN;
1488                 goto fail;
1489         }
1490         root->commit_root = btrfs_root_node(root);
1491         return root;
1492 fail:
1493         btrfs_put_root(root);
1494         return ERR_PTR(ret);
1495 }
1496
1497 struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
1498                                         struct btrfs_key *key)
1499 {
1500         struct btrfs_root *root;
1501         struct btrfs_path *path;
1502
1503         path = btrfs_alloc_path();
1504         if (!path)
1505                 return ERR_PTR(-ENOMEM);
1506         root = read_tree_root_path(tree_root, path, key);
1507         btrfs_free_path(path);
1508
1509         return root;
1510 }
1511
1512 /*
1513  * Initialize subvolume root in-memory structure
1514  *
1515  * @anon_dev:   anonymous device to attach to the root, if zero, allocate new
1516  */
1517 static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
1518 {
1519         int ret;
1520         unsigned int nofs_flag;
1521
1522         /*
1523          * We might be called under a transaction (e.g. indirect backref
1524          * resolution) which could deadlock if it triggers memory reclaim
1525          */
1526         nofs_flag = memalloc_nofs_save();
1527         ret = btrfs_drew_lock_init(&root->snapshot_lock);
1528         memalloc_nofs_restore(nofs_flag);
1529         if (ret)
1530                 goto fail;
1531
1532         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
1533             !btrfs_is_data_reloc_root(root)) {
1534                 set_bit(BTRFS_ROOT_SHAREABLE, &root->state);
1535                 btrfs_check_and_init_root_item(&root->root_item);
1536         }
1537
1538         /*
1539          * Don't assign anonymous block device to roots that are not exposed to
1540          * userspace, the id pool is limited to 1M
1541          */
1542         if (is_fstree(root->root_key.objectid) &&
1543             btrfs_root_refs(&root->root_item) > 0) {
1544                 if (!anon_dev) {
1545                         ret = get_anon_bdev(&root->anon_dev);
1546                         if (ret)
1547                                 goto fail;
1548                 } else {
1549                         root->anon_dev = anon_dev;
1550                 }
1551         }
1552
1553         mutex_lock(&root->objectid_mutex);
1554         ret = btrfs_init_root_free_objectid(root);
1555         if (ret) {
1556                 mutex_unlock(&root->objectid_mutex);
1557                 goto fail;
1558         }
1559
1560         ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
1561
1562         mutex_unlock(&root->objectid_mutex);
1563
1564         return 0;
1565 fail:
1566         /* The caller is responsible to call btrfs_free_fs_root */
1567         return ret;
1568 }
1569
1570 static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
1571                                                u64 root_id)
1572 {
1573         struct btrfs_root *root;
1574
1575         spin_lock(&fs_info->fs_roots_radix_lock);
1576         root = radix_tree_lookup(&fs_info->fs_roots_radix,
1577                                  (unsigned long)root_id);
1578         if (root)
1579                 root = btrfs_grab_root(root);
1580         spin_unlock(&fs_info->fs_roots_radix_lock);
1581         return root;
1582 }
1583
1584 static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
1585                                                 u64 objectid)
1586 {
1587         struct btrfs_key key = {
1588                 .objectid = objectid,
1589                 .type = BTRFS_ROOT_ITEM_KEY,
1590                 .offset = 0,
1591         };
1592
1593         if (objectid == BTRFS_ROOT_TREE_OBJECTID)
1594                 return btrfs_grab_root(fs_info->tree_root);
1595         if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
1596                 return btrfs_grab_root(btrfs_global_root(fs_info, &key));
1597         if (objectid == BTRFS_CHUNK_TREE_OBJECTID)
1598                 return btrfs_grab_root(fs_info->chunk_root);
1599         if (objectid == BTRFS_DEV_TREE_OBJECTID)
1600                 return btrfs_grab_root(fs_info->dev_root);
1601         if (objectid == BTRFS_CSUM_TREE_OBJECTID)
1602                 return btrfs_grab_root(btrfs_global_root(fs_info, &key));
1603         if (objectid == BTRFS_QUOTA_TREE_OBJECTID)
1604                 return btrfs_grab_root(fs_info->quota_root) ?
1605                         fs_info->quota_root : ERR_PTR(-ENOENT);
1606         if (objectid == BTRFS_UUID_TREE_OBJECTID)
1607                 return btrfs_grab_root(fs_info->uuid_root) ?
1608                         fs_info->uuid_root : ERR_PTR(-ENOENT);
1609         if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) {
1610                 struct btrfs_root *root = btrfs_global_root(fs_info, &key);
1611
1612                 return btrfs_grab_root(root) ? root : ERR_PTR(-ENOENT);
1613         }
1614         return NULL;
1615 }
1616
1617 int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
1618                          struct btrfs_root *root)
1619 {
1620         int ret;
1621
1622         ret = radix_tree_preload(GFP_NOFS);
1623         if (ret)
1624                 return ret;
1625
1626         spin_lock(&fs_info->fs_roots_radix_lock);
1627         ret = radix_tree_insert(&fs_info->fs_roots_radix,
1628                                 (unsigned long)root->root_key.objectid,
1629                                 root);
1630         if (ret == 0) {
1631                 btrfs_grab_root(root);
1632                 set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
1633         }
1634         spin_unlock(&fs_info->fs_roots_radix_lock);
1635         radix_tree_preload_end();
1636
1637         return ret;
1638 }
1639
/*
 * Report and release every root still linked on fs_info->allocated_roots.
 *
 * The body only exists when CONFIG_BTRFS_DEBUG is set; otherwise this is a
 * no-op.  For each remaining root an error is logged with its name and
 * reference count, then references are dropped until the root is freed.
 */
void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info)
{
#ifdef CONFIG_BTRFS_DEBUG
	struct list_head *leaked = &fs_info->allocated_roots;

	while (!list_empty(leaked)) {
		char name[BTRFS_ROOT_NAME_BUF_LEN];
		struct btrfs_root *root;

		root = list_first_entry(leaked, struct btrfs_root, leak_list);
		btrfs_err(fs_info, "leaked root %s refcount %d",
			  btrfs_root_name(&root->root_key, name),
			  refcount_read(&root->refs));

		/* Drain the surplus references first, then drop the last one. */
		while (refcount_read(&root->refs) > 1)
			btrfs_put_root(root);
		btrfs_put_root(root);
	}
#endif
}
1659
1660 static void free_global_roots(struct btrfs_fs_info *fs_info)
1661 {
1662         struct btrfs_root *root;
1663         struct rb_node *node;
1664
1665         while ((node = rb_first_postorder(&fs_info->global_root_tree)) != NULL) {
1666                 root = rb_entry(node, struct btrfs_root, rb_node);
1667                 rb_erase(&root->rb_node, &fs_info->global_root_tree);
1668                 btrfs_put_root(root);
1669         }
1670 }
1671
1672 void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
1673 {
1674         percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
1675         percpu_counter_destroy(&fs_info->delalloc_bytes);
1676         percpu_counter_destroy(&fs_info->ordered_bytes);
1677         percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
1678         btrfs_free_csum_hash(fs_info);
1679         btrfs_free_stripe_hash_table(fs_info);
1680         btrfs_free_ref_cache(fs_info);
1681         kfree(fs_info->balance_ctl);
1682         kfree(fs_info->delayed_root);
1683         free_global_roots(fs_info);
1684         btrfs_put_root(fs_info->tree_root);
1685         btrfs_put_root(fs_info->chunk_root);
1686         btrfs_put_root(fs_info->dev_root);
1687         btrfs_put_root(fs_info->quota_root);
1688         btrfs_put_root(fs_info->uuid_root);
1689         btrfs_put_root(fs_info->fs_root);
1690         btrfs_put_root(fs_info->data_reloc_root);
1691         btrfs_put_root(fs_info->block_group_root);
1692         btrfs_check_leaked_roots(fs_info);
1693         btrfs_extent_buffer_leak_debug_check(fs_info);
1694         kfree(fs_info->super_copy);
1695         kfree(fs_info->super_for_commit);
1696         kfree(fs_info->subpage_info);
1697         kvfree(fs_info);
1698 }
1699
1700
1701 /*
1702  * Get an in-memory reference of a root structure.
1703  *
1704  * For essential trees like root/extent tree, we grab it from fs_info directly.
1705  * For subvolume trees, we check the cached filesystem roots first. If not
1706  * found, then read it from disk and add it to cached fs roots.
1707  *
1708  * Caller should release the root by calling btrfs_put_root() after the usage.
1709  *
1710  * NOTE: Reloc and log trees can't be read by this function as they share the
1711  *       same root objectid.
1712  *
1713  * @objectid:   root id
1714  * @anon_dev:   preallocated anonymous block device number for new roots,
1715  *              pass 0 for new allocation.
1716  * @check_ref:  whether to check root item references, If true, return -ENOENT
1717  *              for orphan roots
1718  */
1719 static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
1720                                              u64 objectid, dev_t anon_dev,
1721                                              bool check_ref)
1722 {
1723         struct btrfs_root *root;
1724         struct btrfs_path *path;
1725         struct btrfs_key key;
1726         int ret;
1727
1728         root = btrfs_get_global_root(fs_info, objectid);
1729         if (root)
1730                 return root;
1731 again:
1732         root = btrfs_lookup_fs_root(fs_info, objectid);
1733         if (root) {
1734                 /* Shouldn't get preallocated anon_dev for cached roots */
1735                 ASSERT(!anon_dev);
1736                 if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
1737                         btrfs_put_root(root);
1738                         return ERR_PTR(-ENOENT);
1739                 }
1740                 return root;
1741         }
1742
1743         key.objectid = objectid;
1744         key.type = BTRFS_ROOT_ITEM_KEY;
1745         key.offset = (u64)-1;
1746         root = btrfs_read_tree_root(fs_info->tree_root, &key);
1747         if (IS_ERR(root))
1748                 return root;
1749
1750         if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
1751                 ret = -ENOENT;
1752                 goto fail;
1753         }
1754
1755         ret = btrfs_init_fs_root(root, anon_dev);
1756         if (ret)
1757                 goto fail;
1758
1759         path = btrfs_alloc_path();
1760         if (!path) {
1761                 ret = -ENOMEM;
1762                 goto fail;
1763         }
1764         key.objectid = BTRFS_ORPHAN_OBJECTID;
1765         key.type = BTRFS_ORPHAN_ITEM_KEY;
1766         key.offset = objectid;
1767
1768         ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
1769         btrfs_free_path(path);
1770         if (ret < 0)
1771                 goto fail;
1772         if (ret == 0)
1773                 set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
1774
1775         ret = btrfs_insert_fs_root(fs_info, root);
1776         if (ret) {
1777                 if (ret == -EEXIST) {
1778                         btrfs_put_root(root);
1779                         goto again;
1780                 }
1781                 goto fail;
1782         }
1783         return root;
1784 fail:
1785         /*
1786          * If our caller provided us an anonymous device, then it's his
1787          * responsibility to free it in case we fail. So we have to set our
1788          * root's anon_dev to 0 to avoid a double free, once by btrfs_put_root()
1789          * and once again by our caller.
1790          */
1791         if (anon_dev)
1792                 root->anon_dev = 0;
1793         btrfs_put_root(root);
1794         return ERR_PTR(ret);
1795 }
1796
1797 /*
1798  * Get in-memory reference of a root structure
1799  *
1800  * @objectid:   tree objectid
1801  * @check_ref:  if set, verify that the tree exists and the item has at least
1802  *              one reference
1803  */
1804 struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
1805                                      u64 objectid, bool check_ref)
1806 {
1807         return btrfs_get_root_ref(fs_info, objectid, 0, check_ref);
1808 }
1809
1810 /*
1811  * Get in-memory reference of a root structure, created as new, optionally pass
1812  * the anonymous block device id
1813  *
1814  * @objectid:   tree objectid
1815  * @anon_dev:   if zero, allocate a new anonymous block device or use the
1816  *              parameter value
1817  */
1818 struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
1819                                          u64 objectid, dev_t anon_dev)
1820 {
1821         return btrfs_get_root_ref(fs_info, objectid, anon_dev, true);
1822 }
1823
1824 /*
1825  * btrfs_get_fs_root_commit_root - return a root for the given objectid
1826  * @fs_info:    the fs_info
1827  * @objectid:   the objectid we need to lookup
1828  *
1829  * This is exclusively used for backref walking, and exists specifically because
1830  * of how qgroups does lookups.  Qgroups will do a backref lookup at delayed ref
1831  * creation time, which means we may have to read the tree_root in order to look
1832  * up a fs root that is not in memory.  If the root is not in memory we will
1833  * read the tree root commit root and look up the fs root from there.  This is a
1834  * temporary root, it will not be inserted into the radix tree as it doesn't
1835  * have the most uptodate information, it'll simply be discarded once the
1836  * backref code is finished using the root.
1837  */
1838 struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
1839                                                  struct btrfs_path *path,
1840                                                  u64 objectid)
1841 {
1842         struct btrfs_root *root;
1843         struct btrfs_key key;
1844
1845         ASSERT(path->search_commit_root && path->skip_locking);
1846
1847         /*
1848          * This can return -ENOENT if we ask for a root that doesn't exist, but
1849          * since this is called via the backref walking code we won't be looking
1850          * up a root that doesn't exist, unless there's corruption.  So if root
1851          * != NULL just return it.
1852          */
1853         root = btrfs_get_global_root(fs_info, objectid);
1854         if (root)
1855                 return root;
1856
1857         root = btrfs_lookup_fs_root(fs_info, objectid);
1858         if (root)
1859                 return root;
1860
1861         key.objectid = objectid;
1862         key.type = BTRFS_ROOT_ITEM_KEY;
1863         key.offset = (u64)-1;
1864         root = read_tree_root_path(fs_info->tree_root, path, &key);
1865         btrfs_release_path(path);
1866
1867         return root;
1868 }
1869
1870 static int cleaner_kthread(void *arg)
1871 {
1872         struct btrfs_fs_info *fs_info = arg;
1873         int again;
1874
1875         while (1) {
1876                 again = 0;
1877
1878                 set_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
1879
1880                 /* Make the cleaner go to sleep early. */
1881                 if (btrfs_need_cleaner_sleep(fs_info))
1882                         goto sleep;
1883
1884                 /*
1885                  * Do not do anything if we might cause open_ctree() to block
1886                  * before we have finished mounting the filesystem.
1887                  */
1888                 if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
1889                         goto sleep;
1890
1891                 if (!mutex_trylock(&fs_info->cleaner_mutex))
1892                         goto sleep;
1893
1894                 /*
1895                  * Avoid the problem that we change the status of the fs
1896                  * during the above check and trylock.
1897                  */
1898                 if (btrfs_need_cleaner_sleep(fs_info)) {
1899                         mutex_unlock(&fs_info->cleaner_mutex);
1900                         goto sleep;
1901                 }
1902
1903                 btrfs_run_delayed_iputs(fs_info);
1904
1905                 again = btrfs_clean_one_deleted_snapshot(fs_info);
1906                 mutex_unlock(&fs_info->cleaner_mutex);
1907
1908                 /*
1909                  * The defragger has dealt with the R/O remount and umount,
1910                  * needn't do anything special here.
1911                  */
1912                 btrfs_run_defrag_inodes(fs_info);
1913
1914                 /*
1915                  * Acquires fs_info->reclaim_bgs_lock to avoid racing
1916                  * with relocation (btrfs_relocate_chunk) and relocation
1917                  * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group)
1918                  * after acquiring fs_info->reclaim_bgs_lock. So we
1919                  * can't hold, nor need to, fs_info->cleaner_mutex when deleting
1920                  * unused block groups.
1921                  */
1922                 btrfs_delete_unused_bgs(fs_info);
1923
1924                 /*
1925                  * Reclaim block groups in the reclaim_bgs list after we deleted
1926                  * all unused block_groups. This possibly gives us some more free
1927                  * space.
1928                  */
1929                 btrfs_reclaim_bgs(fs_info);
1930 sleep:
1931                 clear_and_wake_up_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
1932                 if (kthread_should_park())
1933                         kthread_parkme();
1934                 if (kthread_should_stop())
1935                         return 0;
1936                 if (!again) {
1937                         set_current_state(TASK_INTERRUPTIBLE);
1938                         schedule();
1939                         __set_current_state(TASK_RUNNING);
1940                 }
1941         }
1942 }
1943
1944 static int transaction_kthread(void *arg)
1945 {
1946         struct btrfs_root *root = arg;
1947         struct btrfs_fs_info *fs_info = root->fs_info;
1948         struct btrfs_trans_handle *trans;
1949         struct btrfs_transaction *cur;
1950         u64 transid;
1951         time64_t delta;
1952         unsigned long delay;
1953         bool cannot_commit;
1954
1955         do {
1956                 cannot_commit = false;
1957                 delay = msecs_to_jiffies(fs_info->commit_interval * 1000);
1958                 mutex_lock(&fs_info->transaction_kthread_mutex);
1959
1960                 spin_lock(&fs_info->trans_lock);
1961                 cur = fs_info->running_transaction;
1962                 if (!cur) {
1963                         spin_unlock(&fs_info->trans_lock);
1964                         goto sleep;
1965                 }
1966
1967                 delta = ktime_get_seconds() - cur->start_time;
1968                 if (!test_and_clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags) &&
1969                     cur->state < TRANS_STATE_COMMIT_START &&
1970                     delta < fs_info->commit_interval) {
1971                         spin_unlock(&fs_info->trans_lock);
1972                         delay -= msecs_to_jiffies((delta - 1) * 1000);
1973                         delay = min(delay,
1974                                     msecs_to_jiffies(fs_info->commit_interval * 1000));
1975                         goto sleep;
1976                 }
1977                 transid = cur->transid;
1978                 spin_unlock(&fs_info->trans_lock);
1979
1980                 /* If the file system is aborted, this will always fail. */
1981                 trans = btrfs_attach_transaction(root);
1982                 if (IS_ERR(trans)) {
1983                         if (PTR_ERR(trans) != -ENOENT)
1984                                 cannot_commit = true;
1985                         goto sleep;
1986                 }
1987                 if (transid == trans->transid) {
1988                         btrfs_commit_transaction(trans);
1989                 } else {
1990                         btrfs_end_transaction(trans);
1991                 }
1992 sleep:
1993                 wake_up_process(fs_info->cleaner_kthread);
1994                 mutex_unlock(&fs_info->transaction_kthread_mutex);
1995
1996                 if (BTRFS_FS_ERROR(fs_info))
1997                         btrfs_cleanup_transaction(fs_info);
1998                 if (!kthread_should_stop() &&
1999                                 (!btrfs_transaction_blocked(fs_info) ||
2000                                  cannot_commit))
2001                         schedule_timeout_interruptible(delay);
2002         } while (!kthread_should_stop());
2003         return 0;
2004 }
2005
2006 /*
2007  * This will find the highest generation in the array of root backups.  The
2008  * index of the highest array is returned, or -EINVAL if we can't find
2009  * anything.
2010  *
2011  * We check to make sure the array is valid by comparing the
2012  * generation of the latest  root in the array with the generation
2013  * in the super block.  If they don't match we pitch it.
2014  */
2015 static int find_newest_super_backup(struct btrfs_fs_info *info)
2016 {
2017         const u64 newest_gen = btrfs_super_generation(info->super_copy);
2018         u64 cur;
2019         struct btrfs_root_backup *root_backup;
2020         int i;
2021
2022         for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
2023                 root_backup = info->super_copy->super_roots + i;
2024                 cur = btrfs_backup_tree_root_gen(root_backup);
2025                 if (cur == newest_gen)
2026                         return i;
2027         }
2028
2029         return -EINVAL;
2030 }
2031
2032 /*
2033  * copy all the root pointers into the super backup array.
2034  * this will bump the backup pointer by one when it is
2035  * done
2036  */
2037 static void backup_super_roots(struct btrfs_fs_info *info)
2038 {
2039         const int next_backup = info->backup_root_index;
2040         struct btrfs_root_backup *root_backup;
2041
2042         root_backup = info->super_for_commit->super_roots + next_backup;
2043
2044         /*
2045          * make sure all of our padding and empty slots get zero filled
2046          * regardless of which ones we use today
2047          */
2048         memset(root_backup, 0, sizeof(*root_backup));
2049
2050         info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
2051
2052         btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
2053         btrfs_set_backup_tree_root_gen(root_backup,
2054                                btrfs_header_generation(info->tree_root->node));
2055
2056         btrfs_set_backup_tree_root_level(root_backup,
2057                                btrfs_header_level(info->tree_root->node));
2058
2059         btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
2060         btrfs_set_backup_chunk_root_gen(root_backup,
2061                                btrfs_header_generation(info->chunk_root->node));
2062         btrfs_set_backup_chunk_root_level(root_backup,
2063                                btrfs_header_level(info->chunk_root->node));
2064
2065         if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) {
2066                 btrfs_set_backup_block_group_root(root_backup,
2067                                         info->block_group_root->node->start);
2068                 btrfs_set_backup_block_group_root_gen(root_backup,
2069                         btrfs_header_generation(info->block_group_root->node));
2070                 btrfs_set_backup_block_group_root_level(root_backup,
2071                         btrfs_header_level(info->block_group_root->node));
2072         } else {
2073                 struct btrfs_root *extent_root = btrfs_extent_root(info, 0);
2074                 struct btrfs_root *csum_root = btrfs_csum_root(info, 0);
2075
2076                 btrfs_set_backup_extent_root(root_backup,
2077                                              extent_root->node->start);
2078                 btrfs_set_backup_extent_root_gen(root_backup,
2079                                 btrfs_header_generation(extent_root->node));
2080                 btrfs_set_backup_extent_root_level(root_backup,
2081                                         btrfs_header_level(extent_root->node));
2082
2083                 btrfs_set_backup_csum_root(root_backup, csum_root->node->start);
2084                 btrfs_set_backup_csum_root_gen(root_backup,
2085                                                btrfs_header_generation(csum_root->node));
2086                 btrfs_set_backup_csum_root_level(root_backup,
2087                                                  btrfs_header_level(csum_root->node));
2088         }
2089
2090         /*
2091          * we might commit during log recovery, which happens before we set
2092          * the fs_root.  Make sure it is valid before we fill it in.
2093          */
2094         if (info->fs_root && info->fs_root->node) {
2095                 btrfs_set_backup_fs_root(root_backup,
2096                                          info->fs_root->node->start);
2097                 btrfs_set_backup_fs_root_gen(root_backup,
2098                                btrfs_header_generation(info->fs_root->node));
2099                 btrfs_set_backup_fs_root_level(root_backup,
2100                                btrfs_header_level(info->fs_root->node));
2101         }
2102
2103         btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
2104         btrfs_set_backup_dev_root_gen(root_backup,
2105                                btrfs_header_generation(info->dev_root->node));
2106         btrfs_set_backup_dev_root_level(root_backup,
2107                                        btrfs_header_level(info->dev_root->node));
2108
2109         btrfs_set_backup_total_bytes(root_backup,
2110                              btrfs_super_total_bytes(info->super_copy));
2111         btrfs_set_backup_bytes_used(root_backup,
2112                              btrfs_super_bytes_used(info->super_copy));
2113         btrfs_set_backup_num_devices(root_backup,
2114                              btrfs_super_num_devices(info->super_copy));
2115
2116         /*
2117          * if we don't copy this out to the super_copy, it won't get remembered
2118          * for the next commit
2119          */
2120         memcpy(&info->super_copy->super_roots,
2121                &info->super_for_commit->super_roots,
2122                sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
2123 }
2124
2125 /*
2126  * read_backup_root - Reads a backup root based on the passed priority. Prio 0
2127  * is the newest, prio 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots
2128  *
2129  * fs_info - filesystem whose backup roots need to be read
2130  * priority - priority of backup root required
2131  *
2132  * Returns backup root index on success and -EINVAL otherwise.
2133  */
2134 static int read_backup_root(struct btrfs_fs_info *fs_info, u8 priority)
2135 {
2136         int backup_index = find_newest_super_backup(fs_info);
2137         struct btrfs_super_block *super = fs_info->super_copy;
2138         struct btrfs_root_backup *root_backup;
2139
2140         if (priority < BTRFS_NUM_BACKUP_ROOTS && backup_index >= 0) {
2141                 if (priority == 0)
2142                         return backup_index;
2143
2144                 backup_index = backup_index + BTRFS_NUM_BACKUP_ROOTS - priority;
2145                 backup_index %= BTRFS_NUM_BACKUP_ROOTS;
2146         } else {
2147                 return -EINVAL;
2148         }
2149
2150         root_backup = super->super_roots + backup_index;
2151
2152         btrfs_set_super_generation(super,
2153                                    btrfs_backup_tree_root_gen(root_backup));
2154         btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
2155         btrfs_set_super_root_level(super,
2156                                    btrfs_backup_tree_root_level(root_backup));
2157         btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
2158
2159         /*
2160          * Fixme: the total bytes and num_devices need to match or we should
2161          * need a fsck
2162          */
2163         btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
2164         btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
2165
2166         return backup_index;
2167 }
2168
2169 /* helper to cleanup workers */
2170 static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
2171 {
2172         btrfs_destroy_workqueue(fs_info->fixup_workers);
2173         btrfs_destroy_workqueue(fs_info->delalloc_workers);
2174         btrfs_destroy_workqueue(fs_info->hipri_workers);
2175         btrfs_destroy_workqueue(fs_info->workers);
2176         if (fs_info->endio_workers)
2177                 destroy_workqueue(fs_info->endio_workers);
2178         if (fs_info->endio_raid56_workers)
2179                 destroy_workqueue(fs_info->endio_raid56_workers);
2180         if (fs_info->rmw_workers)
2181                 destroy_workqueue(fs_info->rmw_workers);
2182         if (fs_info->compressed_write_workers)
2183                 destroy_workqueue(fs_info->compressed_write_workers);
2184         btrfs_destroy_workqueue(fs_info->endio_write_workers);
2185         btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
2186         btrfs_destroy_workqueue(fs_info->delayed_workers);
2187         btrfs_destroy_workqueue(fs_info->caching_workers);
2188         btrfs_destroy_workqueue(fs_info->flush_workers);
2189         btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
2190         if (fs_info->discard_ctl.discard_workers)
2191                 destroy_workqueue(fs_info->discard_ctl.discard_workers);
2192         /*
2193          * Now that all other work queues are destroyed, we can safely destroy
2194          * the queues used for metadata I/O, since tasks from those other work
2195          * queues can do metadata I/O operations.
2196          */
2197         if (fs_info->endio_meta_workers)
2198                 destroy_workqueue(fs_info->endio_meta_workers);
2199 }
2200
2201 static void free_root_extent_buffers(struct btrfs_root *root)
2202 {
2203         if (root) {
2204                 free_extent_buffer(root->node);
2205                 free_extent_buffer(root->commit_root);
2206                 root->node = NULL;
2207                 root->commit_root = NULL;
2208         }
2209 }
2210
2211 static void free_global_root_pointers(struct btrfs_fs_info *fs_info)
2212 {
2213         struct btrfs_root *root, *tmp;
2214
2215         rbtree_postorder_for_each_entry_safe(root, tmp,
2216                                              &fs_info->global_root_tree,
2217                                              rb_node)
2218                 free_root_extent_buffers(root);
2219 }
2220
2221 /* helper to cleanup tree roots */
2222 static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
2223 {
2224         free_root_extent_buffers(info->tree_root);
2225
2226         free_global_root_pointers(info);
2227         free_root_extent_buffers(info->dev_root);
2228         free_root_extent_buffers(info->quota_root);
2229         free_root_extent_buffers(info->uuid_root);
2230         free_root_extent_buffers(info->fs_root);
2231         free_root_extent_buffers(info->data_reloc_root);
2232         free_root_extent_buffers(info->block_group_root);
2233         if (free_chunk_root)
2234                 free_root_extent_buffers(info->chunk_root);
2235 }
2236
2237 void btrfs_put_root(struct btrfs_root *root)
2238 {
2239         if (!root)
2240                 return;
2241
2242         if (refcount_dec_and_test(&root->refs)) {
2243                 WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
2244                 WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state));
2245                 if (root->anon_dev)
2246                         free_anon_bdev(root->anon_dev);
2247                 btrfs_drew_lock_destroy(&root->snapshot_lock);
2248                 free_root_extent_buffers(root);
2249 #ifdef CONFIG_BTRFS_DEBUG
2250                 spin_lock(&root->fs_info->fs_roots_radix_lock);
2251                 list_del_init(&root->leak_list);
2252                 spin_unlock(&root->fs_info->fs_roots_radix_lock);
2253 #endif
2254                 kfree(root);
2255         }
2256 }
2257
2258 void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
2259 {
2260         int ret;
2261         struct btrfs_root *gang[8];
2262         int i;
2263
2264         while (!list_empty(&fs_info->dead_roots)) {
2265                 gang[0] = list_entry(fs_info->dead_roots.next,
2266                                      struct btrfs_root, root_list);
2267                 list_del(&gang[0]->root_list);
2268
2269                 if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state))
2270                         btrfs_drop_and_free_fs_root(fs_info, gang[0]);
2271                 btrfs_put_root(gang[0]);
2272         }
2273
2274         while (1) {
2275                 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
2276                                              (void **)gang, 0,
2277                                              ARRAY_SIZE(gang));
2278                 if (!ret)
2279                         break;
2280                 for (i = 0; i < ret; i++)
2281                         btrfs_drop_and_free_fs_root(fs_info, gang[i]);
2282         }
2283 }
2284
2285 static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
2286 {
2287         mutex_init(&fs_info->scrub_lock);
2288         atomic_set(&fs_info->scrubs_running, 0);
2289         atomic_set(&fs_info->scrub_pause_req, 0);
2290         atomic_set(&fs_info->scrubs_paused, 0);
2291         atomic_set(&fs_info->scrub_cancel_req, 0);
2292         init_waitqueue_head(&fs_info->scrub_pause_wait);
2293         refcount_set(&fs_info->scrub_workers_refcnt, 0);
2294 }
2295
2296 static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
2297 {
2298         spin_lock_init(&fs_info->balance_lock);
2299         mutex_init(&fs_info->balance_mutex);
2300         atomic_set(&fs_info->balance_pause_req, 0);
2301         atomic_set(&fs_info->balance_cancel_req, 0);
2302         fs_info->balance_ctl = NULL;
2303         init_waitqueue_head(&fs_info->balance_wait_q);
2304         atomic_set(&fs_info->reloc_cancel_req, 0);
2305 }
2306
2307 static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
2308 {
2309         struct inode *inode = fs_info->btree_inode;
2310
2311         inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
2312         set_nlink(inode, 1);
2313         /*
2314          * we set the i_size on the btree inode to the max possible int.
2315          * the real end of the address space is determined by all of
2316          * the devices in the system
2317          */
2318         inode->i_size = OFFSET_MAX;
2319         inode->i_mapping->a_ops = &btree_aops;
2320
2321         RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
2322         extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
2323                             IO_TREE_BTREE_INODE_IO, inode);
2324         BTRFS_I(inode)->io_tree.track_uptodate = false;
2325         extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
2326
2327         BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
2328         BTRFS_I(inode)->location.objectid = BTRFS_BTREE_INODE_OBJECTID;
2329         BTRFS_I(inode)->location.type = 0;
2330         BTRFS_I(inode)->location.offset = 0;
2331         set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
2332         btrfs_insert_inode_hash(inode);
2333 }
2334
2335 static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
2336 {
2337         mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
2338         init_rwsem(&fs_info->dev_replace.rwsem);
2339         init_waitqueue_head(&fs_info->dev_replace.replace_wait);
2340 }
2341
2342 static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
2343 {
2344         spin_lock_init(&fs_info->qgroup_lock);
2345         mutex_init(&fs_info->qgroup_ioctl_lock);
2346         fs_info->qgroup_tree = RB_ROOT;
2347         INIT_LIST_HEAD(&fs_info->dirty_qgroups);
2348         fs_info->qgroup_seq = 1;
2349         fs_info->qgroup_ulist = NULL;
2350         fs_info->qgroup_rescan_running = false;
2351         mutex_init(&fs_info->qgroup_rescan_lock);
2352 }
2353
2354 static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
2355 {
2356         u32 max_active = fs_info->thread_pool_size;
2357         unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
2358
2359         fs_info->workers =
2360                 btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16);
2361         fs_info->hipri_workers =
2362                 btrfs_alloc_workqueue(fs_info, "worker-high",
2363                                       flags | WQ_HIGHPRI, max_active, 16);
2364
2365         fs_info->delalloc_workers =
2366                 btrfs_alloc_workqueue(fs_info, "delalloc",
2367                                       flags, max_active, 2);
2368
2369         fs_info->flush_workers =
2370                 btrfs_alloc_workqueue(fs_info, "flush_delalloc",
2371                                       flags, max_active, 0);
2372
2373         fs_info->caching_workers =
2374                 btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0);
2375
2376         fs_info->fixup_workers =
2377                 btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0);
2378
2379         fs_info->endio_workers =
2380                 alloc_workqueue("btrfs-endio", flags, max_active);
2381         fs_info->endio_meta_workers =
2382                 alloc_workqueue("btrfs-endio-meta", flags, max_active);
2383         fs_info->endio_raid56_workers =
2384                 alloc_workqueue("btrfs-endio-raid56", flags, max_active);
2385         fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active);
2386         fs_info->endio_write_workers =
2387                 btrfs_alloc_workqueue(fs_info, "endio-write", flags,
2388                                       max_active, 2);
2389         fs_info->compressed_write_workers =
2390                 alloc_workqueue("btrfs-compressed-write", flags, max_active);
2391         fs_info->endio_freespace_worker =
2392                 btrfs_alloc_workqueue(fs_info, "freespace-write", flags,
2393                                       max_active, 0);
2394         fs_info->delayed_workers =
2395                 btrfs_alloc_workqueue(fs_info, "delayed-meta", flags,
2396                                       max_active, 0);
2397         fs_info->qgroup_rescan_workers =
2398                 btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0);
2399         fs_info->discard_ctl.discard_workers =
2400                 alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1);
2401
2402         if (!(fs_info->workers && fs_info->hipri_workers &&
2403               fs_info->delalloc_workers && fs_info->flush_workers &&
2404               fs_info->endio_workers && fs_info->endio_meta_workers &&
2405               fs_info->compressed_write_workers &&
2406               fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
2407               fs_info->endio_freespace_worker && fs_info->rmw_workers &&
2408               fs_info->caching_workers && fs_info->fixup_workers &&
2409               fs_info->delayed_workers && fs_info->qgroup_rescan_workers &&
2410               fs_info->discard_ctl.discard_workers)) {
2411                 return -ENOMEM;
2412         }
2413
2414         return 0;
2415 }
2416
2417 static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
2418 {
2419         struct crypto_shash *csum_shash;
2420         const char *csum_driver = btrfs_super_csum_driver(csum_type);
2421
2422         csum_shash = crypto_alloc_shash(csum_driver, 0, 0);
2423
2424         if (IS_ERR(csum_shash)) {
2425                 btrfs_err(fs_info, "error allocating %s hash for checksum",
2426                           csum_driver);
2427                 return PTR_ERR(csum_shash);
2428         }
2429
2430         fs_info->csum_shash = csum_shash;
2431
2432         btrfs_info(fs_info, "using %s (%s) checksum algorithm",
2433                         btrfs_super_csum_name(csum_type),
2434                         crypto_shash_driver_name(csum_shash));
2435         return 0;
2436 }
2437
2438 static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
2439                             struct btrfs_fs_devices *fs_devices)
2440 {
2441         int ret;
2442         struct btrfs_root *log_tree_root;
2443         struct btrfs_super_block *disk_super = fs_info->super_copy;
2444         u64 bytenr = btrfs_super_log_root(disk_super);
2445         int level = btrfs_super_log_root_level(disk_super);
2446
2447         if (fs_devices->rw_devices == 0) {
2448                 btrfs_warn(fs_info, "log replay required on RO media");
2449                 return -EIO;
2450         }
2451
2452         log_tree_root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID,
2453                                          GFP_KERNEL);
2454         if (!log_tree_root)
2455                 return -ENOMEM;
2456
2457         log_tree_root->node = read_tree_block(fs_info, bytenr,
2458                                               BTRFS_TREE_LOG_OBJECTID,
2459                                               fs_info->generation + 1, level,
2460                                               NULL);
2461         if (IS_ERR(log_tree_root->node)) {
2462                 btrfs_warn(fs_info, "failed to read log tree");
2463                 ret = PTR_ERR(log_tree_root->node);
2464                 log_tree_root->node = NULL;
2465                 btrfs_put_root(log_tree_root);
2466                 return ret;
2467         }
2468         if (!extent_buffer_uptodate(log_tree_root->node)) {
2469                 btrfs_err(fs_info, "failed to read log tree");
2470                 btrfs_put_root(log_tree_root);
2471                 return -EIO;
2472         }
2473
2474         /* returns with log_tree_root freed on success */
2475         ret = btrfs_recover_log_trees(log_tree_root);
2476         if (ret) {
2477                 btrfs_handle_fs_error(fs_info, ret,
2478                                       "Failed to recover log tree");
2479                 btrfs_put_root(log_tree_root);
2480                 return ret;
2481         }
2482
2483         if (sb_rdonly(fs_info->sb)) {
2484                 ret = btrfs_commit_super(fs_info);
2485                 if (ret)
2486                         return ret;
2487         }
2488
2489         return 0;
2490 }
2491
2492 static int load_global_roots_objectid(struct btrfs_root *tree_root,
2493                                       struct btrfs_path *path, u64 objectid,
2494                                       const char *name)
2495 {
2496         struct btrfs_fs_info *fs_info = tree_root->fs_info;
2497         struct btrfs_root *root;
2498         u64 max_global_id = 0;
2499         int ret;
2500         struct btrfs_key key = {
2501                 .objectid = objectid,
2502                 .type = BTRFS_ROOT_ITEM_KEY,
2503                 .offset = 0,
2504         };
2505         bool found = false;
2506
2507         /* If we have IGNOREDATACSUMS skip loading these roots. */
2508         if (objectid == BTRFS_CSUM_TREE_OBJECTID &&
2509             btrfs_test_opt(fs_info, IGNOREDATACSUMS)) {
2510                 set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
2511                 return 0;
2512         }
2513
2514         while (1) {
2515                 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
2516                 if (ret < 0)
2517                         break;
2518
2519                 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
2520                         ret = btrfs_next_leaf(tree_root, path);
2521                         if (ret) {
2522                                 if (ret > 0)
2523                                         ret = 0;
2524                                 break;
2525                         }
2526                 }
2527                 ret = 0;
2528
2529                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2530                 if (key.objectid != objectid)
2531                         break;
2532                 btrfs_release_path(path);
2533
2534                 /*
2535                  * Just worry about this for extent tree, it'll be the same for
2536                  * everybody.
2537                  */
2538                 if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
2539                         max_global_id = max(max_global_id, key.offset);
2540
2541                 found = true;
2542                 root = read_tree_root_path(tree_root, path, &key);
2543                 if (IS_ERR(root)) {
2544                         if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
2545                                 ret = PTR_ERR(root);
2546                         break;
2547                 }
2548                 set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2549                 ret = btrfs_global_root_insert(root);
2550                 if (ret) {
2551                         btrfs_put_root(root);
2552                         break;
2553                 }
2554                 key.offset++;
2555         }
2556         btrfs_release_path(path);
2557
2558         if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
2559                 fs_info->nr_global_roots = max_global_id + 1;
2560
2561         if (!found || ret) {
2562                 if (objectid == BTRFS_CSUM_TREE_OBJECTID)
2563                         set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
2564
2565                 if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
2566                         ret = ret ? ret : -ENOENT;
2567                 else
2568                         ret = 0;
2569                 btrfs_err(fs_info, "failed to load root %s", name);
2570         }
2571         return ret;
2572 }
2573
2574 static int load_global_roots(struct btrfs_root *tree_root)
2575 {
2576         struct btrfs_path *path;
2577         int ret = 0;
2578
2579         path = btrfs_alloc_path();
2580         if (!path)
2581                 return -ENOMEM;
2582
2583         ret = load_global_roots_objectid(tree_root, path,
2584                                          BTRFS_EXTENT_TREE_OBJECTID, "extent");
2585         if (ret)
2586                 goto out;
2587         ret = load_global_roots_objectid(tree_root, path,
2588                                          BTRFS_CSUM_TREE_OBJECTID, "csum");
2589         if (ret)
2590                 goto out;
2591         if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE))
2592                 goto out;
2593         ret = load_global_roots_objectid(tree_root, path,
2594                                          BTRFS_FREE_SPACE_TREE_OBJECTID,
2595                                          "free space");
2596 out:
2597         btrfs_free_path(path);
2598         return ret;
2599 }
2600
2601 static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
2602 {
2603         struct btrfs_root *tree_root = fs_info->tree_root;
2604         struct btrfs_root *root;
2605         struct btrfs_key location;
2606         int ret;
2607
2608         BUG_ON(!fs_info->tree_root);
2609
2610         ret = load_global_roots(tree_root);
2611         if (ret)
2612                 return ret;
2613
2614         location.objectid = BTRFS_DEV_TREE_OBJECTID;
2615         location.type = BTRFS_ROOT_ITEM_KEY;
2616         location.offset = 0;
2617
2618         root = btrfs_read_tree_root(tree_root, &location);
2619         if (IS_ERR(root)) {
2620                 if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2621                         ret = PTR_ERR(root);
2622                         goto out;
2623                 }
2624         } else {
2625                 set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2626                 fs_info->dev_root = root;
2627         }
2628         /* Initialize fs_info for all devices in any case */
2629         btrfs_init_devices_late(fs_info);
2630
2631         /*
2632          * This tree can share blocks with some other fs tree during relocation
2633          * and we need a proper setup by btrfs_get_fs_root
2634          */
2635         root = btrfs_get_fs_root(tree_root->fs_info,
2636                                  BTRFS_DATA_RELOC_TREE_OBJECTID, true);
2637         if (IS_ERR(root)) {
2638                 if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2639                         ret = PTR_ERR(root);
2640                         goto out;
2641                 }
2642         } else {
2643                 set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2644                 fs_info->data_reloc_root = root;
2645         }
2646
2647         location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
2648         root = btrfs_read_tree_root(tree_root, &location);
2649         if (!IS_ERR(root)) {
2650                 set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2651                 set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
2652                 fs_info->quota_root = root;
2653         }
2654
2655         location.objectid = BTRFS_UUID_TREE_OBJECTID;
2656         root = btrfs_read_tree_root(tree_root, &location);
2657         if (IS_ERR(root)) {
2658                 if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2659                         ret = PTR_ERR(root);
2660                         if (ret != -ENOENT)
2661                                 goto out;
2662                 }
2663         } else {
2664                 set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2665                 fs_info->uuid_root = root;
2666         }
2667
2668         return 0;
2669 out:
2670         btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d",
2671                    location.objectid, ret);
2672         return ret;
2673 }
2674
2675 /*
2676  * Real super block validation
2677  * NOTE: super csum type and incompat features will not be checked here.
2678  *
2679  * @sb:         super block to check
2680  * @mirror_num: the super block number to check its bytenr:
2681  *              0       the primary (1st) sb
2682  *              1, 2    2nd and 3rd backup copy
2683  *             -1       skip bytenr check
2684  */
2685 static int validate_super(struct btrfs_fs_info *fs_info,
2686                             struct btrfs_super_block *sb, int mirror_num)
2687 {
2688         u64 nodesize = btrfs_super_nodesize(sb);
2689         u64 sectorsize = btrfs_super_sectorsize(sb);
2690         int ret = 0;
2691
2692         if (btrfs_super_magic(sb) != BTRFS_MAGIC) {
2693                 btrfs_err(fs_info, "no valid FS found");
2694                 ret = -EINVAL;
2695         }
2696         if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) {
2697                 btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu",
2698                                 btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
2699                 ret = -EINVAL;
2700         }
2701         if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
2702                 btrfs_err(fs_info, "tree_root level too big: %d >= %d",
2703                                 btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
2704                 ret = -EINVAL;
2705         }
2706         if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) {
2707                 btrfs_err(fs_info, "chunk_root level too big: %d >= %d",
2708                                 btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL);
2709                 ret = -EINVAL;
2710         }
2711         if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) {
2712                 btrfs_err(fs_info, "log_root level too big: %d >= %d",
2713                                 btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL);
2714                 ret = -EINVAL;
2715         }
2716
2717         /*
2718          * Check sectorsize and nodesize first, other check will need it.
2719          * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here.
2720          */
2721         if (!is_power_of_2(sectorsize) || sectorsize < 4096 ||
2722             sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) {
2723                 btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
2724                 ret = -EINVAL;
2725         }
2726
2727         /*
2728          * We only support at most two sectorsizes: 4K and PAGE_SIZE.
2729          *
2730          * We can support 16K sectorsize with 64K page size without problem,
2731          * but such sectorsize/pagesize combination doesn't make much sense.
2732          * 4K will be our future standard, PAGE_SIZE is supported from the very
2733          * beginning.
2734          */
2735         if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && sectorsize != PAGE_SIZE)) {
2736                 btrfs_err(fs_info,
2737                         "sectorsize %llu not yet supported for page size %lu",
2738                         sectorsize, PAGE_SIZE);
2739                 ret = -EINVAL;
2740         }
2741
2742         if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
2743             nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
2744                 btrfs_err(fs_info, "invalid nodesize %llu", nodesize);
2745                 ret = -EINVAL;
2746         }
2747         if (nodesize != le32_to_cpu(sb->__unused_leafsize)) {
2748                 btrfs_err(fs_info, "invalid leafsize %u, should be %llu",
2749                           le32_to_cpu(sb->__unused_leafsize), nodesize);
2750                 ret = -EINVAL;
2751         }
2752
2753         /* Root alignment check */
2754         if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) {
2755                 btrfs_warn(fs_info, "tree_root block unaligned: %llu",
2756                            btrfs_super_root(sb));
2757                 ret = -EINVAL;
2758         }
2759         if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) {
2760                 btrfs_warn(fs_info, "chunk_root block unaligned: %llu",
2761                            btrfs_super_chunk_root(sb));
2762                 ret = -EINVAL;
2763         }
2764         if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) {
2765                 btrfs_warn(fs_info, "log_root block unaligned: %llu",
2766                            btrfs_super_log_root(sb));
2767                 ret = -EINVAL;
2768         }
2769
2770         if (memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid,
2771                    BTRFS_FSID_SIZE)) {
2772                 btrfs_err(fs_info,
2773                 "superblock fsid doesn't match fsid of fs_devices: %pU != %pU",
2774                         fs_info->super_copy->fsid, fs_info->fs_devices->fsid);
2775                 ret = -EINVAL;
2776         }
2777
2778         if (btrfs_fs_incompat(fs_info, METADATA_UUID) &&
2779             memcmp(fs_info->fs_devices->metadata_uuid,
2780                    fs_info->super_copy->metadata_uuid, BTRFS_FSID_SIZE)) {
2781                 btrfs_err(fs_info,
2782 "superblock metadata_uuid doesn't match metadata uuid of fs_devices: %pU != %pU",
2783                         fs_info->super_copy->metadata_uuid,
2784                         fs_info->fs_devices->metadata_uuid);
2785                 ret = -EINVAL;
2786         }
2787
2788         if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid,
2789                    BTRFS_FSID_SIZE) != 0) {
2790                 btrfs_err(fs_info,
2791                         "dev_item UUID does not match metadata fsid: %pU != %pU",
2792                         fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid);
2793                 ret = -EINVAL;
2794         }
2795
2796         /*
2797          * Hint to catch really bogus numbers, bitflips or so, more exact checks are
2798          * done later
2799          */
2800         if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) {
2801                 btrfs_err(fs_info, "bytes_used is too small %llu",
2802                           btrfs_super_bytes_used(sb));
2803                 ret = -EINVAL;
2804         }
2805         if (!is_power_of_2(btrfs_super_stripesize(sb))) {
2806                 btrfs_err(fs_info, "invalid stripesize %u",
2807                           btrfs_super_stripesize(sb));
2808                 ret = -EINVAL;
2809         }
2810         if (btrfs_super_num_devices(sb) > (1UL << 31))
2811                 btrfs_warn(fs_info, "suspicious number of devices: %llu",
2812                            btrfs_super_num_devices(sb));
2813         if (btrfs_super_num_devices(sb) == 0) {
2814                 btrfs_err(fs_info, "number of devices is 0");
2815                 ret = -EINVAL;
2816         }
2817
2818         if (mirror_num >= 0 &&
2819             btrfs_super_bytenr(sb) != btrfs_sb_offset(mirror_num)) {
2820                 btrfs_err(fs_info, "super offset mismatch %llu != %u",
2821                           btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET);
2822                 ret = -EINVAL;
2823         }
2824
2825         /*
2826          * Obvious sys_chunk_array corruptions, it must hold at least one key
2827          * and one chunk
2828          */
2829         if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
2830                 btrfs_err(fs_info, "system chunk array too big %u > %u",
2831                           btrfs_super_sys_array_size(sb),
2832                           BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
2833                 ret = -EINVAL;
2834         }
2835         if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
2836                         + sizeof(struct btrfs_chunk)) {
2837                 btrfs_err(fs_info, "system chunk array too small %u < %zu",
2838                           btrfs_super_sys_array_size(sb),
2839                           sizeof(struct btrfs_disk_key)
2840                           + sizeof(struct btrfs_chunk));
2841                 ret = -EINVAL;
2842         }
2843
2844         /*
2845          * The generation is a global counter, we'll trust it more than the others
2846          * but it's still possible that it's the one that's wrong.
2847          */
2848         if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb))
2849                 btrfs_warn(fs_info,
2850                         "suspicious: generation < chunk_root_generation: %llu < %llu",
2851                         btrfs_super_generation(sb),
2852                         btrfs_super_chunk_root_generation(sb));
2853         if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb)
2854             && btrfs_super_cache_generation(sb) != (u64)-1)
2855                 btrfs_warn(fs_info,
2856                         "suspicious: generation < cache_generation: %llu < %llu",
2857                         btrfs_super_generation(sb),
2858                         btrfs_super_cache_generation(sb));
2859
2860         return ret;
2861 }
2862
2863 /*
2864  * Validation of super block at mount time.
2865  * Some checks already done early at mount time, like csum type and incompat
2866  * flags will be skipped.
2867  */
2868 static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info)
2869 {
2870         return validate_super(fs_info, fs_info->super_copy, 0);
2871 }
2872
2873 /*
2874  * Validation of super block at write time.
2875  * Some checks like bytenr check will be skipped as their values will be
2876  * overwritten soon.
2877  * Extra checks like csum type and incompat flags will be done here.
2878  */
2879 static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info,
2880                                       struct btrfs_super_block *sb)
2881 {
2882         int ret;
2883
2884         ret = validate_super(fs_info, sb, -1);
2885         if (ret < 0)
2886                 goto out;
2887         if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) {
2888                 ret = -EUCLEAN;
2889                 btrfs_err(fs_info, "invalid csum type, has %u want %u",
2890                           btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32);
2891                 goto out;
2892         }
2893         if (btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
2894                 ret = -EUCLEAN;
2895                 btrfs_err(fs_info,
2896                 "invalid incompat flags, has 0x%llx valid mask 0x%llx",
2897                           btrfs_super_incompat_flags(sb),
2898                           (unsigned long long)BTRFS_FEATURE_INCOMPAT_SUPP);
2899                 goto out;
2900         }
2901 out:
2902         if (ret < 0)
2903                 btrfs_err(fs_info,
2904                 "super block corruption detected before writing it to disk");
2905         return ret;
2906 }
2907
2908 static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int level)
2909 {
2910         int ret = 0;
2911
2912         root->node = read_tree_block(root->fs_info, bytenr,
2913                                      root->root_key.objectid, gen, level, NULL);
2914         if (IS_ERR(root->node)) {
2915                 ret = PTR_ERR(root->node);
2916                 root->node = NULL;
2917                 return ret;
2918         }
2919         if (!extent_buffer_uptodate(root->node)) {
2920                 free_extent_buffer(root->node);
2921                 root->node = NULL;
2922                 return -EIO;
2923         }
2924
2925         btrfs_set_root_node(&root->root_item, root->node);
2926         root->commit_root = btrfs_root_node(root);
2927         btrfs_set_root_refs(&root->root_item, 1);
2928         return ret;
2929 }
2930
2931 static int load_important_roots(struct btrfs_fs_info *fs_info)
2932 {
2933         struct btrfs_super_block *sb = fs_info->super_copy;
2934         u64 gen, bytenr;
2935         int level, ret;
2936
2937         bytenr = btrfs_super_root(sb);
2938         gen = btrfs_super_generation(sb);
2939         level = btrfs_super_root_level(sb);
2940         ret = load_super_root(fs_info->tree_root, bytenr, gen, level);
2941         if (ret) {
2942                 btrfs_warn(fs_info, "couldn't read tree root");
2943                 return ret;
2944         }
2945
2946         if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
2947                 return 0;
2948
2949         bytenr = btrfs_super_block_group_root(sb);
2950         gen = btrfs_super_block_group_root_generation(sb);
2951         level = btrfs_super_block_group_root_level(sb);
2952         ret = load_super_root(fs_info->block_group_root, bytenr, gen, level);
2953         if (ret)
2954                 btrfs_warn(fs_info, "couldn't read block group root");
2955         return ret;
2956 }
2957
2958 static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
2959 {
2960         int backup_index = find_newest_super_backup(fs_info);
2961         struct btrfs_super_block *sb = fs_info->super_copy;
2962         struct btrfs_root *tree_root = fs_info->tree_root;
2963         bool handle_error = false;
2964         int ret = 0;
2965         int i;
2966
2967         if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
2968                 struct btrfs_root *root;
2969
2970                 root = btrfs_alloc_root(fs_info, BTRFS_BLOCK_GROUP_TREE_OBJECTID,
2971                                         GFP_KERNEL);
2972                 if (!root)
2973                         return -ENOMEM;
2974                 fs_info->block_group_root = root;
2975         }
2976
2977         for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
2978                 if (handle_error) {
2979                         if (!IS_ERR(tree_root->node))
2980                                 free_extent_buffer(tree_root->node);
2981                         tree_root->node = NULL;
2982
2983                         if (!btrfs_test_opt(fs_info, USEBACKUPROOT))
2984                                 break;
2985
2986                         free_root_pointers(fs_info, 0);
2987
2988                         /*
2989                          * Don't use the log in recovery mode, it won't be
2990                          * valid
2991                          */
2992                         btrfs_set_super_log_root(sb, 0);
2993
2994                         /* We can't trust the free space cache either */
2995                         btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
2996
2997                         ret = read_backup_root(fs_info, i);
2998                         backup_index = ret;
2999                         if (ret < 0)
3000                                 return ret;
3001                 }
3002
3003                 ret = load_important_roots(fs_info);
3004                 if (ret) {
3005                         handle_error = true;
3006                         continue;
3007                 }
3008
3009                 /*
3010                  * No need to hold btrfs_root::objectid_mutex since the fs
3011                  * hasn't been fully initialised and we are the only user
3012                  */
3013                 ret = btrfs_init_root_free_objectid(tree_root);
3014                 if (ret < 0) {
3015                         handle_error = true;
3016                         continue;
3017                 }
3018
3019                 ASSERT(tree_root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
3020
3021                 ret = btrfs_read_roots(fs_info);
3022                 if (ret < 0) {
3023                         handle_error = true;
3024                         continue;
3025                 }
3026
3027                 /* All successful */
3028                 fs_info->generation = btrfs_header_generation(tree_root->node);
3029                 fs_info->last_trans_committed = fs_info->generation;
3030                 fs_info->last_reloc_trans = 0;
3031
3032                 /* Always begin writing backup roots after the one being used */
3033                 if (backup_index < 0) {
3034                         fs_info->backup_root_index = 0;
3035                 } else {
3036                         fs_info->backup_root_index = backup_index + 1;
3037                         fs_info->backup_root_index %= BTRFS_NUM_BACKUP_ROOTS;
3038                 }
3039                 break;
3040         }
3041
3042         return ret;
3043 }
3044
3045 void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
3046 {
3047         INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
3048         INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
3049         INIT_LIST_HEAD(&fs_info->trans_list);
3050         INIT_LIST_HEAD(&fs_info->dead_roots);
3051         INIT_LIST_HEAD(&fs_info->delayed_iputs);
3052         INIT_LIST_HEAD(&fs_info->delalloc_roots);
3053         INIT_LIST_HEAD(&fs_info->caching_block_groups);
3054         spin_lock_init(&fs_info->delalloc_root_lock);
3055         spin_lock_init(&fs_info->trans_lock);
3056         spin_lock_init(&fs_info->fs_roots_radix_lock);
3057         spin_lock_init(&fs_info->delayed_iput_lock);
3058         spin_lock_init(&fs_info->defrag_inodes_lock);
3059         spin_lock_init(&fs_info->super_lock);
3060         spin_lock_init(&fs_info->buffer_lock);
3061         spin_lock_init(&fs_info->unused_bgs_lock);
3062         spin_lock_init(&fs_info->treelog_bg_lock);
3063         spin_lock_init(&fs_info->zone_active_bgs_lock);
3064         spin_lock_init(&fs_info->relocation_bg_lock);
3065         rwlock_init(&fs_info->tree_mod_log_lock);
3066         rwlock_init(&fs_info->global_root_lock);
3067         mutex_init(&fs_info->unused_bg_unpin_mutex);
3068         mutex_init(&fs_info->reclaim_bgs_lock);
3069         mutex_init(&fs_info->reloc_mutex);
3070         mutex_init(&fs_info->delalloc_root_mutex);
3071         mutex_init(&fs_info->zoned_meta_io_lock);
3072         mutex_init(&fs_info->zoned_data_reloc_io_lock);
3073         seqlock_init(&fs_info->profiles_lock);
3074
3075         INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
3076         INIT_LIST_HEAD(&fs_info->space_info);
3077         INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
3078         INIT_LIST_HEAD(&fs_info->unused_bgs);
3079         INIT_LIST_HEAD(&fs_info->reclaim_bgs);
3080         INIT_LIST_HEAD(&fs_info->zone_active_bgs);
3081 #ifdef CONFIG_BTRFS_DEBUG
3082         INIT_LIST_HEAD(&fs_info->allocated_roots);
3083         INIT_LIST_HEAD(&fs_info->allocated_ebs);
3084         spin_lock_init(&fs_info->eb_leak_lock);
3085 #endif
3086         extent_map_tree_init(&fs_info->mapping_tree);
3087         btrfs_init_block_rsv(&fs_info->global_block_rsv,
3088                              BTRFS_BLOCK_RSV_GLOBAL);
3089         btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
3090         btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
3091         btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
3092         btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
3093                              BTRFS_BLOCK_RSV_DELOPS);
3094         btrfs_init_block_rsv(&fs_info->delayed_refs_rsv,
3095                              BTRFS_BLOCK_RSV_DELREFS);
3096
3097         atomic_set(&fs_info->async_delalloc_pages, 0);
3098         atomic_set(&fs_info->defrag_running, 0);
3099         atomic_set(&fs_info->nr_delayed_iputs, 0);
3100         atomic64_set(&fs_info->tree_mod_seq, 0);
3101         fs_info->global_root_tree = RB_ROOT;
3102         fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
3103         fs_info->metadata_ratio = 0;
3104         fs_info->defrag_inodes = RB_ROOT;
3105         atomic64_set(&fs_info->free_chunk_space, 0);
3106         fs_info->tree_mod_log = RB_ROOT;
3107         fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
3108         fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
3109         btrfs_init_ref_verify(fs_info);
3110
3111         fs_info->thread_pool_size = min_t(unsigned long,
3112                                           num_online_cpus() + 2, 8);
3113
3114         INIT_LIST_HEAD(&fs_info->ordered_roots);
3115         spin_lock_init(&fs_info->ordered_root_lock);
3116
3117         btrfs_init_scrub(fs_info);
3118 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
3119         fs_info->check_integrity_print_mask = 0;
3120 #endif
3121         btrfs_init_balance(fs_info);
3122         btrfs_init_async_reclaim_work(fs_info);
3123
3124         rwlock_init(&fs_info->block_group_cache_lock);
3125         fs_info->block_group_cache_tree = RB_ROOT_CACHED;
3126
3127         extent_io_tree_init(fs_info, &fs_info->excluded_extents,
3128                             IO_TREE_FS_EXCLUDED_EXTENTS, NULL);
3129
3130         mutex_init(&fs_info->ordered_operations_mutex);
3131         mutex_init(&fs_info->tree_log_mutex);
3132         mutex_init(&fs_info->chunk_mutex);
3133         mutex_init(&fs_info->transaction_kthread_mutex);
3134         mutex_init(&fs_info->cleaner_mutex);
3135         mutex_init(&fs_info->ro_block_group_mutex);
3136         init_rwsem(&fs_info->commit_root_sem);
3137         init_rwsem(&fs_info->cleanup_work_sem);
3138         init_rwsem(&fs_info->subvol_sem);
3139         sema_init(&fs_info->uuid_tree_rescan_sem, 1);
3140
3141         btrfs_init_dev_replace_locks(fs_info);
3142         btrfs_init_qgroup(fs_info);
3143         btrfs_discard_init(fs_info);
3144
3145         btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
3146         btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
3147
3148         init_waitqueue_head(&fs_info->transaction_throttle);
3149         init_waitqueue_head(&fs_info->transaction_wait);
3150         init_waitqueue_head(&fs_info->transaction_blocked_wait);
3151         init_waitqueue_head(&fs_info->async_submit_wait);
3152         init_waitqueue_head(&fs_info->delayed_iputs_wait);
3153         init_waitqueue_head(&fs_info->zone_finish_wait);
3154
3155         /* Usable values until the real ones are cached from the superblock */
3156         fs_info->nodesize = 4096;
3157         fs_info->sectorsize = 4096;
3158         fs_info->sectorsize_bits = ilog2(4096);
3159         fs_info->stripesize = 4096;
3160
3161         fs_info->max_extent_size = BTRFS_MAX_EXTENT_SIZE;
3162
3163         spin_lock_init(&fs_info->swapfile_pins_lock);
3164         fs_info->swapfile_pins = RB_ROOT;
3165
3166         fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH;
3167         INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work);
3168 }
3169
3170 static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb)
3171 {
3172         int ret;
3173
3174         fs_info->sb = sb;
3175         sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE;
3176         sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE);
3177
3178         ret = percpu_counter_init(&fs_info->ordered_bytes, 0, GFP_KERNEL);
3179         if (ret)
3180                 return ret;
3181
3182         ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
3183         if (ret)
3184                 return ret;
3185
3186         fs_info->dirty_metadata_batch = PAGE_SIZE *
3187                                         (1 + ilog2(nr_cpu_ids));
3188
3189         ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
3190         if (ret)
3191                 return ret;
3192
3193         ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0,
3194                         GFP_KERNEL);
3195         if (ret)
3196                 return ret;
3197
3198         fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
3199                                         GFP_KERNEL);
3200         if (!fs_info->delayed_root)
3201                 return -ENOMEM;
3202         btrfs_init_delayed_root(fs_info->delayed_root);
3203
3204         if (sb_rdonly(sb))
3205                 set_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state);
3206
3207         return btrfs_alloc_stripe_hash_table(fs_info);
3208 }
3209
3210 static int btrfs_uuid_rescan_kthread(void *data)
3211 {
3212         struct btrfs_fs_info *fs_info = data;
3213         int ret;
3214
3215         /*
3216          * 1st step is to iterate through the existing UUID tree and
3217          * to delete all entries that contain outdated data.
3218          * 2nd step is to add all missing entries to the UUID tree.
3219          */
3220         ret = btrfs_uuid_tree_iterate(fs_info);
3221         if (ret < 0) {
3222                 if (ret != -EINTR)
3223                         btrfs_warn(fs_info, "iterating uuid_tree failed %d",
3224                                    ret);
3225                 up(&fs_info->uuid_tree_rescan_sem);
3226                 return ret;
3227         }
3228         return btrfs_uuid_scan_kthread(data);
3229 }
3230
3231 static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
3232 {
3233         struct task_struct *task;
3234
3235         down(&fs_info->uuid_tree_rescan_sem);
3236         task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
3237         if (IS_ERR(task)) {
3238                 /* fs_info->update_uuid_tree_gen remains 0 in all error case */
3239                 btrfs_warn(fs_info, "failed to start uuid_rescan task");
3240                 up(&fs_info->uuid_tree_rescan_sem);
3241                 return PTR_ERR(task);
3242         }
3243
3244         return 0;
3245 }
3246
3247 /*
3248  * Some options only have meaning at mount time and shouldn't persist across
3249  * remounts, or be displayed. Clear these at the end of mount and remount
3250  * code paths.
3251  */
3252 void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info)
3253 {
3254         btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
3255         btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE);
3256 }
3257
3258 /*
3259  * Mounting logic specific to read-write file systems. Shared by open_ctree
3260  * and btrfs_remount when remounting from read-only to read-write.
3261  */
3262 int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
3263 {
3264         int ret;
3265         const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE);
3266         bool clear_free_space_tree = false;
3267
3268         if (btrfs_test_opt(fs_info, CLEAR_CACHE) &&
3269             btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
3270                 clear_free_space_tree = true;
3271         } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
3272                    !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) {
3273                 btrfs_warn(fs_info, "free space tree is invalid");
3274                 clear_free_space_tree = true;
3275         }
3276
3277         if (clear_free_space_tree) {
3278                 btrfs_info(fs_info, "clearing free space tree");
3279                 ret = btrfs_clear_free_space_tree(fs_info);
3280                 if (ret) {
3281                         btrfs_warn(fs_info,
3282                                    "failed to clear free space tree: %d", ret);
3283                         goto out;
3284                 }
3285         }
3286
3287         /*
3288          * btrfs_find_orphan_roots() is responsible for finding all the dead
3289          * roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load
3290          * them into the fs_info->fs_roots_radix tree. This must be done before
3291          * calling btrfs_orphan_cleanup() on the tree root. If we don't do it
3292          * first, then btrfs_orphan_cleanup() will delete a dead root's orphan
3293          * item before the root's tree is deleted - this means that if we unmount
3294          * or crash before the deletion completes, on the next mount we will not
3295          * delete what remains of the tree because the orphan item does not
3296          * exists anymore, which is what tells us we have a pending deletion.
3297          */
3298         ret = btrfs_find_orphan_roots(fs_info);
3299         if (ret)
3300                 goto out;
3301
3302         ret = btrfs_cleanup_fs_roots(fs_info);
3303         if (ret)
3304                 goto out;
3305
3306         down_read(&fs_info->cleanup_work_sem);
3307         if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
3308             (ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
3309                 up_read(&fs_info->cleanup_work_sem);
3310                 goto out;
3311         }
3312         up_read(&fs_info->cleanup_work_sem);
3313
3314         mutex_lock(&fs_info->cleaner_mutex);
3315         ret = btrfs_recover_relocation(fs_info);
3316         mutex_unlock(&fs_info->cleaner_mutex);
3317         if (ret < 0) {
3318                 btrfs_warn(fs_info, "failed to recover relocation: %d", ret);
3319                 goto out;
3320         }
3321
3322         if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) &&
3323             !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
3324                 btrfs_info(fs_info, "creating free space tree");
3325                 ret = btrfs_create_free_space_tree(fs_info);
3326                 if (ret) {
3327                         btrfs_warn(fs_info,
3328                                 "failed to create free space tree: %d", ret);
3329                         goto out;
3330                 }
3331         }
3332
3333         if (cache_opt != btrfs_free_space_cache_v1_active(fs_info)) {
3334                 ret = btrfs_set_free_space_cache_v1_active(fs_info, cache_opt);
3335                 if (ret)
3336                         goto out;
3337         }
3338
3339         ret = btrfs_resume_balance_async(fs_info);
3340         if (ret)
3341                 goto out;
3342
3343         ret = btrfs_resume_dev_replace_async(fs_info);
3344         if (ret) {
3345                 btrfs_warn(fs_info, "failed to resume dev_replace");
3346                 goto out;
3347         }
3348
3349         btrfs_qgroup_rescan_resume(fs_info);
3350
3351         if (!fs_info->uuid_root) {
3352                 btrfs_info(fs_info, "creating UUID tree");
3353                 ret = btrfs_create_uuid_tree(fs_info);
3354                 if (ret) {
3355                         btrfs_warn(fs_info,
3356                                    "failed to create the UUID tree %d", ret);
3357                         goto out;
3358                 }
3359         }
3360
3361 out:
3362         return ret;
3363 }
3364
3365 int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
3366                       char *options)
3367 {
3368         u32 sectorsize;
3369         u32 nodesize;
3370         u32 stripesize;
3371         u64 generation;
3372         u64 features;
3373         u16 csum_type;
3374         struct btrfs_super_block *disk_super;
3375         struct btrfs_fs_info *fs_info = btrfs_sb(sb);
3376         struct btrfs_root *tree_root;
3377         struct btrfs_root *chunk_root;
3378         int ret;
3379         int err = -EINVAL;
3380         int level;
3381
3382         ret = init_mount_fs_info(fs_info, sb);
3383         if (ret) {
3384                 err = ret;
3385                 goto fail;
3386         }
3387
3388         /* These need to be init'ed before we start creating inodes and such. */
3389         tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID,
3390                                      GFP_KERNEL);
3391         fs_info->tree_root = tree_root;
3392         chunk_root = btrfs_alloc_root(fs_info, BTRFS_CHUNK_TREE_OBJECTID,
3393                                       GFP_KERNEL);
3394         fs_info->chunk_root = chunk_root;
3395         if (!tree_root || !chunk_root) {
3396                 err = -ENOMEM;
3397                 goto fail;
3398         }
3399
3400         fs_info->btree_inode = new_inode(sb);
3401         if (!fs_info->btree_inode) {
3402                 err = -ENOMEM;
3403                 goto fail;
3404         }
3405         mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
3406         btrfs_init_btree_inode(fs_info);
3407
3408         invalidate_bdev(fs_devices->latest_dev->bdev);
3409
3410         /*
3411          * Read super block and check the signature bytes only
3412          */
3413         disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev);
3414         if (IS_ERR(disk_super)) {
3415                 err = PTR_ERR(disk_super);
3416                 goto fail_alloc;
3417         }
3418
3419         /*
3420          * Verify the type first, if that or the checksum value are
3421          * corrupted, we'll find out
3422          */
3423         csum_type = btrfs_super_csum_type(disk_super);
3424         if (!btrfs_supported_super_csum(csum_type)) {
3425                 btrfs_err(fs_info, "unsupported checksum algorithm: %u",
3426                           csum_type);
3427                 err = -EINVAL;
3428                 btrfs_release_disk_super(disk_super);
3429                 goto fail_alloc;
3430         }
3431
3432         fs_info->csum_size = btrfs_super_csum_size(disk_super);
3433
3434         ret = btrfs_init_csum_hash(fs_info, csum_type);
3435         if (ret) {
3436                 err = ret;
3437                 btrfs_release_disk_super(disk_super);
3438                 goto fail_alloc;
3439         }
3440
3441         /*
3442          * We want to check superblock checksum, the type is stored inside.
3443          * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
3444          */
3445         if (btrfs_check_super_csum(fs_info, (u8 *)disk_super)) {
3446                 btrfs_err(fs_info, "superblock checksum mismatch");
3447                 err = -EINVAL;
3448                 btrfs_release_disk_super(disk_super);
3449                 goto fail_alloc;
3450         }
3451
3452         /*
3453          * super_copy is zeroed at allocation time and we never touch the
3454          * following bytes up to INFO_SIZE, the checksum is calculated from
3455          * the whole block of INFO_SIZE
3456          */
3457         memcpy(fs_info->super_copy, disk_super, sizeof(*fs_info->super_copy));
3458         btrfs_release_disk_super(disk_super);
3459
3460         disk_super = fs_info->super_copy;
3461
3462
3463         features = btrfs_super_flags(disk_super);
3464         if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) {
3465                 features &= ~BTRFS_SUPER_FLAG_CHANGING_FSID_V2;
3466                 btrfs_set_super_flags(disk_super, features);
3467                 btrfs_info(fs_info,
3468                         "found metadata UUID change in progress flag, clearing");
3469         }
3470
3471         memcpy(fs_info->super_for_commit, fs_info->super_copy,
3472                sizeof(*fs_info->super_for_commit));
3473
3474         ret = btrfs_validate_mount_super(fs_info);
3475         if (ret) {
3476                 btrfs_err(fs_info, "superblock contains fatal errors");
3477                 err = -EINVAL;
3478                 goto fail_alloc;
3479         }
3480
3481         if (!btrfs_super_root(disk_super))
3482                 goto fail_alloc;
3483
3484         /* check FS state, whether FS is broken. */
3485         if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
3486                 set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
3487
3488         /*
3489          * In the long term, we'll store the compression type in the super
3490          * block, and it'll be used for per file compression control.
3491          */
3492         fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
3493
3494
3495         /* Set up fs_info before parsing mount options */
3496         nodesize = btrfs_super_nodesize(disk_super);
3497         sectorsize = btrfs_super_sectorsize(disk_super);
3498         stripesize = sectorsize;
3499         fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
3500         fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
3501
3502         fs_info->nodesize = nodesize;
3503         fs_info->sectorsize = sectorsize;
3504         fs_info->sectorsize_bits = ilog2(sectorsize);
3505         fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
3506         fs_info->stripesize = stripesize;
3507
3508         ret = btrfs_parse_options(fs_info, options, sb->s_flags);
3509         if (ret) {
3510                 err = ret;
3511                 goto fail_alloc;
3512         }
3513
3514         features = btrfs_super_incompat_flags(disk_super) &
3515                 ~BTRFS_FEATURE_INCOMPAT_SUPP;
3516         if (features) {
3517                 btrfs_err(fs_info,
3518                     "cannot mount because of unsupported optional features (0x%llx)",
3519                     features);
3520                 err = -EINVAL;
3521                 goto fail_alloc;
3522         }
3523
3524         features = btrfs_super_incompat_flags(disk_super);
3525         features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
3526         if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
3527                 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
3528         else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD)
3529                 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD;
3530
3531         /*
3532          * Flag our filesystem as having big metadata blocks if they are bigger
3533          * than the page size.
3534          */
3535         if (btrfs_super_nodesize(disk_super) > PAGE_SIZE)
3536                 features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
3537
3538         /*
3539          * mixed block groups end up with duplicate but slightly offset
3540          * extent buffers for the same range.  It leads to corruptions
3541          */
3542         if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
3543             (sectorsize != nodesize)) {
3544                 btrfs_err(fs_info,
3545 "unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
3546                         nodesize, sectorsize);
3547                 goto fail_alloc;
3548         }
3549
3550         /*
3551          * Needn't use the lock because there is no other task which will
3552          * update the flag.
3553          */
3554         btrfs_set_super_incompat_flags(disk_super, features);
3555
3556         features = btrfs_super_compat_ro_flags(disk_super) &
3557                 ~BTRFS_FEATURE_COMPAT_RO_SUPP;
3558         if (!sb_rdonly(sb) && features) {
3559                 btrfs_err(fs_info,
3560         "cannot mount read-write because of unsupported optional features (0x%llx)",
3561                        features);
3562                 err = -EINVAL;
3563                 goto fail_alloc;
3564         }
3565         /*
3566          * We have unsupported RO compat features, although RO mounted, we
3567          * should not cause any metadata write, including log replay.
3568          * Or we could screw up whatever the new feature requires.
3569          */
3570         if (unlikely(features && btrfs_super_log_root(disk_super) &&
3571                      !btrfs_test_opt(fs_info, NOLOGREPLAY))) {
3572                 btrfs_err(fs_info,
3573 "cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay",
3574                           features);
3575                 err = -EINVAL;
3576                 goto fail_alloc;
3577         }
3578
3579
3580         if (sectorsize < PAGE_SIZE) {
3581                 struct btrfs_subpage_info *subpage_info;
3582
3583                 /*
3584                  * V1 space cache has some hardcoded PAGE_SIZE usage, and is
3585                  * going to be deprecated.
3586                  *
3587                  * Force to use v2 cache for subpage case.
3588                  */
3589                 btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
3590                 btrfs_set_and_info(fs_info, FREE_SPACE_TREE,
3591                         "forcing free space tree for sector size %u with page size %lu",
3592                         sectorsize, PAGE_SIZE);
3593
3594                 btrfs_warn(fs_info,
3595                 "read-write for sector size %u with page size %lu is experimental",
3596                            sectorsize, PAGE_SIZE);
3597                 subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL);
3598                 if (!subpage_info)
3599                         goto fail_alloc;
3600                 btrfs_init_subpage_info(subpage_info, sectorsize);
3601                 fs_info->subpage_info = subpage_info;
3602         }
3603
3604         ret = btrfs_init_workqueues(fs_info);
3605         if (ret) {
3606                 err = ret;
3607                 goto fail_sb_buffer;
3608         }
3609
3610         sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
3611         sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
3612
3613         sb->s_blocksize = sectorsize;
3614         sb->s_blocksize_bits = blksize_bits(sectorsize);
3615         memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE);
3616
3617         mutex_lock(&fs_info->chunk_mutex);
3618         ret = btrfs_read_sys_array(fs_info);
3619         mutex_unlock(&fs_info->chunk_mutex);
3620         if (ret) {
3621                 btrfs_err(fs_info, "failed to read the system array: %d", ret);
3622                 goto fail_sb_buffer;
3623         }
3624
3625         generation = btrfs_super_chunk_root_generation(disk_super);
3626         level = btrfs_super_chunk_root_level(disk_super);
3627         ret = load_super_root(chunk_root, btrfs_super_chunk_root(disk_super),
3628                               generation, level);
3629         if (ret) {
3630                 btrfs_err(fs_info, "failed to read chunk root");
3631                 goto fail_tree_roots;
3632         }
3633
3634         read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
3635                            offsetof(struct btrfs_header, chunk_tree_uuid),
3636                            BTRFS_UUID_SIZE);
3637
3638         ret = btrfs_read_chunk_tree(fs_info);
3639         if (ret) {
3640                 btrfs_err(fs_info, "failed to read chunk tree: %d", ret);
3641                 goto fail_tree_roots;
3642         }
3643
3644         /*
3645          * At this point we know all the devices that make this filesystem,
3646          * including the seed devices but we don't know yet if the replace
3647          * target is required. So free devices that are not part of this
3648          * filesystem but skip the replace target device which is checked
3649          * below in btrfs_init_dev_replace().
3650          */
3651         btrfs_free_extra_devids(fs_devices);
3652         if (!fs_devices->latest_dev->bdev) {
3653                 btrfs_err(fs_info, "failed to read devices");
3654                 goto fail_tree_roots;
3655         }
3656
3657         ret = init_tree_roots(fs_info);
3658         if (ret)
3659                 goto fail_tree_roots;
3660
3661         /*
3662          * Get zone type information of zoned block devices. This will also
3663          * handle emulation of a zoned filesystem if a regular device has the
3664          * zoned incompat feature flag set.
3665          */
3666         ret = btrfs_get_dev_zone_info_all_devices(fs_info);
3667         if (ret) {
3668                 btrfs_err(fs_info,
3669                           "zoned: failed to read device zone info: %d",
3670                           ret);
3671                 goto fail_block_groups;
3672         }
3673
3674         /*
3675          * If we have a uuid root and we're not being told to rescan we need to
3676          * check the generation here so we can set the
3677          * BTRFS_FS_UPDATE_UUID_TREE_GEN bit.  Otherwise we could commit the
3678          * transaction during a balance or the log replay without updating the
3679          * uuid generation, and then if we crash we would rescan the uuid tree,
3680          * even though it was perfectly fine.
3681          */
3682         if (fs_info->uuid_root && !btrfs_test_opt(fs_info, RESCAN_UUID_TREE) &&
3683             fs_info->generation == btrfs_super_uuid_tree_generation(disk_super))
3684                 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
3685
3686         ret = btrfs_verify_dev_extents(fs_info);
3687         if (ret) {
3688                 btrfs_err(fs_info,
3689                           "failed to verify dev extents against chunks: %d",
3690                           ret);
3691                 goto fail_block_groups;
3692         }
3693         ret = btrfs_recover_balance(fs_info);
3694         if (ret) {
3695                 btrfs_err(fs_info, "failed to recover balance: %d", ret);
3696                 goto fail_block_groups;
3697         }
3698
3699         ret = btrfs_init_dev_stats(fs_info);
3700         if (ret) {
3701                 btrfs_err(fs_info, "failed to init dev_stats: %d", ret);
3702                 goto fail_block_groups;
3703         }
3704
3705         ret = btrfs_init_dev_replace(fs_info);
3706         if (ret) {
3707                 btrfs_err(fs_info, "failed to init dev_replace: %d", ret);
3708                 goto fail_block_groups;
3709         }
3710
3711         ret = btrfs_check_zoned_mode(fs_info);
3712         if (ret) {
3713                 btrfs_err(fs_info, "failed to initialize zoned mode: %d",
3714                           ret);
3715                 goto fail_block_groups;
3716         }
3717
3718         ret = btrfs_sysfs_add_fsid(fs_devices);
3719         if (ret) {
3720                 btrfs_err(fs_info, "failed to init sysfs fsid interface: %d",
3721                                 ret);
3722                 goto fail_block_groups;
3723         }
3724
3725         ret = btrfs_sysfs_add_mounted(fs_info);
3726         if (ret) {
3727                 btrfs_err(fs_info, "failed to init sysfs interface: %d", ret);
3728                 goto fail_fsdev_sysfs;
3729         }
3730
3731         ret = btrfs_init_space_info(fs_info);
3732         if (ret) {
3733                 btrfs_err(fs_info, "failed to initialize space info: %d", ret);
3734                 goto fail_sysfs;
3735         }
3736
3737         ret = btrfs_read_block_groups(fs_info);
3738         if (ret) {
3739                 btrfs_err(fs_info, "failed to read block groups: %d", ret);
3740                 goto fail_sysfs;
3741         }
3742
3743         btrfs_free_zone_cache(fs_info);
3744
3745         if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices &&
3746             !btrfs_check_rw_degradable(fs_info, NULL)) {
3747                 btrfs_warn(fs_info,
3748                 "writable mount is not allowed due to too many missing devices");
3749                 goto fail_sysfs;
3750         }
3751
3752         fs_info->cleaner_kthread = kthread_run(cleaner_kthread, fs_info,
3753                                                "btrfs-cleaner");
3754         if (IS_ERR(fs_info->cleaner_kthread))
3755                 goto fail_sysfs;
3756
3757         fs_info->transaction_kthread = kthread_run(transaction_kthread,
3758                                                    tree_root,
3759                                                    "btrfs-transaction");
3760         if (IS_ERR(fs_info->transaction_kthread))
3761                 goto fail_cleaner;
3762
3763         if (!btrfs_test_opt(fs_info, NOSSD) &&
3764             !fs_info->fs_devices->rotating) {
3765                 btrfs_set_and_info(fs_info, SSD, "enabling ssd optimizations");
3766         }
3767
3768         /*
3769          * Mount does not set all options immediately, we can do it now and do
3770          * not have to wait for transaction commit
3771          */
3772         btrfs_apply_pending_changes(fs_info);
3773
3774 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
3775         if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) {
3776                 ret = btrfsic_mount(fs_info, fs_devices,
3777                                     btrfs_test_opt(fs_info,
3778                                         CHECK_INTEGRITY_DATA) ? 1 : 0,
3779                                     fs_info->check_integrity_print_mask);
3780                 if (ret)
3781                         btrfs_warn(fs_info,
3782                                 "failed to initialize integrity check module: %d",
3783                                 ret);
3784         }
3785 #endif
3786         ret = btrfs_read_qgroup_config(fs_info);
3787         if (ret)
3788                 goto fail_trans_kthread;
3789
3790         if (btrfs_build_ref_tree(fs_info))
3791                 btrfs_err(fs_info, "couldn't build ref tree");
3792
3793         /* do not make disk changes in broken FS or nologreplay is given */
3794         if (btrfs_super_log_root(disk_super) != 0 &&
3795             !btrfs_test_opt(fs_info, NOLOGREPLAY)) {
3796                 btrfs_info(fs_info, "start tree-log replay");
3797                 ret = btrfs_replay_log(fs_info, fs_devices);
3798                 if (ret) {
3799                         err = ret;
3800                         goto fail_qgroup;
3801                 }
3802         }
3803
3804         fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true);
3805         if (IS_ERR(fs_info->fs_root)) {
3806                 err = PTR_ERR(fs_info->fs_root);
3807                 btrfs_warn(fs_info, "failed to read fs tree: %d", err);
3808                 fs_info->fs_root = NULL;
3809                 goto fail_qgroup;
3810         }
3811
3812         if (sb_rdonly(sb))
3813                 goto clear_oneshot;
3814
3815         ret = btrfs_start_pre_rw_mount(fs_info);
3816         if (ret) {
3817                 close_ctree(fs_info);
3818                 return ret;
3819         }
3820         btrfs_discard_resume(fs_info);
3821
3822         if (fs_info->uuid_root &&
3823             (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) ||
3824              fs_info->generation != btrfs_super_uuid_tree_generation(disk_super))) {
3825                 btrfs_info(fs_info, "checking UUID tree");
3826                 ret = btrfs_check_uuid_tree(fs_info);
3827                 if (ret) {
3828                         btrfs_warn(fs_info,
3829                                 "failed to check the UUID tree: %d", ret);
3830                         close_ctree(fs_info);
3831                         return ret;
3832                 }
3833         }
3834
3835         set_bit(BTRFS_FS_OPEN, &fs_info->flags);
3836
3837         /* Kick the cleaner thread so it'll start deleting snapshots. */
3838         if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags))
3839                 wake_up_process(fs_info->cleaner_kthread);
3840
3841 clear_oneshot:
3842         btrfs_clear_oneshot_options(fs_info);
3843         return 0;
3844
3845 fail_qgroup:
3846         btrfs_free_qgroup_config(fs_info);
3847 fail_trans_kthread:
3848         kthread_stop(fs_info->transaction_kthread);
3849         btrfs_cleanup_transaction(fs_info);
3850         btrfs_free_fs_roots(fs_info);
3851 fail_cleaner:
3852         kthread_stop(fs_info->cleaner_kthread);
3853
3854         /*
3855          * make sure we're done with the btree inode before we stop our
3856          * kthreads
3857          */
3858         filemap_write_and_wait(fs_info->btree_inode->i_mapping);
3859
3860 fail_sysfs:
3861         btrfs_sysfs_remove_mounted(fs_info);
3862
3863 fail_fsdev_sysfs:
3864         btrfs_sysfs_remove_fsid(fs_info->fs_devices);
3865
3866 fail_block_groups:
3867         btrfs_put_block_group_cache(fs_info);
3868
3869 fail_tree_roots:
3870         if (fs_info->data_reloc_root)
3871                 btrfs_drop_and_free_fs_root(fs_info, fs_info->data_reloc_root);
3872         free_root_pointers(fs_info, true);
3873         invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
3874
3875 fail_sb_buffer:
3876         btrfs_stop_all_workers(fs_info);
3877         btrfs_free_block_groups(fs_info);
3878 fail_alloc:
3879         btrfs_mapping_tree_free(&fs_info->mapping_tree);
3880
3881         iput(fs_info->btree_inode);
3882 fail:
3883         btrfs_close_devices(fs_info->fs_devices);
3884         return err;
3885 }
3886 ALLOW_ERROR_INJECTION(open_ctree, ERRNO);
3887
3888 static void btrfs_end_super_write(struct bio *bio)
3889 {
3890         struct btrfs_device *device = bio->bi_private;
3891         struct bio_vec *bvec;
3892         struct bvec_iter_all iter_all;
3893         struct page *page;
3894
3895         bio_for_each_segment_all(bvec, bio, iter_all) {
3896                 page = bvec->bv_page;
3897
3898                 if (bio->bi_status) {
3899                         btrfs_warn_rl_in_rcu(device->fs_info,
3900                                 "lost page write due to IO error on %s (%d)",
3901                                 rcu_str_deref(device->name),
3902                                 blk_status_to_errno(bio->bi_status));
3903                         ClearPageUptodate(page);
3904                         SetPageError(page);
3905                         btrfs_dev_stat_inc_and_print(device,
3906                                                      BTRFS_DEV_STAT_WRITE_ERRS);
3907                 } else {
3908                         SetPageUptodate(page);
3909                 }
3910
3911                 put_page(page);
3912                 unlock_page(page);
3913         }
3914
3915         bio_put(bio);
3916 }
3917
3918 struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
3919                                                    int copy_num)
3920 {
3921         struct btrfs_super_block *super;
3922         struct page *page;
3923         u64 bytenr, bytenr_orig;
3924         struct address_space *mapping = bdev->bd_inode->i_mapping;
3925         int ret;
3926
3927         bytenr_orig = btrfs_sb_offset(copy_num);
3928         ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr);
3929         if (ret == -ENOENT)
3930                 return ERR_PTR(-EINVAL);
3931         else if (ret)
3932                 return ERR_PTR(ret);
3933
3934         if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev))
3935                 return ERR_PTR(-EINVAL);
3936
3937         page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
3938         if (IS_ERR(page))
3939                 return ERR_CAST(page);
3940
3941         super = page_address(page);
3942         if (btrfs_super_magic(super) != BTRFS_MAGIC) {
3943                 btrfs_release_disk_super(super);
3944                 return ERR_PTR(-ENODATA);
3945         }
3946
3947         if (btrfs_super_bytenr(super) != bytenr_orig) {
3948                 btrfs_release_disk_super(super);
3949                 return ERR_PTR(-EINVAL);
3950         }
3951
3952         return super;
3953 }
3954
3955
3956 struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev)
3957 {
3958         struct btrfs_super_block *super, *latest = NULL;
3959         int i;
3960         u64 transid = 0;
3961
3962         /* we would like to check all the supers, but that would make
3963          * a btrfs mount succeed after a mkfs from a different FS.
3964          * So, we need to add a special mount option to scan for
3965          * later supers, using BTRFS_SUPER_MIRROR_MAX instead
3966          */
3967         for (i = 0; i < 1; i++) {
3968                 super = btrfs_read_dev_one_super(bdev, i);
3969                 if (IS_ERR(super))
3970                         continue;
3971
3972                 if (!latest || btrfs_super_generation(super) > transid) {
3973                         if (latest)
3974                                 btrfs_release_disk_super(super);
3975
3976                         latest = super;
3977                         transid = btrfs_super_generation(super);
3978                 }
3979         }
3980
3981         return super;
3982 }
3983
3984 /*
3985  * Write superblock @sb to the @device. Do not wait for completion, all the
3986  * pages we use for writing are locked.
3987  *
3988  * Write @max_mirrors copies of the superblock, where 0 means default that fit
3989  * the expected device size at commit time. Note that max_mirrors must be
3990  * same for write and wait phases.
3991  *
3992  * Return number of errors when page is not found or submission fails.
3993  */
3994 static int write_dev_supers(struct btrfs_device *device,
3995                             struct btrfs_super_block *sb, int max_mirrors)
3996 {
3997         struct btrfs_fs_info *fs_info = device->fs_info;
3998         struct address_space *mapping = device->bdev->bd_inode->i_mapping;
3999         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
4000         int i;
4001         int errors = 0;
4002         int ret;
4003         u64 bytenr, bytenr_orig;
4004
4005         if (max_mirrors == 0)
4006                 max_mirrors = BTRFS_SUPER_MIRROR_MAX;
4007
4008         shash->tfm = fs_info->csum_shash;
4009
4010         for (i = 0; i < max_mirrors; i++) {
4011                 struct page *page;
4012                 struct bio *bio;
4013                 struct btrfs_super_block *disk_super;
4014
4015                 bytenr_orig = btrfs_sb_offset(i);
4016                 ret = btrfs_sb_log_location(device, i, WRITE, &bytenr);
4017                 if (ret == -ENOENT) {
4018                         continue;
4019                 } else if (ret < 0) {
4020                         btrfs_err(device->fs_info,
4021                                 "couldn't get super block location for mirror %d",
4022                                 i);
4023                         errors++;
4024                         continue;
4025                 }
4026                 if (bytenr + BTRFS_SUPER_INFO_SIZE >=
4027                     device->commit_total_bytes)
4028                         break;
4029
4030                 btrfs_set_super_bytenr(sb, bytenr_orig);
4031
4032                 crypto_shash_digest(shash, (const char *)sb + BTRFS_CSUM_SIZE,
4033                                     BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
4034                                     sb->csum);
4035
4036                 page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT,
4037                                            GFP_NOFS);
4038                 if (!page) {
4039                         btrfs_err(device->fs_info,
4040                             "couldn't get super block page for bytenr %llu",
4041                             bytenr);
4042                         errors++;
4043                         continue;
4044                 }
4045
4046                 /* Bump the refcount for wait_dev_supers() */
4047                 get_page(page);
4048
4049                 disk_super = page_address(page);
4050                 memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE);
4051
4052                 /*
4053                  * Directly use bios here instead of relying on the page cache
4054                  * to do I/O, so we don't lose the ability to do integrity
4055                  * checking.
4056                  */
4057                 bio = bio_alloc(device->bdev, 1,
4058                                 REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO,
4059                                 GFP_NOFS);
4060                 bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
4061                 bio->bi_private = device;
4062                 bio->bi_end_io = btrfs_end_super_write;
4063                 __bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE,
4064                                offset_in_page(bytenr));
4065
4066                 /*
4067                  * We FUA only the first super block.  The others we allow to
4068                  * go down lazy and there's a short window where the on-disk
4069                  * copies might still contain the older version.
4070                  */
4071                 if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
4072                         bio->bi_opf |= REQ_FUA;
4073
4074                 btrfsic_check_bio(bio);
4075                 submit_bio(bio);
4076
4077                 if (btrfs_advance_sb_log(device, i))
4078                         errors++;
4079         }
4080         return errors < i ? 0 : -1;
4081 }
4082
4083 /*
4084  * Wait for write completion of superblocks done by write_dev_supers,
4085  * @max_mirrors same for write and wait phases.
4086  *
4087  * Return number of errors when page is not found or not marked up to
4088  * date.
4089  */
4090 static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
4091 {
4092         int i;
4093         int errors = 0;
4094         bool primary_failed = false;
4095         int ret;
4096         u64 bytenr;
4097
4098         if (max_mirrors == 0)
4099                 max_mirrors = BTRFS_SUPER_MIRROR_MAX;
4100
4101         for (i = 0; i < max_mirrors; i++) {
4102                 struct page *page;
4103
4104                 ret = btrfs_sb_log_location(device, i, READ, &bytenr);
4105                 if (ret == -ENOENT) {
4106                         break;
4107                 } else if (ret < 0) {
4108                         errors++;
4109                         if (i == 0)
4110                                 primary_failed = true;
4111                         continue;
4112                 }
4113                 if (bytenr + BTRFS_SUPER_INFO_SIZE >=
4114                     device->commit_total_bytes)
4115                         break;
4116
4117                 page = find_get_page(device->bdev->bd_inode->i_mapping,
4118                                      bytenr >> PAGE_SHIFT);
4119                 if (!page) {
4120                         errors++;
4121                         if (i == 0)
4122                                 primary_failed = true;
4123                         continue;
4124                 }
4125                 /* Page is submitted locked and unlocked once the IO completes */
4126                 wait_on_page_locked(page);
4127                 if (PageError(page)) {
4128                         errors++;
4129                         if (i == 0)
4130                                 primary_failed = true;
4131                 }
4132
4133                 /* Drop our reference */
4134                 put_page(page);
4135
4136                 /* Drop the reference from the writing run */
4137                 put_page(page);
4138         }
4139
4140         /* log error, force error return */
4141         if (primary_failed) {
4142                 btrfs_err(device->fs_info, "error writing primary super block to device %llu",
4143                           device->devid);
4144                 return -1;
4145         }
4146
4147         return errors < i ? 0 : -1;
4148 }
4149
4150 /*
4151  * endio for the write_dev_flush, this will wake anyone waiting
4152  * for the barrier when it is done
4153  */
4154 static void btrfs_end_empty_barrier(struct bio *bio)
4155 {
4156         bio_uninit(bio);
4157         complete(bio->bi_private);
4158 }
4159
4160 /*
4161  * Submit a flush request to the device if it supports it. Error handling is
4162  * done in the waiting counterpart.
4163  */
4164 static void write_dev_flush(struct btrfs_device *device)
4165 {
4166         struct bio *bio = &device->flush_bio;
4167
4168 #ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY
4169         /*
4170          * When a disk has write caching disabled, we skip submission of a bio
4171          * with flush and sync requests before writing the superblock, since
4172          * it's not needed. However when the integrity checker is enabled, this
4173          * results in reports that there are metadata blocks referred by a
4174          * superblock that were not properly flushed. So don't skip the bio
4175          * submission only when the integrity checker is enabled for the sake
4176          * of simplicity, since this is a debug tool and not meant for use in
4177          * non-debug builds.
4178          */
4179         if (!bdev_write_cache(device->bdev))
4180                 return;
4181 #endif
4182
4183         bio_init(bio, device->bdev, NULL, 0,
4184                  REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH);
4185         bio->bi_end_io = btrfs_end_empty_barrier;
4186         init_completion(&device->flush_wait);
4187         bio->bi_private = &device->flush_wait;
4188
4189         btrfsic_check_bio(bio);
4190         submit_bio(bio);
4191         set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
4192 }
4193
4194 /*
4195  * If the flush bio has been submitted by write_dev_flush, wait for it.
4196  */
4197 static blk_status_t wait_dev_flush(struct btrfs_device *device)
4198 {
4199         struct bio *bio = &device->flush_bio;
4200
4201         if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state))
4202                 return BLK_STS_OK;
4203
4204         clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
4205         wait_for_completion_io(&device->flush_wait);
4206
4207         return bio->bi_status;
4208 }
4209
4210 static int check_barrier_error(struct btrfs_fs_info *fs_info)
4211 {
4212         if (!btrfs_check_rw_degradable(fs_info, NULL))
4213                 return -EIO;
4214         return 0;
4215 }
4216
4217 /*
4218  * send an empty flush down to each device in parallel,
4219  * then wait for them
4220  */
4221 static int barrier_all_devices(struct btrfs_fs_info *info)
4222 {
4223         struct list_head *head;
4224         struct btrfs_device *dev;
4225         int errors_wait = 0;
4226         blk_status_t ret;
4227
4228         lockdep_assert_held(&info->fs_devices->device_list_mutex);
4229         /* send down all the barriers */
4230         head = &info->fs_devices->devices;
4231         list_for_each_entry(dev, head, dev_list) {
4232                 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
4233                         continue;
4234                 if (!dev->bdev)
4235                         continue;
4236                 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4237                     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
4238                         continue;
4239
4240                 write_dev_flush(dev);
4241                 dev->last_flush_error = BLK_STS_OK;
4242         }
4243
4244         /* wait for all the barriers */
4245         list_for_each_entry(dev, head, dev_list) {
4246                 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
4247                         continue;
4248                 if (!dev->bdev) {
4249                         errors_wait++;
4250                         continue;
4251                 }
4252                 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4253                     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
4254                         continue;
4255
4256                 ret = wait_dev_flush(dev);
4257                 if (ret) {
4258                         dev->last_flush_error = ret;
4259                         btrfs_dev_stat_inc_and_print(dev,
4260                                         BTRFS_DEV_STAT_FLUSH_ERRS);
4261                         errors_wait++;
4262                 }
4263         }
4264
4265         if (errors_wait) {
4266                 /*
4267                  * At some point we need the status of all disks
4268                  * to arrive at the volume status. So error checking
4269                  * is being pushed to a separate loop.
4270                  */
4271                 return check_barrier_error(info);
4272         }
4273         return 0;
4274 }
4275
4276 int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
4277 {
4278         int raid_type;
4279         int min_tolerated = INT_MAX;
4280
4281         if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 ||
4282             (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE))
4283                 min_tolerated = min_t(int, min_tolerated,
4284                                     btrfs_raid_array[BTRFS_RAID_SINGLE].
4285                                     tolerated_failures);
4286
4287         for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
4288                 if (raid_type == BTRFS_RAID_SINGLE)
4289                         continue;
4290                 if (!(flags & btrfs_raid_array[raid_type].bg_flag))
4291                         continue;
4292                 min_tolerated = min_t(int, min_tolerated,
4293                                     btrfs_raid_array[raid_type].
4294                                     tolerated_failures);
4295         }
4296
4297         if (min_tolerated == INT_MAX) {
4298                 pr_warn("BTRFS: unknown raid flag: %llu", flags);
4299                 min_tolerated = 0;
4300         }
4301
4302         return min_tolerated;
4303 }
4304
4305 int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
4306 {
4307         struct list_head *head;
4308         struct btrfs_device *dev;
4309         struct btrfs_super_block *sb;
4310         struct btrfs_dev_item *dev_item;
4311         int ret;
4312         int do_barriers;
4313         int max_errors;
4314         int total_errors = 0;
4315         u64 flags;
4316
4317         do_barriers = !btrfs_test_opt(fs_info, NOBARRIER);
4318
4319         /*
4320          * max_mirrors == 0 indicates we're from commit_transaction,
4321          * not from fsync where the tree roots in fs_info have not
4322          * been consistent on disk.
4323          */
4324         if (max_mirrors == 0)
4325                 backup_super_roots(fs_info);
4326
4327         sb = fs_info->super_for_commit;
4328         dev_item = &sb->dev_item;
4329
4330         mutex_lock(&fs_info->fs_devices->device_list_mutex);
4331         head = &fs_info->fs_devices->devices;
4332         max_errors = btrfs_super_num_devices(fs_info->super_copy) - 1;
4333
4334         if (do_barriers) {
4335                 ret = barrier_all_devices(fs_info);
4336                 if (ret) {
4337                         mutex_unlock(
4338                                 &fs_info->fs_devices->device_list_mutex);
4339                         btrfs_handle_fs_error(fs_info, ret,
4340                                               "errors while submitting device barriers.");
4341                         return ret;
4342                 }
4343         }
4344
4345         list_for_each_entry(dev, head, dev_list) {
4346                 if (!dev->bdev) {
4347                         total_errors++;
4348                         continue;
4349                 }
4350                 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4351                     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
4352                         continue;
4353
4354                 btrfs_set_stack_device_generation(dev_item, 0);
4355                 btrfs_set_stack_device_type(dev_item, dev->type);
4356                 btrfs_set_stack_device_id(dev_item, dev->devid);
4357                 btrfs_set_stack_device_total_bytes(dev_item,
4358                                                    dev->commit_total_bytes);
4359                 btrfs_set_stack_device_bytes_used(dev_item,
4360                                                   dev->commit_bytes_used);
4361                 btrfs_set_stack_device_io_align(dev_item, dev->io_align);
4362                 btrfs_set_stack_device_io_width(dev_item, dev->io_width);
4363                 btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
4364                 memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
4365                 memcpy(dev_item->fsid, dev->fs_devices->metadata_uuid,
4366                        BTRFS_FSID_SIZE);
4367
4368                 flags = btrfs_super_flags(sb);
4369                 btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
4370
4371                 ret = btrfs_validate_write_super(fs_info, sb);
4372                 if (ret < 0) {
4373                         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4374                         btrfs_handle_fs_error(fs_info, -EUCLEAN,
4375                                 "unexpected superblock corruption detected");
4376                         return -EUCLEAN;
4377                 }
4378
4379                 ret = write_dev_supers(dev, sb, max_mirrors);
4380                 if (ret)
4381                         total_errors++;
4382         }
4383         if (total_errors > max_errors) {
4384                 btrfs_err(fs_info, "%d errors while writing supers",
4385                           total_errors);
4386                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4387
4388                 /* FUA is masked off if unsupported and can't be the reason */
4389                 btrfs_handle_fs_error(fs_info, -EIO,
4390                                       "%d errors while writing supers",
4391                                       total_errors);
4392                 return -EIO;
4393         }
4394
4395         total_errors = 0;
4396         list_for_each_entry(dev, head, dev_list) {
4397                 if (!dev->bdev)
4398                         continue;
4399                 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4400                     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
4401                         continue;
4402
4403                 ret = wait_dev_supers(dev, max_mirrors);
4404                 if (ret)
4405                         total_errors++;
4406         }
4407         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4408         if (total_errors > max_errors) {
4409                 btrfs_handle_fs_error(fs_info, -EIO,
4410                                       "%d errors while writing supers",
4411                                       total_errors);
4412                 return -EIO;
4413         }
4414         return 0;
4415 }
4416
4417 /* Drop a fs root from the radix tree and free it. */
4418 void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
4419                                   struct btrfs_root *root)
4420 {
4421         bool drop_ref = false;
4422
4423         spin_lock(&fs_info->fs_roots_radix_lock);
4424         radix_tree_delete(&fs_info->fs_roots_radix,
4425                           (unsigned long)root->root_key.objectid);
4426         if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state))
4427                 drop_ref = true;
4428         spin_unlock(&fs_info->fs_roots_radix_lock);
4429
4430         if (BTRFS_FS_ERROR(fs_info)) {
4431                 ASSERT(root->log_root == NULL);
4432                 if (root->reloc_root) {
4433                         btrfs_put_root(root->reloc_root);
4434                         root->reloc_root = NULL;
4435                 }
4436         }
4437
4438         if (drop_ref)
4439                 btrfs_put_root(root);
4440 }
4441
4442 int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
4443 {
4444         u64 root_objectid = 0;
4445         struct btrfs_root *gang[8];
4446         int i = 0;
4447         int err = 0;
4448         unsigned int ret = 0;
4449
4450         while (1) {
4451                 spin_lock(&fs_info->fs_roots_radix_lock);
4452                 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
4453                                              (void **)gang, root_objectid,
4454                                              ARRAY_SIZE(gang));
4455                 if (!ret) {
4456                         spin_unlock(&fs_info->fs_roots_radix_lock);
4457                         break;
4458                 }
4459                 root_objectid = gang[ret - 1]->root_key.objectid + 1;
4460
4461                 for (i = 0; i < ret; i++) {
4462                         /* Avoid to grab roots in dead_roots */
4463                         if (btrfs_root_refs(&gang[i]->root_item) == 0) {
4464                                 gang[i] = NULL;
4465                                 continue;
4466                         }
4467                         /* grab all the search result for later use */
4468                         gang[i] = btrfs_grab_root(gang[i]);
4469                 }
4470                 spin_unlock(&fs_info->fs_roots_radix_lock);
4471
4472                 for (i = 0; i < ret; i++) {
4473                         if (!gang[i])
4474                                 continue;
4475                         root_objectid = gang[i]->root_key.objectid;
4476                         err = btrfs_orphan_cleanup(gang[i]);
4477                         if (err)
4478                                 break;
4479                         btrfs_put_root(gang[i]);
4480                 }
4481                 root_objectid++;
4482         }
4483
4484         /* release the uncleaned roots due to error */
4485         for (; i < ret; i++) {
4486                 if (gang[i])
4487                         btrfs_put_root(gang[i]);
4488         }
4489         return err;
4490 }
4491
4492 int btrfs_commit_super(struct btrfs_fs_info *fs_info)
4493 {
4494         struct btrfs_root *root = fs_info->tree_root;
4495         struct btrfs_trans_handle *trans;
4496
4497         mutex_lock(&fs_info->cleaner_mutex);
4498         btrfs_run_delayed_iputs(fs_info);
4499         mutex_unlock(&fs_info->cleaner_mutex);
4500         wake_up_process(fs_info->cleaner_kthread);
4501
4502         /* wait until ongoing cleanup work done */
4503         down_write(&fs_info->cleanup_work_sem);
4504         up_write(&fs_info->cleanup_work_sem);
4505
4506         trans = btrfs_join_transaction(root);
4507         if (IS_ERR(trans))
4508                 return PTR_ERR(trans);
4509         return btrfs_commit_transaction(trans);
4510 }
4511
4512 static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info)
4513 {
4514         struct btrfs_transaction *trans;
4515         struct btrfs_transaction *tmp;
4516         bool found = false;
4517
4518         if (list_empty(&fs_info->trans_list))
4519                 return;
4520
4521         /*
4522          * This function is only called at the very end of close_ctree(),
4523          * thus no other running transaction, no need to take trans_lock.
4524          */
4525         ASSERT(test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags));
4526         list_for_each_entry_safe(trans, tmp, &fs_info->trans_list, list) {
4527                 struct extent_state *cached = NULL;
4528                 u64 dirty_bytes = 0;
4529                 u64 cur = 0;
4530                 u64 found_start;
4531                 u64 found_end;
4532
4533                 found = true;
4534                 while (!find_first_extent_bit(&trans->dirty_pages, cur,
4535                         &found_start, &found_end, EXTENT_DIRTY, &cached)) {
4536                         dirty_bytes += found_end + 1 - found_start;
4537                         cur = found_end + 1;
4538                 }
4539                 btrfs_warn(fs_info,
4540         "transaction %llu (with %llu dirty metadata bytes) is not committed",
4541                            trans->transid, dirty_bytes);
4542                 btrfs_cleanup_one_transaction(trans, fs_info);
4543
4544                 if (trans == fs_info->running_transaction)
4545                         fs_info->running_transaction = NULL;
4546                 list_del_init(&trans->list);
4547
4548                 btrfs_put_transaction(trans);
4549                 trace_btrfs_transaction_commit(fs_info);
4550         }
4551         ASSERT(!found);
4552 }
4553
4554 void __cold close_ctree(struct btrfs_fs_info *fs_info)
4555 {
4556         int ret;
4557
4558         set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);
4559
4560         /*
4561          * We may have the reclaim task running and relocating a data block group,
4562          * in which case it may create delayed iputs. So stop it before we park
4563          * the cleaner kthread otherwise we can get new delayed iputs after
4564          * parking the cleaner, and that can make the async reclaim task to hang
4565          * if it's waiting for delayed iputs to complete, since the cleaner is
4566          * parked and can not run delayed iputs - this will make us hang when
4567          * trying to stop the async reclaim task.
4568          */
4569         cancel_work_sync(&fs_info->reclaim_bgs_work);
4570         /*
4571          * We don't want the cleaner to start new transactions, add more delayed
4572          * iputs, etc. while we're closing. We can't use kthread_stop() yet
4573          * because that frees the task_struct, and the transaction kthread might
4574          * still try to wake up the cleaner.
4575          */
4576         kthread_park(fs_info->cleaner_kthread);
4577
4578         /*
4579          * If we had UNFINISHED_DROPS we could still be processing them, so
4580          * clear that bit and wake up relocation so it can stop.
4581          */
4582         btrfs_wake_unfinished_drop(fs_info);
4583
4584         /* wait for the qgroup rescan worker to stop */
4585         btrfs_qgroup_wait_for_completion(fs_info, false);
4586
4587         /* wait for the uuid_scan task to finish */
4588         down(&fs_info->uuid_tree_rescan_sem);
4589         /* avoid complains from lockdep et al., set sem back to initial state */
4590         up(&fs_info->uuid_tree_rescan_sem);
4591
4592         /* pause restriper - we want to resume on mount */
4593         btrfs_pause_balance(fs_info);
4594
4595         btrfs_dev_replace_suspend_for_unmount(fs_info);
4596
4597         btrfs_scrub_cancel(fs_info);
4598
4599         /* wait for any defraggers to finish */
4600         wait_event(fs_info->transaction_wait,
4601                    (atomic_read(&fs_info->defrag_running) == 0));
4602
4603         /* clear out the rbtree of defraggable inodes */
4604         btrfs_cleanup_defrag_inodes(fs_info);
4605
4606         cancel_work_sync(&fs_info->async_reclaim_work);
4607         cancel_work_sync(&fs_info->async_data_reclaim_work);
4608         cancel_work_sync(&fs_info->preempt_reclaim_work);
4609
4610         /* Cancel or finish ongoing discard work */
4611         btrfs_discard_cleanup(fs_info);
4612
4613         if (!sb_rdonly(fs_info->sb)) {
4614                 /*
4615                  * The cleaner kthread is stopped, so do one final pass over
4616                  * unused block groups.
4617                  */
4618                 btrfs_delete_unused_bgs(fs_info);
4619
4620                 /*
4621                  * There might be existing delayed inode workers still running
4622                  * and holding an empty delayed inode item. We must wait for
4623                  * them to complete first because they can create a transaction.
4624                  * This happens when someone calls btrfs_balance_delayed_items()
4625                  * and then a transaction commit runs the same delayed nodes
4626                  * before any delayed worker has done something with the nodes.
4627                  * We must wait for any worker here and not at transaction
4628                  * commit time since that could cause a deadlock.
4629                  * This is a very rare case.
4630                  */
4631                 btrfs_flush_workqueue(fs_info->delayed_workers);
4632
4633                 ret = btrfs_commit_super(fs_info);
4634                 if (ret)
4635                         btrfs_err(fs_info, "commit super ret %d", ret);
4636         }
4637
4638         if (BTRFS_FS_ERROR(fs_info))
4639                 btrfs_error_commit_super(fs_info);
4640
4641         kthread_stop(fs_info->transaction_kthread);
4642         kthread_stop(fs_info->cleaner_kthread);
4643
4644         ASSERT(list_empty(&fs_info->delayed_iputs));
4645         set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);
4646
4647         if (btrfs_check_quota_leak(fs_info)) {
4648                 WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
4649                 btrfs_err(fs_info, "qgroup reserved space leaked");
4650         }
4651
4652         btrfs_free_qgroup_config(fs_info);
4653         ASSERT(list_empty(&fs_info->delalloc_roots));
4654
4655         if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
4656                 btrfs_info(fs_info, "at unmount delalloc count %lld",
4657                        percpu_counter_sum(&fs_info->delalloc_bytes));
4658         }
4659
4660         if (percpu_counter_sum(&fs_info->ordered_bytes))
4661                 btrfs_info(fs_info, "at unmount dio bytes count %lld",
4662                            percpu_counter_sum(&fs_info->ordered_bytes));
4663
4664         btrfs_sysfs_remove_mounted(fs_info);
4665         btrfs_sysfs_remove_fsid(fs_info->fs_devices);
4666
4667         btrfs_put_block_group_cache(fs_info);
4668
4669         /*
4670          * we must make sure there is not any read request to
4671          * submit after we stopping all workers.
4672          */
4673         invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
4674         btrfs_stop_all_workers(fs_info);
4675
4676         /* We shouldn't have any transaction open at this point */
4677         warn_about_uncommitted_trans(fs_info);
4678
4679         clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
4680         free_root_pointers(fs_info, true);
4681         btrfs_free_fs_roots(fs_info);
4682
4683         /*
4684          * We must free the block groups after dropping the fs_roots as we could
4685          * have had an IO error and have left over tree log blocks that aren't
4686          * cleaned up until the fs roots are freed.  This makes the block group
4687          * accounting appear to be wrong because there's pending reserved bytes,
4688          * so make sure we do the block group cleanup afterwards.
4689          */
4690         btrfs_free_block_groups(fs_info);
4691
4692         iput(fs_info->btree_inode);
4693
4694 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
4695         if (btrfs_test_opt(fs_info, CHECK_INTEGRITY))
4696                 btrfsic_unmount(fs_info->fs_devices);
4697 #endif
4698
4699         btrfs_mapping_tree_free(&fs_info->mapping_tree);
4700         btrfs_close_devices(fs_info->fs_devices);
4701 }
4702
4703 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
4704                           int atomic)
4705 {
4706         int ret;
4707         struct inode *btree_inode = buf->pages[0]->mapping->host;
4708
4709         ret = extent_buffer_uptodate(buf);
4710         if (!ret)
4711                 return ret;
4712
4713         ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
4714                                     parent_transid, atomic);
4715         if (ret == -EAGAIN)
4716                 return ret;
4717         return !ret;
4718 }
4719
4720 void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
4721 {
4722         struct btrfs_fs_info *fs_info = buf->fs_info;
4723         u64 transid = btrfs_header_generation(buf);
4724         int was_dirty;
4725
4726 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4727         /*
4728          * This is a fast path so only do this check if we have sanity tests
4729          * enabled.  Normal people shouldn't be using unmapped buffers as dirty
4730          * outside of the sanity tests.
4731          */
4732         if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
4733                 return;
4734 #endif
4735         btrfs_assert_tree_write_locked(buf);
4736         if (transid != fs_info->generation)
4737                 WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n",
4738                         buf->start, transid, fs_info->generation);
4739         was_dirty = set_extent_buffer_dirty(buf);
4740         if (!was_dirty)
4741                 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
4742                                          buf->len,
4743                                          fs_info->dirty_metadata_batch);
4744 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
4745         /*
4746          * Since btrfs_mark_buffer_dirty() can be called with item pointer set
4747          * but item data not updated.
4748          * So here we should only check item pointers, not item data.
4749          */
4750         if (btrfs_header_level(buf) == 0 &&
4751             btrfs_check_leaf_relaxed(buf)) {
4752                 btrfs_print_leaf(buf);
4753                 ASSERT(0);
4754         }
4755 #endif
4756 }
4757
4758 static void __btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info,
4759                                         int flush_delayed)
4760 {
4761         /*
4762          * looks as though older kernels can get into trouble with
4763          * this code, they end up stuck in balance_dirty_pages forever
4764          */
4765         int ret;
4766
4767         if (current->flags & PF_MEMALLOC)
4768                 return;
4769
4770         if (flush_delayed)
4771                 btrfs_balance_delayed_items(fs_info);
4772
4773         ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
4774                                      BTRFS_DIRTY_METADATA_THRESH,
4775                                      fs_info->dirty_metadata_batch);
4776         if (ret > 0) {
4777                 balance_dirty_pages_ratelimited(fs_info->btree_inode->i_mapping);
4778         }
4779 }
4780
/* Balance dirty btree pages, balancing delayed items first. */
void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info)
{
	const int flush_delayed = 1;

	__btrfs_btree_balance_dirty(fs_info, flush_delayed);
}
4785
/* Balance dirty btree pages without balancing delayed items first. */
void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info)
{
	const int flush_delayed = 0;

	__btrfs_btree_balance_dirty(fs_info, flush_delayed);
}
4790
4791 static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
4792 {
4793         /* cleanup FS via transaction */
4794         btrfs_cleanup_transaction(fs_info);
4795
4796         mutex_lock(&fs_info->cleaner_mutex);
4797         btrfs_run_delayed_iputs(fs_info);
4798         mutex_unlock(&fs_info->cleaner_mutex);
4799
4800         down_write(&fs_info->cleanup_work_sem);
4801         up_write(&fs_info->cleanup_work_sem);
4802 }
4803
4804 static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info)
4805 {
4806         struct btrfs_root *gang[8];
4807         u64 root_objectid = 0;
4808         int ret;
4809
4810         spin_lock(&fs_info->fs_roots_radix_lock);
4811         while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
4812                                              (void **)gang, root_objectid,
4813                                              ARRAY_SIZE(gang))) != 0) {
4814                 int i;
4815
4816                 for (i = 0; i < ret; i++)
4817                         gang[i] = btrfs_grab_root(gang[i]);
4818                 spin_unlock(&fs_info->fs_roots_radix_lock);
4819
4820                 for (i = 0; i < ret; i++) {
4821                         if (!gang[i])
4822                                 continue;
4823                         root_objectid = gang[i]->root_key.objectid;
4824                         btrfs_free_log(NULL, gang[i]);
4825                         btrfs_put_root(gang[i]);
4826                 }
4827                 root_objectid++;
4828                 spin_lock(&fs_info->fs_roots_radix_lock);
4829         }
4830         spin_unlock(&fs_info->fs_roots_radix_lock);
4831         btrfs_free_log_root_tree(NULL, fs_info);
4832 }
4833
4834 static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
4835 {
4836         struct btrfs_ordered_extent *ordered;
4837
4838         spin_lock(&root->ordered_extent_lock);
4839         /*
4840          * This will just short circuit the ordered completion stuff which will
4841          * make sure the ordered extent gets properly cleaned up.
4842          */
4843         list_for_each_entry(ordered, &root->ordered_extents,
4844                             root_extent_list)
4845                 set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
4846         spin_unlock(&root->ordered_extent_lock);
4847 }
4848
4849 static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
4850 {
4851         struct btrfs_root *root;
4852         struct list_head splice;
4853
4854         INIT_LIST_HEAD(&splice);
4855
4856         spin_lock(&fs_info->ordered_root_lock);
4857         list_splice_init(&fs_info->ordered_roots, &splice);
4858         while (!list_empty(&splice)) {
4859                 root = list_first_entry(&splice, struct btrfs_root,
4860                                         ordered_root);
4861                 list_move_tail(&root->ordered_root,
4862                                &fs_info->ordered_roots);
4863
4864                 spin_unlock(&fs_info->ordered_root_lock);
4865                 btrfs_destroy_ordered_extents(root);
4866
4867                 cond_resched();
4868                 spin_lock(&fs_info->ordered_root_lock);
4869         }
4870         spin_unlock(&fs_info->ordered_root_lock);
4871
4872         /*
4873          * We need this here because if we've been flipped read-only we won't
4874          * get sync() from the umount, so we need to make sure any ordered
4875          * extents that haven't had their dirty pages IO start writeout yet
4876          * actually get run and error out properly.
4877          */
4878         btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
4879 }
4880
4881 static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
4882                                       struct btrfs_fs_info *fs_info)
4883 {
4884         struct rb_node *node;
4885         struct btrfs_delayed_ref_root *delayed_refs;
4886         struct btrfs_delayed_ref_node *ref;
4887         int ret = 0;
4888
4889         delayed_refs = &trans->delayed_refs;
4890
4891         spin_lock(&delayed_refs->lock);
4892         if (atomic_read(&delayed_refs->num_entries) == 0) {
4893                 spin_unlock(&delayed_refs->lock);
4894                 btrfs_debug(fs_info, "delayed_refs has NO entry");
4895                 return ret;
4896         }
4897
4898         while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) {
4899                 struct btrfs_delayed_ref_head *head;
4900                 struct rb_node *n;
4901                 bool pin_bytes = false;
4902
4903                 head = rb_entry(node, struct btrfs_delayed_ref_head,
4904                                 href_node);
4905                 if (btrfs_delayed_ref_lock(delayed_refs, head))
4906                         continue;
4907
4908                 spin_lock(&head->lock);
4909                 while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
4910                         ref = rb_entry(n, struct btrfs_delayed_ref_node,
4911                                        ref_node);
4912                         ref->in_tree = 0;
4913                         rb_erase_cached(&ref->ref_node, &head->ref_tree);
4914                         RB_CLEAR_NODE(&ref->ref_node);
4915                         if (!list_empty(&ref->add_list))
4916                                 list_del(&ref->add_list);
4917                         atomic_dec(&delayed_refs->num_entries);
4918                         btrfs_put_delayed_ref(ref);
4919                 }
4920                 if (head->must_insert_reserved)
4921                         pin_bytes = true;
4922                 btrfs_free_delayed_extent_op(head->extent_op);
4923                 btrfs_delete_ref_head(delayed_refs, head);
4924                 spin_unlock(&head->lock);
4925                 spin_unlock(&delayed_refs->lock);
4926                 mutex_unlock(&head->mutex);
4927
4928                 if (pin_bytes) {
4929                         struct btrfs_block_group *cache;
4930
4931                         cache = btrfs_lookup_block_group(fs_info, head->bytenr);
4932                         BUG_ON(!cache);
4933
4934                         spin_lock(&cache->space_info->lock);
4935                         spin_lock(&cache->lock);
4936                         cache->pinned += head->num_bytes;
4937                         btrfs_space_info_update_bytes_pinned(fs_info,
4938                                 cache->space_info, head->num_bytes);
4939                         cache->reserved -= head->num_bytes;
4940                         cache->space_info->bytes_reserved -= head->num_bytes;
4941                         spin_unlock(&cache->lock);
4942                         spin_unlock(&cache->space_info->lock);
4943
4944                         btrfs_put_block_group(cache);
4945
4946                         btrfs_error_unpin_extent_range(fs_info, head->bytenr,
4947                                 head->bytenr + head->num_bytes - 1);
4948                 }
4949                 btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
4950                 btrfs_put_delayed_ref_head(head);
4951                 cond_resched();
4952                 spin_lock(&delayed_refs->lock);
4953         }
4954         btrfs_qgroup_destroy_extent_records(trans);
4955
4956         spin_unlock(&delayed_refs->lock);
4957
4958         return ret;
4959 }
4960
4961 static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
4962 {
4963         struct btrfs_inode *btrfs_inode;
4964         struct list_head splice;
4965
4966         INIT_LIST_HEAD(&splice);
4967
4968         spin_lock(&root->delalloc_lock);
4969         list_splice_init(&root->delalloc_inodes, &splice);
4970
4971         while (!list_empty(&splice)) {
4972                 struct inode *inode = NULL;
4973                 btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
4974                                                delalloc_inodes);
4975                 __btrfs_del_delalloc_inode(root, btrfs_inode);
4976                 spin_unlock(&root->delalloc_lock);
4977
4978                 /*
4979                  * Make sure we get a live inode and that it'll not disappear
4980                  * meanwhile.
4981                  */
4982                 inode = igrab(&btrfs_inode->vfs_inode);
4983                 if (inode) {
4984                         invalidate_inode_pages2(inode->i_mapping);
4985                         iput(inode);
4986                 }
4987                 spin_lock(&root->delalloc_lock);
4988         }
4989         spin_unlock(&root->delalloc_lock);
4990 }
4991
4992 static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
4993 {
4994         struct btrfs_root *root;
4995         struct list_head splice;
4996
4997         INIT_LIST_HEAD(&splice);
4998
4999         spin_lock(&fs_info->delalloc_root_lock);
5000         list_splice_init(&fs_info->delalloc_roots, &splice);
5001         while (!list_empty(&splice)) {
5002                 root = list_first_entry(&splice, struct btrfs_root,
5003                                          delalloc_root);
5004                 root = btrfs_grab_root(root);
5005                 BUG_ON(!root);
5006                 spin_unlock(&fs_info->delalloc_root_lock);
5007
5008                 btrfs_destroy_delalloc_inodes(root);
5009                 btrfs_put_root(root);
5010
5011                 spin_lock(&fs_info->delalloc_root_lock);
5012         }
5013         spin_unlock(&fs_info->delalloc_root_lock);
5014 }
5015
5016 static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
5017                                         struct extent_io_tree *dirty_pages,
5018                                         int mark)
5019 {
5020         int ret;
5021         struct extent_buffer *eb;
5022         u64 start = 0;
5023         u64 end;
5024
5025         while (1) {
5026                 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
5027                                             mark, NULL);
5028                 if (ret)
5029                         break;
5030
5031                 clear_extent_bits(dirty_pages, start, end, mark);
5032                 while (start <= end) {
5033                         eb = find_extent_buffer(fs_info, start);
5034                         start += fs_info->nodesize;
5035                         if (!eb)
5036                                 continue;
5037                         wait_on_extent_buffer_writeback(eb);
5038
5039                         if (test_and_clear_bit(EXTENT_BUFFER_DIRTY,
5040                                                &eb->bflags))
5041                                 clear_extent_buffer_dirty(eb);
5042                         free_extent_buffer_stale(eb);
5043                 }
5044         }
5045
5046         return ret;
5047 }
5048
5049 static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
5050                                        struct extent_io_tree *unpin)
5051 {
5052         u64 start;
5053         u64 end;
5054         int ret;
5055
5056         while (1) {
5057                 struct extent_state *cached_state = NULL;
5058
5059                 /*
5060                  * The btrfs_finish_extent_commit() may get the same range as
5061                  * ours between find_first_extent_bit and clear_extent_dirty.
5062                  * Hence, hold the unused_bg_unpin_mutex to avoid double unpin
5063                  * the same extent range.
5064                  */
5065                 mutex_lock(&fs_info->unused_bg_unpin_mutex);
5066                 ret = find_first_extent_bit(unpin, 0, &start, &end,
5067                                             EXTENT_DIRTY, &cached_state);
5068                 if (ret) {
5069                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
5070                         break;
5071                 }
5072
5073                 clear_extent_dirty(unpin, start, end, &cached_state);
5074                 free_extent_state(cached_state);
5075                 btrfs_error_unpin_extent_range(fs_info, start, end);
5076                 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
5077                 cond_resched();
5078         }
5079
5080         return 0;
5081 }
5082
5083 static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache)
5084 {
5085         struct inode *inode;
5086
5087         inode = cache->io_ctl.inode;
5088         if (inode) {
5089                 invalidate_inode_pages2(inode->i_mapping);
5090                 BTRFS_I(inode)->generation = 0;
5091                 cache->io_ctl.inode = NULL;
5092                 iput(inode);
5093         }
5094         ASSERT(cache->io_ctl.pages == NULL);
5095         btrfs_put_block_group(cache);
5096 }
5097
5098 void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
5099                              struct btrfs_fs_info *fs_info)
5100 {
5101         struct btrfs_block_group *cache;
5102
5103         spin_lock(&cur_trans->dirty_bgs_lock);
5104         while (!list_empty(&cur_trans->dirty_bgs)) {
5105                 cache = list_first_entry(&cur_trans->dirty_bgs,
5106                                          struct btrfs_block_group,
5107                                          dirty_list);
5108
5109                 if (!list_empty(&cache->io_list)) {
5110                         spin_unlock(&cur_trans->dirty_bgs_lock);
5111                         list_del_init(&cache->io_list);
5112                         btrfs_cleanup_bg_io(cache);
5113                         spin_lock(&cur_trans->dirty_bgs_lock);
5114                 }
5115
5116                 list_del_init(&cache->dirty_list);
5117                 spin_lock(&cache->lock);
5118                 cache->disk_cache_state = BTRFS_DC_ERROR;
5119                 spin_unlock(&cache->lock);
5120
5121                 spin_unlock(&cur_trans->dirty_bgs_lock);
5122                 btrfs_put_block_group(cache);
5123                 btrfs_delayed_refs_rsv_release(fs_info, 1);
5124                 spin_lock(&cur_trans->dirty_bgs_lock);
5125         }
5126         spin_unlock(&cur_trans->dirty_bgs_lock);
5127
5128         /*
5129          * Refer to the definition of io_bgs member for details why it's safe
5130          * to use it without any locking
5131          */
5132         while (!list_empty(&cur_trans->io_bgs)) {
5133                 cache = list_first_entry(&cur_trans->io_bgs,
5134                                          struct btrfs_block_group,
5135                                          io_list);
5136
5137                 list_del_init(&cache->io_list);
5138                 spin_lock(&cache->lock);
5139                 cache->disk_cache_state = BTRFS_DC_ERROR;
5140                 spin_unlock(&cache->lock);
5141                 btrfs_cleanup_bg_io(cache);
5142         }
5143 }
5144
5145 void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
5146                                    struct btrfs_fs_info *fs_info)
5147 {
5148         struct btrfs_device *dev, *tmp;
5149
5150         btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
5151         ASSERT(list_empty(&cur_trans->dirty_bgs));
5152         ASSERT(list_empty(&cur_trans->io_bgs));
5153
5154         list_for_each_entry_safe(dev, tmp, &cur_trans->dev_update_list,
5155                                  post_commit_list) {
5156                 list_del_init(&dev->post_commit_list);
5157         }
5158
5159         btrfs_destroy_delayed_refs(cur_trans, fs_info);
5160
5161         cur_trans->state = TRANS_STATE_COMMIT_START;
5162         wake_up(&fs_info->transaction_blocked_wait);
5163
5164         cur_trans->state = TRANS_STATE_UNBLOCKED;
5165         wake_up(&fs_info->transaction_wait);
5166
5167         btrfs_destroy_delayed_inodes(fs_info);
5168
5169         btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages,
5170                                      EXTENT_DIRTY);
5171         btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents);
5172
5173         btrfs_free_redirty_list(cur_trans);
5174
5175         cur_trans->state =TRANS_STATE_COMPLETED;
5176         wake_up(&cur_trans->commit_wait);
5177 }
5178
5179 static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
5180 {
5181         struct btrfs_transaction *t;
5182
5183         mutex_lock(&fs_info->transaction_kthread_mutex);
5184
5185         spin_lock(&fs_info->trans_lock);
5186         while (!list_empty(&fs_info->trans_list)) {
5187                 t = list_first_entry(&fs_info->trans_list,
5188                                      struct btrfs_transaction, list);
5189                 if (t->state >= TRANS_STATE_COMMIT_START) {
5190                         refcount_inc(&t->use_count);
5191                         spin_unlock(&fs_info->trans_lock);
5192                         btrfs_wait_for_commit(fs_info, t->transid);
5193                         btrfs_put_transaction(t);
5194                         spin_lock(&fs_info->trans_lock);
5195                         continue;
5196                 }
5197                 if (t == fs_info->running_transaction) {
5198                         t->state = TRANS_STATE_COMMIT_DOING;
5199                         spin_unlock(&fs_info->trans_lock);
5200                         /*
5201                          * We wait for 0 num_writers since we don't hold a trans
5202                          * handle open currently for this transaction.
5203                          */
5204                         wait_event(t->writer_wait,
5205                                    atomic_read(&t->num_writers) == 0);
5206                 } else {
5207                         spin_unlock(&fs_info->trans_lock);
5208                 }
5209                 btrfs_cleanup_one_transaction(t, fs_info);
5210
5211                 spin_lock(&fs_info->trans_lock);
5212                 if (t == fs_info->running_transaction)
5213                         fs_info->running_transaction = NULL;
5214                 list_del_init(&t->list);
5215                 spin_unlock(&fs_info->trans_lock);
5216
5217                 btrfs_put_transaction(t);
5218                 trace_btrfs_transaction_commit(fs_info);
5219                 spin_lock(&fs_info->trans_lock);
5220         }
5221         spin_unlock(&fs_info->trans_lock);
5222         btrfs_destroy_all_ordered_extents(fs_info);
5223         btrfs_destroy_delayed_inodes(fs_info);
5224         btrfs_assert_delayed_root_empty(fs_info);
5225         btrfs_destroy_all_delalloc_inodes(fs_info);
5226         btrfs_drop_all_logs(fs_info);
5227         mutex_unlock(&fs_info->transaction_kthread_mutex);
5228
5229         return 0;
5230 }
5231
5232 int btrfs_init_root_free_objectid(struct btrfs_root *root)
5233 {
5234         struct btrfs_path *path;
5235         int ret;
5236         struct extent_buffer *l;
5237         struct btrfs_key search_key;
5238         struct btrfs_key found_key;
5239         int slot;
5240
5241         path = btrfs_alloc_path();
5242         if (!path)
5243                 return -ENOMEM;
5244
5245         search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
5246         search_key.type = -1;
5247         search_key.offset = (u64)-1;
5248         ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
5249         if (ret < 0)
5250                 goto error;
5251         BUG_ON(ret == 0); /* Corruption */
5252         if (path->slots[0] > 0) {
5253                 slot = path->slots[0] - 1;
5254                 l = path->nodes[0];
5255                 btrfs_item_key_to_cpu(l, &found_key, slot);
5256                 root->free_objectid = max_t(u64, found_key.objectid + 1,
5257                                             BTRFS_FIRST_FREE_OBJECTID);
5258         } else {
5259                 root->free_objectid = BTRFS_FIRST_FREE_OBJECTID;
5260         }
5261         ret = 0;
5262 error:
5263         btrfs_free_path(path);
5264         return ret;
5265 }
5266
5267 int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
5268 {
5269         int ret;
5270         mutex_lock(&root->objectid_mutex);
5271
5272         if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
5273                 btrfs_warn(root->fs_info,
5274                            "the objectid of root %llu reaches its highest value",
5275                            root->root_key.objectid);
5276                 ret = -ENOSPC;
5277                 goto out;
5278         }
5279
5280         *objectid = root->free_objectid++;
5281         ret = 0;
5282 out:
5283         mutex_unlock(&root->objectid_mutex);
5284         return ret;
5285 }