Merge branches 'pm-cpuidle', 'pm-core' and 'pm-sleep'
[linux-2.6-microblaze.git] / fs / btrfs / disk-io.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007 Oracle.  All rights reserved.
4  */
5
6 #include <linux/fs.h>
7 #include <linux/blkdev.h>
8 #include <linux/radix-tree.h>
9 #include <linux/writeback.h>
10 #include <linux/workqueue.h>
11 #include <linux/kthread.h>
12 #include <linux/slab.h>
13 #include <linux/migrate.h>
14 #include <linux/ratelimit.h>
15 #include <linux/uuid.h>
16 #include <linux/semaphore.h>
17 #include <linux/error-injection.h>
18 #include <linux/crc32c.h>
19 #include <linux/sched/mm.h>
20 #include <asm/unaligned.h>
21 #include <crypto/hash.h>
22 #include "ctree.h"
23 #include "disk-io.h"
24 #include "transaction.h"
25 #include "btrfs_inode.h"
26 #include "bio.h"
27 #include "print-tree.h"
28 #include "locking.h"
29 #include "tree-log.h"
30 #include "free-space-cache.h"
31 #include "free-space-tree.h"
32 #include "check-integrity.h"
33 #include "rcu-string.h"
34 #include "dev-replace.h"
35 #include "raid56.h"
36 #include "sysfs.h"
37 #include "qgroup.h"
38 #include "compression.h"
39 #include "tree-checker.h"
40 #include "ref-verify.h"
41 #include "block-group.h"
42 #include "discard.h"
43 #include "space-info.h"
44 #include "zoned.h"
45 #include "subpage.h"
46 #include "fs.h"
47 #include "accessors.h"
48 #include "extent-tree.h"
49 #include "root-tree.h"
50 #include "defrag.h"
51 #include "uuid-tree.h"
52 #include "relocation.h"
53 #include "scrub.h"
54 #include "super.h"
55
/*
 * Mask built by OR-ing the header WRITTEN and RELOC flags with the superblock
 * ERROR, SEEDING, METADUMP and METADUMP_V2 flags.
 *
 * NOTE(review): the name suggests this is the set of superblock flags the
 * filesystem supports; the user of the mask is not visible here -- confirm.
 */
#define BTRFS_SUPER_FLAG_SUPP   (BTRFS_HEADER_FLAG_WRITTEN |\
                                 BTRFS_HEADER_FLAG_RELOC |\
                                 BTRFS_SUPER_FLAG_ERROR |\
                                 BTRFS_SUPER_FLAG_SEEDING |\
                                 BTRFS_SUPER_FLAG_METADUMP |\
                                 BTRFS_SUPER_FLAG_METADUMP_V2)
62
63 static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
64 static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
65                                       struct btrfs_fs_info *fs_info);
66 static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
67 static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
68                                         struct extent_io_tree *dirty_pages,
69                                         int mark);
70 static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
71                                        struct extent_io_tree *pinned_extents);
72 static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info);
73 static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info);
74
75 static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
76 {
77         if (fs_info->csum_shash)
78                 crypto_free_shash(fs_info->csum_shash);
79 }
80
/*
 * async submit bios are used to offload expensive checksumming
 * onto the worker threads.  They checksum file and metadata bios
 * just before they are sent down the IO stack.
 */
struct async_submit_bio {
        /* Inode the bio belongs to; also used to reach the owning fs_info. */
        struct btrfs_inode *inode;
        /* The bio that is checksummed and then submitted. */
        struct bio *bio;
        /* Selects which start helper run_one_async_start() calls. */
        enum btrfs_wq_submit_cmd submit_cmd;
        /* Mirror number passed through to btrfs_submit_bio(). */
        int mirror_num;

        /* Optional parameter used by direct io */
        u64 dio_file_offset;
        /* Work item queued on one of the fs_info worker queues. */
        struct btrfs_work work;
        /*
         * Error recorded by run_one_async_start(); when non-zero
         * run_one_async_done() ends the bio instead of submitting it.
         */
        blk_status_t status;
};
97
98 /*
99  * Compute the csum of a btree block and store the result to provided buffer.
100  */
101 static void csum_tree_block(struct extent_buffer *buf, u8 *result)
102 {
103         struct btrfs_fs_info *fs_info = buf->fs_info;
104         const int num_pages = num_extent_pages(buf);
105         const int first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
106         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
107         char *kaddr;
108         int i;
109
110         shash->tfm = fs_info->csum_shash;
111         crypto_shash_init(shash);
112         kaddr = page_address(buf->pages[0]) + offset_in_page(buf->start);
113         crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
114                             first_page_part - BTRFS_CSUM_SIZE);
115
116         for (i = 1; i < num_pages; i++) {
117                 kaddr = page_address(buf->pages[i]);
118                 crypto_shash_update(shash, kaddr, PAGE_SIZE);
119         }
120         memset(result, 0, BTRFS_CSUM_SIZE);
121         crypto_shash_final(shash, result);
122 }
123
124 /*
125  * we can't consider a given block up to date unless the transid of the
126  * block matches the transid in the parent node's pointer.  This is how we
127  * detect blocks that either didn't get written at all or got written
128  * in the wrong place.
129  */
130 static int verify_parent_transid(struct extent_io_tree *io_tree,
131                                  struct extent_buffer *eb, u64 parent_transid,
132                                  int atomic)
133 {
134         struct extent_state *cached_state = NULL;
135         int ret;
136
137         if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
138                 return 0;
139
140         if (atomic)
141                 return -EAGAIN;
142
143         lock_extent(io_tree, eb->start, eb->start + eb->len - 1, &cached_state);
144         if (extent_buffer_uptodate(eb) &&
145             btrfs_header_generation(eb) == parent_transid) {
146                 ret = 0;
147                 goto out;
148         }
149         btrfs_err_rl(eb->fs_info,
150 "parent transid verify failed on logical %llu mirror %u wanted %llu found %llu",
151                         eb->start, eb->read_mirror,
152                         parent_transid, btrfs_header_generation(eb));
153         ret = 1;
154         clear_extent_buffer_uptodate(eb);
155 out:
156         unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
157                       &cached_state);
158         return ret;
159 }
160
161 static bool btrfs_supported_super_csum(u16 csum_type)
162 {
163         switch (csum_type) {
164         case BTRFS_CSUM_TYPE_CRC32:
165         case BTRFS_CSUM_TYPE_XXHASH:
166         case BTRFS_CSUM_TYPE_SHA256:
167         case BTRFS_CSUM_TYPE_BLAKE2:
168                 return true;
169         default:
170                 return false;
171         }
172 }
173
174 /*
175  * Return 0 if the superblock checksum type matches the checksum value of that
176  * algorithm. Pass the raw disk superblock data.
177  */
178 int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
179                            const struct btrfs_super_block *disk_sb)
180 {
181         char result[BTRFS_CSUM_SIZE];
182         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
183
184         shash->tfm = fs_info->csum_shash;
185
186         /*
187          * The super_block structure does not span the whole
188          * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is
189          * filled with zeros and is included in the checksum.
190          */
191         crypto_shash_digest(shash, (const u8 *)disk_sb + BTRFS_CSUM_SIZE,
192                             BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result);
193
194         if (memcmp(disk_sb->csum, result, fs_info->csum_size))
195                 return 1;
196
197         return 0;
198 }
199
200 int btrfs_verify_level_key(struct extent_buffer *eb, int level,
201                            struct btrfs_key *first_key, u64 parent_transid)
202 {
203         struct btrfs_fs_info *fs_info = eb->fs_info;
204         int found_level;
205         struct btrfs_key found_key;
206         int ret;
207
208         found_level = btrfs_header_level(eb);
209         if (found_level != level) {
210                 WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
211                      KERN_ERR "BTRFS: tree level check failed\n");
212                 btrfs_err(fs_info,
213 "tree level mismatch detected, bytenr=%llu level expected=%u has=%u",
214                           eb->start, level, found_level);
215                 return -EIO;
216         }
217
218         if (!first_key)
219                 return 0;
220
221         /*
222          * For live tree block (new tree blocks in current transaction),
223          * we need proper lock context to avoid race, which is impossible here.
224          * So we only checks tree blocks which is read from disk, whose
225          * generation <= fs_info->last_trans_committed.
226          */
227         if (btrfs_header_generation(eb) > fs_info->last_trans_committed)
228                 return 0;
229
230         /* We have @first_key, so this @eb must have at least one item */
231         if (btrfs_header_nritems(eb) == 0) {
232                 btrfs_err(fs_info,
233                 "invalid tree nritems, bytenr=%llu nritems=0 expect >0",
234                           eb->start);
235                 WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
236                 return -EUCLEAN;
237         }
238
239         if (found_level)
240                 btrfs_node_key_to_cpu(eb, &found_key, 0);
241         else
242                 btrfs_item_key_to_cpu(eb, &found_key, 0);
243         ret = btrfs_comp_cpu_keys(first_key, &found_key);
244
245         if (ret) {
246                 WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
247                      KERN_ERR "BTRFS: tree first key check failed\n");
248                 btrfs_err(fs_info,
249 "tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
250                           eb->start, parent_transid, first_key->objectid,
251                           first_key->type, first_key->offset,
252                           found_key.objectid, found_key.type,
253                           found_key.offset);
254         }
255         return ret;
256 }
257
258 static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb,
259                                       int mirror_num)
260 {
261         struct btrfs_fs_info *fs_info = eb->fs_info;
262         u64 start = eb->start;
263         int i, num_pages = num_extent_pages(eb);
264         int ret = 0;
265
266         if (sb_rdonly(fs_info->sb))
267                 return -EROFS;
268
269         for (i = 0; i < num_pages; i++) {
270                 struct page *p = eb->pages[i];
271
272                 ret = btrfs_repair_io_failure(fs_info, 0, start, PAGE_SIZE,
273                                 start, p, start - page_offset(p), mirror_num);
274                 if (ret)
275                         break;
276                 start += PAGE_SIZE;
277         }
278
279         return ret;
280 }
281
282 /*
283  * helper to read a given tree block, doing retries as required when
284  * the checksums don't match and we have alternate mirrors to try.
285  *
286  * @check:              expected tree parentness check, see the comments of the
287  *                      structure for details.
288  */
289 int btrfs_read_extent_buffer(struct extent_buffer *eb,
290                              struct btrfs_tree_parent_check *check)
291 {
292         struct btrfs_fs_info *fs_info = eb->fs_info;
293         int failed = 0;
294         int ret;
295         int num_copies = 0;
296         int mirror_num = 0;
297         int failed_mirror = 0;
298
299         ASSERT(check);
300
301         while (1) {
302                 clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
303                 ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num, check);
304                 if (!ret)
305                         break;
306
307                 num_copies = btrfs_num_copies(fs_info,
308                                               eb->start, eb->len);
309                 if (num_copies == 1)
310                         break;
311
312                 if (!failed_mirror) {
313                         failed = 1;
314                         failed_mirror = eb->read_mirror;
315                 }
316
317                 mirror_num++;
318                 if (mirror_num == failed_mirror)
319                         mirror_num++;
320
321                 if (mirror_num > num_copies)
322                         break;
323         }
324
325         if (failed && !ret && failed_mirror)
326                 btrfs_repair_eb_io_failure(eb, failed_mirror);
327
328         return ret;
329 }
330
331 static int csum_one_extent_buffer(struct extent_buffer *eb)
332 {
333         struct btrfs_fs_info *fs_info = eb->fs_info;
334         u8 result[BTRFS_CSUM_SIZE];
335         int ret;
336
337         ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
338                                     offsetof(struct btrfs_header, fsid),
339                                     BTRFS_FSID_SIZE) == 0);
340         csum_tree_block(eb, result);
341
342         if (btrfs_header_level(eb))
343                 ret = btrfs_check_node(eb);
344         else
345                 ret = btrfs_check_leaf_full(eb);
346
347         if (ret < 0)
348                 goto error;
349
350         /*
351          * Also check the generation, the eb reached here must be newer than
352          * last committed. Or something seriously wrong happened.
353          */
354         if (unlikely(btrfs_header_generation(eb) <= fs_info->last_trans_committed)) {
355                 ret = -EUCLEAN;
356                 btrfs_err(fs_info,
357                         "block=%llu bad generation, have %llu expect > %llu",
358                           eb->start, btrfs_header_generation(eb),
359                           fs_info->last_trans_committed);
360                 goto error;
361         }
362         write_extent_buffer(eb, result, 0, fs_info->csum_size);
363
364         return 0;
365
366 error:
367         btrfs_print_tree(eb, 0);
368         btrfs_err(fs_info, "block=%llu write time tree block corruption detected",
369                   eb->start);
370         /*
371          * Be noisy if this is an extent buffer from a log tree. We don't abort
372          * a transaction in case there's a bad log tree extent buffer, we just
373          * fallback to a transaction commit. Still we want to know when there is
374          * a bad log tree extent buffer, as that may signal a bug somewhere.
375          */
376         WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG) ||
377                 btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID);
378         return ret;
379 }
380
381 /* Checksum all dirty extent buffers in one bio_vec */
382 static int csum_dirty_subpage_buffers(struct btrfs_fs_info *fs_info,
383                                       struct bio_vec *bvec)
384 {
385         struct page *page = bvec->bv_page;
386         u64 bvec_start = page_offset(page) + bvec->bv_offset;
387         u64 cur;
388         int ret = 0;
389
390         for (cur = bvec_start; cur < bvec_start + bvec->bv_len;
391              cur += fs_info->nodesize) {
392                 struct extent_buffer *eb;
393                 bool uptodate;
394
395                 eb = find_extent_buffer(fs_info, cur);
396                 uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur,
397                                                        fs_info->nodesize);
398
399                 /* A dirty eb shouldn't disappear from buffer_radix */
400                 if (WARN_ON(!eb))
401                         return -EUCLEAN;
402
403                 if (WARN_ON(cur != btrfs_header_bytenr(eb))) {
404                         free_extent_buffer(eb);
405                         return -EUCLEAN;
406                 }
407                 if (WARN_ON(!uptodate)) {
408                         free_extent_buffer(eb);
409                         return -EUCLEAN;
410                 }
411
412                 ret = csum_one_extent_buffer(eb);
413                 free_extent_buffer(eb);
414                 if (ret < 0)
415                         return ret;
416         }
417         return ret;
418 }
419
420 /*
421  * Checksum a dirty tree block before IO.  This has extra checks to make sure
422  * we only fill in the checksum field in the first page of a multi-page block.
423  * For subpage extent buffers we need bvec to also read the offset in the page.
424  */
425 static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec)
426 {
427         struct page *page = bvec->bv_page;
428         u64 start = page_offset(page);
429         u64 found_start;
430         struct extent_buffer *eb;
431
432         if (fs_info->nodesize < PAGE_SIZE)
433                 return csum_dirty_subpage_buffers(fs_info, bvec);
434
435         eb = (struct extent_buffer *)page->private;
436         if (page != eb->pages[0])
437                 return 0;
438
439         found_start = btrfs_header_bytenr(eb);
440
441         if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) {
442                 WARN_ON(found_start != 0);
443                 return 0;
444         }
445
446         /*
447          * Please do not consolidate these warnings into a single if.
448          * It is useful to know what went wrong.
449          */
450         if (WARN_ON(found_start != start))
451                 return -EUCLEAN;
452         if (WARN_ON(!PageUptodate(page)))
453                 return -EUCLEAN;
454
455         return csum_one_extent_buffer(eb);
456 }
457
458 static int check_tree_block_fsid(struct extent_buffer *eb)
459 {
460         struct btrfs_fs_info *fs_info = eb->fs_info;
461         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
462         u8 fsid[BTRFS_FSID_SIZE];
463         u8 *metadata_uuid;
464
465         read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid),
466                            BTRFS_FSID_SIZE);
467         /*
468          * Checking the incompat flag is only valid for the current fs. For
469          * seed devices it's forbidden to have their uuid changed so reading
470          * ->fsid in this case is fine
471          */
472         if (btrfs_fs_incompat(fs_info, METADATA_UUID))
473                 metadata_uuid = fs_devices->metadata_uuid;
474         else
475                 metadata_uuid = fs_devices->fsid;
476
477         if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE))
478                 return 0;
479
480         list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
481                 if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE))
482                         return 0;
483
484         return 1;
485 }
486
487 /* Do basic extent buffer checks at read time */
488 static int validate_extent_buffer(struct extent_buffer *eb,
489                                   struct btrfs_tree_parent_check *check)
490 {
491         struct btrfs_fs_info *fs_info = eb->fs_info;
492         u64 found_start;
493         const u32 csum_size = fs_info->csum_size;
494         u8 found_level;
495         u8 result[BTRFS_CSUM_SIZE];
496         const u8 *header_csum;
497         int ret = 0;
498
499         ASSERT(check);
500
501         found_start = btrfs_header_bytenr(eb);
502         if (found_start != eb->start) {
503                 btrfs_err_rl(fs_info,
504                         "bad tree block start, mirror %u want %llu have %llu",
505                              eb->read_mirror, eb->start, found_start);
506                 ret = -EIO;
507                 goto out;
508         }
509         if (check_tree_block_fsid(eb)) {
510                 btrfs_err_rl(fs_info, "bad fsid on logical %llu mirror %u",
511                              eb->start, eb->read_mirror);
512                 ret = -EIO;
513                 goto out;
514         }
515         found_level = btrfs_header_level(eb);
516         if (found_level >= BTRFS_MAX_LEVEL) {
517                 btrfs_err(fs_info,
518                         "bad tree block level, mirror %u level %d on logical %llu",
519                         eb->read_mirror, btrfs_header_level(eb), eb->start);
520                 ret = -EIO;
521                 goto out;
522         }
523
524         csum_tree_block(eb, result);
525         header_csum = page_address(eb->pages[0]) +
526                 get_eb_offset_in_page(eb, offsetof(struct btrfs_header, csum));
527
528         if (memcmp(result, header_csum, csum_size) != 0) {
529                 btrfs_warn_rl(fs_info,
530 "checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d",
531                               eb->start, eb->read_mirror,
532                               CSUM_FMT_VALUE(csum_size, header_csum),
533                               CSUM_FMT_VALUE(csum_size, result),
534                               btrfs_header_level(eb));
535                 ret = -EUCLEAN;
536                 goto out;
537         }
538
539         if (found_level != check->level) {
540                 btrfs_err(fs_info,
541                 "level verify failed on logical %llu mirror %u wanted %u found %u",
542                           eb->start, eb->read_mirror, check->level, found_level);
543                 ret = -EIO;
544                 goto out;
545         }
546         if (unlikely(check->transid &&
547                      btrfs_header_generation(eb) != check->transid)) {
548                 btrfs_err_rl(eb->fs_info,
549 "parent transid verify failed on logical %llu mirror %u wanted %llu found %llu",
550                                 eb->start, eb->read_mirror, check->transid,
551                                 btrfs_header_generation(eb));
552                 ret = -EIO;
553                 goto out;
554         }
555         if (check->has_first_key) {
556                 struct btrfs_key *expect_key = &check->first_key;
557                 struct btrfs_key found_key;
558
559                 if (found_level)
560                         btrfs_node_key_to_cpu(eb, &found_key, 0);
561                 else
562                         btrfs_item_key_to_cpu(eb, &found_key, 0);
563                 if (unlikely(btrfs_comp_cpu_keys(expect_key, &found_key))) {
564                         btrfs_err(fs_info,
565 "tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
566                                   eb->start, check->transid,
567                                   expect_key->objectid,
568                                   expect_key->type, expect_key->offset,
569                                   found_key.objectid, found_key.type,
570                                   found_key.offset);
571                         ret = -EUCLEAN;
572                         goto out;
573                 }
574         }
575         if (check->owner_root) {
576                 ret = btrfs_check_eb_owner(eb, check->owner_root);
577                 if (ret < 0)
578                         goto out;
579         }
580
581         /*
582          * If this is a leaf block and it is corrupt, set the corrupt bit so
583          * that we don't try and read the other copies of this block, just
584          * return -EIO.
585          */
586         if (found_level == 0 && btrfs_check_leaf_full(eb)) {
587                 set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
588                 ret = -EIO;
589         }
590
591         if (found_level > 0 && btrfs_check_node(eb))
592                 ret = -EIO;
593
594         if (!ret)
595                 set_extent_buffer_uptodate(eb);
596         else
597                 btrfs_err(fs_info,
598                 "read time tree block corruption detected on logical %llu mirror %u",
599                           eb->start, eb->read_mirror);
600 out:
601         return ret;
602 }
603
604 static int validate_subpage_buffer(struct page *page, u64 start, u64 end,
605                                    int mirror, struct btrfs_tree_parent_check *check)
606 {
607         struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
608         struct extent_buffer *eb;
609         bool reads_done;
610         int ret = 0;
611
612         ASSERT(check);
613
614         /*
615          * We don't allow bio merge for subpage metadata read, so we should
616          * only get one eb for each endio hook.
617          */
618         ASSERT(end == start + fs_info->nodesize - 1);
619         ASSERT(PagePrivate(page));
620
621         eb = find_extent_buffer(fs_info, start);
622         /*
623          * When we are reading one tree block, eb must have been inserted into
624          * the radix tree. If not, something is wrong.
625          */
626         ASSERT(eb);
627
628         reads_done = atomic_dec_and_test(&eb->io_pages);
629         /* Subpage read must finish in page read */
630         ASSERT(reads_done);
631
632         eb->read_mirror = mirror;
633         if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
634                 ret = -EIO;
635                 goto err;
636         }
637         ret = validate_extent_buffer(eb, check);
638         if (ret < 0)
639                 goto err;
640
641         set_extent_buffer_uptodate(eb);
642
643         free_extent_buffer(eb);
644         return ret;
645 err:
646         /*
647          * end_bio_extent_readpage decrements io_pages in case of error,
648          * make sure it has something to decrement.
649          */
650         atomic_inc(&eb->io_pages);
651         clear_extent_buffer_uptodate(eb);
652         free_extent_buffer(eb);
653         return ret;
654 }
655
656 int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
657                                    struct page *page, u64 start, u64 end,
658                                    int mirror)
659 {
660         struct extent_buffer *eb;
661         int ret = 0;
662         int reads_done;
663
664         ASSERT(page->private);
665
666         if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
667                 return validate_subpage_buffer(page, start, end, mirror,
668                                                &bbio->parent_check);
669
670         eb = (struct extent_buffer *)page->private;
671
672         /*
673          * The pending IO might have been the only thing that kept this buffer
674          * in memory.  Make sure we have a ref for all this other checks
675          */
676         atomic_inc(&eb->refs);
677
678         reads_done = atomic_dec_and_test(&eb->io_pages);
679         if (!reads_done)
680                 goto err;
681
682         eb->read_mirror = mirror;
683         if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
684                 ret = -EIO;
685                 goto err;
686         }
687         ret = validate_extent_buffer(eb, &bbio->parent_check);
688 err:
689         if (ret) {
690                 /*
691                  * our io error hook is going to dec the io pages
692                  * again, we have to make sure it has something
693                  * to decrement
694                  */
695                 atomic_inc(&eb->io_pages);
696                 clear_extent_buffer_uptodate(eb);
697         }
698         free_extent_buffer(eb);
699
700         return ret;
701 }
702
703 static void run_one_async_start(struct btrfs_work *work)
704 {
705         struct async_submit_bio *async;
706         blk_status_t ret;
707
708         async = container_of(work, struct  async_submit_bio, work);
709         switch (async->submit_cmd) {
710         case WQ_SUBMIT_METADATA:
711                 ret = btree_submit_bio_start(async->bio);
712                 break;
713         case WQ_SUBMIT_DATA:
714                 ret = btrfs_submit_bio_start(async->inode, async->bio);
715                 break;
716         case WQ_SUBMIT_DATA_DIO:
717                 ret = btrfs_submit_bio_start_direct_io(async->inode,
718                                 async->bio, async->dio_file_offset);
719                 break;
720         }
721         if (ret)
722                 async->status = ret;
723 }
724
725 /*
726  * In order to insert checksums into the metadata in large chunks, we wait
727  * until bio submission time.   All the pages in the bio are checksummed and
728  * sums are attached onto the ordered extent record.
729  *
730  * At IO completion time the csums attached on the ordered extent record are
731  * inserted into the tree.
732  */
733 static void run_one_async_done(struct btrfs_work *work)
734 {
735         struct async_submit_bio *async =
736                 container_of(work, struct  async_submit_bio, work);
737         struct btrfs_inode *inode = async->inode;
738         struct btrfs_bio *bbio = btrfs_bio(async->bio);
739
740         /* If an error occurred we just want to clean up the bio and move on */
741         if (async->status) {
742                 btrfs_bio_end_io(bbio, async->status);
743                 return;
744         }
745
746         /*
747          * All of the bios that pass through here are from async helpers.
748          * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context.
749          * This changes nothing when cgroups aren't in use.
750          */
751         async->bio->bi_opf |= REQ_CGROUP_PUNT;
752         btrfs_submit_bio(inode->root->fs_info, async->bio, async->mirror_num);
753 }
754
755 static void run_one_async_free(struct btrfs_work *work)
756 {
757         struct async_submit_bio *async;
758
759         async = container_of(work, struct  async_submit_bio, work);
760         kfree(async);
761 }
762
763 /*
764  * Submit bio to an async queue.
765  *
766  * Retrun:
767  * - true if the work has been succesfuly submitted
768  * - false in case of error
769  */
770 bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num,
771                          u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd)
772 {
773         struct btrfs_fs_info *fs_info = inode->root->fs_info;
774         struct async_submit_bio *async;
775
776         async = kmalloc(sizeof(*async), GFP_NOFS);
777         if (!async)
778                 return false;
779
780         async->inode = inode;
781         async->bio = bio;
782         async->mirror_num = mirror_num;
783         async->submit_cmd = cmd;
784
785         btrfs_init_work(&async->work, run_one_async_start, run_one_async_done,
786                         run_one_async_free);
787
788         async->dio_file_offset = dio_file_offset;
789
790         async->status = 0;
791
792         if (op_is_sync(bio->bi_opf))
793                 btrfs_queue_work(fs_info->hipri_workers, &async->work);
794         else
795                 btrfs_queue_work(fs_info->workers, &async->work);
796         return true;
797 }
798
799 static blk_status_t btree_csum_one_bio(struct bio *bio)
800 {
801         struct bio_vec *bvec;
802         struct btrfs_root *root;
803         int ret = 0;
804         struct bvec_iter_all iter_all;
805
806         ASSERT(!bio_flagged(bio, BIO_CLONED));
807         bio_for_each_segment_all(bvec, bio, iter_all) {
808                 root = BTRFS_I(bvec->bv_page->mapping->host)->root;
809                 ret = csum_dirty_buffer(root->fs_info, bvec);
810                 if (ret)
811                         break;
812         }
813
814         return errno_to_blk_status(ret);
815 }
816
/*
 * Start step for metadata writes: checksum the tree blocks of @bio.
 *
 * Called from run_one_async_start() for WQ_SUBMIT_METADATA.  Returns the
 * status of btree_csum_one_bio(); the bio is not submitted here.
 */
blk_status_t btree_submit_bio_start(struct bio *bio)
{
        /*
         * When we get here for a write we are already running in the async
         * submission context, so the checksumming can be done directly.
         */
        return btree_csum_one_bio(bio);
}
825
826 static bool should_async_write(struct btrfs_fs_info *fs_info,
827                              struct btrfs_inode *bi)
828 {
829         if (btrfs_is_zoned(fs_info))
830                 return false;
831         if (atomic_read(&bi->sync_writers))
832                 return false;
833         if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
834                 return false;
835         return true;
836 }
837
838 void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num)
839 {
840         struct btrfs_fs_info *fs_info = inode->root->fs_info;
841         struct btrfs_bio *bbio = btrfs_bio(bio);
842         blk_status_t ret;
843
844         bio->bi_opf |= REQ_META;
845         bbio->is_metadata = 1;
846
847         if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
848                 btrfs_submit_bio(fs_info, bio, mirror_num);
849                 return;
850         }
851
852         /*
853          * Kthread helpers are used to submit writes so that checksumming can
854          * happen in parallel across all CPUs.
855          */
856         if (should_async_write(fs_info, inode) &&
857             btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_METADATA))
858                 return;
859
860         ret = btree_csum_one_bio(bio);
861         if (ret) {
862                 btrfs_bio_end_io(bbio, ret);
863                 return;
864         }
865
866         btrfs_submit_bio(fs_info, bio, mirror_num);
867 }
868
#ifdef CONFIG_MIGRATION
/*
 * migrate_folio hook for the btree address space.
 *
 * Migration is refused with -EAGAIN in two cases:
 *  - @src is dirty: a btree page cannot be safely written from here because
 *    the locking hook has not been run;
 *  - @src still has private data that filemap_release_folio() could not
 *    drop (buffers may be managed in a filesystem specific way, so there
 *    must be none left).
 * Otherwise the generic migrate_folio() does the work.
 */
static int btree_migrate_folio(struct address_space *mapping,
		struct folio *dst, struct folio *src, enum migrate_mode mode)
{
	if (folio_test_dirty(src) ||
	    (folio_get_private(src) &&
	     !filemap_release_folio(src, GFP_KERNEL)))
		return -EAGAIN;

	return migrate_folio(mapping, dst, src, mode);
}
#else
#define btree_migrate_folio NULL
#endif
891
892 static int btree_writepages(struct address_space *mapping,
893                             struct writeback_control *wbc)
894 {
895         struct btrfs_fs_info *fs_info;
896         int ret;
897
898         if (wbc->sync_mode == WB_SYNC_NONE) {
899
900                 if (wbc->for_kupdate)
901                         return 0;
902
903                 fs_info = BTRFS_I(mapping->host)->root->fs_info;
904                 /* this is a bit racy, but that's ok */
905                 ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
906                                              BTRFS_DIRTY_METADATA_THRESH,
907                                              fs_info->dirty_metadata_batch);
908                 if (ret < 0)
909                         return 0;
910         }
911         return btree_write_cache_pages(mapping, wbc);
912 }
913
914 static bool btree_release_folio(struct folio *folio, gfp_t gfp_flags)
915 {
916         if (folio_test_writeback(folio) || folio_test_dirty(folio))
917                 return false;
918
919         return try_release_extent_buffer(&folio->page);
920 }
921
922 static void btree_invalidate_folio(struct folio *folio, size_t offset,
923                                  size_t length)
924 {
925         struct extent_io_tree *tree;
926         tree = &BTRFS_I(folio->mapping->host)->io_tree;
927         extent_invalidate_folio(tree, folio, offset);
928         btree_release_folio(folio, GFP_NOFS);
929         if (folio_get_private(folio)) {
930                 btrfs_warn(BTRFS_I(folio->mapping->host)->root->fs_info,
931                            "folio private not zero on folio %llu",
932                            (unsigned long long)folio_pos(folio));
933                 folio_detach_private(folio);
934         }
935 }
936
#ifdef DEBUG
/*
 * Debug-only dirty_folio hook: sanity check the extent buffer(s) backing
 * @folio before handing over to filemap_dirty_folio().
 *
 * When sectorsize equals PAGE_SIZE the folio private is the extent buffer
 * itself.  Otherwise the folio private is a btrfs_subpage and every extent
 * buffer whose first sector is flagged in its dirty bitmap is looked up and
 * checked.  Each checked buffer must be dirty, referenced and write locked.
 */
static bool btree_dirty_folio(struct address_space *mapping,
		struct folio *folio)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb);
	u64 folio_start = folio_pos(folio);
	struct btrfs_subpage *subpage;
	struct extent_buffer *eb;
	int bit = 0;

	if (fs_info->sectorsize == PAGE_SIZE) {
		eb = folio_get_private(folio);
		BUG_ON(!eb);
		BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
		BUG_ON(!atomic_read(&eb->refs));
		btrfs_assert_tree_write_locked(eb);
		return filemap_dirty_folio(mapping, folio);
	}

	subpage = folio_get_private(folio);
	ASSERT(subpage->dirty_bitmap);

	while (bit < BTRFS_SUBPAGE_BITMAP_SIZE) {
		u16 mask = (1 << bit);
		unsigned long flags;
		bool sector_dirty;
		u64 eb_start;

		/* Sample the dirty bit under the subpage lock. */
		spin_lock_irqsave(&subpage->lock, flags);
		sector_dirty = (mask & subpage->dirty_bitmap) != 0;
		spin_unlock_irqrestore(&subpage->lock, flags);

		if (!sector_dirty) {
			bit++;
			continue;
		}

		eb_start = folio_start + bit * fs_info->sectorsize;

		eb = find_extent_buffer(fs_info, eb_start);
		ASSERT(eb);
		ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
		ASSERT(atomic_read(&eb->refs));
		btrfs_assert_tree_write_locked(eb);
		free_extent_buffer(eb);

		/* Skip the remaining sectors of the buffer just checked. */
		bit += (fs_info->nodesize >> fs_info->sectorsize_bits);
	}

	return filemap_dirty_folio(mapping, folio);
}
#else
#define btree_dirty_folio filemap_dirty_folio
#endif
986
987 static const struct address_space_operations btree_aops = {
988         .writepages     = btree_writepages,
989         .release_folio  = btree_release_folio,
990         .invalidate_folio = btree_invalidate_folio,
991         .migrate_folio  = btree_migrate_folio,
992         .dirty_folio    = btree_dirty_folio,
993 };
994
995 struct extent_buffer *btrfs_find_create_tree_block(
996                                                 struct btrfs_fs_info *fs_info,
997                                                 u64 bytenr, u64 owner_root,
998                                                 int level)
999 {
1000         if (btrfs_is_testing(fs_info))
1001                 return alloc_test_extent_buffer(fs_info, bytenr);
1002         return alloc_extent_buffer(fs_info, bytenr, owner_root, level);
1003 }
1004
1005 /*
1006  * Read tree block at logical address @bytenr and do variant basic but critical
1007  * verification.
1008  *
1009  * @check:              expected tree parentness check, see comments of the
1010  *                      structure for details.
1011  */
1012 struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
1013                                       struct btrfs_tree_parent_check *check)
1014 {
1015         struct extent_buffer *buf = NULL;
1016         int ret;
1017
1018         ASSERT(check);
1019
1020         buf = btrfs_find_create_tree_block(fs_info, bytenr, check->owner_root,
1021                                            check->level);
1022         if (IS_ERR(buf))
1023                 return buf;
1024
1025         ret = btrfs_read_extent_buffer(buf, check);
1026         if (ret) {
1027                 free_extent_buffer_stale(buf);
1028                 return ERR_PTR(ret);
1029         }
1030         if (btrfs_check_eb_owner(buf, check->owner_root)) {
1031                 free_extent_buffer_stale(buf);
1032                 return ERR_PTR(-EUCLEAN);
1033         }
1034         return buf;
1035
1036 }
1037
1038 void btrfs_clean_tree_block(struct extent_buffer *buf)
1039 {
1040         struct btrfs_fs_info *fs_info = buf->fs_info;
1041         if (btrfs_header_generation(buf) ==
1042             fs_info->running_transaction->transid) {
1043                 btrfs_assert_tree_write_locked(buf);
1044
1045                 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
1046                         percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
1047                                                  -buf->len,
1048                                                  fs_info->dirty_metadata_batch);
1049                         clear_extent_buffer_dirty(buf);
1050                 }
1051         }
1052 }
1053
1054 static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
1055                          u64 objectid)
1056 {
1057         bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
1058
1059         memset(&root->root_key, 0, sizeof(root->root_key));
1060         memset(&root->root_item, 0, sizeof(root->root_item));
1061         memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
1062         root->fs_info = fs_info;
1063         root->root_key.objectid = objectid;
1064         root->node = NULL;
1065         root->commit_root = NULL;
1066         root->state = 0;
1067         RB_CLEAR_NODE(&root->rb_node);
1068
1069         root->last_trans = 0;
1070         root->free_objectid = 0;
1071         root->nr_delalloc_inodes = 0;
1072         root->nr_ordered_extents = 0;
1073         root->inode_tree = RB_ROOT;
1074         INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
1075
1076         btrfs_init_root_block_rsv(root);
1077
1078         INIT_LIST_HEAD(&root->dirty_list);
1079         INIT_LIST_HEAD(&root->root_list);
1080         INIT_LIST_HEAD(&root->delalloc_inodes);
1081         INIT_LIST_HEAD(&root->delalloc_root);
1082         INIT_LIST_HEAD(&root->ordered_extents);
1083         INIT_LIST_HEAD(&root->ordered_root);
1084         INIT_LIST_HEAD(&root->reloc_dirty_list);
1085         INIT_LIST_HEAD(&root->logged_list[0]);
1086         INIT_LIST_HEAD(&root->logged_list[1]);
1087         spin_lock_init(&root->inode_lock);
1088         spin_lock_init(&root->delalloc_lock);
1089         spin_lock_init(&root->ordered_extent_lock);
1090         spin_lock_init(&root->accounting_lock);
1091         spin_lock_init(&root->log_extents_lock[0]);
1092         spin_lock_init(&root->log_extents_lock[1]);
1093         spin_lock_init(&root->qgroup_meta_rsv_lock);
1094         mutex_init(&root->objectid_mutex);
1095         mutex_init(&root->log_mutex);
1096         mutex_init(&root->ordered_extent_mutex);
1097         mutex_init(&root->delalloc_mutex);
1098         init_waitqueue_head(&root->qgroup_flush_wait);
1099         init_waitqueue_head(&root->log_writer_wait);
1100         init_waitqueue_head(&root->log_commit_wait[0]);
1101         init_waitqueue_head(&root->log_commit_wait[1]);
1102         INIT_LIST_HEAD(&root->log_ctxs[0]);
1103         INIT_LIST_HEAD(&root->log_ctxs[1]);
1104         atomic_set(&root->log_commit[0], 0);
1105         atomic_set(&root->log_commit[1], 0);
1106         atomic_set(&root->log_writers, 0);
1107         atomic_set(&root->log_batch, 0);
1108         refcount_set(&root->refs, 1);
1109         atomic_set(&root->snapshot_force_cow, 0);
1110         atomic_set(&root->nr_swapfiles, 0);
1111         root->log_transid = 0;
1112         root->log_transid_committed = -1;
1113         root->last_log_commit = 0;
1114         root->anon_dev = 0;
1115         if (!dummy) {
1116                 extent_io_tree_init(fs_info, &root->dirty_log_pages,
1117                                     IO_TREE_ROOT_DIRTY_LOG_PAGES);
1118                 extent_io_tree_init(fs_info, &root->log_csum_range,
1119                                     IO_TREE_LOG_CSUM_RANGE);
1120         }
1121
1122         spin_lock_init(&root->root_item_lock);
1123         btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
1124 #ifdef CONFIG_BTRFS_DEBUG
1125         INIT_LIST_HEAD(&root->leak_list);
1126         spin_lock(&fs_info->fs_roots_radix_lock);
1127         list_add_tail(&root->leak_list, &fs_info->allocated_roots);
1128         spin_unlock(&fs_info->fs_roots_radix_lock);
1129 #endif
1130 }
1131
1132 static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
1133                                            u64 objectid, gfp_t flags)
1134 {
1135         struct btrfs_root *root = kzalloc(sizeof(*root), flags);
1136         if (root)
1137                 __setup_root(root, fs_info, objectid);
1138         return root;
1139 }
1140
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
 * Allocate a root for the self tests.  Should only be used by the testing
 * infrastructure.
 *
 * The root is created with BTRFS_ROOT_TREE_OBJECTID and its alloc_bytenr is
 * reset to 0.
 *
 * Return: the new root, ERR_PTR(-EINVAL) when @fs_info is NULL, or
 * ERR_PTR(-ENOMEM) when the allocation fails.
 */
struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *dummy;

	if (!fs_info)
		return ERR_PTR(-EINVAL);

	dummy = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, GFP_KERNEL);
	if (!dummy)
		return ERR_PTR(-ENOMEM);

	dummy->alloc_bytenr = 0;

	return dummy;
}
#endif
1160
1161 static int global_root_cmp(struct rb_node *a_node, const struct rb_node *b_node)
1162 {
1163         const struct btrfs_root *a = rb_entry(a_node, struct btrfs_root, rb_node);
1164         const struct btrfs_root *b = rb_entry(b_node, struct btrfs_root, rb_node);
1165
1166         return btrfs_comp_cpu_keys(&a->root_key, &b->root_key);
1167 }
1168
1169 static int global_root_key_cmp(const void *k, const struct rb_node *node)
1170 {
1171         const struct btrfs_key *key = k;
1172         const struct btrfs_root *root = rb_entry(node, struct btrfs_root, rb_node);
1173
1174         return btrfs_comp_cpu_keys(key, &root->root_key);
1175 }
1176
1177 int btrfs_global_root_insert(struct btrfs_root *root)
1178 {
1179         struct btrfs_fs_info *fs_info = root->fs_info;
1180         struct rb_node *tmp;
1181
1182         write_lock(&fs_info->global_root_lock);
1183         tmp = rb_find_add(&root->rb_node, &fs_info->global_root_tree, global_root_cmp);
1184         write_unlock(&fs_info->global_root_lock);
1185         ASSERT(!tmp);
1186
1187         return tmp ? -EEXIST : 0;
1188 }
1189
1190 void btrfs_global_root_delete(struct btrfs_root *root)
1191 {
1192         struct btrfs_fs_info *fs_info = root->fs_info;
1193
1194         write_lock(&fs_info->global_root_lock);
1195         rb_erase(&root->rb_node, &fs_info->global_root_tree);
1196         write_unlock(&fs_info->global_root_lock);
1197 }
1198
1199 struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info,
1200                                      struct btrfs_key *key)
1201 {
1202         struct rb_node *node;
1203         struct btrfs_root *root = NULL;
1204
1205         read_lock(&fs_info->global_root_lock);
1206         node = rb_find(key, &fs_info->global_root_tree, global_root_key_cmp);
1207         if (node)
1208                 root = container_of(node, struct btrfs_root, rb_node);
1209         read_unlock(&fs_info->global_root_lock);
1210
1211         return root;
1212 }
1213
1214 static u64 btrfs_global_root_id(struct btrfs_fs_info *fs_info, u64 bytenr)
1215 {
1216         struct btrfs_block_group *block_group;
1217         u64 ret;
1218
1219         if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
1220                 return 0;
1221
1222         if (bytenr)
1223                 block_group = btrfs_lookup_block_group(fs_info, bytenr);
1224         else
1225                 block_group = btrfs_lookup_first_block_group(fs_info, bytenr);
1226         ASSERT(block_group);
1227         if (!block_group)
1228                 return 0;
1229         ret = block_group->global_root_id;
1230         btrfs_put_block_group(block_group);
1231
1232         return ret;
1233 }
1234
1235 struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr)
1236 {
1237         struct btrfs_key key = {
1238                 .objectid = BTRFS_CSUM_TREE_OBJECTID,
1239                 .type = BTRFS_ROOT_ITEM_KEY,
1240                 .offset = btrfs_global_root_id(fs_info, bytenr),
1241         };
1242
1243         return btrfs_global_root(fs_info, &key);
1244 }
1245
1246 struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr)
1247 {
1248         struct btrfs_key key = {
1249                 .objectid = BTRFS_EXTENT_TREE_OBJECTID,
1250                 .type = BTRFS_ROOT_ITEM_KEY,
1251                 .offset = btrfs_global_root_id(fs_info, bytenr),
1252         };
1253
1254         return btrfs_global_root(fs_info, &key);
1255 }
1256
1257 struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info)
1258 {
1259         if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE))
1260                 return fs_info->block_group_root;
1261         return btrfs_extent_root(fs_info, 0);
1262 }
1263
1264 struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
1265                                      u64 objectid)
1266 {
1267         struct btrfs_fs_info *fs_info = trans->fs_info;
1268         struct extent_buffer *leaf;
1269         struct btrfs_root *tree_root = fs_info->tree_root;
1270         struct btrfs_root *root;
1271         struct btrfs_key key;
1272         unsigned int nofs_flag;
1273         int ret = 0;
1274
1275         /*
1276          * We're holding a transaction handle, so use a NOFS memory allocation
1277          * context to avoid deadlock if reclaim happens.
1278          */
1279         nofs_flag = memalloc_nofs_save();
1280         root = btrfs_alloc_root(fs_info, objectid, GFP_KERNEL);
1281         memalloc_nofs_restore(nofs_flag);
1282         if (!root)
1283                 return ERR_PTR(-ENOMEM);
1284
1285         root->root_key.objectid = objectid;
1286         root->root_key.type = BTRFS_ROOT_ITEM_KEY;
1287         root->root_key.offset = 0;
1288
1289         leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
1290                                       BTRFS_NESTING_NORMAL);
1291         if (IS_ERR(leaf)) {
1292                 ret = PTR_ERR(leaf);
1293                 leaf = NULL;
1294                 goto fail;
1295         }
1296
1297         root->node = leaf;
1298         btrfs_mark_buffer_dirty(leaf);
1299
1300         root->commit_root = btrfs_root_node(root);
1301         set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
1302
1303         btrfs_set_root_flags(&root->root_item, 0);
1304         btrfs_set_root_limit(&root->root_item, 0);
1305         btrfs_set_root_bytenr(&root->root_item, leaf->start);
1306         btrfs_set_root_generation(&root->root_item, trans->transid);
1307         btrfs_set_root_level(&root->root_item, 0);
1308         btrfs_set_root_refs(&root->root_item, 1);
1309         btrfs_set_root_used(&root->root_item, leaf->len);
1310         btrfs_set_root_last_snapshot(&root->root_item, 0);
1311         btrfs_set_root_dirid(&root->root_item, 0);
1312         if (is_fstree(objectid))
1313                 generate_random_guid(root->root_item.uuid);
1314         else
1315                 export_guid(root->root_item.uuid, &guid_null);
1316         btrfs_set_root_drop_level(&root->root_item, 0);
1317
1318         btrfs_tree_unlock(leaf);
1319
1320         key.objectid = objectid;
1321         key.type = BTRFS_ROOT_ITEM_KEY;
1322         key.offset = 0;
1323         ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item);
1324         if (ret)
1325                 goto fail;
1326
1327         return root;
1328
1329 fail:
1330         btrfs_put_root(root);
1331
1332         return ERR_PTR(ret);
1333 }
1334
1335 static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1336                                          struct btrfs_fs_info *fs_info)
1337 {
1338         struct btrfs_root *root;
1339
1340         root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS);
1341         if (!root)
1342                 return ERR_PTR(-ENOMEM);
1343
1344         root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
1345         root->root_key.type = BTRFS_ROOT_ITEM_KEY;
1346         root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
1347
1348         return root;
1349 }
1350
1351 int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
1352                               struct btrfs_root *root)
1353 {
1354         struct extent_buffer *leaf;
1355
1356         /*
1357          * DON'T set SHAREABLE bit for log trees.
1358          *
1359          * Log trees are not exposed to user space thus can't be snapshotted,
1360          * and they go away before a real commit is actually done.
1361          *
1362          * They do store pointers to file data extents, and those reference
1363          * counts still get updated (along with back refs to the log tree).
1364          */
1365
1366         leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
1367                         NULL, 0, 0, 0, BTRFS_NESTING_NORMAL);
1368         if (IS_ERR(leaf))
1369                 return PTR_ERR(leaf);
1370
1371         root->node = leaf;
1372
1373         btrfs_mark_buffer_dirty(root->node);
1374         btrfs_tree_unlock(root->node);
1375
1376         return 0;
1377 }
1378
1379 int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
1380                              struct btrfs_fs_info *fs_info)
1381 {
1382         struct btrfs_root *log_root;
1383
1384         log_root = alloc_log_tree(trans, fs_info);
1385         if (IS_ERR(log_root))
1386                 return PTR_ERR(log_root);
1387
1388         if (!btrfs_is_zoned(fs_info)) {
1389                 int ret = btrfs_alloc_log_tree_node(trans, log_root);
1390
1391                 if (ret) {
1392                         btrfs_put_root(log_root);
1393                         return ret;
1394                 }
1395         }
1396
1397         WARN_ON(fs_info->log_root_tree);
1398         fs_info->log_root_tree = log_root;
1399         return 0;
1400 }
1401
1402 int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
1403                        struct btrfs_root *root)
1404 {
1405         struct btrfs_fs_info *fs_info = root->fs_info;
1406         struct btrfs_root *log_root;
1407         struct btrfs_inode_item *inode_item;
1408         int ret;
1409
1410         log_root = alloc_log_tree(trans, fs_info);
1411         if (IS_ERR(log_root))
1412                 return PTR_ERR(log_root);
1413
1414         ret = btrfs_alloc_log_tree_node(trans, log_root);
1415         if (ret) {
1416                 btrfs_put_root(log_root);
1417                 return ret;
1418         }
1419
1420         log_root->last_trans = trans->transid;
1421         log_root->root_key.offset = root->root_key.objectid;
1422
1423         inode_item = &log_root->root_item.inode;
1424         btrfs_set_stack_inode_generation(inode_item, 1);
1425         btrfs_set_stack_inode_size(inode_item, 3);
1426         btrfs_set_stack_inode_nlink(inode_item, 1);
1427         btrfs_set_stack_inode_nbytes(inode_item,
1428                                      fs_info->nodesize);
1429         btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
1430
1431         btrfs_set_root_node(&log_root->root_item, log_root->node);
1432
1433         WARN_ON(root->log_root);
1434         root->log_root = log_root;
1435         root->log_transid = 0;
1436         root->log_transid_committed = -1;
1437         root->last_log_commit = 0;
1438         return 0;
1439 }
1440
1441 static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
1442                                               struct btrfs_path *path,
1443                                               struct btrfs_key *key)
1444 {
1445         struct btrfs_root *root;
1446         struct btrfs_tree_parent_check check = { 0 };
1447         struct btrfs_fs_info *fs_info = tree_root->fs_info;
1448         u64 generation;
1449         int ret;
1450         int level;
1451
1452         root = btrfs_alloc_root(fs_info, key->objectid, GFP_NOFS);
1453         if (!root)
1454                 return ERR_PTR(-ENOMEM);
1455
1456         ret = btrfs_find_root(tree_root, key, path,
1457                               &root->root_item, &root->root_key);
1458         if (ret) {
1459                 if (ret > 0)
1460                         ret = -ENOENT;
1461                 goto fail;
1462         }
1463
1464         generation = btrfs_root_generation(&root->root_item);
1465         level = btrfs_root_level(&root->root_item);
1466         check.level = level;
1467         check.transid = generation;
1468         check.owner_root = key->objectid;
1469         root->node = read_tree_block(fs_info, btrfs_root_bytenr(&root->root_item),
1470                                      &check);
1471         if (IS_ERR(root->node)) {
1472                 ret = PTR_ERR(root->node);
1473                 root->node = NULL;
1474                 goto fail;
1475         }
1476         if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
1477                 ret = -EIO;
1478                 goto fail;
1479         }
1480
1481         /*
1482          * For real fs, and not log/reloc trees, root owner must
1483          * match its root node owner
1484          */
1485         if (!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state) &&
1486             root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
1487             root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1488             root->root_key.objectid != btrfs_header_owner(root->node)) {
1489                 btrfs_crit(fs_info,
1490 "root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu",
1491                            root->root_key.objectid, root->node->start,
1492                            btrfs_header_owner(root->node),
1493                            root->root_key.objectid);
1494                 ret = -EUCLEAN;
1495                 goto fail;
1496         }
1497         root->commit_root = btrfs_root_node(root);
1498         return root;
1499 fail:
1500         btrfs_put_root(root);
1501         return ERR_PTR(ret);
1502 }
1503
1504 struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
1505                                         struct btrfs_key *key)
1506 {
1507         struct btrfs_root *root;
1508         struct btrfs_path *path;
1509
1510         path = btrfs_alloc_path();
1511         if (!path)
1512                 return ERR_PTR(-ENOMEM);
1513         root = read_tree_root_path(tree_root, path, key);
1514         btrfs_free_path(path);
1515
1516         return root;
1517 }
1518
1519 /*
1520  * Initialize subvolume root in-memory structure
1521  *
1522  * @anon_dev:   anonymous device to attach to the root, if zero, allocate new
1523  */
1524 static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
1525 {
1526         int ret;
1527         unsigned int nofs_flag;
1528
1529         /*
1530          * We might be called under a transaction (e.g. indirect backref
1531          * resolution) which could deadlock if it triggers memory reclaim
1532          */
1533         nofs_flag = memalloc_nofs_save();
1534         ret = btrfs_drew_lock_init(&root->snapshot_lock);
1535         memalloc_nofs_restore(nofs_flag);
1536         if (ret)
1537                 goto fail;
1538
1539         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
1540             !btrfs_is_data_reloc_root(root)) {
1541                 set_bit(BTRFS_ROOT_SHAREABLE, &root->state);
1542                 btrfs_check_and_init_root_item(&root->root_item);
1543         }
1544
1545         /*
1546          * Don't assign anonymous block device to roots that are not exposed to
1547          * userspace, the id pool is limited to 1M
1548          */
1549         if (is_fstree(root->root_key.objectid) &&
1550             btrfs_root_refs(&root->root_item) > 0) {
1551                 if (!anon_dev) {
1552                         ret = get_anon_bdev(&root->anon_dev);
1553                         if (ret)
1554                                 goto fail;
1555                 } else {
1556                         root->anon_dev = anon_dev;
1557                 }
1558         }
1559
1560         mutex_lock(&root->objectid_mutex);
1561         ret = btrfs_init_root_free_objectid(root);
1562         if (ret) {
1563                 mutex_unlock(&root->objectid_mutex);
1564                 goto fail;
1565         }
1566
1567         ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
1568
1569         mutex_unlock(&root->objectid_mutex);
1570
1571         return 0;
1572 fail:
1573         /* The caller is responsible to call btrfs_free_fs_root */
1574         return ret;
1575 }
1576
1577 static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
1578                                                u64 root_id)
1579 {
1580         struct btrfs_root *root;
1581
1582         spin_lock(&fs_info->fs_roots_radix_lock);
1583         root = radix_tree_lookup(&fs_info->fs_roots_radix,
1584                                  (unsigned long)root_id);
1585         if (root)
1586                 root = btrfs_grab_root(root);
1587         spin_unlock(&fs_info->fs_roots_radix_lock);
1588         return root;
1589 }
1590
1591 static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
1592                                                 u64 objectid)
1593 {
1594         struct btrfs_key key = {
1595                 .objectid = objectid,
1596                 .type = BTRFS_ROOT_ITEM_KEY,
1597                 .offset = 0,
1598         };
1599
1600         if (objectid == BTRFS_ROOT_TREE_OBJECTID)
1601                 return btrfs_grab_root(fs_info->tree_root);
1602         if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
1603                 return btrfs_grab_root(btrfs_global_root(fs_info, &key));
1604         if (objectid == BTRFS_CHUNK_TREE_OBJECTID)
1605                 return btrfs_grab_root(fs_info->chunk_root);
1606         if (objectid == BTRFS_DEV_TREE_OBJECTID)
1607                 return btrfs_grab_root(fs_info->dev_root);
1608         if (objectid == BTRFS_CSUM_TREE_OBJECTID)
1609                 return btrfs_grab_root(btrfs_global_root(fs_info, &key));
1610         if (objectid == BTRFS_QUOTA_TREE_OBJECTID)
1611                 return btrfs_grab_root(fs_info->quota_root) ?
1612                         fs_info->quota_root : ERR_PTR(-ENOENT);
1613         if (objectid == BTRFS_UUID_TREE_OBJECTID)
1614                 return btrfs_grab_root(fs_info->uuid_root) ?
1615                         fs_info->uuid_root : ERR_PTR(-ENOENT);
1616         if (objectid == BTRFS_BLOCK_GROUP_TREE_OBJECTID)
1617                 return btrfs_grab_root(fs_info->block_group_root) ?
1618                         fs_info->block_group_root : ERR_PTR(-ENOENT);
1619         if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) {
1620                 struct btrfs_root *root = btrfs_global_root(fs_info, &key);
1621
1622                 return btrfs_grab_root(root) ? root : ERR_PTR(-ENOENT);
1623         }
1624         return NULL;
1625 }
1626
1627 int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
1628                          struct btrfs_root *root)
1629 {
1630         int ret;
1631
1632         ret = radix_tree_preload(GFP_NOFS);
1633         if (ret)
1634                 return ret;
1635
1636         spin_lock(&fs_info->fs_roots_radix_lock);
1637         ret = radix_tree_insert(&fs_info->fs_roots_radix,
1638                                 (unsigned long)root->root_key.objectid,
1639                                 root);
1640         if (ret == 0) {
1641                 btrfs_grab_root(root);
1642                 set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
1643         }
1644         spin_unlock(&fs_info->fs_roots_radix_lock);
1645         radix_tree_preload_end();
1646
1647         return ret;
1648 }
1649
/*
 * Report and release every root still on fs_info->allocated_roots.
 *
 * Only does anything with CONFIG_BTRFS_DEBUG.  For each remaining root an
 * error is logged with its name and reference count, then references are
 * dropped until the last one is put.
 */
void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info)
{
#ifdef CONFIG_BTRFS_DEBUG
	struct list_head *leaked = &fs_info->allocated_roots;
	struct btrfs_root *root;

	while (!list_empty(leaked)) {
		char buf[BTRFS_ROOT_NAME_BUF_LEN];

		root = list_first_entry(leaked, struct btrfs_root, leak_list);
		btrfs_err(fs_info, "leaked root %s refcount %d",
			  btrfs_root_name(&root->root_key, buf),
			  refcount_read(&root->refs));

		/* Drop the surplus references, then the final one. */
		while (refcount_read(&root->refs) > 1)
			btrfs_put_root(root);
		btrfs_put_root(root);
	}
#endif
}
1669
1670 static void free_global_roots(struct btrfs_fs_info *fs_info)
1671 {
1672         struct btrfs_root *root;
1673         struct rb_node *node;
1674
1675         while ((node = rb_first_postorder(&fs_info->global_root_tree)) != NULL) {
1676                 root = rb_entry(node, struct btrfs_root, rb_node);
1677                 rb_erase(&root->rb_node, &fs_info->global_root_tree);
1678                 btrfs_put_root(root);
1679         }
1680 }
1681
1682 void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
1683 {
1684         percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
1685         percpu_counter_destroy(&fs_info->delalloc_bytes);
1686         percpu_counter_destroy(&fs_info->ordered_bytes);
1687         percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
1688         btrfs_free_csum_hash(fs_info);
1689         btrfs_free_stripe_hash_table(fs_info);
1690         btrfs_free_ref_cache(fs_info);
1691         kfree(fs_info->balance_ctl);
1692         kfree(fs_info->delayed_root);
1693         free_global_roots(fs_info);
1694         btrfs_put_root(fs_info->tree_root);
1695         btrfs_put_root(fs_info->chunk_root);
1696         btrfs_put_root(fs_info->dev_root);
1697         btrfs_put_root(fs_info->quota_root);
1698         btrfs_put_root(fs_info->uuid_root);
1699         btrfs_put_root(fs_info->fs_root);
1700         btrfs_put_root(fs_info->data_reloc_root);
1701         btrfs_put_root(fs_info->block_group_root);
1702         btrfs_check_leaked_roots(fs_info);
1703         btrfs_extent_buffer_leak_debug_check(fs_info);
1704         kfree(fs_info->super_copy);
1705         kfree(fs_info->super_for_commit);
1706         kfree(fs_info->subpage_info);
1707         kvfree(fs_info);
1708 }
1709
1710
1711 /*
1712  * Get an in-memory reference of a root structure.
1713  *
1714  * For essential trees like root/extent tree, we grab it from fs_info directly.
1715  * For subvolume trees, we check the cached filesystem roots first. If not
1716  * found, then read it from disk and add it to cached fs roots.
1717  *
1718  * Caller should release the root by calling btrfs_put_root() after the usage.
1719  *
1720  * NOTE: Reloc and log trees can't be read by this function as they share the
1721  *       same root objectid.
1722  *
1723  * @objectid:   root id
1724  * @anon_dev:   preallocated anonymous block device number for new roots,
1725  *              pass 0 for new allocation.
1726  * @check_ref:  whether to check root item references, If true, return -ENOENT
1727  *              for orphan roots
1728  */
1729 static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
1730                                              u64 objectid, dev_t anon_dev,
1731                                              bool check_ref)
1732 {
1733         struct btrfs_root *root;
1734         struct btrfs_path *path;
1735         struct btrfs_key key;
1736         int ret;
1737
1738         root = btrfs_get_global_root(fs_info, objectid);
1739         if (root)
1740                 return root;
1741 again:
1742         root = btrfs_lookup_fs_root(fs_info, objectid);
1743         if (root) {
1744                 /* Shouldn't get preallocated anon_dev for cached roots */
1745                 ASSERT(!anon_dev);
1746                 if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
1747                         btrfs_put_root(root);
1748                         return ERR_PTR(-ENOENT);
1749                 }
1750                 return root;
1751         }
1752
1753         key.objectid = objectid;
1754         key.type = BTRFS_ROOT_ITEM_KEY;
1755         key.offset = (u64)-1;
1756         root = btrfs_read_tree_root(fs_info->tree_root, &key);
1757         if (IS_ERR(root))
1758                 return root;
1759
1760         if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
1761                 ret = -ENOENT;
1762                 goto fail;
1763         }
1764
1765         ret = btrfs_init_fs_root(root, anon_dev);
1766         if (ret)
1767                 goto fail;
1768
1769         path = btrfs_alloc_path();
1770         if (!path) {
1771                 ret = -ENOMEM;
1772                 goto fail;
1773         }
1774         key.objectid = BTRFS_ORPHAN_OBJECTID;
1775         key.type = BTRFS_ORPHAN_ITEM_KEY;
1776         key.offset = objectid;
1777
1778         ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
1779         btrfs_free_path(path);
1780         if (ret < 0)
1781                 goto fail;
1782         if (ret == 0)
1783                 set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
1784
1785         ret = btrfs_insert_fs_root(fs_info, root);
1786         if (ret) {
1787                 if (ret == -EEXIST) {
1788                         btrfs_put_root(root);
1789                         goto again;
1790                 }
1791                 goto fail;
1792         }
1793         return root;
1794 fail:
1795         /*
1796          * If our caller provided us an anonymous device, then it's his
1797          * responsibility to free it in case we fail. So we have to set our
1798          * root's anon_dev to 0 to avoid a double free, once by btrfs_put_root()
1799          * and once again by our caller.
1800          */
1801         if (anon_dev)
1802                 root->anon_dev = 0;
1803         btrfs_put_root(root);
1804         return ERR_PTR(ret);
1805 }
1806
1807 /*
1808  * Get in-memory reference of a root structure
1809  *
1810  * @objectid:   tree objectid
1811  * @check_ref:  if set, verify that the tree exists and the item has at least
1812  *              one reference
1813  */
1814 struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
1815                                      u64 objectid, bool check_ref)
1816 {
1817         return btrfs_get_root_ref(fs_info, objectid, 0, check_ref);
1818 }
1819
1820 /*
1821  * Get in-memory reference of a root structure, created as new, optionally pass
1822  * the anonymous block device id
1823  *
1824  * @objectid:   tree objectid
1825  * @anon_dev:   if zero, allocate a new anonymous block device or use the
1826  *              parameter value
1827  */
1828 struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
1829                                          u64 objectid, dev_t anon_dev)
1830 {
1831         return btrfs_get_root_ref(fs_info, objectid, anon_dev, true);
1832 }
1833
1834 /*
1835  * btrfs_get_fs_root_commit_root - return a root for the given objectid
1836  * @fs_info:    the fs_info
1837  * @objectid:   the objectid we need to lookup
1838  *
1839  * This is exclusively used for backref walking, and exists specifically because
1840  * of how qgroups does lookups.  Qgroups will do a backref lookup at delayed ref
1841  * creation time, which means we may have to read the tree_root in order to look
1842  * up a fs root that is not in memory.  If the root is not in memory we will
1843  * read the tree root commit root and look up the fs root from there.  This is a
1844  * temporary root, it will not be inserted into the radix tree as it doesn't
1845  * have the most uptodate information, it'll simply be discarded once the
1846  * backref code is finished using the root.
1847  */
1848 struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
1849                                                  struct btrfs_path *path,
1850                                                  u64 objectid)
1851 {
1852         struct btrfs_root *root;
1853         struct btrfs_key key;
1854
1855         ASSERT(path->search_commit_root && path->skip_locking);
1856
1857         /*
1858          * This can return -ENOENT if we ask for a root that doesn't exist, but
1859          * since this is called via the backref walking code we won't be looking
1860          * up a root that doesn't exist, unless there's corruption.  So if root
1861          * != NULL just return it.
1862          */
1863         root = btrfs_get_global_root(fs_info, objectid);
1864         if (root)
1865                 return root;
1866
1867         root = btrfs_lookup_fs_root(fs_info, objectid);
1868         if (root)
1869                 return root;
1870
1871         key.objectid = objectid;
1872         key.type = BTRFS_ROOT_ITEM_KEY;
1873         key.offset = (u64)-1;
1874         root = read_tree_root_path(fs_info->tree_root, path, &key);
1875         btrfs_release_path(path);
1876
1877         return root;
1878 }
1879
1880 static int cleaner_kthread(void *arg)
1881 {
1882         struct btrfs_fs_info *fs_info = arg;
1883         int again;
1884
1885         while (1) {
1886                 again = 0;
1887
1888                 set_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
1889
1890                 /* Make the cleaner go to sleep early. */
1891                 if (btrfs_need_cleaner_sleep(fs_info))
1892                         goto sleep;
1893
1894                 /*
1895                  * Do not do anything if we might cause open_ctree() to block
1896                  * before we have finished mounting the filesystem.
1897                  */
1898                 if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
1899                         goto sleep;
1900
1901                 if (!mutex_trylock(&fs_info->cleaner_mutex))
1902                         goto sleep;
1903
1904                 /*
1905                  * Avoid the problem that we change the status of the fs
1906                  * during the above check and trylock.
1907                  */
1908                 if (btrfs_need_cleaner_sleep(fs_info)) {
1909                         mutex_unlock(&fs_info->cleaner_mutex);
1910                         goto sleep;
1911                 }
1912
1913                 btrfs_run_delayed_iputs(fs_info);
1914
1915                 again = btrfs_clean_one_deleted_snapshot(fs_info);
1916                 mutex_unlock(&fs_info->cleaner_mutex);
1917
1918                 /*
1919                  * The defragger has dealt with the R/O remount and umount,
1920                  * needn't do anything special here.
1921                  */
1922                 btrfs_run_defrag_inodes(fs_info);
1923
1924                 /*
1925                  * Acquires fs_info->reclaim_bgs_lock to avoid racing
1926                  * with relocation (btrfs_relocate_chunk) and relocation
1927                  * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group)
1928                  * after acquiring fs_info->reclaim_bgs_lock. So we
1929                  * can't hold, nor need to, fs_info->cleaner_mutex when deleting
1930                  * unused block groups.
1931                  */
1932                 btrfs_delete_unused_bgs(fs_info);
1933
1934                 /*
1935                  * Reclaim block groups in the reclaim_bgs list after we deleted
1936                  * all unused block_groups. This possibly gives us some more free
1937                  * space.
1938                  */
1939                 btrfs_reclaim_bgs(fs_info);
1940 sleep:
1941                 clear_and_wake_up_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
1942                 if (kthread_should_park())
1943                         kthread_parkme();
1944                 if (kthread_should_stop())
1945                         return 0;
1946                 if (!again) {
1947                         set_current_state(TASK_INTERRUPTIBLE);
1948                         schedule();
1949                         __set_current_state(TASK_RUNNING);
1950                 }
1951         }
1952 }
1953
1954 static int transaction_kthread(void *arg)
1955 {
1956         struct btrfs_root *root = arg;
1957         struct btrfs_fs_info *fs_info = root->fs_info;
1958         struct btrfs_trans_handle *trans;
1959         struct btrfs_transaction *cur;
1960         u64 transid;
1961         time64_t delta;
1962         unsigned long delay;
1963         bool cannot_commit;
1964
1965         do {
1966                 cannot_commit = false;
1967                 delay = msecs_to_jiffies(fs_info->commit_interval * 1000);
1968                 mutex_lock(&fs_info->transaction_kthread_mutex);
1969
1970                 spin_lock(&fs_info->trans_lock);
1971                 cur = fs_info->running_transaction;
1972                 if (!cur) {
1973                         spin_unlock(&fs_info->trans_lock);
1974                         goto sleep;
1975                 }
1976
1977                 delta = ktime_get_seconds() - cur->start_time;
1978                 if (!test_and_clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags) &&
1979                     cur->state < TRANS_STATE_COMMIT_START &&
1980                     delta < fs_info->commit_interval) {
1981                         spin_unlock(&fs_info->trans_lock);
1982                         delay -= msecs_to_jiffies((delta - 1) * 1000);
1983                         delay = min(delay,
1984                                     msecs_to_jiffies(fs_info->commit_interval * 1000));
1985                         goto sleep;
1986                 }
1987                 transid = cur->transid;
1988                 spin_unlock(&fs_info->trans_lock);
1989
1990                 /* If the file system is aborted, this will always fail. */
1991                 trans = btrfs_attach_transaction(root);
1992                 if (IS_ERR(trans)) {
1993                         if (PTR_ERR(trans) != -ENOENT)
1994                                 cannot_commit = true;
1995                         goto sleep;
1996                 }
1997                 if (transid == trans->transid) {
1998                         btrfs_commit_transaction(trans);
1999                 } else {
2000                         btrfs_end_transaction(trans);
2001                 }
2002 sleep:
2003                 wake_up_process(fs_info->cleaner_kthread);
2004                 mutex_unlock(&fs_info->transaction_kthread_mutex);
2005
2006                 if (BTRFS_FS_ERROR(fs_info))
2007                         btrfs_cleanup_transaction(fs_info);
2008                 if (!kthread_should_stop() &&
2009                                 (!btrfs_transaction_blocked(fs_info) ||
2010                                  cannot_commit))
2011                         schedule_timeout_interruptible(delay);
2012         } while (!kthread_should_stop());
2013         return 0;
2014 }
2015
2016 /*
2017  * This will find the highest generation in the array of root backups.  The
2018  * index of the highest array is returned, or -EINVAL if we can't find
2019  * anything.
2020  *
2021  * We check to make sure the array is valid by comparing the
2022  * generation of the latest  root in the array with the generation
2023  * in the super block.  If they don't match we pitch it.
2024  */
2025 static int find_newest_super_backup(struct btrfs_fs_info *info)
2026 {
2027         const u64 newest_gen = btrfs_super_generation(info->super_copy);
2028         u64 cur;
2029         struct btrfs_root_backup *root_backup;
2030         int i;
2031
2032         for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
2033                 root_backup = info->super_copy->super_roots + i;
2034                 cur = btrfs_backup_tree_root_gen(root_backup);
2035                 if (cur == newest_gen)
2036                         return i;
2037         }
2038
2039         return -EINVAL;
2040 }
2041
2042 /*
2043  * copy all the root pointers into the super backup array.
2044  * this will bump the backup pointer by one when it is
2045  * done
2046  */
2047 static void backup_super_roots(struct btrfs_fs_info *info)
2048 {
2049         const int next_backup = info->backup_root_index;
2050         struct btrfs_root_backup *root_backup;
2051
2052         root_backup = info->super_for_commit->super_roots + next_backup;
2053
2054         /*
2055          * make sure all of our padding and empty slots get zero filled
2056          * regardless of which ones we use today
2057          */
2058         memset(root_backup, 0, sizeof(*root_backup));
2059
2060         info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
2061
2062         btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
2063         btrfs_set_backup_tree_root_gen(root_backup,
2064                                btrfs_header_generation(info->tree_root->node));
2065
2066         btrfs_set_backup_tree_root_level(root_backup,
2067                                btrfs_header_level(info->tree_root->node));
2068
2069         btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
2070         btrfs_set_backup_chunk_root_gen(root_backup,
2071                                btrfs_header_generation(info->chunk_root->node));
2072         btrfs_set_backup_chunk_root_level(root_backup,
2073                                btrfs_header_level(info->chunk_root->node));
2074
2075         if (!btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE)) {
2076                 struct btrfs_root *extent_root = btrfs_extent_root(info, 0);
2077                 struct btrfs_root *csum_root = btrfs_csum_root(info, 0);
2078
2079                 btrfs_set_backup_extent_root(root_backup,
2080                                              extent_root->node->start);
2081                 btrfs_set_backup_extent_root_gen(root_backup,
2082                                 btrfs_header_generation(extent_root->node));
2083                 btrfs_set_backup_extent_root_level(root_backup,
2084                                         btrfs_header_level(extent_root->node));
2085
2086                 btrfs_set_backup_csum_root(root_backup, csum_root->node->start);
2087                 btrfs_set_backup_csum_root_gen(root_backup,
2088                                                btrfs_header_generation(csum_root->node));
2089                 btrfs_set_backup_csum_root_level(root_backup,
2090                                                  btrfs_header_level(csum_root->node));
2091         }
2092
2093         /*
2094          * we might commit during log recovery, which happens before we set
2095          * the fs_root.  Make sure it is valid before we fill it in.
2096          */
2097         if (info->fs_root && info->fs_root->node) {
2098                 btrfs_set_backup_fs_root(root_backup,
2099                                          info->fs_root->node->start);
2100                 btrfs_set_backup_fs_root_gen(root_backup,
2101                                btrfs_header_generation(info->fs_root->node));
2102                 btrfs_set_backup_fs_root_level(root_backup,
2103                                btrfs_header_level(info->fs_root->node));
2104         }
2105
2106         btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
2107         btrfs_set_backup_dev_root_gen(root_backup,
2108                                btrfs_header_generation(info->dev_root->node));
2109         btrfs_set_backup_dev_root_level(root_backup,
2110                                        btrfs_header_level(info->dev_root->node));
2111
2112         btrfs_set_backup_total_bytes(root_backup,
2113                              btrfs_super_total_bytes(info->super_copy));
2114         btrfs_set_backup_bytes_used(root_backup,
2115                              btrfs_super_bytes_used(info->super_copy));
2116         btrfs_set_backup_num_devices(root_backup,
2117                              btrfs_super_num_devices(info->super_copy));
2118
2119         /*
2120          * if we don't copy this out to the super_copy, it won't get remembered
2121          * for the next commit
2122          */
2123         memcpy(&info->super_copy->super_roots,
2124                &info->super_for_commit->super_roots,
2125                sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
2126 }
2127
2128 /*
2129  * read_backup_root - Reads a backup root based on the passed priority. Prio 0
2130  * is the newest, prio 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots
2131  *
2132  * fs_info - filesystem whose backup roots need to be read
2133  * priority - priority of backup root required
2134  *
2135  * Returns backup root index on success and -EINVAL otherwise.
2136  */
2137 static int read_backup_root(struct btrfs_fs_info *fs_info, u8 priority)
2138 {
2139         int backup_index = find_newest_super_backup(fs_info);
2140         struct btrfs_super_block *super = fs_info->super_copy;
2141         struct btrfs_root_backup *root_backup;
2142
2143         if (priority < BTRFS_NUM_BACKUP_ROOTS && backup_index >= 0) {
2144                 if (priority == 0)
2145                         return backup_index;
2146
2147                 backup_index = backup_index + BTRFS_NUM_BACKUP_ROOTS - priority;
2148                 backup_index %= BTRFS_NUM_BACKUP_ROOTS;
2149         } else {
2150                 return -EINVAL;
2151         }
2152
2153         root_backup = super->super_roots + backup_index;
2154
2155         btrfs_set_super_generation(super,
2156                                    btrfs_backup_tree_root_gen(root_backup));
2157         btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
2158         btrfs_set_super_root_level(super,
2159                                    btrfs_backup_tree_root_level(root_backup));
2160         btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
2161
2162         /*
2163          * Fixme: the total bytes and num_devices need to match or we should
2164          * need a fsck
2165          */
2166         btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
2167         btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
2168
2169         return backup_index;
2170 }
2171
2172 /* helper to cleanup workers */
2173 static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
2174 {
2175         btrfs_destroy_workqueue(fs_info->fixup_workers);
2176         btrfs_destroy_workqueue(fs_info->delalloc_workers);
2177         btrfs_destroy_workqueue(fs_info->hipri_workers);
2178         btrfs_destroy_workqueue(fs_info->workers);
2179         if (fs_info->endio_workers)
2180                 destroy_workqueue(fs_info->endio_workers);
2181         if (fs_info->rmw_workers)
2182                 destroy_workqueue(fs_info->rmw_workers);
2183         if (fs_info->compressed_write_workers)
2184                 destroy_workqueue(fs_info->compressed_write_workers);
2185         btrfs_destroy_workqueue(fs_info->endio_write_workers);
2186         btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
2187         btrfs_destroy_workqueue(fs_info->delayed_workers);
2188         btrfs_destroy_workqueue(fs_info->caching_workers);
2189         btrfs_destroy_workqueue(fs_info->flush_workers);
2190         btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
2191         if (fs_info->discard_ctl.discard_workers)
2192                 destroy_workqueue(fs_info->discard_ctl.discard_workers);
2193         /*
2194          * Now that all other work queues are destroyed, we can safely destroy
2195          * the queues used for metadata I/O, since tasks from those other work
2196          * queues can do metadata I/O operations.
2197          */
2198         if (fs_info->endio_meta_workers)
2199                 destroy_workqueue(fs_info->endio_meta_workers);
2200 }
2201
2202 static void free_root_extent_buffers(struct btrfs_root *root)
2203 {
2204         if (root) {
2205                 free_extent_buffer(root->node);
2206                 free_extent_buffer(root->commit_root);
2207                 root->node = NULL;
2208                 root->commit_root = NULL;
2209         }
2210 }
2211
2212 static void free_global_root_pointers(struct btrfs_fs_info *fs_info)
2213 {
2214         struct btrfs_root *root, *tmp;
2215
2216         rbtree_postorder_for_each_entry_safe(root, tmp,
2217                                              &fs_info->global_root_tree,
2218                                              rb_node)
2219                 free_root_extent_buffers(root);
2220 }
2221
2222 /* helper to cleanup tree roots */
2223 static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
2224 {
2225         free_root_extent_buffers(info->tree_root);
2226
2227         free_global_root_pointers(info);
2228         free_root_extent_buffers(info->dev_root);
2229         free_root_extent_buffers(info->quota_root);
2230         free_root_extent_buffers(info->uuid_root);
2231         free_root_extent_buffers(info->fs_root);
2232         free_root_extent_buffers(info->data_reloc_root);
2233         free_root_extent_buffers(info->block_group_root);
2234         if (free_chunk_root)
2235                 free_root_extent_buffers(info->chunk_root);
2236 }
2237
2238 void btrfs_put_root(struct btrfs_root *root)
2239 {
2240         if (!root)
2241                 return;
2242
2243         if (refcount_dec_and_test(&root->refs)) {
2244                 WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
2245                 WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state));
2246                 if (root->anon_dev)
2247                         free_anon_bdev(root->anon_dev);
2248                 btrfs_drew_lock_destroy(&root->snapshot_lock);
2249                 free_root_extent_buffers(root);
2250 #ifdef CONFIG_BTRFS_DEBUG
2251                 spin_lock(&root->fs_info->fs_roots_radix_lock);
2252                 list_del_init(&root->leak_list);
2253                 spin_unlock(&root->fs_info->fs_roots_radix_lock);
2254 #endif
2255                 kfree(root);
2256         }
2257 }
2258
2259 void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
2260 {
2261         int ret;
2262         struct btrfs_root *gang[8];
2263         int i;
2264
2265         while (!list_empty(&fs_info->dead_roots)) {
2266                 gang[0] = list_entry(fs_info->dead_roots.next,
2267                                      struct btrfs_root, root_list);
2268                 list_del(&gang[0]->root_list);
2269
2270                 if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state))
2271                         btrfs_drop_and_free_fs_root(fs_info, gang[0]);
2272                 btrfs_put_root(gang[0]);
2273         }
2274
2275         while (1) {
2276                 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
2277                                              (void **)gang, 0,
2278                                              ARRAY_SIZE(gang));
2279                 if (!ret)
2280                         break;
2281                 for (i = 0; i < ret; i++)
2282                         btrfs_drop_and_free_fs_root(fs_info, gang[i]);
2283         }
2284 }
2285
2286 static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
2287 {
2288         mutex_init(&fs_info->scrub_lock);
2289         atomic_set(&fs_info->scrubs_running, 0);
2290         atomic_set(&fs_info->scrub_pause_req, 0);
2291         atomic_set(&fs_info->scrubs_paused, 0);
2292         atomic_set(&fs_info->scrub_cancel_req, 0);
2293         init_waitqueue_head(&fs_info->scrub_pause_wait);
2294         refcount_set(&fs_info->scrub_workers_refcnt, 0);
2295 }
2296
2297 static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
2298 {
2299         spin_lock_init(&fs_info->balance_lock);
2300         mutex_init(&fs_info->balance_mutex);
2301         atomic_set(&fs_info->balance_pause_req, 0);
2302         atomic_set(&fs_info->balance_cancel_req, 0);
2303         fs_info->balance_ctl = NULL;
2304         init_waitqueue_head(&fs_info->balance_wait_q);
2305         atomic_set(&fs_info->reloc_cancel_req, 0);
2306 }
2307
2308 static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
2309 {
2310         struct inode *inode = fs_info->btree_inode;
2311         unsigned long hash = btrfs_inode_hash(BTRFS_BTREE_INODE_OBJECTID,
2312                                               fs_info->tree_root);
2313
2314         inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
2315         set_nlink(inode, 1);
2316         /*
2317          * we set the i_size on the btree inode to the max possible int.
2318          * the real end of the address space is determined by all of
2319          * the devices in the system
2320          */
2321         inode->i_size = OFFSET_MAX;
2322         inode->i_mapping->a_ops = &btree_aops;
2323
2324         RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
2325         extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
2326                             IO_TREE_BTREE_INODE_IO);
2327         extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
2328
2329         BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
2330         BTRFS_I(inode)->location.objectid = BTRFS_BTREE_INODE_OBJECTID;
2331         BTRFS_I(inode)->location.type = 0;
2332         BTRFS_I(inode)->location.offset = 0;
2333         set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
2334         __insert_inode_hash(inode, hash);
2335 }
2336
2337 static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
2338 {
2339         mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
2340         init_rwsem(&fs_info->dev_replace.rwsem);
2341         init_waitqueue_head(&fs_info->dev_replace.replace_wait);
2342 }
2343
2344 static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
2345 {
2346         spin_lock_init(&fs_info->qgroup_lock);
2347         mutex_init(&fs_info->qgroup_ioctl_lock);
2348         fs_info->qgroup_tree = RB_ROOT;
2349         INIT_LIST_HEAD(&fs_info->dirty_qgroups);
2350         fs_info->qgroup_seq = 1;
2351         fs_info->qgroup_ulist = NULL;
2352         fs_info->qgroup_rescan_running = false;
2353         fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL;
2354         mutex_init(&fs_info->qgroup_rescan_lock);
2355 }
2356
2357 static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
2358 {
2359         u32 max_active = fs_info->thread_pool_size;
2360         unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
2361
2362         fs_info->workers =
2363                 btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16);
2364         fs_info->hipri_workers =
2365                 btrfs_alloc_workqueue(fs_info, "worker-high",
2366                                       flags | WQ_HIGHPRI, max_active, 16);
2367
2368         fs_info->delalloc_workers =
2369                 btrfs_alloc_workqueue(fs_info, "delalloc",
2370                                       flags, max_active, 2);
2371
2372         fs_info->flush_workers =
2373                 btrfs_alloc_workqueue(fs_info, "flush_delalloc",
2374                                       flags, max_active, 0);
2375
2376         fs_info->caching_workers =
2377                 btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0);
2378
2379         fs_info->fixup_workers =
2380                 btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0);
2381
2382         fs_info->endio_workers =
2383                 alloc_workqueue("btrfs-endio", flags, max_active);
2384         fs_info->endio_meta_workers =
2385                 alloc_workqueue("btrfs-endio-meta", flags, max_active);
2386         fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active);
2387         fs_info->endio_write_workers =
2388                 btrfs_alloc_workqueue(fs_info, "endio-write", flags,
2389                                       max_active, 2);
2390         fs_info->compressed_write_workers =
2391                 alloc_workqueue("btrfs-compressed-write", flags, max_active);
2392         fs_info->endio_freespace_worker =
2393                 btrfs_alloc_workqueue(fs_info, "freespace-write", flags,
2394                                       max_active, 0);
2395         fs_info->delayed_workers =
2396                 btrfs_alloc_workqueue(fs_info, "delayed-meta", flags,
2397                                       max_active, 0);
2398         fs_info->qgroup_rescan_workers =
2399                 btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0);
2400         fs_info->discard_ctl.discard_workers =
2401                 alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1);
2402
2403         if (!(fs_info->workers && fs_info->hipri_workers &&
2404               fs_info->delalloc_workers && fs_info->flush_workers &&
2405               fs_info->endio_workers && fs_info->endio_meta_workers &&
2406               fs_info->compressed_write_workers &&
2407               fs_info->endio_write_workers &&
2408               fs_info->endio_freespace_worker && fs_info->rmw_workers &&
2409               fs_info->caching_workers && fs_info->fixup_workers &&
2410               fs_info->delayed_workers && fs_info->qgroup_rescan_workers &&
2411               fs_info->discard_ctl.discard_workers)) {
2412                 return -ENOMEM;
2413         }
2414
2415         return 0;
2416 }
2417
2418 static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
2419 {
2420         struct crypto_shash *csum_shash;
2421         const char *csum_driver = btrfs_super_csum_driver(csum_type);
2422
2423         csum_shash = crypto_alloc_shash(csum_driver, 0, 0);
2424
2425         if (IS_ERR(csum_shash)) {
2426                 btrfs_err(fs_info, "error allocating %s hash for checksum",
2427                           csum_driver);
2428                 return PTR_ERR(csum_shash);
2429         }
2430
2431         fs_info->csum_shash = csum_shash;
2432
2433         btrfs_info(fs_info, "using %s (%s) checksum algorithm",
2434                         btrfs_super_csum_name(csum_type),
2435                         crypto_shash_driver_name(csum_shash));
2436         return 0;
2437 }
2438
2439 static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
2440                             struct btrfs_fs_devices *fs_devices)
2441 {
2442         int ret;
2443         struct btrfs_tree_parent_check check = { 0 };
2444         struct btrfs_root *log_tree_root;
2445         struct btrfs_super_block *disk_super = fs_info->super_copy;
2446         u64 bytenr = btrfs_super_log_root(disk_super);
2447         int level = btrfs_super_log_root_level(disk_super);
2448
2449         if (fs_devices->rw_devices == 0) {
2450                 btrfs_warn(fs_info, "log replay required on RO media");
2451                 return -EIO;
2452         }
2453
2454         log_tree_root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID,
2455                                          GFP_KERNEL);
2456         if (!log_tree_root)
2457                 return -ENOMEM;
2458
2459         check.level = level;
2460         check.transid = fs_info->generation + 1;
2461         check.owner_root = BTRFS_TREE_LOG_OBJECTID;
2462         log_tree_root->node = read_tree_block(fs_info, bytenr, &check);
2463         if (IS_ERR(log_tree_root->node)) {
2464                 btrfs_warn(fs_info, "failed to read log tree");
2465                 ret = PTR_ERR(log_tree_root->node);
2466                 log_tree_root->node = NULL;
2467                 btrfs_put_root(log_tree_root);
2468                 return ret;
2469         }
2470         if (!extent_buffer_uptodate(log_tree_root->node)) {
2471                 btrfs_err(fs_info, "failed to read log tree");
2472                 btrfs_put_root(log_tree_root);
2473                 return -EIO;
2474         }
2475
2476         /* returns with log_tree_root freed on success */
2477         ret = btrfs_recover_log_trees(log_tree_root);
2478         if (ret) {
2479                 btrfs_handle_fs_error(fs_info, ret,
2480                                       "Failed to recover log tree");
2481                 btrfs_put_root(log_tree_root);
2482                 return ret;
2483         }
2484
2485         if (sb_rdonly(fs_info->sb)) {
2486                 ret = btrfs_commit_super(fs_info);
2487                 if (ret)
2488                         return ret;
2489         }
2490
2491         return 0;
2492 }
2493
2494 static int load_global_roots_objectid(struct btrfs_root *tree_root,
2495                                       struct btrfs_path *path, u64 objectid,
2496                                       const char *name)
2497 {
2498         struct btrfs_fs_info *fs_info = tree_root->fs_info;
2499         struct btrfs_root *root;
2500         u64 max_global_id = 0;
2501         int ret;
2502         struct btrfs_key key = {
2503                 .objectid = objectid,
2504                 .type = BTRFS_ROOT_ITEM_KEY,
2505                 .offset = 0,
2506         };
2507         bool found = false;
2508
2509         /* If we have IGNOREDATACSUMS skip loading these roots. */
2510         if (objectid == BTRFS_CSUM_TREE_OBJECTID &&
2511             btrfs_test_opt(fs_info, IGNOREDATACSUMS)) {
2512                 set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
2513                 return 0;
2514         }
2515
2516         while (1) {
2517                 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
2518                 if (ret < 0)
2519                         break;
2520
2521                 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
2522                         ret = btrfs_next_leaf(tree_root, path);
2523                         if (ret) {
2524                                 if (ret > 0)
2525                                         ret = 0;
2526                                 break;
2527                         }
2528                 }
2529                 ret = 0;
2530
2531                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2532                 if (key.objectid != objectid)
2533                         break;
2534                 btrfs_release_path(path);
2535
2536                 /*
2537                  * Just worry about this for extent tree, it'll be the same for
2538                  * everybody.
2539                  */
2540                 if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
2541                         max_global_id = max(max_global_id, key.offset);
2542
2543                 found = true;
2544                 root = read_tree_root_path(tree_root, path, &key);
2545                 if (IS_ERR(root)) {
2546                         if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
2547                                 ret = PTR_ERR(root);
2548                         break;
2549                 }
2550                 set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2551                 ret = btrfs_global_root_insert(root);
2552                 if (ret) {
2553                         btrfs_put_root(root);
2554                         break;
2555                 }
2556                 key.offset++;
2557         }
2558         btrfs_release_path(path);
2559
2560         if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
2561                 fs_info->nr_global_roots = max_global_id + 1;
2562
2563         if (!found || ret) {
2564                 if (objectid == BTRFS_CSUM_TREE_OBJECTID)
2565                         set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
2566
2567                 if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
2568                         ret = ret ? ret : -ENOENT;
2569                 else
2570                         ret = 0;
2571                 btrfs_err(fs_info, "failed to load root %s", name);
2572         }
2573         return ret;
2574 }
2575
2576 static int load_global_roots(struct btrfs_root *tree_root)
2577 {
2578         struct btrfs_path *path;
2579         int ret = 0;
2580
2581         path = btrfs_alloc_path();
2582         if (!path)
2583                 return -ENOMEM;
2584
2585         ret = load_global_roots_objectid(tree_root, path,
2586                                          BTRFS_EXTENT_TREE_OBJECTID, "extent");
2587         if (ret)
2588                 goto out;
2589         ret = load_global_roots_objectid(tree_root, path,
2590                                          BTRFS_CSUM_TREE_OBJECTID, "csum");
2591         if (ret)
2592                 goto out;
2593         if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE))
2594                 goto out;
2595         ret = load_global_roots_objectid(tree_root, path,
2596                                          BTRFS_FREE_SPACE_TREE_OBJECTID,
2597                                          "free space");
2598 out:
2599         btrfs_free_path(path);
2600         return ret;
2601 }
2602
2603 static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
2604 {
2605         struct btrfs_root *tree_root = fs_info->tree_root;
2606         struct btrfs_root *root;
2607         struct btrfs_key location;
2608         int ret;
2609
2610         BUG_ON(!fs_info->tree_root);
2611
2612         ret = load_global_roots(tree_root);
2613         if (ret)
2614                 return ret;
2615
2616         location.type = BTRFS_ROOT_ITEM_KEY;
2617         location.offset = 0;
2618
2619         if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) {
2620                 location.objectid = BTRFS_BLOCK_GROUP_TREE_OBJECTID;
2621                 root = btrfs_read_tree_root(tree_root, &location);
2622                 if (IS_ERR(root)) {
2623                         if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2624                                 ret = PTR_ERR(root);
2625                                 goto out;
2626                         }
2627                 } else {
2628                         set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2629                         fs_info->block_group_root = root;
2630                 }
2631         }
2632
2633         location.objectid = BTRFS_DEV_TREE_OBJECTID;
2634         root = btrfs_read_tree_root(tree_root, &location);
2635         if (IS_ERR(root)) {
2636                 if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2637                         ret = PTR_ERR(root);
2638                         goto out;
2639                 }
2640         } else {
2641                 set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2642                 fs_info->dev_root = root;
2643         }
2644         /* Initialize fs_info for all devices in any case */
2645         ret = btrfs_init_devices_late(fs_info);
2646         if (ret)
2647                 goto out;
2648
2649         /*
2650          * This tree can share blocks with some other fs tree during relocation
2651          * and we need a proper setup by btrfs_get_fs_root
2652          */
2653         root = btrfs_get_fs_root(tree_root->fs_info,
2654                                  BTRFS_DATA_RELOC_TREE_OBJECTID, true);
2655         if (IS_ERR(root)) {
2656                 if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2657                         ret = PTR_ERR(root);
2658                         goto out;
2659                 }
2660         } else {
2661                 set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2662                 fs_info->data_reloc_root = root;
2663         }
2664
2665         location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
2666         root = btrfs_read_tree_root(tree_root, &location);
2667         if (!IS_ERR(root)) {
2668                 set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2669                 set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
2670                 fs_info->quota_root = root;
2671         }
2672
2673         location.objectid = BTRFS_UUID_TREE_OBJECTID;
2674         root = btrfs_read_tree_root(tree_root, &location);
2675         if (IS_ERR(root)) {
2676                 if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2677                         ret = PTR_ERR(root);
2678                         if (ret != -ENOENT)
2679                                 goto out;
2680                 }
2681         } else {
2682                 set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2683                 fs_info->uuid_root = root;
2684         }
2685
2686         return 0;
2687 out:
2688         btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d",
2689                    location.objectid, ret);
2690         return ret;
2691 }
2692
2693 /*
2694  * Real super block validation
2695  * NOTE: super csum type and incompat features will not be checked here.
2696  *
2697  * @sb:         super block to check
2698  * @mirror_num: the super block number to check its bytenr:
2699  *              0       the primary (1st) sb
2700  *              1, 2    2nd and 3rd backup copy
2701  *             -1       skip bytenr check
2702  */
2703 int btrfs_validate_super(struct btrfs_fs_info *fs_info,
2704                          struct btrfs_super_block *sb, int mirror_num)
2705 {
2706         u64 nodesize = btrfs_super_nodesize(sb);
2707         u64 sectorsize = btrfs_super_sectorsize(sb);
2708         int ret = 0;
2709
2710         if (btrfs_super_magic(sb) != BTRFS_MAGIC) {
2711                 btrfs_err(fs_info, "no valid FS found");
2712                 ret = -EINVAL;
2713         }
2714         if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) {
2715                 btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu",
2716                                 btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
2717                 ret = -EINVAL;
2718         }
2719         if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
2720                 btrfs_err(fs_info, "tree_root level too big: %d >= %d",
2721                                 btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
2722                 ret = -EINVAL;
2723         }
2724         if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) {
2725                 btrfs_err(fs_info, "chunk_root level too big: %d >= %d",
2726                                 btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL);
2727                 ret = -EINVAL;
2728         }
2729         if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) {
2730                 btrfs_err(fs_info, "log_root level too big: %d >= %d",
2731                                 btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL);
2732                 ret = -EINVAL;
2733         }
2734
2735         /*
2736          * Check sectorsize and nodesize first, other check will need it.
2737          * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here.
2738          */
2739         if (!is_power_of_2(sectorsize) || sectorsize < 4096 ||
2740             sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) {
2741                 btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
2742                 ret = -EINVAL;
2743         }
2744
2745         /*
2746          * We only support at most two sectorsizes: 4K and PAGE_SIZE.
2747          *
2748          * We can support 16K sectorsize with 64K page size without problem,
2749          * but such sectorsize/pagesize combination doesn't make much sense.
2750          * 4K will be our future standard, PAGE_SIZE is supported from the very
2751          * beginning.
2752          */
2753         if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && sectorsize != PAGE_SIZE)) {
2754                 btrfs_err(fs_info,
2755                         "sectorsize %llu not yet supported for page size %lu",
2756                         sectorsize, PAGE_SIZE);
2757                 ret = -EINVAL;
2758         }
2759
2760         if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
2761             nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
2762                 btrfs_err(fs_info, "invalid nodesize %llu", nodesize);
2763                 ret = -EINVAL;
2764         }
2765         if (nodesize != le32_to_cpu(sb->__unused_leafsize)) {
2766                 btrfs_err(fs_info, "invalid leafsize %u, should be %llu",
2767                           le32_to_cpu(sb->__unused_leafsize), nodesize);
2768                 ret = -EINVAL;
2769         }
2770
2771         /* Root alignment check */
2772         if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) {
2773                 btrfs_warn(fs_info, "tree_root block unaligned: %llu",
2774                            btrfs_super_root(sb));
2775                 ret = -EINVAL;
2776         }
2777         if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) {
2778                 btrfs_warn(fs_info, "chunk_root block unaligned: %llu",
2779                            btrfs_super_chunk_root(sb));
2780                 ret = -EINVAL;
2781         }
2782         if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) {
2783                 btrfs_warn(fs_info, "log_root block unaligned: %llu",
2784                            btrfs_super_log_root(sb));
2785                 ret = -EINVAL;
2786         }
2787
2788         if (memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid,
2789                    BTRFS_FSID_SIZE)) {
2790                 btrfs_err(fs_info,
2791                 "superblock fsid doesn't match fsid of fs_devices: %pU != %pU",
2792                         fs_info->super_copy->fsid, fs_info->fs_devices->fsid);
2793                 ret = -EINVAL;
2794         }
2795
2796         if (btrfs_fs_incompat(fs_info, METADATA_UUID) &&
2797             memcmp(fs_info->fs_devices->metadata_uuid,
2798                    fs_info->super_copy->metadata_uuid, BTRFS_FSID_SIZE)) {
2799                 btrfs_err(fs_info,
2800 "superblock metadata_uuid doesn't match metadata uuid of fs_devices: %pU != %pU",
2801                         fs_info->super_copy->metadata_uuid,
2802                         fs_info->fs_devices->metadata_uuid);
2803                 ret = -EINVAL;
2804         }
2805
2806         /*
2807          * Artificial requirement for block-group-tree to force newer features
2808          * (free-space-tree, no-holes) so the test matrix is smaller.
2809          */
2810         if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) &&
2811             (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) ||
2812              !btrfs_fs_incompat(fs_info, NO_HOLES))) {
2813                 btrfs_err(fs_info,
2814                 "block-group-tree feature requires fres-space-tree and no-holes");
2815                 ret = -EINVAL;
2816         }
2817
2818         if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid,
2819                    BTRFS_FSID_SIZE) != 0) {
2820                 btrfs_err(fs_info,
2821                         "dev_item UUID does not match metadata fsid: %pU != %pU",
2822                         fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid);
2823                 ret = -EINVAL;
2824         }
2825
2826         /*
2827          * Hint to catch really bogus numbers, bitflips or so, more exact checks are
2828          * done later
2829          */
2830         if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) {
2831                 btrfs_err(fs_info, "bytes_used is too small %llu",
2832                           btrfs_super_bytes_used(sb));
2833                 ret = -EINVAL;
2834         }
2835         if (!is_power_of_2(btrfs_super_stripesize(sb))) {
2836                 btrfs_err(fs_info, "invalid stripesize %u",
2837                           btrfs_super_stripesize(sb));
2838                 ret = -EINVAL;
2839         }
2840         if (btrfs_super_num_devices(sb) > (1UL << 31))
2841                 btrfs_warn(fs_info, "suspicious number of devices: %llu",
2842                            btrfs_super_num_devices(sb));
2843         if (btrfs_super_num_devices(sb) == 0) {
2844                 btrfs_err(fs_info, "number of devices is 0");
2845                 ret = -EINVAL;
2846         }
2847
2848         if (mirror_num >= 0 &&
2849             btrfs_super_bytenr(sb) != btrfs_sb_offset(mirror_num)) {
2850                 btrfs_err(fs_info, "super offset mismatch %llu != %u",
2851                           btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET);
2852                 ret = -EINVAL;
2853         }
2854
2855         /*
2856          * Obvious sys_chunk_array corruptions, it must hold at least one key
2857          * and one chunk
2858          */
2859         if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
2860                 btrfs_err(fs_info, "system chunk array too big %u > %u",
2861                           btrfs_super_sys_array_size(sb),
2862                           BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
2863                 ret = -EINVAL;
2864         }
2865         if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
2866                         + sizeof(struct btrfs_chunk)) {
2867                 btrfs_err(fs_info, "system chunk array too small %u < %zu",
2868                           btrfs_super_sys_array_size(sb),
2869                           sizeof(struct btrfs_disk_key)
2870                           + sizeof(struct btrfs_chunk));
2871                 ret = -EINVAL;
2872         }
2873
2874         /*
2875          * The generation is a global counter, we'll trust it more than the others
2876          * but it's still possible that it's the one that's wrong.
2877          */
2878         if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb))
2879                 btrfs_warn(fs_info,
2880                         "suspicious: generation < chunk_root_generation: %llu < %llu",
2881                         btrfs_super_generation(sb),
2882                         btrfs_super_chunk_root_generation(sb));
2883         if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb)
2884             && btrfs_super_cache_generation(sb) != (u64)-1)
2885                 btrfs_warn(fs_info,
2886                         "suspicious: generation < cache_generation: %llu < %llu",
2887                         btrfs_super_generation(sb),
2888                         btrfs_super_cache_generation(sb));
2889
2890         return ret;
2891 }
2892
2893 /*
2894  * Validation of super block at mount time.
2895  * Some checks already done early at mount time, like csum type and incompat
2896  * flags will be skipped.
2897  */
2898 static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info)
2899 {
2900         return btrfs_validate_super(fs_info, fs_info->super_copy, 0);
2901 }
2902
2903 /*
2904  * Validation of super block at write time.
2905  * Some checks like bytenr check will be skipped as their values will be
2906  * overwritten soon.
2907  * Extra checks like csum type and incompat flags will be done here.
2908  */
2909 static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info,
2910                                       struct btrfs_super_block *sb)
2911 {
2912         int ret;
2913
2914         ret = btrfs_validate_super(fs_info, sb, -1);
2915         if (ret < 0)
2916                 goto out;
2917         if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) {
2918                 ret = -EUCLEAN;
2919                 btrfs_err(fs_info, "invalid csum type, has %u want %u",
2920                           btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32);
2921                 goto out;
2922         }
2923         if (btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
2924                 ret = -EUCLEAN;
2925                 btrfs_err(fs_info,
2926                 "invalid incompat flags, has 0x%llx valid mask 0x%llx",
2927                           btrfs_super_incompat_flags(sb),
2928                           (unsigned long long)BTRFS_FEATURE_INCOMPAT_SUPP);
2929                 goto out;
2930         }
2931 out:
2932         if (ret < 0)
2933                 btrfs_err(fs_info,
2934                 "super block corruption detected before writing it to disk");
2935         return ret;
2936 }
2937
2938 static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int level)
2939 {
2940         struct btrfs_tree_parent_check check = {
2941                 .level = level,
2942                 .transid = gen,
2943                 .owner_root = root->root_key.objectid
2944         };
2945         int ret = 0;
2946
2947         root->node = read_tree_block(root->fs_info, bytenr, &check);
2948         if (IS_ERR(root->node)) {
2949                 ret = PTR_ERR(root->node);
2950                 root->node = NULL;
2951                 return ret;
2952         }
2953         if (!extent_buffer_uptodate(root->node)) {
2954                 free_extent_buffer(root->node);
2955                 root->node = NULL;
2956                 return -EIO;
2957         }
2958
2959         btrfs_set_root_node(&root->root_item, root->node);
2960         root->commit_root = btrfs_root_node(root);
2961         btrfs_set_root_refs(&root->root_item, 1);
2962         return ret;
2963 }
2964
2965 static int load_important_roots(struct btrfs_fs_info *fs_info)
2966 {
2967         struct btrfs_super_block *sb = fs_info->super_copy;
2968         u64 gen, bytenr;
2969         int level, ret;
2970
2971         bytenr = btrfs_super_root(sb);
2972         gen = btrfs_super_generation(sb);
2973         level = btrfs_super_root_level(sb);
2974         ret = load_super_root(fs_info->tree_root, bytenr, gen, level);
2975         if (ret) {
2976                 btrfs_warn(fs_info, "couldn't read tree root");
2977                 return ret;
2978         }
2979         return 0;
2980 }
2981
2982 static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
2983 {
2984         int backup_index = find_newest_super_backup(fs_info);
2985         struct btrfs_super_block *sb = fs_info->super_copy;
2986         struct btrfs_root *tree_root = fs_info->tree_root;
2987         bool handle_error = false;
2988         int ret = 0;
2989         int i;
2990
2991         for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
2992                 if (handle_error) {
2993                         if (!IS_ERR(tree_root->node))
2994                                 free_extent_buffer(tree_root->node);
2995                         tree_root->node = NULL;
2996
2997                         if (!btrfs_test_opt(fs_info, USEBACKUPROOT))
2998                                 break;
2999
3000                         free_root_pointers(fs_info, 0);
3001
3002                         /*
3003                          * Don't use the log in recovery mode, it won't be
3004                          * valid
3005                          */
3006                         btrfs_set_super_log_root(sb, 0);
3007
3008                         /* We can't trust the free space cache either */
3009                         btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
3010
3011                         ret = read_backup_root(fs_info, i);
3012                         backup_index = ret;
3013                         if (ret < 0)
3014                                 return ret;
3015                 }
3016
3017                 ret = load_important_roots(fs_info);
3018                 if (ret) {
3019                         handle_error = true;
3020                         continue;
3021                 }
3022
3023                 /*
3024                  * No need to hold btrfs_root::objectid_mutex since the fs
3025                  * hasn't been fully initialised and we are the only user
3026                  */
3027                 ret = btrfs_init_root_free_objectid(tree_root);
3028                 if (ret < 0) {
3029                         handle_error = true;
3030                         continue;
3031                 }
3032
3033                 ASSERT(tree_root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
3034
3035                 ret = btrfs_read_roots(fs_info);
3036                 if (ret < 0) {
3037                         handle_error = true;
3038                         continue;
3039                 }
3040
3041                 /* All successful */
3042                 fs_info->generation = btrfs_header_generation(tree_root->node);
3043                 fs_info->last_trans_committed = fs_info->generation;
3044                 fs_info->last_reloc_trans = 0;
3045
3046                 /* Always begin writing backup roots after the one being used */
3047                 if (backup_index < 0) {
3048                         fs_info->backup_root_index = 0;
3049                 } else {
3050                         fs_info->backup_root_index = backup_index + 1;
3051                         fs_info->backup_root_index %= BTRFS_NUM_BACKUP_ROOTS;
3052                 }
3053                 break;
3054         }
3055
3056         return ret;
3057 }
3058
3059 void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
3060 {
3061         INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
3062         INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
3063         INIT_LIST_HEAD(&fs_info->trans_list);
3064         INIT_LIST_HEAD(&fs_info->dead_roots);
3065         INIT_LIST_HEAD(&fs_info->delayed_iputs);
3066         INIT_LIST_HEAD(&fs_info->delalloc_roots);
3067         INIT_LIST_HEAD(&fs_info->caching_block_groups);
3068         spin_lock_init(&fs_info->delalloc_root_lock);
3069         spin_lock_init(&fs_info->trans_lock);
3070         spin_lock_init(&fs_info->fs_roots_radix_lock);
3071         spin_lock_init(&fs_info->delayed_iput_lock);
3072         spin_lock_init(&fs_info->defrag_inodes_lock);
3073         spin_lock_init(&fs_info->super_lock);
3074         spin_lock_init(&fs_info->buffer_lock);
3075         spin_lock_init(&fs_info->unused_bgs_lock);
3076         spin_lock_init(&fs_info->treelog_bg_lock);
3077         spin_lock_init(&fs_info->zone_active_bgs_lock);
3078         spin_lock_init(&fs_info->relocation_bg_lock);
3079         rwlock_init(&fs_info->tree_mod_log_lock);
3080         rwlock_init(&fs_info->global_root_lock);
3081         mutex_init(&fs_info->unused_bg_unpin_mutex);
3082         mutex_init(&fs_info->reclaim_bgs_lock);
3083         mutex_init(&fs_info->reloc_mutex);
3084         mutex_init(&fs_info->delalloc_root_mutex);
3085         mutex_init(&fs_info->zoned_meta_io_lock);
3086         mutex_init(&fs_info->zoned_data_reloc_io_lock);
3087         seqlock_init(&fs_info->profiles_lock);
3088
3089         btrfs_lockdep_init_map(fs_info, btrfs_trans_num_writers);
3090         btrfs_lockdep_init_map(fs_info, btrfs_trans_num_extwriters);
3091         btrfs_lockdep_init_map(fs_info, btrfs_trans_pending_ordered);
3092         btrfs_lockdep_init_map(fs_info, btrfs_ordered_extent);
3093         btrfs_state_lockdep_init_map(fs_info, btrfs_trans_commit_start,
3094                                      BTRFS_LOCKDEP_TRANS_COMMIT_START);
3095         btrfs_state_lockdep_init_map(fs_info, btrfs_trans_unblocked,
3096                                      BTRFS_LOCKDEP_TRANS_UNBLOCKED);
3097         btrfs_state_lockdep_init_map(fs_info, btrfs_trans_super_committed,
3098                                      BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
3099         btrfs_state_lockdep_init_map(fs_info, btrfs_trans_completed,
3100                                      BTRFS_LOCKDEP_TRANS_COMPLETED);
3101
3102         INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
3103         INIT_LIST_HEAD(&fs_info->space_info);
3104         INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
3105         INIT_LIST_HEAD(&fs_info->unused_bgs);
3106         INIT_LIST_HEAD(&fs_info->reclaim_bgs);
3107         INIT_LIST_HEAD(&fs_info->zone_active_bgs);
3108 #ifdef CONFIG_BTRFS_DEBUG
3109         INIT_LIST_HEAD(&fs_info->allocated_roots);
3110         INIT_LIST_HEAD(&fs_info->allocated_ebs);
3111         spin_lock_init(&fs_info->eb_leak_lock);
3112 #endif
3113         extent_map_tree_init(&fs_info->mapping_tree);
3114         btrfs_init_block_rsv(&fs_info->global_block_rsv,
3115                              BTRFS_BLOCK_RSV_GLOBAL);
3116         btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
3117         btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
3118         btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
3119         btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
3120                              BTRFS_BLOCK_RSV_DELOPS);
3121         btrfs_init_block_rsv(&fs_info->delayed_refs_rsv,
3122                              BTRFS_BLOCK_RSV_DELREFS);
3123
3124         atomic_set(&fs_info->async_delalloc_pages, 0);
3125         atomic_set(&fs_info->defrag_running, 0);
3126         atomic_set(&fs_info->nr_delayed_iputs, 0);
3127         atomic64_set(&fs_info->tree_mod_seq, 0);
3128         fs_info->global_root_tree = RB_ROOT;
3129         fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
3130         fs_info->metadata_ratio = 0;
3131         fs_info->defrag_inodes = RB_ROOT;
3132         atomic64_set(&fs_info->free_chunk_space, 0);
3133         fs_info->tree_mod_log = RB_ROOT;
3134         fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
3135         fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
3136         btrfs_init_ref_verify(fs_info);
3137
3138         fs_info->thread_pool_size = min_t(unsigned long,
3139                                           num_online_cpus() + 2, 8);
3140
3141         INIT_LIST_HEAD(&fs_info->ordered_roots);
3142         spin_lock_init(&fs_info->ordered_root_lock);
3143
3144         btrfs_init_scrub(fs_info);
3145 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
3146         fs_info->check_integrity_print_mask = 0;
3147 #endif
3148         btrfs_init_balance(fs_info);
3149         btrfs_init_async_reclaim_work(fs_info);
3150
3151         rwlock_init(&fs_info->block_group_cache_lock);
3152         fs_info->block_group_cache_tree = RB_ROOT_CACHED;
3153
3154         extent_io_tree_init(fs_info, &fs_info->excluded_extents,
3155                             IO_TREE_FS_EXCLUDED_EXTENTS);
3156
3157         mutex_init(&fs_info->ordered_operations_mutex);
3158         mutex_init(&fs_info->tree_log_mutex);
3159         mutex_init(&fs_info->chunk_mutex);
3160         mutex_init(&fs_info->transaction_kthread_mutex);
3161         mutex_init(&fs_info->cleaner_mutex);
3162         mutex_init(&fs_info->ro_block_group_mutex);
3163         init_rwsem(&fs_info->commit_root_sem);
3164         init_rwsem(&fs_info->cleanup_work_sem);
3165         init_rwsem(&fs_info->subvol_sem);
3166         sema_init(&fs_info->uuid_tree_rescan_sem, 1);
3167
3168         btrfs_init_dev_replace_locks(fs_info);
3169         btrfs_init_qgroup(fs_info);
3170         btrfs_discard_init(fs_info);
3171
3172         btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
3173         btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
3174
3175         init_waitqueue_head(&fs_info->transaction_throttle);
3176         init_waitqueue_head(&fs_info->transaction_wait);
3177         init_waitqueue_head(&fs_info->transaction_blocked_wait);
3178         init_waitqueue_head(&fs_info->async_submit_wait);
3179         init_waitqueue_head(&fs_info->delayed_iputs_wait);
3180
3181         /* Usable values until the real ones are cached from the superblock */
3182         fs_info->nodesize = 4096;
3183         fs_info->sectorsize = 4096;
3184         fs_info->sectorsize_bits = ilog2(4096);
3185         fs_info->stripesize = 4096;
3186
3187         fs_info->max_extent_size = BTRFS_MAX_EXTENT_SIZE;
3188
3189         spin_lock_init(&fs_info->swapfile_pins_lock);
3190         fs_info->swapfile_pins = RB_ROOT;
3191
3192         fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH;
3193         INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work);
3194 }
3195
3196 static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb)
3197 {
3198         int ret;
3199
3200         fs_info->sb = sb;
3201         sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE;
3202         sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE);
3203
3204         ret = percpu_counter_init(&fs_info->ordered_bytes, 0, GFP_KERNEL);
3205         if (ret)
3206                 return ret;
3207
3208         ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
3209         if (ret)
3210                 return ret;
3211
3212         fs_info->dirty_metadata_batch = PAGE_SIZE *
3213                                         (1 + ilog2(nr_cpu_ids));
3214
3215         ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
3216         if (ret)
3217                 return ret;
3218
3219         ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0,
3220                         GFP_KERNEL);
3221         if (ret)
3222                 return ret;
3223
3224         fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
3225                                         GFP_KERNEL);
3226         if (!fs_info->delayed_root)
3227                 return -ENOMEM;
3228         btrfs_init_delayed_root(fs_info->delayed_root);
3229
3230         if (sb_rdonly(sb))
3231                 set_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state);
3232
3233         return btrfs_alloc_stripe_hash_table(fs_info);
3234 }
3235
3236 static int btrfs_uuid_rescan_kthread(void *data)
3237 {
3238         struct btrfs_fs_info *fs_info = data;
3239         int ret;
3240
3241         /*
3242          * 1st step is to iterate through the existing UUID tree and
3243          * to delete all entries that contain outdated data.
3244          * 2nd step is to add all missing entries to the UUID tree.
3245          */
3246         ret = btrfs_uuid_tree_iterate(fs_info);
3247         if (ret < 0) {
3248                 if (ret != -EINTR)
3249                         btrfs_warn(fs_info, "iterating uuid_tree failed %d",
3250                                    ret);
3251                 up(&fs_info->uuid_tree_rescan_sem);
3252                 return ret;
3253         }
3254         return btrfs_uuid_scan_kthread(data);
3255 }
3256
3257 static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
3258 {
3259         struct task_struct *task;
3260
3261         down(&fs_info->uuid_tree_rescan_sem);
3262         task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
3263         if (IS_ERR(task)) {
3264                 /* fs_info->update_uuid_tree_gen remains 0 in all error case */
3265                 btrfs_warn(fs_info, "failed to start uuid_rescan task");
3266                 up(&fs_info->uuid_tree_rescan_sem);
3267                 return PTR_ERR(task);
3268         }
3269
3270         return 0;
3271 }
3272
3273 /*
3274  * Some options only have meaning at mount time and shouldn't persist across
3275  * remounts, or be displayed. Clear these at the end of mount and remount
3276  * code paths.
3277  */
3278 void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info)
3279 {
3280         btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
3281         btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE);
3282 }
3283
3284 /*
3285  * Mounting logic specific to read-write file systems. Shared by open_ctree
3286  * and btrfs_remount when remounting from read-only to read-write.
3287  */
3288 int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
3289 {
3290         int ret;
3291         const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE);
3292         bool clear_free_space_tree = false;
3293
3294         if (btrfs_test_opt(fs_info, CLEAR_CACHE) &&
3295             btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
3296                 clear_free_space_tree = true;
3297         } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
3298                    !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) {
3299                 btrfs_warn(fs_info, "free space tree is invalid");
3300                 clear_free_space_tree = true;
3301         }
3302
3303         if (clear_free_space_tree) {
3304                 btrfs_info(fs_info, "clearing free space tree");
3305                 ret = btrfs_clear_free_space_tree(fs_info);
3306                 if (ret) {
3307                         btrfs_warn(fs_info,
3308                                    "failed to clear free space tree: %d", ret);
3309                         goto out;
3310                 }
3311         }
3312
3313         /*
3314          * btrfs_find_orphan_roots() is responsible for finding all the dead
3315          * roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load
3316          * them into the fs_info->fs_roots_radix tree. This must be done before
3317          * calling btrfs_orphan_cleanup() on the tree root. If we don't do it
3318          * first, then btrfs_orphan_cleanup() will delete a dead root's orphan
3319          * item before the root's tree is deleted - this means that if we unmount
3320          * or crash before the deletion completes, on the next mount we will not
3321          * delete what remains of the tree because the orphan item does not
3322          * exists anymore, which is what tells us we have a pending deletion.
3323          */
3324         ret = btrfs_find_orphan_roots(fs_info);
3325         if (ret)
3326                 goto out;
3327
3328         ret = btrfs_cleanup_fs_roots(fs_info);
3329         if (ret)
3330                 goto out;
3331
3332         down_read(&fs_info->cleanup_work_sem);
3333         if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
3334             (ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
3335                 up_read(&fs_info->cleanup_work_sem);
3336                 goto out;
3337         }
3338         up_read(&fs_info->cleanup_work_sem);
3339
3340         mutex_lock(&fs_info->cleaner_mutex);
3341         ret = btrfs_recover_relocation(fs_info);
3342         mutex_unlock(&fs_info->cleaner_mutex);
3343         if (ret < 0) {
3344                 btrfs_warn(fs_info, "failed to recover relocation: %d", ret);
3345                 goto out;
3346         }
3347
3348         if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) &&
3349             !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
3350                 btrfs_info(fs_info, "creating free space tree");
3351                 ret = btrfs_create_free_space_tree(fs_info);
3352                 if (ret) {
3353                         btrfs_warn(fs_info,
3354                                 "failed to create free space tree: %d", ret);
3355                         goto out;
3356                 }
3357         }
3358
3359         if (cache_opt != btrfs_free_space_cache_v1_active(fs_info)) {
3360                 ret = btrfs_set_free_space_cache_v1_active(fs_info, cache_opt);
3361                 if (ret)
3362                         goto out;
3363         }
3364
3365         ret = btrfs_resume_balance_async(fs_info);
3366         if (ret)
3367                 goto out;
3368
3369         ret = btrfs_resume_dev_replace_async(fs_info);
3370         if (ret) {
3371                 btrfs_warn(fs_info, "failed to resume dev_replace");
3372                 goto out;
3373         }
3374
3375         btrfs_qgroup_rescan_resume(fs_info);
3376
3377         if (!fs_info->uuid_root) {
3378                 btrfs_info(fs_info, "creating UUID tree");
3379                 ret = btrfs_create_uuid_tree(fs_info);
3380                 if (ret) {
3381                         btrfs_warn(fs_info,
3382                                    "failed to create the UUID tree %d", ret);
3383                         goto out;
3384                 }
3385         }
3386
3387 out:
3388         return ret;
3389 }
3390
3391 /*
3392  * Do various sanity and dependency checks of different features.
3393  *
3394  * @is_rw_mount:        If the mount is read-write.
3395  *
3396  * This is the place for less strict checks (like for subpage or artificial
3397  * feature dependencies).
3398  *
3399  * For strict checks or possible corruption detection, see
3400  * btrfs_validate_super().
3401  *
3402  * This should be called after btrfs_parse_options(), as some mount options
3403  * (space cache related) can modify on-disk format like free space tree and
3404  * screw up certain feature dependencies.
3405  */
3406 int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount)
3407 {
3408         struct btrfs_super_block *disk_super = fs_info->super_copy;
3409         u64 incompat = btrfs_super_incompat_flags(disk_super);
3410         const u64 compat_ro = btrfs_super_compat_ro_flags(disk_super);
3411         const u64 compat_ro_unsupp = (compat_ro & ~BTRFS_FEATURE_COMPAT_RO_SUPP);
3412
3413         if (incompat & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
3414                 btrfs_err(fs_info,
3415                 "cannot mount because of unknown incompat features (0x%llx)",
3416                     incompat);
3417                 return -EINVAL;
3418         }
3419
3420         /* Runtime limitation for mixed block groups. */
3421         if ((incompat & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
3422             (fs_info->sectorsize != fs_info->nodesize)) {
3423                 btrfs_err(fs_info,
3424 "unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
3425                         fs_info->nodesize, fs_info->sectorsize);
3426                 return -EINVAL;
3427         }
3428
3429         /* Mixed backref is an always-enabled feature. */
3430         incompat |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
3431
3432         /* Set compression related flags just in case. */
3433         if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
3434                 incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
3435         else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD)
3436                 incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD;
3437
3438         /*
3439          * An ancient flag, which should really be marked deprecated.
3440          * Such runtime limitation doesn't really need a incompat flag.
3441          */
3442         if (btrfs_super_nodesize(disk_super) > PAGE_SIZE)
3443                 incompat |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
3444
3445         if (compat_ro_unsupp && is_rw_mount) {
3446                 btrfs_err(fs_info,
3447         "cannot mount read-write because of unknown compat_ro features (0x%llx)",
3448                        compat_ro);
3449                 return -EINVAL;
3450         }
3451
3452         /*
3453          * We have unsupported RO compat features, although RO mounted, we
3454          * should not cause any metadata writes, including log replay.
3455          * Or we could screw up whatever the new feature requires.
3456          */
3457         if (compat_ro_unsupp && btrfs_super_log_root(disk_super) &&
3458             !btrfs_test_opt(fs_info, NOLOGREPLAY)) {
3459                 btrfs_err(fs_info,
3460 "cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay",
3461                           compat_ro);
3462                 return -EINVAL;
3463         }
3464
3465         /*
3466          * Artificial limitations for block group tree, to force
3467          * block-group-tree to rely on no-holes and free-space-tree.
3468          */
3469         if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) &&
3470             (!btrfs_fs_incompat(fs_info, NO_HOLES) ||
3471              !btrfs_test_opt(fs_info, FREE_SPACE_TREE))) {
3472                 btrfs_err(fs_info,
3473 "block-group-tree feature requires no-holes and free-space-tree features");
3474                 return -EINVAL;
3475         }
3476
3477         /*
3478          * Subpage runtime limitation on v1 cache.
3479          *
3480          * V1 space cache still has some hard codeed PAGE_SIZE usage, while
3481          * we're already defaulting to v2 cache, no need to bother v1 as it's
3482          * going to be deprecated anyway.
3483          */
3484         if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) {
3485                 btrfs_warn(fs_info,
3486         "v1 space cache is not supported for page size %lu with sectorsize %u",
3487                            PAGE_SIZE, fs_info->sectorsize);
3488                 return -EINVAL;
3489         }
3490
3491         /* This can be called by remount, we need to protect the super block. */
3492         spin_lock(&fs_info->super_lock);
3493         btrfs_set_super_incompat_flags(disk_super, incompat);
3494         spin_unlock(&fs_info->super_lock);
3495
3496         return 0;
3497 }
3498
3499 int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
3500                       char *options)
3501 {
3502         u32 sectorsize;
3503         u32 nodesize;
3504         u32 stripesize;
3505         u64 generation;
3506         u64 features;
3507         u16 csum_type;
3508         struct btrfs_super_block *disk_super;
3509         struct btrfs_fs_info *fs_info = btrfs_sb(sb);
3510         struct btrfs_root *tree_root;
3511         struct btrfs_root *chunk_root;
3512         int ret;
3513         int err = -EINVAL;
3514         int level;
3515
3516         ret = init_mount_fs_info(fs_info, sb);
3517         if (ret) {
3518                 err = ret;
3519                 goto fail;
3520         }
3521
3522         /* These need to be init'ed before we start creating inodes and such. */
3523         tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID,
3524                                      GFP_KERNEL);
3525         fs_info->tree_root = tree_root;
3526         chunk_root = btrfs_alloc_root(fs_info, BTRFS_CHUNK_TREE_OBJECTID,
3527                                       GFP_KERNEL);
3528         fs_info->chunk_root = chunk_root;
3529         if (!tree_root || !chunk_root) {
3530                 err = -ENOMEM;
3531                 goto fail;
3532         }
3533
3534         fs_info->btree_inode = new_inode(sb);
3535         if (!fs_info->btree_inode) {
3536                 err = -ENOMEM;
3537                 goto fail;
3538         }
3539         mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
3540         btrfs_init_btree_inode(fs_info);
3541
3542         invalidate_bdev(fs_devices->latest_dev->bdev);
3543
3544         /*
3545          * Read super block and check the signature bytes only
3546          */
3547         disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev);
3548         if (IS_ERR(disk_super)) {
3549                 err = PTR_ERR(disk_super);
3550                 goto fail_alloc;
3551         }
3552
3553         /*
3554          * Verify the type first, if that or the checksum value are
3555          * corrupted, we'll find out
3556          */
3557         csum_type = btrfs_super_csum_type(disk_super);
3558         if (!btrfs_supported_super_csum(csum_type)) {
3559                 btrfs_err(fs_info, "unsupported checksum algorithm: %u",
3560                           csum_type);
3561                 err = -EINVAL;
3562                 btrfs_release_disk_super(disk_super);
3563                 goto fail_alloc;
3564         }
3565
3566         fs_info->csum_size = btrfs_super_csum_size(disk_super);
3567
3568         ret = btrfs_init_csum_hash(fs_info, csum_type);
3569         if (ret) {
3570                 err = ret;
3571                 btrfs_release_disk_super(disk_super);
3572                 goto fail_alloc;
3573         }
3574
3575         /*
3576          * We want to check superblock checksum, the type is stored inside.
3577          * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
3578          */
3579         if (btrfs_check_super_csum(fs_info, disk_super)) {
3580                 btrfs_err(fs_info, "superblock checksum mismatch");
3581                 err = -EINVAL;
3582                 btrfs_release_disk_super(disk_super);
3583                 goto fail_alloc;
3584         }
3585
3586         /*
3587          * super_copy is zeroed at allocation time and we never touch the
3588          * following bytes up to INFO_SIZE, the checksum is calculated from
3589          * the whole block of INFO_SIZE
3590          */
3591         memcpy(fs_info->super_copy, disk_super, sizeof(*fs_info->super_copy));
3592         btrfs_release_disk_super(disk_super);
3593
3594         disk_super = fs_info->super_copy;
3595
3596
3597         features = btrfs_super_flags(disk_super);
3598         if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) {
3599                 features &= ~BTRFS_SUPER_FLAG_CHANGING_FSID_V2;
3600                 btrfs_set_super_flags(disk_super, features);
3601                 btrfs_info(fs_info,
3602                         "found metadata UUID change in progress flag, clearing");
3603         }
3604
3605         memcpy(fs_info->super_for_commit, fs_info->super_copy,
3606                sizeof(*fs_info->super_for_commit));
3607
3608         ret = btrfs_validate_mount_super(fs_info);
3609         if (ret) {
3610                 btrfs_err(fs_info, "superblock contains fatal errors");
3611                 err = -EINVAL;
3612                 goto fail_alloc;
3613         }
3614
3615         if (!btrfs_super_root(disk_super))
3616                 goto fail_alloc;
3617
3618         /* check FS state, whether FS is broken. */
3619         if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
3620                 set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
3621
3622         /*
3623          * In the long term, we'll store the compression type in the super
3624          * block, and it'll be used for per file compression control.
3625          */
3626         fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
3627
3628
3629         /* Set up fs_info before parsing mount options */
3630         nodesize = btrfs_super_nodesize(disk_super);
3631         sectorsize = btrfs_super_sectorsize(disk_super);
3632         stripesize = sectorsize;
3633         fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
3634         fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
3635
3636         fs_info->nodesize = nodesize;
3637         fs_info->sectorsize = sectorsize;
3638         fs_info->sectorsize_bits = ilog2(sectorsize);
3639         fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
3640         fs_info->stripesize = stripesize;
3641
3642         ret = btrfs_parse_options(fs_info, options, sb->s_flags);
3643         if (ret) {
3644                 err = ret;
3645                 goto fail_alloc;
3646         }
3647
3648         ret = btrfs_check_features(fs_info, !sb_rdonly(sb));
3649         if (ret < 0) {
3650                 err = ret;
3651                 goto fail_alloc;
3652         }
3653
3654         if (sectorsize < PAGE_SIZE) {
3655                 struct btrfs_subpage_info *subpage_info;
3656
3657                 /*
3658                  * V1 space cache has some hardcoded PAGE_SIZE usage, and is
3659                  * going to be deprecated.
3660                  *
3661                  * Force to use v2 cache for subpage case.
3662                  */
3663                 btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
3664                 btrfs_set_and_info(fs_info, FREE_SPACE_TREE,
3665                         "forcing free space tree for sector size %u with page size %lu",
3666                         sectorsize, PAGE_SIZE);
3667
3668                 btrfs_warn(fs_info,
3669                 "read-write for sector size %u with page size %lu is experimental",
3670                            sectorsize, PAGE_SIZE);
3671                 subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL);
3672                 if (!subpage_info)
3673                         goto fail_alloc;
3674                 btrfs_init_subpage_info(subpage_info, sectorsize);
3675                 fs_info->subpage_info = subpage_info;
3676         }
3677
3678         ret = btrfs_init_workqueues(fs_info);
3679         if (ret) {
3680                 err = ret;
3681                 goto fail_sb_buffer;
3682         }
3683
3684         sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
3685         sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
3686
3687         sb->s_blocksize = sectorsize;
3688         sb->s_blocksize_bits = blksize_bits(sectorsize);
3689         memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE);
3690
3691         mutex_lock(&fs_info->chunk_mutex);
3692         ret = btrfs_read_sys_array(fs_info);
3693         mutex_unlock(&fs_info->chunk_mutex);
3694         if (ret) {
3695                 btrfs_err(fs_info, "failed to read the system array: %d", ret);
3696                 goto fail_sb_buffer;
3697         }
3698
3699         generation = btrfs_super_chunk_root_generation(disk_super);
3700         level = btrfs_super_chunk_root_level(disk_super);
3701         ret = load_super_root(chunk_root, btrfs_super_chunk_root(disk_super),
3702                               generation, level);
3703         if (ret) {
3704                 btrfs_err(fs_info, "failed to read chunk root");
3705                 goto fail_tree_roots;
3706         }
3707
3708         read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
3709                            offsetof(struct btrfs_header, chunk_tree_uuid),
3710                            BTRFS_UUID_SIZE);
3711
3712         ret = btrfs_read_chunk_tree(fs_info);
3713         if (ret) {
3714                 btrfs_err(fs_info, "failed to read chunk tree: %d", ret);
3715                 goto fail_tree_roots;
3716         }
3717
3718         /*
3719          * At this point we know all the devices that make this filesystem,
3720          * including the seed devices but we don't know yet if the replace
3721          * target is required. So free devices that are not part of this
3722          * filesystem but skip the replace target device which is checked
3723          * below in btrfs_init_dev_replace().
3724          */
3725         btrfs_free_extra_devids(fs_devices);
3726         if (!fs_devices->latest_dev->bdev) {
3727                 btrfs_err(fs_info, "failed to read devices");
3728                 goto fail_tree_roots;
3729         }
3730
3731         ret = init_tree_roots(fs_info);
3732         if (ret)
3733                 goto fail_tree_roots;
3734
3735         /*
3736          * Get zone type information of zoned block devices. This will also
3737          * handle emulation of a zoned filesystem if a regular device has the
3738          * zoned incompat feature flag set.
3739          */
3740         ret = btrfs_get_dev_zone_info_all_devices(fs_info);
3741         if (ret) {
3742                 btrfs_err(fs_info,
3743                           "zoned: failed to read device zone info: %d",
3744                           ret);
3745                 goto fail_block_groups;
3746         }
3747
3748         /*
3749          * If we have a uuid root and we're not being told to rescan we need to
3750          * check the generation here so we can set the
3751          * BTRFS_FS_UPDATE_UUID_TREE_GEN bit.  Otherwise we could commit the
3752          * transaction during a balance or the log replay without updating the
3753          * uuid generation, and then if we crash we would rescan the uuid tree,
3754          * even though it was perfectly fine.
3755          */
3756         if (fs_info->uuid_root && !btrfs_test_opt(fs_info, RESCAN_UUID_TREE) &&
3757             fs_info->generation == btrfs_super_uuid_tree_generation(disk_super))
3758                 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
3759
3760         ret = btrfs_verify_dev_extents(fs_info);
3761         if (ret) {
3762                 btrfs_err(fs_info,
3763                           "failed to verify dev extents against chunks: %d",
3764                           ret);
3765                 goto fail_block_groups;
3766         }
3767         ret = btrfs_recover_balance(fs_info);
3768         if (ret) {
3769                 btrfs_err(fs_info, "failed to recover balance: %d", ret);
3770                 goto fail_block_groups;
3771         }
3772
3773         ret = btrfs_init_dev_stats(fs_info);
3774         if (ret) {
3775                 btrfs_err(fs_info, "failed to init dev_stats: %d", ret);
3776                 goto fail_block_groups;
3777         }
3778
3779         ret = btrfs_init_dev_replace(fs_info);
3780         if (ret) {
3781                 btrfs_err(fs_info, "failed to init dev_replace: %d", ret);
3782                 goto fail_block_groups;
3783         }
3784
3785         ret = btrfs_check_zoned_mode(fs_info);
3786         if (ret) {
3787                 btrfs_err(fs_info, "failed to initialize zoned mode: %d",
3788                           ret);
3789                 goto fail_block_groups;
3790         }
3791
3792         ret = btrfs_sysfs_add_fsid(fs_devices);
3793         if (ret) {
3794                 btrfs_err(fs_info, "failed to init sysfs fsid interface: %d",
3795                                 ret);
3796                 goto fail_block_groups;
3797         }
3798
3799         ret = btrfs_sysfs_add_mounted(fs_info);
3800         if (ret) {
3801                 btrfs_err(fs_info, "failed to init sysfs interface: %d", ret);
3802                 goto fail_fsdev_sysfs;
3803         }
3804
3805         ret = btrfs_init_space_info(fs_info);
3806         if (ret) {
3807                 btrfs_err(fs_info, "failed to initialize space info: %d", ret);
3808                 goto fail_sysfs;
3809         }
3810
3811         ret = btrfs_read_block_groups(fs_info);
3812         if (ret) {
3813                 btrfs_err(fs_info, "failed to read block groups: %d", ret);
3814                 goto fail_sysfs;
3815         }
3816
3817         btrfs_free_zone_cache(fs_info);
3818
3819         if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices &&
3820             !btrfs_check_rw_degradable(fs_info, NULL)) {
3821                 btrfs_warn(fs_info,
3822                 "writable mount is not allowed due to too many missing devices");
3823                 goto fail_sysfs;
3824         }
3825
3826         fs_info->cleaner_kthread = kthread_run(cleaner_kthread, fs_info,
3827                                                "btrfs-cleaner");
3828         if (IS_ERR(fs_info->cleaner_kthread))
3829                 goto fail_sysfs;
3830
3831         fs_info->transaction_kthread = kthread_run(transaction_kthread,
3832                                                    tree_root,
3833                                                    "btrfs-transaction");
3834         if (IS_ERR(fs_info->transaction_kthread))
3835                 goto fail_cleaner;
3836
3837         if (!btrfs_test_opt(fs_info, NOSSD) &&
3838             !fs_info->fs_devices->rotating) {
3839                 btrfs_set_and_info(fs_info, SSD, "enabling ssd optimizations");
3840         }
3841
3842         /*
3843          * For devices supporting discard turn on discard=async automatically,
3844          * unless it's already set or disabled. This could be turned off by
3845          * nodiscard for the same mount.
3846          */
3847         if (!(btrfs_test_opt(fs_info, DISCARD_SYNC) ||
3848               btrfs_test_opt(fs_info, DISCARD_ASYNC) ||
3849               btrfs_test_opt(fs_info, NODISCARD)) &&
3850             fs_info->fs_devices->discardable) {
3851                 btrfs_set_and_info(fs_info, DISCARD_ASYNC,
3852                                    "auto enabling async discard");
3853                 btrfs_clear_opt(fs_info->mount_opt, NODISCARD);
3854         }
3855
3856 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
3857         if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) {
3858                 ret = btrfsic_mount(fs_info, fs_devices,
3859                                     btrfs_test_opt(fs_info,
3860                                         CHECK_INTEGRITY_DATA) ? 1 : 0,
3861                                     fs_info->check_integrity_print_mask);
3862                 if (ret)
3863                         btrfs_warn(fs_info,
3864                                 "failed to initialize integrity check module: %d",
3865                                 ret);
3866         }
3867 #endif
3868         ret = btrfs_read_qgroup_config(fs_info);
3869         if (ret)
3870                 goto fail_trans_kthread;
3871
3872         if (btrfs_build_ref_tree(fs_info))
3873                 btrfs_err(fs_info, "couldn't build ref tree");
3874
3875         /* do not make disk changes in broken FS or nologreplay is given */
3876         if (btrfs_super_log_root(disk_super) != 0 &&
3877             !btrfs_test_opt(fs_info, NOLOGREPLAY)) {
3878                 btrfs_info(fs_info, "start tree-log replay");
3879                 ret = btrfs_replay_log(fs_info, fs_devices);
3880                 if (ret) {
3881                         err = ret;
3882                         goto fail_qgroup;
3883                 }
3884         }
3885
3886         fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true);
3887         if (IS_ERR(fs_info->fs_root)) {
3888                 err = PTR_ERR(fs_info->fs_root);
3889                 btrfs_warn(fs_info, "failed to read fs tree: %d", err);
3890                 fs_info->fs_root = NULL;
3891                 goto fail_qgroup;
3892         }
3893
3894         if (sb_rdonly(sb))
3895                 goto clear_oneshot;
3896
3897         ret = btrfs_start_pre_rw_mount(fs_info);
3898         if (ret) {
3899                 close_ctree(fs_info);
3900                 return ret;
3901         }
3902         btrfs_discard_resume(fs_info);
3903
3904         if (fs_info->uuid_root &&
3905             (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) ||
3906              fs_info->generation != btrfs_super_uuid_tree_generation(disk_super))) {
3907                 btrfs_info(fs_info, "checking UUID tree");
3908                 ret = btrfs_check_uuid_tree(fs_info);
3909                 if (ret) {
3910                         btrfs_warn(fs_info,
3911                                 "failed to check the UUID tree: %d", ret);
3912                         close_ctree(fs_info);
3913                         return ret;
3914                 }
3915         }
3916
3917         set_bit(BTRFS_FS_OPEN, &fs_info->flags);
3918
3919         /* Kick the cleaner thread so it'll start deleting snapshots. */
3920         if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags))
3921                 wake_up_process(fs_info->cleaner_kthread);
3922
3923 clear_oneshot:
3924         btrfs_clear_oneshot_options(fs_info);
3925         return 0;
3926
3927 fail_qgroup:
3928         btrfs_free_qgroup_config(fs_info);
3929 fail_trans_kthread:
3930         kthread_stop(fs_info->transaction_kthread);
3931         btrfs_cleanup_transaction(fs_info);
3932         btrfs_free_fs_roots(fs_info);
3933 fail_cleaner:
3934         kthread_stop(fs_info->cleaner_kthread);
3935
3936         /*
3937          * make sure we're done with the btree inode before we stop our
3938          * kthreads
3939          */
3940         filemap_write_and_wait(fs_info->btree_inode->i_mapping);
3941
3942 fail_sysfs:
3943         btrfs_sysfs_remove_mounted(fs_info);
3944
3945 fail_fsdev_sysfs:
3946         btrfs_sysfs_remove_fsid(fs_info->fs_devices);
3947
3948 fail_block_groups:
3949         btrfs_put_block_group_cache(fs_info);
3950
3951 fail_tree_roots:
3952         if (fs_info->data_reloc_root)
3953                 btrfs_drop_and_free_fs_root(fs_info, fs_info->data_reloc_root);
3954         free_root_pointers(fs_info, true);
3955         invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
3956
3957 fail_sb_buffer:
3958         btrfs_stop_all_workers(fs_info);
3959         btrfs_free_block_groups(fs_info);
3960 fail_alloc:
3961         btrfs_mapping_tree_free(&fs_info->mapping_tree);
3962
3963         iput(fs_info->btree_inode);
3964 fail:
3965         btrfs_close_devices(fs_info->fs_devices);
3966         return err;
3967 }
3968 ALLOW_ERROR_INJECTION(open_ctree, ERRNO);
3969
3970 static void btrfs_end_super_write(struct bio *bio)
3971 {
3972         struct btrfs_device *device = bio->bi_private;
3973         struct bio_vec *bvec;
3974         struct bvec_iter_all iter_all;
3975         struct page *page;
3976
3977         bio_for_each_segment_all(bvec, bio, iter_all) {
3978                 page = bvec->bv_page;
3979
3980                 if (bio->bi_status) {
3981                         btrfs_warn_rl_in_rcu(device->fs_info,
3982                                 "lost page write due to IO error on %s (%d)",
3983                                 btrfs_dev_name(device),
3984                                 blk_status_to_errno(bio->bi_status));
3985                         ClearPageUptodate(page);
3986                         SetPageError(page);
3987                         btrfs_dev_stat_inc_and_print(device,
3988                                                      BTRFS_DEV_STAT_WRITE_ERRS);
3989                 } else {
3990                         SetPageUptodate(page);
3991                 }
3992
3993                 put_page(page);
3994                 unlock_page(page);
3995         }
3996
3997         bio_put(bio);
3998 }
3999
4000 struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
4001                                                    int copy_num, bool drop_cache)
4002 {
4003         struct btrfs_super_block *super;
4004         struct page *page;
4005         u64 bytenr, bytenr_orig;
4006         struct address_space *mapping = bdev->bd_inode->i_mapping;
4007         int ret;
4008
4009         bytenr_orig = btrfs_sb_offset(copy_num);
4010         ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr);
4011         if (ret == -ENOENT)
4012                 return ERR_PTR(-EINVAL);
4013         else if (ret)
4014                 return ERR_PTR(ret);
4015
4016         if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev))
4017                 return ERR_PTR(-EINVAL);
4018
4019         if (drop_cache) {
4020                 /* This should only be called with the primary sb. */
4021                 ASSERT(copy_num == 0);
4022
4023                 /*
4024                  * Drop the page of the primary superblock, so later read will
4025                  * always read from the device.
4026                  */
4027                 invalidate_inode_pages2_range(mapping,
4028                                 bytenr >> PAGE_SHIFT,
4029                                 (bytenr + BTRFS_SUPER_INFO_SIZE) >> PAGE_SHIFT);
4030         }
4031
4032         page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
4033         if (IS_ERR(page))
4034                 return ERR_CAST(page);
4035
4036         super = page_address(page);
4037         if (btrfs_super_magic(super) != BTRFS_MAGIC) {
4038                 btrfs_release_disk_super(super);
4039                 return ERR_PTR(-ENODATA);
4040         }
4041
4042         if (btrfs_super_bytenr(super) != bytenr_orig) {
4043                 btrfs_release_disk_super(super);
4044                 return ERR_PTR(-EINVAL);
4045         }
4046
4047         return super;
4048 }
4049
4050
4051 struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev)
4052 {
4053         struct btrfs_super_block *super, *latest = NULL;
4054         int i;
4055         u64 transid = 0;
4056
4057         /* we would like to check all the supers, but that would make
4058          * a btrfs mount succeed after a mkfs from a different FS.
4059          * So, we need to add a special mount option to scan for
4060          * later supers, using BTRFS_SUPER_MIRROR_MAX instead
4061          */
4062         for (i = 0; i < 1; i++) {
4063                 super = btrfs_read_dev_one_super(bdev, i, false);
4064                 if (IS_ERR(super))
4065                         continue;
4066
4067                 if (!latest || btrfs_super_generation(super) > transid) {
4068                         if (latest)
4069                                 btrfs_release_disk_super(super);
4070
4071                         latest = super;
4072                         transid = btrfs_super_generation(super);
4073                 }
4074         }
4075
4076         return super;
4077 }
4078
4079 /*
4080  * Write superblock @sb to the @device. Do not wait for completion, all the
4081  * pages we use for writing are locked.
4082  *
4083  * Write @max_mirrors copies of the superblock, where 0 means default that fit
4084  * the expected device size at commit time. Note that max_mirrors must be
4085  * same for write and wait phases.
4086  *
4087  * Return number of errors when page is not found or submission fails.
4088  */
4089 static int write_dev_supers(struct btrfs_device *device,
4090                             struct btrfs_super_block *sb, int max_mirrors)
4091 {
4092         struct btrfs_fs_info *fs_info = device->fs_info;
4093         struct address_space *mapping = device->bdev->bd_inode->i_mapping;
4094         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
4095         int i;
4096         int errors = 0;
4097         int ret;
4098         u64 bytenr, bytenr_orig;
4099
4100         if (max_mirrors == 0)
4101                 max_mirrors = BTRFS_SUPER_MIRROR_MAX;
4102
4103         shash->tfm = fs_info->csum_shash;
4104
4105         for (i = 0; i < max_mirrors; i++) {
4106                 struct page *page;
4107                 struct bio *bio;
4108                 struct btrfs_super_block *disk_super;
4109
4110                 bytenr_orig = btrfs_sb_offset(i);
4111                 ret = btrfs_sb_log_location(device, i, WRITE, &bytenr);
4112                 if (ret == -ENOENT) {
4113                         continue;
4114                 } else if (ret < 0) {
4115                         btrfs_err(device->fs_info,
4116                                 "couldn't get super block location for mirror %d",
4117                                 i);
4118                         errors++;
4119                         continue;
4120                 }
4121                 if (bytenr + BTRFS_SUPER_INFO_SIZE >=
4122                     device->commit_total_bytes)
4123                         break;
4124
4125                 btrfs_set_super_bytenr(sb, bytenr_orig);
4126
4127                 crypto_shash_digest(shash, (const char *)sb + BTRFS_CSUM_SIZE,
4128                                     BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
4129                                     sb->csum);
4130
4131                 page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT,
4132                                            GFP_NOFS);
4133                 if (!page) {
4134                         btrfs_err(device->fs_info,
4135                             "couldn't get super block page for bytenr %llu",
4136                             bytenr);
4137                         errors++;
4138                         continue;
4139                 }
4140
4141                 /* Bump the refcount for wait_dev_supers() */
4142                 get_page(page);
4143
4144                 disk_super = page_address(page);
4145                 memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE);
4146
4147                 /*
4148                  * Directly use bios here instead of relying on the page cache
4149                  * to do I/O, so we don't lose the ability to do integrity
4150                  * checking.
4151                  */
4152                 bio = bio_alloc(device->bdev, 1,
4153                                 REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO,
4154                                 GFP_NOFS);
4155                 bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
4156                 bio->bi_private = device;
4157                 bio->bi_end_io = btrfs_end_super_write;
4158                 __bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE,
4159                                offset_in_page(bytenr));
4160
4161                 /*
4162                  * We FUA only the first super block.  The others we allow to
4163                  * go down lazy and there's a short window where the on-disk
4164                  * copies might still contain the older version.
4165                  */
4166                 if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
4167                         bio->bi_opf |= REQ_FUA;
4168
4169                 btrfsic_check_bio(bio);
4170                 submit_bio(bio);
4171
4172                 if (btrfs_advance_sb_log(device, i))
4173                         errors++;
4174         }
4175         return errors < i ? 0 : -1;
4176 }
4177
4178 /*
4179  * Wait for write completion of superblocks done by write_dev_supers,
4180  * @max_mirrors same for write and wait phases.
4181  *
4182  * Return number of errors when page is not found or not marked up to
4183  * date.
4184  */
4185 static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
4186 {
4187         int i;
4188         int errors = 0;
4189         bool primary_failed = false;
4190         int ret;
4191         u64 bytenr;
4192
4193         if (max_mirrors == 0)
4194                 max_mirrors = BTRFS_SUPER_MIRROR_MAX;
4195
4196         for (i = 0; i < max_mirrors; i++) {
4197                 struct page *page;
4198
4199                 ret = btrfs_sb_log_location(device, i, READ, &bytenr);
4200                 if (ret == -ENOENT) {
4201                         break;
4202                 } else if (ret < 0) {
4203                         errors++;
4204                         if (i == 0)
4205                                 primary_failed = true;
4206                         continue;
4207                 }
4208                 if (bytenr + BTRFS_SUPER_INFO_SIZE >=
4209                     device->commit_total_bytes)
4210                         break;
4211
4212                 page = find_get_page(device->bdev->bd_inode->i_mapping,
4213                                      bytenr >> PAGE_SHIFT);
4214                 if (!page) {
4215                         errors++;
4216                         if (i == 0)
4217                                 primary_failed = true;
4218                         continue;
4219                 }
4220                 /* Page is submitted locked and unlocked once the IO completes */
4221                 wait_on_page_locked(page);
4222                 if (PageError(page)) {
4223                         errors++;
4224                         if (i == 0)
4225                                 primary_failed = true;
4226                 }
4227
4228                 /* Drop our reference */
4229                 put_page(page);
4230
4231                 /* Drop the reference from the writing run */
4232                 put_page(page);
4233         }
4234
4235         /* log error, force error return */
4236         if (primary_failed) {
4237                 btrfs_err(device->fs_info, "error writing primary super block to device %llu",
4238                           device->devid);
4239                 return -1;
4240         }
4241
4242         return errors < i ? 0 : -1;
4243 }
4244
4245 /*
4246  * endio for the write_dev_flush, this will wake anyone waiting
4247  * for the barrier when it is done
4248  */
4249 static void btrfs_end_empty_barrier(struct bio *bio)
4250 {
4251         bio_uninit(bio);
4252         complete(bio->bi_private);
4253 }
4254
4255 /*
4256  * Submit a flush request to the device if it supports it. Error handling is
4257  * done in the waiting counterpart.
4258  */
4259 static void write_dev_flush(struct btrfs_device *device)
4260 {
4261         struct bio *bio = &device->flush_bio;
4262
4263 #ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY
4264         /*
4265          * When a disk has write caching disabled, we skip submission of a bio
4266          * with flush and sync requests before writing the superblock, since
4267          * it's not needed. However when the integrity checker is enabled, this
4268          * results in reports that there are metadata blocks referred by a
4269          * superblock that were not properly flushed. So don't skip the bio
4270          * submission only when the integrity checker is enabled for the sake
4271          * of simplicity, since this is a debug tool and not meant for use in
4272          * non-debug builds.
4273          */
4274         if (!bdev_write_cache(device->bdev))
4275                 return;
4276 #endif
4277
4278         bio_init(bio, device->bdev, NULL, 0,
4279                  REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH);
4280         bio->bi_end_io = btrfs_end_empty_barrier;
4281         init_completion(&device->flush_wait);
4282         bio->bi_private = &device->flush_wait;
4283
4284         btrfsic_check_bio(bio);
4285         submit_bio(bio);
4286         set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
4287 }
4288
4289 /*
4290  * If the flush bio has been submitted by write_dev_flush, wait for it.
4291  */
4292 static blk_status_t wait_dev_flush(struct btrfs_device *device)
4293 {
4294         struct bio *bio = &device->flush_bio;
4295
4296         if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state))
4297                 return BLK_STS_OK;
4298
4299         clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
4300         wait_for_completion_io(&device->flush_wait);
4301
4302         return bio->bi_status;
4303 }
4304
4305 static int check_barrier_error(struct btrfs_fs_info *fs_info)
4306 {
4307         if (!btrfs_check_rw_degradable(fs_info, NULL))
4308                 return -EIO;
4309         return 0;
4310 }
4311
4312 /*
4313  * send an empty flush down to each device in parallel,
4314  * then wait for them
4315  */
4316 static int barrier_all_devices(struct btrfs_fs_info *info)
4317 {
4318         struct list_head *head;
4319         struct btrfs_device *dev;
4320         int errors_wait = 0;
4321         blk_status_t ret;
4322
4323         lockdep_assert_held(&info->fs_devices->device_list_mutex);
4324         /* send down all the barriers */
4325         head = &info->fs_devices->devices;
4326         list_for_each_entry(dev, head, dev_list) {
4327                 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
4328                         continue;
4329                 if (!dev->bdev)
4330                         continue;
4331                 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4332                     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
4333                         continue;
4334
4335                 write_dev_flush(dev);
4336                 dev->last_flush_error = BLK_STS_OK;
4337         }
4338
4339         /* wait for all the barriers */
4340         list_for_each_entry(dev, head, dev_list) {
4341                 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
4342                         continue;
4343                 if (!dev->bdev) {
4344                         errors_wait++;
4345                         continue;
4346                 }
4347                 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4348                     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
4349                         continue;
4350
4351                 ret = wait_dev_flush(dev);
4352                 if (ret) {
4353                         dev->last_flush_error = ret;
4354                         btrfs_dev_stat_inc_and_print(dev,
4355                                         BTRFS_DEV_STAT_FLUSH_ERRS);
4356                         errors_wait++;
4357                 }
4358         }
4359
4360         if (errors_wait) {
4361                 /*
4362                  * At some point we need the status of all disks
4363                  * to arrive at the volume status. So error checking
4364                  * is being pushed to a separate loop.
4365                  */
4366                 return check_barrier_error(info);
4367         }
4368         return 0;
4369 }
4370
4371 int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
4372 {
4373         int raid_type;
4374         int min_tolerated = INT_MAX;
4375
4376         if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 ||
4377             (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE))
4378                 min_tolerated = min_t(int, min_tolerated,
4379                                     btrfs_raid_array[BTRFS_RAID_SINGLE].
4380                                     tolerated_failures);
4381
4382         for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
4383                 if (raid_type == BTRFS_RAID_SINGLE)
4384                         continue;
4385                 if (!(flags & btrfs_raid_array[raid_type].bg_flag))
4386                         continue;
4387                 min_tolerated = min_t(int, min_tolerated,
4388                                     btrfs_raid_array[raid_type].
4389                                     tolerated_failures);
4390         }
4391
4392         if (min_tolerated == INT_MAX) {
4393                 pr_warn("BTRFS: unknown raid flag: %llu", flags);
4394                 min_tolerated = 0;
4395         }
4396
4397         return min_tolerated;
4398 }
4399
4400 int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
4401 {
4402         struct list_head *head;
4403         struct btrfs_device *dev;
4404         struct btrfs_super_block *sb;
4405         struct btrfs_dev_item *dev_item;
4406         int ret;
4407         int do_barriers;
4408         int max_errors;
4409         int total_errors = 0;
4410         u64 flags;
4411
4412         do_barriers = !btrfs_test_opt(fs_info, NOBARRIER);
4413
4414         /*
4415          * max_mirrors == 0 indicates we're from commit_transaction,
4416          * not from fsync where the tree roots in fs_info have not
4417          * been consistent on disk.
4418          */
4419         if (max_mirrors == 0)
4420                 backup_super_roots(fs_info);
4421
4422         sb = fs_info->super_for_commit;
4423         dev_item = &sb->dev_item;
4424
4425         mutex_lock(&fs_info->fs_devices->device_list_mutex);
4426         head = &fs_info->fs_devices->devices;
4427         max_errors = btrfs_super_num_devices(fs_info->super_copy) - 1;
4428
4429         if (do_barriers) {
4430                 ret = barrier_all_devices(fs_info);
4431                 if (ret) {
4432                         mutex_unlock(
4433                                 &fs_info->fs_devices->device_list_mutex);
4434                         btrfs_handle_fs_error(fs_info, ret,
4435                                               "errors while submitting device barriers.");
4436                         return ret;
4437                 }
4438         }
4439
4440         list_for_each_entry(dev, head, dev_list) {
4441                 if (!dev->bdev) {
4442                         total_errors++;
4443                         continue;
4444                 }
4445                 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4446                     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
4447                         continue;
4448
4449                 btrfs_set_stack_device_generation(dev_item, 0);
4450                 btrfs_set_stack_device_type(dev_item, dev->type);
4451                 btrfs_set_stack_device_id(dev_item, dev->devid);
4452                 btrfs_set_stack_device_total_bytes(dev_item,
4453                                                    dev->commit_total_bytes);
4454                 btrfs_set_stack_device_bytes_used(dev_item,
4455                                                   dev->commit_bytes_used);
4456                 btrfs_set_stack_device_io_align(dev_item, dev->io_align);
4457                 btrfs_set_stack_device_io_width(dev_item, dev->io_width);
4458                 btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
4459                 memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
4460                 memcpy(dev_item->fsid, dev->fs_devices->metadata_uuid,
4461                        BTRFS_FSID_SIZE);
4462
4463                 flags = btrfs_super_flags(sb);
4464                 btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
4465
4466                 ret = btrfs_validate_write_super(fs_info, sb);
4467                 if (ret < 0) {
4468                         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4469                         btrfs_handle_fs_error(fs_info, -EUCLEAN,
4470                                 "unexpected superblock corruption detected");
4471                         return -EUCLEAN;
4472                 }
4473
4474                 ret = write_dev_supers(dev, sb, max_mirrors);
4475                 if (ret)
4476                         total_errors++;
4477         }
4478         if (total_errors > max_errors) {
4479                 btrfs_err(fs_info, "%d errors while writing supers",
4480                           total_errors);
4481                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4482
4483                 /* FUA is masked off if unsupported and can't be the reason */
4484                 btrfs_handle_fs_error(fs_info, -EIO,
4485                                       "%d errors while writing supers",
4486                                       total_errors);
4487                 return -EIO;
4488         }
4489
4490         total_errors = 0;
4491         list_for_each_entry(dev, head, dev_list) {
4492                 if (!dev->bdev)
4493                         continue;
4494                 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4495                     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
4496                         continue;
4497
4498                 ret = wait_dev_supers(dev, max_mirrors);
4499                 if (ret)
4500                         total_errors++;
4501         }
4502         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4503         if (total_errors > max_errors) {
4504                 btrfs_handle_fs_error(fs_info, -EIO,
4505                                       "%d errors while writing supers",
4506                                       total_errors);
4507                 return -EIO;
4508         }
4509         return 0;
4510 }
4511
4512 /* Drop a fs root from the radix tree and free it. */
4513 void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
4514                                   struct btrfs_root *root)
4515 {
4516         bool drop_ref = false;
4517
4518         spin_lock(&fs_info->fs_roots_radix_lock);
4519         radix_tree_delete(&fs_info->fs_roots_radix,
4520                           (unsigned long)root->root_key.objectid);
4521         if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state))
4522                 drop_ref = true;
4523         spin_unlock(&fs_info->fs_roots_radix_lock);
4524
4525         if (BTRFS_FS_ERROR(fs_info)) {
4526                 ASSERT(root->log_root == NULL);
4527                 if (root->reloc_root) {
4528                         btrfs_put_root(root->reloc_root);
4529                         root->reloc_root = NULL;
4530                 }
4531         }
4532
4533         if (drop_ref)
4534                 btrfs_put_root(root);
4535 }
4536
4537 int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
4538 {
4539         u64 root_objectid = 0;
4540         struct btrfs_root *gang[8];
4541         int i = 0;
4542         int err = 0;
4543         unsigned int ret = 0;
4544
4545         while (1) {
4546                 spin_lock(&fs_info->fs_roots_radix_lock);
4547                 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
4548                                              (void **)gang, root_objectid,
4549                                              ARRAY_SIZE(gang));
4550                 if (!ret) {
4551                         spin_unlock(&fs_info->fs_roots_radix_lock);
4552                         break;
4553                 }
4554                 root_objectid = gang[ret - 1]->root_key.objectid + 1;
4555
4556                 for (i = 0; i < ret; i++) {
4557                         /* Avoid to grab roots in dead_roots */
4558                         if (btrfs_root_refs(&gang[i]->root_item) == 0) {
4559                                 gang[i] = NULL;
4560                                 continue;
4561                         }
4562                         /* grab all the search result for later use */
4563                         gang[i] = btrfs_grab_root(gang[i]);
4564                 }
4565                 spin_unlock(&fs_info->fs_roots_radix_lock);
4566
4567                 for (i = 0; i < ret; i++) {
4568                         if (!gang[i])
4569                                 continue;
4570                         root_objectid = gang[i]->root_key.objectid;
4571                         err = btrfs_orphan_cleanup(gang[i]);
4572                         if (err)
4573                                 break;
4574                         btrfs_put_root(gang[i]);
4575                 }
4576                 root_objectid++;
4577         }
4578
4579         /* release the uncleaned roots due to error */
4580         for (; i < ret; i++) {
4581                 if (gang[i])
4582                         btrfs_put_root(gang[i]);
4583         }
4584         return err;
4585 }
4586
4587 int btrfs_commit_super(struct btrfs_fs_info *fs_info)
4588 {
4589         struct btrfs_root *root = fs_info->tree_root;
4590         struct btrfs_trans_handle *trans;
4591
4592         mutex_lock(&fs_info->cleaner_mutex);
4593         btrfs_run_delayed_iputs(fs_info);
4594         mutex_unlock(&fs_info->cleaner_mutex);
4595         wake_up_process(fs_info->cleaner_kthread);
4596
4597         /* wait until ongoing cleanup work done */
4598         down_write(&fs_info->cleanup_work_sem);
4599         up_write(&fs_info->cleanup_work_sem);
4600
4601         trans = btrfs_join_transaction(root);
4602         if (IS_ERR(trans))
4603                 return PTR_ERR(trans);
4604         return btrfs_commit_transaction(trans);
4605 }
4606
4607 static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info)
4608 {
4609         struct btrfs_transaction *trans;
4610         struct btrfs_transaction *tmp;
4611         bool found = false;
4612
4613         if (list_empty(&fs_info->trans_list))
4614                 return;
4615
4616         /*
4617          * This function is only called at the very end of close_ctree(),
4618          * thus no other running transaction, no need to take trans_lock.
4619          */
4620         ASSERT(test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags));
4621         list_for_each_entry_safe(trans, tmp, &fs_info->trans_list, list) {
4622                 struct extent_state *cached = NULL;
4623                 u64 dirty_bytes = 0;
4624                 u64 cur = 0;
4625                 u64 found_start;
4626                 u64 found_end;
4627
4628                 found = true;
4629                 while (!find_first_extent_bit(&trans->dirty_pages, cur,
4630                         &found_start, &found_end, EXTENT_DIRTY, &cached)) {
4631                         dirty_bytes += found_end + 1 - found_start;
4632                         cur = found_end + 1;
4633                 }
4634                 btrfs_warn(fs_info,
4635         "transaction %llu (with %llu dirty metadata bytes) is not committed",
4636                            trans->transid, dirty_bytes);
4637                 btrfs_cleanup_one_transaction(trans, fs_info);
4638
4639                 if (trans == fs_info->running_transaction)
4640                         fs_info->running_transaction = NULL;
4641                 list_del_init(&trans->list);
4642
4643                 btrfs_put_transaction(trans);
4644                 trace_btrfs_transaction_commit(fs_info);
4645         }
4646         ASSERT(!found);
4647 }
4648
/*
 * Shut down the filesystem described by @fs_info.
 *
 * The sequence is order sensitive: background work is quiesced first, then
 * (unless the super block is read-only) a last btrfs_commit_super() is
 * attempted, the transaction and cleaner kthreads are stopped, and only then
 * are the workers, roots, block groups, btree inode, mapping tree and devices
 * released.
 */
void __cold close_ctree(struct btrfs_fs_info *fs_info)
{
        int ret;

        set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);

        /*
         * If we had UNFINISHED_DROPS we could still be processing them, so
         * clear that bit and wake up relocation so it can stop.
         * We must do this before stopping the block group reclaim task, because
         * at btrfs_relocate_block_group() we wait for this bit, and after the
         * wait we stop with -EINTR if btrfs_fs_closing() returns non-zero - we
         * have just set BTRFS_FS_CLOSING_START, so btrfs_fs_closing() will
         * return 1.
         */
        btrfs_wake_unfinished_drop(fs_info);

        /*
         * We may have the reclaim task running and relocating a data block group,
         * in which case it may create delayed iputs. So stop it before we park
         * the cleaner kthread otherwise we can get new delayed iputs after
         * parking the cleaner, and that can make the async reclaim task hang
         * if it's waiting for delayed iputs to complete, since the cleaner is
         * parked and can not run delayed iputs - this will make us hang when
         * trying to stop the async reclaim task.
         */
        cancel_work_sync(&fs_info->reclaim_bgs_work);
        /*
         * We don't want the cleaner to start new transactions, add more delayed
         * iputs, etc. while we're closing. We can't use kthread_stop() yet
         * because that frees the task_struct, and the transaction kthread might
         * still try to wake up the cleaner.
         */
        kthread_park(fs_info->cleaner_kthread);

        /* wait for the qgroup rescan worker to stop */
        btrfs_qgroup_wait_for_completion(fs_info, false);

        /* wait for the uuid_scan task to finish */
        down(&fs_info->uuid_tree_rescan_sem);
        /* avoid complaints from lockdep et al., set sem back to initial state */
        up(&fs_info->uuid_tree_rescan_sem);

        /* pause restriper - we want to resume on mount */
        btrfs_pause_balance(fs_info);

        btrfs_dev_replace_suspend_for_unmount(fs_info);

        btrfs_scrub_cancel(fs_info);

        /* wait for any defraggers to finish */
        wait_event(fs_info->transaction_wait,
                   (atomic_read(&fs_info->defrag_running) == 0));

        /* clear out the rbtree of defraggable inodes */
        btrfs_cleanup_defrag_inodes(fs_info);

        /*
         * After we parked the cleaner kthread, ordered extents may have
         * completed and created new delayed iputs. If one of the async reclaim
         * tasks is running and in the RUN_DELAYED_IPUTS flush state, then we
         * can hang forever trying to stop it, because if a delayed iput is
         * added after it ran btrfs_run_delayed_iputs() and before it called
         * btrfs_wait_on_delayed_iputs(), it will hang forever since there is
         * no one else to run iputs.
         *
         * So wait for all ongoing ordered extents to complete and then run
         * delayed iputs. This works because once we reach this point no one
         * can either create new ordered extents nor create delayed iputs
         * through some other means.
         *
         * Also note that btrfs_wait_ordered_roots() is not safe here, because
         * it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent,
         * but the delayed iput for the respective inode is made only when doing
         * the final btrfs_put_ordered_extent() (which must happen at
         * btrfs_finish_ordered_io() when we are unmounting).
         */
        btrfs_flush_workqueue(fs_info->endio_write_workers);
        /* Ordered extents for free space inodes. */
        btrfs_flush_workqueue(fs_info->endio_freespace_worker);
        btrfs_run_delayed_iputs(fs_info);

        /* Only now, with delayed iputs run, stop the async reclaim workers. */
        cancel_work_sync(&fs_info->async_reclaim_work);
        cancel_work_sync(&fs_info->async_data_reclaim_work);
        cancel_work_sync(&fs_info->preempt_reclaim_work);

        /* Cancel or finish ongoing discard work */
        btrfs_discard_cleanup(fs_info);

        if (!sb_rdonly(fs_info->sb)) {
                /*
                 * The cleaner kthread is stopped, so do one final pass over
                 * unused block groups.
                 */
                btrfs_delete_unused_bgs(fs_info);

                /*
                 * There might be existing delayed inode workers still running
                 * and holding an empty delayed inode item. We must wait for
                 * them to complete first because they can create a transaction.
                 * This happens when someone calls btrfs_balance_delayed_items()
                 * and then a transaction commit runs the same delayed nodes
                 * before any delayed worker has done something with the nodes.
                 * We must wait for any worker here and not at transaction
                 * commit time since that could cause a deadlock.
                 * This is a very rare case.
                 */
                btrfs_flush_workqueue(fs_info->delayed_workers);

                /* A failed final commit is only logged; teardown continues. */
                ret = btrfs_commit_super(fs_info);
                if (ret)
                        btrfs_err(fs_info, "commit super ret %d", ret);
        }

        /* On a filesystem in error state, run the error cleanup path. */
        if (BTRFS_FS_ERROR(fs_info))
                btrfs_error_commit_super(fs_info);

        kthread_stop(fs_info->transaction_kthread);
        kthread_stop(fs_info->cleaner_kthread);

        ASSERT(list_empty(&fs_info->delayed_iputs));
        /* warn_about_uncommitted_trans() below asserts that this bit is set. */
        set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);

        if (btrfs_check_quota_leak(fs_info)) {
                WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
                btrfs_err(fs_info, "qgroup reserved space leaked");
        }

        btrfs_free_qgroup_config(fs_info);
        ASSERT(list_empty(&fs_info->delalloc_roots));

        /* Report (but do not fail on) non-zero byte counters at unmount. */
        if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
                btrfs_info(fs_info, "at unmount delalloc count %lld",
                       percpu_counter_sum(&fs_info->delalloc_bytes));
        }

        if (percpu_counter_sum(&fs_info->ordered_bytes))
                btrfs_info(fs_info, "at unmount dio bytes count %lld",
                           percpu_counter_sum(&fs_info->ordered_bytes));

        btrfs_sysfs_remove_mounted(fs_info);
        btrfs_sysfs_remove_fsid(fs_info->fs_devices);

        btrfs_put_block_group_cache(fs_info);

        /*
         * We must make sure there are no read requests left to submit
         * after we stop all workers.
         */
        invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
        btrfs_stop_all_workers(fs_info);

        /* We shouldn't have any transaction open at this point */
        warn_about_uncommitted_trans(fs_info);

        clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
        free_root_pointers(fs_info, true);
        btrfs_free_fs_roots(fs_info);

        /*
         * We must free the block groups after dropping the fs_roots as we could
         * have had an IO error and have left over tree log blocks that aren't
         * cleaned up until the fs roots are freed.  This makes the block group
         * accounting appear to be wrong because there's pending reserved bytes,
         * so make sure we do the block group cleanup afterwards.
         */
        btrfs_free_block_groups(fs_info);

        iput(fs_info->btree_inode);

#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
        if (btrfs_test_opt(fs_info, CHECK_INTEGRITY))
                btrfsic_unmount(fs_info->fs_devices);
#endif

        btrfs_mapping_tree_free(&fs_info->mapping_tree);
        btrfs_close_devices(fs_info->fs_devices);
}
4827
4828 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
4829                           int atomic)
4830 {
4831         int ret;
4832         struct inode *btree_inode = buf->pages[0]->mapping->host;
4833
4834         ret = extent_buffer_uptodate(buf);
4835         if (!ret)
4836                 return ret;
4837
4838         ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
4839                                     parent_transid, atomic);
4840         if (ret == -EAGAIN)
4841                 return ret;
4842         return !ret;
4843 }
4844
4845 void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
4846 {
4847         struct btrfs_fs_info *fs_info = buf->fs_info;
4848         u64 transid = btrfs_header_generation(buf);
4849         int was_dirty;
4850
4851 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4852         /*
4853          * This is a fast path so only do this check if we have sanity tests
4854          * enabled.  Normal people shouldn't be using unmapped buffers as dirty
4855          * outside of the sanity tests.
4856          */
4857         if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
4858                 return;
4859 #endif
4860         btrfs_assert_tree_write_locked(buf);
4861         if (transid != fs_info->generation)
4862                 WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n",
4863                         buf->start, transid, fs_info->generation);
4864         was_dirty = set_extent_buffer_dirty(buf);
4865         if (!was_dirty)
4866                 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
4867                                          buf->len,
4868                                          fs_info->dirty_metadata_batch);
4869 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
4870         /*
4871          * Since btrfs_mark_buffer_dirty() can be called with item pointer set
4872          * but item data not updated.
4873          * So here we should only check item pointers, not item data.
4874          */
4875         if (btrfs_header_level(buf) == 0 &&
4876             btrfs_check_leaf_relaxed(buf)) {
4877                 btrfs_print_leaf(buf);
4878                 ASSERT(0);
4879         }
4880 #endif
4881 }
4882
4883 static void __btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info,
4884                                         int flush_delayed)
4885 {
4886         /*
4887          * looks as though older kernels can get into trouble with
4888          * this code, they end up stuck in balance_dirty_pages forever
4889          */
4890         int ret;
4891
4892         if (current->flags & PF_MEMALLOC)
4893                 return;
4894
4895         if (flush_delayed)
4896                 btrfs_balance_delayed_items(fs_info);
4897
4898         ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
4899                                      BTRFS_DIRTY_METADATA_THRESH,
4900                                      fs_info->dirty_metadata_batch);
4901         if (ret > 0) {
4902                 balance_dirty_pages_ratelimited(fs_info->btree_inode->i_mapping);
4903         }
4904 }
4905
/*
 * Balance dirty btree metadata, balancing delayed items first
 * (flush_delayed == 1).
 */
void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info)
{
        const int flush_delayed = 1;

        __btrfs_btree_balance_dirty(fs_info, flush_delayed);
}
4910
/*
 * Balance dirty btree metadata without touching delayed items
 * (flush_delayed == 0).
 */
void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info)
{
        const int flush_delayed = 0;

        __btrfs_btree_balance_dirty(fs_info, flush_delayed);
}
4915
/*
 * Error-state cleanup; close_ctree() calls this when BTRFS_FS_ERROR() is set
 * on @fs_info.
 */
static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
{
        /* cleanup FS via transaction */
        btrfs_cleanup_transaction(fs_info);

        /* Run the pending delayed iputs while holding cleaner_mutex. */
        mutex_lock(&fs_info->cleaner_mutex);
        btrfs_run_delayed_iputs(fs_info);
        mutex_unlock(&fs_info->cleaner_mutex);

        /*
         * Take and immediately release cleanup_work_sem for writing.
         * NOTE(review): presumably this acts as a barrier that waits for any
         * current holder of the semaphore to finish - confirm.
         */
        down_write(&fs_info->cleanup_work_sem);
        up_write(&fs_info->cleanup_work_sem);
}
4928
/*
 * Free the log of every root found in fs_info->fs_roots_radix, then free the
 * log root tree.
 *
 * Roots are looked up in batches of up to ARRAY_SIZE(gang) entries.  Both
 * btrfs_free_log() and btrfs_free_log_root_tree() are called with a NULL
 * transaction handle.
 */
static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info)
{
        struct btrfs_root *gang[8];
        u64 root_objectid = 0;
        int ret;

        spin_lock(&fs_info->fs_roots_radix_lock);
        while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
                                             (void **)gang, root_objectid,
                                             ARRAY_SIZE(gang))) != 0) {
                int i;

                /*
                 * Grab the roots while still under the radix lock; the
                 * result of btrfs_grab_root() may be NULL and is checked
                 * below.
                 */
                for (i = 0; i < ret; i++)
                        gang[i] = btrfs_grab_root(gang[i]);
                spin_unlock(&fs_info->fs_roots_radix_lock);

                /* Free the logs with the radix lock dropped. */
                for (i = 0; i < ret; i++) {
                        if (!gang[i])
                                continue;
                        root_objectid = gang[i]->root_key.objectid;
                        btrfs_free_log(NULL, gang[i]);
                        btrfs_put_root(gang[i]);
                }
                /* Resume the next lookup just past the last root handled. */
                root_objectid++;
                spin_lock(&fs_info->fs_roots_radix_lock);
        }
        spin_unlock(&fs_info->fs_roots_radix_lock);
        btrfs_free_log_root_tree(NULL, fs_info);
}
4958
4959 static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
4960 {
4961         struct btrfs_ordered_extent *ordered;
4962
4963         spin_lock(&root->ordered_extent_lock);
4964         /*
4965          * This will just short circuit the ordered completion stuff which will
4966          * make sure the ordered extent gets properly cleaned up.
4967          */
4968         list_for_each_entry(ordered, &root->ordered_extents,
4969                             root_extent_list)
4970                 set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
4971         spin_unlock(&root->ordered_extent_lock);
4972 }
4973
4974 static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
4975 {
4976         struct btrfs_root *root;
4977         struct list_head splice;
4978
4979         INIT_LIST_HEAD(&splice);
4980
4981         spin_lock(&fs_info->ordered_root_lock);
4982         list_splice_init(&fs_info->ordered_roots, &splice);
4983         while (!list_empty(&splice)) {
4984                 root = list_first_entry(&splice, struct btrfs_root,
4985                                         ordered_root);
4986                 list_move_tail(&root->ordered_root,
4987                                &fs_info->ordered_roots);
4988
4989                 spin_unlock(&fs_info->ordered_root_lock);
4990                 btrfs_destroy_ordered_extents(root);
4991
4992                 cond_resched();
4993                 spin_lock(&fs_info->ordered_root_lock);
4994         }
4995         spin_unlock(&fs_info->ordered_root_lock);
4996
4997         /*
4998          * We need this here because if we've been flipped read-only we won't
4999          * get sync() from the umount, so we need to make sure any ordered
5000          * extents that haven't had their dirty pages IO start writeout yet
5001          * actually get run and error out properly.
5002          */
5003         btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
5004 }
5005
/*
 * Throw away every delayed ref head, and every ref hanging off it, that is
 * still queued in @trans->delayed_refs.
 *
 * For heads with must_insert_reserved set, the head's bytes are moved from
 * "reserved" to "pinned" in the owning block group and its space_info, and
 * the range is then handed to btrfs_error_unpin_extent_range().
 *
 * Return: always 0 (ret is never changed after its initialization).
 */
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
                                      struct btrfs_fs_info *fs_info)
{
        struct rb_node *node;
        struct btrfs_delayed_ref_root *delayed_refs;
        struct btrfs_delayed_ref_node *ref;
        int ret = 0;

        delayed_refs = &trans->delayed_refs;

        spin_lock(&delayed_refs->lock);
        if (atomic_read(&delayed_refs->num_entries) == 0) {
                spin_unlock(&delayed_refs->lock);
                btrfs_debug(fs_info, "delayed_refs has NO entry");
                return ret;
        }

        /* Always take the first head; each pass removes it from href_root. */
        while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) {
                struct btrfs_delayed_ref_head *head;
                struct rb_node *n;
                bool pin_bytes = false;

                head = rb_entry(node, struct btrfs_delayed_ref_head,
                                href_node);
                /*
                 * Non-zero return: start over from the first head.
                 * NOTE(review): on success head->mutex appears to be held
                 * (it is unlocked further down) - confirm against
                 * btrfs_delayed_ref_lock().
                 */
                if (btrfs_delayed_ref_lock(delayed_refs, head))
                        continue;

                spin_lock(&head->lock);
                /* Detach and drop every ref queued on this head. */
                while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
                        ref = rb_entry(n, struct btrfs_delayed_ref_node,
                                       ref_node);
                        ref->in_tree = 0;
                        rb_erase_cached(&ref->ref_node, &head->ref_tree);
                        RB_CLEAR_NODE(&ref->ref_node);
                        if (!list_empty(&ref->add_list))
                                list_del(&ref->add_list);
                        atomic_dec(&delayed_refs->num_entries);
                        btrfs_put_delayed_ref(ref);
                }
                if (head->must_insert_reserved)
                        pin_bytes = true;
                btrfs_free_delayed_extent_op(head->extent_op);
                btrfs_delete_ref_head(delayed_refs, head);
                /* Release all locks before the block group work below. */
                spin_unlock(&head->lock);
                spin_unlock(&delayed_refs->lock);
                mutex_unlock(&head->mutex);

                if (pin_bytes) {
                        struct btrfs_block_group *cache;

                        cache = btrfs_lookup_block_group(fs_info, head->bytenr);
                        BUG_ON(!cache);

                        /* Move head->num_bytes from reserved to pinned. */
                        spin_lock(&cache->space_info->lock);
                        spin_lock(&cache->lock);
                        cache->pinned += head->num_bytes;
                        btrfs_space_info_update_bytes_pinned(fs_info,
                                cache->space_info, head->num_bytes);
                        cache->reserved -= head->num_bytes;
                        cache->space_info->bytes_reserved -= head->num_bytes;
                        spin_unlock(&cache->lock);
                        spin_unlock(&cache->space_info->lock);

                        btrfs_put_block_group(cache);

                        btrfs_error_unpin_extent_range(fs_info, head->bytenr,
                                head->bytenr + head->num_bytes - 1);
                }
                btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
                btrfs_put_delayed_ref_head(head);
                cond_resched();
                /* Retake the lock for the next rb_first_cached() lookup. */
                spin_lock(&delayed_refs->lock);
        }
        btrfs_qgroup_destroy_extent_records(trans);

        spin_unlock(&delayed_refs->lock);

        return ret;
}
5085
5086 static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
5087 {
5088         struct btrfs_inode *btrfs_inode;
5089         struct list_head splice;
5090
5091         INIT_LIST_HEAD(&splice);
5092
5093         spin_lock(&root->delalloc_lock);
5094         list_splice_init(&root->delalloc_inodes, &splice);
5095
5096         while (!list_empty(&splice)) {
5097                 struct inode *inode = NULL;
5098                 btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
5099                                                delalloc_inodes);
5100                 __btrfs_del_delalloc_inode(root, btrfs_inode);
5101                 spin_unlock(&root->delalloc_lock);
5102
5103                 /*
5104                  * Make sure we get a live inode and that it'll not disappear
5105                  * meanwhile.
5106                  */
5107                 inode = igrab(&btrfs_inode->vfs_inode);
5108                 if (inode) {
5109                         invalidate_inode_pages2(inode->i_mapping);
5110                         iput(inode);
5111                 }
5112                 spin_lock(&root->delalloc_lock);
5113         }
5114         spin_unlock(&root->delalloc_lock);
5115 }
5116
5117 static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
5118 {
5119         struct btrfs_root *root;
5120         struct list_head splice;
5121
5122         INIT_LIST_HEAD(&splice);
5123
5124         spin_lock(&fs_info->delalloc_root_lock);
5125         list_splice_init(&fs_info->delalloc_roots, &splice);
5126         while (!list_empty(&splice)) {
5127                 root = list_first_entry(&splice, struct btrfs_root,
5128                                          delalloc_root);
5129                 root = btrfs_grab_root(root);
5130                 BUG_ON(!root);
5131                 spin_unlock(&fs_info->delalloc_root_lock);
5132
5133                 btrfs_destroy_delalloc_inodes(root);
5134                 btrfs_put_root(root);
5135
5136                 spin_lock(&fs_info->delalloc_root_lock);
5137         }
5138         spin_unlock(&fs_info->delalloc_root_lock);
5139 }
5140
5141 static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
5142                                         struct extent_io_tree *dirty_pages,
5143                                         int mark)
5144 {
5145         int ret;
5146         struct extent_buffer *eb;
5147         u64 start = 0;
5148         u64 end;
5149
5150         while (1) {
5151                 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
5152                                             mark, NULL);
5153                 if (ret)
5154                         break;
5155
5156                 clear_extent_bits(dirty_pages, start, end, mark);
5157                 while (start <= end) {
5158                         eb = find_extent_buffer(fs_info, start);
5159                         start += fs_info->nodesize;
5160                         if (!eb)
5161                                 continue;
5162                         wait_on_extent_buffer_writeback(eb);
5163
5164                         if (test_and_clear_bit(EXTENT_BUFFER_DIRTY,
5165                                                &eb->bflags))
5166                                 clear_extent_buffer_dirty(eb);
5167                         free_extent_buffer_stale(eb);
5168                 }
5169         }
5170
5171         return ret;
5172 }
5173
5174 static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
5175                                        struct extent_io_tree *unpin)
5176 {
5177         u64 start;
5178         u64 end;
5179         int ret;
5180
5181         while (1) {
5182                 struct extent_state *cached_state = NULL;
5183
5184                 /*
5185                  * The btrfs_finish_extent_commit() may get the same range as
5186                  * ours between find_first_extent_bit and clear_extent_dirty.
5187                  * Hence, hold the unused_bg_unpin_mutex to avoid double unpin
5188                  * the same extent range.
5189                  */
5190                 mutex_lock(&fs_info->unused_bg_unpin_mutex);
5191                 ret = find_first_extent_bit(unpin, 0, &start, &end,
5192                                             EXTENT_DIRTY, &cached_state);
5193                 if (ret) {
5194                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
5195                         break;
5196                 }
5197
5198                 clear_extent_dirty(unpin, start, end, &cached_state);
5199                 free_extent_state(cached_state);
5200                 btrfs_error_unpin_extent_range(fs_info, start, end);
5201                 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
5202                 cond_resched();
5203         }
5204
5205         return 0;
5206 }
5207
5208 static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache)
5209 {
5210         struct inode *inode;
5211
5212         inode = cache->io_ctl.inode;
5213         if (inode) {
5214                 invalidate_inode_pages2(inode->i_mapping);
5215                 BTRFS_I(inode)->generation = 0;
5216                 cache->io_ctl.inode = NULL;
5217                 iput(inode);
5218         }
5219         ASSERT(cache->io_ctl.pages == NULL);
5220         btrfs_put_block_group(cache);
5221 }
5222
/*
 * Empty the dirty_bgs and io_bgs lists of @cur_trans.
 *
 * Every block group taken off either list gets its disk_cache_state set to
 * BTRFS_DC_ERROR.  Block groups that were on an io list are additionally run
 * through btrfs_cleanup_bg_io().
 */
void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
                             struct btrfs_fs_info *fs_info)
{
        struct btrfs_block_group *cache;

        spin_lock(&cur_trans->dirty_bgs_lock);
        while (!list_empty(&cur_trans->dirty_bgs)) {
                cache = list_first_entry(&cur_trans->dirty_bgs,
                                         struct btrfs_block_group,
                                         dirty_list);

                /*
                 * The block group is also queued for IO: unlink it from the
                 * io list and clean up its IO state with dirty_bgs_lock
                 * dropped.
                 */
                if (!list_empty(&cache->io_list)) {
                        spin_unlock(&cur_trans->dirty_bgs_lock);
                        list_del_init(&cache->io_list);
                        btrfs_cleanup_bg_io(cache);
                        spin_lock(&cur_trans->dirty_bgs_lock);
                }

                list_del_init(&cache->dirty_list);
                spin_lock(&cache->lock);
                cache->disk_cache_state = BTRFS_DC_ERROR;
                spin_unlock(&cache->lock);

                /*
                 * Drop a block group reference and release one item from the
                 * delayed refs reservation, again without dirty_bgs_lock.
                 * NOTE(review): presumably these pair with what was taken
                 * when the block group was added to dirty_bgs - confirm.
                 */
                spin_unlock(&cur_trans->dirty_bgs_lock);
                btrfs_put_block_group(cache);
                btrfs_delayed_refs_rsv_release(fs_info, 1);
                spin_lock(&cur_trans->dirty_bgs_lock);
        }
        spin_unlock(&cur_trans->dirty_bgs_lock);

        /*
         * Refer to the definition of io_bgs member for details why it's safe
         * to use it without any locking
         */
        while (!list_empty(&cur_trans->io_bgs)) {
                cache = list_first_entry(&cur_trans->io_bgs,
                                         struct btrfs_block_group,
                                         io_list);

                list_del_init(&cache->io_list);
                spin_lock(&cache->lock);
                cache->disk_cache_state = BTRFS_DC_ERROR;
                spin_unlock(&cache->lock);
                btrfs_cleanup_bg_io(cache);
        }
}
5269
5270 void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
5271                                    struct btrfs_fs_info *fs_info)
5272 {
5273         struct btrfs_device *dev, *tmp;
5274
5275         btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
5276         ASSERT(list_empty(&cur_trans->dirty_bgs));
5277         ASSERT(list_empty(&cur_trans->io_bgs));
5278
5279         list_for_each_entry_safe(dev, tmp, &cur_trans->dev_update_list,
5280                                  post_commit_list) {
5281                 list_del_init(&dev->post_commit_list);
5282         }
5283
5284         btrfs_destroy_delayed_refs(cur_trans, fs_info);
5285
5286         cur_trans->state = TRANS_STATE_COMMIT_START;
5287         wake_up(&fs_info->transaction_blocked_wait);
5288
5289         cur_trans->state = TRANS_STATE_UNBLOCKED;
5290         wake_up(&fs_info->transaction_wait);
5291
5292         btrfs_destroy_delayed_inodes(fs_info);
5293
5294         btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages,
5295                                      EXTENT_DIRTY);
5296         btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents);
5297
5298         btrfs_free_redirty_list(cur_trans);
5299
5300         cur_trans->state =TRANS_STATE_COMPLETED;
5301         wake_up(&cur_trans->commit_wait);
5302 }
5303
/*
 * Clean up every transaction on fs_info->trans_list, then destroy the
 * remaining ordered extents, delayed inodes, delalloc inodes and logs.
 *
 * The whole function runs under transaction_kthread_mutex.
 *
 * Return: always 0.
 */
static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
{
        struct btrfs_transaction *t;

        mutex_lock(&fs_info->transaction_kthread_mutex);

        spin_lock(&fs_info->trans_lock);
        while (!list_empty(&fs_info->trans_list)) {
                t = list_first_entry(&fs_info->trans_list,
                                     struct btrfs_transaction, list);
                /*
                 * The transaction is already at or past COMMIT_START: take a
                 * reference so it stays valid without trans_lock, wait for
                 * it with btrfs_wait_for_commit(), then re-examine the list
                 * from the start.
                 */
                if (t->state >= TRANS_STATE_COMMIT_START) {
                        refcount_inc(&t->use_count);
                        spin_unlock(&fs_info->trans_lock);
                        btrfs_wait_for_commit(fs_info, t->transid);
                        btrfs_put_transaction(t);
                        spin_lock(&fs_info->trans_lock);
                        continue;
                }
                if (t == fs_info->running_transaction) {
                        t->state = TRANS_STATE_COMMIT_DOING;
                        spin_unlock(&fs_info->trans_lock);
                        /*
                         * We wait for 0 num_writers since we don't hold a trans
                         * handle open currently for this transaction.
                         */
                        wait_event(t->writer_wait,
                                   atomic_read(&t->num_writers) == 0);
                } else {
                        spin_unlock(&fs_info->trans_lock);
                }
                /* trans_lock is not held while the transaction is torn down. */
                btrfs_cleanup_one_transaction(t, fs_info);

                /* Unlink the transaction under trans_lock ... */
                spin_lock(&fs_info->trans_lock);
                if (t == fs_info->running_transaction)
                        fs_info->running_transaction = NULL;
                list_del_init(&t->list);
                spin_unlock(&fs_info->trans_lock);

                /* ... and put it with the lock dropped. */
                btrfs_put_transaction(t);
                trace_btrfs_transaction_commit(fs_info);
                spin_lock(&fs_info->trans_lock);
        }
        spin_unlock(&fs_info->trans_lock);
        btrfs_destroy_all_ordered_extents(fs_info);
        btrfs_destroy_delayed_inodes(fs_info);
        btrfs_assert_delayed_root_empty(fs_info);
        btrfs_destroy_all_delalloc_inodes(fs_info);
        btrfs_drop_all_logs(fs_info);
        mutex_unlock(&fs_info->transaction_kthread_mutex);

        return 0;
}
5356
5357 int btrfs_init_root_free_objectid(struct btrfs_root *root)
5358 {
5359         struct btrfs_path *path;
5360         int ret;
5361         struct extent_buffer *l;
5362         struct btrfs_key search_key;
5363         struct btrfs_key found_key;
5364         int slot;
5365
5366         path = btrfs_alloc_path();
5367         if (!path)
5368                 return -ENOMEM;
5369
5370         search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
5371         search_key.type = -1;
5372         search_key.offset = (u64)-1;
5373         ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
5374         if (ret < 0)
5375                 goto error;
5376         BUG_ON(ret == 0); /* Corruption */
5377         if (path->slots[0] > 0) {
5378                 slot = path->slots[0] - 1;
5379                 l = path->nodes[0];
5380                 btrfs_item_key_to_cpu(l, &found_key, slot);
5381                 root->free_objectid = max_t(u64, found_key.objectid + 1,
5382                                             BTRFS_FIRST_FREE_OBJECTID);
5383         } else {
5384                 root->free_objectid = BTRFS_FIRST_FREE_OBJECTID;
5385         }
5386         ret = 0;
5387 error:
5388         btrfs_free_path(path);
5389         return ret;
5390 }
5391
5392 int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
5393 {
5394         int ret;
5395         mutex_lock(&root->objectid_mutex);
5396
5397         if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
5398                 btrfs_warn(root->fs_info,
5399                            "the objectid of root %llu reaches its highest value",
5400                            root->root_key.objectid);
5401                 ret = -ENOSPC;
5402                 goto out;
5403         }
5404
5405         *objectid = root->free_objectid++;
5406         ret = 0;
5407 out:
5408         mutex_unlock(&root->objectid_mutex);
5409         return ret;
5410 }