btrfs: remove unnecessary local variables for checksum size
fs/btrfs/disk-io.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/radix-tree.h>
#include <linux/writeback.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/migrate.h>
#include <linux/ratelimit.h>
#include <linux/uuid.h>
#include <linux/semaphore.h>
#include <linux/error-injection.h>
#include <linux/crc32c.h>
#include <linux/sched/mm.h>
#include <asm/unaligned.h>
#include <crypto/hash.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "print-tree.h"
#include "locking.h"
#include "tree-log.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "inode-map.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "raid56.h"
#include "sysfs.h"
#include "qgroup.h"
#include "compression.h"
#include "tree-checker.h"
#include "ref-verify.h"
#include "block-group.h"
#include "discard.h"
#include "space-info.h"

#define BTRFS_SUPER_FLAG_SUPP   (BTRFS_HEADER_FLAG_WRITTEN |\
                                 BTRFS_HEADER_FLAG_RELOC |\
                                 BTRFS_SUPER_FLAG_ERROR |\
                                 BTRFS_SUPER_FLAG_SEEDING |\
                                 BTRFS_SUPER_FLAG_METADUMP |\
                                 BTRFS_SUPER_FLAG_METADUMP_V2)

static void end_workqueue_fn(struct btrfs_work *work);
static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
                                      struct btrfs_fs_info *fs_info);
static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
                                        struct extent_io_tree *dirty_pages,
                                        int mark);
static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
                                       struct extent_io_tree *pinned_extents);
static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info);
static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info);

/*
 * btrfs_end_io_wq structs are used to do processing in task context when an IO
 * is complete.  This is used during reads to verify checksums, and it is used
 * by writes to insert metadata for new file extents after IO is complete.
 */
struct btrfs_end_io_wq {
        struct bio *bio;
        bio_end_io_t *end_io;
        void *private;
        struct btrfs_fs_info *info;
        blk_status_t status;
        enum btrfs_wq_endio_type metadata;
        struct btrfs_work work;
};

static struct kmem_cache *btrfs_end_io_wq_cache;

int __init btrfs_end_io_wq_init(void)
{
        btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq",
                                        sizeof(struct btrfs_end_io_wq),
                                        0,
                                        SLAB_MEM_SPREAD,
                                        NULL);
        if (!btrfs_end_io_wq_cache)
                return -ENOMEM;
        return 0;
}

void __cold btrfs_end_io_wq_exit(void)
{
        kmem_cache_destroy(btrfs_end_io_wq_cache);
}

static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
{
        if (fs_info->csum_shash)
                crypto_free_shash(fs_info->csum_shash);
}

/*
 * async submit bios are used to offload expensive checksumming
 * onto the worker threads.  They checksum file and metadata bios
 * just before they are sent down the IO stack.
 */
struct async_submit_bio {
        struct inode *inode;
        struct bio *bio;
        extent_submit_bio_start_t *submit_bio_start;
        int mirror_num;
        /*
         * bio_offset is optional and can be used if the pages in the bio
         * can't tell us where in the file the bio should go
         */
        u64 bio_offset;
        struct btrfs_work work;
        blk_status_t status;
};

/*
 * Lockdep class keys for extent_buffer->lock's in this root.  For a given
 * eb, the lockdep key is determined by the btrfs_root it belongs to and
 * the level the eb occupies in the tree.
 *
 * Different roots are used for different purposes and may nest inside each
 * other and they require separate keysets.  As lockdep keys should be
 * static, assign keysets according to the purpose of the root as indicated
 * by btrfs_root->root_key.objectid.  This ensures that all special purpose
 * roots have separate keysets.
 *
 * Lock-nesting across peer nodes is always done with the immediate parent
 * node locked thus preventing deadlock.  As lockdep doesn't know this, use
 * subclass to avoid triggering lockdep warning in such cases.
 *
 * The key is set by the readpage_end_io_hook after the buffer has passed
 * csum validation but before the pages are unlocked.  It is also set by
 * btrfs_init_new_buffer on freshly allocated blocks.
 *
 * We also add a check to make sure the highest level of the tree is the
 * same as our lockdep setup here.  If BTRFS_MAX_LEVEL changes, this code
 * needs update as well.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# if BTRFS_MAX_LEVEL != 8
#  error
# endif

#define DEFINE_LEVEL(stem, level)                                       \
        .names[level] = "btrfs-" stem "-0" #level,

#define DEFINE_NAME(stem)                                               \
        DEFINE_LEVEL(stem, 0)                                           \
        DEFINE_LEVEL(stem, 1)                                           \
        DEFINE_LEVEL(stem, 2)                                           \
        DEFINE_LEVEL(stem, 3)                                           \
        DEFINE_LEVEL(stem, 4)                                           \
        DEFINE_LEVEL(stem, 5)                                           \
        DEFINE_LEVEL(stem, 6)                                           \
        DEFINE_LEVEL(stem, 7)

static struct btrfs_lockdep_keyset {
        u64                     id;             /* root objectid */
        /* Longest entry: btrfs-free-space-00 */
        char                    names[BTRFS_MAX_LEVEL][20];
        struct lock_class_key   keys[BTRFS_MAX_LEVEL];
} btrfs_lockdep_keysets[] = {
        { .id = BTRFS_ROOT_TREE_OBJECTID,       DEFINE_NAME("root")     },
        { .id = BTRFS_EXTENT_TREE_OBJECTID,     DEFINE_NAME("extent")   },
        { .id = BTRFS_CHUNK_TREE_OBJECTID,      DEFINE_NAME("chunk")    },
        { .id = BTRFS_DEV_TREE_OBJECTID,        DEFINE_NAME("dev")      },
        { .id = BTRFS_FS_TREE_OBJECTID,         DEFINE_NAME("fs")       },
        { .id = BTRFS_CSUM_TREE_OBJECTID,       DEFINE_NAME("csum")     },
        { .id = BTRFS_QUOTA_TREE_OBJECTID,      DEFINE_NAME("quota")    },
        { .id = BTRFS_TREE_LOG_OBJECTID,        DEFINE_NAME("log")      },
        { .id = BTRFS_TREE_RELOC_OBJECTID,      DEFINE_NAME("treloc")   },
        { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, DEFINE_NAME("dreloc")   },
        { .id = BTRFS_UUID_TREE_OBJECTID,       DEFINE_NAME("uuid")     },
        { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") },
        { .id = 0,                              DEFINE_NAME("tree")     },
};

#undef DEFINE_LEVEL
#undef DEFINE_NAME

void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
                                    int level)
{
        struct btrfs_lockdep_keyset *ks;

        BUG_ON(level >= ARRAY_SIZE(ks->keys));

        /* find the matching keyset, id 0 is the default entry */
        for (ks = btrfs_lockdep_keysets; ks->id; ks++)
                if (ks->id == objectid)
                        break;

        lockdep_set_class_and_name(&eb->lock,
                                   &ks->keys[level], ks->names[level]);
}

#endif
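
/*
 * Illustrative sketch (informational comment, not part of this file's
 * logic): how a freshly read tree block typically gets its lockdep class.
 * The owner objectid stored in the header selects the keyset and the
 * level selects the per-level key:
 *
 *	int level = btrfs_header_level(eb);
 *
 *	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb, level);
 *
 * This mirrors the call made from btrfs_validate_metadata_buffer()
 * further down in this file.
 */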

/*
 * Compute the csum of a btree block and store the result to provided buffer.
 */
static void csum_tree_block(struct extent_buffer *buf, u8 *result)
{
        struct btrfs_fs_info *fs_info = buf->fs_info;
        const int num_pages = fs_info->nodesize >> PAGE_SHIFT;
        SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
        char *kaddr;
        int i;

        shash->tfm = fs_info->csum_shash;
        crypto_shash_init(shash);
        kaddr = page_address(buf->pages[0]);
        crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
                            PAGE_SIZE - BTRFS_CSUM_SIZE);

        for (i = 1; i < num_pages; i++) {
                kaddr = page_address(buf->pages[i]);
                crypto_shash_update(shash, kaddr, PAGE_SIZE);
        }
        memset(result, 0, BTRFS_CSUM_SIZE);
        crypto_shash_final(shash, result);
}
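
/*
 * Layout sketch for the checksum above (informational comment only):
 * the first BTRFS_CSUM_SIZE bytes of a tree block hold the checksum
 * itself, so hashing skips them on the first page:
 *
 *	page 0:      [ csum | rest of header + items ... ]  hashed from
 *	                                                     BTRFS_CSUM_SIZE
 *	page 1..n-1: [ items ... ]                           hashed fully
 *
 * The result is written back into those leading bytes by the callers,
 * e.g. csum_dirty_buffer() below.
 */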

/*
 * we can't consider a given block up to date unless the transid of the
 * block matches the transid in the parent node's pointer.  This is how we
 * detect blocks that either didn't get written at all or got written
 * in the wrong place.
 */
static int verify_parent_transid(struct extent_io_tree *io_tree,
                                 struct extent_buffer *eb, u64 parent_transid,
                                 int atomic)
{
        struct extent_state *cached_state = NULL;
        int ret;
        bool need_lock = (current->journal_info == BTRFS_SEND_TRANS_STUB);

        if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
                return 0;

        if (atomic)
                return -EAGAIN;

        if (need_lock) {
                btrfs_tree_read_lock(eb);
                btrfs_set_lock_blocking_read(eb);
        }

        lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
                         &cached_state);
        if (extent_buffer_uptodate(eb) &&
            btrfs_header_generation(eb) == parent_transid) {
                ret = 0;
                goto out;
        }
        btrfs_err_rl(eb->fs_info,
                "parent transid verify failed on %llu wanted %llu found %llu",
                        eb->start,
                        parent_transid, btrfs_header_generation(eb));
        ret = 1;

        /*
         * Things reading via commit roots that don't have normal protection,
         * like send, can have a really old block in cache that may point at a
         * block that has been freed and re-allocated.  So don't clear uptodate
         * if we find an eb that is under IO (dirty/writeback) because we could
         * end up reading in the stale data and then writing it back out and
         * making everybody very sad.
         */
        if (!extent_buffer_under_io(eb))
                clear_extent_buffer_uptodate(eb);
out:
        unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
                             &cached_state);
        if (need_lock)
                btrfs_tree_read_unlock_blocking(eb);
        return ret;
}
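
/*
 * Usage sketch (illustrative): callers pair this check with a mirror
 * retry, roughly:
 *
 *	if (!read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num) &&
 *	    verify_parent_transid(io_tree, eb, parent_transid, 0))
 *		ret = -EIO;	// try the next mirror, if any
 *
 * which is what btree_read_extent_buffer_pages() below does.
 */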

static bool btrfs_supported_super_csum(u16 csum_type)
{
        switch (csum_type) {
        case BTRFS_CSUM_TYPE_CRC32:
        case BTRFS_CSUM_TYPE_XXHASH:
        case BTRFS_CSUM_TYPE_SHA256:
        case BTRFS_CSUM_TYPE_BLAKE2:
                return true;
        default:
                return false;
        }
}

/*
 * Return 0 if the superblock checksum type matches the checksum value of that
 * algorithm. Pass the raw disk superblock data.
 */
static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
                                  char *raw_disk_sb)
{
        struct btrfs_super_block *disk_sb =
                (struct btrfs_super_block *)raw_disk_sb;
        char result[BTRFS_CSUM_SIZE];
        SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);

        shash->tfm = fs_info->csum_shash;

        /*
         * The super_block structure does not span the whole
         * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is
         * filled with zeros and is included in the checksum.
         */
        crypto_shash_digest(shash, raw_disk_sb + BTRFS_CSUM_SIZE,
                            BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result);

        if (memcmp(disk_sb->csum, result, fs_info->csum_size))
                return 1;

        return 0;
}
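
/*
 * Illustrative mount-time usage (sketch only; assumes a buffer already
 * read from a superblock mirror):
 *
 *	char *raw = ...;	// BTRFS_SUPER_INFO_SIZE bytes from disk
 *
 *	if (btrfs_check_super_csum(fs_info, raw)) {
 *		btrfs_err(fs_info, "superblock checksum mismatch");
 *		return -EINVAL;
 *	}
 *
 * The csum_shash tfm must already be allocated for the csum type found
 * in the superblock before this can run.
 */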

int btrfs_verify_level_key(struct extent_buffer *eb, int level,
                           struct btrfs_key *first_key, u64 parent_transid)
{
        struct btrfs_fs_info *fs_info = eb->fs_info;
        int found_level;
        struct btrfs_key found_key;
        int ret;

        found_level = btrfs_header_level(eb);
        if (found_level != level) {
                WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
                     KERN_ERR "BTRFS: tree level check failed\n");
                btrfs_err(fs_info,
"tree level mismatch detected, bytenr=%llu level expected=%u has=%u",
                          eb->start, level, found_level);
                return -EIO;
        }

        if (!first_key)
                return 0;

        /*
         * For live tree blocks (new tree blocks in the current transaction),
         * we need proper lock context to avoid races, which is impossible
         * here.  So we only check tree blocks that were read from disk,
         * whose generation <= fs_info->last_trans_committed.
         */
        if (btrfs_header_generation(eb) > fs_info->last_trans_committed)
                return 0;

        /* We have @first_key, so this @eb must have at least one item */
        if (btrfs_header_nritems(eb) == 0) {
                btrfs_err(fs_info,
                "invalid tree nritems, bytenr=%llu nritems=0 expect >0",
                          eb->start);
                WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
                return -EUCLEAN;
        }

        if (found_level)
                btrfs_node_key_to_cpu(eb, &found_key, 0);
        else
                btrfs_item_key_to_cpu(eb, &found_key, 0);
        ret = btrfs_comp_cpu_keys(first_key, &found_key);

        if (ret) {
                WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
                     KERN_ERR "BTRFS: tree first key check failed\n");
                btrfs_err(fs_info,
"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
                          eb->start, parent_transid, first_key->objectid,
                          first_key->type, first_key->offset,
                          found_key.objectid, found_key.type,
                          found_key.offset);
        }
        return ret;
}

/*
 * helper to read a given tree block, doing retries as required when
 * the checksums don't match and we have alternate mirrors to try.
 *
 * @parent_transid:     expected transid, skip check if 0
 * @level:              expected level, mandatory check
 * @first_key:          expected key of first slot, skip check if NULL
 */
static int btree_read_extent_buffer_pages(struct extent_buffer *eb,
                                          u64 parent_transid, int level,
                                          struct btrfs_key *first_key)
{
        struct btrfs_fs_info *fs_info = eb->fs_info;
        struct extent_io_tree *io_tree;
        int failed = 0;
        int ret;
        int num_copies = 0;
        int mirror_num = 0;
        int failed_mirror = 0;

        io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
        while (1) {
                clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
                ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num);
                if (!ret) {
                        if (verify_parent_transid(io_tree, eb,
                                                   parent_transid, 0))
                                ret = -EIO;
                        else if (btrfs_verify_level_key(eb, level,
                                                first_key, parent_transid))
                                ret = -EUCLEAN;
                        else
                                break;
                }

                num_copies = btrfs_num_copies(fs_info,
                                              eb->start, eb->len);
                if (num_copies == 1)
                        break;

                if (!failed_mirror) {
                        failed = 1;
                        failed_mirror = eb->read_mirror;
                }

                mirror_num++;
                if (mirror_num == failed_mirror)
                        mirror_num++;

                if (mirror_num > num_copies)
                        break;
        }

        if (failed && !ret && failed_mirror)
                btrfs_repair_eb_io_failure(eb, failed_mirror);

        return ret;
}
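
/*
 * Mirror numbering sketch for the retry loop above (informational):
 * mirror_num 0 means "let the lower layer pick", 1..num_copies address
 * specific copies.  With num_copies == 2 and the first read failing on
 * copy 1, the loop walks:
 *
 *	mirror 0 -> fails, failed_mirror = 1 (from eb->read_mirror)
 *	mirror 1 -> equals failed_mirror, skipped
 *	mirror 2 -> reads the second copy
 *
 * and on later success the bad mirror is rewritten via
 * btrfs_repair_eb_io_failure().
 */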

/*
 * checksum a dirty tree block before IO.  This has extra checks to make sure
 * we only fill in the checksum field in the first page of a multi-page block
 */

static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
{
        u64 start = page_offset(page);
        u64 found_start;
        u8 result[BTRFS_CSUM_SIZE];
        struct extent_buffer *eb;
        int ret;

        eb = (struct extent_buffer *)page->private;
        if (page != eb->pages[0])
                return 0;

        found_start = btrfs_header_bytenr(eb);
        /*
         * Please do not consolidate these warnings into a single if.
         * It is useful to know what went wrong.
         */
        if (WARN_ON(found_start != start))
                return -EUCLEAN;
        if (WARN_ON(!PageUptodate(page)))
                return -EUCLEAN;

        ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
                                    offsetof(struct btrfs_header, fsid),
                                    BTRFS_FSID_SIZE) == 0);

        csum_tree_block(eb, result);

        if (btrfs_header_level(eb))
                ret = btrfs_check_node(eb);
        else
                ret = btrfs_check_leaf_full(eb);

        if (ret < 0) {
                btrfs_print_tree(eb, 0);
                btrfs_err(fs_info,
                "block=%llu write time tree block corruption detected",
                          eb->start);
                WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
                return ret;
        }
        write_extent_buffer(eb, result, 0, fs_info->csum_size);

        return 0;
}

static int check_tree_block_fsid(struct extent_buffer *eb)
{
        struct btrfs_fs_info *fs_info = eb->fs_info;
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
        u8 fsid[BTRFS_FSID_SIZE];
        u8 *metadata_uuid;

        read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid),
                           BTRFS_FSID_SIZE);
        /*
         * Checking the incompat flag is only valid for the current fs. For
         * seed devices it's forbidden to have their uuid changed so reading
         * ->fsid in this case is fine
         */
        if (btrfs_fs_incompat(fs_info, METADATA_UUID))
                metadata_uuid = fs_devices->metadata_uuid;
        else
                metadata_uuid = fs_devices->fsid;

        if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE))
                return 0;

        list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
                if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE))
                        return 0;

        return 1;
}

int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset,
                                   struct page *page, u64 start, u64 end,
                                   int mirror)
{
        u64 found_start;
        int found_level;
        struct extent_buffer *eb;
        struct btrfs_fs_info *fs_info;
        u16 csum_size;
        int ret = 0;
        u8 result[BTRFS_CSUM_SIZE];
        int reads_done;

        if (!page->private)
                goto out;

        eb = (struct extent_buffer *)page->private;
        fs_info = eb->fs_info;
        csum_size = fs_info->csum_size;

        /*
         * The pending IO might have been the only thing that kept this
         * buffer in memory.  Make sure we have a ref for all these other
         * checks.
         */
        atomic_inc(&eb->refs);

        reads_done = atomic_dec_and_test(&eb->io_pages);
        if (!reads_done)
                goto err;

        eb->read_mirror = mirror;
        if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
                ret = -EIO;
                goto err;
        }

        found_start = btrfs_header_bytenr(eb);
        if (found_start != eb->start) {
                btrfs_err_rl(fs_info, "bad tree block start, want %llu have %llu",
                             eb->start, found_start);
                ret = -EIO;
                goto err;
        }
        if (check_tree_block_fsid(eb)) {
                btrfs_err_rl(fs_info, "bad fsid on block %llu",
                             eb->start);
                ret = -EIO;
                goto err;
        }
        found_level = btrfs_header_level(eb);
        if (found_level >= BTRFS_MAX_LEVEL) {
                btrfs_err(fs_info, "bad tree block level %d on %llu",
                          (int)btrfs_header_level(eb), eb->start);
                ret = -EIO;
                goto err;
        }

        btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
                                       eb, found_level);

        csum_tree_block(eb, result);

        if (memcmp_extent_buffer(eb, result, 0, csum_size)) {
                u8 val[BTRFS_CSUM_SIZE] = { 0 };

                read_extent_buffer(eb, &val, 0, csum_size);
                btrfs_warn_rl(fs_info,
        "%s checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d",
                              fs_info->sb->s_id, eb->start,
                              CSUM_FMT_VALUE(csum_size, val),
                              CSUM_FMT_VALUE(csum_size, result),
                              btrfs_header_level(eb));
                ret = -EUCLEAN;
                goto err;
        }

        /*
         * If this is a leaf block and it is corrupt, set the corrupt bit so
         * that we don't try and read the other copies of this block, just
         * return -EIO.
         */
        if (found_level == 0 && btrfs_check_leaf_full(eb)) {
                set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
                ret = -EIO;
        }

        if (found_level > 0 && btrfs_check_node(eb))
                ret = -EIO;

        if (!ret)
                set_extent_buffer_uptodate(eb);
        else
                btrfs_err(fs_info,
                          "block=%llu read time tree block corruption detected",
                          eb->start);
err:
        if (reads_done &&
            test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
                btree_readahead_hook(eb, ret);

        if (ret) {
                /*
                 * Our io error hook is going to dec the io pages again,
                 * we have to make sure it has something to decrement.
                 */
                atomic_inc(&eb->io_pages);
                clear_extent_buffer_uptodate(eb);
        }
        free_extent_buffer(eb);
out:
        return ret;
}

static void end_workqueue_bio(struct bio *bio)
{
        struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
        struct btrfs_fs_info *fs_info;
        struct btrfs_workqueue *wq;

        fs_info = end_io_wq->info;
        end_io_wq->status = bio->bi_status;

        if (bio_op(bio) == REQ_OP_WRITE) {
                if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
                        wq = fs_info->endio_meta_write_workers;
                else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
                        wq = fs_info->endio_freespace_worker;
                else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
                        wq = fs_info->endio_raid56_workers;
                else
                        wq = fs_info->endio_write_workers;
        } else {
                if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
                        wq = fs_info->endio_raid56_workers;
                else if (end_io_wq->metadata)
                        wq = fs_info->endio_meta_workers;
                else
                        wq = fs_info->endio_workers;
        }

        btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
        btrfs_queue_work(wq, &end_io_wq->work);
}

blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
                        enum btrfs_wq_endio_type metadata)
{
        struct btrfs_end_io_wq *end_io_wq;

        end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
        if (!end_io_wq)
                return BLK_STS_RESOURCE;

        end_io_wq->private = bio->bi_private;
        end_io_wq->end_io = bio->bi_end_io;
        end_io_wq->info = info;
        end_io_wq->status = 0;
        end_io_wq->bio = bio;
        end_io_wq->metadata = metadata;

        bio->bi_private = end_io_wq;
        bio->bi_end_io = end_workqueue_bio;
        return 0;
}
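
/*
 * Hand-off sketch for the function above (informational): the caller's
 * completion context is parked in the btrfs_end_io_wq and restored by
 * end_workqueue_fn() once the work item runs:
 *
 *	submit:    bio->bi_end_io = end_workqueue_bio (original end_io saved)
 *	IRQ time:  end_workqueue_bio() only picks a workqueue, queues work
 *	task time: end_workqueue_fn() restores bi_private/bi_end_io and
 *	           runs the original completion
 *
 * This keeps checksum verification out of interrupt context.
 */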

static void run_one_async_start(struct btrfs_work *work)
{
        struct async_submit_bio *async;
        blk_status_t ret;

        async = container_of(work, struct async_submit_bio, work);
        ret = async->submit_bio_start(async->inode, async->bio, async->bio_offset);
        if (ret)
                async->status = ret;
}

/*
 * In order to insert checksums into the metadata in large chunks, we wait
 * until bio submission time.  All the pages in the bio are checksummed and
 * sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record are
 * inserted into the tree.
 */
static void run_one_async_done(struct btrfs_work *work)
{
        struct async_submit_bio *async;
        struct inode *inode;
        blk_status_t ret;

        async = container_of(work, struct async_submit_bio, work);
        inode = async->inode;

        /* If an error occurred we just want to clean up the bio and move on */
        if (async->status) {
                async->bio->bi_status = async->status;
                bio_endio(async->bio);
                return;
        }

        /*
         * All of the bios that pass through here are from async helpers.
         * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context.
         * This changes nothing when cgroups aren't in use.
         */
        async->bio->bi_opf |= REQ_CGROUP_PUNT;
        ret = btrfs_map_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num);
        if (ret) {
                async->bio->bi_status = ret;
                bio_endio(async->bio);
        }
}

static void run_one_async_free(struct btrfs_work *work)
{
        struct async_submit_bio *async;

        async = container_of(work, struct async_submit_bio, work);
        kfree(async);
}

blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
                                 int mirror_num, unsigned long bio_flags,
                                 u64 bio_offset,
                                 extent_submit_bio_start_t *submit_bio_start)
{
        struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
        struct async_submit_bio *async;

        async = kmalloc(sizeof(*async), GFP_NOFS);
        if (!async)
                return BLK_STS_RESOURCE;

        async->inode = inode;
        async->bio = bio;
        async->mirror_num = mirror_num;
        async->submit_bio_start = submit_bio_start;

        btrfs_init_work(&async->work, run_one_async_start, run_one_async_done,
                        run_one_async_free);

        async->bio_offset = bio_offset;

        async->status = 0;

        if (op_is_sync(bio->bi_opf))
                btrfs_set_work_high_priority(&async->work);

        btrfs_queue_work(fs_info->workers, &async->work);
        return 0;
}
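
/*
 * Life cycle sketch of an async_submit_bio (informational):
 *
 *	btrfs_wq_submit_bio()   allocates and queues on fs_info->workers
 *	run_one_async_start()   checksums via submit_bio_start()
 *	run_one_async_done()    maps and submits, or ends the bio on error
 *	run_one_async_free()    frees the tracking struct
 *
 * High priority is requested for sync bios (op_is_sync()) so synchronous
 * writers are not queued behind background checksumming work.
 */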

static blk_status_t btree_csum_one_bio(struct bio *bio)
{
        struct bio_vec *bvec;
        struct btrfs_root *root;
        int ret = 0;
        struct bvec_iter_all iter_all;

        ASSERT(!bio_flagged(bio, BIO_CLONED));
        bio_for_each_segment_all(bvec, bio, iter_all) {
                root = BTRFS_I(bvec->bv_page->mapping->host)->root;
                ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
                if (ret)
                        break;
        }

        return errno_to_blk_status(ret);
}

static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio,
                                           u64 bio_offset)
{
        /*
         * When we're called for a write, we're already in the async
         * submission context.  Just checksum the bio here; the mapping and
         * actual submission happen later in run_one_async_done().
         */
        return btree_csum_one_bio(bio);
}

static int check_async_write(struct btrfs_fs_info *fs_info,
                             struct btrfs_inode *bi)
{
        if (atomic_read(&bi->sync_writers))
                return 0;
        if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
                return 0;
        return 1;
}

blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
                                       int mirror_num, unsigned long bio_flags)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        int async = check_async_write(fs_info, BTRFS_I(inode));
        blk_status_t ret;

        if (bio_op(bio) != REQ_OP_WRITE) {
                /*
                 * called for a read, do the setup so that checksum validation
                 * can happen in the async kernel threads
                 */
                ret = btrfs_bio_wq_end_io(fs_info, bio,
                                          BTRFS_WQ_ENDIO_METADATA);
                if (ret)
                        goto out_w_error;
                ret = btrfs_map_bio(fs_info, bio, mirror_num);
        } else if (!async) {
                ret = btree_csum_one_bio(bio);
                if (ret)
                        goto out_w_error;
                ret = btrfs_map_bio(fs_info, bio, mirror_num);
        } else {
                /*
                 * kthread helpers are used to submit writes so that
                 * checksumming can happen in parallel across all CPUs
                 */
                ret = btrfs_wq_submit_bio(inode, bio, mirror_num, 0,
                                          0, btree_submit_bio_start);
        }

        if (ret)
                goto out_w_error;
        return 0;

out_w_error:
        bio->bi_status = ret;
        bio_endio(bio);
        return ret;
}
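
/*
 * Submission path summary for the function above (informational):
 *
 *	read                        -> set up wq end-io, btrfs_map_bio()
 *	write, csum impl is fast    -> checksum inline, btrfs_map_bio()
 *	  or a sync writer waits
 *	write, otherwise            -> btrfs_wq_submit_bio() so checksumming
 *	                               runs on the worker threads
 */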

#ifdef CONFIG_MIGRATION
static int btree_migratepage(struct address_space *mapping,
                        struct page *newpage, struct page *page,
                        enum migrate_mode mode)
{
        /*
         * we can't safely write a btree page from here,
         * we haven't done the locking hook
         */
        if (PageDirty(page))
                return -EAGAIN;
        /*
         * Buffers may be managed in a filesystem specific way.
         * We must have no buffers or drop them.
         */
        if (page_has_private(page) &&
            !try_to_release_page(page, GFP_KERNEL))
                return -EAGAIN;
        return migrate_page(mapping, newpage, page, mode);
}
#endif

static int btree_writepages(struct address_space *mapping,
                            struct writeback_control *wbc)
{
        struct btrfs_fs_info *fs_info;
        int ret;

        if (wbc->sync_mode == WB_SYNC_NONE) {
                if (wbc->for_kupdate)
                        return 0;

                fs_info = BTRFS_I(mapping->host)->root->fs_info;
                /* this is a bit racy, but that's ok */
                ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
                                             BTRFS_DIRTY_METADATA_THRESH,
                                             fs_info->dirty_metadata_batch);
                if (ret < 0)
                        return 0;
        }
        return btree_write_cache_pages(mapping, wbc);
}

static int btree_releasepage(struct page *page, gfp_t gfp_flags)
{
        if (PageWriteback(page) || PageDirty(page))
                return 0;

        return try_release_extent_buffer(page);
}

static void btree_invalidatepage(struct page *page, unsigned int offset,
                                 unsigned int length)
{
        struct extent_io_tree *tree;

        tree = &BTRFS_I(page->mapping->host)->io_tree;
        extent_invalidatepage(tree, page, offset);
        btree_releasepage(page, GFP_NOFS);
        if (PagePrivate(page)) {
                btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info,
                           "page private not zero on page %llu",
                           (unsigned long long)page_offset(page));
                detach_page_private(page);
        }
}

static int btree_set_page_dirty(struct page *page)
{
#ifdef DEBUG
        struct extent_buffer *eb;

        BUG_ON(!PagePrivate(page));
        eb = (struct extent_buffer *)page->private;
        BUG_ON(!eb);
        BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
        BUG_ON(!atomic_read(&eb->refs));
        btrfs_assert_tree_locked(eb);
#endif
        return __set_page_dirty_nobuffers(page);
}

static const struct address_space_operations btree_aops = {
        .writepages     = btree_writepages,
        .releasepage    = btree_releasepage,
        .invalidatepage = btree_invalidatepage,
#ifdef CONFIG_MIGRATION
        .migratepage    = btree_migratepage,
#endif
        .set_page_dirty = btree_set_page_dirty,
};

void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr)
{
        struct extent_buffer *buf = NULL;
        int ret;

        buf = btrfs_find_create_tree_block(fs_info, bytenr);
        if (IS_ERR(buf))
                return;

        ret = read_extent_buffer_pages(buf, WAIT_NONE, 0);
        if (ret < 0)
                free_extent_buffer_stale(buf);
        else
                free_extent_buffer(buf);
}

struct extent_buffer *btrfs_find_create_tree_block(
                                                struct btrfs_fs_info *fs_info,
                                                u64 bytenr)
{
        if (btrfs_is_testing(fs_info))
                return alloc_test_extent_buffer(fs_info, bytenr);
        return alloc_extent_buffer(fs_info, bytenr);
}

/*
 * Read tree block at logical address @bytenr and do various basic but
 * critical verifications.
 *
 * @parent_transid:     expected transid of this tree block, skip check if 0
 * @level:              expected level, mandatory check
 * @first_key:          expected key in slot 0, skip check if NULL
 */
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
                                      u64 parent_transid, int level,
                                      struct btrfs_key *first_key)
{
        struct extent_buffer *buf = NULL;
        int ret;

        buf = btrfs_find_create_tree_block(fs_info, bytenr);
        if (IS_ERR(buf))
                return buf;

        ret = btree_read_extent_buffer_pages(buf, parent_transid,
                                             level, first_key);
        if (ret) {
                free_extent_buffer_stale(buf);
                return ERR_PTR(ret);
        }
        return buf;
}

void btrfs_clean_tree_block(struct extent_buffer *buf)
{
        struct btrfs_fs_info *fs_info = buf->fs_info;

        if (btrfs_header_generation(buf) ==
            fs_info->running_transaction->transid) {
                btrfs_assert_tree_locked(buf);

                if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
                        percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
                                                 -buf->len,
                                                 fs_info->dirty_metadata_batch);
                        /* ugh, clear_extent_buffer_dirty needs to lock the page */
                        btrfs_set_lock_blocking_write(buf);
                        clear_extent_buffer_dirty(buf);
                }
        }
}

static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
                         u64 objectid)
{
        bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);

        root->fs_info = fs_info;
        root->node = NULL;
        root->commit_root = NULL;
        root->state = 0;
        root->orphan_cleanup_state = 0;

        root->last_trans = 0;
        root->highest_objectid = 0;
        root->nr_delalloc_inodes = 0;
        root->nr_ordered_extents = 0;
        root->inode_tree = RB_ROOT;
        INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
        root->block_rsv = NULL;

        INIT_LIST_HEAD(&root->dirty_list);
        INIT_LIST_HEAD(&root->root_list);
        INIT_LIST_HEAD(&root->delalloc_inodes);
        INIT_LIST_HEAD(&root->delalloc_root);
        INIT_LIST_HEAD(&root->ordered_extents);
        INIT_LIST_HEAD(&root->ordered_root);
        INIT_LIST_HEAD(&root->reloc_dirty_list);
        INIT_LIST_HEAD(&root->logged_list[0]);
        INIT_LIST_HEAD(&root->logged_list[1]);
        spin_lock_init(&root->inode_lock);
        spin_lock_init(&root->delalloc_lock);
        spin_lock_init(&root->ordered_extent_lock);
        spin_lock_init(&root->accounting_lock);
        spin_lock_init(&root->log_extents_lock[0]);
        spin_lock_init(&root->log_extents_lock[1]);
        spin_lock_init(&root->qgroup_meta_rsv_lock);
        mutex_init(&root->objectid_mutex);
        mutex_init(&root->log_mutex);
        mutex_init(&root->ordered_extent_mutex);
        mutex_init(&root->delalloc_mutex);
        init_waitqueue_head(&root->qgroup_flush_wait);
        init_waitqueue_head(&root->log_writer_wait);
        init_waitqueue_head(&root->log_commit_wait[0]);
        init_waitqueue_head(&root->log_commit_wait[1]);
        INIT_LIST_HEAD(&root->log_ctxs[0]);
        INIT_LIST_HEAD(&root->log_ctxs[1]);
        atomic_set(&root->log_commit[0], 0);
        atomic_set(&root->log_commit[1], 0);
        atomic_set(&root->log_writers, 0);
        atomic_set(&root->log_batch, 0);
        refcount_set(&root->refs, 1);
        atomic_set(&root->snapshot_force_cow, 0);
        atomic_set(&root->nr_swapfiles, 0);
        root->log_transid = 0;
        root->log_transid_committed = -1;
        root->last_log_commit = 0;
        if (!dummy) {
                extent_io_tree_init(fs_info, &root->dirty_log_pages,
                                    IO_TREE_ROOT_DIRTY_LOG_PAGES, NULL);
                extent_io_tree_init(fs_info, &root->log_csum_range,
                                    IO_TREE_LOG_CSUM_RANGE, NULL);
        }

        memset(&root->root_key, 0, sizeof(root->root_key));
        memset(&root->root_item, 0, sizeof(root->root_item));
        memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
        root->root_key.objectid = objectid;
        root->anon_dev = 0;

        spin_lock_init(&root->root_item_lock);
        btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
#ifdef CONFIG_BTRFS_DEBUG
        INIT_LIST_HEAD(&root->leak_list);
        spin_lock(&fs_info->fs_roots_radix_lock);
        list_add_tail(&root->leak_list, &fs_info->allocated_roots);
        spin_unlock(&fs_info->fs_roots_radix_lock);
#endif
}

static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
                                           u64 objectid, gfp_t flags)
{
        struct btrfs_root *root = kzalloc(sizeof(*root), flags);

        if (root)
                __setup_root(root, fs_info, objectid);
        return root;
}

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/* Should only be used by the testing infrastructure */
struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info)
{
        struct btrfs_root *root;

        if (!fs_info)
                return ERR_PTR(-EINVAL);

        root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, GFP_KERNEL);
        if (!root)
                return ERR_PTR(-ENOMEM);

        /* We don't use the stripesize in selftest, set it as sectorsize */
        root->alloc_bytenr = 0;

        return root;
}
#endif

struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
                                     u64 objectid)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct extent_buffer *leaf;
        struct btrfs_root *tree_root = fs_info->tree_root;
        struct btrfs_root *root;
        struct btrfs_key key;
        unsigned int nofs_flag;
        int ret = 0;

        /*
         * We're holding a transaction handle, so use a NOFS memory allocation
         * context to avoid deadlock if reclaim happens.
         */
        nofs_flag = memalloc_nofs_save();
        root = btrfs_alloc_root(fs_info, objectid, GFP_KERNEL);
        memalloc_nofs_restore(nofs_flag);
        if (!root)
                return ERR_PTR(-ENOMEM);

        root->root_key.objectid = objectid;
        root->root_key.type = BTRFS_ROOT_ITEM_KEY;
        root->root_key.offset = 0;

        leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
                                      BTRFS_NESTING_NORMAL);
        if (IS_ERR(leaf)) {
                ret = PTR_ERR(leaf);
                leaf = NULL;
                goto fail;
        }

        root->node = leaf;
        btrfs_mark_buffer_dirty(leaf);

        root->commit_root = btrfs_root_node(root);
        set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);

        btrfs_set_root_flags(&root->root_item, 0);
        btrfs_set_root_limit(&root->root_item, 0);
        btrfs_set_root_bytenr(&root->root_item, leaf->start);
        btrfs_set_root_generation(&root->root_item, trans->transid);
        btrfs_set_root_level(&root->root_item, 0);
        btrfs_set_root_refs(&root->root_item, 1);
        btrfs_set_root_used(&root->root_item, leaf->len);
        btrfs_set_root_last_snapshot(&root->root_item, 0);
        btrfs_set_root_dirid(&root->root_item, 0);
        if (is_fstree(objectid))
                generate_random_guid(root->root_item.uuid);
        else
                export_guid(root->root_item.uuid, &guid_null);
        btrfs_set_root_drop_level(&root->root_item, 0);

        key.objectid = objectid;
        key.type = BTRFS_ROOT_ITEM_KEY;
        key.offset = 0;
        ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item);
        if (ret)
                goto fail;

        btrfs_tree_unlock(leaf);

        return root;

fail:
        if (leaf)
                btrfs_tree_unlock(leaf);
        btrfs_put_root(root);

        return ERR_PTR(ret);
}
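
/*
 * Illustrative usage (sketch): this is how a new global tree is
 * typically brought to life inside a transaction, for example when the
 * free space tree is enabled:
 *
 *	struct btrfs_root *root;
 *
 *	root = btrfs_create_tree(trans, BTRFS_FREE_SPACE_TREE_OBJECTID);
 *	if (IS_ERR(root))
 *		return PTR_ERR(root);
 *
 * The caller owns the returned reference and must drop it with
 * btrfs_put_root() when done.
 */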

static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
                                         struct btrfs_fs_info *fs_info)
{
        struct btrfs_root *root;
        struct extent_buffer *leaf;

        root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS);
        if (!root)
                return ERR_PTR(-ENOMEM);

        root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
        root->root_key.type = BTRFS_ROOT_ITEM_KEY;
        root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;

        /*
         * DON'T set SHAREABLE bit for log trees.
         *
         * Log trees are not exposed to user space thus can't be snapshotted,
         * and they go away before a real commit is actually done.
         *
         * They do store pointers to file data extents, and those reference
         * counts still get updated (along with back refs to the log tree).
         */

        leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
                        NULL, 0, 0, 0, BTRFS_NESTING_NORMAL);
        if (IS_ERR(leaf)) {
                btrfs_put_root(root);
                return ERR_CAST(leaf);
        }

        root->node = leaf;

        btrfs_mark_buffer_dirty(root->node);
        btrfs_tree_unlock(root->node);
        return root;
}

int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
                             struct btrfs_fs_info *fs_info)
{
        struct btrfs_root *log_root;

        log_root = alloc_log_tree(trans, fs_info);
        if (IS_ERR(log_root))
                return PTR_ERR(log_root);
        WARN_ON(fs_info->log_root_tree);
        fs_info->log_root_tree = log_root;
        return 0;
}

int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_root *log_root;
        struct btrfs_inode_item *inode_item;

        log_root = alloc_log_tree(trans, fs_info);
        if (IS_ERR(log_root))
                return PTR_ERR(log_root);

        log_root->last_trans = trans->transid;
        log_root->root_key.offset = root->root_key.objectid;

        inode_item = &log_root->root_item.inode;
        btrfs_set_stack_inode_generation(inode_item, 1);
        btrfs_set_stack_inode_size(inode_item, 3);
        btrfs_set_stack_inode_nlink(inode_item, 1);
        btrfs_set_stack_inode_nbytes(inode_item,
                                     fs_info->nodesize);
        btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);

        btrfs_set_root_node(&log_root->root_item, log_root->node);

        WARN_ON(root->log_root);
        root->log_root = log_root;
        root->log_transid = 0;
        root->log_transid_committed = -1;
        root->last_log_commit = 0;
        return 0;
}

static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
                                              struct btrfs_path *path,
                                              struct btrfs_key *key)
{
        struct btrfs_root *root;
        struct btrfs_fs_info *fs_info = tree_root->fs_info;
        u64 generation;
        int ret;
        int level;

        root = btrfs_alloc_root(fs_info, key->objectid, GFP_NOFS);
        if (!root)
                return ERR_PTR(-ENOMEM);

        ret = btrfs_find_root(tree_root, key, path,
                              &root->root_item, &root->root_key);
        if (ret) {
                if (ret > 0)
                        ret = -ENOENT;
                goto fail;
        }

        generation = btrfs_root_generation(&root->root_item);
        level = btrfs_root_level(&root->root_item);
        root->node = read_tree_block(fs_info,
                                     btrfs_root_bytenr(&root->root_item),
                                     generation, level, NULL);
        if (IS_ERR(root->node)) {
                ret = PTR_ERR(root->node);
                root->node = NULL;
                goto fail;
        } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
                ret = -EIO;
                goto fail;
        }
        root->commit_root = btrfs_root_node(root);
        return root;
fail:
        btrfs_put_root(root);
        return ERR_PTR(ret);
}

struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
                                        struct btrfs_key *key)
{
        struct btrfs_root *root;
        struct btrfs_path *path;

        path = btrfs_alloc_path();
        if (!path)
                return ERR_PTR(-ENOMEM);
        root = read_tree_root_path(tree_root, path, key);
        btrfs_free_path(path);

        return root;
}

/*
 * Initialize subvolume root in-memory structure
 *
 * @anon_dev:   anonymous device to attach to the root, if zero, allocate new
 */
static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
{
        int ret;
        unsigned int nofs_flag;

        root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
        root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
                                        GFP_NOFS);
        if (!root->free_ino_pinned || !root->free_ino_ctl) {
                ret = -ENOMEM;
                goto fail;
        }

        /*
         * We might be called under a transaction (e.g. indirect backref
         * resolution) which could deadlock if it triggers memory reclaim.
         */
        nofs_flag = memalloc_nofs_save();
        ret = btrfs_drew_lock_init(&root->snapshot_lock);
        memalloc_nofs_restore(nofs_flag);
        if (ret)
                goto fail;

        if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
            root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
                set_bit(BTRFS_ROOT_SHAREABLE, &root->state);
                btrfs_check_and_init_root_item(&root->root_item);
        }

        btrfs_init_free_ino_ctl(root);
        spin_lock_init(&root->ino_cache_lock);
        init_waitqueue_head(&root->ino_cache_wait);

        /*
         * Don't assign anonymous block device to roots that are not exposed to
         * userspace, the id pool is limited to 1M
         */
        if (is_fstree(root->root_key.objectid) &&
            btrfs_root_refs(&root->root_item) > 0) {
                if (!anon_dev) {
                        ret = get_anon_bdev(&root->anon_dev);
                        if (ret)
                                goto fail;
                } else {
                        root->anon_dev = anon_dev;
                }
        }

        mutex_lock(&root->objectid_mutex);
        ret = btrfs_find_highest_objectid(root,
                                        &root->highest_objectid);
        if (ret) {
                mutex_unlock(&root->objectid_mutex);
                goto fail;
        }

        ASSERT(root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);

        mutex_unlock(&root->objectid_mutex);

        return 0;
fail:
        /* The caller is responsible to call btrfs_free_fs_root */
        return ret;
}

static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
                                               u64 root_id)
{
        struct btrfs_root *root;

        spin_lock(&fs_info->fs_roots_radix_lock);
        root = radix_tree_lookup(&fs_info->fs_roots_radix,
                                 (unsigned long)root_id);
        if (root)
                root = btrfs_grab_root(root);
        spin_unlock(&fs_info->fs_roots_radix_lock);
        return root;
}

static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
                                                u64 objectid)
{
        if (objectid == BTRFS_ROOT_TREE_OBJECTID)
                return btrfs_grab_root(fs_info->tree_root);
        if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
                return btrfs_grab_root(fs_info->extent_root);
        if (objectid == BTRFS_CHUNK_TREE_OBJECTID)
                return btrfs_grab_root(fs_info->chunk_root);
        if (objectid == BTRFS_DEV_TREE_OBJECTID)
                return btrfs_grab_root(fs_info->dev_root);
        if (objectid == BTRFS_CSUM_TREE_OBJECTID)
                return btrfs_grab_root(fs_info->csum_root);
        if (objectid == BTRFS_QUOTA_TREE_OBJECTID)
                return btrfs_grab_root(fs_info->quota_root) ?
                        fs_info->quota_root : ERR_PTR(-ENOENT);
        if (objectid == BTRFS_UUID_TREE_OBJECTID)
                return btrfs_grab_root(fs_info->uuid_root) ?
                        fs_info->uuid_root : ERR_PTR(-ENOENT);
        if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
                return btrfs_grab_root(fs_info->free_space_root) ?
                        fs_info->free_space_root : ERR_PTR(-ENOENT);
        return NULL;
}

int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
                         struct btrfs_root *root)
{
        int ret;

        ret = radix_tree_preload(GFP_NOFS);
        if (ret)
                return ret;

        spin_lock(&fs_info->fs_roots_radix_lock);
        ret = radix_tree_insert(&fs_info->fs_roots_radix,
                                (unsigned long)root->root_key.objectid,
                                root);
        if (ret == 0) {
                btrfs_grab_root(root);
                set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
        }
        spin_unlock(&fs_info->fs_roots_radix_lock);
        radix_tree_preload_end();

        return ret;
}
1474
1475 void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info)
1476 {
1477 #ifdef CONFIG_BTRFS_DEBUG
1478         struct btrfs_root *root;
1479
1480         while (!list_empty(&fs_info->allocated_roots)) {
1481                 char buf[BTRFS_ROOT_NAME_BUF_LEN];
1482
1483                 root = list_first_entry(&fs_info->allocated_roots,
1484                                         struct btrfs_root, leak_list);
1485                 btrfs_err(fs_info, "leaked root %s refcount %d",
1486                           btrfs_root_name(root->root_key.objectid, buf),
1487                           refcount_read(&root->refs));
1488                 while (refcount_read(&root->refs) > 1)
1489                         btrfs_put_root(root);
1490                 btrfs_put_root(root);
1491         }
1492 #endif
1493 }
1494
1495 void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
1496 {
1497         percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
1498         percpu_counter_destroy(&fs_info->delalloc_bytes);
1499         percpu_counter_destroy(&fs_info->dio_bytes);
1500         percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
1501         btrfs_free_csum_hash(fs_info);
1502         btrfs_free_stripe_hash_table(fs_info);
1503         btrfs_free_ref_cache(fs_info);
1504         kfree(fs_info->balance_ctl);
1505         kfree(fs_info->delayed_root);
1506         btrfs_put_root(fs_info->extent_root);
1507         btrfs_put_root(fs_info->tree_root);
1508         btrfs_put_root(fs_info->chunk_root);
1509         btrfs_put_root(fs_info->dev_root);
1510         btrfs_put_root(fs_info->csum_root);
1511         btrfs_put_root(fs_info->quota_root);
1512         btrfs_put_root(fs_info->uuid_root);
1513         btrfs_put_root(fs_info->free_space_root);
1514         btrfs_put_root(fs_info->fs_root);
1515         btrfs_put_root(fs_info->data_reloc_root);
1516         btrfs_check_leaked_roots(fs_info);
1517         btrfs_extent_buffer_leak_debug_check(fs_info);
1518         kfree(fs_info->super_copy);
1519         kfree(fs_info->super_for_commit);
1520         kvfree(fs_info);
1521 }
1522
1523
1524 /*
1525  * Get an in-memory reference of a root structure.
1526  *
1527  * For essential trees like root/extent tree, we grab it from fs_info directly.
1528  * For subvolume trees, we check the cached filesystem roots first. If not
1529  * found, then read it from disk and add it to cached fs roots.
1530  *
1531  * Caller should release the root by calling btrfs_put_root() after the usage.
1532  *
1533  * NOTE: Reloc and log trees can't be read by this function as they share the
1534  *       same root objectid.
1535  *
1536  * @objectid:   root id
1537  * @anon_dev:   preallocated anonymous block device number for new roots,
1538  *              pass 0 for new allocation.
1539  * @check_ref:  whether to check root item references; if true, return -ENOENT
1540  *              for orphan roots
1541  */
1542 static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
1543                                              u64 objectid, dev_t anon_dev,
1544                                              bool check_ref)
1545 {
1546         struct btrfs_root *root;
1547         struct btrfs_path *path;
1548         struct btrfs_key key;
1549         int ret;
1550
1551         root = btrfs_get_global_root(fs_info, objectid);
1552         if (root)
1553                 return root;
1554 again:
1555         root = btrfs_lookup_fs_root(fs_info, objectid);
1556         if (root) {
1557                 /* Shouldn't get preallocated anon_dev for cached roots */
1558                 ASSERT(!anon_dev);
1559                 if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
1560                         btrfs_put_root(root);
1561                         return ERR_PTR(-ENOENT);
1562                 }
1563                 return root;
1564         }
1565
1566         key.objectid = objectid;
1567         key.type = BTRFS_ROOT_ITEM_KEY;
1568         key.offset = (u64)-1;
1569         root = btrfs_read_tree_root(fs_info->tree_root, &key);
1570         if (IS_ERR(root))
1571                 return root;
1572
1573         if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
1574                 ret = -ENOENT;
1575                 goto fail;
1576         }
1577
1578         ret = btrfs_init_fs_root(root, anon_dev);
1579         if (ret)
1580                 goto fail;
1581
1582         path = btrfs_alloc_path();
1583         if (!path) {
1584                 ret = -ENOMEM;
1585                 goto fail;
1586         }
1587         key.objectid = BTRFS_ORPHAN_OBJECTID;
1588         key.type = BTRFS_ORPHAN_ITEM_KEY;
1589         key.offset = objectid;
1590
1591         ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
1592         btrfs_free_path(path);
1593         if (ret < 0)
1594                 goto fail;
1595         if (ret == 0)
1596                 set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
1597
1598         ret = btrfs_insert_fs_root(fs_info, root);
1599         if (ret) {
1600                 btrfs_put_root(root);
1601                 if (ret == -EEXIST)
1602                         goto again;
1603                 goto fail;
1604         }
1605         return root;
1606 fail:
1607         btrfs_put_root(root);
1608         return ERR_PTR(ret);
1609 }
1610
1611 /*
1612  * Get in-memory reference of a root structure
1613  *
1614  * @objectid:   tree objectid
1615  * @check_ref:  if set, verify that the tree exists and the item has at least
1616  *              one reference
1617  */
1618 struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
1619                                      u64 objectid, bool check_ref)
1620 {
1621         return btrfs_get_root_ref(fs_info, objectid, 0, check_ref);
1622 }
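
/*
 * Illustrative sketch (not part of the original file): typical usage of
 * btrfs_get_fs_root() by code that needs a subvolume root.  The wrapper
 * name is hypothetical.
 */
static inline int example_get_subvol_root(struct btrfs_fs_info *fs_info,
                                          u64 subvol_id)
{
        struct btrfs_root *root;

        root = btrfs_get_fs_root(fs_info, subvol_id, true);
        if (IS_ERR(root))
                return PTR_ERR(root);   /* e.g. -ENOENT for orphan roots */
        /* ... operate on the subvolume tree ... */
        btrfs_put_root(root);
        return 0;
}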
1623
1624 /*
1625  * Get an in-memory reference of a root structure that was created as new,
1626  * optionally passing a preallocated anonymous block device id
1627  *
1628  * @objectid:   tree objectid
1629  * @anon_dev:   if zero, allocate a new anonymous block device, otherwise
1630  *              use the passed-in value
1631  */
1632 struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
1633                                          u64 objectid, dev_t anon_dev)
1634 {
1635         return btrfs_get_root_ref(fs_info, objectid, anon_dev, true);
1636 }
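
/*
 * Illustrative sketch (not part of the original file): preallocating the
 * anonymous block device number before committing to the new root, as the
 * snapshot code does.  get_anon_bdev() is the generic VFS helper; the
 * wrapper below is hypothetical and elides error unwinding.
 */
static inline struct btrfs_root *example_new_root(struct btrfs_fs_info *fs_info,
                                                  u64 objectid)
{
        dev_t anon_dev;
        int ret;

        ret = get_anon_bdev(&anon_dev);
        if (ret)
                return ERR_PTR(ret);
        /*
         * On success the root owns anon_dev and releases it from
         * btrfs_put_root(); failure unwinding is elided here.
         */
        return btrfs_get_new_fs_root(fs_info, objectid, anon_dev);
}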
1637
1638 /*
1639  * btrfs_get_fs_root_commit_root - return a root for the given objectid
1640  * @fs_info:    the fs_info
1641  * @objectid:   the objectid we need to lookup
1642  *
1643  * This is exclusively used for backref walking, and exists specifically because
1644  * of how the qgroup code does lookups.  Qgroups will do a backref lookup at
1645  * delayed ref creation time, which means we may have to read the tree_root in
1646  * order to look up a fs root that is not in memory.  If the root is not in
1647  * memory we will read the tree root commit root and look up the fs root from
1648  * there.  This is a temporary root; it will not be inserted into the radix
1649  * tree as it doesn't have the most up-to-date information, and it will simply
1650  * be discarded once the backref code is finished using the root.
1651  */
1652 struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
1653                                                  struct btrfs_path *path,
1654                                                  u64 objectid)
1655 {
1656         struct btrfs_root *root;
1657         struct btrfs_key key;
1658
1659         ASSERT(path->search_commit_root && path->skip_locking);
1660
1661         /*
1662          * This can return -ENOENT if we ask for a root that doesn't exist, but
1663          * since this is called via the backref walking code we won't be looking
1664          * up a root that doesn't exist, unless there's corruption.  So if root
1665          * != NULL just return it.
1666          */
1667         root = btrfs_get_global_root(fs_info, objectid);
1668         if (root)
1669                 return root;
1670
1671         root = btrfs_lookup_fs_root(fs_info, objectid);
1672         if (root)
1673                 return root;
1674
1675         key.objectid = objectid;
1676         key.type = BTRFS_ROOT_ITEM_KEY;
1677         key.offset = (u64)-1;
1678         root = read_tree_root_path(fs_info->tree_root, path, &key);
1679         btrfs_release_path(path);
1680
1681         return root;
1682 }
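
/*
 * Illustrative sketch (not part of the original file): satisfying the
 * ASSERT above from backref-walking style code.  The path must be set up
 * for a commit-root, lock-free search before the call; the wrapper name
 * is hypothetical.
 */
static inline void example_commit_root_lookup(struct btrfs_fs_info *fs_info,
                                              struct btrfs_path *path,
                                              u64 objectid)
{
        struct btrfs_root *root;

        path->search_commit_root = 1;
        path->skip_locking = 1;
        root = btrfs_get_fs_root_commit_root(fs_info, path, objectid);
        if (!IS_ERR(root))
                btrfs_put_root(root);
}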
1683
1684 /*
1685  * Called by the kthread helper functions to finally call the bio end_io
1686  * functions.  This is where read checksum verification actually happens.
1687  */
1688 static void end_workqueue_fn(struct btrfs_work *work)
1689 {
1690         struct bio *bio;
1691         struct btrfs_end_io_wq *end_io_wq;
1692
1693         end_io_wq = container_of(work, struct btrfs_end_io_wq, work);
1694         bio = end_io_wq->bio;
1695
1696         bio->bi_status = end_io_wq->status;
1697         bio->bi_private = end_io_wq->private;
1698         bio->bi_end_io = end_io_wq->end_io;
1699         bio_endio(bio);
1700         kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
1701 }
1702
1703 static int cleaner_kthread(void *arg)
1704 {
1705         struct btrfs_root *root = arg;
1706         struct btrfs_fs_info *fs_info = root->fs_info;
1707         int again;
1708
1709         while (1) {
1710                 again = 0;
1711
1712                 set_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
1713
1714                 /* Make the cleaner go to sleep early. */
1715                 if (btrfs_need_cleaner_sleep(fs_info))
1716                         goto sleep;
1717
1718                 /*
1719                  * Do not do anything if we might cause open_ctree() to block
1720                  * before we have finished mounting the filesystem.
1721                  */
1722                 if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
1723                         goto sleep;
1724
1725                 if (!mutex_trylock(&fs_info->cleaner_mutex))
1726                         goto sleep;
1727
1728                 /*
1729                  * Recheck in case the status of the fs changed between the
1730                  * check above and the trylock.
1731                  */
1732                 if (btrfs_need_cleaner_sleep(fs_info)) {
1733                         mutex_unlock(&fs_info->cleaner_mutex);
1734                         goto sleep;
1735                 }
1736
1737                 btrfs_run_delayed_iputs(fs_info);
1738
1739                 again = btrfs_clean_one_deleted_snapshot(root);
1740                 mutex_unlock(&fs_info->cleaner_mutex);
1741
1742                 /*
1743                  * The defragger has dealt with the R/O remount and umount,
1744                  * so we needn't do anything special here.
1745                  */
1746                 btrfs_run_defrag_inodes(fs_info);
1747
1748                 /*
1749                  * Acquires fs_info->delete_unused_bgs_mutex to avoid racing
1750                  * with relocation (btrfs_relocate_chunk), and relocation
1751                  * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group)
1752                  * only after taking fs_info->delete_unused_bgs_mutex.  So we
1753                  * neither can nor need to hold fs_info->cleaner_mutex when
1754                  * deleting unused block groups.
1755                  */
1756                 btrfs_delete_unused_bgs(fs_info);
1757 sleep:
1758                 clear_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
1759                 if (kthread_should_park())
1760                         kthread_parkme();
1761                 if (kthread_should_stop())
1762                         return 0;
1763                 if (!again) {
1764                         set_current_state(TASK_INTERRUPTIBLE);
1765                         schedule();
1766                         __set_current_state(TASK_RUNNING);
1767                 }
1768         }
1769 }
1770
1771 static int transaction_kthread(void *arg)
1772 {
1773         struct btrfs_root *root = arg;
1774         struct btrfs_fs_info *fs_info = root->fs_info;
1775         struct btrfs_trans_handle *trans;
1776         struct btrfs_transaction *cur;
1777         u64 transid;
1778         time64_t delta;
1779         unsigned long delay;
1780         bool cannot_commit;
1781
1782         do {
1783                 cannot_commit = false;
1784                 delay = msecs_to_jiffies(fs_info->commit_interval * 1000);
1785                 mutex_lock(&fs_info->transaction_kthread_mutex);
1786
1787                 spin_lock(&fs_info->trans_lock);
1788                 cur = fs_info->running_transaction;
1789                 if (!cur) {
1790                         spin_unlock(&fs_info->trans_lock);
1791                         goto sleep;
1792                 }
1793
1794                 delta = ktime_get_seconds() - cur->start_time;
1795                 if (cur->state < TRANS_STATE_COMMIT_START &&
1796                     delta < fs_info->commit_interval) {
1797                         spin_unlock(&fs_info->trans_lock);
1798                         delay -= msecs_to_jiffies((delta - 1) * 1000);
1799                         delay = min(delay,
1800                                     msecs_to_jiffies(fs_info->commit_interval * 1000));
1801                         goto sleep;
1802                 }
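                /*
                 * Worked example (illustrative): with the default 30s
                 * commit_interval and a transaction that is 12s old, the
                 * branch above sleeps for roughly 30s - 11s = 19s, so the
                 * commit fires once the transaction is about 30s old.  The
                 * min() clamps the delay for a brand-new transaction,
                 * where delta == 0 would otherwise extend it past the
                 * commit interval.
                 */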
1803                 transid = cur->transid;
1804                 spin_unlock(&fs_info->trans_lock);
1805
1806                 /* If the file system is aborted, this will always fail. */
1807                 trans = btrfs_attach_transaction(root);
1808                 if (IS_ERR(trans)) {
1809                         if (PTR_ERR(trans) != -ENOENT)
1810                                 cannot_commit = true;
1811                         goto sleep;
1812                 }
1813                 if (transid == trans->transid) {
1814                         btrfs_commit_transaction(trans);
1815                 } else {
1816                         btrfs_end_transaction(trans);
1817                 }
1818 sleep:
1819                 wake_up_process(fs_info->cleaner_kthread);
1820                 mutex_unlock(&fs_info->transaction_kthread_mutex);
1821
1822                 if (unlikely(test_bit(BTRFS_FS_STATE_ERROR,
1823                                       &fs_info->fs_state)))
1824                         btrfs_cleanup_transaction(fs_info);
1825                 if (!kthread_should_stop() &&
1826                                 (!btrfs_transaction_blocked(fs_info) ||
1827                                  cannot_commit))
1828                         schedule_timeout_interruptible(delay);
1829         } while (!kthread_should_stop());
1830         return 0;
1831 }
1832
1833 /*
1834  * This will find the highest generation in the array of root backups.  The
1835  * index of that backup slot is returned, or -EINVAL if we can't find
1836  * anything.
1837  *
1838  * We check to make sure the array is valid by comparing the
1839  * generation of the latest root in the array with the generation
1840  * in the super block.  If they don't match we pitch it.
1841  */
1842 static int find_newest_super_backup(struct btrfs_fs_info *info)
1843 {
1844         const u64 newest_gen = btrfs_super_generation(info->super_copy);
1845         u64 cur;
1846         struct btrfs_root_backup *root_backup;
1847         int i;
1848
1849         for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
1850                 root_backup = info->super_copy->super_roots + i;
1851                 cur = btrfs_backup_tree_root_gen(root_backup);
1852                 if (cur == newest_gen)
1853                         return i;
1854         }
1855
1856         return -EINVAL;
1857 }
1858
1859 /*
1860  * Copy all the root pointers into the super backup array.
1861  * This will bump the backup pointer by one when it is
1862  * done.
1863  */
1864 static void backup_super_roots(struct btrfs_fs_info *info)
1865 {
1866         const int next_backup = info->backup_root_index;
1867         struct btrfs_root_backup *root_backup;
1868
1869         root_backup = info->super_for_commit->super_roots + next_backup;
1870
1871         /*
1872          * make sure all of our padding and empty slots get zero filled
1873          * regardless of which ones we use today
1874          */
1875         memset(root_backup, 0, sizeof(*root_backup));
1876
1877         info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
1878
1879         btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
1880         btrfs_set_backup_tree_root_gen(root_backup,
1881                                btrfs_header_generation(info->tree_root->node));
1882
1883         btrfs_set_backup_tree_root_level(root_backup,
1884                                btrfs_header_level(info->tree_root->node));
1885
1886         btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
1887         btrfs_set_backup_chunk_root_gen(root_backup,
1888                                btrfs_header_generation(info->chunk_root->node));
1889         btrfs_set_backup_chunk_root_level(root_backup,
1890                                btrfs_header_level(info->chunk_root->node));
1891
1892         btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
1893         btrfs_set_backup_extent_root_gen(root_backup,
1894                                btrfs_header_generation(info->extent_root->node));
1895         btrfs_set_backup_extent_root_level(root_backup,
1896                                btrfs_header_level(info->extent_root->node));
1897
1898         /*
1899          * we might commit during log recovery, which happens before we set
1900          * the fs_root.  Make sure it is valid before we fill it in.
1901          */
1902         if (info->fs_root && info->fs_root->node) {
1903                 btrfs_set_backup_fs_root(root_backup,
1904                                          info->fs_root->node->start);
1905                 btrfs_set_backup_fs_root_gen(root_backup,
1906                                btrfs_header_generation(info->fs_root->node));
1907                 btrfs_set_backup_fs_root_level(root_backup,
1908                                btrfs_header_level(info->fs_root->node));
1909         }
1910
1911         btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
1912         btrfs_set_backup_dev_root_gen(root_backup,
1913                                btrfs_header_generation(info->dev_root->node));
1914         btrfs_set_backup_dev_root_level(root_backup,
1915                                        btrfs_header_level(info->dev_root->node));
1916
1917         btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
1918         btrfs_set_backup_csum_root_gen(root_backup,
1919                                btrfs_header_generation(info->csum_root->node));
1920         btrfs_set_backup_csum_root_level(root_backup,
1921                                btrfs_header_level(info->csum_root->node));
1922
1923         btrfs_set_backup_total_bytes(root_backup,
1924                              btrfs_super_total_bytes(info->super_copy));
1925         btrfs_set_backup_bytes_used(root_backup,
1926                              btrfs_super_bytes_used(info->super_copy));
1927         btrfs_set_backup_num_devices(root_backup,
1928                              btrfs_super_num_devices(info->super_copy));
1929
1930         /*
1931          * if we don't copy this out to the super_copy, it won't get remembered
1932          * for the next commit
1933          */
1934         memcpy(&info->super_copy->super_roots,
1935                &info->super_for_commit->super_roots,
1936                sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
1937 }
1938
1939 /*
1940  * read_backup_root - Reads a backup root based on the passed priority. Prio 0
1941  * is the newest, prio 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots
1942  *
1943  * fs_info - filesystem whose backup roots need to be read
1944  * priority - priority of backup root required
1945  *
1946  * Returns backup root index on success and -EINVAL otherwise.
1947  */
1948 static int read_backup_root(struct btrfs_fs_info *fs_info, u8 priority)
1949 {
1950         int backup_index = find_newest_super_backup(fs_info);
1951         struct btrfs_super_block *super = fs_info->super_copy;
1952         struct btrfs_root_backup *root_backup;
1953
1954         if (priority < BTRFS_NUM_BACKUP_ROOTS && backup_index >= 0) {
1955                 if (priority == 0)
1956                         return backup_index;
1957
1958                 backup_index = backup_index + BTRFS_NUM_BACKUP_ROOTS - priority;
1959                 backup_index %= BTRFS_NUM_BACKUP_ROOTS;
1960         } else {
1961                 return -EINVAL;
1962         }
1963
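        /*
         * Worked example (illustrative): with BTRFS_NUM_BACKUP_ROOTS == 4
         * and the newest backup in slot 2, priority 0 maps to slot 2,
         * priority 1 to (2 + 4 - 1) % 4 == 1, priority 2 to slot 0 and
         * priority 3 to slot 3, i.e. walking backwards through the
         * circular array from newest to oldest.
         */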
1964         root_backup = super->super_roots + backup_index;
1965
1966         btrfs_set_super_generation(super,
1967                                    btrfs_backup_tree_root_gen(root_backup));
1968         btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
1969         btrfs_set_super_root_level(super,
1970                                    btrfs_backup_tree_root_level(root_backup));
1971         btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
1972
1973         /*
1974          * Fixme: the total bytes and num_devices need to match, otherwise
1975          * we'd need a fsck
1976          */
1977         btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
1978         btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
1979
1980         return backup_index;
1981 }
1982
1983 /* helper to clean up workers */
1984 static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
1985 {
1986         btrfs_destroy_workqueue(fs_info->fixup_workers);
1987         btrfs_destroy_workqueue(fs_info->delalloc_workers);
1988         btrfs_destroy_workqueue(fs_info->workers);
1989         btrfs_destroy_workqueue(fs_info->endio_workers);
1990         btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
1991         btrfs_destroy_workqueue(fs_info->rmw_workers);
1992         btrfs_destroy_workqueue(fs_info->endio_write_workers);
1993         btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
1994         btrfs_destroy_workqueue(fs_info->delayed_workers);
1995         btrfs_destroy_workqueue(fs_info->caching_workers);
1996         btrfs_destroy_workqueue(fs_info->readahead_workers);
1997         btrfs_destroy_workqueue(fs_info->flush_workers);
1998         btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
1999         if (fs_info->discard_ctl.discard_workers)
2000                 destroy_workqueue(fs_info->discard_ctl.discard_workers);
2001         /*
2002          * Now that all other work queues are destroyed, we can safely destroy
2003          * the queues used for metadata I/O, since tasks from those other work
2004          * queues can do metadata I/O operations.
2005          */
2006         btrfs_destroy_workqueue(fs_info->endio_meta_workers);
2007         btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
2008 }
2009
2010 static void free_root_extent_buffers(struct btrfs_root *root)
2011 {
2012         if (root) {
2013                 free_extent_buffer(root->node);
2014                 free_extent_buffer(root->commit_root);
2015                 root->node = NULL;
2016                 root->commit_root = NULL;
2017         }
2018 }
2019
2020 /* helper to clean up tree roots */
2021 static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
2022 {
2023         free_root_extent_buffers(info->tree_root);
2024
2025         free_root_extent_buffers(info->dev_root);
2026         free_root_extent_buffers(info->extent_root);
2027         free_root_extent_buffers(info->csum_root);
2028         free_root_extent_buffers(info->quota_root);
2029         free_root_extent_buffers(info->uuid_root);
2030         free_root_extent_buffers(info->fs_root);
2031         free_root_extent_buffers(info->data_reloc_root);
2032         if (free_chunk_root)
2033                 free_root_extent_buffers(info->chunk_root);
2034         free_root_extent_buffers(info->free_space_root);
2035 }
2036
2037 void btrfs_put_root(struct btrfs_root *root)
2038 {
2039         if (!root)
2040                 return;
2041
2042         if (refcount_dec_and_test(&root->refs)) {
2043                 WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
2044                 WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state));
2045                 if (root->anon_dev)
2046                         free_anon_bdev(root->anon_dev);
2047                 btrfs_drew_lock_destroy(&root->snapshot_lock);
2048                 free_root_extent_buffers(root);
2049                 kfree(root->free_ino_ctl);
2050                 kfree(root->free_ino_pinned);
2051 #ifdef CONFIG_BTRFS_DEBUG
2052                 spin_lock(&root->fs_info->fs_roots_radix_lock);
2053                 list_del_init(&root->leak_list);
2054                 spin_unlock(&root->fs_info->fs_roots_radix_lock);
2055 #endif
2056                 kfree(root);
2057         }
2058 }
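
/*
 * Illustrative sketch (not part of the original file): the grab/put
 * pairing used throughout this file.  btrfs_grab_root() returns NULL once
 * the refcount has already hit zero, so the result must be checked.
 */
static inline void example_grab_put(struct btrfs_root *candidate)
{
        struct btrfs_root *root = btrfs_grab_root(candidate);

        if (!root)
                return;         /* root is already being torn down */
        /* ... root is safe to use here ... */
        btrfs_put_root(root);
}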
2059
2060 void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
2061 {
2062         int ret;
2063         struct btrfs_root *gang[8];
2064         int i;
2065
2066         while (!list_empty(&fs_info->dead_roots)) {
2067                 gang[0] = list_entry(fs_info->dead_roots.next,
2068                                      struct btrfs_root, root_list);
2069                 list_del(&gang[0]->root_list);
2070
2071                 if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state))
2072                         btrfs_drop_and_free_fs_root(fs_info, gang[0]);
2073                 btrfs_put_root(gang[0]);
2074         }
2075
2076         while (1) {
2077                 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
2078                                              (void **)gang, 0,
2079                                              ARRAY_SIZE(gang));
2080                 if (!ret)
2081                         break;
2082                 for (i = 0; i < ret; i++)
2083                         btrfs_drop_and_free_fs_root(fs_info, gang[i]);
2084         }
2085 }
2086
2087 static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
2088 {
2089         mutex_init(&fs_info->scrub_lock);
2090         atomic_set(&fs_info->scrubs_running, 0);
2091         atomic_set(&fs_info->scrub_pause_req, 0);
2092         atomic_set(&fs_info->scrubs_paused, 0);
2093         atomic_set(&fs_info->scrub_cancel_req, 0);
2094         init_waitqueue_head(&fs_info->scrub_pause_wait);
2095         refcount_set(&fs_info->scrub_workers_refcnt, 0);
2096 }
2097
2098 static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
2099 {
2100         spin_lock_init(&fs_info->balance_lock);
2101         mutex_init(&fs_info->balance_mutex);
2102         atomic_set(&fs_info->balance_pause_req, 0);
2103         atomic_set(&fs_info->balance_cancel_req, 0);
2104         fs_info->balance_ctl = NULL;
2105         init_waitqueue_head(&fs_info->balance_wait_q);
2106 }
2107
2108 static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
2109 {
2110         struct inode *inode = fs_info->btree_inode;
2111
2112         inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
2113         set_nlink(inode, 1);
2114         /*
2115          * We set the i_size on the btree inode to the max possible offset.
2116          * The real end of the address space is determined by all of
2117          * the devices in the system.
2118          */
2119         inode->i_size = OFFSET_MAX;
2120         inode->i_mapping->a_ops = &btree_aops;
2121
2122         RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
2123         extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
2124                             IO_TREE_BTREE_INODE_IO, inode);
2125         BTRFS_I(inode)->io_tree.track_uptodate = false;
2126         extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
2127
2128         BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
2129         memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key));
2130         set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
2131         btrfs_insert_inode_hash(inode);
2132 }
2133
2134 static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
2135 {
2136         mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
2137         init_rwsem(&fs_info->dev_replace.rwsem);
2138         init_waitqueue_head(&fs_info->dev_replace.replace_wait);
2139 }
2140
2141 static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
2142 {
2143         spin_lock_init(&fs_info->qgroup_lock);
2144         mutex_init(&fs_info->qgroup_ioctl_lock);
2145         fs_info->qgroup_tree = RB_ROOT;
2146         INIT_LIST_HEAD(&fs_info->dirty_qgroups);
2147         fs_info->qgroup_seq = 1;
2148         fs_info->qgroup_ulist = NULL;
2149         fs_info->qgroup_rescan_running = false;
2150         mutex_init(&fs_info->qgroup_rescan_lock);
2151 }
2152
2153 static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
2154                 struct btrfs_fs_devices *fs_devices)
2155 {
2156         u32 max_active = fs_info->thread_pool_size;
2157         unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
2158
2159         fs_info->workers =
2160                 btrfs_alloc_workqueue(fs_info, "worker",
2161                                       flags | WQ_HIGHPRI, max_active, 16);
2162
2163         fs_info->delalloc_workers =
2164                 btrfs_alloc_workqueue(fs_info, "delalloc",
2165                                       flags, max_active, 2);
2166
2167         fs_info->flush_workers =
2168                 btrfs_alloc_workqueue(fs_info, "flush_delalloc",
2169                                       flags, max_active, 0);
2170
2171         fs_info->caching_workers =
2172                 btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0);
2173
2174         fs_info->fixup_workers =
2175                 btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0);
2176
2177         /*
2178          * endios are largely parallel and should have a very
2179          * low idle thresh
2180          */
2181         fs_info->endio_workers =
2182                 btrfs_alloc_workqueue(fs_info, "endio", flags, max_active, 4);
2183         fs_info->endio_meta_workers =
2184                 btrfs_alloc_workqueue(fs_info, "endio-meta", flags,
2185                                       max_active, 4);
2186         fs_info->endio_meta_write_workers =
2187                 btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags,
2188                                       max_active, 2);
2189         fs_info->endio_raid56_workers =
2190                 btrfs_alloc_workqueue(fs_info, "endio-raid56", flags,
2191                                       max_active, 4);
2192         fs_info->rmw_workers =
2193                 btrfs_alloc_workqueue(fs_info, "rmw", flags, max_active, 2);
2194         fs_info->endio_write_workers =
2195                 btrfs_alloc_workqueue(fs_info, "endio-write", flags,
2196                                       max_active, 2);
2197         fs_info->endio_freespace_worker =
2198                 btrfs_alloc_workqueue(fs_info, "freespace-write", flags,
2199                                       max_active, 0);
2200         fs_info->delayed_workers =
2201                 btrfs_alloc_workqueue(fs_info, "delayed-meta", flags,
2202                                       max_active, 0);
2203         fs_info->readahead_workers =
2204                 btrfs_alloc_workqueue(fs_info, "readahead", flags,
2205                                       max_active, 2);
2206         fs_info->qgroup_rescan_workers =
2207                 btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0);
2208         fs_info->discard_ctl.discard_workers =
2209                 alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1);
2210
2211         if (!(fs_info->workers && fs_info->delalloc_workers &&
2212               fs_info->flush_workers &&
2213               fs_info->endio_workers && fs_info->endio_meta_workers &&
2214               fs_info->endio_meta_write_workers &&
2215               fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
2216               fs_info->endio_freespace_worker && fs_info->rmw_workers &&
2217               fs_info->caching_workers && fs_info->readahead_workers &&
2218               fs_info->fixup_workers && fs_info->delayed_workers &&
2219               fs_info->qgroup_rescan_workers &&
2220               fs_info->discard_ctl.discard_workers)) {
2221                 return -ENOMEM;
2222         }
2223
2224         return 0;
2225 }
2226
2227 static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
2228 {
2229         struct crypto_shash *csum_shash;
2230         const char *csum_driver = btrfs_super_csum_driver(csum_type);
2231
2232         csum_shash = crypto_alloc_shash(csum_driver, 0, 0);
2233
2234         if (IS_ERR(csum_shash)) {
2235                 btrfs_err(fs_info, "error allocating %s hash for checksum",
2236                           csum_driver);
2237                 return PTR_ERR(csum_shash);
2238         }
2239
2240         fs_info->csum_shash = csum_shash;
2241
2242         return 0;
2243 }
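
/*
 * Illustrative sketch (not part of the original file): driving the shash
 * allocated above to checksum a buffer, mirroring the pattern the btrfs
 * checksum helpers use.  The wrapper name is hypothetical; the return
 * value of crypto_shash_digest() is ignored here for brevity.
 */
static inline void example_csum_data(struct btrfs_fs_info *fs_info,
                                     const u8 *data, size_t len, u8 *result)
{
        SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);

        shash->tfm = fs_info->csum_shash;
        crypto_shash_digest(shash, data, len, result);
}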
2244
2245 static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
2246                             struct btrfs_fs_devices *fs_devices)
2247 {
2248         int ret;
2249         struct btrfs_root *log_tree_root;
2250         struct btrfs_super_block *disk_super = fs_info->super_copy;
2251         u64 bytenr = btrfs_super_log_root(disk_super);
2252         int level = btrfs_super_log_root_level(disk_super);
2253
2254         if (fs_devices->rw_devices == 0) {
2255                 btrfs_warn(fs_info, "log replay required on RO media");
2256                 return -EIO;
2257         }
2258
2259         log_tree_root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID,
2260                                          GFP_KERNEL);
2261         if (!log_tree_root)
2262                 return -ENOMEM;
2263
2264         log_tree_root->node = read_tree_block(fs_info, bytenr,
2265                                               fs_info->generation + 1,
2266                                               level, NULL);
2267         if (IS_ERR(log_tree_root->node)) {
2268                 btrfs_warn(fs_info, "failed to read log tree");
2269                 ret = PTR_ERR(log_tree_root->node);
2270                 log_tree_root->node = NULL;
2271                 btrfs_put_root(log_tree_root);
2272                 return ret;
2273         } else if (!extent_buffer_uptodate(log_tree_root->node)) {
2274                 btrfs_err(fs_info, "failed to read log tree");
2275                 btrfs_put_root(log_tree_root);
2276                 return -EIO;
2277         }
2278         /* returns with log_tree_root freed on success */
2279         ret = btrfs_recover_log_trees(log_tree_root);
2280         if (ret) {
2281                 btrfs_handle_fs_error(fs_info, ret,
2282                                       "Failed to recover log tree");
2283                 btrfs_put_root(log_tree_root);
2284                 return ret;
2285         }
2286
2287         if (sb_rdonly(fs_info->sb)) {
2288                 ret = btrfs_commit_super(fs_info);
2289                 if (ret)
2290                         return ret;
2291         }
2292
2293         return 0;
2294 }
2295
2296 static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
2297 {
2298         struct btrfs_root *tree_root = fs_info->tree_root;
2299         struct btrfs_root *root;
2300         struct btrfs_key location;
2301         int ret;
2302
2303         BUG_ON(!fs_info->tree_root);
2304
2305         location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
2306         location.type = BTRFS_ROOT_ITEM_KEY;
2307         location.offset = 0;
2308
2309         root = btrfs_read_tree_root(tree_root, &location);
2310         if (IS_ERR(root)) {
2311                 if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2312                         ret = PTR_ERR(root);
2313                         goto out;
2314                 }
2315         } else {
2316                 set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2317                 fs_info->extent_root = root;
2318         }
2319
2320         location.objectid = BTRFS_DEV_TREE_OBJECTID;
2321         root = btrfs_read_tree_root(tree_root, &location);
2322         if (IS_ERR(root)) {
2323                 if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2324                         ret = PTR_ERR(root);
2325                         goto out;
2326                 }
2327         } else {
2328                 set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2329                 fs_info->dev_root = root;
2330                 btrfs_init_devices_late(fs_info);
2331         }
2332
2333         /* If IGNOREDATACSUMS is set don't bother reading the csum root. */
2334         if (!btrfs_test_opt(fs_info, IGNOREDATACSUMS)) {
2335                 location.objectid = BTRFS_CSUM_TREE_OBJECTID;
2336                 root = btrfs_read_tree_root(tree_root, &location);
2337                 if (IS_ERR(root)) {
2338                         if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2339                                 ret = PTR_ERR(root);
2340                                 goto out;
2341                         }
2342                 } else {
2343                         set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2344                         fs_info->csum_root = root;
2345                 }
2346         }
2347
2348         /*
2349          * This tree can share blocks with some other fs tree during relocation
2350          * and needs a proper setup via btrfs_get_fs_root
2351          */
2352         root = btrfs_get_fs_root(tree_root->fs_info,
2353                                  BTRFS_DATA_RELOC_TREE_OBJECTID, true);
2354         if (IS_ERR(root)) {
2355                 if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2356                         ret = PTR_ERR(root);
2357                         goto out;
2358                 }
2359         } else {
2360                 set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2361                 fs_info->data_reloc_root = root;
2362         }
2363
2364         location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
2365         root = btrfs_read_tree_root(tree_root, &location);
2366         if (!IS_ERR(root)) {
2367                 set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2368                 set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
2369                 fs_info->quota_root = root;
2370         }
2371
2372         location.objectid = BTRFS_UUID_TREE_OBJECTID;
2373         root = btrfs_read_tree_root(tree_root, &location);
2374         if (IS_ERR(root)) {
2375                 if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2376                         ret = PTR_ERR(root);
2377                         if (ret != -ENOENT)
2378                                 goto out;
2379                 }
2380         } else {
2381                 set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2382                 fs_info->uuid_root = root;
2383         }
2384
2385         if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
2386                 location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID;
2387                 root = btrfs_read_tree_root(tree_root, &location);
2388                 if (IS_ERR(root)) {
2389                         if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
2390                                 ret = PTR_ERR(root);
2391                                 goto out;
2392                         }
2393                 } else {
2394                         set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2395                         fs_info->free_space_root = root;
2396                 }
2397         }
2398
2399         return 0;
2400 out:
2401         btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d",
2402                    location.objectid, ret);
2403         return ret;
2404 }
2405
2406 /*
2407  * Real super block validation
2408  * NOTE: super csum type and incompat features will not be checked here.
2409  *
2410  * @sb:         super block to check
2411  * @mirror_num: which super block copy to check the bytenr of:
2412  *              0       the primary (1st) sb
2413  *              1, 2    2nd and 3rd backup copy
2414  *             -1       skip bytenr check
2415  */
2416 static int validate_super(struct btrfs_fs_info *fs_info,
2417                             struct btrfs_super_block *sb, int mirror_num)
2418 {
2419         u64 nodesize = btrfs_super_nodesize(sb);
2420         u64 sectorsize = btrfs_super_sectorsize(sb);
2421         int ret = 0;
2422
2423         if (btrfs_super_magic(sb) != BTRFS_MAGIC) {
2424                 btrfs_err(fs_info, "no valid FS found");
2425                 ret = -EINVAL;
2426         }
2427         if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) {
2428                 btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu",
2429                                 btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
2430                 ret = -EINVAL;
2431         }
2432         if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
2433                 btrfs_err(fs_info, "tree_root level too big: %d >= %d",
2434                                 btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
2435                 ret = -EINVAL;
2436         }
2437         if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) {
2438                 btrfs_err(fs_info, "chunk_root level too big: %d >= %d",
2439                                 btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL);
2440                 ret = -EINVAL;
2441         }
2442         if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) {
2443                 btrfs_err(fs_info, "log_root level too big: %d >= %d",
2444                                 btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL);
2445                 ret = -EINVAL;
2446         }
2447
2448         /*
2449          * Check sectorsize and nodesize first, other checks will need them.
2450          * Check all possible sectorsizes (4K, 8K, 16K, 32K, 64K) here.
2451          */
2452         if (!is_power_of_2(sectorsize) || sectorsize < 4096 ||
2453             sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) {
2454                 btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
2455                 ret = -EINVAL;
2456         }
2457         /* Only PAGE_SIZE is supported for now */
2458         if (sectorsize != PAGE_SIZE) {
2459                 btrfs_err(fs_info,
2460                         "sectorsize %llu not supported yet, only support %lu",
2461                         sectorsize, PAGE_SIZE);
2462                 ret = -EINVAL;
2463         }
2464         if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
2465             nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
2466                 btrfs_err(fs_info, "invalid nodesize %llu", nodesize);
2467                 ret = -EINVAL;
2468         }
2469         if (nodesize != le32_to_cpu(sb->__unused_leafsize)) {
2470                 btrfs_err(fs_info, "invalid leafsize %u, should be %llu",
2471                           le32_to_cpu(sb->__unused_leafsize), nodesize);
2472                 ret = -EINVAL;
2473         }
2474
2475         /* Root alignment check */
2476         if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) {
2477                 btrfs_warn(fs_info, "tree_root block unaligned: %llu",
2478                            btrfs_super_root(sb));
2479                 ret = -EINVAL;
2480         }
2481         if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) {
2482                 btrfs_warn(fs_info, "chunk_root block unaligned: %llu",
2483                            btrfs_super_chunk_root(sb));
2484                 ret = -EINVAL;
2485         }
2486         if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) {
2487                 btrfs_warn(fs_info, "log_root block unaligned: %llu",
2488                            btrfs_super_log_root(sb));
2489                 ret = -EINVAL;
2490         }
2491
2492         if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid,
2493                    BTRFS_FSID_SIZE) != 0) {
2494                 btrfs_err(fs_info,
2495                         "dev_item UUID does not match metadata fsid: %pU != %pU",
2496                         fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid);
2497                 ret = -EINVAL;
2498         }
2499
2500         /*
2501          * Hint to catch really bogus numbers, bit flips and the like; more
2502          * exact checks are done later
2503          */
2504         if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) {
2505                 btrfs_err(fs_info, "bytes_used is too small %llu",
2506                           btrfs_super_bytes_used(sb));
2507                 ret = -EINVAL;
2508         }
2509         if (!is_power_of_2(btrfs_super_stripesize(sb))) {
2510                 btrfs_err(fs_info, "invalid stripesize %u",
2511                           btrfs_super_stripesize(sb));
2512                 ret = -EINVAL;
2513         }
2514         if (btrfs_super_num_devices(sb) > (1UL << 31))
2515                 btrfs_warn(fs_info, "suspicious number of devices: %llu",
2516                            btrfs_super_num_devices(sb));
2517         if (btrfs_super_num_devices(sb) == 0) {
2518                 btrfs_err(fs_info, "number of devices is 0");
2519                 ret = -EINVAL;
2520         }
2521
2522         if (mirror_num >= 0 &&
2523             btrfs_super_bytenr(sb) != btrfs_sb_offset(mirror_num)) {
2524                 btrfs_err(fs_info, "super offset mismatch %llu != %llu",
2525                           btrfs_super_bytenr(sb), btrfs_sb_offset(mirror_num));
2526                 ret = -EINVAL;
2527         }
2528
2529         /*
2530          * Obvious sys_chunk_array corruptions: it must hold at least one key
2531          * and one chunk
2532          */
2533         if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
2534                 btrfs_err(fs_info, "system chunk array too big %u > %u",
2535                           btrfs_super_sys_array_size(sb),
2536                           BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
2537                 ret = -EINVAL;
2538         }
2539         if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
2540                         + sizeof(struct btrfs_chunk)) {
2541                 btrfs_err(fs_info, "system chunk array too small %u < %zu",
2542                           btrfs_super_sys_array_size(sb),
2543                           sizeof(struct btrfs_disk_key)
2544                           + sizeof(struct btrfs_chunk));
2545                 ret = -EINVAL;
2546         }
2547
2548         /*
2549          * The generation is a global counter, so we'll trust it more than the
2550          * others, but it's still possible that it's the one that's wrong.
2551          */
2552         if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb))
2553                 btrfs_warn(fs_info,
2554                         "suspicious: generation < chunk_root_generation: %llu < %llu",
2555                         btrfs_super_generation(sb),
2556                         btrfs_super_chunk_root_generation(sb));
2557         if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb)
2558             && btrfs_super_cache_generation(sb) != (u64)-1)
2559                 btrfs_warn(fs_info,
2560                         "suspicious: generation < cache_generation: %llu < %llu",
2561                         btrfs_super_generation(sb),
2562                         btrfs_super_cache_generation(sb));
2563
2564         return ret;
2565 }
2566
2567 /*
2568  * Validation of super block at mount time.
2569  * Checks already done early at mount time, like the csum type and incompat
2570  * flags, will be skipped.
2571  */
2572 static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info)
2573 {
2574         return validate_super(fs_info, fs_info->super_copy, 0);
2575 }
2576
2577 /*
2578  * Validation of super block at write time.
2579  * Some checks like bytenr check will be skipped as their values will be
2580  * overwritten soon.
2581  * Extra checks like csum type and incompat flags will be done here.
2582  */
2583 static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info,
2584                                       struct btrfs_super_block *sb)
2585 {
2586         int ret;
2587
2588         ret = validate_super(fs_info, sb, -1);
2589         if (ret < 0)
2590                 goto out;
2591         if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) {
2592                 ret = -EUCLEAN;
2593                 btrfs_err(fs_info, "invalid csum type, has %u want %u",
2594                           btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32);
2595                 goto out;
2596         }
2597         if (btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
2598                 ret = -EUCLEAN;
2599                 btrfs_err(fs_info,
2600                 "invalid incompat flags, has 0x%llx valid mask 0x%llx",
2601                           btrfs_super_incompat_flags(sb),
2602                           (unsigned long long)BTRFS_FEATURE_INCOMPAT_SUPP);
2603                 goto out;
2604         }
2605 out:
2606         if (ret < 0)
2607                 btrfs_err(fs_info,
2608                 "super block corruption detected before writing it to disk");
2609         return ret;
2610 }
2611
2612 static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
2613 {
2614         int backup_index = find_newest_super_backup(fs_info);
2615         struct btrfs_super_block *sb = fs_info->super_copy;
2616         struct btrfs_root *tree_root = fs_info->tree_root;
2617         bool handle_error = false;
2618         int ret = 0;
2619         int i;
2620
2621         for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
2622                 u64 generation;
2623                 int level;
2624
2625                 if (handle_error) {
2626                         if (!IS_ERR(tree_root->node))
2627                                 free_extent_buffer(tree_root->node);
2628                         tree_root->node = NULL;
2629
2630                         if (!btrfs_test_opt(fs_info, USEBACKUPROOT))
2631                                 break;
2632
2633                         free_root_pointers(fs_info, false);
2634
2635                         /*
2636                          * Don't use the log in recovery mode; it won't be
2637                          * valid
2638                          */
2639                         btrfs_set_super_log_root(sb, 0);
2640
2641                         /* We can't trust the free space cache either */
2642                         btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
2643
2644                         ret = read_backup_root(fs_info, i);
2645                         backup_index = ret;
2646                         if (ret < 0)
2647                                 return ret;
2648                 }
2649                 generation = btrfs_super_generation(sb);
2650                 level = btrfs_super_root_level(sb);
2651                 tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb),
2652                                                   generation, level, NULL);
2653                 if (IS_ERR(tree_root->node)) {
2654                         handle_error = true;
2655                         ret = PTR_ERR(tree_root->node);
2656                         tree_root->node = NULL;
2657                         btrfs_warn(fs_info, "couldn't read tree root");
2658                         continue;
2659
2660                 } else if (!extent_buffer_uptodate(tree_root->node)) {
2661                         handle_error = true;
2662                         ret = -EIO;
2663                         btrfs_warn(fs_info, "error while reading tree root");
2664                         continue;
2665                 }
2666
2667                 btrfs_set_root_node(&tree_root->root_item, tree_root->node);
2668                 tree_root->commit_root = btrfs_root_node(tree_root);
2669                 btrfs_set_root_refs(&tree_root->root_item, 1);
2670
2671                 /*
2672                  * No need to hold btrfs_root::objectid_mutex since the fs
2673                  * hasn't been fully initialised and we are the only user
2674                  */
2675                 ret = btrfs_find_highest_objectid(tree_root,
2676                                                 &tree_root->highest_objectid);
2677                 if (ret < 0) {
2678                         handle_error = true;
2679                         continue;
2680                 }
2681
2682                 ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
2683
2684                 ret = btrfs_read_roots(fs_info);
2685                 if (ret < 0) {
2686                         handle_error = true;
2687                         continue;
2688                 }
2689
2690                 /* All successful */
2691                 fs_info->generation = generation;
2692                 fs_info->last_trans_committed = generation;
2693
2694                 /* Always begin writing backup roots after the one being used */
2695                 if (backup_index < 0) {
2696                         fs_info->backup_root_index = 0;
2697                 } else {
2698                         fs_info->backup_root_index = backup_index + 1;
2699                         fs_info->backup_root_index %= BTRFS_NUM_BACKUP_ROOTS;
2700                 }
2701                 break;
2702         }
2703
2704         return ret;
2705 }
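
/*
 * Illustrative note (not part of the original file): the retry loop above
 * is what services a rescue mount such as
 *
 *      mount -o ro,usebackuproot /dev/sdX /mnt
 *
 * where each failed read of the tree root falls back to the next-newest
 * backup slot until one yields a readable set of roots, or all
 * BTRFS_NUM_BACKUP_ROOTS slots are exhausted.
 */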
2706
2707 void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
2708 {
2709         INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
2710         INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
2711         INIT_LIST_HEAD(&fs_info->trans_list);
2712         INIT_LIST_HEAD(&fs_info->dead_roots);
2713         INIT_LIST_HEAD(&fs_info->delayed_iputs);
2714         INIT_LIST_HEAD(&fs_info->delalloc_roots);
2715         INIT_LIST_HEAD(&fs_info->caching_block_groups);
2716         spin_lock_init(&fs_info->delalloc_root_lock);
2717         spin_lock_init(&fs_info->trans_lock);
2718         spin_lock_init(&fs_info->fs_roots_radix_lock);
2719         spin_lock_init(&fs_info->delayed_iput_lock);
2720         spin_lock_init(&fs_info->defrag_inodes_lock);
2721         spin_lock_init(&fs_info->super_lock);
2722         spin_lock_init(&fs_info->buffer_lock);
2723         spin_lock_init(&fs_info->unused_bgs_lock);
2724         rwlock_init(&fs_info->tree_mod_log_lock);
2725         mutex_init(&fs_info->unused_bg_unpin_mutex);
2726         mutex_init(&fs_info->delete_unused_bgs_mutex);
2727         mutex_init(&fs_info->reloc_mutex);
2728         mutex_init(&fs_info->delalloc_root_mutex);
2729         seqlock_init(&fs_info->profiles_lock);
2730
2731         INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
2732         INIT_LIST_HEAD(&fs_info->space_info);
2733         INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
2734         INIT_LIST_HEAD(&fs_info->unused_bgs);
2735 #ifdef CONFIG_BTRFS_DEBUG
2736         INIT_LIST_HEAD(&fs_info->allocated_roots);
2737         INIT_LIST_HEAD(&fs_info->allocated_ebs);
2738         spin_lock_init(&fs_info->eb_leak_lock);
2739 #endif
2740         extent_map_tree_init(&fs_info->mapping_tree);
2741         btrfs_init_block_rsv(&fs_info->global_block_rsv,
2742                              BTRFS_BLOCK_RSV_GLOBAL);
2743         btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
2744         btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
2745         btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
2746         btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
2747                              BTRFS_BLOCK_RSV_DELOPS);
2748         btrfs_init_block_rsv(&fs_info->delayed_refs_rsv,
2749                              BTRFS_BLOCK_RSV_DELREFS);
2750
2751         atomic_set(&fs_info->async_delalloc_pages, 0);
2752         atomic_set(&fs_info->defrag_running, 0);
2753         atomic_set(&fs_info->reada_works_cnt, 0);
2754         atomic_set(&fs_info->nr_delayed_iputs, 0);
2755         atomic64_set(&fs_info->tree_mod_seq, 0);
2756         fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
2757         fs_info->metadata_ratio = 0;
2758         fs_info->defrag_inodes = RB_ROOT;
2759         atomic64_set(&fs_info->free_chunk_space, 0);
2760         fs_info->tree_mod_log = RB_ROOT;
2761         fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
2762         fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
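        /* Illustrative: NSEC_PER_SEC >> 6 == 15,625,000ns, i.e. ~15.6ms */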
2763         /* readahead state */
2764         INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
2765         spin_lock_init(&fs_info->reada_lock);
2766         btrfs_init_ref_verify(fs_info);
2767
2768         fs_info->thread_pool_size = min_t(unsigned long,
2769                                           num_online_cpus() + 2, 8);
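        /*
         * Illustrative only: min(num_online_cpus() + 2, 8) yields a pool of
         * 4 on a 2-CPU machine, 6 on 4 CPUs, and caps at 8 from 6 CPUs up.
         */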
2770
2771         INIT_LIST_HEAD(&fs_info->ordered_roots);
2772         spin_lock_init(&fs_info->ordered_root_lock);
2773
2774         btrfs_init_scrub(fs_info);
2775 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
2776         fs_info->check_integrity_print_mask = 0;
2777 #endif
2778         btrfs_init_balance(fs_info);
2779         btrfs_init_async_reclaim_work(fs_info);
2780
2781         spin_lock_init(&fs_info->block_group_cache_lock);
2782         fs_info->block_group_cache_tree = RB_ROOT;
2783         fs_info->first_logical_byte = (u64)-1;
2784
2785         extent_io_tree_init(fs_info, &fs_info->excluded_extents,
2786                             IO_TREE_FS_EXCLUDED_EXTENTS, NULL);
2787         set_bit(BTRFS_FS_BARRIER, &fs_info->flags);
2788
2789         mutex_init(&fs_info->ordered_operations_mutex);
2790         mutex_init(&fs_info->tree_log_mutex);
2791         mutex_init(&fs_info->chunk_mutex);
2792         mutex_init(&fs_info->transaction_kthread_mutex);
2793         mutex_init(&fs_info->cleaner_mutex);
2794         mutex_init(&fs_info->ro_block_group_mutex);
2795         init_rwsem(&fs_info->commit_root_sem);
2796         init_rwsem(&fs_info->cleanup_work_sem);
2797         init_rwsem(&fs_info->subvol_sem);
2798         sema_init(&fs_info->uuid_tree_rescan_sem, 1);
2799
2800         btrfs_init_dev_replace_locks(fs_info);
2801         btrfs_init_qgroup(fs_info);
2802         btrfs_discard_init(fs_info);
2803
2804         btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
2805         btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
2806
2807         init_waitqueue_head(&fs_info->transaction_throttle);
2808         init_waitqueue_head(&fs_info->transaction_wait);
2809         init_waitqueue_head(&fs_info->transaction_blocked_wait);
2810         init_waitqueue_head(&fs_info->async_submit_wait);
2811         init_waitqueue_head(&fs_info->delayed_iputs_wait);
2812
2813         /* Usable values until the real ones are cached from the superblock */
2814         fs_info->nodesize = 4096;
2815         fs_info->sectorsize = 4096;
2816         fs_info->sectorsize_bits = ilog2(4096);
2817         fs_info->stripesize = 4096;
2818
2819         spin_lock_init(&fs_info->swapfile_pins_lock);
2820         fs_info->swapfile_pins = RB_ROOT;
2821
2822         fs_info->send_in_progress = 0;
2823 }
2824
2825 static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb)
2826 {
2827         int ret;
2828
2829         fs_info->sb = sb;
2830         sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE;
2831         sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE);
2832
2833         ret = percpu_counter_init(&fs_info->dio_bytes, 0, GFP_KERNEL);
2834         if (ret)
2835                 return ret;
2836
2837         ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
2838         if (ret)
2839                 return ret;
2840
2841         fs_info->dirty_metadata_batch = PAGE_SIZE *
2842                                         (1 + ilog2(nr_cpu_ids));
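        /*
         * Example (illustrative, not in the original source): with 4K pages
         * and nr_cpu_ids == 8 this is 4096 * (1 + 3) = 16K, the per-CPU
         * slack allowed before the dirty-metadata percpu counter is folded
         * into the global count.
         */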
2843
2844         ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
2845         if (ret)
2846                 return ret;
2847
2848         ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0,
2849                         GFP_KERNEL);
2850         if (ret)
2851                 return ret;
2852
2853         fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
2854                                         GFP_KERNEL);
2855         if (!fs_info->delayed_root)
2856                 return -ENOMEM;
2857         btrfs_init_delayed_root(fs_info->delayed_root);
2858
2859         return btrfs_alloc_stripe_hash_table(fs_info);
2860 }
2861
2862 static int btrfs_uuid_rescan_kthread(void *data)
2863 {
2864         struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
2865         int ret;
2866
2867         /*
2868          * 1st step is to iterate through the existing UUID tree and
2869          * to delete all entries that contain outdated data.
2870          * 2nd step is to add all missing entries to the UUID tree.
2871          */
2872         ret = btrfs_uuid_tree_iterate(fs_info);
2873         if (ret < 0) {
2874                 if (ret != -EINTR)
2875                         btrfs_warn(fs_info, "iterating uuid_tree failed %d",
2876                                    ret);
2877                 up(&fs_info->uuid_tree_rescan_sem);
2878                 return ret;
2879         }
2880         return btrfs_uuid_scan_kthread(data);
2881 }
2882
2883 static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
2884 {
2885         struct task_struct *task;
2886
2887         down(&fs_info->uuid_tree_rescan_sem);
2888         task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
2889         if (IS_ERR(task)) {
2890                 /* fs_info->update_uuid_tree_gen remains 0 in all error cases */
2891                 btrfs_warn(fs_info, "failed to start uuid_rescan task");
2892                 up(&fs_info->uuid_tree_rescan_sem);
2893                 return PTR_ERR(task);
2894         }
2895
2896         return 0;
2897 }
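/*
 * Note (illustrative, not in the original source): uuid_tree_rescan_sem acts
 * as a completion gate here. It is taken before the kthread starts and is
 * only released by the kthread itself (or right above, on startup failure),
 * which is what lets close_ctree() wait for the scan with a down()/up() pair.
 */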
2898
2899 int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
2900                       char *options)
2901 {
2902         u32 sectorsize;
2903         u32 nodesize;
2904         u32 stripesize;
2905         u64 generation;
2906         u64 features;
2907         u16 csum_type;
2908         struct btrfs_super_block *disk_super;
2909         struct btrfs_fs_info *fs_info = btrfs_sb(sb);
2910         struct btrfs_root *tree_root;
2911         struct btrfs_root *chunk_root;
2912         int ret;
2913         int err = -EINVAL;
2914         int clear_free_space_tree = 0;
2915         int level;
2916
2917         ret = init_mount_fs_info(fs_info, sb);
2918         if (ret) {
2919                 err = ret;
2920                 goto fail;
2921         }
2922
2923         /* These need to be init'ed before we start creating inodes and such. */
2924         tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID,
2925                                      GFP_KERNEL);
2926         fs_info->tree_root = tree_root;
2927         chunk_root = btrfs_alloc_root(fs_info, BTRFS_CHUNK_TREE_OBJECTID,
2928                                       GFP_KERNEL);
2929         fs_info->chunk_root = chunk_root;
2930         if (!tree_root || !chunk_root) {
2931                 err = -ENOMEM;
2932                 goto fail;
2933         }
2934
2935         fs_info->btree_inode = new_inode(sb);
2936         if (!fs_info->btree_inode) {
2937                 err = -ENOMEM;
2938                 goto fail;
2939         }
2940         mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
2941         btrfs_init_btree_inode(fs_info);
2942
2943         invalidate_bdev(fs_devices->latest_bdev);
2944
2945         /*
2946          * Read super block and check the signature bytes only
2947          */
2948         disk_super = btrfs_read_dev_super(fs_devices->latest_bdev);
2949         if (IS_ERR(disk_super)) {
2950                 err = PTR_ERR(disk_super);
2951                 goto fail_alloc;
2952         }
2953
2954         /*
2955          * Verify the checksum type first; if that or the checksum value
2956          * is corrupted, we'll find out
2957          */
2958         csum_type = btrfs_super_csum_type(disk_super);
2959         if (!btrfs_supported_super_csum(csum_type)) {
2960                 btrfs_err(fs_info, "unsupported checksum algorithm: %u",
2961                           csum_type);
2962                 err = -EINVAL;
2963                 btrfs_release_disk_super(disk_super);
2964                 goto fail_alloc;
2965         }
2966
2967         ret = btrfs_init_csum_hash(fs_info, csum_type);
2968         if (ret) {
2969                 err = ret;
2970                 btrfs_release_disk_super(disk_super);
2971                 goto fail_alloc;
2972         }
2973
2974         /*
2975          * We want to check the superblock checksum; the type is stored inside.
2976          * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
2977          */
2978         if (btrfs_check_super_csum(fs_info, (u8 *)disk_super)) {
2979                 btrfs_err(fs_info, "superblock checksum mismatch");
2980                 err = -EINVAL;
2981                 btrfs_release_disk_super(disk_super);
2982                 goto fail_alloc;
2983         }
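        /*
         * Context sketch (illustrative, not part of the original source):
         * the checksum occupies the first BTRFS_CSUM_SIZE (32) bytes of the
         * superblock, so verification digests the remaining bytes:
         *
         *      crypto_shash_digest(shash, raw + BTRFS_CSUM_SIZE,
         *                          BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
         *                          result);
         *
         * the same layout write_dev_supers() uses when it recomputes the
         * checksum before submitting each copy.
         */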
2984
2985         /*
2986          * super_copy is zeroed at allocation time and we never touch the
2987          * following bytes up to INFO_SIZE; the checksum is calculated over
2988          * the whole block of INFO_SIZE
2989          */
2990         memcpy(fs_info->super_copy, disk_super, sizeof(*fs_info->super_copy));
2991         btrfs_release_disk_super(disk_super);
2992
2993         disk_super = fs_info->super_copy;
2994
2995         ASSERT(!memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid,
2996                        BTRFS_FSID_SIZE));
2997
2998         if (btrfs_fs_incompat(fs_info, METADATA_UUID)) {
2999                 ASSERT(!memcmp(fs_info->fs_devices->metadata_uuid,
3000                                 fs_info->super_copy->metadata_uuid,
3001                                 BTRFS_FSID_SIZE));
3002         }
3003
3004         features = btrfs_super_flags(disk_super);
3005         if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) {
3006                 features &= ~BTRFS_SUPER_FLAG_CHANGING_FSID_V2;
3007                 btrfs_set_super_flags(disk_super, features);
3008                 btrfs_info(fs_info,
3009                         "found metadata UUID change in progress flag, clearing");
3010         }
3011
3012         memcpy(fs_info->super_for_commit, fs_info->super_copy,
3013                sizeof(*fs_info->super_for_commit));
3014
3015         ret = btrfs_validate_mount_super(fs_info);
3016         if (ret) {
3017                 btrfs_err(fs_info, "superblock contains fatal errors");
3018                 err = -EINVAL;
3019                 goto fail_alloc;
3020         }
3021
3022         if (!btrfs_super_root(disk_super))
3023                 goto fail_alloc;
3024
3025         /* check FS state, whether FS is broken. */
3026         if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
3027                 set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
3028
3029         /*
3030          * In the long term, we'll store the compression type in the super
3031          * block, and it'll be used for per file compression control.
3032          */
3033         fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
3034
3035         ret = btrfs_parse_options(fs_info, options, sb->s_flags);
3036         if (ret) {
3037                 err = ret;
3038                 goto fail_alloc;
3039         }
3040
3041         features = btrfs_super_incompat_flags(disk_super) &
3042                 ~BTRFS_FEATURE_INCOMPAT_SUPP;
3043         if (features) {
3044                 btrfs_err(fs_info,
3045                     "cannot mount because of unsupported optional features (%llx)",
3046                     features);
3047                 err = -EINVAL;
3048                 goto fail_alloc;
3049         }
3050
3051         features = btrfs_super_incompat_flags(disk_super);
3052         features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
3053         if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
3054                 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
3055         else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD)
3056                 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD;
3057
3058         if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
3059                 btrfs_info(fs_info, "has skinny extents");
3060
3061         /*
3062          * flag our filesystem as having big metadata blocks if the
3063          * nodesize is bigger than the page size
3064          */
3065         if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) {
3066                 if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
3067                         btrfs_info(fs_info,
3068                                 "flagging fs with big metadata feature");
3069                 features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
3070         }
3071
3072         nodesize = btrfs_super_nodesize(disk_super);
3073         sectorsize = btrfs_super_sectorsize(disk_super);
3074         stripesize = sectorsize;
3075         fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
3076         fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
3077
3078         /* Cache block sizes */
3079         fs_info->nodesize = nodesize;
3080         fs_info->sectorsize = sectorsize;
3081         fs_info->sectorsize_bits = ilog2(sectorsize);
3082         fs_info->csum_size = btrfs_super_csum_size(disk_super);
3083         fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
3084         fs_info->stripesize = stripesize;
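        /*
         * Worked numbers (illustrative, not in the original source): with
         * nodesize 16K, sectorsize 4K and nr_cpu_ids == 8, the batches above
         * are dirty_metadata_batch = 16K * (1 + 3) = 64K and delalloc_batch
         * = 4K * 512 * (1 + 3) = 8M; with crc32c (csum_size == 4),
         * csums_per_leaf works out to roughly 4000, so a single leaf can
         * index checksums for about 16M of data.
         */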
3085
3086         /*
3087          * mixed block groups end up with duplicate but slightly offset
3088          * extent buffers for the same range, which leads to corruption
3089          */
3090         if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
3091             (sectorsize != nodesize)) {
3092                 btrfs_err(fs_info,
3093 "unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
3094                         nodesize, sectorsize);
3095                 goto fail_alloc;
3096         }
3097
3098         /*
3099          * No need to take the lock because there is no other task that
3100          * could update the flag at this point.
3101          */
3102         btrfs_set_super_incompat_flags(disk_super, features);
3103
3104         features = btrfs_super_compat_ro_flags(disk_super) &
3105                 ~BTRFS_FEATURE_COMPAT_RO_SUPP;
3106         if (!sb_rdonly(sb) && features) {
3107                 btrfs_err(fs_info,
3108         "cannot mount read-write because of unsupported optional features (%llx)",
3109                        features);
3110                 err = -EINVAL;
3111                 goto fail_alloc;
3112         }
3113
3114         ret = btrfs_init_workqueues(fs_info, fs_devices);
3115         if (ret) {
3116                 err = ret;
3117                 goto fail_sb_buffer;
3118         }
3119
3120         sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
3121         sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
3122
3123         sb->s_blocksize = sectorsize;
3124         sb->s_blocksize_bits = blksize_bits(sectorsize);
3125         memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE);
3126
3127         mutex_lock(&fs_info->chunk_mutex);
3128         ret = btrfs_read_sys_array(fs_info);
3129         mutex_unlock(&fs_info->chunk_mutex);
3130         if (ret) {
3131                 btrfs_err(fs_info, "failed to read the system array: %d", ret);
3132                 goto fail_sb_buffer;
3133         }
3134
3135         generation = btrfs_super_chunk_root_generation(disk_super);
3136         level = btrfs_super_chunk_root_level(disk_super);
3137
3138         chunk_root->node = read_tree_block(fs_info,
3139                                            btrfs_super_chunk_root(disk_super),
3140                                            generation, level, NULL);
3141         if (IS_ERR(chunk_root->node) ||
3142             !extent_buffer_uptodate(chunk_root->node)) {
3143                 btrfs_err(fs_info, "failed to read chunk root");
3144                 if (!IS_ERR(chunk_root->node))
3145                         free_extent_buffer(chunk_root->node);
3146                 chunk_root->node = NULL;
3147                 goto fail_tree_roots;
3148         }
3149         btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
3150         chunk_root->commit_root = btrfs_root_node(chunk_root);
3151
3152         read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
3153                            offsetof(struct btrfs_header, chunk_tree_uuid),
3154                            BTRFS_UUID_SIZE);
3155
3156         ret = btrfs_read_chunk_tree(fs_info);
3157         if (ret) {
3158                 btrfs_err(fs_info, "failed to read chunk tree: %d", ret);
3159                 goto fail_tree_roots;
3160         }
3161
3162         /*
3163          * Keep the devid that is marked to be the target device for the
3164          * device replace procedure
3165          */
3166         btrfs_free_extra_devids(fs_devices, 0);
3167
3168         if (!fs_devices->latest_bdev) {
3169                 btrfs_err(fs_info, "failed to read devices");
3170                 goto fail_tree_roots;
3171         }
3172
3173         ret = init_tree_roots(fs_info);
3174         if (ret)
3175                 goto fail_tree_roots;
3176
3177         /*
3178          * If we have a uuid root and we're not being told to rescan we need to
3179          * check the generation here so we can set the
3180          * BTRFS_FS_UPDATE_UUID_TREE_GEN bit.  Otherwise we could commit the
3181          * transaction during a balance or the log replay without updating the
3182          * uuid generation, and then if we crash we would rescan the uuid tree,
3183          * even though it was perfectly fine.
3184          */
3185         if (fs_info->uuid_root && !btrfs_test_opt(fs_info, RESCAN_UUID_TREE) &&
3186             fs_info->generation == btrfs_super_uuid_tree_generation(disk_super))
3187                 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
3188
3189         ret = btrfs_verify_dev_extents(fs_info);
3190         if (ret) {
3191                 btrfs_err(fs_info,
3192                           "failed to verify dev extents against chunks: %d",
3193                           ret);
3194                 goto fail_block_groups;
3195         }
3196         ret = btrfs_recover_balance(fs_info);
3197         if (ret) {
3198                 btrfs_err(fs_info, "failed to recover balance: %d", ret);
3199                 goto fail_block_groups;
3200         }
3201
3202         ret = btrfs_init_dev_stats(fs_info);
3203         if (ret) {
3204                 btrfs_err(fs_info, "failed to init dev_stats: %d", ret);
3205                 goto fail_block_groups;
3206         }
3207
3208         ret = btrfs_init_dev_replace(fs_info);
3209         if (ret) {
3210                 btrfs_err(fs_info, "failed to init dev_replace: %d", ret);
3211                 goto fail_block_groups;
3212         }
3213
3214         btrfs_free_extra_devids(fs_devices, 1);
3215
3216         ret = btrfs_sysfs_add_fsid(fs_devices);
3217         if (ret) {
3218                 btrfs_err(fs_info, "failed to init sysfs fsid interface: %d",
3219                                 ret);
3220                 goto fail_block_groups;
3221         }
3222
3223         ret = btrfs_sysfs_add_mounted(fs_info);
3224         if (ret) {
3225                 btrfs_err(fs_info, "failed to init sysfs interface: %d", ret);
3226                 goto fail_fsdev_sysfs;
3227         }
3228
3229         ret = btrfs_init_space_info(fs_info);
3230         if (ret) {
3231                 btrfs_err(fs_info, "failed to initialize space info: %d", ret);
3232                 goto fail_sysfs;
3233         }
3234
3235         ret = btrfs_read_block_groups(fs_info);
3236         if (ret) {
3237                 btrfs_err(fs_info, "failed to read block groups: %d", ret);
3238                 goto fail_sysfs;
3239         }
3240
3241         if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info, NULL)) {
3242                 btrfs_warn(fs_info,
3243                 "writable mount is not allowed due to too many missing devices");
3244                 goto fail_sysfs;
3245         }
3246
3247         fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
3248                                                "btrfs-cleaner");
3249         if (IS_ERR(fs_info->cleaner_kthread))
3250                 goto fail_sysfs;
3251
3252         fs_info->transaction_kthread = kthread_run(transaction_kthread,
3253                                                    tree_root,
3254                                                    "btrfs-transaction");
3255         if (IS_ERR(fs_info->transaction_kthread))
3256                 goto fail_cleaner;
3257
3258         if (!btrfs_test_opt(fs_info, NOSSD) &&
3259             !fs_info->fs_devices->rotating) {
3260                 btrfs_set_and_info(fs_info, SSD, "enabling ssd optimizations");
3261         }
3262
3263         /*
3264          * Mount does not set all options immediately; we can do it now and
3265          * do not have to wait for a transaction commit
3266          */
3267         btrfs_apply_pending_changes(fs_info);
3268
3269 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
3270         if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) {
3271                 ret = btrfsic_mount(fs_info, fs_devices,
3272                                     btrfs_test_opt(fs_info,
3273                                         CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ?
3274                                     1 : 0,
3275                                     fs_info->check_integrity_print_mask);
3276                 if (ret)
3277                         btrfs_warn(fs_info,
3278                                 "failed to initialize integrity check module: %d",
3279                                 ret);
3280         }
3281 #endif
3282         ret = btrfs_read_qgroup_config(fs_info);
3283         if (ret)
3284                 goto fail_trans_kthread;
3285
3286         if (btrfs_build_ref_tree(fs_info))
3287                 btrfs_err(fs_info, "couldn't build ref tree");
3288
3289         /* Do not make disk changes in a broken FS or when nologreplay is given */
3290         if (btrfs_super_log_root(disk_super) != 0 &&
3291             !btrfs_test_opt(fs_info, NOLOGREPLAY)) {
3292                 btrfs_info(fs_info, "start tree-log replay");
3293                 ret = btrfs_replay_log(fs_info, fs_devices);
3294                 if (ret) {
3295                         err = ret;
3296                         goto fail_qgroup;
3297                 }
3298         }
3299
3300         ret = btrfs_find_orphan_roots(fs_info);
3301         if (ret)
3302                 goto fail_qgroup;
3303
3304         if (!sb_rdonly(sb)) {
3305                 ret = btrfs_cleanup_fs_roots(fs_info);
3306                 if (ret)
3307                         goto fail_qgroup;
3308
3309                 mutex_lock(&fs_info->cleaner_mutex);
3310                 ret = btrfs_recover_relocation(tree_root);
3311                 mutex_unlock(&fs_info->cleaner_mutex);
3312                 if (ret < 0) {
3313                         btrfs_warn(fs_info, "failed to recover relocation: %d",
3314                                         ret);
3315                         err = -EINVAL;
3316                         goto fail_qgroup;
3317                 }
3318         }
3319
3320         fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true);
3321         if (IS_ERR(fs_info->fs_root)) {
3322                 err = PTR_ERR(fs_info->fs_root);
3323                 btrfs_warn(fs_info, "failed to read fs tree: %d", err);
3324                 fs_info->fs_root = NULL;
3325                 goto fail_qgroup;
3326         }
3327
3328         if (sb_rdonly(sb))
3329                 return 0;
3330
3331         if (btrfs_test_opt(fs_info, CLEAR_CACHE) &&
3332             btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
3333                 clear_free_space_tree = 1;
3334         } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
3335                    !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) {
3336                 btrfs_warn(fs_info, "free space tree is invalid");
3337                 clear_free_space_tree = 1;
3338         }
3339
3340         if (clear_free_space_tree) {
3341                 btrfs_info(fs_info, "clearing free space tree");
3342                 ret = btrfs_clear_free_space_tree(fs_info);
3343                 if (ret) {
3344                         btrfs_warn(fs_info,
3345                                    "failed to clear free space tree: %d", ret);
3346                         close_ctree(fs_info);
3347                         return ret;
3348                 }
3349         }
3350
3351         if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) &&
3352             !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
3353                 btrfs_info(fs_info, "creating free space tree");
3354                 ret = btrfs_create_free_space_tree(fs_info);
3355                 if (ret) {
3356                         btrfs_warn(fs_info,
3357                                 "failed to create free space tree: %d", ret);
3358                         close_ctree(fs_info);
3359                         return ret;
3360                 }
3361         }
3362
3363         down_read(&fs_info->cleanup_work_sem);
3364         if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
3365             (ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
3366                 up_read(&fs_info->cleanup_work_sem);
3367                 close_ctree(fs_info);
3368                 return ret;
3369         }
3370         up_read(&fs_info->cleanup_work_sem);
3371
3372         ret = btrfs_resume_balance_async(fs_info);
3373         if (ret) {
3374                 btrfs_warn(fs_info, "failed to resume balance: %d", ret);
3375                 close_ctree(fs_info);
3376                 return ret;
3377         }
3378
3379         ret = btrfs_resume_dev_replace_async(fs_info);
3380         if (ret) {
3381                 btrfs_warn(fs_info, "failed to resume device replace: %d", ret);
3382                 close_ctree(fs_info);
3383                 return ret;
3384         }
3385
3386         btrfs_qgroup_rescan_resume(fs_info);
3387         btrfs_discard_resume(fs_info);
3388
3389         if (!fs_info->uuid_root) {
3390                 btrfs_info(fs_info, "creating UUID tree");
3391                 ret = btrfs_create_uuid_tree(fs_info);
3392                 if (ret) {
3393                         btrfs_warn(fs_info,
3394                                 "failed to create the UUID tree: %d", ret);
3395                         close_ctree(fs_info);
3396                         return ret;
3397                 }
3398         } else if (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) ||
3399                    fs_info->generation !=
3400                                 btrfs_super_uuid_tree_generation(disk_super)) {
3401                 btrfs_info(fs_info, "checking UUID tree");
3402                 ret = btrfs_check_uuid_tree(fs_info);
3403                 if (ret) {
3404                         btrfs_warn(fs_info,
3405                                 "failed to check the UUID tree: %d", ret);
3406                         close_ctree(fs_info);
3407                         return ret;
3408                 }
3409         }
3410         set_bit(BTRFS_FS_OPEN, &fs_info->flags);
3411
3412         return 0;
3413
3414 fail_qgroup:
3415         btrfs_free_qgroup_config(fs_info);
3416 fail_trans_kthread:
3417         kthread_stop(fs_info->transaction_kthread);
3418         btrfs_cleanup_transaction(fs_info);
3419         btrfs_free_fs_roots(fs_info);
3420 fail_cleaner:
3421         kthread_stop(fs_info->cleaner_kthread);
3422
3423         /*
3424          * make sure we're done with the btree inode before we stop our
3425          * kthreads
3426          */
3427         filemap_write_and_wait(fs_info->btree_inode->i_mapping);
3428
3429 fail_sysfs:
3430         btrfs_sysfs_remove_mounted(fs_info);
3431
3432 fail_fsdev_sysfs:
3433         btrfs_sysfs_remove_fsid(fs_info->fs_devices);
3434
3435 fail_block_groups:
3436         btrfs_put_block_group_cache(fs_info);
3437
3438 fail_tree_roots:
3439         if (fs_info->data_reloc_root)
3440                 btrfs_drop_and_free_fs_root(fs_info, fs_info->data_reloc_root);
3441         free_root_pointers(fs_info, true);
3442         invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
3443
3444 fail_sb_buffer:
3445         btrfs_stop_all_workers(fs_info);
3446         btrfs_free_block_groups(fs_info);
3447 fail_alloc:
3448         btrfs_mapping_tree_free(&fs_info->mapping_tree);
3449
3450         iput(fs_info->btree_inode);
3451 fail:
3452         btrfs_close_devices(fs_info->fs_devices);
3453         return err;
3454 }
3455 ALLOW_ERROR_INJECTION(open_ctree, ERRNO);
3456
3457 static void btrfs_end_super_write(struct bio *bio)
3458 {
3459         struct btrfs_device *device = bio->bi_private;
3460         struct bio_vec *bvec;
3461         struct bvec_iter_all iter_all;
3462         struct page *page;
3463
3464         bio_for_each_segment_all(bvec, bio, iter_all) {
3465                 page = bvec->bv_page;
3466
3467                 if (bio->bi_status) {
3468                         btrfs_warn_rl_in_rcu(device->fs_info,
3469                                 "lost page write due to IO error on %s (%d)",
3470                                 rcu_str_deref(device->name),
3471                                 blk_status_to_errno(bio->bi_status));
3472                         ClearPageUptodate(page);
3473                         SetPageError(page);
3474                         btrfs_dev_stat_inc_and_print(device,
3475                                                      BTRFS_DEV_STAT_WRITE_ERRS);
3476                 } else {
3477                         SetPageUptodate(page);
3478                 }
3479
3480                 put_page(page);
3481                 unlock_page(page);
3482         }
3483
3484         bio_put(bio);
3485 }
3486
3487 struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
3488                                                    int copy_num)
3489 {
3490         struct btrfs_super_block *super;
3491         struct page *page;
3492         u64 bytenr;
3493         struct address_space *mapping = bdev->bd_inode->i_mapping;
3494
3495         bytenr = btrfs_sb_offset(copy_num);
3496         if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
3497                 return ERR_PTR(-EINVAL);
3498
3499         page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
3500         if (IS_ERR(page))
3501                 return ERR_CAST(page);
3502
3503         super = page_address(page);
3504         if (btrfs_super_magic(super) != BTRFS_MAGIC) {
3505                 btrfs_release_disk_super(super);
3506                 return ERR_PTR(-ENODATA);
3507         }
3508
3509         if (btrfs_super_bytenr(super) != bytenr) {
3510                 btrfs_release_disk_super(super);
3511                 return ERR_PTR(-EINVAL);
3512         }
3513
3514         return super;
3515 }
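/*
 * Note (illustrative, not in the original source): btrfs_sb_offset() returns
 * the fixed superblock copy offsets 64K, 64M and 256G, so for copy_num == 0
 * the read above targets page index 64K >> PAGE_SHIFT == 16 with 4K pages,
 * and the i_size check simply rejects copies that lie beyond the device end.
 */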
3516
3517
3518 struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev)
3519 {
3520         struct btrfs_super_block *super, *latest = NULL;
3521         int i;
3522         u64 transid = 0;
3523
3524         /* we would like to check all the supers, but that would make
3525          * a btrfs mount succeed after a mkfs from a different FS.
3526          * So we would need a special mount option to scan for
3527          * later supers, using BTRFS_SUPER_MIRROR_MAX instead.
3528          */
3529         for (i = 0; i < 1; i++) {
3530                 super = btrfs_read_dev_one_super(bdev, i);
3531                 if (IS_ERR(super))
3532                         continue;
3533
3534                 if (!latest || btrfs_super_generation(super) > transid) {
3535                         if (latest)
3536                                 btrfs_release_disk_super(super);
3537
3538                         latest = super;
3539                         transid = btrfs_super_generation(super);
3540                 }
3541         }
3542
3543         return super;
3544 }
3545
3546 /*
3547  * Write superblock @sb to the @device. Do not wait for completion, all the
3548  * pages we use for writing are locked.
3549  *
3550  * Write @max_mirrors copies of the superblock, where 0 means the default of
3551  * all copies that fit the expected device size at commit time. Note that
3552  * max_mirrors must be the same for the write and wait phases.
3553  *
3554  * Return the number of errors when a page is not found or submission fails.
3555  */
3556 static int write_dev_supers(struct btrfs_device *device,
3557                             struct btrfs_super_block *sb, int max_mirrors)
3558 {
3559         struct btrfs_fs_info *fs_info = device->fs_info;
3560         struct address_space *mapping = device->bdev->bd_inode->i_mapping;
3561         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
3562         int i;
3563         int errors = 0;
3564         u64 bytenr;
3565
3566         if (max_mirrors == 0)
3567                 max_mirrors = BTRFS_SUPER_MIRROR_MAX;
3568
3569         shash->tfm = fs_info->csum_shash;
3570
3571         for (i = 0; i < max_mirrors; i++) {
3572                 struct page *page;
3573                 struct bio *bio;
3574                 struct btrfs_super_block *disk_super;
3575
3576                 bytenr = btrfs_sb_offset(i);
3577                 if (bytenr + BTRFS_SUPER_INFO_SIZE >=
3578                     device->commit_total_bytes)
3579                         break;
3580
3581                 btrfs_set_super_bytenr(sb, bytenr);
3582
3583                 crypto_shash_digest(shash, (const char *)sb + BTRFS_CSUM_SIZE,
3584                                     BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
3585                                     sb->csum);
3586
3587                 page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT,
3588                                            GFP_NOFS);
3589                 if (!page) {
3590                         btrfs_err(device->fs_info,
3591                             "couldn't get super block page for bytenr %llu",
3592                             bytenr);
3593                         errors++;
3594                         continue;
3595                 }
3596
3597                 /* Bump the refcount for wait_dev_supers() */
3598                 get_page(page);
3599
3600                 disk_super = page_address(page);
3601                 memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE);
3602
3603                 /*
3604                  * Directly use bios here instead of relying on the page cache
3605                  * to do I/O, so we don't lose the ability to do integrity
3606                  * checking.
3607                  */
3608                 bio = bio_alloc(GFP_NOFS, 1);
3609                 bio_set_dev(bio, device->bdev);
3610                 bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
3611                 bio->bi_private = device;
3612                 bio->bi_end_io = btrfs_end_super_write;
3613                 __bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE,
3614                                offset_in_page(bytenr));
3615
3616                 /*
3617                  * We FUA only the first super block.  The others we allow to
3618                  * go down lazily and there's a short window where the on-disk
3619                  * copies might still contain the older version.
3620                  */
3621                 bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO;
3622                 if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
3623                         bio->bi_opf |= REQ_FUA;
3624
3625                 btrfsic_submit_bio(bio);
3626         }
3627         return errors < i ? 0 : -1;
3628 }
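/*
 * Usage sketch (illustrative): write_all_supers() below pairs the two phases
 * per device, with the same max_mirrors in both calls:
 *
 *      ret = write_dev_supers(dev, sb, max_mirrors);
 *      ...
 *      ret = wait_dev_supers(dev, max_mirrors);
 *
 * wait_dev_supers() re-derives the same btrfs_sb_offset() page indexes to
 * find the pages submitted here.
 */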
3629
3630 /*
3631  * Wait for write completion of the superblocks submitted by
3632  * write_dev_supers(); @max_mirrors must be the same as in the write phase.
3633  *
3634  * Return the number of errors when a page is not found or not marked up to
3635  * date.
3636  */
3637 static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
3638 {
3639         int i;
3640         int errors = 0;
3641         bool primary_failed = false;
3642         u64 bytenr;
3643
3644         if (max_mirrors == 0)
3645                 max_mirrors = BTRFS_SUPER_MIRROR_MAX;
3646
3647         for (i = 0; i < max_mirrors; i++) {
3648                 struct page *page;
3649
3650                 bytenr = btrfs_sb_offset(i);
3651                 if (bytenr + BTRFS_SUPER_INFO_SIZE >=
3652                     device->commit_total_bytes)
3653                         break;
3654
3655                 page = find_get_page(device->bdev->bd_inode->i_mapping,
3656                                      bytenr >> PAGE_SHIFT);
3657                 if (!page) {
3658                         errors++;
3659                         if (i == 0)
3660                                 primary_failed = true;
3661                         continue;
3662                 }
3663                 /* Page is submitted locked and unlocked once the IO completes */
3664                 wait_on_page_locked(page);
3665                 if (PageError(page)) {
3666                         errors++;
3667                         if (i == 0)
3668                                 primary_failed = true;
3669                 }
3670
3671                 /* Drop our reference */
3672                 put_page(page);
3673
3674                 /* Drop the reference from the writing run */
3675                 put_page(page);
3676         }
3677
3678         /* log error, force error return */
3679         if (primary_failed) {
3680                 btrfs_err(device->fs_info, "error writing primary super block to device %llu",
3681                           device->devid);
3682                 return -1;
3683         }
3684
3685         return errors < i ? 0 : -1;
3686 }
3687
3688 /*
3689  * endio for write_dev_flush; this will wake anyone waiting
3690  * for the barrier when it is done
3691  */
3692 static void btrfs_end_empty_barrier(struct bio *bio)
3693 {
3694         complete(bio->bi_private);
3695 }
3696
3697 /*
3698  * Submit a flush request to the device if it supports it. Error handling is
3699  * done in the waiting counterpart.
3700  */
3701 static void write_dev_flush(struct btrfs_device *device)
3702 {
3703         struct request_queue *q = bdev_get_queue(device->bdev);
3704         struct bio *bio = device->flush_bio;
3705
3706         if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags))
3707                 return;
3708
3709         bio_reset(bio);
3710         bio->bi_end_io = btrfs_end_empty_barrier;
3711         bio_set_dev(bio, device->bdev);
3712         bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
3713         init_completion(&device->flush_wait);
3714         bio->bi_private = &device->flush_wait;
3715
3716         btrfsic_submit_bio(bio);
3717         set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
3718 }
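/*
 * Note (illustrative): the bio above carries no data; an empty REQ_OP_WRITE
 * with REQ_PREFLUSH is the block layer's idiom for a pure cache flush. The
 * completion is only reaped in wait_dev_flush(), so flushes to multiple
 * devices can be in flight at the same time.
 */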
3719
3720 /*
3721  * If the flush bio has been submitted by write_dev_flush, wait for it.
3722  */
3723 static blk_status_t wait_dev_flush(struct btrfs_device *device)
3724 {
3725         struct bio *bio = device->flush_bio;
3726
3727         if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state))
3728                 return BLK_STS_OK;
3729
3730         clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
3731         wait_for_completion_io(&device->flush_wait);
3732
3733         return bio->bi_status;
3734 }
3735
3736 static int check_barrier_error(struct btrfs_fs_info *fs_info)
3737 {
3738         if (!btrfs_check_rw_degradable(fs_info, NULL))
3739                 return -EIO;
3740         return 0;
3741 }
3742
3743 /*
3744  * send an empty flush down to each device in parallel,
3745  * then wait for them
3746  */
3747 static int barrier_all_devices(struct btrfs_fs_info *info)
3748 {
3749         struct list_head *head;
3750         struct btrfs_device *dev;
3751         int errors_wait = 0;
3752         blk_status_t ret;
3753
3754         lockdep_assert_held(&info->fs_devices->device_list_mutex);
3755         /* send down all the barriers */
3756         head = &info->fs_devices->devices;
3757         list_for_each_entry(dev, head, dev_list) {
3758                 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
3759                         continue;
3760                 if (!dev->bdev)
3761                         continue;
3762                 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
3763                     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
3764                         continue;
3765
3766                 write_dev_flush(dev);
3767                 dev->last_flush_error = BLK_STS_OK;
3768         }
3769
3770         /* wait for all the barriers */
3771         list_for_each_entry(dev, head, dev_list) {
3772                 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
3773                         continue;
3774                 if (!dev->bdev) {
3775                         errors_wait++;
3776                         continue;
3777                 }
3778                 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
3779                     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
3780                         continue;
3781
3782                 ret = wait_dev_flush(dev);
3783                 if (ret) {
3784                         dev->last_flush_error = ret;
3785                         btrfs_dev_stat_inc_and_print(dev,
3786                                         BTRFS_DEV_STAT_FLUSH_ERRS);
3787                         errors_wait++;
3788                 }
3789         }
3790
3791         if (errors_wait) {
3792                 /*
3793                  * We need the status of all disks in order to arrive at
3794                  * the overall volume status, so error checking is pushed
3795                  * to a separate loop.
3796                  */
3797                 return check_barrier_error(info);
3798         }
3799         return 0;
3800 }
3801
3802 int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
3803 {
3804         int raid_type;
3805         int min_tolerated = INT_MAX;
3806
3807         if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 ||
3808             (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE))
3809                 min_tolerated = min_t(int, min_tolerated,
3810                                     btrfs_raid_array[BTRFS_RAID_SINGLE].
3811                                     tolerated_failures);
3812
3813         for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
3814                 if (raid_type == BTRFS_RAID_SINGLE)
3815                         continue;
3816                 if (!(flags & btrfs_raid_array[raid_type].bg_flag))
3817                         continue;
3818                 min_tolerated = min_t(int, min_tolerated,
3819                                     btrfs_raid_array[raid_type].
3820                                     tolerated_failures);
3821         }
3822
3823         if (min_tolerated == INT_MAX) {
3824                 pr_warn("BTRFS: unknown raid flag: %llu", flags);
3825                 min_tolerated = 0;
3826         }
3827
3828         return min_tolerated;
3829 }
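/*
 * Worked example (illustrative, not in the original source): for a flags
 * mask containing RAID1 (tolerates 1 failure) and RAID0 (tolerates 0), the
 * loop above returns min(1, 0) == 0, i.e. no device may be missing when
 * both profiles are in use for the given flags.
 */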
3830
3831 int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
3832 {
3833         struct list_head *head;
3834         struct btrfs_device *dev;
3835         struct btrfs_super_block *sb;
3836         struct btrfs_dev_item *dev_item;
3837         int ret;
3838         int do_barriers;
3839         int max_errors;
3840         int total_errors = 0;
3841         u64 flags;
3842
3843         do_barriers = !btrfs_test_opt(fs_info, NOBARRIER);
3844
3845         /*
3846          * max_mirrors == 0 indicates we're from commit_transaction,
3847          * not from fsync where the tree roots in fs_info have not
3848          * been consistent on disk.
3849          */
3850         if (max_mirrors == 0)
3851                 backup_super_roots(fs_info);
3852
3853         sb = fs_info->super_for_commit;
3854         dev_item = &sb->dev_item;
3855
3856         mutex_lock(&fs_info->fs_devices->device_list_mutex);
3857         head = &fs_info->fs_devices->devices;
3858         max_errors = btrfs_super_num_devices(fs_info->super_copy) - 1;
3859
3860         if (do_barriers) {
3861                 ret = barrier_all_devices(fs_info);
3862                 if (ret) {
3863                         mutex_unlock(
3864                                 &fs_info->fs_devices->device_list_mutex);
3865                         btrfs_handle_fs_error(fs_info, ret,
3866                                               "errors while submitting device barriers.");
3867                         return ret;
3868                 }
3869         }
3870
3871         list_for_each_entry(dev, head, dev_list) {
3872                 if (!dev->bdev) {
3873                         total_errors++;
3874                         continue;
3875                 }
3876                 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
3877                     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
3878                         continue;
3879
3880                 btrfs_set_stack_device_generation(dev_item, 0);
3881                 btrfs_set_stack_device_type(dev_item, dev->type);
3882                 btrfs_set_stack_device_id(dev_item, dev->devid);
3883                 btrfs_set_stack_device_total_bytes(dev_item,
3884                                                    dev->commit_total_bytes);
3885                 btrfs_set_stack_device_bytes_used(dev_item,
3886                                                   dev->commit_bytes_used);
3887                 btrfs_set_stack_device_io_align(dev_item, dev->io_align);
3888                 btrfs_set_stack_device_io_width(dev_item, dev->io_width);
3889                 btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
3890                 memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
3891                 memcpy(dev_item->fsid, dev->fs_devices->metadata_uuid,
3892                        BTRFS_FSID_SIZE);
3893
3894                 flags = btrfs_super_flags(sb);
3895                 btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
3896
3897                 ret = btrfs_validate_write_super(fs_info, sb);
3898                 if (ret < 0) {
3899                         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3900                         btrfs_handle_fs_error(fs_info, -EUCLEAN,
3901                                 "unexpected superblock corruption detected");
3902                         return -EUCLEAN;
3903                 }
3904
3905                 ret = write_dev_supers(dev, sb, max_mirrors);
3906                 if (ret)
3907                         total_errors++;
3908         }
3909         if (total_errors > max_errors) {
3910                 btrfs_err(fs_info, "%d errors while writing supers",
3911                           total_errors);
3912                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3913
3914                 /* FUA is masked off if unsupported and can't be the reason */
3915                 btrfs_handle_fs_error(fs_info, -EIO,
3916                                       "%d errors while writing supers",
3917                                       total_errors);
3918                 return -EIO;
3919         }
3920
3921         total_errors = 0;
3922         list_for_each_entry(dev, head, dev_list) {
3923                 if (!dev->bdev)
3924                         continue;
3925                 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
3926                     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
3927                         continue;
3928
3929                 ret = wait_dev_supers(dev, max_mirrors);
3930                 if (ret)
3931                         total_errors++;
3932         }
3933         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3934         if (total_errors > max_errors) {
3935                 btrfs_handle_fs_error(fs_info, -EIO,
3936                                       "%d errors while writing supers",
3937                                       total_errors);
3938                 return -EIO;
3939         }
3940         return 0;
3941 }
3942
3943 /* Drop a fs root from the radix tree and free it. */
3944 void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
3945                                   struct btrfs_root *root)
3946 {
3947         bool drop_ref = false;
3948
3949         spin_lock(&fs_info->fs_roots_radix_lock);
3950         radix_tree_delete(&fs_info->fs_roots_radix,
3951                           (unsigned long)root->root_key.objectid);
3952         if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state))
3953                 drop_ref = true;
3954         spin_unlock(&fs_info->fs_roots_radix_lock);
3955
3956         if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
3957                 ASSERT(root->log_root == NULL);
3958                 if (root->reloc_root) {
3959                         btrfs_put_root(root->reloc_root);
3960                         root->reloc_root = NULL;
3961                 }
3962         }
3963
3964         if (root->free_ino_pinned)
3965                 __btrfs_remove_free_space_cache(root->free_ino_pinned);
3966         if (root->free_ino_ctl)
3967                 __btrfs_remove_free_space_cache(root->free_ino_ctl);
3968         if (root->ino_cache_inode) {
3969                 iput(root->ino_cache_inode);
3970                 root->ino_cache_inode = NULL;
3971         }
3972         if (drop_ref)
3973                 btrfs_put_root(root);
3974 }
3975
3976 int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
3977 {
3978         u64 root_objectid = 0;
3979         struct btrfs_root *gang[8];
3980         int i = 0;
3981         int err = 0;
3982         unsigned int ret = 0;
3983
3984         while (1) {
3985                 spin_lock(&fs_info->fs_roots_radix_lock);
3986                 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
3987                                              (void **)gang, root_objectid,
3988                                              ARRAY_SIZE(gang));
3989                 if (!ret) {
3990                         spin_unlock(&fs_info->fs_roots_radix_lock);
3991                         break;
3992                 }
3993                 root_objectid = gang[ret - 1]->root_key.objectid + 1;
3994
3995                 for (i = 0; i < ret; i++) {
3996                         /* Avoid grabbing roots in dead_roots */
3997                         if (btrfs_root_refs(&gang[i]->root_item) == 0) {
3998                                 gang[i] = NULL;
3999                                 continue;
4000                         }
4001                         /* Grab all the search results for later use */
4002                         gang[i] = btrfs_grab_root(gang[i]);
4003                 }
4004                 spin_unlock(&fs_info->fs_roots_radix_lock);
4005
4006                 for (i = 0; i < ret; i++) {
4007                         if (!gang[i])
4008                                 continue;
4009                         root_objectid = gang[i]->root_key.objectid;
4010                         err = btrfs_orphan_cleanup(gang[i]);
4011                         if (err)
4012                                 break;
4013                         btrfs_put_root(gang[i]);
4014                 }
4015                 root_objectid++;
4016         }
4017
4018         /* release the uncleaned roots due to error */
4019         for (; i < ret; i++) {
4020                 if (gang[i])
4021                         btrfs_put_root(gang[i]);
4022         }
4023         return err;
4024 }
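/*
 * Pattern sketch (illustrative): the loop above is the common radix-tree
 * pagination idiom: look up at most ARRAY_SIZE(gang) roots starting at
 * root_objectid, take references under the lock, drop the lock for the
 * actual work (orphan cleanup), then restart the lookup just past the last
 * objectid processed.
 */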
4025
4026 int btrfs_commit_super(struct btrfs_fs_info *fs_info)
4027 {
4028         struct btrfs_root *root = fs_info->tree_root;
4029         struct btrfs_trans_handle *trans;
4030
4031         mutex_lock(&fs_info->cleaner_mutex);
4032         btrfs_run_delayed_iputs(fs_info);
4033         mutex_unlock(&fs_info->cleaner_mutex);
4034         wake_up_process(fs_info->cleaner_kthread);
4035
4036         /* wait until ongoing cleanup work is done */
4037         down_write(&fs_info->cleanup_work_sem);
4038         up_write(&fs_info->cleanup_work_sem);
4039
4040         trans = btrfs_join_transaction(root);
4041         if (IS_ERR(trans))
4042                 return PTR_ERR(trans);
4043         return btrfs_commit_transaction(trans);
4044 }
4045
4046 void __cold close_ctree(struct btrfs_fs_info *fs_info)
4047 {
4048         int ret;
4049
4050         set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);
4051         /*
4052          * We don't want the cleaner to start new transactions, add more delayed
4053          * iputs, etc. while we're closing. We can't use kthread_stop() yet
4054          * because that frees the task_struct, and the transaction kthread might
4055          * still try to wake up the cleaner.
4056          */
4057         kthread_park(fs_info->cleaner_kthread);
4058
4059         /* wait for the qgroup rescan worker to stop */
4060         btrfs_qgroup_wait_for_completion(fs_info, false);
4061
4062         /* wait for the uuid_scan task to finish */
4063         down(&fs_info->uuid_tree_rescan_sem);
4064         /* avoid complaints from lockdep et al., set sem back to initial state */
4065         up(&fs_info->uuid_tree_rescan_sem);
4066
4067         /* pause restriper - we want to resume on mount */
4068         btrfs_pause_balance(fs_info);
4069
4070         btrfs_dev_replace_suspend_for_unmount(fs_info);
4071
4072         btrfs_scrub_cancel(fs_info);
4073
4074         /* wait for any defraggers to finish */
4075         wait_event(fs_info->transaction_wait,
4076                    (atomic_read(&fs_info->defrag_running) == 0));
4077
4078         /* clear out the rbtree of defraggable inodes */
4079         btrfs_cleanup_defrag_inodes(fs_info);
4080
4081         cancel_work_sync(&fs_info->async_reclaim_work);
4082         cancel_work_sync(&fs_info->async_data_reclaim_work);
4083
4084         /* Cancel or finish ongoing discard work */
4085         btrfs_discard_cleanup(fs_info);
4086
4087         if (!sb_rdonly(fs_info->sb)) {
4088                 /*
4089                  * The cleaner kthread is stopped, so do one final pass over
4090                  * unused block groups.
4091                  */
4092                 btrfs_delete_unused_bgs(fs_info);
4093
4094                 /*
4095                  * There might be existing delayed inode workers still running
4096                  * and holding an empty delayed inode item. We must wait for
4097                  * them to complete first because they can create a transaction.
4098                  * This happens when someone calls btrfs_balance_delayed_items()
4099                  * and then a transaction commit runs the same delayed nodes
4100                  * before any delayed worker has done something with the nodes.
4101                  * We must wait for any worker here and not at transaction
4102                  * commit time since that could cause a deadlock.
4103                  * This is a very rare case.
4104                  */
4105                 btrfs_flush_workqueue(fs_info->delayed_workers);
4106
4107                 ret = btrfs_commit_super(fs_info);
4108                 if (ret)
4109                         btrfs_err(fs_info, "commit super ret %d", ret);
4110         }
4111
4112         if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state) ||
4113             test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state))
4114                 btrfs_error_commit_super(fs_info);
4115
4116         kthread_stop(fs_info->transaction_kthread);
4117         kthread_stop(fs_info->cleaner_kthread);
4118
4119         ASSERT(list_empty(&fs_info->delayed_iputs));
4120         set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);
4121
4122         if (btrfs_check_quota_leak(fs_info)) {
4123                 WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
4124                 btrfs_err(fs_info, "qgroup reserved space leaked");
4125         }
4126
4127         btrfs_free_qgroup_config(fs_info);
4128         ASSERT(list_empty(&fs_info->delalloc_roots));
4129
4130         if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
4131                 btrfs_info(fs_info, "at unmount delalloc count %lld",
4132                        percpu_counter_sum(&fs_info->delalloc_bytes));
4133         }
4134
4135         if (percpu_counter_sum(&fs_info->dio_bytes))
4136                 btrfs_info(fs_info, "at unmount dio bytes count %lld",
4137                            percpu_counter_sum(&fs_info->dio_bytes));
4138
4139         btrfs_sysfs_remove_mounted(fs_info);
4140         btrfs_sysfs_remove_fsid(fs_info->fs_devices);
4141
4142         btrfs_put_block_group_cache(fs_info);
4143
4144         /*
4145          * We must make sure there are no read requests left to submit
4146          * after we stop all the workers.
4147          */
4148         invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
4149         btrfs_stop_all_workers(fs_info);
4150
4151         clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
4152         free_root_pointers(fs_info, true);
4153         btrfs_free_fs_roots(fs_info);
4154
4155         /*
4156          * We must free the block groups after dropping the fs_roots as we could
4157          * have had an IO error and have left over tree log blocks that aren't
4158          * cleaned up until the fs roots are freed.  This makes the block group
4159          * accounting appear to be wrong because there's pending reserved bytes,
4160          * so make sure we do the block group cleanup afterwards.
4161          */
4162         btrfs_free_block_groups(fs_info);
4163
4164         iput(fs_info->btree_inode);
4165
4166 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
4167         if (btrfs_test_opt(fs_info, CHECK_INTEGRITY))
4168                 btrfsic_unmount(fs_info->fs_devices);
4169 #endif
4170
4171         btrfs_mapping_tree_free(&fs_info->mapping_tree);
4172         btrfs_close_devices(fs_info->fs_devices);
4173 }
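
/*
 * Illustrative sketch, not part of this file: the park-then-stop
 * ordering used by close_ctree().  kthread_park() quiesces a thread but
 * keeps its task_struct alive, so other code may still safely
 * wake_up_process() it; kthread_stop() frees the thread and therefore
 * must come last.  The worker below is hypothetical.
 */
static int example_kthread(void *arg)
{
        while (!kthread_should_stop()) {
                if (kthread_should_park())
                        kthread_parkme();       /* sleep here while parked */
                /* ... one unit of background work ... */
                schedule_timeout_interruptible(HZ);
        }
        return 0;
}

static void example_teardown(struct task_struct *task)
{
        kthread_park(task);     /* quiesce; task may still be woken safely */
        /* ... teardown that must not race with the worker ... */
        kthread_stop(task);     /* frees the task; no use of it after this */
}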
4174
4175 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
4176                           int atomic)
4177 {
4178         int ret;
4179         struct inode *btree_inode = buf->pages[0]->mapping->host;
4180
4181         ret = extent_buffer_uptodate(buf);
4182         if (!ret)
4183                 return ret;
4184
4185         ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
4186                                     parent_transid, atomic);
4187         if (ret == -EAGAIN)
4188                 return ret;
4189         return !ret;
4190 }
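
/*
 * Illustrative sketch (hypothetical caller): btrfs_buffer_uptodate() is
 * tri-state.  1 means the buffer is uptodate and the parent transid
 * matches, 0 means the buffer is not usable, and -EAGAIN tells an
 * atomic caller to retry from a context that is allowed to block.
 */
static int example_buffer_check(struct extent_buffer *buf, u64 transid)
{
        int ret = btrfs_buffer_uptodate(buf, transid, 1);

        if (ret == -EAGAIN)
                ret = btrfs_buffer_uptodate(buf, transid, 0); /* may block */
        return ret;
}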
4191
4192 void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
4193 {
4194         struct btrfs_fs_info *fs_info;
4195         struct btrfs_root *root;
4196         u64 transid = btrfs_header_generation(buf);
4197         int was_dirty;
4198
4199 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4200         /*
4201          * This is a fast path so only do this check if we have sanity tests
4202          * enabled.  Nothing outside of the sanity tests should be marking
4203          * unmapped buffers dirty.
4204          */
4205         if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
4206                 return;
4207 #endif
4208         root = BTRFS_I(buf->pages[0]->mapping->host)->root;
4209         fs_info = root->fs_info;
4210         btrfs_assert_tree_locked(buf);
4211         if (transid != fs_info->generation)
4212                 WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n",
4213                         buf->start, transid, fs_info->generation);
4214         was_dirty = set_extent_buffer_dirty(buf);
4215         if (!was_dirty)
4216                 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
4217                                          buf->len,
4218                                          fs_info->dirty_metadata_batch);
4219 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
4220         /*
4221          * btrfs_mark_buffer_dirty() can be called with the item pointer set
4222          * but the item data not yet updated, so only check the item
4223          * pointers here, not the item data.
4224          */
4225         if (btrfs_header_level(buf) == 0 &&
4226             btrfs_check_leaf_relaxed(buf)) {
4227                 btrfs_print_leaf(buf);
4228                 ASSERT(0);
4229         }
4230 #endif
4231 }
4232
4233 static void __btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info,
4234                                         int flush_delayed)
4235 {
4236         /*
4237          * It looks as though older kernels can get into trouble with this
4238          * code; they end up stuck in balance_dirty_pages() forever.
4239          */
4240         int ret;
4241
4242         if (current->flags & PF_MEMALLOC)
4243                 return;
4244
4245         if (flush_delayed)
4246                 btrfs_balance_delayed_items(fs_info);
4247
4248         ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
4249                                      BTRFS_DIRTY_METADATA_THRESH,
4250                                      fs_info->dirty_metadata_batch);
4251         if (ret > 0)
4252                 balance_dirty_pages_ratelimited(fs_info->btree_inode->i_mapping);
4254 }
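
/*
 * Illustrative sketch (hypothetical helper): __percpu_counter_compare()
 * first checks the cheap approximate count and only sums the per-cpu
 * deltas when the approximation lands within batch * num_online_cpus()
 * of the threshold, so the common case never touches every CPU.
 */
static bool example_over_thresh(struct percpu_counter *counter,
                                s64 thresh, s32 batch)
{
        /* > 0 means the precise sum exceeds thresh */
        return __percpu_counter_compare(counter, thresh, batch) > 0;
}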
4255
4256 void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info)
4257 {
4258         __btrfs_btree_balance_dirty(fs_info, 1);
4259 }
4260
4261 void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info)
4262 {
4263         __btrfs_btree_balance_dirty(fs_info, 0);
4264 }
4265
4266 int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
4267                       struct btrfs_key *first_key)
4268 {
4269         return btree_read_extent_buffer_pages(buf, parent_transid,
4270                                               level, first_key);
4271 }
4272
4273 static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
4274 {
4275         /* cleanup FS via transaction */
4276         btrfs_cleanup_transaction(fs_info);
4277
4278         mutex_lock(&fs_info->cleaner_mutex);
4279         btrfs_run_delayed_iputs(fs_info);
4280         mutex_unlock(&fs_info->cleaner_mutex);
4281
4282         down_write(&fs_info->cleanup_work_sem);
4283         up_write(&fs_info->cleanup_work_sem);
4284 }
4285
4286 static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info)
4287 {
4288         struct btrfs_root *gang[8];
4289         u64 root_objectid = 0;
4290         int ret;
4291
4292         spin_lock(&fs_info->fs_roots_radix_lock);
4293         while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
4294                                              (void **)gang, root_objectid,
4295                                              ARRAY_SIZE(gang))) != 0) {
4296                 int i;
4297
4298                 for (i = 0; i < ret; i++)
4299                         gang[i] = btrfs_grab_root(gang[i]);
4300                 spin_unlock(&fs_info->fs_roots_radix_lock);
4301
4302                 for (i = 0; i < ret; i++) {
4303                         if (!gang[i])
4304                                 continue;
4305                         root_objectid = gang[i]->root_key.objectid;
4306                         btrfs_free_log(NULL, gang[i]);
4307                         btrfs_put_root(gang[i]);
4308                 }
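                /* resume the gang lookup just past the last root processed */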
4309                 root_objectid++;
4310                 spin_lock(&fs_info->fs_roots_radix_lock);
4311         }
4312         spin_unlock(&fs_info->fs_roots_radix_lock);
4313         btrfs_free_log_root_tree(NULL, fs_info);
4314 }
4315
4316 static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
4317 {
4318         struct btrfs_ordered_extent *ordered;
4319
4320         spin_lock(&root->ordered_extent_lock);
4321         /*
4322          * This will just short-circuit the ordered completion code, which
4323          * will make sure the ordered extent gets properly cleaned up.
4324          */
4325         list_for_each_entry(ordered, &root->ordered_extents,
4326                             root_extent_list)
4327                 set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
4328         spin_unlock(&root->ordered_extent_lock);
4329 }
4330
4331 static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
4332 {
4333         struct btrfs_root *root;
4334         struct list_head splice;
4335
4336         INIT_LIST_HEAD(&splice);
4337
4338         spin_lock(&fs_info->ordered_root_lock);
4339         list_splice_init(&fs_info->ordered_roots, &splice);
4340         while (!list_empty(&splice)) {
4341                 root = list_first_entry(&splice, struct btrfs_root,
4342                                         ordered_root);
4343                 list_move_tail(&root->ordered_root,
4344                                &fs_info->ordered_roots);
4345
4346                 spin_unlock(&fs_info->ordered_root_lock);
4347                 btrfs_destroy_ordered_extents(root);
4348
4349                 cond_resched();
4350                 spin_lock(&fs_info->ordered_root_lock);
4351         }
4352         spin_unlock(&fs_info->ordered_root_lock);
4353
4354         /*
4355          * We need this here because if we've been flipped read-only we won't
4356          * get a sync() from the umount, so we need to make sure any ordered
4357          * extents that haven't started writeout of their dirty pages yet
4358          * actually get run and error out properly.
4359          */
4360         btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
4361 }
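
/*
 * Illustrative sketch (hypothetical list): the splice-drain pattern used
 * above and in btrfs_destroy_all_delalloc_inodes().  Splicing the global
 * list onto a private head under the lock lets the loop drop the lock
 * for the slow per-entry work without the list mutating underneath it.
 */
static void example_drain_list(spinlock_t *lock, struct list_head *global)
{
        LIST_HEAD(splice);

        spin_lock(lock);
        list_splice_init(global, &splice);
        while (!list_empty(&splice)) {
                struct list_head *entry = splice.next;

                /* move it back so others can still find the entry */
                list_move_tail(entry, global);
                spin_unlock(lock);
                /* ... slow per-entry work, may sleep ... */
                cond_resched();
                spin_lock(lock);
        }
        spin_unlock(lock);
}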
4362
4363 static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
4364                                       struct btrfs_fs_info *fs_info)
4365 {
4366         struct rb_node *node;
4367         struct btrfs_delayed_ref_root *delayed_refs;
4368         struct btrfs_delayed_ref_node *ref;
4369         int ret = 0;
4370
4371         delayed_refs = &trans->delayed_refs;
4372
4373         spin_lock(&delayed_refs->lock);
4374         if (atomic_read(&delayed_refs->num_entries) == 0) {
4375                 spin_unlock(&delayed_refs->lock);
4376                 btrfs_debug(fs_info, "delayed_refs has NO entry");
4377                 return ret;
4378         }
4379
4380         while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) {
4381                 struct btrfs_delayed_ref_head *head;
4382                 struct rb_node *n;
4383                 bool pin_bytes = false;
4384
4385                 head = rb_entry(node, struct btrfs_delayed_ref_head,
4386                                 href_node);
4387                 if (btrfs_delayed_ref_lock(delayed_refs, head))
4388                         continue;
4389
4390                 spin_lock(&head->lock);
4391                 while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
4392                         ref = rb_entry(n, struct btrfs_delayed_ref_node,
4393                                        ref_node);
4394                         ref->in_tree = 0;
4395                         rb_erase_cached(&ref->ref_node, &head->ref_tree);
4396                         RB_CLEAR_NODE(&ref->ref_node);
4397                         if (!list_empty(&ref->add_list))
4398                                 list_del(&ref->add_list);
4399                         atomic_dec(&delayed_refs->num_entries);
4400                         btrfs_put_delayed_ref(ref);
4401                 }
4402                 if (head->must_insert_reserved)
4403                         pin_bytes = true;
4404                 btrfs_free_delayed_extent_op(head->extent_op);
4405                 btrfs_delete_ref_head(delayed_refs, head);
4406                 spin_unlock(&head->lock);
4407                 spin_unlock(&delayed_refs->lock);
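                /* drop the mutex taken by btrfs_delayed_ref_lock() above */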
4408                 mutex_unlock(&head->mutex);
4409
4410                 if (pin_bytes) {
4411                         struct btrfs_block_group *cache;
4412
4413                         cache = btrfs_lookup_block_group(fs_info, head->bytenr);
4414                         BUG_ON(!cache);
4415
4416                         spin_lock(&cache->space_info->lock);
4417                         spin_lock(&cache->lock);
4418                         cache->pinned += head->num_bytes;
4419                         btrfs_space_info_update_bytes_pinned(fs_info,
4420                                 cache->space_info, head->num_bytes);
4421                         cache->reserved -= head->num_bytes;
4422                         cache->space_info->bytes_reserved -= head->num_bytes;
4423                         spin_unlock(&cache->lock);
4424                         spin_unlock(&cache->space_info->lock);
4425                         percpu_counter_add_batch(
4426                                 &cache->space_info->total_bytes_pinned,
4427                                 head->num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
4428
4429                         btrfs_put_block_group(cache);
4430
4431                         btrfs_error_unpin_extent_range(fs_info, head->bytenr,
4432                                 head->bytenr + head->num_bytes - 1);
4433                 }
4434                 btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
4435                 btrfs_put_delayed_ref_head(head);
4436                 cond_resched();
4437                 spin_lock(&delayed_refs->lock);
4438         }
4439         btrfs_qgroup_destroy_extent_records(trans);
4440
4441         spin_unlock(&delayed_refs->lock);
4442
4443         return ret;
4444 }
4445
4446 static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
4447 {
4448         struct btrfs_inode *btrfs_inode;
4449         struct list_head splice;
4450
4451         INIT_LIST_HEAD(&splice);
4452
4453         spin_lock(&root->delalloc_lock);
4454         list_splice_init(&root->delalloc_inodes, &splice);
4455
4456         while (!list_empty(&splice)) {
4457                 struct inode *inode = NULL;
4458                 btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
4459                                                delalloc_inodes);
4460                 __btrfs_del_delalloc_inode(root, btrfs_inode);
4461                 spin_unlock(&root->delalloc_lock);
4462
4463                 /*
4464                  * Make sure we get a live inode and that it won't disappear
4465                  * in the meantime.
4466                  */
4467                 inode = igrab(&btrfs_inode->vfs_inode);
4468                 if (inode) {
4469                         invalidate_inode_pages2(inode->i_mapping);
4470                         iput(inode);
4471                 }
4472                 spin_lock(&root->delalloc_lock);
4473         }
4474         spin_unlock(&root->delalloc_lock);
4475 }
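
/*
 * Illustrative sketch (hypothetical caller): igrab() returns NULL once
 * an inode has entered eviction, so the igrab()/iput() pair above is
 * the standard way to pin an inode only if it is still live before
 * doing page-cache work on it.
 */
static void example_invalidate(struct inode *inode)
{
        inode = igrab(inode);
        if (!inode)
                return;         /* already being evicted, nothing to do */
        invalidate_inode_pages2(inode->i_mapping);
        iput(inode);            /* drop the reference taken by igrab() */
}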
4476
4477 static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
4478 {
4479         struct btrfs_root *root;
4480         struct list_head splice;
4481
4482         INIT_LIST_HEAD(&splice);
4483
4484         spin_lock(&fs_info->delalloc_root_lock);
4485         list_splice_init(&fs_info->delalloc_roots, &splice);
4486         while (!list_empty(&splice)) {
4487                 root = list_first_entry(&splice, struct btrfs_root,
4488                                          delalloc_root);
4489                 root = btrfs_grab_root(root);
4490                 BUG_ON(!root);
4491                 spin_unlock(&fs_info->delalloc_root_lock);
4492
4493                 btrfs_destroy_delalloc_inodes(root);
4494                 btrfs_put_root(root);
4495
4496                 spin_lock(&fs_info->delalloc_root_lock);
4497         }
4498         spin_unlock(&fs_info->delalloc_root_lock);
4499 }
4500
4501 static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
4502                                         struct extent_io_tree *dirty_pages,
4503                                         int mark)
4504 {
4505         int ret;
4506         struct extent_buffer *eb;
4507         u64 start = 0;
4508         u64 end;
4509
4510         while (1) {
4511                 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
4512                                             mark, NULL);
4513                 if (ret)
4514                         break;
4515
4516                 clear_extent_bits(dirty_pages, start, end, mark);
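                /* walk the dirty range one tree block (nodesize) at a time */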
4517                 while (start <= end) {
4518                         eb = find_extent_buffer(fs_info, start);
4519                         start += fs_info->nodesize;
4520                         if (!eb)
4521                                 continue;
4522                         wait_on_extent_buffer_writeback(eb);
4523
4524                         if (test_and_clear_bit(EXTENT_BUFFER_DIRTY,
4525                                                &eb->bflags))
4526                                 clear_extent_buffer_dirty(eb);
4527                         free_extent_buffer_stale(eb);
4528                 }
4529         }
4530
4531         return ret;
4532 }
4533
4534 static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
4535                                        struct extent_io_tree *unpin)
4536 {
4537         u64 start;
4538         u64 end;
4539         int ret;
4540
4541         while (1) {
4542                 struct extent_state *cached_state = NULL;
4543
4544                 /*
4545                  * btrfs_finish_extent_commit() may pick up the same range as
4546                  * ours between find_first_extent_bit() and clear_extent_dirty().
4547                  * Hence, hold the unused_bg_unpin_mutex to avoid unpinning the
4548                  * same extent range twice.
4549                  */
4550                 mutex_lock(&fs_info->unused_bg_unpin_mutex);
4551                 ret = find_first_extent_bit(unpin, 0, &start, &end,
4552                                             EXTENT_DIRTY, &cached_state);
4553                 if (ret) {
4554                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
4555                         break;
4556                 }
4557
4558                 clear_extent_dirty(unpin, start, end, &cached_state);
4559                 free_extent_state(cached_state);
4560                 btrfs_error_unpin_extent_range(fs_info, start, end);
4561                 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
4562                 cond_resched();
4563         }
4564
4565         return 0;
4566 }
4567
4568 static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache)
4569 {
4570         struct inode *inode;
4571
4572         inode = cache->io_ctl.inode;
4573         if (inode) {
4574                 invalidate_inode_pages2(inode->i_mapping);
4575                 BTRFS_I(inode)->generation = 0;
4576                 cache->io_ctl.inode = NULL;
4577                 iput(inode);
4578         }
4579         ASSERT(cache->io_ctl.pages == NULL);
4580         btrfs_put_block_group(cache);
4581 }
4582
4583 void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
4584                              struct btrfs_fs_info *fs_info)
4585 {
4586         struct btrfs_block_group *cache;
4587
4588         spin_lock(&cur_trans->dirty_bgs_lock);
4589         while (!list_empty(&cur_trans->dirty_bgs)) {
4590                 cache = list_first_entry(&cur_trans->dirty_bgs,
4591                                          struct btrfs_block_group,
4592                                          dirty_list);
4593
4594                 if (!list_empty(&cache->io_list)) {
4595                         spin_unlock(&cur_trans->dirty_bgs_lock);
4596                         list_del_init(&cache->io_list);
4597                         btrfs_cleanup_bg_io(cache);
4598                         spin_lock(&cur_trans->dirty_bgs_lock);
4599                 }
4600
4601                 list_del_init(&cache->dirty_list);
4602                 spin_lock(&cache->lock);
4603                 cache->disk_cache_state = BTRFS_DC_ERROR;
4604                 spin_unlock(&cache->lock);
4605
4606                 spin_unlock(&cur_trans->dirty_bgs_lock);
4607                 btrfs_put_block_group(cache);
4608                 btrfs_delayed_refs_rsv_release(fs_info, 1);
4609                 spin_lock(&cur_trans->dirty_bgs_lock);
4610         }
4611         spin_unlock(&cur_trans->dirty_bgs_lock);
4612
4613         /*
4614          * Refer to the definition of the io_bgs member for details on why
4615          * it's safe to use it without any locking.
4616          */
4617         while (!list_empty(&cur_trans->io_bgs)) {
4618                 cache = list_first_entry(&cur_trans->io_bgs,
4619                                          struct btrfs_block_group,
4620                                          io_list);
4621
4622                 list_del_init(&cache->io_list);
4623                 spin_lock(&cache->lock);
4624                 cache->disk_cache_state = BTRFS_DC_ERROR;
4625                 spin_unlock(&cache->lock);
4626                 btrfs_cleanup_bg_io(cache);
4627         }
4628 }
4629
4630 void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
4631                                    struct btrfs_fs_info *fs_info)
4632 {
4633         struct btrfs_device *dev, *tmp;
4634
4635         btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
4636         ASSERT(list_empty(&cur_trans->dirty_bgs));
4637         ASSERT(list_empty(&cur_trans->io_bgs));
4638
4639         list_for_each_entry_safe(dev, tmp, &cur_trans->dev_update_list,
4640                                  post_commit_list) {
4641                 list_del_init(&dev->post_commit_list);
4642         }
4643
4644         btrfs_destroy_delayed_refs(cur_trans, fs_info);
4645
4646         cur_trans->state = TRANS_STATE_COMMIT_START;
4647         wake_up(&fs_info->transaction_blocked_wait);
4648
4649         cur_trans->state = TRANS_STATE_UNBLOCKED;
4650         wake_up(&fs_info->transaction_wait);
4651
4652         btrfs_destroy_delayed_inodes(fs_info);
4653
4654         btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages,
4655                                      EXTENT_DIRTY);
4656         btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents);
4657
4658         cur_trans->state = TRANS_STATE_COMPLETED;
4659         wake_up(&cur_trans->commit_wait);
4660 }
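
/*
 * Illustrative sketch (hypothetical helper): even though no real commit
 * happens on this error path, waiters sleep on specific transaction
 * states, so the teardown above still publishes each state in order and
 * wakes the matching wait queue.
 */
static void example_advance_state(struct btrfs_transaction *t,
                                  wait_queue_head_t *wq, int state)
{
        t->state = state;       /* publish the new state first... */
        wake_up(wq);            /* ...then wake anything waiting on it */
}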
4661
4662 static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
4663 {
4664         struct btrfs_transaction *t;
4665
4666         mutex_lock(&fs_info->transaction_kthread_mutex);
4667
4668         spin_lock(&fs_info->trans_lock);
4669         while (!list_empty(&fs_info->trans_list)) {
4670                 t = list_first_entry(&fs_info->trans_list,
4671                                      struct btrfs_transaction, list);
4672                 if (t->state >= TRANS_STATE_COMMIT_START) {
4673                         refcount_inc(&t->use_count);
4674                         spin_unlock(&fs_info->trans_lock);
4675                         btrfs_wait_for_commit(fs_info, t->transid);
4676                         btrfs_put_transaction(t);
4677                         spin_lock(&fs_info->trans_lock);
4678                         continue;
4679                 }
4680                 if (t == fs_info->running_transaction) {
4681                         t->state = TRANS_STATE_COMMIT_DOING;
4682                         spin_unlock(&fs_info->trans_lock);
4683                         /*
4684                          * We wait for num_writers to drop to 0 since we don't
4685                          * currently hold an open trans handle for this transaction.
4686                          */
4687                         wait_event(t->writer_wait,
4688                                    atomic_read(&t->num_writers) == 0);
4689                 } else {
4690                         spin_unlock(&fs_info->trans_lock);
4691                 }
4692                 btrfs_cleanup_one_transaction(t, fs_info);
4693
4694                 spin_lock(&fs_info->trans_lock);
4695                 if (t == fs_info->running_transaction)
4696                         fs_info->running_transaction = NULL;
4697                 list_del_init(&t->list);
4698                 spin_unlock(&fs_info->trans_lock);
4699
4700                 btrfs_put_transaction(t);
4701                 trace_btrfs_transaction_commit(fs_info->tree_root);
4702                 spin_lock(&fs_info->trans_lock);
4703         }
4704         spin_unlock(&fs_info->trans_lock);
4705         btrfs_destroy_all_ordered_extents(fs_info);
4706         btrfs_destroy_delayed_inodes(fs_info);
4707         btrfs_assert_delayed_root_empty(fs_info);
4708         btrfs_destroy_all_delalloc_inodes(fs_info);
4709         btrfs_drop_all_logs(fs_info);
4710         mutex_unlock(&fs_info->transaction_kthread_mutex);
4711
4712         return 0;
4713 }