fs/btrfs/volumes.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (C) 2007 Oracle.  All rights reserved.
   4  */
   5
   6 #include <linux/sched.h>
   7 #include <linux/sched/mm.h>
   8 #include <linux/bio.h>
   9 #include <linux/slab.h>
  10 #include <linux/blkdev.h>
  11 #include <linux/ratelimit.h>
  12 #include <linux/kthread.h>
  13 #include <linux/raid/pq.h>
  14 #include <linux/semaphore.h>
  15 #include <linux/uuid.h>
  16 #include <linux/list_sort.h>
  17 #include <linux/namei.h>
  18 #include "misc.h"
  19 #include "ctree.h"
  20 #include "extent_map.h"
  21 #include "disk-io.h"
  22 #include "transaction.h"
  23 #include "print-tree.h"
  24 #include "volumes.h"
  25 #include "raid56.h"
  26 #include "async-thread.h"
  27 #include "check-integrity.h"
  28 #include "rcu-string.h"
  29 #include "dev-replace.h"
  30 #include "sysfs.h"
  31 #include "tree-checker.h"
  32 #include "space-info.h"
  33 #include "block-group.h"
  34 #include "discard.h"
  35 #include "zoned.h"
  36
  37 #define BTRFS_BLOCK_GROUP_STRIPE_MASK   (BTRFS_BLOCK_GROUP_RAID0 | \
  38                                          BTRFS_BLOCK_GROUP_RAID10 | \
  39                                          BTRFS_BLOCK_GROUP_RAID56_MASK)
  40
  41 const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
  42         [BTRFS_RAID_RAID10] = {
  43                 .sub_stripes    = 2,
  44                 .dev_stripes    = 1,
  45                 .devs_max       = 0,    /* 0 == as many as possible */
  46                 .devs_min       = 2,
  47                 .tolerated_failures = 1,
  48                 .devs_increment = 2,
  49                 .ncopies        = 2,
  50                 .nparity        = 0,
  51                 .raid_name      = "raid10",
  52                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID10,
  53                 .mindev_error   = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
  54         },
  55         [BTRFS_RAID_RAID1] = {
  56                 .sub_stripes    = 1,
  57                 .dev_stripes    = 1,
  58                 .devs_max       = 2,
  59                 .devs_min       = 2,
  60                 .tolerated_failures = 1,
  61                 .devs_increment = 2,
  62                 .ncopies        = 2,
  63                 .nparity        = 0,
  64                 .raid_name      = "raid1",
  65                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID1,
  66                 .mindev_error   = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
  67         },
  68         [BTRFS_RAID_RAID1C3] = {
  69                 .sub_stripes    = 1,
  70                 .dev_stripes    = 1,
  71                 .devs_max       = 3,
  72                 .devs_min       = 3,
  73                 .tolerated_failures = 2,
  74                 .devs_increment = 3,
  75                 .ncopies        = 3,
  76                 .nparity        = 0,
  77                 .raid_name      = "raid1c3",
  78                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID1C3,
  79                 .mindev_error   = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
  80         },
  81         [BTRFS_RAID_RAID1C4] = {
  82                 .sub_stripes    = 1,
  83                 .dev_stripes    = 1,
  84                 .devs_max       = 4,
  85                 .devs_min       = 4,
  86                 .tolerated_failures = 3,
  87                 .devs_increment = 4,
  88                 .ncopies        = 4,
  89                 .nparity        = 0,
  90                 .raid_name      = "raid1c4",
  91                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID1C4,
  92                 .mindev_error   = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
  93         },
  94         [BTRFS_RAID_DUP] = {
  95                 .sub_stripes    = 1,
  96                 .dev_stripes    = 2,
  97                 .devs_max       = 1,
  98                 .devs_min       = 1,
  99                 .tolerated_failures = 0,
 100                 .devs_increment = 1,
 101                 .ncopies        = 2,
 102                 .nparity        = 0,
 103                 .raid_name      = "dup",
 104                 .bg_flag        = BTRFS_BLOCK_GROUP_DUP,
 105                 .mindev_error   = 0,
 106         },
 107         [BTRFS_RAID_RAID0] = {
 108                 .sub_stripes    = 1,
 109                 .dev_stripes    = 1,
 110                 .devs_max       = 0,
 111                 .devs_min       = 1,
 112                 .tolerated_failures = 0,
 113                 .devs_increment = 1,
 114                 .ncopies        = 1,
 115                 .nparity        = 0,
 116                 .raid_name      = "raid0",
 117                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID0,
 118                 .mindev_error   = 0,
 119         },
 120         [BTRFS_RAID_SINGLE] = {
 121                 .sub_stripes    = 1,
 122                 .dev_stripes    = 1,
 123                 .devs_max       = 1,
 124                 .devs_min       = 1,
 125                 .tolerated_failures = 0,
 126                 .devs_increment = 1,
 127                 .ncopies        = 1,
 128                 .nparity        = 0,
 129                 .raid_name      = "single",
 130                 .bg_flag        = 0,
 131                 .mindev_error   = 0,
 132         },
 133         [BTRFS_RAID_RAID5] = {
 134                 .sub_stripes    = 1,
 135                 .dev_stripes    = 1,
 136                 .devs_max       = 0,
 137                 .devs_min       = 2,
 138                 .tolerated_failures = 1,
 139                 .devs_increment = 1,
 140                 .ncopies        = 1,
 141                 .nparity        = 1,
 142                 .raid_name      = "raid5",
 143                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID5,
 144                 .mindev_error   = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
 145         },
 146         [BTRFS_RAID_RAID6] = {
 147                 .sub_stripes    = 1,
 148                 .dev_stripes    = 1,
 149                 .devs_max       = 0,
 150                 .devs_min       = 3,
 151                 .tolerated_failures = 2,
 152                 .devs_increment = 1,
 153                 .ncopies        = 1,
 154                 .nparity        = 2,
 155                 .raid_name      = "raid6",
 156                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID6,
 157                 .mindev_error   = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
 158         },
 159 };
 160
 161 /*
 162  * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
 163  * can be used as index to access btrfs_raid_array[].
 164  */
 165 enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
 166 {
 167         const u64 profile = (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK);
 168
 169         if (!profile)
 170                 return BTRFS_RAID_SINGLE;
 171
 172         return BTRFS_BG_FLAG_TO_INDEX(profile);
 173 }
 174
 175 const char *btrfs_bg_type_to_raid_name(u64 flags)
 176 {
 177         const int index = btrfs_bg_flags_to_raid_index(flags);
 178
 179         if (index >= BTRFS_NR_RAID_TYPES)
 180                 return NULL;
 181
 182         return btrfs_raid_array[index].raid_name;
 183 }
 184
 185 /*
 186  * Fill @buf with textual description of @bg_flags, no more than @size_buf
 187  * bytes including terminating null byte.
 188  */
 189 void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
 190 {
 191         int i;
 192         int ret;
 193         char *bp = buf;
 194         u64 flags = bg_flags;
 195         u32 size_bp = size_buf;
 196
 197         if (!flags) {
 198                 strcpy(bp, "NONE");
 199                 return;
 200         }
 201
 202 #define DESCRIBE_FLAG(flag, desc)                                               \
 203         do {                                                            \
 204                 if (flags & (flag)) {                                   \
 205                         ret = snprintf(bp, size_bp, "%s|", (desc));     \
 206                         if (ret < 0 || ret >= size_bp)                  \
 207                                 goto out_overflow;                      \
 208                         size_bp -= ret;                                 \
 209                         bp += ret;                                      \
 210                         flags &= ~(flag);                               \
 211                 }                                                       \
 212         } while (0)
 213
 214         DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
 215         DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
 216         DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
 217
 218         DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
 219         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
 220                 DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
 221                               btrfs_raid_array[i].raid_name);
 222 #undef DESCRIBE_FLAG
 223
 224         if (flags) {
 225                 ret = snprintf(bp, size_bp, "0x%llx|", flags);
 226                 size_bp -= ret;
 227         }
 228
 229         if (size_bp < size_buf)
 230                 buf[size_buf - size_bp - 1] = '\0'; /* remove last | */
 231
 232         /*
 233          * The text is trimmed, it's up to the caller to provide sufficiently
 234          * large buffer
 235          */
 236 out_overflow:;
 237 }
 238
 239 static int init_first_rw_device(struct btrfs_trans_handle *trans);
 240 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
 241 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
 242 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
 243 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
 244                              enum btrfs_map_op op,
 245                              u64 logical, u64 *length,
 246                              struct btrfs_io_context **bioc_ret,
 247                              int mirror_num, int need_raid_map);
 248
 249 /*
 250  * Device locking
 251  * ==============
 252  *
 253  * There are several mutexes that protect manipulation of devices and low-level
 254  * structures like chunks but not block groups, extents or files
 255  *
 256  * uuid_mutex (global lock)
 257  * ------------------------
 258  * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 259  * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 260  * device) or requested by the device= mount option
 261  *
 262  * the mutex can be very coarse and can cover long-running operations
 263  *
 264  * protects: updates to fs_devices counters like missing devices, rw devices,
 265  * seeding, structure cloning, opening/closing devices at mount/umount time
 266  *
 267  * global::fs_devs - add, remove, updates to the global list
 268  *
 * does not protect: manipulation of the fs_devices::devices list in general,
 * but in mount context it could be used to exclude list modifications by e.g.
 * the scan ioctl
 272  *
 273  * btrfs_device::name - renames (write side), read is RCU
 274  *
 275  * fs_devices::device_list_mutex (per-fs, with RCU)
 276  * ------------------------------------------------
 277  * protects updates to fs_devices::devices, ie. adding and deleting
 278  *
 279  * simple list traversal with read-only actions can be done with RCU protection
 280  *
 281  * may be used to exclude some operations from running concurrently without any
 282  * modifications to the list (see write_all_supers)
 283  *
 284  * Is not required at mount and close times, because our device list is
 285  * protected by the uuid_mutex at that point.
 286  *
 287  * balance_mutex
 288  * -------------
 289  * protects balance structures (status, state) and context accessed from
 290  * several places (internally, ioctl)
 291  *
 292  * chunk_mutex
 293  * -----------
 294  * protects chunks, adding or removing during allocation, trim or when a new
 295  * device is added/removed. Additionally it also protects post_commit_list of
 296  * individual devices, since they can be added to the transaction's
 297  * post_commit_list only with chunk_mutex held.
 298  *
 299  * cleaner_mutex
 300  * -------------
 301  * a big lock that is held by the cleaner thread and prevents running subvolume
 302  * cleaning together with relocation or delayed iputs
 303  *
 304  *
 305  * Lock nesting
 306  * ============
 307  *
 308  * uuid_mutex
 309  *   device_list_mutex
 310  *     chunk_mutex
 311  *   balance_mutex
 312  *
 313  *
 314  * Exclusive operations
 315  * ====================
 316  *
 317  * Maintains the exclusivity of the following operations that apply to the
 318  * whole filesystem and cannot run in parallel.
 319  *
 320  * - Balance (*)
 321  * - Device add
 322  * - Device remove
 323  * - Device replace (*)
 324  * - Resize
 325  *
 326  * The device operations (as above) can be in one of the following states:
 327  *
 328  * - Running state
 329  * - Paused state
 330  * - Completed state
 331  *
 332  * Only device operations marked with (*) can go into the Paused state for the
 333  * following reasons:
 334  *
 335  * - ioctl (only Balance can be Paused through ioctl)
 336  * - filesystem remounted as read-only
 337  * - filesystem unmounted and mounted as read-only
 338  * - system power-cycle and filesystem mounted as read-only
 339  * - filesystem or device errors leading to forced read-only
 340  *
 341  * The status of exclusive operation is set and cleared atomically.
 342  * During the course of Paused state, fs_info::exclusive_operation remains set.
 343  * A device operation in Paused or Running state can be canceled or resumed
 344  * either by ioctl (Balance only) or when remounted as read-write.
 345  * The exclusive status is cleared when the device operation is canceled or
 346  * completed.
 347  */
 348
 349 DEFINE_MUTEX(uuid_mutex);
 350 static LIST_HEAD(fs_uuids);
 351 struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
 352 {
 353         return &fs_uuids;
 354 }
 355
 356 /*
 357  * alloc_fs_devices - allocate struct btrfs_fs_devices
 358  * @fsid:               if not NULL, copy the UUID to fs_devices::fsid
 359  * @metadata_fsid:      if not NULL, copy the UUID to fs_devices::metadata_fsid
 360  *
 361  * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 362  * The returned struct is not linked onto any lists and can be destroyed with
 363  * kfree() right away.
 364  */
 365 static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
 366                                                  const u8 *metadata_fsid)
 367 {
 368         struct btrfs_fs_devices *fs_devs;
 369
 370         fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
 371         if (!fs_devs)
 372                 return ERR_PTR(-ENOMEM);
 373
 374         mutex_init(&fs_devs->device_list_mutex);
 375
 376         INIT_LIST_HEAD(&fs_devs->devices);
 377         INIT_LIST_HEAD(&fs_devs->alloc_list);
 378         INIT_LIST_HEAD(&fs_devs->fs_list);
 379         INIT_LIST_HEAD(&fs_devs->seed_list);
 380         if (fsid)
 381                 memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
 382
 383         if (metadata_fsid)
 384                 memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
 385         else if (fsid)
 386                 memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);
 387
 388         return fs_devs;
 389 }
 390
 391 void btrfs_free_device(struct btrfs_device *device)
 392 {
 393         WARN_ON(!list_empty(&device->post_commit_list));
 394         rcu_string_free(device->name);
 395         extent_io_tree_release(&device->alloc_state);
 396         btrfs_destroy_dev_zone_info(device);
 397         kfree(device);
 398 }
 399
 400 static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
 401 {
 402         struct btrfs_device *device;
 403         WARN_ON(fs_devices->opened);
 404         while (!list_empty(&fs_devices->devices)) {
 405                 device = list_entry(fs_devices->devices.next,
 406                                     struct btrfs_device, dev_list);
 407                 list_del(&device->dev_list);
 408                 btrfs_free_device(device);
 409         }
 410         kfree(fs_devices);
 411 }
 412
 413 void __exit btrfs_cleanup_fs_uuids(void)
 414 {
 415         struct btrfs_fs_devices *fs_devices;
 416
 417         while (!list_empty(&fs_uuids)) {
 418                 fs_devices = list_entry(fs_uuids.next,
 419                                         struct btrfs_fs_devices, fs_list);
 420                 list_del(&fs_devices->fs_list);
 421                 free_fs_devices(fs_devices);
 422         }
 423 }
 424
 425 static noinline struct btrfs_fs_devices *find_fsid(
 426                 const u8 *fsid, const u8 *metadata_fsid)
 427 {
 428         struct btrfs_fs_devices *fs_devices;
 429
 430         ASSERT(fsid);
 431
 432         /* Handle non-split brain cases */
 433         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
 434                 if (metadata_fsid) {
 435                         if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
 436                             && memcmp(metadata_fsid, fs_devices->metadata_uuid,
 437                                       BTRFS_FSID_SIZE) == 0)
 438                                 return fs_devices;
 439                 } else {
 440                         if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
 441                                 return fs_devices;
 442                 }
 443         }
 444         return NULL;
 445 }
 446
 447 static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
 448                                 struct btrfs_super_block *disk_super)
 449 {
 450
 451         struct btrfs_fs_devices *fs_devices;
 452
 453         /*
 454          * Handle scanned device having completed its fsid change but
 455          * belonging to a fs_devices that was created by first scanning
 456          * a device which didn't have its fsid/metadata_uuid changed
 457          * at all and the CHANGING_FSID_V2 flag set.
 458          */
 459         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
 460                 if (fs_devices->fsid_change &&
 461                     memcmp(disk_super->metadata_uuid, fs_devices->fsid,
 462                            BTRFS_FSID_SIZE) == 0 &&
 463                     memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
 464                            BTRFS_FSID_SIZE) == 0) {
 465                         return fs_devices;
 466                 }
 467         }
 468         /*
 469          * Handle scanned device having completed its fsid change but
 470          * belonging to a fs_devices that was created by a device that
 471          * has an outdated pair of fsid/metadata_uuid and
 472          * CHANGING_FSID_V2 flag set.
 473          */
 474         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
 475                 if (fs_devices->fsid_change &&
 476                     memcmp(fs_devices->metadata_uuid,
 477                            fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
 478                     memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
 479                            BTRFS_FSID_SIZE) == 0) {
 480                         return fs_devices;
 481                 }
 482         }
 483
 484         return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
 485 }
 486
 487
 488 static int
 489 btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
 490                       int flush, struct block_device **bdev,
 491                       struct btrfs_super_block **disk_super)
 492 {
 493         int ret;
 494
 495         *bdev = blkdev_get_by_path(device_path, flags, holder);
 496
 497         if (IS_ERR(*bdev)) {
 498                 ret = PTR_ERR(*bdev);
 499                 goto error;
 500         }
 501
 502         if (flush)
 503                 sync_blockdev(*bdev);
 504         ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
 505         if (ret) {
 506                 blkdev_put(*bdev, flags);
 507                 goto error;
 508         }
 509         invalidate_bdev(*bdev);
 510         *disk_super = btrfs_read_dev_super(*bdev);
 511         if (IS_ERR(*disk_super)) {
 512                 ret = PTR_ERR(*disk_super);
 513                 blkdev_put(*bdev, flags);
 514                 goto error;
 515         }
 516
 517         return 0;
 518
 519 error:
 520         *bdev = NULL;
 521         return ret;
 522 }
 523
 524 /**
 525  *  Search and remove all stale devices (which are not mounted).
 526  *  When both inputs are NULL, it will search and release all stale devices.
 527  *
 528  *  @devt:      Optional. When provided will it release all unmounted devices
 529  *              matching this devt only.
 530  *  @skip_device:  Optional. Will skip this device when searching for the stale
 531  *              devices.
 532  *
 533  *  Return:     0 for success or if @devt is 0.
 534  *              -EBUSY if @devt is a mounted device.
 535  *              -ENOENT if @devt does not match any device in the list.
 536  */
 537 static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device)
 538 {
 539         struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
 540         struct btrfs_device *device, *tmp_device;
 541         int ret = 0;
 542
 543         lockdep_assert_held(&uuid_mutex);
 544
 545         if (devt)
 546                 ret = -ENOENT;
 547
 548         list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
 549
 550                 mutex_lock(&fs_devices->device_list_mutex);
 551                 list_for_each_entry_safe(device, tmp_device,
 552                                          &fs_devices->devices, dev_list) {
 553                         if (skip_device && skip_device == device)
 554                                 continue;
 555                         if (devt && devt != device->devt)
 556                                 continue;
 557                         if (fs_devices->opened) {
 558                                 /* for an already deleted device return 0 */
 559                                 if (devt && ret != 0)
 560                                         ret = -EBUSY;
 561                                 break;
 562                         }
 563
 564                         /* delete the stale device */
 565                         fs_devices->num_devices--;
 566                         list_del(&device->dev_list);
 567                         btrfs_free_device(device);
 568
 569                         ret = 0;
 570                 }
 571                 mutex_unlock(&fs_devices->device_list_mutex);
 572
 573                 if (fs_devices->num_devices == 0) {
 574                         btrfs_sysfs_remove_fsid(fs_devices);
 575                         list_del(&fs_devices->fs_list);
 576                         free_fs_devices(fs_devices);
 577                 }
 578         }
 579
 580         return ret;
 581 }
 582
 583 /*
 584  * This is only used on mount, and we are protected from competing things
 585  * messing with our fs_devices by the uuid_mutex, thus we do not need the
 586  * fs_devices->device_list_mutex here.
 587  */
 588 static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
 589                         struct btrfs_device *device, fmode_t flags,
 590                         void *holder)
 591 {
 592         struct block_device *bdev;
 593         struct btrfs_super_block *disk_super;
 594         u64 devid;
 595         int ret;
 596
 597         if (device->bdev)
 598                 return -EINVAL;
 599         if (!device->name)
 600                 return -EINVAL;
 601
 602         ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
 603                                     &bdev, &disk_super);
 604         if (ret)
 605                 return ret;
 606
 607         devid = btrfs_stack_device_id(&disk_super->dev_item);
 608         if (devid != device->devid)
 609                 goto error_free_page;
 610
 611         if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
 612                 goto error_free_page;
 613
 614         device->generation = btrfs_super_generation(disk_super);
 615
 616         if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
 617                 if (btrfs_super_incompat_flags(disk_super) &
 618                     BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
 619                         pr_err(
 620                 "BTRFS: Invalid seeding and uuid-changed device detected\n");
 621                         goto error_free_page;
 622                 }
 623
 624                 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
 625                 fs_devices->seeding = true;
 626         } else {
 627                 if (bdev_read_only(bdev))
 628                         clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
 629                 else
 630                         set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
 631         }
 632
 633         if (!bdev_nonrot(bdev))
 634                 fs_devices->rotating = true;
 635
 636         device->bdev = bdev;
 637         clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
 638         device->mode = flags;
 639
 640         fs_devices->open_devices++;
 641         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
 642             device->devid != BTRFS_DEV_REPLACE_DEVID) {
 643                 fs_devices->rw_devices++;
 644                 list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
 645         }
 646         btrfs_release_disk_super(disk_super);
 647
 648         return 0;
 649
 650 error_free_page:
 651         btrfs_release_disk_super(disk_super);
 652         blkdev_put(bdev, flags);
 653
 654         return -EINVAL;
 655 }
 656
 657 /*
 658  * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
 659  * being created with a disk that has already completed its fsid change. Such
 660  * disk can belong to an fs which has its FSID changed or to one which doesn't.
 661  * Handle both cases here.
 662  */
 663 static struct btrfs_fs_devices *find_fsid_inprogress(
 664                                         struct btrfs_super_block *disk_super)
 665 {
 666         struct btrfs_fs_devices *fs_devices;
 667
 668         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
 669                 if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
 670                            BTRFS_FSID_SIZE) != 0 &&
 671                     memcmp(fs_devices->metadata_uuid, disk_super->fsid,
 672                            BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
 673                         return fs_devices;
 674                 }
 675         }
 676
 677         return find_fsid(disk_super->fsid, NULL);
 678 }
 679
 680
 681 static struct btrfs_fs_devices *find_fsid_changed(
 682                                         struct btrfs_super_block *disk_super)
 683 {
 684         struct btrfs_fs_devices *fs_devices;
 685
 686         /*
 687          * Handles the case where scanned device is part of an fs that had
 688          * multiple successful changes of FSID but currently device didn't
 689          * observe it. Meaning our fsid will be different than theirs. We need
 690          * to handle two subcases :
 691          *  1 - The fs still continues to have different METADATA/FSID uuids.
 692          *  2 - The fs is switched back to its original FSID (METADATA/FSID
 693          *  are equal).
 694          */
 695         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
 696                 /* Changed UUIDs */
 697                 if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
 698                            BTRFS_FSID_SIZE) != 0 &&
 699                     memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
 700                            BTRFS_FSID_SIZE) == 0 &&
 701                     memcmp(fs_devices->fsid, disk_super->fsid,
 702                            BTRFS_FSID_SIZE) != 0)
 703                         return fs_devices;
 704
 705                 /* Unchanged UUIDs */
 706                 if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
 707                            BTRFS_FSID_SIZE) == 0 &&
 708                     memcmp(fs_devices->fsid, disk_super->metadata_uuid,
 709                            BTRFS_FSID_SIZE) == 0)
 710                         return fs_devices;
 711         }
 712
 713         return NULL;
 714 }
 715
 716 static struct btrfs_fs_devices *find_fsid_reverted_metadata(
 717                                 struct btrfs_super_block *disk_super)
 718 {
 719         struct btrfs_fs_devices *fs_devices;
 720
 721         /*
 722          * Handle the case where the scanned device is part of an fs whose last
 723          * metadata UUID change reverted it to the original FSID. At the same
 724          * time * fs_devices was first created by another constitutent device
 725          * which didn't fully observe the operation. This results in an
 726          * btrfs_fs_devices created with metadata/fsid different AND
 727          * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
 728          * fs_devices equal to the FSID of the disk.
 729          */
 730         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
 731                 if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
 732                            BTRFS_FSID_SIZE) != 0 &&
 733                     memcmp(fs_devices->metadata_uuid, disk_super->fsid,
 734                            BTRFS_FSID_SIZE) == 0 &&
 735                     fs_devices->fsid_change)
 736                         return fs_devices;
 737         }
 738
 739         return NULL;
 740 }
 741 /*
 742  * Add new device to list of registered devices
 743  *
      * @path:             path of the scanned block device; it is resolved to a
      *                    dev_t with lookup_bdev() and recorded as the device name
      * @disk_super:       super block that was read from that device
      * @new_device_added: set to true only when a new btrfs_device is allocated
      *                    and linked into the fs_devices; never cleared here
      *
      * fs_devices->device_list_mutex is taken internally and released before
      * every return.
      *
 744  * Returns:
 745  * device pointer which was just added or updated when successful
 746  * error pointer when failed
 747  */
 748 static noinline struct btrfs_device *device_list_add(const char *path,
 749                            struct btrfs_super_block *disk_super,
 750                            bool *new_device_added)
 751 {
 752         struct btrfs_device *device;
 753         struct btrfs_fs_devices *fs_devices = NULL;
 754         struct rcu_string *name;
 755         u64 found_transid = btrfs_super_generation(disk_super);
 756         u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
 757         dev_t path_devt;
 758         int error;
 759         bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
 760                 BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
 761         bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
 762                                         BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
 763
             /*
              * Resolve @path to its dev_t up front; it is stored in (and, for
              * an already known device, compared against) device->devt below.
              */
 764         error = lookup_bdev(path, &path_devt);
 765         if (error)
 766                 return ERR_PTR(error);
 767
             /*
              * Choose the fs_devices lookup helper from the two super block
              * properties read above: the CHANGING_FSID_V2 flag and the
              * METADATA_UUID incompat bit.
              */
 768         if (fsid_change_in_progress) {
 769                 if (!has_metadata_uuid)
 770                         fs_devices = find_fsid_inprogress(disk_super);
 771                 else
 772                         fs_devices = find_fsid_changed(disk_super);
 773         } else if (has_metadata_uuid) {
 774                 fs_devices = find_fsid_with_metadata_uuid(disk_super);
 775         } else {
 776                 fs_devices = find_fsid_reverted_metadata(disk_super);
 777                 if (!fs_devices)
 778                         fs_devices = find_fsid(disk_super->fsid, NULL);
 779         }
 780
 781
             /*
              * Either create and register a new fs_devices (which cannot hold
              * the device yet), or look the device up by devid/uuid in the one
              * that was found. Both branches leave device_list_mutex held.
              */
 782         if (!fs_devices) {
 783                 if (has_metadata_uuid)
 784                         fs_devices = alloc_fs_devices(disk_super->fsid,
 785                                                       disk_super->metadata_uuid);
 786                 else
 787                         fs_devices = alloc_fs_devices(disk_super->fsid, NULL);
 788
 789                 if (IS_ERR(fs_devices))
 790                         return ERR_CAST(fs_devices);
 791
 792                 fs_devices->fsid_change = fsid_change_in_progress;
 793
 794                 mutex_lock(&fs_devices->device_list_mutex);
 795                 list_add(&fs_devices->fs_list, &fs_uuids);
 796
 797                 device = NULL;
 798         } else {
 799                 struct btrfs_dev_lookup_args args = {
 800                         .devid = devid,
 801                         .uuid = disk_super->dev_item.uuid,
 802                 };
 803
 804                 mutex_lock(&fs_devices->device_list_mutex);
 805                 device = btrfs_find_device(fs_devices, &args);
 806
 807                 /*
 808                  * If this disk has been pulled into an fs devices created by
 809                  * a device which had the CHANGING_FSID_V2 flag then replace the
 810                  * metadata_uuid/fsid values of the fs_devices.
 811                  */
 812                 if (fs_devices->fsid_change &&
 813                     found_transid > fs_devices->latest_generation) {
 814                         memcpy(fs_devices->fsid, disk_super->fsid,
 815                                         BTRFS_FSID_SIZE);
 816
 817                         if (has_metadata_uuid)
 818                                 memcpy(fs_devices->metadata_uuid,
 819                                        disk_super->metadata_uuid,
 820                                        BTRFS_FSID_SIZE);
 821                         else
 822                                 memcpy(fs_devices->metadata_uuid,
 823                                        disk_super->fsid, BTRFS_FSID_SIZE);
 824
 825                         fs_devices->fsid_change = false;
 826                 }
 827         }
 828
             /*
              * Unknown device: allocate and link it, unless the fs_devices is
              * already opened, in which case the scan is refused with -EBUSY.
              */
 829         if (!device) {
 830                 if (fs_devices->opened) {
 831                         mutex_unlock(&fs_devices->device_list_mutex);
 832                         return ERR_PTR(-EBUSY);
 833                 }
 834
 835                 device = btrfs_alloc_device(NULL, &devid,
 836                                             disk_super->dev_item.uuid);
 837                 if (IS_ERR(device)) {
 838                         mutex_unlock(&fs_devices->device_list_mutex);
 839                         /* we can safely leave the fs_devices entry around */
 840                         return device;
 841                 }
 842
 843                 name = rcu_string_strdup(path, GFP_NOFS);
 844                 if (!name) {
 845                         btrfs_free_device(device);
 846                         mutex_unlock(&fs_devices->device_list_mutex);
 847                         return ERR_PTR(-ENOMEM);
 848                 }
 849                 rcu_assign_pointer(device->name, name);
 850                 device->devt = path_devt;
 851
 852                 list_add_rcu(&device->dev_list, &fs_devices->devices);
 853                 fs_devices->num_devices++;
 854
 855                 device->fs_devices = fs_devices;
 856                 *new_device_added = true;
 857
 858                 if (disk_super->label[0])
 859                         pr_info(
 860         "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
 861                                 disk_super->label, devid, found_transid, path,
 862                                 current->comm, task_pid_nr(current));
 863                 else
 864                         pr_info(
 865         "BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
 866                                 disk_super->fsid, devid, found_transid, path,
 867                                 current->comm, task_pid_nr(current));
 868
 869         } else if (!device->name || strcmp(device->name->str, path)) {
 870                 /*
 871                  * When FS is already mounted.
 872                  * 1. If you are here and if the device->name is NULL that
 873                  *    means this device was missing at time of FS mount.
 874                  * 2. If you are here and if the device->name is different
 875                  *    from 'path' that means either
 876                  *      a. The same device disappeared and reappeared with
 877                  *         different name. or
 878                  *      b. The missing-disk-which-was-replaced, has
 879                  *         reappeared now.
 880                  *
 881                  * We must allow 1 and 2a above. But 2b would be spurious
 882                  * and unintentional.
 883                  *
 884                  * Further in case of 1 and 2a above, the disk at 'path'
 885                  * would have missed some transaction when it was away and
 886                  * in case of 2a the stale bdev has to be updated as well.
 887                  * 2b must not be allowed at all time.
 888                  */
 889
 890                 /*
 891                  * For now, we do allow update to btrfs_fs_device through the
 892                  * btrfs dev scan cli after FS has been mounted.  We're still
 893                  * tracking a problem where systems fail mount by subvolume id
 894                  * when we reject replacement on a mounted FS.
 895                  */
 896                 if (!fs_devices->opened && found_transid < device->generation) {
 897                         /*
 898                          * That is if the FS is _not_ mounted and if you
 899                          * are here, that means there is more than one
 900                          * disk with same uuid and devid. We keep the one
 901                          * with larger generation number or the last-in if
 902                          * generation are equal.
 903                          */
 904                         mutex_unlock(&fs_devices->device_list_mutex);
 905                         return ERR_PTR(-EEXIST);
 906                 }
 907
 908                 /*
 909                  * We are going to replace the device path for a given devid,
 910                  * make sure it's the same device if the device is mounted
 911                  *
 912                  * NOTE: the device->fs_info may not be reliable here so pass
 913                  * in a NULL to message helpers instead. This avoids a possible
 914                  * use-after-free when the fs_info and fs_info->sb are already
 915                  * torn down.
 916                  */
 917                 if (device->bdev) {
 918                         if (device->devt != path_devt) {
 919                                 mutex_unlock(&fs_devices->device_list_mutex);
 920                                 btrfs_warn_in_rcu(NULL,
 921         "duplicate device %s devid %llu generation %llu scanned by %s (%d)",
 922                                                   path, devid, found_transid,
 923                                                   current->comm,
 924                                                   task_pid_nr(current));
 925                                 return ERR_PTR(-EEXIST);
 926                         }
 927                         btrfs_info_in_rcu(NULL,
 928         "devid %llu device path %s changed to %s scanned by %s (%d)",
 929                                           devid, rcu_str_deref(device->name),
 930                                           path, current->comm,
 931                                           task_pid_nr(current));
 932                 }
 933
 934                 name = rcu_string_strdup(path, GFP_NOFS);
 935                 if (!name) {
 936                         mutex_unlock(&fs_devices->device_list_mutex);
 937                         return ERR_PTR(-ENOMEM);
 938                 }
                     /*
                      * Swap in the new name only after the copy succeeded, then
                      * drop the missing state (and its counter) if it was set.
                      */
 939                 rcu_string_free(device->name);
 940                 rcu_assign_pointer(device->name, name);
 941                 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
 942                         fs_devices->missing_devices--;
 943                         clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
 944                 }
 945                 device->devt = path_devt;
 946         }
 947
 948         /*
 949          * Unmount does not free the btrfs_device struct but would zero
 950          * generation along with most of the other members. So just update
 951          * it back. We need it to pick the disk with largest generation
 952          * (as above).
 953          */
 954         if (!fs_devices->opened) {
 955                 device->generation = found_transid;
 956                 fs_devices->latest_generation = max_t(u64, found_transid,
 957                                                 fs_devices->latest_generation);
 958         }
 959
             /* Refreshed from the super block on every successful scan. */
 960         fs_devices->total_devices = btrfs_super_num_devices(disk_super);
 961
 962         mutex_unlock(&fs_devices->device_list_mutex);
 963         return device;
 964 }
 965
 966 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
 967 {
 968         struct btrfs_fs_devices *fs_devices;
 969         struct btrfs_device *device;
 970         struct btrfs_device *orig_dev;
 971         int ret = 0;
 972
 973         lockdep_assert_held(&uuid_mutex);
 974
 975         fs_devices = alloc_fs_devices(orig->fsid, NULL);
 976         if (IS_ERR(fs_devices))
 977                 return fs_devices;
 978
 979         fs_devices->total_devices = orig->total_devices;
 980
 981         list_for_each_entry(orig_dev, &orig->devices, dev_list) {
 982                 struct rcu_string *name;
 983
 984                 device = btrfs_alloc_device(NULL, &orig_dev->devid,
 985                                             orig_dev->uuid);
 986                 if (IS_ERR(device)) {
 987                         ret = PTR_ERR(device);
 988                         goto error;
 989                 }
 990
 991                 /*
 992                  * This is ok to do without rcu read locked because we hold the
 993                  * uuid mutex so nothing we touch in here is going to disappear.
 994                  */
 995                 if (orig_dev->name) {
 996                         name = rcu_string_strdup(orig_dev->name->str,
 997                                         GFP_KERNEL);
 998                         if (!name) {
 999                                 btrfs_free_device(device);
1000                                 ret = -ENOMEM;
1001                                 goto error;
1002                         }
1003                         rcu_assign_pointer(device->name, name);
1004                 }
1005
1006                 list_add(&device->dev_list, &fs_devices->devices);
1007                 device->fs_devices = fs_devices;
1008                 fs_devices->num_devices++;
1009         }
1010         return fs_devices;
1011 error:
1012         free_fs_devices(fs_devices);
1013         return ERR_PTR(ret);
1014 }
1015
1016 static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
1017                                       struct btrfs_device **latest_dev)
1018 {
1019         struct btrfs_device *device, *next;
1020
1021         /* This is the initialized path, it is safe to release the devices. */
1022         list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
1023                 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
1024                         if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
1025                                       &device->dev_state) &&
1026                             !test_bit(BTRFS_DEV_STATE_MISSING,
1027                                       &device->dev_state) &&
1028                             (!*latest_dev ||
1029                              device->generation > (*latest_dev)->generation)) {
1030                                 *latest_dev = device;
1031                         }
1032                         continue;
1033                 }
1034
1035                 /*
1036                  * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID,
1037                  * in btrfs_init_dev_replace() so just continue.
1038                  */
1039                 if (device->devid == BTRFS_DEV_REPLACE_DEVID)
1040                         continue;
1041
1042                 if (device->bdev) {
1043                         blkdev_put(device->bdev, device->mode);
1044                         device->bdev = NULL;
1045                         fs_devices->open_devices--;
1046                 }
1047                 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1048                         list_del_init(&device->dev_alloc_list);
1049                         clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
1050                         fs_devices->rw_devices--;
1051                 }
1052                 list_del_init(&device->dev_list);
1053                 fs_devices->num_devices--;
1054                 btrfs_free_device(device);
1055         }
1056
1057 }
1058
1059 /*
1060  * After we have read the system tree and know devids belonging to this
1061  * filesystem, remove the device which does not belong there.
1062  */
1063 void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
1064 {
1065         struct btrfs_device *latest_dev = NULL;
1066         struct btrfs_fs_devices *seed_dev;
1067
1068         mutex_lock(&uuid_mutex);
1069         __btrfs_free_extra_devids(fs_devices, &latest_dev);
1070
1071         list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
1072                 __btrfs_free_extra_devids(seed_dev, &latest_dev);
1073
1074         fs_devices->latest_dev = latest_dev;
1075
1076         mutex_unlock(&uuid_mutex);
1077 }
1078
1079 static void btrfs_close_bdev(struct btrfs_device *device)
1080 {
1081         if (!device->bdev)
1082                 return;
1083
1084         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1085                 sync_blockdev(device->bdev);
1086                 invalidate_bdev(device->bdev);
1087         }
1088
1089         blkdev_put(device->bdev, device->mode);
1090 }
1091
1092 static void btrfs_close_one_device(struct btrfs_device *device)
1093 {
1094         struct btrfs_fs_devices *fs_devices = device->fs_devices;
1095
1096         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
1097             device->devid != BTRFS_DEV_REPLACE_DEVID) {
1098                 list_del_init(&device->dev_alloc_list);
1099                 fs_devices->rw_devices--;
1100         }
1101
1102         if (device->devid == BTRFS_DEV_REPLACE_DEVID)
1103                 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
1104
1105         if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
1106                 clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
1107                 fs_devices->missing_devices--;
1108         }
1109
1110         btrfs_close_bdev(device);
1111         if (device->bdev) {
1112                 fs_devices->open_devices--;
1113                 device->bdev = NULL;
1114         }
1115         clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
1116         btrfs_destroy_dev_zone_info(device);
1117
1118         device->fs_info = NULL;
1119         atomic_set(&device->dev_stats_ccnt, 0);
1120         extent_io_tree_release(&device->alloc_state);
1121
1122         /*
1123          * Reset the flush error record. We might have a transient flush error
1124          * in this mount, and if so we aborted the current transaction and set
1125          * the fs to an error state, guaranteeing no super blocks can be further
1126          * committed. However that error might be transient and if we unmount the
1127          * filesystem and mount it again, we should allow the mount to succeed
1128          * (btrfs_check_rw_degradable() should not fail) - if after mounting the
1129          * filesystem again we still get flush errors, then we will again abort
1130          * any transaction and set the error state, guaranteeing no commits of
1131          * unsafe super blocks.
1132          */
1133         device->last_flush_error = 0;
1134
1135         /* Verify the device is back in a pristine state  */
1136         ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
1137         ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
1138         ASSERT(list_empty(&device->dev_alloc_list));
1139         ASSERT(list_empty(&device->post_commit_list));
1140 }
1141
1142 static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
1143 {
1144         struct btrfs_device *device, *tmp;
1145
1146         lockdep_assert_held(&uuid_mutex);
1147
1148         if (--fs_devices->opened > 0)
1149                 return;
1150
1151         list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
1152                 btrfs_close_one_device(device);
1153
1154         WARN_ON(fs_devices->open_devices);
1155         WARN_ON(fs_devices->rw_devices);
1156         fs_devices->opened = 0;
1157         fs_devices->seeding = false;
1158         fs_devices->fs_info = NULL;
1159 }
1160
1161 void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
1162 {
1163         LIST_HEAD(list);
1164         struct btrfs_fs_devices *tmp;
1165
1166         mutex_lock(&uuid_mutex);
1167         close_fs_devices(fs_devices);
1168         if (!fs_devices->opened)
1169                 list_splice_init(&fs_devices->seed_list, &list);
1170
1171         list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
1172                 close_fs_devices(fs_devices);
1173                 list_del(&fs_devices->seed_list);
1174                 free_fs_devices(fs_devices);
1175         }
1176         mutex_unlock(&uuid_mutex);
1177 }
1178
1179 static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
1180                                 fmode_t flags, void *holder)
1181 {
1182         struct btrfs_device *device;
1183         struct btrfs_device *latest_dev = NULL;
1184         struct btrfs_device *tmp_device;
1185
1186         flags |= FMODE_EXCL;
1187
1188         list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
1189                                  dev_list) {
1190                 int ret;
1191
1192                 ret = btrfs_open_one_device(fs_devices, device, flags, holder);
1193                 if (ret == 0 &&
1194                     (!latest_dev || device->generation > latest_dev->generation)) {
1195                         latest_dev = device;
1196                 } else if (ret == -ENODATA) {
1197                         fs_devices->num_devices--;
1198                         list_del(&device->dev_list);
1199                         btrfs_free_device(device);
1200                 }
1201         }
1202         if (fs_devices->open_devices == 0)
1203                 return -EINVAL;
1204
1205         fs_devices->opened = 1;
1206         fs_devices->latest_dev = latest_dev;
1207         fs_devices->total_rw_bytes = 0;
1208         fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
1209         fs_devices->read_policy = BTRFS_READ_POLICY_PID;
1210
1211         return 0;
1212 }
1213
1214 static int devid_cmp(void *priv, const struct list_head *a,
1215                      const struct list_head *b)
1216 {
1217         const struct btrfs_device *dev1, *dev2;
1218
1219         dev1 = list_entry(a, struct btrfs_device, dev_list);
1220         dev2 = list_entry(b, struct btrfs_device, dev_list);
1221
1222         if (dev1->devid < dev2->devid)
1223                 return -1;
1224         else if (dev1->devid > dev2->devid)
1225                 return 1;
1226         return 0;
1227 }
1228
1229 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
1230                        fmode_t flags, void *holder)
1231 {
1232         int ret;
1233
1234         lockdep_assert_held(&uuid_mutex);
1235         /*
1236          * The device_list_mutex cannot be taken here in case opening the
1237          * underlying device takes further locks like open_mutex.
1238          *
1239          * We also don't need the lock here as this is called during mount and
1240          * exclusion is provided by uuid_mutex
1241          */
1242
1243         if (fs_devices->opened) {
1244                 fs_devices->opened++;
1245                 ret = 0;
1246         } else {
1247                 list_sort(NULL, &fs_devices->devices, devid_cmp);
1248                 ret = open_fs_devices(fs_devices, flags, holder);
1249         }
1250
1251         return ret;
1252 }
1253
/*
 * Drop the reference on the page that backs @super. Any address inside that
 * page identifies it, so @super need not be page aligned.
 */
void btrfs_release_disk_super(struct btrfs_super_block *super)
{
	put_page(virt_to_page(super));
}
1260
1261 static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
1262                                                        u64 bytenr, u64 bytenr_orig)
1263 {
1264         struct btrfs_super_block *disk_super;
1265         struct page *page;
1266         void *p;
1267         pgoff_t index;
1268
1269         /* make sure our super fits in the device */
1270         if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev))
1271                 return ERR_PTR(-EINVAL);
1272
1273         /* make sure our super fits in the page */
1274         if (sizeof(*disk_super) > PAGE_SIZE)
1275                 return ERR_PTR(-EINVAL);
1276
1277         /* make sure our super doesn't straddle pages on disk */
1278         index = bytenr >> PAGE_SHIFT;
1279         if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
1280                 return ERR_PTR(-EINVAL);
1281
1282         /* pull in the page with our super */
1283         page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);
1284
1285         if (IS_ERR(page))
1286                 return ERR_CAST(page);
1287
1288         p = page_address(page);
1289
1290         /* align our pointer to the offset of the super block */
1291         disk_super = p + offset_in_page(bytenr);
1292
1293         if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
1294             btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
1295                 btrfs_release_disk_super(p);
1296                 return ERR_PTR(-EINVAL);
1297         }
1298
1299         if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
1300                 disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;
1301
1302         return disk_super;
1303 }
1304
1305 int btrfs_forget_devices(dev_t devt)
1306 {
1307         int ret;
1308
1309         mutex_lock(&uuid_mutex);
1310         ret = btrfs_free_stale_devices(devt, NULL);
1311         mutex_unlock(&uuid_mutex);
1312
1313         return ret;
1314 }
1315
1316 /*
1317  * Look for a btrfs signature on a device. This may be called out of the mount path
1318  * and we are not allowed to call set_blocksize during the scan. The superblock
1319  * is read via pagecache
1320  */
1321 struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
1322                                            void *holder)
1323 {
1324         struct btrfs_super_block *disk_super;
1325         bool new_device_added = false;
1326         struct btrfs_device *device = NULL;
1327         struct block_device *bdev;
1328         u64 bytenr, bytenr_orig;
1329         int ret;
1330
1331         lockdep_assert_held(&uuid_mutex);
1332
1333         /*
1334          * we would like to check all the supers, but that would make
1335          * a btrfs mount succeed after a mkfs from a different FS.
1336          * So, we need to add a special mount option to scan for
1337          * later supers, using BTRFS_SUPER_MIRROR_MAX instead
1338          */
1339         flags |= FMODE_EXCL;
1340
1341         bdev = blkdev_get_by_path(path, flags, holder);
1342         if (IS_ERR(bdev))
1343                 return ERR_CAST(bdev);
1344
1345         bytenr_orig = btrfs_sb_offset(0);
1346         ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
1347         if (ret) {
1348                 device = ERR_PTR(ret);
1349                 goto error_bdev_put;
1350         }
1351
1352         disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
1353         if (IS_ERR(disk_super)) {
1354                 device = ERR_CAST(disk_super);
1355                 goto error_bdev_put;
1356         }
1357
1358         device = device_list_add(path, disk_super, &new_device_added);
1359         if (!IS_ERR(device) && new_device_added)
1360                 btrfs_free_stale_devices(device->devt, device);
1361
1362         btrfs_release_disk_super(disk_super);
1363
1364 error_bdev_put:
1365         blkdev_put(bdev, flags);
1366
1367         return device;
1368 }
1369
1370 /*
1371  * Try to find a chunk that intersects [start, start + len] range and when one
1372  * such is found, record the end of it in *start
1373  */
1374 static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
1375                                     u64 len)
1376 {
1377         u64 physical_start, physical_end;
1378
1379         lockdep_assert_held(&device->fs_info->chunk_mutex);
1380
1381         if (!find_first_extent_bit(&device->alloc_state, *start,
1382                                    &physical_start, &physical_end,
1383                                    CHUNK_ALLOCATED, NULL)) {
1384
1385                 if (in_range(physical_start, *start, len) ||
1386                     in_range(*start, physical_start,
1387                              physical_end - physical_start)) {
1388                         *start = physical_end + 1;
1389                         return true;
1390                 }
1391         }
1392         return false;
1393 }
1394
1395 static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
1396 {
1397         switch (device->fs_devices->chunk_alloc_policy) {
1398         case BTRFS_CHUNK_ALLOC_REGULAR:
1399                 /*
1400                  * We don't want to overwrite the superblock on the drive nor
1401                  * any area used by the boot loader (grub for example), so we
1402                  * make sure to start at an offset of at least 1MB.
1403                  */
1404                 return max_t(u64, start, SZ_1M);
1405         case BTRFS_CHUNK_ALLOC_ZONED:
1406                 /*
1407                  * We don't care about the starting region like regular
1408                  * allocator, because we anyway use/reserve the first two zones
1409                  * for superblock logging.
1410                  */
1411                 return ALIGN(start, device->zone_info->zone_size);
1412         default:
1413                 BUG();
1414         }
1415 }
1416
1417 static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
1418                                         u64 *hole_start, u64 *hole_size,
1419                                         u64 num_bytes)
1420 {
1421         u64 zone_size = device->zone_info->zone_size;
1422         u64 pos;
1423         int ret;
1424         bool changed = false;
1425
1426         ASSERT(IS_ALIGNED(*hole_start, zone_size));
1427
1428         while (*hole_size > 0) {
1429                 pos = btrfs_find_allocatable_zones(device, *hole_start,
1430                                                    *hole_start + *hole_size,
1431                                                    num_bytes);
1432                 if (pos != *hole_start) {
1433                         *hole_size = *hole_start + *hole_size - pos;
1434                         *hole_start = pos;
1435                         changed = true;
1436                         if (*hole_size < num_bytes)
1437                                 break;
1438                 }
1439
1440                 ret = btrfs_ensure_empty_zones(device, pos, num_bytes);
1441
1442                 /* Range is ensured to be empty */
1443                 if (!ret)
1444                         return changed;
1445
1446                 /* Given hole range was invalid (outside of device) */
1447                 if (ret == -ERANGE) {
1448                         *hole_start += *hole_size;
1449                         *hole_size = 0;
1450                         return true;
1451                 }
1452
1453                 *hole_start += zone_size;
1454                 *hole_size -= zone_size;
1455                 changed = true;
1456         }
1457
1458         return changed;
1459 }
1460
1461 /**
1462  * dev_extent_hole_check - check if specified hole is suitable for allocation
1463  * @device:     the device which we have the hole
1464  * @hole_start: starting position of the hole
1465  * @hole_size:  the size of the hole
1466  * @num_bytes:  the size of the free space that we need
1467  *
1468  * This function may modify @hole_start and @hole_size to reflect the suitable
1469  * position for allocation. Returns 1 if hole position is updated, 0 otherwise.
1470  */
1471 static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
1472                                   u64 *hole_size, u64 num_bytes)
1473 {
1474         bool changed = false;
1475         u64 hole_end = *hole_start + *hole_size;
1476
1477         for (;;) {
1478                 /*
1479                  * Check before we set max_hole_start, otherwise we could end up
1480                  * sending back this offset anyway.
1481                  */
1482                 if (contains_pending_extent(device, hole_start, *hole_size)) {
1483                         if (hole_end >= *hole_start)
1484                                 *hole_size = hole_end - *hole_start;
1485                         else
1486                                 *hole_size = 0;
1487                         changed = true;
1488                 }
1489
1490                 switch (device->fs_devices->chunk_alloc_policy) {
1491                 case BTRFS_CHUNK_ALLOC_REGULAR:
1492                         /* No extra check */
1493                         break;
1494                 case BTRFS_CHUNK_ALLOC_ZONED:
1495                         if (dev_extent_hole_check_zoned(device, hole_start,
1496                                                         hole_size, num_bytes)) {
1497                                 changed = true;
1498                                 /*
1499                                  * The changed hole can contain pending extent.
1500                                  * Loop again to check that.
1501                                  */
1502                                 continue;
1503                         }
1504                         break;
1505                 default:
1506                         BUG();
1507                 }
1508
1509                 break;
1510         }
1511
1512         return changed;
1513 }
1514
1515 /*
1516  * find_free_dev_extent_start - find free space in the specified device
1517  * @device:       the device which we search the free space in
1518  * @num_bytes:    the size of the free space that we need
1519  * @search_start: the position from which to begin the search
1520  * @start:        store the start of the free space.
1521  * @len:          the size of the free space that we find, or the size
1522  *                of the max free space if we don't find suitable free space
1523  *
1524  * this uses a pretty simple search, the expectation is that it is
1525  * called very infrequently and that a given device has a small number
1526  * of extents
1527  *
1528  * @start is used to store the start of the free space if we find. But if we
1529  * don't find suitable free space, it will be used to store the start position
1530  * of the max free space.
1531  *
1532  * @len is used to store the size of the free space that we find.
1533  * But if we don't find suitable free space, it is used to store the size of
1534  * the max free space.
1535  *
1536  * NOTE: This function will search *commit* root of device tree, and does extra
1537  * check to ensure dev extents are not double allocated.
1538  * This makes the function safe to allocate dev extents but may not report
1539  * correct usable device space, as device extent freed in current transaction
1540  * is not reported as available.
1541  */
1542 static int find_free_dev_extent_start(struct btrfs_device *device,
1543                                 u64 num_bytes, u64 search_start, u64 *start,
1544                                 u64 *len)
1545 {
1546         struct btrfs_fs_info *fs_info = device->fs_info;
1547         struct btrfs_root *root = fs_info->dev_root;
1548         struct btrfs_key key;
1549         struct btrfs_dev_extent *dev_extent;
1550         struct btrfs_path *path;
1551         u64 hole_size;
1552         u64 max_hole_start;
1553         u64 max_hole_size;
1554         u64 extent_end;
1555         u64 search_end = device->total_bytes;
1556         int ret;
1557         int slot;
1558         struct extent_buffer *l;
1559
1560         search_start = dev_extent_search_start(device, search_start);
1561
1562         WARN_ON(device->zone_info &&
1563                 !IS_ALIGNED(num_bytes, device->zone_info->zone_size));
1564
1565         path = btrfs_alloc_path();
1566         if (!path)
1567                 return -ENOMEM;
1568
1569         max_hole_start = search_start;
1570         max_hole_size = 0;
1571
1572 again:
1573         if (search_start >= search_end ||
1574                 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
1575                 ret = -ENOSPC;
1576                 goto out;
1577         }
1578
1579         path->reada = READA_FORWARD;
1580         path->search_commit_root = 1;
1581         path->skip_locking = 1;
1582
1583         key.objectid = device->devid;
1584         key.offset = search_start;
1585         key.type = BTRFS_DEV_EXTENT_KEY;
1586
1587         ret = btrfs_search_backwards(root, &key, path);
1588         if (ret < 0)
1589                 goto out;
1590
1591         while (1) {
1592                 l = path->nodes[0];
1593                 slot = path->slots[0];
1594                 if (slot >= btrfs_header_nritems(l)) {
1595                         ret = btrfs_next_leaf(root, path);
1596                         if (ret == 0)
1597                                 continue;
1598                         if (ret < 0)
1599                                 goto out;
1600
1601                         break;
1602                 }
1603                 btrfs_item_key_to_cpu(l, &key, slot);
1604
1605                 if (key.objectid < device->devid)
1606                         goto next;
1607
1608                 if (key.objectid > device->devid)
1609                         break;
1610
1611                 if (key.type != BTRFS_DEV_EXTENT_KEY)
1612                         goto next;
1613
1614                 if (key.offset > search_start) {
1615                         hole_size = key.offset - search_start;
1616                         dev_extent_hole_check(device, &search_start, &hole_size,
1617                                               num_bytes);
1618
1619                         if (hole_size > max_hole_size) {
1620                                 max_hole_start = search_start;
1621                                 max_hole_size = hole_size;
1622                         }
1623
1624                         /*
1625                          * If this free space is greater than which we need,
1626                          * it must be the max free space that we have found
1627                          * until now, so max_hole_start must point to the start
1628                          * of this free space and the length of this free space
1629                          * is stored in max_hole_size. Thus, we return
1630                          * max_hole_start and max_hole_size and go back to the
1631                          * caller.
1632                          */
1633                         if (hole_size >= num_bytes) {
1634                                 ret = 0;
1635                                 goto out;
1636                         }
1637                 }
1638
1639                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1640                 extent_end = key.offset + btrfs_dev_extent_length(l,
1641                                                                   dev_extent);
1642                 if (extent_end > search_start)
1643                         search_start = extent_end;
1644 next:
1645                 path->slots[0]++;
1646                 cond_resched();
1647         }
1648
1649         /*
1650          * At this point, search_start should be the end of
1651          * allocated dev extents, and when shrinking the device,
1652          * search_end may be smaller than search_start.
1653          */
1654         if (search_end > search_start) {
1655                 hole_size = search_end - search_start;
1656                 if (dev_extent_hole_check(device, &search_start, &hole_size,
1657                                           num_bytes)) {
1658                         btrfs_release_path(path);
1659                         goto again;
1660                 }
1661
1662                 if (hole_size > max_hole_size) {
1663                         max_hole_start = search_start;
1664                         max_hole_size = hole_size;
1665                 }
1666         }
1667
1668         /* See above. */
1669         if (max_hole_size < num_bytes)
1670                 ret = -ENOSPC;
1671         else
1672                 ret = 0;
1673
1674 out:
1675         btrfs_free_path(path);
1676         *start = max_hole_start;
1677         if (len)
1678                 *len = max_hole_size;
1679         return ret;
1680 }
1681
1682 int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
1683                          u64 *start, u64 *len)
1684 {
1685         /* FIXME use last free of some kind */
1686         return find_free_dev_extent_start(device, num_bytes, 0, start, len);
1687 }
1688
1689 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1690                           struct btrfs_device *device,
1691                           u64 start, u64 *dev_extent_len)
1692 {
1693         struct btrfs_fs_info *fs_info = device->fs_info;
1694         struct btrfs_root *root = fs_info->dev_root;
1695         int ret;
1696         struct btrfs_path *path;
1697         struct btrfs_key key;
1698         struct btrfs_key found_key;
1699         struct extent_buffer *leaf = NULL;
1700         struct btrfs_dev_extent *extent = NULL;
1701
1702         path = btrfs_alloc_path();
1703         if (!path)
1704                 return -ENOMEM;
1705
1706         key.objectid = device->devid;
1707         key.offset = start;
1708         key.type = BTRFS_DEV_EXTENT_KEY;
1709 again:
1710         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1711         if (ret > 0) {
1712                 ret = btrfs_previous_item(root, path, key.objectid,
1713                                           BTRFS_DEV_EXTENT_KEY);
1714                 if (ret)
1715                         goto out;
1716                 leaf = path->nodes[0];
1717                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1718                 extent = btrfs_item_ptr(leaf, path->slots[0],
1719                                         struct btrfs_dev_extent);
1720                 BUG_ON(found_key.offset > start || found_key.offset +
1721                        btrfs_dev_extent_length(leaf, extent) < start);
1722                 key = found_key;
1723                 btrfs_release_path(path);
1724                 goto again;
1725         } else if (ret == 0) {
1726                 leaf = path->nodes[0];
1727                 extent = btrfs_item_ptr(leaf, path->slots[0],
1728                                         struct btrfs_dev_extent);
1729         } else {
1730                 goto out;
1731         }
1732
1733         *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
1734
1735         ret = btrfs_del_item(trans, root, path);
1736         if (ret == 0)
1737                 set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
1738 out:
1739         btrfs_free_path(path);
1740         return ret;
1741 }
1742
1743 static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1744 {
1745         struct extent_map_tree *em_tree;
1746         struct extent_map *em;
1747         struct rb_node *n;
1748         u64 ret = 0;
1749
1750         em_tree = &fs_info->mapping_tree;
1751         read_lock(&em_tree->lock);
1752         n = rb_last(&em_tree->map.rb_root);
1753         if (n) {
1754                 em = rb_entry(n, struct extent_map, rb_node);
1755                 ret = em->start + em->len;
1756         }
1757         read_unlock(&em_tree->lock);
1758
1759         return ret;
1760 }
1761
1762 static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
1763                                     u64 *devid_ret)
1764 {
1765         int ret;
1766         struct btrfs_key key;
1767         struct btrfs_key found_key;
1768         struct btrfs_path *path;
1769
1770         path = btrfs_alloc_path();
1771         if (!path)
1772                 return -ENOMEM;
1773
1774         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1775         key.type = BTRFS_DEV_ITEM_KEY;
1776         key.offset = (u64)-1;
1777
1778         ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
1779         if (ret < 0)
1780                 goto error;
1781
1782         if (ret == 0) {
1783                 /* Corruption */
1784                 btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
1785                 ret = -EUCLEAN;
1786                 goto error;
1787         }
1788
1789         ret = btrfs_previous_item(fs_info->chunk_root, path,
1790                                   BTRFS_DEV_ITEMS_OBJECTID,
1791                                   BTRFS_DEV_ITEM_KEY);
1792         if (ret) {
1793                 *devid_ret = 1;
1794         } else {
1795                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1796                                       path->slots[0]);
1797                 *devid_ret = found_key.offset + 1;
1798         }
1799         ret = 0;
1800 error:
1801         btrfs_free_path(path);
1802         return ret;
1803 }
1804
1805 /*
1806  * the device information is stored in the chunk root
1807  * the btrfs_device struct should be fully filled in
1808  */
1809 static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
1810                             struct btrfs_device *device)
1811 {
1812         int ret;
1813         struct btrfs_path *path;
1814         struct btrfs_dev_item *dev_item;
1815         struct extent_buffer *leaf;
1816         struct btrfs_key key;
1817         unsigned long ptr;
1818
1819         path = btrfs_alloc_path();
1820         if (!path)
1821                 return -ENOMEM;
1822
1823         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1824         key.type = BTRFS_DEV_ITEM_KEY;
1825         key.offset = device->devid;
1826
1827         btrfs_reserve_chunk_metadata(trans, true);
1828         ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
1829                                       &key, sizeof(*dev_item));
1830         btrfs_trans_release_chunk_metadata(trans);
1831         if (ret)
1832                 goto out;
1833
1834         leaf = path->nodes[0];
1835         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1836
1837         btrfs_set_device_id(leaf, dev_item, device->devid);
1838         btrfs_set_device_generation(leaf, dev_item, 0);
1839         btrfs_set_device_type(leaf, dev_item, device->type);
1840         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1841         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1842         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1843         btrfs_set_device_total_bytes(leaf, dev_item,
1844                                      btrfs_device_get_disk_total_bytes(device));
1845         btrfs_set_device_bytes_used(leaf, dev_item,
1846                                     btrfs_device_get_bytes_used(device));
1847         btrfs_set_device_group(leaf, dev_item, 0);
1848         btrfs_set_device_seek_speed(leaf, dev_item, 0);
1849         btrfs_set_device_bandwidth(leaf, dev_item, 0);
1850         btrfs_set_device_start_offset(leaf, dev_item, 0);
1851
1852         ptr = btrfs_device_uuid(dev_item);
1853         write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1854         ptr = btrfs_device_fsid(dev_item);
1855         write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
1856                             ptr, BTRFS_FSID_SIZE);
1857         btrfs_mark_buffer_dirty(leaf);
1858
1859         ret = 0;
1860 out:
1861         btrfs_free_path(path);
1862         return ret;
1863 }
1864
1865 /*
1866  * Function to update ctime/mtime for a given device path.
1867  * Mainly used for ctime/mtime based probe like libblkid.
1868  *
1869  * We don't care about errors here, this is just to be kind to userspace.
1870  */
1871 static void update_dev_time(const char *device_path)
1872 {
1873         struct path path;
1874         struct timespec64 now;
1875         int ret;
1876
1877         ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
1878         if (ret)
1879                 return;
1880
1881         now = current_time(d_inode(path.dentry));
1882         inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME);
1883         path_put(&path);
1884 }
1885
1886 static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans,
1887                              struct btrfs_device *device)
1888 {
1889         struct btrfs_root *root = device->fs_info->chunk_root;
1890         int ret;
1891         struct btrfs_path *path;
1892         struct btrfs_key key;
1893
1894         path = btrfs_alloc_path();
1895         if (!path)
1896                 return -ENOMEM;
1897
1898         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1899         key.type = BTRFS_DEV_ITEM_KEY;
1900         key.offset = device->devid;
1901
1902         btrfs_reserve_chunk_metadata(trans, false);
1903         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1904         btrfs_trans_release_chunk_metadata(trans);
1905         if (ret) {
1906                 if (ret > 0)
1907                         ret = -ENOENT;
1908                 goto out;
1909         }
1910
1911         ret = btrfs_del_item(trans, root, path);
1912 out:
1913         btrfs_free_path(path);
1914         return ret;
1915 }
1916
1917 /*
1918  * Verify that @num_devices satisfies the RAID profile constraints in the whole
1919  * filesystem. It's up to the caller to adjust that number regarding eg. device
1920  * replace.
1921  */
1922 static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
1923                 u64 num_devices)
1924 {
1925         u64 all_avail;
1926         unsigned seq;
1927         int i;
1928
1929         do {
1930                 seq = read_seqbegin(&fs_info->profiles_lock);
1931
1932                 all_avail = fs_info->avail_data_alloc_bits |
1933                             fs_info->avail_system_alloc_bits |
1934                             fs_info->avail_metadata_alloc_bits;
1935         } while (read_seqretry(&fs_info->profiles_lock, seq));
1936
1937         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
1938                 if (!(all_avail & btrfs_raid_array[i].bg_flag))
1939                         continue;
1940
1941                 if (num_devices < btrfs_raid_array[i].devs_min)
1942                         return btrfs_raid_array[i].mindev_error;
1943         }
1944
1945         return 0;
1946 }
1947
1948 static struct btrfs_device * btrfs_find_next_active_device(
1949                 struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
1950 {
1951         struct btrfs_device *next_device;
1952
1953         list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
1954                 if (next_device != device &&
1955                     !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
1956                     && next_device->bdev)
1957                         return next_device;
1958         }
1959
1960         return NULL;
1961 }
1962
1963 /*
1964  * Helper function to check if the given device is part of s_bdev / latest_dev
1965  * and replace it with the provided or the next active device, in the context
1966  * where this function called, there should be always be another device (or
1967  * this_dev) which is active.
1968  */
1969 void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
1970                                             struct btrfs_device *next_device)
1971 {
1972         struct btrfs_fs_info *fs_info = device->fs_info;
1973
1974         if (!next_device)
1975                 next_device = btrfs_find_next_active_device(fs_info->fs_devices,
1976                                                             device);
1977         ASSERT(next_device);
1978
1979         if (fs_info->sb->s_bdev &&
1980                         (fs_info->sb->s_bdev == device->bdev))
1981                 fs_info->sb->s_bdev = next_device->bdev;
1982
1983         if (fs_info->fs_devices->latest_dev->bdev == device->bdev)
1984                 fs_info->fs_devices->latest_dev = next_device;
1985 }
1986
1987 /*
1988  * Return btrfs_fs_devices::num_devices excluding the device that's being
1989  * currently replaced.
1990  */
1991 static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
1992 {
1993         u64 num_devices = fs_info->fs_devices->num_devices;
1994
1995         down_read(&fs_info->dev_replace.rwsem);
1996         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
1997                 ASSERT(num_devices > 1);
1998                 num_devices--;
1999         }
2000         up_read(&fs_info->dev_replace.rwsem);
2001
2002         return num_devices;
2003 }
2004
2005 void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
2006                                struct block_device *bdev,
2007                                const char *device_path)
2008 {
2009         struct btrfs_super_block *disk_super;
2010         int copy_num;
2011
2012         if (!bdev)
2013                 return;
2014
2015         for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
2016                 struct page *page;
2017                 int ret;
2018
2019                 disk_super = btrfs_read_dev_one_super(bdev, copy_num);
2020                 if (IS_ERR(disk_super))
2021                         continue;
2022
2023                 if (bdev_is_zoned(bdev)) {
2024                         btrfs_reset_sb_log_zones(bdev, copy_num);
2025                         continue;
2026                 }
2027
2028                 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
2029
2030                 page = virt_to_page(disk_super);
2031                 set_page_dirty(page);
2032                 lock_page(page);
2033                 /* write_on_page() unlocks the page */
2034                 ret = write_one_page(page);
2035                 if (ret)
2036                         btrfs_warn(fs_info,
2037                                 "error clearing superblock number %d (%d)",
2038                                 copy_num, ret);
2039                 btrfs_release_disk_super(disk_super);
2040
2041         }
2042
2043         /* Notify udev that device has changed */
2044         btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
2045
2046         /* Update ctime/mtime for device path for libblkid */
2047         update_dev_time(device_path);
2048 }
2049
2050 int btrfs_rm_device(struct btrfs_fs_info *fs_info,
2051                     struct btrfs_dev_lookup_args *args,
2052                     struct block_device **bdev, fmode_t *mode)
2053 {
2054         struct btrfs_trans_handle *trans;
2055         struct btrfs_device *device;
2056         struct btrfs_fs_devices *cur_devices;
2057         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2058         u64 num_devices;
2059         int ret = 0;
2060
2061         if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
2062                 btrfs_err(fs_info, "device remove not supported on extent tree v2 yet");
2063                 return -EINVAL;
2064         }
2065
2066         /*
2067          * The device list in fs_devices is accessed without locks (neither
2068          * uuid_mutex nor device_list_mutex) as it won't change on a mounted
2069          * filesystem and another device rm cannot run.
2070          */
2071         num_devices = btrfs_num_devices(fs_info);
2072
2073         ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
2074         if (ret)
2075                 return ret;
2076
2077         device = btrfs_find_device(fs_info->fs_devices, args);
2078         if (!device) {
2079                 if (args->missing)
2080                         ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
2081                 else
2082                         ret = -ENOENT;
2083                 return ret;
2084         }
2085
2086         if (btrfs_pinned_by_swapfile(fs_info, device)) {
2087                 btrfs_warn_in_rcu(fs_info,
2088                   "cannot remove device %s (devid %llu) due to active swapfile",
2089                                   rcu_str_deref(device->name), device->devid);
2090                 return -ETXTBSY;
2091         }
2092
2093         if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
2094                 return BTRFS_ERROR_DEV_TGT_REPLACE;
2095
2096         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
2097             fs_info->fs_devices->rw_devices == 1)
2098                 return BTRFS_ERROR_DEV_ONLY_WRITABLE;
2099
2100         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2101                 mutex_lock(&fs_info->chunk_mutex);
2102                 list_del_init(&device->dev_alloc_list);
2103                 device->fs_devices->rw_devices--;
2104                 mutex_unlock(&fs_info->chunk_mutex);
2105         }
2106
2107         ret = btrfs_shrink_device(device, 0);
2108         if (ret)
2109                 goto error_undo;
2110
2111         trans = btrfs_start_transaction(fs_info->chunk_root, 0);
2112         if (IS_ERR(trans)) {
2113                 ret = PTR_ERR(trans);
2114                 goto error_undo;
2115         }
2116
2117         ret = btrfs_rm_dev_item(trans, device);
2118         if (ret) {
2119                 /* Any error in dev item removal is critical */
2120                 btrfs_crit(fs_info,
2121                            "failed to remove device item for devid %llu: %d",
2122                            device->devid, ret);
2123                 btrfs_abort_transaction(trans, ret);
2124                 btrfs_end_transaction(trans);
2125                 return ret;
2126         }
2127
2128         clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2129         btrfs_scrub_cancel_dev(device);
2130
2131         /*
2132          * the device list mutex makes sure that we don't change
2133          * the device list while someone else is writing out all
2134          * the device supers. Whoever is writing all supers, should
2135          * lock the device list mutex before getting the number of
2136          * devices in the super block (super_copy). Conversely,
2137          * whoever updates the number of devices in the super block
2138          * (super_copy) should hold the device list mutex.
2139          */
2140
2141         /*
2142          * In normal cases the cur_devices == fs_devices. But in case
2143          * of deleting a seed device, the cur_devices should point to
2144          * its own fs_devices listed under the fs_devices->seed_list.
2145          */
2146         cur_devices = device->fs_devices;
2147         mutex_lock(&fs_devices->device_list_mutex);
2148         list_del_rcu(&device->dev_list);
2149
2150         cur_devices->num_devices--;
2151         cur_devices->total_devices--;
2152         /* Update total_devices of the parent fs_devices if it's seed */
2153         if (cur_devices != fs_devices)
2154                 fs_devices->total_devices--;
2155
2156         if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
2157                 cur_devices->missing_devices--;
2158
2159         btrfs_assign_next_active_device(device, NULL);
2160
2161         if (device->bdev) {
2162                 cur_devices->open_devices--;
2163                 /* remove sysfs entry */
2164                 btrfs_sysfs_remove_device(device);
2165         }
2166
2167         num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
2168         btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
2169         mutex_unlock(&fs_devices->device_list_mutex);
2170
2171         /*
2172          * At this point, the device is zero sized and detached from the
2173          * devices list.  All that's left is to zero out the old supers and
2174          * free the device.
2175          *
2176          * We cannot call btrfs_close_bdev() here because we're holding the sb
2177          * write lock, and blkdev_put() will pull in the ->open_mutex on the
2178          * block device and it's dependencies.  Instead just flush the device
2179          * and let the caller do the final blkdev_put.
2180          */
2181         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2182                 btrfs_scratch_superblocks(fs_info, device->bdev,
2183                                           device->name->str);
2184                 if (device->bdev) {
2185                         sync_blockdev(device->bdev);
2186                         invalidate_bdev(device->bdev);
2187                 }
2188         }
2189
2190         *bdev = device->bdev;
2191         *mode = device->mode;
2192         synchronize_rcu();
2193         btrfs_free_device(device);
2194
2195         /*
2196          * This can happen if cur_devices is the private seed devices list.  We
2197          * cannot call close_fs_devices() here because it expects the uuid_mutex
2198          * to be held, but in fact we don't need that for the private
2199          * seed_devices, we can simply decrement cur_devices->opened and then
2200          * remove it from our list and free the fs_devices.
2201          */
2202         if (cur_devices->num_devices == 0) {
2203                 list_del_init(&cur_devices->seed_list);
2204                 ASSERT(cur_devices->opened == 1);
2205                 cur_devices->opened--;
2206                 free_fs_devices(cur_devices);
2207         }
2208
2209         ret = btrfs_commit_transaction(trans);
2210
2211         return ret;
2212
2213 error_undo:
2214         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2215                 mutex_lock(&fs_info->chunk_mutex);
2216                 list_add(&device->dev_alloc_list,
2217                          &fs_devices->alloc_list);
2218                 device->fs_devices->rw_devices++;
2219                 mutex_unlock(&fs_info->chunk_mutex);
2220         }
2221         return ret;
2222 }
2223
2224 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
2225 {
2226         struct btrfs_fs_devices *fs_devices;
2227
2228         lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);
2229
2230         /*
2231          * in case of fs with no seed, srcdev->fs_devices will point
2232          * to fs_devices of fs_info. However when the dev being replaced is
2233          * a seed dev it will point to the seed's local fs_devices. In short
2234          * srcdev will have its correct fs_devices in both the cases.
2235          */
2236         fs_devices = srcdev->fs_devices;
2237
2238         list_del_rcu(&srcdev->dev_list);
2239         list_del(&srcdev->dev_alloc_list);
2240         fs_devices->num_devices--;
2241         if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
2242                 fs_devices->missing_devices--;
2243
2244         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
2245                 fs_devices->rw_devices--;
2246
2247         if (srcdev->bdev)
2248                 fs_devices->open_devices--;
2249 }
2250
2251 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
2252 {
2253         struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
2254
2255         mutex_lock(&uuid_mutex);
2256
2257         btrfs_close_bdev(srcdev);
2258         synchronize_rcu();
2259         btrfs_free_device(srcdev);
2260
2261         /* if this is no devs we rather delete the fs_devices */
2262         if (!fs_devices->num_devices) {
2263                 /*
2264                  * On a mounted FS, num_devices can't be zero unless it's a
2265                  * seed. In case of a seed device being replaced, the replace
2266                  * target added to the sprout FS, so there will be no more
2267                  * device left under the seed FS.
2268                  */
2269                 ASSERT(fs_devices->seeding);
2270
2271                 list_del_init(&fs_devices->seed_list);
2272                 close_fs_devices(fs_devices);
2273                 free_fs_devices(fs_devices);
2274         }
2275         mutex_unlock(&uuid_mutex);
2276 }
2277
2278 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
2279 {
2280         struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;
2281
2282         mutex_lock(&fs_devices->device_list_mutex);
2283
2284         btrfs_sysfs_remove_device(tgtdev);
2285
2286         if (tgtdev->bdev)
2287                 fs_devices->open_devices--;
2288
2289         fs_devices->num_devices--;
2290
2291         btrfs_assign_next_active_device(tgtdev, NULL);
2292
2293         list_del_rcu(&tgtdev->dev_list);
2294
2295         mutex_unlock(&fs_devices->device_list_mutex);
2296
2297         btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
2298                                   tgtdev->name->str);
2299
2300         btrfs_close_bdev(tgtdev);
2301         synchronize_rcu();
2302         btrfs_free_device(tgtdev);
2303 }
2304
2305 /**
2306  * Populate args from device at path
2307  *
2308  * @fs_info:    the filesystem
2309  * @args:       the args to populate
2310  * @path:       the path to the device
2311  *
2312  * This will read the super block of the device at @path and populate @args with
2313  * the devid, fsid, and uuid.  This is meant to be used for ioctls that need to
2314  * lookup a device to operate on, but need to do it before we take any locks.
2315  * This properly handles the special case of "missing" that a user may pass in,
2316  * and does some basic sanity checks.  The caller must make sure that @path is
2317  * properly NUL terminated before calling in, and must call
2318  * btrfs_put_dev_args_from_path() in order to free up the temporary fsid and
2319  * uuid buffers.
2320  *
2321  * Return: 0 for success, -errno for failure
2322  */
2323 int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
2324                                  struct btrfs_dev_lookup_args *args,
2325                                  const char *path)
2326 {
2327         struct btrfs_super_block *disk_super;
2328         struct block_device *bdev;
2329         int ret;
2330
2331         if (!path || !path[0])
2332                 return -EINVAL;
2333         if (!strcmp(path, "missing")) {
2334                 args->missing = true;
2335                 return 0;
2336         }
2337
2338         args->uuid = kzalloc(BTRFS_UUID_SIZE, GFP_KERNEL);
2339         args->fsid = kzalloc(BTRFS_FSID_SIZE, GFP_KERNEL);
2340         if (!args->uuid || !args->fsid) {
2341                 btrfs_put_dev_args_from_path(args);
2342                 return -ENOMEM;
2343         }
2344
2345         ret = btrfs_get_bdev_and_sb(path, FMODE_READ, fs_info->bdev_holder, 0,
2346                                     &bdev, &disk_super);
2347         if (ret)
2348                 return ret;
2349         args->devid = btrfs_stack_device_id(&disk_super->dev_item);
2350         memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE);
2351         if (btrfs_fs_incompat(fs_info, METADATA_UUID))
2352                 memcpy(args->fsid, disk_super->metadata_uuid, BTRFS_FSID_SIZE);
2353         else
2354                 memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
2355         btrfs_release_disk_super(disk_super);
2356         blkdev_put(bdev, FMODE_READ);
2357         return 0;
2358 }
2359
2360 /*
2361  * Only use this jointly with btrfs_get_dev_args_from_path() because we will
2362  * allocate our ->uuid and ->fsid pointers, everybody else uses local variables
2363  * that don't need to be freed.
2364  */
2365 void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args)
2366 {
2367         kfree(args->uuid);
2368         kfree(args->fsid);
2369         args->uuid = NULL;
2370         args->fsid = NULL;
2371 }
2372
2373 struct btrfs_device *btrfs_find_device_by_devspec(
2374                 struct btrfs_fs_info *fs_info, u64 devid,
2375                 const char *device_path)
2376 {
2377         BTRFS_DEV_LOOKUP_ARGS(args);
2378         struct btrfs_device *device;
2379         int ret;
2380
2381         if (devid) {
2382                 args.devid = devid;
2383                 device = btrfs_find_device(fs_info->fs_devices, &args);
2384                 if (!device)
2385                         return ERR_PTR(-ENOENT);
2386                 return device;
2387         }
2388
2389         ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path);
2390         if (ret)
2391                 return ERR_PTR(ret);
2392         device = btrfs_find_device(fs_info->fs_devices, &args);
2393         btrfs_put_dev_args_from_path(&args);
2394         if (!device)
2395                 return ERR_PTR(-ENOENT);
2396         return device;
2397 }
2398
2399 static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info)
2400 {
2401         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2402         struct btrfs_fs_devices *old_devices;
2403         struct btrfs_fs_devices *seed_devices;
2404
2405         lockdep_assert_held(&uuid_mutex);
2406         if (!fs_devices->seeding)
2407                 return ERR_PTR(-EINVAL);
2408
2409         /*
2410          * Private copy of the seed devices, anchored at
2411          * fs_info->fs_devices->seed_list
2412          */
2413         seed_devices = alloc_fs_devices(NULL, NULL);
2414         if (IS_ERR(seed_devices))
2415                 return seed_devices;
2416
2417         /*
2418          * It's necessary to retain a copy of the original seed fs_devices in
2419          * fs_uuids so that filesystems which have been seeded can successfully
2420          * reference the seed device from open_seed_devices. This also supports
2421          * multiple fs seed.
2422          */
2423         old_devices = clone_fs_devices(fs_devices);
2424         if (IS_ERR(old_devices)) {
2425                 kfree(seed_devices);
2426                 return old_devices;
2427         }
2428
2429         list_add(&old_devices->fs_list, &fs_uuids);
2430
2431         memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
2432         seed_devices->opened = 1;
2433         INIT_LIST_HEAD(&seed_devices->devices);
2434         INIT_LIST_HEAD(&seed_devices->alloc_list);
2435         mutex_init(&seed_devices->device_list_mutex);
2436
2437         return seed_devices;
2438 }
2439
2440 /*
2441  * Splice seed devices into the sprout fs_devices.
2442  * Generate a new fsid for the sprouted read-write filesystem.
2443  */
2444 static void btrfs_setup_sprout(struct btrfs_fs_info *fs_info,
2445                                struct btrfs_fs_devices *seed_devices)
2446 {
2447         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2448         struct btrfs_super_block *disk_super = fs_info->super_copy;
2449         struct btrfs_device *device;
2450         u64 super_flags;
2451
2452         /*
2453          * We are updating the fsid, the thread leading to device_list_add()
2454          * could race, so uuid_mutex is needed.
2455          */
2456         lockdep_assert_held(&uuid_mutex);
2457
2458         /*
2459          * The threads listed below may traverse dev_list but can do that without
2460          * device_list_mutex:
2461          * - All device ops and balance - as we are in btrfs_exclop_start.
2462          * - Various dev_list readers - are using RCU.
2463          * - btrfs_ioctl_fitrim() - is using RCU.
2464          *
2465          * For-read threads as below are using device_list_mutex:
2466          * - Readonly scrub btrfs_scrub_dev()
2467          * - Readonly scrub btrfs_scrub_progress()
2468          * - btrfs_get_dev_stats()
2469          */
2470         lockdep_assert_held(&fs_devices->device_list_mutex);
2471
2472         list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
2473                               synchronize_rcu);
2474         list_for_each_entry(device, &seed_devices->devices, dev_list)
2475                 device->fs_devices = seed_devices;
2476
2477         fs_devices->seeding = false;
2478         fs_devices->num_devices = 0;
2479         fs_devices->open_devices = 0;
2480         fs_devices->missing_devices = 0;
2481         fs_devices->rotating = false;
2482         list_add(&seed_devices->seed_list, &fs_devices->seed_list);
2483
2484         generate_random_uuid(fs_devices->fsid);
2485         memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
2486         memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2487
2488         super_flags = btrfs_super_flags(disk_super) &
2489                       ~BTRFS_SUPER_FLAG_SEEDING;
2490         btrfs_set_super_flags(disk_super, super_flags);
2491 }
2492
2493 /*
2494  * Store the expected generation for seed devices in device items.
2495  */
2496 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
2497 {
2498         BTRFS_DEV_LOOKUP_ARGS(args);
2499         struct btrfs_fs_info *fs_info = trans->fs_info;
2500         struct btrfs_root *root = fs_info->chunk_root;
2501         struct btrfs_path *path;
2502         struct extent_buffer *leaf;
2503         struct btrfs_dev_item *dev_item;
2504         struct btrfs_device *device;
2505         struct btrfs_key key;
2506         u8 fs_uuid[BTRFS_FSID_SIZE];
2507         u8 dev_uuid[BTRFS_UUID_SIZE];
2508         int ret;
2509
2510         path = btrfs_alloc_path();
2511         if (!path)
2512                 return -ENOMEM;
2513
2514         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2515         key.offset = 0;
2516         key.type = BTRFS_DEV_ITEM_KEY;
2517
2518         while (1) {
2519                 btrfs_reserve_chunk_metadata(trans, false);
2520                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2521                 btrfs_trans_release_chunk_metadata(trans);
2522                 if (ret < 0)
2523                         goto error;
2524
2525                 leaf = path->nodes[0];
2526 next_slot:
2527                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2528                         ret = btrfs_next_leaf(root, path);
2529                         if (ret > 0)
2530                                 break;
2531                         if (ret < 0)
2532                                 goto error;
2533                         leaf = path->nodes[0];
2534                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2535                         btrfs_release_path(path);
2536                         continue;
2537                 }
2538
2539                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2540                 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
2541                     key.type != BTRFS_DEV_ITEM_KEY)
2542                         break;
2543
2544                 dev_item = btrfs_item_ptr(leaf, path->slots[0],
2545                                           struct btrfs_dev_item);
2546                 args.devid = btrfs_device_id(leaf, dev_item);
2547                 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
2548                                    BTRFS_UUID_SIZE);
2549                 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
2550                                    BTRFS_FSID_SIZE);
2551                 args.uuid = dev_uuid;
2552                 args.fsid = fs_uuid;
2553                 device = btrfs_find_device(fs_info->fs_devices, &args);
2554                 BUG_ON(!device); /* Logic error */
2555
2556                 if (device->fs_devices->seeding) {
2557                         btrfs_set_device_generation(leaf, dev_item,
2558                                                     device->generation);
2559                         btrfs_mark_buffer_dirty(leaf);
2560                 }
2561
2562                 path->slots[0]++;
2563                 goto next_slot;
2564         }
2565         ret = 0;
2566 error:
2567         btrfs_free_path(path);
2568         return ret;
2569 }
2570
/*
 * Add the block device at @device_path to the mounted filesystem @fs_info.
 *
 * The device is opened exclusively for writing, rejected when its zone type
 * does not fit the filesystem or when it is already one of the filesystem's
 * devices, and then linked into fs_devices, accounted in the in-memory super
 * block copy and recorded with a dev item inside a transaction.
 *
 * When fs_devices->seeding is set, the filesystem is additionally sprouted:
 * the read-only flag of the super block is cleared, the seed devices are moved
 * aside by btrfs_init_sprout()/btrfs_setup_sprout(), and the system chunks are
 * relocated after the first commit.
 *
 * Return: 0 on success, negative errno on failure.
 */
2571 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
2572 {
2573         struct btrfs_root *root = fs_info->dev_root;
2574         struct btrfs_trans_handle *trans;
2575         struct btrfs_device *device;
2576         struct block_device *bdev;
2577         struct super_block *sb = fs_info->sb;
2578         struct rcu_string *name;
2579         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2580         struct btrfs_fs_devices *seed_devices;
2581         u64 orig_super_total_bytes;
2582         u64 orig_super_num_devices;
2583         int ret = 0;
2584         bool seeding_dev = false;
2585         bool locked = false;
2586
             /* A read-only super block is only accepted when sprouting a seed. */
2587         if (sb_rdonly(sb) && !fs_devices->seeding)
2588                 return -EROFS;
2589
2590         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2591                                   fs_info->bdev_holder);
2592         if (IS_ERR(bdev))
2593                 return PTR_ERR(bdev);
2594
2595         if (!btrfs_check_device_zone_type(fs_info, bdev)) {
2596                 ret = -EINVAL;
2597                 goto error;
2598         }
2599
             /*
              * Sprouting: s_umount and uuid_mutex stay held until after the
              * first transaction commit below.  'locked' tells the error path
              * whether they still need to be dropped.
              */
2600         if (fs_devices->seeding) {
2601                 seeding_dev = true;
2602                 down_write(&sb->s_umount);
2603                 mutex_lock(&uuid_mutex);
2604                 locked = true;
2605         }
2606
2607         sync_blockdev(bdev);
2608
             /* Refuse a block device that is already part of this filesystem. */
2609         rcu_read_lock();
2610         list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
2611                 if (device->bdev == bdev) {
2612                         ret = -EEXIST;
2613                         rcu_read_unlock();
2614                         goto error;
2615                 }
2616         }
2617         rcu_read_unlock();
2618
2619         device = btrfs_alloc_device(fs_info, NULL, NULL);
2620         if (IS_ERR(device)) {
2621                 /* we can safely leave the fs_devices entry around */
2622                 ret = PTR_ERR(device);
2623                 goto error;
2624         }
2625
2626         name = rcu_string_strdup(device_path, GFP_KERNEL);
2627         if (!name) {
2628                 ret = -ENOMEM;
2629                 goto error_free_device;
2630         }
2631         rcu_assign_pointer(device->name, name);
2632
2633         device->fs_info = fs_info;
2634         device->bdev = bdev;
2635         ret = lookup_bdev(device_path, &device->devt);
2636         if (ret)
2637                 goto error_free_device;
2638
2639         ret = btrfs_get_dev_zone_info(device, false);
2640         if (ret)
2641                 goto error_free_device;
2642
2643         trans = btrfs_start_transaction(root, 0);
2644         if (IS_ERR(trans)) {
2645                 ret = PTR_ERR(trans);
2646                 goto error_free_zone;
2647         }
2648
             /* Fill in the in-memory device; sizes are in units of sectorsize. */
2649         set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
2650         device->generation = trans->transid;
2651         device->io_width = fs_info->sectorsize;
2652         device->io_align = fs_info->sectorsize;
2653         device->sector_size = fs_info->sectorsize;
2654         device->total_bytes =
2655                 round_down(bdev_nr_bytes(bdev), fs_info->sectorsize);
2656         device->disk_total_bytes = device->total_bytes;
2657         device->commit_total_bytes = device->total_bytes;
2658         set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2659         clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
2660         device->mode = FMODE_EXCL;
2661         device->dev_stats_valid = 1;
2662         set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
2663
2664         if (seeding_dev) {
2665                 btrfs_clear_sb_rdonly(sb);
2666
2667                 /* GFP_KERNEL allocation must not be under device_list_mutex */
2668                 seed_devices = btrfs_init_sprout(fs_info);
2669                 if (IS_ERR(seed_devices)) {
2670                         ret = PTR_ERR(seed_devices);
2671                         btrfs_abort_transaction(trans, ret);
2672                         goto error_trans;
2673                 }
2674         }
2675
2676         mutex_lock(&fs_devices->device_list_mutex);
2677         if (seeding_dev) {
2678                 btrfs_setup_sprout(fs_info, seed_devices);
2679                 btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev,
2680                                                 device);
2681         }
2682
2683         device->fs_devices = fs_devices;
2684
             /*
              * Link the device and update the accounting.  The previous super
              * block values are saved so that error_sysfs can restore them.
              */
2685         mutex_lock(&fs_info->chunk_mutex);
2686         list_add_rcu(&device->dev_list, &fs_devices->devices);
2687         list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
2688         fs_devices->num_devices++;
2689         fs_devices->open_devices++;
2690         fs_devices->rw_devices++;
2691         fs_devices->total_devices++;
2692         fs_devices->total_rw_bytes += device->total_bytes;
2693
2694         atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
2695
2696         if (!bdev_nonrot(bdev))
2697                 fs_devices->rotating = true;
2698
2699         orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
2700         btrfs_set_super_total_bytes(fs_info->super_copy,
2701                 round_down(orig_super_total_bytes + device->total_bytes,
2702                            fs_info->sectorsize));
2703
2704         orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
2705         btrfs_set_super_num_devices(fs_info->super_copy,
2706                                     orig_super_num_devices + 1);
2707
2708         /*
2709          * we've got more storage, clear any full flags on the space
2710          * infos
2711          */
2712         btrfs_clear_space_info_full(fs_info);
2713
2714         mutex_unlock(&fs_info->chunk_mutex);
2715
2716         /* Add sysfs device entry */
2717         btrfs_sysfs_add_device(device);
2718
2719         mutex_unlock(&fs_devices->device_list_mutex);
2720
2721         if (seeding_dev) {
2722                 mutex_lock(&fs_info->chunk_mutex);
2723                 ret = init_first_rw_device(trans);
2724                 mutex_unlock(&fs_info->chunk_mutex);
2725                 if (ret) {
2726                         btrfs_abort_transaction(trans, ret);
2727                         goto error_sysfs;
2728                 }
2729         }
2730
2731         ret = btrfs_add_dev_item(trans, device);
2732         if (ret) {
2733                 btrfs_abort_transaction(trans, ret);
2734                 goto error_sysfs;
2735         }
2736
2737         if (seeding_dev) {
2738                 ret = btrfs_finish_sprout(trans);
2739                 if (ret) {
2740                         btrfs_abort_transaction(trans, ret);
2741                         goto error_sysfs;
2742                 }
2743
2744                 /*
2745                  * fs_devices now represents the newly sprouted filesystem and
2746                  * its fsid has been changed by btrfs_setup_sprout().
2747                  */
2748                 btrfs_sysfs_update_sprout_fsid(fs_devices);
2749         }
2750
2751         ret = btrfs_commit_transaction(trans);
2752
2753         if (seeding_dev) {
2754                 mutex_unlock(&uuid_mutex);
2755                 up_write(&sb->s_umount);
2756                 locked = false;
2757
2758                 if (ret) /* transaction commit */
2759                         return ret;
2760
2761                 ret = btrfs_relocate_sys_chunks(fs_info);
2762                 if (ret < 0)
2763                         btrfs_handle_fs_error(fs_info, ret,
2764                                     "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
                     /* -ENOENT from the attach is treated as success. */
2765                 trans = btrfs_attach_transaction(root);
2766                 if (IS_ERR(trans)) {
2767                         if (PTR_ERR(trans) == -ENOENT)
2768                                 return 0;
2769                         ret = PTR_ERR(trans);
2770                         trans = NULL;
2771                         goto error_sysfs;
2772                 }
2773                 ret = btrfs_commit_transaction(trans);
2774         }
2775
2776         /*
2777          * Now that we have written a new super block to this device, check all
2778          * other fs_devices list if device_path alienates any other scanned
2779          * device.
2780          * We can ignore the return value as it typically returns -EINVAL and
2781          * only succeeds if the device was an alien.
2782          */
2783         btrfs_forget_devices(device->devt);
2784
2785         /* Update ctime/mtime for blkid or udev */
2786         update_dev_time(device_path);
2787
2788         return ret;
2789
     /*
      * Error unwinding: each label undoes one setup step and falls through to
      * the labels below it.
      */
2790 error_sysfs:
2791         btrfs_sysfs_remove_device(device);
2792         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2793         mutex_lock(&fs_info->chunk_mutex);
2794         list_del_rcu(&device->dev_list);
2795         list_del(&device->dev_alloc_list);
2796         fs_info->fs_devices->num_devices--;
2797         fs_info->fs_devices->open_devices--;
2798         fs_info->fs_devices->rw_devices--;
2799         fs_info->fs_devices->total_devices--;
2800         fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
2801         atomic64_sub(device->total_bytes, &fs_info->free_chunk_space);
2802         btrfs_set_super_total_bytes(fs_info->super_copy,
2803                                     orig_super_total_bytes);
2804         btrfs_set_super_num_devices(fs_info->super_copy,
2805                                     orig_super_num_devices);
2806         mutex_unlock(&fs_info->chunk_mutex);
2807         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2808 error_trans:
2809         if (seeding_dev)
2810                 btrfs_set_sb_rdonly(sb);
2811         if (trans)
2812                 btrfs_end_transaction(trans);
2813 error_free_zone:
2814         btrfs_destroy_dev_zone_info(device);
2815 error_free_device:
2816         btrfs_free_device(device);
2817 error:
2818         blkdev_put(bdev, FMODE_EXCL);
2819         if (locked) {
2820                 mutex_unlock(&uuid_mutex);
2821                 up_write(&sb->s_umount);
2822         }
2823         return ret;
2824 }
2825
2826 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2827                                         struct btrfs_device *device)
2828 {
2829         int ret;
2830         struct btrfs_path *path;
2831         struct btrfs_root *root = device->fs_info->chunk_root;
2832         struct btrfs_dev_item *dev_item;
2833         struct extent_buffer *leaf;
2834         struct btrfs_key key;
2835
2836         path = btrfs_alloc_path();
2837         if (!path)
2838                 return -ENOMEM;
2839
2840         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2841         key.type = BTRFS_DEV_ITEM_KEY;
2842         key.offset = device->devid;
2843
2844         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2845         if (ret < 0)
2846                 goto out;
2847
2848         if (ret > 0) {
2849                 ret = -ENOENT;
2850                 goto out;
2851         }
2852
2853         leaf = path->nodes[0];
2854         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2855
2856         btrfs_set_device_id(leaf, dev_item, device->devid);
2857         btrfs_set_device_type(leaf, dev_item, device->type);
2858         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2859         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2860         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2861         btrfs_set_device_total_bytes(leaf, dev_item,
2862                                      btrfs_device_get_disk_total_bytes(device));
2863         btrfs_set_device_bytes_used(leaf, dev_item,
2864                                     btrfs_device_get_bytes_used(device));
2865         btrfs_mark_buffer_dirty(leaf);
2866
2867 out:
2868         btrfs_free_path(path);
2869         return ret;
2870 }
2871
2872 int btrfs_grow_device(struct btrfs_trans_handle *trans,
2873                       struct btrfs_device *device, u64 new_size)
2874 {
2875         struct btrfs_fs_info *fs_info = device->fs_info;
2876         struct btrfs_super_block *super_copy = fs_info->super_copy;
2877         u64 old_total;
2878         u64 diff;
2879         int ret;
2880
2881         if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2882                 return -EACCES;
2883
2884         new_size = round_down(new_size, fs_info->sectorsize);
2885
2886         mutex_lock(&fs_info->chunk_mutex);
2887         old_total = btrfs_super_total_bytes(super_copy);
2888         diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
2889
2890         if (new_size <= device->total_bytes ||
2891             test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2892                 mutex_unlock(&fs_info->chunk_mutex);
2893                 return -EINVAL;
2894         }
2895
2896         btrfs_set_super_total_bytes(super_copy,
2897                         round_down(old_total + diff, fs_info->sectorsize));
2898         device->fs_devices->total_rw_bytes += diff;
2899
2900         btrfs_device_set_total_bytes(device, new_size);
2901         btrfs_device_set_disk_total_bytes(device, new_size);
2902         btrfs_clear_space_info_full(device->fs_info);
2903         if (list_empty(&device->post_commit_list))
2904                 list_add_tail(&device->post_commit_list,
2905                               &trans->transaction->dev_update_list);
2906         mutex_unlock(&fs_info->chunk_mutex);
2907
2908         btrfs_reserve_chunk_metadata(trans, false);
2909         ret = btrfs_update_device(trans, device);
2910         btrfs_trans_release_chunk_metadata(trans);
2911
2912         return ret;
2913 }
2914
2915 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
2916 {
2917         struct btrfs_fs_info *fs_info = trans->fs_info;
2918         struct btrfs_root *root = fs_info->chunk_root;
2919         int ret;
2920         struct btrfs_path *path;
2921         struct btrfs_key key;
2922
2923         path = btrfs_alloc_path();
2924         if (!path)
2925                 return -ENOMEM;
2926
2927         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2928         key.offset = chunk_offset;
2929         key.type = BTRFS_CHUNK_ITEM_KEY;
2930
2931         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2932         if (ret < 0)
2933                 goto out;
2934         else if (ret > 0) { /* Logic error or corruption */
2935                 btrfs_handle_fs_error(fs_info, -ENOENT,
2936                                       "Failed lookup while freeing chunk.");
2937                 ret = -ENOENT;
2938                 goto out;
2939         }
2940
2941         ret = btrfs_del_item(trans, root, path);
2942         if (ret < 0)
2943                 btrfs_handle_fs_error(fs_info, ret,
2944                                       "Failed to delete chunk item.");
2945 out:
2946         btrfs_free_path(path);
2947         return ret;
2948 }
2949
2950 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2951 {
2952         struct btrfs_super_block *super_copy = fs_info->super_copy;
2953         struct btrfs_disk_key *disk_key;
2954         struct btrfs_chunk *chunk;
2955         u8 *ptr;
2956         int ret = 0;
2957         u32 num_stripes;
2958         u32 array_size;
2959         u32 len = 0;
2960         u32 cur;
2961         struct btrfs_key key;
2962
2963         lockdep_assert_held(&fs_info->chunk_mutex);
2964         array_size = btrfs_super_sys_array_size(super_copy);
2965
2966         ptr = super_copy->sys_chunk_array;
2967         cur = 0;
2968
2969         while (cur < array_size) {
2970                 disk_key = (struct btrfs_disk_key *)ptr;
2971                 btrfs_disk_key_to_cpu(&key, disk_key);
2972
2973                 len = sizeof(*disk_key);
2974
2975                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
2976                         chunk = (struct btrfs_chunk *)(ptr + len);
2977                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
2978                         len += btrfs_chunk_item_size(num_stripes);
2979                 } else {
2980                         ret = -EIO;
2981                         break;
2982                 }
2983                 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
2984                     key.offset == chunk_offset) {
2985                         memmove(ptr, ptr + len, array_size - (cur + len));
2986                         array_size -= len;
2987                         btrfs_set_super_sys_array_size(super_copy, array_size);
2988                 } else {
2989                         ptr += len;
2990                         cur += len;
2991                 }
2992         }
2993         return ret;
2994 }
2995
2996 /*
2997  * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
2998  * @logical: Logical block offset in bytes.
2999  * @length: Length of extent in bytes.
3000  *
3001  * Return: Chunk mapping or ERR_PTR.
3002  */
3003 struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
3004                                        u64 logical, u64 length)
3005 {
3006         struct extent_map_tree *em_tree;
3007         struct extent_map *em;
3008
3009         em_tree = &fs_info->mapping_tree;
3010         read_lock(&em_tree->lock);
3011         em = lookup_extent_mapping(em_tree, logical, length);
3012         read_unlock(&em_tree->lock);
3013
3014         if (!em) {
3015                 btrfs_crit(fs_info, "unable to find logical %llu length %llu",
3016                            logical, length);
3017                 return ERR_PTR(-EINVAL);
3018         }
3019
3020         if (em->start > logical || em->start + em->len < logical) {
3021                 btrfs_crit(fs_info,
3022                            "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
3023                            logical, length, em->start, em->start + em->len);
3024                 free_extent_map(em);
3025                 return ERR_PTR(-EINVAL);
3026         }
3027
3028         /* callers are responsible for dropping em's ref. */
3029         return em;
3030 }
3031
3032 static int remove_chunk_item(struct btrfs_trans_handle *trans,
3033                              struct map_lookup *map, u64 chunk_offset)
3034 {
3035         int i;
3036
3037         /*
3038          * Removing chunk items and updating the device items in the chunks btree
3039          * requires holding the chunk_mutex.
3040          * See the comment at btrfs_chunk_alloc() for the details.
3041          */
3042         lockdep_assert_held(&trans->fs_info->chunk_mutex);
3043
3044         for (i = 0; i < map->num_stripes; i++) {
3045                 int ret;
3046
3047                 ret = btrfs_update_device(trans, map->stripes[i].dev);
3048                 if (ret)
3049                         return ret;
3050         }
3051
3052         return btrfs_free_chunk(trans, chunk_offset);
3053 }
3054
3055 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
3056 {
3057         struct btrfs_fs_info *fs_info = trans->fs_info;
3058         struct extent_map *em;
3059         struct map_lookup *map;
3060         u64 dev_extent_len = 0;
3061         int i, ret = 0;
3062         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
3063
3064         em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
3065         if (IS_ERR(em)) {
3066                 /*
3067                  * This is a logic error, but we don't want to just rely on the
3068                  * user having built with ASSERT enabled, so if ASSERT doesn't
3069                  * do anything we still error out.
3070                  */
3071                 ASSERT(0);
3072                 return PTR_ERR(em);
3073         }
3074         map = em->map_lookup;
3075
3076         /*
3077          * First delete the device extent items from the devices btree.
3078          * We take the device_list_mutex to avoid racing with the finishing phase
3079          * of a device replace operation. See the comment below before acquiring
3080          * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex
3081          * because that can result in a deadlock when deleting the device extent
3082          * items from the devices btree - COWing an extent buffer from the btree
3083          * may result in allocating a new metadata chunk, which would attempt to
3084          * lock again fs_info->chunk_mutex.
3085          */
3086         mutex_lock(&fs_devices->device_list_mutex);
3087         for (i = 0; i < map->num_stripes; i++) {
3088                 struct btrfs_device *device = map->stripes[i].dev;
3089                 ret = btrfs_free_dev_extent(trans, device,
3090                                             map->stripes[i].physical,
3091                                             &dev_extent_len);
3092                 if (ret) {
3093                         mutex_unlock(&fs_devices->device_list_mutex);
3094                         btrfs_abort_transaction(trans, ret);
3095                         goto out;
3096                 }
3097
3098                 if (device->bytes_used > 0) {
3099                         mutex_lock(&fs_info->chunk_mutex);
3100                         btrfs_device_set_bytes_used(device,
3101                                         device->bytes_used - dev_extent_len);
3102                         atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
3103                         btrfs_clear_space_info_full(fs_info);
3104                         mutex_unlock(&fs_info->chunk_mutex);
3105                 }
3106         }
3107         mutex_unlock(&fs_devices->device_list_mutex);
3108
3109         /*
3110          * We acquire fs_info->chunk_mutex for 2 reasons:
3111          *
3112          * 1) Just like with the first phase of the chunk allocation, we must
3113          *    reserve system space, do all chunk btree updates and deletions, and
3114          *    update the system chunk array in the superblock while holding this
3115          *    mutex. This is for similar reasons as explained on the comment at
3116          *    the top of btrfs_chunk_alloc();
3117          *
3118          * 2) Prevent races with the final phase of a device replace operation
3119          *    that replaces the device object associated with the map's stripes,
3120          *    because the device object's id can change at any time during that
3121          *    final phase of the device replace operation
3122          *    (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
3123          *    replaced device and then see it with an ID of
3124          *    BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating
3125          *    the device item, which does not exists on the chunk btree.
3126          *    The finishing phase of device replace acquires both the
3127          *    device_list_mutex and the chunk_mutex, in that order, so we are
3128          *    safe by just acquiring the chunk_mutex.
3129          */
3130         trans->removing_chunk = true;
3131         mutex_lock(&fs_info->chunk_mutex);
3132
3133         check_system_chunk(trans, map->type);
3134
3135         ret = remove_chunk_item(trans, map, chunk_offset);
3136         /*
3137          * Normally we should not get -ENOSPC since we reserved space before
3138          * through the call to check_system_chunk().
3139          *
3140          * Despite our system space_info having enough free space, we may not
3141          * be able to allocate extents from its block groups, because all have
3142          * an incompatible profile, which will force us to allocate a new system
3143          * block group with the right profile, or right after we called
3144          * check_system_space() above, a scrub turned the only system block group
3145          * with enough free space into RO mode.
3146          * This is explained with more detail at do_chunk_alloc().
3147          *
3148          * So if we get -ENOSPC, allocate a new system chunk and retry once.
3149          */
3150         if (ret == -ENOSPC) {
3151                 const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
3152                 struct btrfs_block_group *sys_bg;
3153
3154                 sys_bg = btrfs_create_chunk(trans, sys_flags);
3155                 if (IS_ERR(sys_bg)) {
3156                         ret = PTR_ERR(sys_bg);
3157                         btrfs_abort_transaction(trans, ret);
3158                         goto out;
3159                 }
3160
3161                 ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
3162                 if (ret) {
3163                         btrfs_abort_transaction(trans, ret);
3164                         goto out;
3165                 }
3166
3167                 ret = remove_chunk_item(trans, map, chunk_offset);
3168                 if (ret) {
3169                         btrfs_abort_transaction(trans, ret);
3170                         goto out;
3171                 }
3172         } else if (ret) {
3173                 btrfs_abort_transaction(trans, ret);
3174                 goto out;
3175         }
3176
3177         trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
3178
3179         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
3180                 ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
3181                 if (ret) {
3182                         btrfs_abort_transaction(trans, ret);
3183                         goto out;
3184                 }
3185         }
3186
3187         mutex_unlock(&fs_info->chunk_mutex);
3188         trans->removing_chunk = false;
3189
3190         /*
3191          * We are done with chunk btree updates and deletions, so release the
3192          * system space we previously reserved (with check_system_chunk()).
3193          */
3194         btrfs_trans_release_chunk_metadata(trans);
3195
3196         ret = btrfs_remove_block_group(trans, chunk_offset, em);
3197         if (ret) {
3198                 btrfs_abort_transaction(trans, ret);
3199                 goto out;
3200         }
3201
3202 out:
3203         if (trans->removing_chunk) {
3204                 mutex_unlock(&fs_info->chunk_mutex);
3205                 trans->removing_chunk = false;
3206         }
3207         /* once for us */
3208         free_extent_map(em);
3209         return ret;
3210 }
3211
3212 int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
3213 {
3214         struct btrfs_root *root = fs_info->chunk_root;
3215         struct btrfs_trans_handle *trans;
3216         struct btrfs_block_group *block_group;
3217         u64 length;
3218         int ret;
3219
3220         if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
3221                 btrfs_err(fs_info,
3222                           "relocate: not supported on extent tree v2 yet");
3223                 return -EINVAL;
3224         }
3225
3226         /*
3227          * Prevent races with automatic removal of unused block groups.
3228          * After we relocate and before we remove the chunk with offset
3229          * chunk_offset, automatic removal of the block group can kick in,
3230          * resulting in a failure when calling btrfs_remove_chunk() below.
3231          *
3232          * Make sure to acquire this mutex before doing a tree search (dev
3233          * or chunk trees) to find chunks. Otherwise the cleaner kthread might
3234          * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
3235          * we release the path used to search the chunk/dev tree and before
3236          * the current task acquires this mutex and calls us.
3237          */
3238         lockdep_assert_held(&fs_info->reclaim_bgs_lock);
3239
3240         /* step one, relocate all the extents inside this chunk */
3241         btrfs_scrub_pause(fs_info);
3242         ret = btrfs_relocate_block_group(fs_info, chunk_offset);
3243         btrfs_scrub_continue(fs_info);
3244         if (ret)
3245                 return ret;
3246
3247         block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
3248         if (!block_group)
3249                 return -ENOENT;
3250         btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
3251         length = block_group->length;
3252         btrfs_put_block_group(block_group);
3253
3254         /*
3255          * On a zoned file system, discard the whole block group, this will
3256          * trigger a REQ_OP_ZONE_RESET operation on the device zone. If
3257          * resetting the zone fails, don't treat it as a fatal problem from the
3258          * filesystem's point of view.
3259          */
3260         if (btrfs_is_zoned(fs_info)) {
3261                 ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL);
3262                 if (ret)
3263                         btrfs_info(fs_info,
3264                                 "failed to reset zone %llu after relocation",
3265                                 chunk_offset);
3266         }
3267
3268         trans = btrfs_start_trans_remove_block_group(root->fs_info,
3269                                                      chunk_offset);
3270         if (IS_ERR(trans)) {
3271                 ret = PTR_ERR(trans);
3272                 btrfs_handle_fs_error(root->fs_info, ret, NULL);
3273                 return ret;
3274         }
3275
3276         /*
3277          * step two, delete the device extents and the
3278          * chunk tree entries
3279          */
3280         ret = btrfs_remove_chunk(trans, chunk_offset);
3281         btrfs_end_transaction(trans);
3282         return ret;
3283 }
3284
3285 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
3286 {
3287         struct btrfs_root *chunk_root = fs_info->chunk_root;
3288         struct btrfs_path *path;
3289         struct extent_buffer *leaf;
3290         struct btrfs_chunk *chunk;
3291         struct btrfs_key key;
3292         struct btrfs_key found_key;
3293         u64 chunk_type;
3294         bool retried = false;
3295         int failed = 0;
3296         int ret;
3297
3298         path = btrfs_alloc_path();
3299         if (!path)
3300                 return -ENOMEM;
3301
3302 again:
3303         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3304         key.offset = (u64)-1;
3305         key.type = BTRFS_CHUNK_ITEM_KEY;
3306
3307         while (1) {
3308                 mutex_lock(&fs_info->reclaim_bgs_lock);
3309                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3310                 if (ret < 0) {
3311                         mutex_unlock(&fs_info->reclaim_bgs_lock);
3312                         goto error;
3313                 }
3314                 BUG_ON(ret == 0); /* Corruption */
3315
3316                 ret = btrfs_previous_item(chunk_root, path, key.objectid,
3317                                           key.type);
3318                 if (ret)
3319                         mutex_unlock(&fs_info->reclaim_bgs_lock);
3320                 if (ret < 0)
3321                         goto error;
3322                 if (ret > 0)
3323                         break;
3324
3325                 leaf = path->nodes[0];
3326                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3327
3328                 chunk = btrfs_item_ptr(leaf, path->slots[0],
3329                                        struct btrfs_chunk);
3330                 chunk_type = btrfs_chunk_type(leaf, chunk);
3331                 btrfs_release_path(path);
3332
3333                 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
3334                         ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3335                         if (ret == -ENOSPC)
3336                                 failed++;
3337                         else
3338                                 BUG_ON(ret);
3339                 }
3340                 mutex_unlock(&fs_info->reclaim_bgs_lock);
3341
3342                 if (found_key.offset == 0)
3343                         break;
3344                 key.offset = found_key.offset - 1;
3345         }
3346         ret = 0;
3347         if (failed && !retried) {
3348                 failed = 0;
3349                 retried = true;
3350                 goto again;
3351         } else if (WARN_ON(failed && retried)) {
3352                 ret = -ENOSPC;
3353         }
3354 error:
3355         btrfs_free_path(path);
3356         return ret;
3357 }
3358
3359 /*
3360  * return 1 : allocate a data chunk successfully,
3361  * return <0: errors during allocating a data chunk,
3362  * return 0 : no need to allocate a data chunk.
3363  */
3364 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
3365                                       u64 chunk_offset)
3366 {
3367         struct btrfs_block_group *cache;
3368         u64 bytes_used;
3369         u64 chunk_type;
3370
3371         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3372         ASSERT(cache);
3373         chunk_type = cache->flags;
3374         btrfs_put_block_group(cache);
3375
3376         if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA))
3377                 return 0;
3378
3379         spin_lock(&fs_info->data_sinfo->lock);
3380         bytes_used = fs_info->data_sinfo->bytes_used;
3381         spin_unlock(&fs_info->data_sinfo->lock);
3382
3383         if (!bytes_used) {
3384                 struct btrfs_trans_handle *trans;
3385                 int ret;
3386
3387                 trans = btrfs_join_transaction(fs_info->tree_root);
3388                 if (IS_ERR(trans))
3389                         return PTR_ERR(trans);
3390
3391                 ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
3392                 btrfs_end_transaction(trans);
3393                 if (ret < 0)
3394                         return ret;
3395                 return 1;
3396         }
3397
3398         return 0;
3399 }
3400
3401 static int insert_balance_item(struct btrfs_fs_info *fs_info,
3402                                struct btrfs_balance_control *bctl)
3403 {
3404         struct btrfs_root *root = fs_info->tree_root;
3405         struct btrfs_trans_handle *trans;
3406         struct btrfs_balance_item *item;
3407         struct btrfs_disk_balance_args disk_bargs;
3408         struct btrfs_path *path;
3409         struct extent_buffer *leaf;
3410         struct btrfs_key key;
3411         int ret, err;
3412
3413         path = btrfs_alloc_path();
3414         if (!path)
3415                 return -ENOMEM;
3416
3417         trans = btrfs_start_transaction(root, 0);
3418         if (IS_ERR(trans)) {
3419                 btrfs_free_path(path);
3420                 return PTR_ERR(trans);
3421         }
3422
3423         key.objectid = BTRFS_BALANCE_OBJECTID;
3424         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3425         key.offset = 0;
3426
3427         ret = btrfs_insert_empty_item(trans, root, path, &key,
3428                                       sizeof(*item));
3429         if (ret)
3430                 goto out;
3431
3432         leaf = path->nodes[0];
3433         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3434
3435         memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
3436
3437         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
3438         btrfs_set_balance_data(leaf, item, &disk_bargs);
3439         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
3440         btrfs_set_balance_meta(leaf, item, &disk_bargs);
3441         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
3442         btrfs_set_balance_sys(leaf, item, &disk_bargs);
3443
3444         btrfs_set_balance_flags(leaf, item, bctl->flags);
3445
3446         btrfs_mark_buffer_dirty(leaf);
3447 out:
3448         btrfs_free_path(path);
3449         err = btrfs_commit_transaction(trans);
3450         if (err && !ret)
3451                 ret = err;
3452         return ret;
3453 }
3454
3455 static int del_balance_item(struct btrfs_fs_info *fs_info)
3456 {
3457         struct btrfs_root *root = fs_info->tree_root;
3458         struct btrfs_trans_handle *trans;
3459         struct btrfs_path *path;
3460         struct btrfs_key key;
3461         int ret, err;
3462
3463         path = btrfs_alloc_path();
3464         if (!path)
3465                 return -ENOMEM;
3466
3467         trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
3468         if (IS_ERR(trans)) {
3469                 btrfs_free_path(path);
3470                 return PTR_ERR(trans);
3471         }
3472
3473         key.objectid = BTRFS_BALANCE_OBJECTID;
3474         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3475         key.offset = 0;
3476
3477         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3478         if (ret < 0)
3479                 goto out;
3480         if (ret > 0) {
3481                 ret = -ENOENT;
3482                 goto out;
3483         }
3484
3485         ret = btrfs_del_item(trans, root, path);
3486 out:
3487         btrfs_free_path(path);
3488         err = btrfs_commit_transaction(trans);
3489         if (err && !ret)
3490                 ret = err;
3491         return ret;
3492 }
3493
3494 /*
3495  * This is a heuristic used to reduce the number of chunks balanced on
3496  * resume after balance was interrupted.
3497  */
3498 static void update_balance_args(struct btrfs_balance_control *bctl)
3499 {
3500         /*
3501          * Turn on soft mode for chunk types that were being converted.
3502          */
3503         if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
3504                 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
3505         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
3506                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
3507         if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
3508                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
3509
3510         /*
3511          * Turn on usage filter if is not already used.  The idea is
3512          * that chunks that we have already balanced should be
3513          * reasonably full.  Don't do it for chunks that are being
3514          * converted - that will keep us from relocating unconverted
3515          * (albeit full) chunks.
3516          */
3517         if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3518             !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3519             !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3520                 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
3521                 bctl->data.usage = 90;
3522         }
3523         if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3524             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3525             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3526                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
3527                 bctl->sys.usage = 90;
3528         }
3529         if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3530             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3531             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3532                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
3533                 bctl->meta.usage = 90;
3534         }
3535 }
3536
3537 /*
3538  * Clear the balance status in fs_info and delete the balance item from disk.
3539  */
3540 static void reset_balance_state(struct btrfs_fs_info *fs_info)
3541 {
3542         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3543         int ret;
3544
3545         BUG_ON(!fs_info->balance_ctl);
3546
3547         spin_lock(&fs_info->balance_lock);
3548         fs_info->balance_ctl = NULL;
3549         spin_unlock(&fs_info->balance_lock);
3550
3551         kfree(bctl);
3552         ret = del_balance_item(fs_info);
3553         if (ret)
3554                 btrfs_handle_fs_error(fs_info, ret, NULL);
3555 }
3556
3557 /*
3558  * Balance filters.  Return 1 if chunk should be filtered out
3559  * (should not be balanced).
3560  */
3561 static int chunk_profiles_filter(u64 chunk_type,
3562                                  struct btrfs_balance_args *bargs)
3563 {
3564         chunk_type = chunk_to_extended(chunk_type) &
3565                                 BTRFS_EXTENDED_PROFILE_MASK;
3566
3567         if (bargs->profiles & chunk_type)
3568                 return 0;
3569
3570         return 1;
3571 }
3572
3573 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
3574                               struct btrfs_balance_args *bargs)
3575 {
3576         struct btrfs_block_group *cache;
3577         u64 chunk_used;
3578         u64 user_thresh_min;
3579         u64 user_thresh_max;
3580         int ret = 1;
3581
3582         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3583         chunk_used = cache->used;
3584
3585         if (bargs->usage_min == 0)
3586                 user_thresh_min = 0;
3587         else
3588                 user_thresh_min = div_factor_fine(cache->length,
3589                                                   bargs->usage_min);
3590
3591         if (bargs->usage_max == 0)
3592                 user_thresh_max = 1;
3593         else if (bargs->usage_max > 100)
3594                 user_thresh_max = cache->length;
3595         else
3596                 user_thresh_max = div_factor_fine(cache->length,
3597                                                   bargs->usage_max);
3598
3599         if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
3600                 ret = 0;
3601
3602         btrfs_put_block_group(cache);
3603         return ret;
3604 }
3605
3606 static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
3607                 u64 chunk_offset, struct btrfs_balance_args *bargs)
3608 {
3609         struct btrfs_block_group *cache;
3610         u64 chunk_used, user_thresh;
3611         int ret = 1;
3612
3613         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3614         chunk_used = cache->used;
3615
3616         if (bargs->usage_min == 0)
3617                 user_thresh = 1;
3618         else if (bargs->usage > 100)
3619                 user_thresh = cache->length;
3620         else
3621                 user_thresh = div_factor_fine(cache->length, bargs->usage);
3622
3623         if (chunk_used < user_thresh)
3624                 ret = 0;
3625
3626         btrfs_put_block_group(cache);
3627         return ret;
3628 }
3629
3630 static int chunk_devid_filter(struct extent_buffer *leaf,
3631                               struct btrfs_chunk *chunk,
3632                               struct btrfs_balance_args *bargs)
3633 {
3634         struct btrfs_stripe *stripe;
3635         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3636         int i;
3637
3638         for (i = 0; i < num_stripes; i++) {
3639                 stripe = btrfs_stripe_nr(chunk, i);
3640                 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
3641                         return 0;
3642         }
3643
3644         return 1;
3645 }
3646
3647 static u64 calc_data_stripes(u64 type, int num_stripes)
3648 {
3649         const int index = btrfs_bg_flags_to_raid_index(type);
3650         const int ncopies = btrfs_raid_array[index].ncopies;
3651         const int nparity = btrfs_raid_array[index].nparity;
3652
3653         return (num_stripes - nparity) / ncopies;
3654 }
3655
3656 /* [pstart, pend) */
3657 static int chunk_drange_filter(struct extent_buffer *leaf,
3658                                struct btrfs_chunk *chunk,
3659                                struct btrfs_balance_args *bargs)
3660 {
3661         struct btrfs_stripe *stripe;
3662         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3663         u64 stripe_offset;
3664         u64 stripe_length;
3665         u64 type;
3666         int factor;
3667         int i;
3668
3669         if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
3670                 return 0;
3671
3672         type = btrfs_chunk_type(leaf, chunk);
3673         factor = calc_data_stripes(type, num_stripes);
3674
3675         for (i = 0; i < num_stripes; i++) {
3676                 stripe = btrfs_stripe_nr(chunk, i);
3677                 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
3678                         continue;
3679
3680                 stripe_offset = btrfs_stripe_offset(leaf, stripe);
3681                 stripe_length = btrfs_chunk_length(leaf, chunk);
3682                 stripe_length = div_u64(stripe_length, factor);
3683
3684                 if (stripe_offset < bargs->pend &&
3685                     stripe_offset + stripe_length > bargs->pstart)
3686                         return 0;
3687         }
3688
3689         return 1;
3690 }
3691
3692 /* [vstart, vend) */
3693 static int chunk_vrange_filter(struct extent_buffer *leaf,
3694                                struct btrfs_chunk *chunk,
3695                                u64 chunk_offset,
3696                                struct btrfs_balance_args *bargs)
3697 {
3698         if (chunk_offset < bargs->vend &&
3699             chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
3700                 /* at least part of the chunk is inside this vrange */
3701                 return 0;
3702
3703         return 1;
3704 }
3705
3706 static int chunk_stripes_range_filter(struct extent_buffer *leaf,
3707                                struct btrfs_chunk *chunk,
3708                                struct btrfs_balance_args *bargs)
3709 {
3710         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3711
3712         if (bargs->stripes_min <= num_stripes
3713                         && num_stripes <= bargs->stripes_max)
3714                 return 0;
3715
3716         return 1;
3717 }
3718
3719 static int chunk_soft_convert_filter(u64 chunk_type,
3720                                      struct btrfs_balance_args *bargs)
3721 {
3722         if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3723                 return 0;
3724
3725         chunk_type = chunk_to_extended(chunk_type) &
3726                                 BTRFS_EXTENDED_PROFILE_MASK;
3727
3728         if (bargs->target == chunk_type)
3729                 return 1;
3730
3731         return 0;
3732 }
3733
3734 static int should_balance_chunk(struct extent_buffer *leaf,
3735                                 struct btrfs_chunk *chunk, u64 chunk_offset)
3736 {
3737         struct btrfs_fs_info *fs_info = leaf->fs_info;
3738         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3739         struct btrfs_balance_args *bargs = NULL;
3740         u64 chunk_type = btrfs_chunk_type(leaf, chunk);
3741
3742         /* type filter */
3743         if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
3744               (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
3745                 return 0;
3746         }
3747
3748         if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3749                 bargs = &bctl->data;
3750         else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3751                 bargs = &bctl->sys;
3752         else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3753                 bargs = &bctl->meta;
3754
3755         /* profiles filter */
3756         if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
3757             chunk_profiles_filter(chunk_type, bargs)) {
3758                 return 0;
3759         }
3760
3761         /* usage filter */
3762         if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
3763             chunk_usage_filter(fs_info, chunk_offset, bargs)) {
3764                 return 0;
3765         } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3766             chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
3767                 return 0;
3768         }
3769
3770         /* devid filter */
3771         if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
3772             chunk_devid_filter(leaf, chunk, bargs)) {
3773                 return 0;
3774         }
3775
3776         /* drange filter, makes sense only with devid filter */
3777         if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
3778             chunk_drange_filter(leaf, chunk, bargs)) {
3779                 return 0;
3780         }
3781
3782         /* vrange filter */
3783         if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
3784             chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
3785                 return 0;
3786         }
3787
3788         /* stripes filter */
3789         if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
3790             chunk_stripes_range_filter(leaf, chunk, bargs)) {
3791                 return 0;
3792         }
3793
3794         /* soft profile changing mode */
3795         if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
3796             chunk_soft_convert_filter(chunk_type, bargs)) {
3797                 return 0;
3798         }
3799
3800         /*
3801          * limited by count, must be the last filter
3802          */
3803         if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
3804                 if (bargs->limit == 0)
3805                         return 0;
3806                 else
3807                         bargs->limit--;
3808         } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
3809                 /*
3810                  * Same logic as the 'limit' filter; the minimum cannot be
3811                  * determined here because we do not have the global information
3812                  * about the count of all chunks that satisfy the filters.
3813                  */
3814                 if (bargs->limit_max == 0)
3815                         return 0;
3816                 else
3817                         bargs->limit_max--;
3818         }
3819
3820         return 1;
3821 }
3822
3823 static int __btrfs_balance(struct btrfs_fs_info *fs_info)
3824 {
3825         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3826         struct btrfs_root *chunk_root = fs_info->chunk_root;
3827         u64 chunk_type;
3828         struct btrfs_chunk *chunk;
3829         struct btrfs_path *path = NULL;
3830         struct btrfs_key key;
3831         struct btrfs_key found_key;
3832         struct extent_buffer *leaf;
3833         int slot;
3834         int ret;
3835         int enospc_errors = 0;
3836         bool counting = true;
3837         /* The single value limit and min/max limits use the same bytes in the */
3838         u64 limit_data = bctl->data.limit;
3839         u64 limit_meta = bctl->meta.limit;
3840         u64 limit_sys = bctl->sys.limit;
3841         u32 count_data = 0;
3842         u32 count_meta = 0;
3843         u32 count_sys = 0;
3844         int chunk_reserved = 0;
3845
3846         path = btrfs_alloc_path();
3847         if (!path) {
3848                 ret = -ENOMEM;
3849                 goto error;
3850         }
3851
3852         /* zero out stat counters */
3853         spin_lock(&fs_info->balance_lock);
3854         memset(&bctl->stat, 0, sizeof(bctl->stat));
3855         spin_unlock(&fs_info->balance_lock);
3856 again:
3857         if (!counting) {
3858                 /*
3859                  * The single value limit and min/max limits use the same bytes
3860                  * in the
3861                  */
3862                 bctl->data.limit = limit_data;
3863                 bctl->meta.limit = limit_meta;
3864                 bctl->sys.limit = limit_sys;
3865         }
3866         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3867         key.offset = (u64)-1;
3868         key.type = BTRFS_CHUNK_ITEM_KEY;
3869
3870         while (1) {
3871                 if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
3872                     atomic_read(&fs_info->balance_cancel_req)) {
3873                         ret = -ECANCELED;
3874                         goto error;
3875                 }
3876
3877                 mutex_lock(&fs_info->reclaim_bgs_lock);
3878                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3879                 if (ret < 0) {
3880                         mutex_unlock(&fs_info->reclaim_bgs_lock);
3881                         goto error;
3882                 }
3883
3884                 /*
3885                  * this shouldn't happen, it means the last relocate
3886                  * failed
3887                  */
3888                 if (ret == 0)
3889                         BUG(); /* FIXME break ? */
3890
3891                 ret = btrfs_previous_item(chunk_root, path, 0,
3892                                           BTRFS_CHUNK_ITEM_KEY);
3893                 if (ret) {
3894                         mutex_unlock(&fs_info->reclaim_bgs_lock);
3895                         ret = 0;
3896                         break;
3897                 }
3898
3899                 leaf = path->nodes[0];
3900                 slot = path->slots[0];
3901                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3902
3903                 if (found_key.objectid != key.objectid) {
3904                         mutex_unlock(&fs_info->reclaim_bgs_lock);
3905                         break;
3906                 }
3907
3908                 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3909                 chunk_type = btrfs_chunk_type(leaf, chunk);
3910
3911                 if (!counting) {
3912                         spin_lock(&fs_info->balance_lock);
3913                         bctl->stat.considered++;
3914                         spin_unlock(&fs_info->balance_lock);
3915                 }
3916
3917                 ret = should_balance_chunk(leaf, chunk, found_key.offset);
3918
3919                 btrfs_release_path(path);
3920                 if (!ret) {
3921                         mutex_unlock(&fs_info->reclaim_bgs_lock);
3922                         goto loop;
3923                 }
3924
3925                 if (counting) {
3926                         mutex_unlock(&fs_info->reclaim_bgs_lock);
3927                         spin_lock(&fs_info->balance_lock);
3928                         bctl->stat.expected++;
3929                         spin_unlock(&fs_info->balance_lock);
3930
3931                         if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3932                                 count_data++;
3933                         else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3934                                 count_sys++;
3935                         else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3936                                 count_meta++;
3937
3938                         goto loop;
3939                 }
3940
3941                 /*
3942                  * Apply limit_min filter, no need to check if the LIMITS
3943                  * filter is used, limit_min is 0 by default
3944                  */
3945                 if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
3946                                         count_data < bctl->data.limit_min)
3947                                 || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
3948                                         count_meta < bctl->meta.limit_min)
3949                                 || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
3950                                         count_sys < bctl->sys.limit_min)) {
3951                         mutex_unlock(&fs_info->reclaim_bgs_lock);
3952                         goto loop;
3953                 }
3954
3955                 if (!chunk_reserved) {
3956                         /*
3957                          * We may be relocating the only data chunk we have,
3958                          * which could potentially end up with losing data's
3959                          * raid profile, so lets allocate an empty one in
3960                          * advance.
3961                          */
3962                         ret = btrfs_may_alloc_data_chunk(fs_info,
3963                                                          found_key.offset);
3964                         if (ret < 0) {
3965                                 mutex_unlock(&fs_info->reclaim_bgs_lock);
3966                                 goto error;
3967                         } else if (ret == 1) {
3968                                 chunk_reserved = 1;
3969                         }
3970                 }
3971
3972                 ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3973                 mutex_unlock(&fs_info->reclaim_bgs_lock);
3974                 if (ret == -ENOSPC) {
3975                         enospc_errors++;
3976                 } else if (ret == -ETXTBSY) {
3977                         btrfs_info(fs_info,
3978            "skipping relocation of block group %llu due to active swapfile",
3979                                    found_key.offset);
3980                         ret = 0;
3981                 } else if (ret) {
3982                         goto error;
3983                 } else {
3984                         spin_lock(&fs_info->balance_lock);
3985                         bctl->stat.completed++;
3986                         spin_unlock(&fs_info->balance_lock);
3987                 }
3988 loop:
3989                 if (found_key.offset == 0)
3990                         break;
3991                 key.offset = found_key.offset - 1;
3992         }
3993
3994         if (counting) {
3995                 btrfs_release_path(path);
3996                 counting = false;
3997                 goto again;
3998         }
3999 error:
4000         btrfs_free_path(path);
4001         if (enospc_errors) {
4002                 btrfs_info(fs_info, "%d enospc errors during balance",
4003                            enospc_errors);
4004                 if (!ret)
4005                         ret = -ENOSPC;
4006         }
4007
4008         return ret;
4009 }
4010
4011 /**
4012  * alloc_profile_is_valid - see if a given profile is valid and reduced
4013  * @flags: profile to validate
4014  * @extended: if true @flags is treated as an extended profile
4015  */
4016 static int alloc_profile_is_valid(u64 flags, int extended)
4017 {
4018         u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
4019                                BTRFS_BLOCK_GROUP_PROFILE_MASK);
4020
4021         flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
4022
4023         /* 1) check that all other bits are zeroed */
4024         if (flags & ~mask)
4025                 return 0;
4026
4027         /* 2) see if profile is reduced */
4028         if (flags == 0)
4029                 return !extended; /* "0" is valid for usual profiles */
4030
4031         return has_single_bit_set(flags);
4032 }
4033
4034 static inline int balance_need_close(struct btrfs_fs_info *fs_info)
4035 {
4036         /* cancel requested || normal exit path */
4037         return atomic_read(&fs_info->balance_cancel_req) ||
4038                 (atomic_read(&fs_info->balance_pause_req) == 0 &&
4039                  atomic_read(&fs_info->balance_cancel_req) == 0);
4040 }
4041
4042 /*
4043  * Validate target profile against allowed profiles and return true if it's OK.
4044  * Otherwise print the error message and return false.
4045  */
4046 static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
4047                 const struct btrfs_balance_args *bargs,
4048                 u64 allowed, const char *type)
4049 {
4050         if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
4051                 return true;
4052
4053         /* Profile is valid and does not have bits outside of the allowed set */
4054         if (alloc_profile_is_valid(bargs->target, 1) &&
4055             (bargs->target & ~allowed) == 0)
4056                 return true;
4057
4058         btrfs_err(fs_info, "balance: invalid convert %s profile %s",
4059                         type, btrfs_bg_type_to_raid_name(bargs->target));
4060         return false;
4061 }
4062
4063 /*
4064  * Fill @buf with textual description of balance filter flags @bargs, up to
4065  * @size_buf including the terminating null. The output may be trimmed if it
4066  * does not fit into the provided buffer.
4067  */
4068 static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
4069                                  u32 size_buf)
4070 {
4071         int ret;
4072         u32 size_bp = size_buf;
4073         char *bp = buf;
4074         u64 flags = bargs->flags;
4075         char tmp_buf[128] = {'\0'};
4076
4077         if (!flags)
4078                 return;
4079
4080 #define CHECK_APPEND_NOARG(a)                                           \
4081         do {                                                            \
4082                 ret = snprintf(bp, size_bp, (a));                       \
4083                 if (ret < 0 || ret >= size_bp)                          \
4084                         goto out_overflow;                              \
4085                 size_bp -= ret;                                         \
4086                 bp += ret;                                              \
4087         } while (0)
4088
4089 #define CHECK_APPEND_1ARG(a, v1)                                        \
4090         do {                                                            \
4091                 ret = snprintf(bp, size_bp, (a), (v1));                 \
4092                 if (ret < 0 || ret >= size_bp)                          \
4093                         goto out_overflow;                              \
4094                 size_bp -= ret;                                         \
4095                 bp += ret;                                              \
4096         } while (0)
4097
4098 #define CHECK_APPEND_2ARG(a, v1, v2)                                    \
4099         do {                                                            \
4100                 ret = snprintf(bp, size_bp, (a), (v1), (v2));           \
4101                 if (ret < 0 || ret >= size_bp)                          \
4102                         goto out_overflow;                              \
4103                 size_bp -= ret;                                         \
4104                 bp += ret;                                              \
4105         } while (0)
4106
4107         if (flags & BTRFS_BALANCE_ARGS_CONVERT)
4108                 CHECK_APPEND_1ARG("convert=%s,",
4109                                   btrfs_bg_type_to_raid_name(bargs->target));
4110
4111         if (flags & BTRFS_BALANCE_ARGS_SOFT)
4112                 CHECK_APPEND_NOARG("soft,");
4113
4114         if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
4115                 btrfs_describe_block_groups(bargs->profiles, tmp_buf,
4116                                             sizeof(tmp_buf));
4117                 CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
4118         }
4119
4120         if (flags & BTRFS_BALANCE_ARGS_USAGE)
4121                 CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);
4122
4123         if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
4124                 CHECK_APPEND_2ARG("usage=%u..%u,",
4125                                   bargs->usage_min, bargs->usage_max);
4126
4127         if (flags & BTRFS_BALANCE_ARGS_DEVID)
4128                 CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);
4129
4130         if (flags & BTRFS_BALANCE_ARGS_DRANGE)
4131                 CHECK_APPEND_2ARG("drange=%llu..%llu,",
4132                                   bargs->pstart, bargs->pend);
4133
4134         if (flags & BTRFS_BALANCE_ARGS_VRANGE)
4135                 CHECK_APPEND_2ARG("vrange=%llu..%llu,",
4136                                   bargs->vstart, bargs->vend);
4137
4138         if (flags & BTRFS_BALANCE_ARGS_LIMIT)
4139                 CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);
4140
4141         if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
4142                 CHECK_APPEND_2ARG("limit=%u..%u,",
4143                                 bargs->limit_min, bargs->limit_max);
4144
4145         if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
4146                 CHECK_APPEND_2ARG("stripes=%u..%u,",
4147                                   bargs->stripes_min, bargs->stripes_max);
4148
4149 #undef CHECK_APPEND_2ARG
4150 #undef CHECK_APPEND_1ARG
4151 #undef CHECK_APPEND_NOARG
4152
4153 out_overflow:
4154
4155         if (size_bp < size_buf)
4156                 buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
4157         else
4158                 buf[0] = '\0';
4159 }
4160
4161 static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
4162 {
4163         u32 size_buf = 1024;
4164         char tmp_buf[192] = {'\0'};
4165         char *buf;
4166         char *bp;
4167         u32 size_bp = size_buf;
4168         int ret;
4169         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
4170
4171         buf = kzalloc(size_buf, GFP_KERNEL);
4172         if (!buf)
4173                 return;
4174
4175         bp = buf;
4176
4177 #define CHECK_APPEND_1ARG(a, v1)                                        \
4178         do {                                                            \
4179                 ret = snprintf(bp, size_bp, (a), (v1));                 \
4180                 if (ret < 0 || ret >= size_bp)                          \
4181                         goto out_overflow;                              \
4182                 size_bp -= ret;                                         \
4183                 bp += ret;                                              \
4184         } while (0)
4185
4186         if (bctl->flags & BTRFS_BALANCE_FORCE)
4187                 CHECK_APPEND_1ARG("%s", "-f ");
4188
4189         if (bctl->flags & BTRFS_BALANCE_DATA) {
4190                 describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
4191                 CHECK_APPEND_1ARG("-d%s ", tmp_buf);
4192         }
4193
4194         if (bctl->flags & BTRFS_BALANCE_METADATA) {
4195                 describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
4196                 CHECK_APPEND_1ARG("-m%s ", tmp_buf);
4197         }
4198
4199         if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
4200                 describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
4201                 CHECK_APPEND_1ARG("-s%s ", tmp_buf);
4202         }
4203
4204 #undef CHECK_APPEND_1ARG
4205
4206 out_overflow:
4207
4208         if (size_bp < size_buf)
4209                 buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
4210         btrfs_info(fs_info, "balance: %s %s",
4211                    (bctl->flags & BTRFS_BALANCE_RESUME) ?
4212                    "resume" : "start", buf);
4213
4214         kfree(buf);
4215 }
4216
4217 /*
4218  * Should be called with balance mutexe held
4219  */
4220 int btrfs_balance(struct btrfs_fs_info *fs_info,
4221                   struct btrfs_balance_control *bctl,
4222                   struct btrfs_ioctl_balance_args *bargs)
4223 {
4224         u64 meta_target, data_target;
4225         u64 allowed;
4226         int mixed = 0;
4227         int ret;
4228         u64 num_devices;
4229         unsigned seq;
4230         bool reducing_redundancy;
4231         int i;
4232
4233         if (btrfs_fs_closing(fs_info) ||
4234             atomic_read(&fs_info->balance_pause_req) ||
4235             btrfs_should_cancel_balance(fs_info)) {
4236                 ret = -EINVAL;
4237                 goto out;
4238         }
4239
4240         allowed = btrfs_super_incompat_flags(fs_info->super_copy);
4241         if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
4242                 mixed = 1;
4243
4244         /*
4245          * In case of mixed groups both data and meta should be picked,
4246          * and identical options should be given for both of them.
4247          */
4248         allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
4249         if (mixed && (bctl->flags & allowed)) {
4250                 if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
4251                     !(bctl->flags & BTRFS_BALANCE_METADATA) ||
4252                     memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
4253                         btrfs_err(fs_info,
4254           "balance: mixed groups data and metadata options must be the same");
4255                         ret = -EINVAL;
4256                         goto out;
4257                 }
4258         }
4259
4260         /*
4261          * rw_devices will not change at the moment, device add/delete/replace
4262          * are exclusive
4263          */
4264         num_devices = fs_info->fs_devices->rw_devices;
4265
4266         /*
4267          * SINGLE profile on-disk has no profile bit, but in-memory we have a
4268          * special bit for it, to make it easier to distinguish.  Thus we need
4269          * to set it manually, or balance would refuse the profile.
4270          */
4271         allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
4272         for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
4273                 if (num_devices >= btrfs_raid_array[i].devs_min)
4274                         allowed |= btrfs_raid_array[i].bg_flag;
4275
4276         if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") ||
4277             !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") ||
4278             !validate_convert_profile(fs_info, &bctl->sys,  allowed, "system")) {
4279                 ret = -EINVAL;
4280                 goto out;
4281         }
4282
4283         /*
4284          * Allow to reduce metadata or system integrity only if force set for
4285          * profiles with redundancy (copies, parity)
4286          */
4287         allowed = 0;
4288         for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
4289                 if (btrfs_raid_array[i].ncopies >= 2 ||
4290                     btrfs_raid_array[i].tolerated_failures >= 1)
4291                         allowed |= btrfs_raid_array[i].bg_flag;
4292         }
4293         do {
4294                 seq = read_seqbegin(&fs_info->profiles_lock);
4295
4296                 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4297                      (fs_info->avail_system_alloc_bits & allowed) &&
4298                      !(bctl->sys.target & allowed)) ||
4299                     ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4300                      (fs_info->avail_metadata_alloc_bits & allowed) &&
4301                      !(bctl->meta.target & allowed)))
4302                         reducing_redundancy = true;
4303                 else
4304                         reducing_redundancy = false;
4305
4306                 /* if we're not converting, the target field is uninitialized */
4307                 meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
4308                         bctl->meta.target : fs_info->avail_metadata_alloc_bits;
4309                 data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
4310                         bctl->data.target : fs_info->avail_data_alloc_bits;
4311         } while (read_seqretry(&fs_info->profiles_lock, seq));
4312
4313         if (reducing_redundancy) {
4314                 if (bctl->flags & BTRFS_BALANCE_FORCE) {
4315                         btrfs_info(fs_info,
4316                            "balance: force reducing metadata redundancy");
4317                 } else {
4318                         btrfs_err(fs_info,
4319         "balance: reduces metadata redundancy, use --force if you want this");
4320                         ret = -EINVAL;
4321                         goto out;
4322                 }
4323         }
4324
4325         if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
4326                 btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
4327                 btrfs_warn(fs_info,
4328         "balance: metadata profile %s has lower redundancy than data profile %s",
4329                                 btrfs_bg_type_to_raid_name(meta_target),
4330                                 btrfs_bg_type_to_raid_name(data_target));
4331         }
4332
4333         ret = insert_balance_item(fs_info, bctl);
4334         if (ret && ret != -EEXIST)
4335                 goto out;
4336
4337         if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
4338                 BUG_ON(ret == -EEXIST);
4339                 BUG_ON(fs_info->balance_ctl);
4340                 spin_lock(&fs_info->balance_lock);
4341                 fs_info->balance_ctl = bctl;
4342                 spin_unlock(&fs_info->balance_lock);
4343         } else {
4344                 BUG_ON(ret != -EEXIST);
4345                 spin_lock(&fs_info->balance_lock);
4346                 update_balance_args(bctl);
4347                 spin_unlock(&fs_info->balance_lock);
4348         }
4349
4350         ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4351         set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
4352         describe_balance_start_or_resume(fs_info);
4353         mutex_unlock(&fs_info->balance_mutex);
4354
4355         ret = __btrfs_balance(fs_info);
4356
4357         mutex_lock(&fs_info->balance_mutex);
4358         if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) {
4359                 btrfs_info(fs_info, "balance: paused");
4360                 btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED);
4361         }
4362         /*
4363          * Balance can be canceled by:
4364          *
4365          * - Regular cancel request
4366          *   Then ret == -ECANCELED and balance_cancel_req > 0
4367          *
4368          * - Fatal signal to "btrfs" process
4369          *   Either the signal caught by wait_reserve_ticket() and callers
4370          *   got -EINTR, or caught by btrfs_should_cancel_balance() and
4371          *   got -ECANCELED.
4372          *   Either way, in this case balance_cancel_req = 0, and
4373          *   ret == -EINTR or ret == -ECANCELED.
4374          *
4375          * So here we only check the return value to catch canceled balance.
4376          */
4377         else if (ret == -ECANCELED || ret == -EINTR)
4378                 btrfs_info(fs_info, "balance: canceled");
4379         else
4380                 btrfs_info(fs_info, "balance: ended with status: %d", ret);
4381
4382         clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
4383
4384         if (bargs) {
4385                 memset(bargs, 0, sizeof(*bargs));
4386                 btrfs_update_ioctl_balance_args(fs_info, bargs);
4387         }
4388
4389         if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
4390             balance_need_close(fs_info)) {
4391                 reset_balance_state(fs_info);
4392                 btrfs_exclop_finish(fs_info);
4393         }
4394
4395         wake_up(&fs_info->balance_wait_q);
4396
4397         return ret;
4398 out:
4399         if (bctl->flags & BTRFS_BALANCE_RESUME)
4400                 reset_balance_state(fs_info);
4401         else
4402                 kfree(bctl);
4403         btrfs_exclop_finish(fs_info);
4404
4405         return ret;
4406 }
4407
4408 static int balance_kthread(void *data)
4409 {
4410         struct btrfs_fs_info *fs_info = data;
4411         int ret = 0;
4412
4413         sb_start_write(fs_info->sb);
4414         mutex_lock(&fs_info->balance_mutex);
4415         if (fs_info->balance_ctl)
4416                 ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
4417         mutex_unlock(&fs_info->balance_mutex);
4418         sb_end_write(fs_info->sb);
4419
4420         return ret;
4421 }
4422
4423 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
4424 {
4425         struct task_struct *tsk;
4426
4427         mutex_lock(&fs_info->balance_mutex);
4428         if (!fs_info->balance_ctl) {
4429                 mutex_unlock(&fs_info->balance_mutex);
4430                 return 0;
4431         }
4432         mutex_unlock(&fs_info->balance_mutex);
4433
4434         if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
4435                 btrfs_info(fs_info, "balance: resume skipped");
4436                 return 0;
4437         }
4438
4439         spin_lock(&fs_info->super_lock);
4440         ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
4441         fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE;
4442         spin_unlock(&fs_info->super_lock);
4443         /*
4444          * A ro->rw remount sequence should continue with the paused balance
4445          * regardless of who pauses it, system or the user as of now, so set
4446          * the resume flag.
4447          */
4448         spin_lock(&fs_info->balance_lock);
4449         fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
4450         spin_unlock(&fs_info->balance_lock);
4451
4452         tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
4453         return PTR_ERR_OR_ZERO(tsk);
4454 }
4455
4456 int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
4457 {
4458         struct btrfs_balance_control *bctl;
4459         struct btrfs_balance_item *item;
4460         struct btrfs_disk_balance_args disk_bargs;
4461         struct btrfs_path *path;
4462         struct extent_buffer *leaf;
4463         struct btrfs_key key;
4464         int ret;
4465
4466         path = btrfs_alloc_path();
4467         if (!path)
4468                 return -ENOMEM;
4469
4470         key.objectid = BTRFS_BALANCE_OBJECTID;
4471         key.type = BTRFS_TEMPORARY_ITEM_KEY;
4472         key.offset = 0;
4473
4474         ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
4475         if (ret < 0)
4476                 goto out;
4477         if (ret > 0) { /* ret = -ENOENT; */
4478                 ret = 0;
4479                 goto out;
4480         }
4481
4482         bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
4483         if (!bctl) {
4484                 ret = -ENOMEM;
4485                 goto out;
4486         }
4487
4488         leaf = path->nodes[0];
4489         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
4490
4491         bctl->flags = btrfs_balance_flags(leaf, item);
4492         bctl->flags |= BTRFS_BALANCE_RESUME;
4493
4494         btrfs_balance_data(leaf, item, &disk_bargs);
4495         btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
4496         btrfs_balance_meta(leaf, item, &disk_bargs);
4497         btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
4498         btrfs_balance_sys(leaf, item, &disk_bargs);
4499         btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
4500
4501         /*
4502          * This should never happen, as the paused balance state is recovered
4503          * during mount without any chance of other exclusive ops to collide.
4504          *
4505          * This gives the exclusive op status to balance and keeps in paused
4506          * state until user intervention (cancel or umount). If the ownership
4507          * cannot be assigned, show a message but do not fail. The balance
4508          * is in a paused state and must have fs_info::balance_ctl properly
4509          * set up.
4510          */
4511         if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED))
4512                 btrfs_warn(fs_info,
4513         "balance: cannot set exclusive op status, resume manually");
4514
4515         btrfs_release_path(path);
4516
4517         mutex_lock(&fs_info->balance_mutex);
4518         BUG_ON(fs_info->balance_ctl);
4519         spin_lock(&fs_info->balance_lock);
4520         fs_info->balance_ctl = bctl;
4521         spin_unlock(&fs_info->balance_lock);
4522         mutex_unlock(&fs_info->balance_mutex);
4523 out:
4524         btrfs_free_path(path);
4525         return ret;
4526 }
4527
4528 int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
4529 {
4530         int ret = 0;
4531
4532         mutex_lock(&fs_info->balance_mutex);
4533         if (!fs_info->balance_ctl) {
4534                 mutex_unlock(&fs_info->balance_mutex);
4535                 return -ENOTCONN;
4536         }
4537
4538         if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4539                 atomic_inc(&fs_info->balance_pause_req);
4540                 mutex_unlock(&fs_info->balance_mutex);
4541
4542                 wait_event(fs_info->balance_wait_q,
4543                            !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4544
4545                 mutex_lock(&fs_info->balance_mutex);
4546                 /* we are good with balance_ctl ripped off from under us */
4547                 BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4548                 atomic_dec(&fs_info->balance_pause_req);
4549         } else {
4550                 ret = -ENOTCONN;
4551         }
4552
4553         mutex_unlock(&fs_info->balance_mutex);
4554         return ret;
4555 }
4556
4557 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
4558 {
4559         mutex_lock(&fs_info->balance_mutex);
4560         if (!fs_info->balance_ctl) {
4561                 mutex_unlock(&fs_info->balance_mutex);
4562                 return -ENOTCONN;
4563         }
4564
4565         /*
4566          * A paused balance with the item stored on disk can be resumed at
4567          * mount time if the mount is read-write. Otherwise it's still paused
4568          * and we must not allow cancelling as it deletes the item.
4569          */
4570         if (sb_rdonly(fs_info->sb)) {
4571                 mutex_unlock(&fs_info->balance_mutex);
4572                 return -EROFS;
4573         }
4574
4575         atomic_inc(&fs_info->balance_cancel_req);
4576         /*
4577          * if we are running just wait and return, balance item is
4578          * deleted in btrfs_balance in this case
4579          */
4580         if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4581                 mutex_unlock(&fs_info->balance_mutex);
4582                 wait_event(fs_info->balance_wait_q,
4583                            !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4584                 mutex_lock(&fs_info->balance_mutex);
4585         } else {
4586                 mutex_unlock(&fs_info->balance_mutex);
4587                 /*
4588                  * Lock released to allow other waiters to continue, we'll
4589                  * reexamine the status again.
4590                  */
4591                 mutex_lock(&fs_info->balance_mutex);
4592
4593                 if (fs_info->balance_ctl) {
4594                         reset_balance_state(fs_info);
4595                         btrfs_exclop_finish(fs_info);
4596                         btrfs_info(fs_info, "balance: canceled");
4597                 }
4598         }
4599
4600         BUG_ON(fs_info->balance_ctl ||
4601                 test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4602         atomic_dec(&fs_info->balance_cancel_req);
4603         mutex_unlock(&fs_info->balance_mutex);
4604         return 0;
4605 }
4606
4607 int btrfs_uuid_scan_kthread(void *data)
4608 {
4609         struct btrfs_fs_info *fs_info = data;
4610         struct btrfs_root *root = fs_info->tree_root;
4611         struct btrfs_key key;
4612         struct btrfs_path *path = NULL;
4613         int ret = 0;
4614         struct extent_buffer *eb;
4615         int slot;
4616         struct btrfs_root_item root_item;
4617         u32 item_size;
4618         struct btrfs_trans_handle *trans = NULL;
4619         bool closing = false;
4620
4621         path = btrfs_alloc_path();
4622         if (!path) {
4623                 ret = -ENOMEM;
4624                 goto out;
4625         }
4626
4627         key.objectid = 0;
4628         key.type = BTRFS_ROOT_ITEM_KEY;
4629         key.offset = 0;
4630
4631         while (1) {
4632                 if (btrfs_fs_closing(fs_info)) {
4633                         closing = true;
4634                         break;
4635                 }
4636                 ret = btrfs_search_forward(root, &key, path,
4637                                 BTRFS_OLDEST_GENERATION);
4638                 if (ret) {
4639                         if (ret > 0)
4640                                 ret = 0;
4641                         break;
4642                 }
4643
4644                 if (key.type != BTRFS_ROOT_ITEM_KEY ||
4645                     (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
4646                      key.objectid != BTRFS_FS_TREE_OBJECTID) ||
4647                     key.objectid > BTRFS_LAST_FREE_OBJECTID)
4648                         goto skip;
4649
4650                 eb = path->nodes[0];
4651                 slot = path->slots[0];
4652                 item_size = btrfs_item_size(eb, slot);
4653                 if (item_size < sizeof(root_item))
4654                         goto skip;
4655
4656                 read_extent_buffer(eb, &root_item,
4657                                    btrfs_item_ptr_offset(eb, slot),
4658                                    (int)sizeof(root_item));
4659                 if (btrfs_root_refs(&root_item) == 0)
4660                         goto skip;
4661
4662                 if (!btrfs_is_empty_uuid(root_item.uuid) ||
4663                     !btrfs_is_empty_uuid(root_item.received_uuid)) {
4664                         if (trans)
4665                                 goto update_tree;
4666
4667                         btrfs_release_path(path);
4668                         /*
4669                          * 1 - subvol uuid item
4670                          * 1 - received_subvol uuid item
4671                          */
4672                         trans = btrfs_start_transaction(fs_info->uuid_root, 2);
4673                         if (IS_ERR(trans)) {
4674                                 ret = PTR_ERR(trans);
4675                                 break;
4676                         }
4677                         continue;
4678                 } else {
4679                         goto skip;
4680                 }
4681 update_tree:
4682                 btrfs_release_path(path);
4683                 if (!btrfs_is_empty_uuid(root_item.uuid)) {
4684                         ret = btrfs_uuid_tree_add(trans, root_item.uuid,
4685                                                   BTRFS_UUID_KEY_SUBVOL,
4686                                                   key.objectid);
4687                         if (ret < 0) {
4688                                 btrfs_warn(fs_info, "uuid_tree_add failed %d",
4689                                         ret);
4690                                 break;
4691                         }
4692                 }
4693
4694                 if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
4695                         ret = btrfs_uuid_tree_add(trans,
4696                                                   root_item.received_uuid,
4697                                                  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4698                                                   key.objectid);
4699                         if (ret < 0) {
4700                                 btrfs_warn(fs_info, "uuid_tree_add failed %d",
4701                                         ret);
4702                                 break;
4703                         }
4704                 }
4705
4706 skip:
4707                 btrfs_release_path(path);
4708                 if (trans) {
4709                         ret = btrfs_end_transaction(trans);
4710                         trans = NULL;
4711                         if (ret)
4712                                 break;
4713                 }
4714
4715                 if (key.offset < (u64)-1) {
4716                         key.offset++;
4717                 } else if (key.type < BTRFS_ROOT_ITEM_KEY) {
4718                         key.offset = 0;
4719                         key.type = BTRFS_ROOT_ITEM_KEY;
4720                 } else if (key.objectid < (u64)-1) {
4721                         key.offset = 0;
4722                         key.type = BTRFS_ROOT_ITEM_KEY;
4723                         key.objectid++;
4724                 } else {
4725                         break;
4726                 }
4727                 cond_resched();
4728         }
4729
4730 out:
4731         btrfs_free_path(path);
4732         if (trans && !IS_ERR(trans))
4733                 btrfs_end_transaction(trans);
4734         if (ret)
4735                 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
4736         else if (!closing)
4737                 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
4738         up(&fs_info->uuid_tree_rescan_sem);
4739         return 0;
4740 }
4741
4742 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
4743 {
4744         struct btrfs_trans_handle *trans;
4745         struct btrfs_root *tree_root = fs_info->tree_root;
4746         struct btrfs_root *uuid_root;
4747         struct task_struct *task;
4748         int ret;
4749
4750         /*
4751          * 1 - root node
4752          * 1 - root item
4753          */
4754         trans = btrfs_start_transaction(tree_root, 2);
4755         if (IS_ERR(trans))
4756                 return PTR_ERR(trans);
4757
4758         uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
4759         if (IS_ERR(uuid_root)) {
4760                 ret = PTR_ERR(uuid_root);
4761                 btrfs_abort_transaction(trans, ret);
4762                 btrfs_end_transaction(trans);
4763                 return ret;
4764         }
4765
4766         fs_info->uuid_root = uuid_root;
4767
4768         ret = btrfs_commit_transaction(trans);
4769         if (ret)
4770                 return ret;
4771
4772         down(&fs_info->uuid_tree_rescan_sem);
4773         task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
4774         if (IS_ERR(task)) {
4775                 /* fs_info->update_uuid_tree_gen remains 0 in all error case */
4776                 btrfs_warn(fs_info, "failed to start uuid_scan task");
4777                 up(&fs_info->uuid_tree_rescan_sem);
4778                 return PTR_ERR(task);
4779         }
4780
4781         return 0;
4782 }
4783
4784 /*
4785  * shrinking a device means finding all of the device extents past
4786  * the new size, and then following the back refs to the chunks.
4787  * The chunk relocation code actually frees the device extent
4788  */
4789 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
4790 {
4791         struct btrfs_fs_info *fs_info = device->fs_info;
4792         struct btrfs_root *root = fs_info->dev_root;
4793         struct btrfs_trans_handle *trans;
4794         struct btrfs_dev_extent *dev_extent = NULL;
4795         struct btrfs_path *path;
4796         u64 length;
4797         u64 chunk_offset;
4798         int ret;
4799         int slot;
4800         int failed = 0;
4801         bool retried = false;
4802         struct extent_buffer *l;
4803         struct btrfs_key key;
4804         struct btrfs_super_block *super_copy = fs_info->super_copy;
4805         u64 old_total = btrfs_super_total_bytes(super_copy);
4806         u64 old_size = btrfs_device_get_total_bytes(device);
4807         u64 diff;
4808         u64 start;
4809
4810         new_size = round_down(new_size, fs_info->sectorsize);
4811         start = new_size;
4812         diff = round_down(old_size - new_size, fs_info->sectorsize);
4813
4814         if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
4815                 return -EINVAL;
4816
4817         path = btrfs_alloc_path();
4818         if (!path)
4819                 return -ENOMEM;
4820
4821         path->reada = READA_BACK;
4822
4823         trans = btrfs_start_transaction(root, 0);
4824         if (IS_ERR(trans)) {
4825                 btrfs_free_path(path);
4826                 return PTR_ERR(trans);
4827         }
4828
4829         mutex_lock(&fs_info->chunk_mutex);
4830
4831         btrfs_device_set_total_bytes(device, new_size);
4832         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
4833                 device->fs_devices->total_rw_bytes -= diff;
4834                 atomic64_sub(diff, &fs_info->free_chunk_space);
4835         }
4836
4837         /*
4838          * Once the device's size has been set to the new size, ensure all
4839          * in-memory chunks are synced to disk so that the loop below sees them
4840          * and relocates them accordingly.
4841          */
4842         if (contains_pending_extent(device, &start, diff)) {
4843                 mutex_unlock(&fs_info->chunk_mutex);
4844                 ret = btrfs_commit_transaction(trans);
4845                 if (ret)
4846                         goto done;
4847         } else {
4848                 mutex_unlock(&fs_info->chunk_mutex);
4849                 btrfs_end_transaction(trans);
4850         }
4851
4852 again:
4853         key.objectid = device->devid;
4854         key.offset = (u64)-1;
4855         key.type = BTRFS_DEV_EXTENT_KEY;
4856
4857         do {
4858                 mutex_lock(&fs_info->reclaim_bgs_lock);
4859                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4860                 if (ret < 0) {
4861                         mutex_unlock(&fs_info->reclaim_bgs_lock);
4862                         goto done;
4863                 }
4864
4865                 ret = btrfs_previous_item(root, path, 0, key.type);
4866                 if (ret) {
4867                         mutex_unlock(&fs_info->reclaim_bgs_lock);
4868                         if (ret < 0)
4869                                 goto done;
4870                         ret = 0;
4871                         btrfs_release_path(path);
4872                         break;
4873                 }
4874
4875                 l = path->nodes[0];
4876                 slot = path->slots[0];
4877                 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
4878
4879                 if (key.objectid != device->devid) {
4880                         mutex_unlock(&fs_info->reclaim_bgs_lock);
4881                         btrfs_release_path(path);
4882                         break;
4883                 }
4884
4885                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
4886                 length = btrfs_dev_extent_length(l, dev_extent);
4887
4888                 if (key.offset + length <= new_size) {
4889                         mutex_unlock(&fs_info->reclaim_bgs_lock);
4890                         btrfs_release_path(path);
4891                         break;
4892                 }
4893
4894                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
4895                 btrfs_release_path(path);
4896
4897                 /*
4898                  * We may be relocating the only data chunk we have,
4899                  * which could potentially end up with losing data's
4900                  * raid profile, so lets allocate an empty one in
4901                  * advance.
4902                  */
4903                 ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
4904                 if (ret < 0) {
4905                         mutex_unlock(&fs_info->reclaim_bgs_lock);
4906                         goto done;
4907                 }
4908
4909                 ret = btrfs_relocate_chunk(fs_info, chunk_offset);
4910                 mutex_unlock(&fs_info->reclaim_bgs_lock);
4911                 if (ret == -ENOSPC) {
4912                         failed++;
4913                 } else if (ret) {
4914                         if (ret == -ETXTBSY) {
4915                                 btrfs_warn(fs_info,
4916                    "could not shrink block group %llu due to active swapfile",
4917                                            chunk_offset);
4918                         }
4919                         goto done;
4920                 }
4921         } while (key.offset-- > 0);
4922
4923         if (failed && !retried) {
4924                 failed = 0;
4925                 retried = true;
4926                 goto again;
4927         } else if (failed && retried) {
4928                 ret = -ENOSPC;
4929                 goto done;
4930         }
4931
4932         /* Shrinking succeeded, else we would be at "done". */
4933         trans = btrfs_start_transaction(root, 0);
4934         if (IS_ERR(trans)) {
4935                 ret = PTR_ERR(trans);
4936                 goto done;
4937         }
4938
4939         mutex_lock(&fs_info->chunk_mutex);
4940         /* Clear all state bits beyond the shrunk device size */
4941         clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
4942                           CHUNK_STATE_MASK);
4943
4944         btrfs_device_set_disk_total_bytes(device, new_size);
4945         if (list_empty(&device->post_commit_list))
4946                 list_add_tail(&device->post_commit_list,
4947                               &trans->transaction->dev_update_list);
4948
4949         WARN_ON(diff > old_total);
4950         btrfs_set_super_total_bytes(super_copy,
4951                         round_down(old_total - diff, fs_info->sectorsize));
4952         mutex_unlock(&fs_info->chunk_mutex);
4953
4954         btrfs_reserve_chunk_metadata(trans, false);
4955         /* Now btrfs_update_device() will change the on-disk size. */
4956         ret = btrfs_update_device(trans, device);
4957         btrfs_trans_release_chunk_metadata(trans);
4958         if (ret < 0) {
4959                 btrfs_abort_transaction(trans, ret);
4960                 btrfs_end_transaction(trans);
4961         } else {
4962                 ret = btrfs_commit_transaction(trans);
4963         }
4964 done:
4965         btrfs_free_path(path);
4966         if (ret) {
4967                 mutex_lock(&fs_info->chunk_mutex);
4968                 btrfs_device_set_total_bytes(device, old_size);
4969                 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
4970                         device->fs_devices->total_rw_bytes += diff;
4971                 atomic64_add(diff, &fs_info->free_chunk_space);
4972                 mutex_unlock(&fs_info->chunk_mutex);
4973         }
4974         return ret;
4975 }
4976
4977 static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
4978                            struct btrfs_key *key,
4979                            struct btrfs_chunk *chunk, int item_size)
4980 {
4981         struct btrfs_super_block *super_copy = fs_info->super_copy;
4982         struct btrfs_disk_key disk_key;
4983         u32 array_size;
4984         u8 *ptr;
4985
4986         lockdep_assert_held(&fs_info->chunk_mutex);
4987
4988         array_size = btrfs_super_sys_array_size(super_copy);
4989         if (array_size + item_size + sizeof(disk_key)
4990                         > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
4991                 return -EFBIG;
4992
4993         ptr = super_copy->sys_chunk_array + array_size;
4994         btrfs_cpu_key_to_disk(&disk_key, key);
4995         memcpy(ptr, &disk_key, sizeof(disk_key));
4996         ptr += sizeof(disk_key);
4997         memcpy(ptr, chunk, item_size);
4998         item_size += sizeof(disk_key);
4999         btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
5000
5001         return 0;
5002 }
5003
5004 /*
5005  * sort the devices in descending order by max_avail, total_avail
5006  */
5007 static int btrfs_cmp_device_info(const void *a, const void *b)
5008 {
5009         const struct btrfs_device_info *di_a = a;
5010         const struct btrfs_device_info *di_b = b;
5011
5012         if (di_a->max_avail > di_b->max_avail)
5013                 return -1;
5014         if (di_a->max_avail < di_b->max_avail)
5015                 return 1;
5016         if (di_a->total_avail > di_b->total_avail)
5017                 return -1;
5018         if (di_a->total_avail < di_b->total_avail)
5019                 return 1;
5020         return 0;
5021 }
5022
5023 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
5024 {
5025         if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
5026                 return;
5027
5028         btrfs_set_fs_incompat(info, RAID56);
5029 }
5030
5031 static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
5032 {
5033         if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4)))
5034                 return;
5035
5036         btrfs_set_fs_incompat(info, RAID1C34);
5037 }
5038
/*
 * Structure used internally for btrfs_create_chunk() function.
 * Wraps needed parameters.
 *
 * The profile derived fields are filled by init_alloc_chunk_ctl(), ndevs by
 * gather_device_info(), and the final geometry (num_stripes, stripe_size,
 * chunk_size) by decide_stripe_size().
 */
struct alloc_chunk_ctl {
        /* Logical start of the new chunk, taken from find_next_chunk() */
        u64 start;
        /* Block group type and profile flags requested by the caller */
        u64 type;
        /* Total number of stripes to allocate */
        int num_stripes;
        /* sub_stripes info for map */
        int sub_stripes;
        /* Stripes per device */
        int dev_stripes;
        /* Maximum number of devices to use */
        int devs_max;
        /* Minimum number of devices to use */
        int devs_min;
        /* ndevs has to be a multiple of this */
        int devs_increment;
        /* Number of copies */
        int ncopies;
        /* Number of stripes worth of bytes to store parity information */
        int nparity;
        /* Largest stripe to ask for, set by the allocation policy */
        u64 max_stripe_size;
        /* Upper bound for chunk_size, set by the allocation policy */
        u64 max_chunk_size;
        /* Least free space a device needs in order to be considered */
        u64 dev_extent_min;
        /* Size of every stripe of the chunk */
        u64 stripe_size;
        /* Logical size of the chunk: stripe_size times the data stripes */
        u64 chunk_size;
        /* Number of devices the chunk is spread over */
        int ndevs;
};
5069
5070 static void init_alloc_chunk_ctl_policy_regular(
5071                                 struct btrfs_fs_devices *fs_devices,
5072                                 struct alloc_chunk_ctl *ctl)
5073 {
5074         struct btrfs_space_info *space_info;
5075
5076         space_info = btrfs_find_space_info(fs_devices->fs_info, ctl->type);
5077         ASSERT(space_info);
5078
5079         ctl->max_chunk_size = READ_ONCE(space_info->chunk_size);
5080         ctl->max_stripe_size = ctl->max_chunk_size;
5081
5082         if (ctl->type & BTRFS_BLOCK_GROUP_SYSTEM)
5083                 ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK);
5084
5085         /* We don't want a chunk larger than 10% of writable space */
5086         ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
5087                                   ctl->max_chunk_size);
5088         ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
5089 }
5090
5091 static void init_alloc_chunk_ctl_policy_zoned(
5092                                       struct btrfs_fs_devices *fs_devices,
5093                                       struct alloc_chunk_ctl *ctl)
5094 {
5095         u64 zone_size = fs_devices->fs_info->zone_size;
5096         u64 limit;
5097         int min_num_stripes = ctl->devs_min * ctl->dev_stripes;
5098         int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies;
5099         u64 min_chunk_size = min_data_stripes * zone_size;
5100         u64 type = ctl->type;
5101
5102         ctl->max_stripe_size = zone_size;
5103         if (type & BTRFS_BLOCK_GROUP_DATA) {
5104                 ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE,
5105                                                  zone_size);
5106         } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
5107                 ctl->max_chunk_size = ctl->max_stripe_size;
5108         } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
5109                 ctl->max_chunk_size = 2 * ctl->max_stripe_size;
5110                 ctl->devs_max = min_t(int, ctl->devs_max,
5111                                       BTRFS_MAX_DEVS_SYS_CHUNK);
5112         } else {
5113                 BUG();
5114         }
5115
5116         /* We don't want a chunk larger than 10% of writable space */
5117         limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1),
5118                                zone_size),
5119                     min_chunk_size);
5120         ctl->max_chunk_size = min(limit, ctl->max_chunk_size);
5121         ctl->dev_extent_min = zone_size * ctl->dev_stripes;
5122 }
5123
5124 static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
5125                                  struct alloc_chunk_ctl *ctl)
5126 {
5127         int index = btrfs_bg_flags_to_raid_index(ctl->type);
5128
5129         ctl->sub_stripes = btrfs_raid_array[index].sub_stripes;
5130         ctl->dev_stripes = btrfs_raid_array[index].dev_stripes;
5131         ctl->devs_max = btrfs_raid_array[index].devs_max;
5132         if (!ctl->devs_max)
5133                 ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info);
5134         ctl->devs_min = btrfs_raid_array[index].devs_min;
5135         ctl->devs_increment = btrfs_raid_array[index].devs_increment;
5136         ctl->ncopies = btrfs_raid_array[index].ncopies;
5137         ctl->nparity = btrfs_raid_array[index].nparity;
5138         ctl->ndevs = 0;
5139
5140         switch (fs_devices->chunk_alloc_policy) {
5141         case BTRFS_CHUNK_ALLOC_REGULAR:
5142                 init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
5143                 break;
5144         case BTRFS_CHUNK_ALLOC_ZONED:
5145                 init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl);
5146                 break;
5147         default:
5148                 BUG();
5149         }
5150 }
5151
5152 static int gather_device_info(struct btrfs_fs_devices *fs_devices,
5153                               struct alloc_chunk_ctl *ctl,
5154                               struct btrfs_device_info *devices_info)
5155 {
5156         struct btrfs_fs_info *info = fs_devices->fs_info;
5157         struct btrfs_device *device;
5158         u64 total_avail;
5159         u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
5160         int ret;
5161         int ndevs = 0;
5162         u64 max_avail;
5163         u64 dev_offset;
5164
5165         /*
5166          * in the first pass through the devices list, we gather information
5167          * about the available holes on each device.
5168          */
5169         list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
5170                 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
5171                         WARN(1, KERN_ERR
5172                                "BTRFS: read-only device in alloc_list\n");
5173                         continue;
5174                 }
5175
5176                 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
5177                                         &device->dev_state) ||
5178                     test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
5179                         continue;
5180
5181                 if (device->total_bytes > device->bytes_used)
5182                         total_avail = device->total_bytes - device->bytes_used;
5183                 else
5184                         total_avail = 0;
5185
5186                 /* If there is no space on this device, skip it. */
5187                 if (total_avail < ctl->dev_extent_min)
5188                         continue;
5189
5190                 ret = find_free_dev_extent(device, dev_extent_want, &dev_offset,
5191                                            &max_avail);
5192                 if (ret && ret != -ENOSPC)
5193                         return ret;
5194
5195                 if (ret == 0)
5196                         max_avail = dev_extent_want;
5197
5198                 if (max_avail < ctl->dev_extent_min) {
5199                         if (btrfs_test_opt(info, ENOSPC_DEBUG))
5200                                 btrfs_debug(info,
5201                         "%s: devid %llu has no free space, have=%llu want=%llu",
5202                                             __func__, device->devid, max_avail,
5203                                             ctl->dev_extent_min);
5204                         continue;
5205                 }
5206
5207                 if (ndevs == fs_devices->rw_devices) {
5208                         WARN(1, "%s: found more than %llu devices\n",
5209                              __func__, fs_devices->rw_devices);
5210                         break;
5211                 }
5212                 devices_info[ndevs].dev_offset = dev_offset;
5213                 devices_info[ndevs].max_avail = max_avail;
5214                 devices_info[ndevs].total_avail = total_avail;
5215                 devices_info[ndevs].dev = device;
5216                 ++ndevs;
5217         }
5218         ctl->ndevs = ndevs;
5219
5220         /*
5221          * now sort the devices by hole size / available space
5222          */
5223         sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
5224              btrfs_cmp_device_info, NULL);
5225
5226         return 0;
5227 }
5228
5229 static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
5230                                       struct btrfs_device_info *devices_info)
5231 {
5232         /* Number of stripes that count for block group size */
5233         int data_stripes;
5234
5235         /*
5236          * The primary goal is to maximize the number of stripes, so use as
5237          * many devices as possible, even if the stripes are not maximum sized.
5238          *
5239          * The DUP profile stores more than one stripe per device, the
5240          * max_avail is the total size so we have to adjust.
5241          */
5242         ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
5243                                    ctl->dev_stripes);
5244         ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5245
5246         /* This will have to be fixed for RAID1 and RAID10 over more drives */
5247         data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
5248
5249         /*
5250          * Use the number of data stripes to figure out how big this chunk is
5251          * really going to be in terms of logical address space, and compare
5252          * that answer with the max chunk size. If it's higher, we try to
5253          * reduce stripe_size.
5254          */
5255         if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
5256                 /*
5257                  * Reduce stripe_size, round it up to a 16MB boundary again and
5258                  * then use it, unless it ends up being even bigger than the
5259                  * previous value we had already.
5260                  */
5261                 ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
5262                                                         data_stripes), SZ_16M),
5263                                        ctl->stripe_size);
5264         }
5265
5266         /* Align to BTRFS_STRIPE_LEN */
5267         ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
5268         ctl->chunk_size = ctl->stripe_size * data_stripes;
5269
5270         return 0;
5271 }
5272
5273 static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
5274                                     struct btrfs_device_info *devices_info)
5275 {
5276         u64 zone_size = devices_info[0].dev->zone_info->zone_size;
5277         /* Number of stripes that count for block group size */
5278         int data_stripes;
5279
5280         /*
5281          * It should hold because:
5282          *    dev_extent_min == dev_extent_want == zone_size * dev_stripes
5283          */
5284         ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min);
5285
5286         ctl->stripe_size = zone_size;
5287         ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5288         data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
5289
5290         /* stripe_size is fixed in zoned filesysmte. Reduce ndevs instead. */
5291         if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
5292                 ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies,
5293                                              ctl->stripe_size) + ctl->nparity,
5294                                      ctl->dev_stripes);
5295                 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5296                 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
5297                 ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size);
5298         }
5299
5300         ctl->chunk_size = ctl->stripe_size * data_stripes;
5301
5302         return 0;
5303 }
5304
5305 static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
5306                               struct alloc_chunk_ctl *ctl,
5307                               struct btrfs_device_info *devices_info)
5308 {
5309         struct btrfs_fs_info *info = fs_devices->fs_info;
5310
5311         /*
5312          * Round down to number of usable stripes, devs_increment can be any
5313          * number so we can't use round_down() that requires power of 2, while
5314          * rounddown is safe.
5315          */
5316         ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment);
5317
5318         if (ctl->ndevs < ctl->devs_min) {
5319                 if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
5320                         btrfs_debug(info,
5321         "%s: not enough devices with free space: have=%d minimum required=%d",
5322                                     __func__, ctl->ndevs, ctl->devs_min);
5323                 }
5324                 return -ENOSPC;
5325         }
5326
5327         ctl->ndevs = min(ctl->ndevs, ctl->devs_max);
5328
5329         switch (fs_devices->chunk_alloc_policy) {
5330         case BTRFS_CHUNK_ALLOC_REGULAR:
5331                 return decide_stripe_size_regular(ctl, devices_info);
5332         case BTRFS_CHUNK_ALLOC_ZONED:
5333                 return decide_stripe_size_zoned(ctl, devices_info);
5334         default:
5335                 BUG();
5336         }
5337 }
5338
5339 static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
5340                         struct alloc_chunk_ctl *ctl,
5341                         struct btrfs_device_info *devices_info)
5342 {
5343         struct btrfs_fs_info *info = trans->fs_info;
5344         struct map_lookup *map = NULL;
5345         struct extent_map_tree *em_tree;
5346         struct btrfs_block_group *block_group;
5347         struct extent_map *em;
5348         u64 start = ctl->start;
5349         u64 type = ctl->type;
5350         int ret;
5351         int i;
5352         int j;
5353
5354         map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
5355         if (!map)
5356                 return ERR_PTR(-ENOMEM);
5357         map->num_stripes = ctl->num_stripes;
5358
5359         for (i = 0; i < ctl->ndevs; ++i) {
5360                 for (j = 0; j < ctl->dev_stripes; ++j) {
5361                         int s = i * ctl->dev_stripes + j;
5362                         map->stripes[s].dev = devices_info[i].dev;
5363                         map->stripes[s].physical = devices_info[i].dev_offset +
5364                                                    j * ctl->stripe_size;
5365                 }
5366         }
5367         map->stripe_len = BTRFS_STRIPE_LEN;
5368         map->io_align = BTRFS_STRIPE_LEN;
5369         map->io_width = BTRFS_STRIPE_LEN;
5370         map->type = type;
5371         map->sub_stripes = ctl->sub_stripes;
5372
5373         trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);
5374
5375         em = alloc_extent_map();
5376         if (!em) {
5377                 kfree(map);
5378                 return ERR_PTR(-ENOMEM);
5379         }
5380         set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
5381         em->map_lookup = map;
5382         em->start = start;
5383         em->len = ctl->chunk_size;
5384         em->block_start = 0;
5385         em->block_len = em->len;
5386         em->orig_block_len = ctl->stripe_size;
5387
5388         em_tree = &info->mapping_tree;
5389         write_lock(&em_tree->lock);
5390         ret = add_extent_mapping(em_tree, em, 0);
5391         if (ret) {
5392                 write_unlock(&em_tree->lock);
5393                 free_extent_map(em);
5394                 return ERR_PTR(ret);
5395         }
5396         write_unlock(&em_tree->lock);
5397
5398         block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
5399         if (IS_ERR(block_group))
5400                 goto error_del_extent;
5401
5402         for (i = 0; i < map->num_stripes; i++) {
5403                 struct btrfs_device *dev = map->stripes[i].dev;
5404
5405                 btrfs_device_set_bytes_used(dev,
5406                                             dev->bytes_used + ctl->stripe_size);
5407                 if (list_empty(&dev->post_commit_list))
5408                         list_add_tail(&dev->post_commit_list,
5409                                       &trans->transaction->dev_update_list);
5410         }
5411
5412         atomic64_sub(ctl->stripe_size * map->num_stripes,
5413                      &info->free_chunk_space);
5414
5415         free_extent_map(em);
5416         check_raid56_incompat_flag(info, type);
5417         check_raid1c34_incompat_flag(info, type);
5418
5419         return block_group;
5420
5421 error_del_extent:
5422         write_lock(&em_tree->lock);
5423         remove_extent_mapping(em_tree, em);
5424         write_unlock(&em_tree->lock);
5425
5426         /* One for our allocation */
5427         free_extent_map(em);
5428         /* One for the tree reference */
5429         free_extent_map(em);
5430
5431         return block_group;
5432 }
5433
5434 struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
5435                                             u64 type)
5436 {
5437         struct btrfs_fs_info *info = trans->fs_info;
5438         struct btrfs_fs_devices *fs_devices = info->fs_devices;
5439         struct btrfs_device_info *devices_info = NULL;
5440         struct alloc_chunk_ctl ctl;
5441         struct btrfs_block_group *block_group;
5442         int ret;
5443
5444         lockdep_assert_held(&info->chunk_mutex);
5445
5446         if (!alloc_profile_is_valid(type, 0)) {
5447                 ASSERT(0);
5448                 return ERR_PTR(-EINVAL);
5449         }
5450
5451         if (list_empty(&fs_devices->alloc_list)) {
5452                 if (btrfs_test_opt(info, ENOSPC_DEBUG))
5453                         btrfs_debug(info, "%s: no writable device", __func__);
5454                 return ERR_PTR(-ENOSPC);
5455         }
5456
5457         if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
5458                 btrfs_err(info, "invalid chunk type 0x%llx requested", type);
5459                 ASSERT(0);
5460                 return ERR_PTR(-EINVAL);
5461         }
5462
5463         ctl.start = find_next_chunk(info);
5464         ctl.type = type;
5465         init_alloc_chunk_ctl(fs_devices, &ctl);
5466
5467         devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
5468                                GFP_NOFS);
5469         if (!devices_info)
5470                 return ERR_PTR(-ENOMEM);
5471
5472         ret = gather_device_info(fs_devices, &ctl, devices_info);
5473         if (ret < 0) {
5474                 block_group = ERR_PTR(ret);
5475                 goto out;
5476         }
5477
5478         ret = decide_stripe_size(fs_devices, &ctl, devices_info);
5479         if (ret < 0) {
5480                 block_group = ERR_PTR(ret);
5481                 goto out;
5482         }
5483
5484         block_group = create_chunk(trans, &ctl, devices_info);
5485
5486 out:
5487         kfree(devices_info);
5488         return block_group;
5489 }
5490
5491 /*
5492  * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the
5493  * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system
5494  * chunks.
5495  *
5496  * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
5497  * phases.
5498  */
5499 int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
5500                                      struct btrfs_block_group *bg)
5501 {
5502         struct btrfs_fs_info *fs_info = trans->fs_info;
5503         struct btrfs_root *chunk_root = fs_info->chunk_root;
5504         struct btrfs_key key;
5505         struct btrfs_chunk *chunk;
5506         struct btrfs_stripe *stripe;
5507         struct extent_map *em;
5508         struct map_lookup *map;
5509         size_t item_size;
5510         int i;
5511         int ret;
5512
5513         /*
5514          * We take the chunk_mutex for 2 reasons:
5515          *
5516          * 1) Updates and insertions in the chunk btree must be done while holding
5517          *    the chunk_mutex, as well as updating the system chunk array in the
5518          *    superblock. See the comment on top of btrfs_chunk_alloc() for the
5519          *    details;
5520          *
5521          * 2) To prevent races with the final phase of a device replace operation
5522          *    that replaces the device object associated with the map's stripes,
5523          *    because the device object's id can change at any time during that
5524          *    final phase of the device replace operation
5525          *    (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
5526          *    replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
5527          *    which would cause a failure when updating the device item, which does
5528          *    not exists, or persisting a stripe of the chunk item with such ID.
5529          *    Here we can't use the device_list_mutex because our caller already
5530          *    has locked the chunk_mutex, and the final phase of device replace
5531          *    acquires both mutexes - first the device_list_mutex and then the
5532          *    chunk_mutex. Using any of those two mutexes protects us from a
5533          *    concurrent device replace.
5534          */
5535         lockdep_assert_held(&fs_info->chunk_mutex);
5536
5537         em = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
5538         if (IS_ERR(em)) {
5539                 ret = PTR_ERR(em);
5540                 btrfs_abort_transaction(trans, ret);
5541                 return ret;
5542         }
5543
5544         map = em->map_lookup;
5545         item_size = btrfs_chunk_item_size(map->num_stripes);
5546
5547         chunk = kzalloc(item_size, GFP_NOFS);
5548         if (!chunk) {
5549                 ret = -ENOMEM;
5550                 btrfs_abort_transaction(trans, ret);
5551                 goto out;
5552         }
5553
5554         for (i = 0; i < map->num_stripes; i++) {
5555                 struct btrfs_device *device = map->stripes[i].dev;
5556
5557                 ret = btrfs_update_device(trans, device);
5558                 if (ret)
5559                         goto out;
5560         }
5561
5562         stripe = &chunk->stripe;
5563         for (i = 0; i < map->num_stripes; i++) {
5564                 struct btrfs_device *device = map->stripes[i].dev;
5565                 const u64 dev_offset = map->stripes[i].physical;
5566
5567                 btrfs_set_stack_stripe_devid(stripe, device->devid);
5568                 btrfs_set_stack_stripe_offset(stripe, dev_offset);
5569                 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
5570                 stripe++;
5571         }
5572
5573         btrfs_set_stack_chunk_length(chunk, bg->length);
5574         btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID);
5575         btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
5576         btrfs_set_stack_chunk_type(chunk, map->type);
5577         btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
5578         btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
5579         btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
5580         btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
5581         btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
5582
5583         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
5584         key.type = BTRFS_CHUNK_ITEM_KEY;
5585         key.offset = bg->start;
5586
5587         ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
5588         if (ret)
5589                 goto out;
5590
5591         bg->chunk_item_inserted = 1;
5592
5593         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
5594                 ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
5595                 if (ret)
5596                         goto out;
5597         }
5598
5599 out:
5600         kfree(chunk);
5601         free_extent_map(em);
5602         return ret;
5603 }
5604
5605 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
5606 {
5607         struct btrfs_fs_info *fs_info = trans->fs_info;
5608         u64 alloc_profile;
5609         struct btrfs_block_group *meta_bg;
5610         struct btrfs_block_group *sys_bg;
5611
5612         /*
5613          * When adding a new device for sprouting, the seed device is read-only
5614          * so we must first allocate a metadata and a system chunk. But before
5615          * adding the block group items to the extent, device and chunk btrees,
5616          * we must first:
5617          *
5618          * 1) Create both chunks without doing any changes to the btrees, as
5619          *    otherwise we would get -ENOSPC since the block groups from the
5620          *    seed device are read-only;
5621          *
5622          * 2) Add the device item for the new sprout device - finishing the setup
5623          *    of a new block group requires updating the device item in the chunk
5624          *    btree, so it must exist when we attempt to do it. The previous step
5625          *    ensures this does not fail with -ENOSPC.
5626          *
5627          * After that we can add the block group items to their btrees:
5628          * update existing device item in the chunk btree, add a new block group
5629          * item to the extent btree, add a new chunk item to the chunk btree and
5630          * finally add the new device extent items to the devices btree.
5631          */
5632
5633         alloc_profile = btrfs_metadata_alloc_profile(fs_info);
5634         meta_bg = btrfs_create_chunk(trans, alloc_profile);
5635         if (IS_ERR(meta_bg))
5636                 return PTR_ERR(meta_bg);
5637
5638         alloc_profile = btrfs_system_alloc_profile(fs_info);
5639         sys_bg = btrfs_create_chunk(trans, alloc_profile);
5640         if (IS_ERR(sys_bg))
5641                 return PTR_ERR(sys_bg);
5642
5643         return 0;
5644 }
5645
5646 static inline int btrfs_chunk_max_errors(struct map_lookup *map)
5647 {
5648         const int index = btrfs_bg_flags_to_raid_index(map->type);
5649
5650         return btrfs_raid_array[index].tolerated_failures;
5651 }
5652
5653 bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset)
5654 {
5655         struct extent_map *em;
5656         struct map_lookup *map;
5657         int miss_ndevs = 0;
5658         int i;
5659         bool ret = true;
5660
5661         em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
5662         if (IS_ERR(em))
5663                 return false;
5664
5665         map = em->map_lookup;
5666         for (i = 0; i < map->num_stripes; i++) {
5667                 if (test_bit(BTRFS_DEV_STATE_MISSING,
5668                                         &map->stripes[i].dev->dev_state)) {
5669                         miss_ndevs++;
5670                         continue;
5671                 }
5672                 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
5673                                         &map->stripes[i].dev->dev_state)) {
5674                         ret = false;
5675                         goto end;
5676                 }
5677         }
5678
5679         /*
5680          * If the number of missing devices is larger than max errors, we can
5681          * not write the data into that chunk successfully.
5682          */
5683         if (miss_ndevs > btrfs_chunk_max_errors(map))
5684                 ret = false;
5685 end:
5686         free_extent_map(em);
5687         return ret;
5688 }
5689
5690 void btrfs_mapping_tree_free(struct extent_map_tree *tree)
5691 {
5692         struct extent_map *em;
5693
5694         while (1) {
5695                 write_lock(&tree->lock);
5696                 em = lookup_extent_mapping(tree, 0, (u64)-1);
5697                 if (em)
5698                         remove_extent_mapping(tree, em);
5699                 write_unlock(&tree->lock);
5700                 if (!em)
5701                         break;
5702                 /* once for us */
5703                 free_extent_map(em);
5704                 /* once for the tree */
5705                 free_extent_map(em);
5706         }
5707 }
5708
5709 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5710 {
5711         struct extent_map *em;
5712         struct map_lookup *map;
5713         int ret;
5714
5715         em = btrfs_get_chunk_map(fs_info, logical, len);
5716         if (IS_ERR(em))
5717                 /*
5718                  * We could return errors for these cases, but that could get
5719                  * ugly and we'd probably do the same thing which is just not do
5720                  * anything else and exit, so return 1 so the callers don't try
5721                  * to use other copies.
5722                  */
5723                 return 1;
5724
5725         map = em->map_lookup;
5726         if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK))
5727                 ret = map->num_stripes;
5728         else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5729                 ret = map->sub_stripes;
5730         else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
5731                 ret = 2;
5732         else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
5733                 /*
5734                  * There could be two corrupted data stripes, we need
5735                  * to loop retry in order to rebuild the correct data.
5736                  *
5737                  * Fail a stripe at a time on every retry except the
5738                  * stripe under reconstruction.
5739                  */
5740                 ret = map->num_stripes;
5741         else
5742                 ret = 1;
5743         free_extent_map(em);
5744
5745         down_read(&fs_info->dev_replace.rwsem);
5746         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
5747             fs_info->dev_replace.tgtdev)
5748                 ret++;
5749         up_read(&fs_info->dev_replace.rwsem);
5750
5751         return ret;
5752 }
5753
5754 unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
5755                                     u64 logical)
5756 {
5757         struct extent_map *em;
5758         struct map_lookup *map;
5759         unsigned long len = fs_info->sectorsize;
5760
5761         if (!btrfs_fs_incompat(fs_info, RAID56))
5762                 return len;
5763
5764         em = btrfs_get_chunk_map(fs_info, logical, len);
5765
5766         if (!WARN_ON(IS_ERR(em))) {
5767                 map = em->map_lookup;
5768                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5769                         len = map->stripe_len * nr_data_stripes(map);
5770                 free_extent_map(em);
5771         }
5772         return len;
5773 }
5774
5775 int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5776 {
5777         struct extent_map *em;
5778         struct map_lookup *map;
5779         int ret = 0;
5780
5781         if (!btrfs_fs_incompat(fs_info, RAID56))
5782                 return 0;
5783
5784         em = btrfs_get_chunk_map(fs_info, logical, len);
5785
5786         if(!WARN_ON(IS_ERR(em))) {
5787                 map = em->map_lookup;
5788                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5789                         ret = 1;
5790                 free_extent_map(em);
5791         }
5792         return ret;
5793 }
5794
5795 static int find_live_mirror(struct btrfs_fs_info *fs_info,
5796                             struct map_lookup *map, int first,
5797                             int dev_replace_is_ongoing)
5798 {
5799         int i;
5800         int num_stripes;
5801         int preferred_mirror;
5802         int tolerance;
5803         struct btrfs_device *srcdev;
5804
5805         ASSERT((map->type &
5806                  (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));
5807
5808         if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5809                 num_stripes = map->sub_stripes;
5810         else
5811                 num_stripes = map->num_stripes;
5812
5813         switch (fs_info->fs_devices->read_policy) {
5814         default:
5815                 /* Shouldn't happen, just warn and use pid instead of failing */
5816                 btrfs_warn_rl(fs_info,
5817                               "unknown read_policy type %u, reset to pid",
5818                               fs_info->fs_devices->read_policy);
5819                 fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID;
5820                 fallthrough;
5821         case BTRFS_READ_POLICY_PID:
5822                 preferred_mirror = first + (current->pid % num_stripes);
5823                 break;
5824         }
5825
5826         if (dev_replace_is_ongoing &&
5827             fs_info->dev_replace.cont_reading_from_srcdev_mode ==
5828              BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
5829                 srcdev = fs_info->dev_replace.srcdev;
5830         else
5831                 srcdev = NULL;
5832
5833         /*
5834          * try to avoid the drive that is the source drive for a
5835          * dev-replace procedure, only choose it if no other non-missing
5836          * mirror is available
5837          */
5838         for (tolerance = 0; tolerance < 2; tolerance++) {
5839                 if (map->stripes[preferred_mirror].dev->bdev &&
5840                     (tolerance || map->stripes[preferred_mirror].dev != srcdev))
5841                         return preferred_mirror;
5842                 for (i = first; i < first + num_stripes; i++) {
5843                         if (map->stripes[i].dev->bdev &&
5844                             (tolerance || map->stripes[i].dev != srcdev))
5845                                 return i;
5846                 }
5847         }
5848
5849         /* we couldn't find one that doesn't fail.  Just return something
5850          * and the io error handling code will clean up eventually
5851          */
5852         return preferred_mirror;
5853 }
5854
5855 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */
5856 static void sort_parity_stripes(struct btrfs_io_context *bioc, int num_stripes)
5857 {
5858         int i;
5859         int again = 1;
5860
5861         while (again) {
5862                 again = 0;
5863                 for (i = 0; i < num_stripes - 1; i++) {
5864                         /* Swap if parity is on a smaller index */
5865                         if (bioc->raid_map[i] > bioc->raid_map[i + 1]) {
5866                                 swap(bioc->stripes[i], bioc->stripes[i + 1]);
5867                                 swap(bioc->raid_map[i], bioc->raid_map[i + 1]);
5868                                 again = 1;
5869                         }
5870                 }
5871         }
5872 }
5873
5874 static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
5875                                                        int total_stripes,
5876                                                        int real_stripes)
5877 {
5878         struct btrfs_io_context *bioc = kzalloc(
5879                  /* The size of btrfs_io_context */
5880                 sizeof(struct btrfs_io_context) +
5881                 /* Plus the variable array for the stripes */
5882                 sizeof(struct btrfs_io_stripe) * (total_stripes) +
5883                 /* Plus the variable array for the tgt dev */
5884                 sizeof(int) * (real_stripes) +
5885                 /*
5886                  * Plus the raid_map, which includes both the tgt dev
5887                  * and the stripes.
5888                  */
5889                 sizeof(u64) * (total_stripes),
5890                 GFP_NOFS|__GFP_NOFAIL);
5891
5892         atomic_set(&bioc->error, 0);
5893         refcount_set(&bioc->refs, 1);
5894
5895         bioc->fs_info = fs_info;
5896         bioc->tgtdev_map = (int *)(bioc->stripes + total_stripes);
5897         bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes);
5898
5899         return bioc;
5900 }
5901
5902 void btrfs_get_bioc(struct btrfs_io_context *bioc)
5903 {
5904         WARN_ON(!refcount_read(&bioc->refs));
5905         refcount_inc(&bioc->refs);
5906 }
5907
5908 void btrfs_put_bioc(struct btrfs_io_context *bioc)
5909 {
5910         if (!bioc)
5911                 return;
5912         if (refcount_dec_and_test(&bioc->refs))
5913                 kfree(bioc);
5914 }
5915
5916 /*
5917  * Please note that, discard won't be sent to target device of device
5918  * replace.
5919  */
5920 struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
5921                                                u64 logical, u64 *length_ret,
5922                                                u32 *num_stripes)
5923 {
5924         struct extent_map *em;
5925         struct map_lookup *map;
5926         struct btrfs_discard_stripe *stripes;
5927         u64 length = *length_ret;
5928         u64 offset;
5929         u64 stripe_nr;
5930         u64 stripe_nr_end;
5931         u64 stripe_end_offset;
5932         u64 stripe_cnt;
5933         u64 stripe_len;
5934         u64 stripe_offset;
5935         u32 stripe_index;
5936         u32 factor = 0;
5937         u32 sub_stripes = 0;
5938         u64 stripes_per_dev = 0;
5939         u32 remaining_stripes = 0;
5940         u32 last_stripe = 0;
5941         int ret;
5942         int i;
5943
5944         em = btrfs_get_chunk_map(fs_info, logical, length);
5945         if (IS_ERR(em))
5946                 return ERR_CAST(em);
5947
5948         map = em->map_lookup;
5949
5950         /* we don't discard raid56 yet */
5951         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5952                 ret = -EOPNOTSUPP;
5953                 goto out_free_map;
5954 }
5955
5956         offset = logical - em->start;
5957         length = min_t(u64, em->start + em->len - logical, length);
5958         *length_ret = length;
5959
5960         stripe_len = map->stripe_len;
5961         /*
5962          * stripe_nr counts the total number of stripes we have to stride
5963          * to get to this block
5964          */
5965         stripe_nr = div64_u64(offset, stripe_len);
5966
5967         /* stripe_offset is the offset of this block in its stripe */
5968         stripe_offset = offset - stripe_nr * stripe_len;
5969
5970         stripe_nr_end = round_up(offset + length, map->stripe_len);
5971         stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
5972         stripe_cnt = stripe_nr_end - stripe_nr;
5973         stripe_end_offset = stripe_nr_end * map->stripe_len -
5974                             (offset + length);
5975         /*
5976          * after this, stripe_nr is the number of stripes on this
5977          * device we have to walk to find the data, and stripe_index is
5978          * the number of our device in the stripe array
5979          */
5980         *num_stripes = 1;
5981         stripe_index = 0;
5982         if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
5983                          BTRFS_BLOCK_GROUP_RAID10)) {
5984                 if (map->type & BTRFS_BLOCK_GROUP_RAID0)
5985                         sub_stripes = 1;
5986                 else
5987                         sub_stripes = map->sub_stripes;
5988
5989                 factor = map->num_stripes / sub_stripes;
5990                 *num_stripes = min_t(u64, map->num_stripes,
5991                                     sub_stripes * stripe_cnt);
5992                 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
5993                 stripe_index *= sub_stripes;
5994                 stripes_per_dev = div_u64_rem(stripe_cnt, factor,
5995                                               &remaining_stripes);
5996                 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
5997                 last_stripe *= sub_stripes;
5998         } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
5999                                 BTRFS_BLOCK_GROUP_DUP)) {
6000                 *num_stripes = map->num_stripes;
6001         } else {
6002                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
6003                                         &stripe_index);
6004         }
6005
6006         stripes = kcalloc(*num_stripes, sizeof(*stripes), GFP_NOFS);
6007         if (!stripes) {
6008                 ret = -ENOMEM;
6009                 goto out_free_map;
6010         }
6011
6012         for (i = 0; i < *num_stripes; i++) {
6013                 stripes[i].physical =
6014                         map->stripes[stripe_index].physical +
6015                         stripe_offset + stripe_nr * map->stripe_len;
6016                 stripes[i].dev = map->stripes[stripe_index].dev;
6017
6018                 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
6019                                  BTRFS_BLOCK_GROUP_RAID10)) {
6020                         stripes[i].length = stripes_per_dev * map->stripe_len;
6021
6022                         if (i / sub_stripes < remaining_stripes)
6023                                 stripes[i].length += map->stripe_len;
6024
6025                         /*
6026                          * Special for the first stripe and
6027                          * the last stripe:
6028                          *
6029                          * |-------|...|-------|
6030                          *     |----------|
6031                          *    off     end_off
6032                          */
6033                         if (i < sub_stripes)
6034                                 stripes[i].length -= stripe_offset;
6035
6036                         if (stripe_index >= last_stripe &&
6037                             stripe_index <= (last_stripe +
6038                                              sub_stripes - 1))
6039                                 stripes[i].length -= stripe_end_offset;
6040
6041                         if (i == sub_stripes - 1)
6042                                 stripe_offset = 0;
6043                 } else {
6044                         stripes[i].length = length;
6045                 }
6046
6047                 stripe_index++;
6048                 if (stripe_index == map->num_stripes) {
6049                         stripe_index = 0;
6050                         stripe_nr++;
6051                 }
6052         }
6053
6054         free_extent_map(em);
6055         return stripes;
6056 out_free_map:
6057         free_extent_map(em);
6058         return ERR_PTR(ret);
6059 }
6060
6061 /*
6062  * In dev-replace case, for repair case (that's the only case where the mirror
6063  * is selected explicitly when calling btrfs_map_block), blocks left of the
6064  * left cursor can also be read from the target drive.
6065  *
6066  * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
6067  * array of stripes.
6068  * For READ, it also needs to be supported using the same mirror number.
6069  *
6070  * If the requested block is not left of the left cursor, EIO is returned. This
6071  * can happen because btrfs_num_copies() returns one more in the dev-replace
6072  * case.
6073  */
6074 static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
6075                                          u64 logical, u64 length,
6076                                          u64 srcdev_devid, int *mirror_num,
6077                                          u64 *physical)
6078 {
6079         struct btrfs_io_context *bioc = NULL;
6080         int num_stripes;
6081         int index_srcdev = 0;
6082         int found = 0;
6083         u64 physical_of_found = 0;
6084         int i;
6085         int ret = 0;
6086
6087         ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
6088                                 logical, &length, &bioc, 0, 0);
6089         if (ret) {
6090                 ASSERT(bioc == NULL);
6091                 return ret;
6092         }
6093
6094         num_stripes = bioc->num_stripes;
6095         if (*mirror_num > num_stripes) {
6096                 /*
6097                  * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
6098                  * that means that the requested area is not left of the left
6099                  * cursor
6100                  */
6101                 btrfs_put_bioc(bioc);
6102                 return -EIO;
6103         }
6104
6105         /*
6106          * process the rest of the function using the mirror_num of the source
6107          * drive. Therefore look it up first.  At the end, patch the device
6108          * pointer to the one of the target drive.
6109          */
6110         for (i = 0; i < num_stripes; i++) {
6111                 if (bioc->stripes[i].dev->devid != srcdev_devid)
6112                         continue;
6113
6114                 /*
6115                  * In case of DUP, in order to keep it simple, only add the
6116                  * mirror with the lowest physical address
6117                  */
6118                 if (found &&
6119                     physical_of_found <= bioc->stripes[i].physical)
6120                         continue;
6121
6122                 index_srcdev = i;
6123                 found = 1;
6124                 physical_of_found = bioc->stripes[i].physical;
6125         }
6126
6127         btrfs_put_bioc(bioc);
6128
6129         ASSERT(found);
6130         if (!found)
6131                 return -EIO;
6132
6133         *mirror_num = index_srcdev + 1;
6134         *physical = physical_of_found;
6135         return ret;
6136 }
6137
6138 static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
6139 {
6140         struct btrfs_block_group *cache;
6141         bool ret;
6142
6143         /* Non zoned filesystem does not use "to_copy" flag */
6144         if (!btrfs_is_zoned(fs_info))
6145                 return false;
6146
6147         cache = btrfs_lookup_block_group(fs_info, logical);
6148
6149         spin_lock(&cache->lock);
6150         ret = cache->to_copy;
6151         spin_unlock(&cache->lock);
6152
6153         btrfs_put_block_group(cache);
6154         return ret;
6155 }
6156
6157 static void handle_ops_on_dev_replace(enum btrfs_map_op op,
6158                                       struct btrfs_io_context **bioc_ret,
6159                                       struct btrfs_dev_replace *dev_replace,
6160                                       u64 logical,
6161                                       int *num_stripes_ret, int *max_errors_ret)
6162 {
6163         struct btrfs_io_context *bioc = *bioc_ret;
6164         u64 srcdev_devid = dev_replace->srcdev->devid;
6165         int tgtdev_indexes = 0;
6166         int num_stripes = *num_stripes_ret;
6167         int max_errors = *max_errors_ret;
6168         int i;
6169
6170         if (op == BTRFS_MAP_WRITE) {
6171                 int index_where_to_add;
6172
6173                 /*
6174                  * A block group which have "to_copy" set will eventually
6175                  * copied by dev-replace process. We can avoid cloning IO here.
6176                  */
6177                 if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical))
6178                         return;
6179
6180                 /*
6181                  * duplicate the write operations while the dev replace
6182                  * procedure is running. Since the copying of the old disk to
6183                  * the new disk takes place at run time while the filesystem is
6184                  * mounted writable, the regular write operations to the old
6185                  * disk have to be duplicated to go to the new disk as well.
6186                  *
6187                  * Note that device->missing is handled by the caller, and that
6188                  * the write to the old disk is already set up in the stripes
6189                  * array.
6190                  */
6191                 index_where_to_add = num_stripes;
6192                 for (i = 0; i < num_stripes; i++) {
6193                         if (bioc->stripes[i].dev->devid == srcdev_devid) {
6194                                 /* write to new disk, too */
6195                                 struct btrfs_io_stripe *new =
6196                                         bioc->stripes + index_where_to_add;
6197                                 struct btrfs_io_stripe *old =
6198                                         bioc->stripes + i;
6199
6200                                 new->physical = old->physical;
6201                                 new->dev = dev_replace->tgtdev;
6202                                 bioc->tgtdev_map[i] = index_where_to_add;
6203                                 index_where_to_add++;
6204                                 max_errors++;
6205                                 tgtdev_indexes++;
6206                         }
6207                 }
6208                 num_stripes = index_where_to_add;
6209         } else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
6210                 int index_srcdev = 0;
6211                 int found = 0;
6212                 u64 physical_of_found = 0;
6213
6214                 /*
6215                  * During the dev-replace procedure, the target drive can also
6216                  * be used to read data in case it is needed to repair a corrupt
6217                  * block elsewhere. This is possible if the requested area is
6218                  * left of the left cursor. In this area, the target drive is a
6219                  * full copy of the source drive.
6220                  */
6221                 for (i = 0; i < num_stripes; i++) {
6222                         if (bioc->stripes[i].dev->devid == srcdev_devid) {
6223                                 /*
6224                                  * In case of DUP, in order to keep it simple,
6225                                  * only add the mirror with the lowest physical
6226                                  * address
6227                                  */
6228                                 if (found &&
6229                                     physical_of_found <= bioc->stripes[i].physical)
6230                                         continue;
6231                                 index_srcdev = i;
6232                                 found = 1;
6233                                 physical_of_found = bioc->stripes[i].physical;
6234                         }
6235                 }
6236                 if (found) {
6237                         struct btrfs_io_stripe *tgtdev_stripe =
6238                                 bioc->stripes + num_stripes;
6239
6240                         tgtdev_stripe->physical = physical_of_found;
6241                         tgtdev_stripe->dev = dev_replace->tgtdev;
6242                         bioc->tgtdev_map[index_srcdev] = num_stripes;
6243
6244                         tgtdev_indexes++;
6245                         num_stripes++;
6246                 }
6247         }
6248
6249         *num_stripes_ret = num_stripes;
6250         *max_errors_ret = max_errors;
6251         bioc->num_tgtdevs = tgtdev_indexes;
6252         *bioc_ret = bioc;
6253 }
6254
6255 static bool need_full_stripe(enum btrfs_map_op op)
6256 {
6257         return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
6258 }
6259
6260 /*
6261  * Calculate the geometry of a particular (address, len) tuple. This
6262  * information is used to calculate how big a particular bio can get before it
6263  * straddles a stripe.
6264  *
6265  * @fs_info: the filesystem
6266  * @em:      mapping containing the logical extent
6267  * @op:      type of operation - write or read
6268  * @logical: address that we want to figure out the geometry of
6269  * @io_geom: pointer used to return values
6270  *
6271  * Returns < 0 in case a chunk for the given logical address cannot be found,
6272  * usually shouldn't happen unless @logical is corrupted, 0 otherwise.
6273  */
6274 int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
6275                           enum btrfs_map_op op, u64 logical,
6276                           struct btrfs_io_geometry *io_geom)
6277 {
6278         struct map_lookup *map;
6279         u64 len;
6280         u64 offset;
6281         u64 stripe_offset;
6282         u64 stripe_nr;
6283         u32 stripe_len;
6284         u64 raid56_full_stripe_start = (u64)-1;
6285         int data_stripes;
6286
6287         ASSERT(op != BTRFS_MAP_DISCARD);
6288
6289         map = em->map_lookup;
6290         /* Offset of this logical address in the chunk */
6291         offset = logical - em->start;
6292         /* Len of a stripe in a chunk */
6293         stripe_len = map->stripe_len;
6294         /*
6295          * Stripe_nr is where this block falls in
6296          * stripe_offset is the offset of this block in its stripe.
6297          */
6298         stripe_nr = div64_u64_rem(offset, stripe_len, &stripe_offset);
6299         ASSERT(stripe_offset < U32_MAX);
6300
6301         data_stripes = nr_data_stripes(map);
6302
6303         /* Only stripe based profiles needs to check against stripe length. */
6304         if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) {
6305                 u64 max_len = stripe_len - stripe_offset;
6306
6307                 /*
6308                  * In case of raid56, we need to know the stripe aligned start
6309                  */
6310                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
6311                         unsigned long full_stripe_len = stripe_len * data_stripes;
6312                         raid56_full_stripe_start = offset;
6313
6314                         /*
6315                          * Allow a write of a full stripe, but make sure we
6316                          * don't allow straddling of stripes
6317                          */
6318                         raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
6319                                         full_stripe_len);
6320                         raid56_full_stripe_start *= full_stripe_len;
6321
6322                         /*
6323                          * For writes to RAID[56], allow a full stripeset across
6324                          * all disks. For other RAID types and for RAID[56]
6325                          * reads, just allow a single stripe (on a single disk).
6326                          */
6327                         if (op == BTRFS_MAP_WRITE) {
6328                                 max_len = stripe_len * data_stripes -
6329                                           (offset - raid56_full_stripe_start);
6330                         }
6331                 }
6332                 len = min_t(u64, em->len - offset, max_len);
6333         } else {
6334                 len = em->len - offset;
6335         }
6336
6337         io_geom->len = len;
6338         io_geom->offset = offset;
6339         io_geom->stripe_len = stripe_len;
6340         io_geom->stripe_nr = stripe_nr;
6341         io_geom->stripe_offset = stripe_offset;
6342         io_geom->raid56_stripe_offset = raid56_full_stripe_start;
6343
6344         return 0;
6345 }
6346
6347 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
6348                              enum btrfs_map_op op,
6349                              u64 logical, u64 *length,
6350                              struct btrfs_io_context **bioc_ret,
6351                              int mirror_num, int need_raid_map)
6352 {
6353         struct extent_map *em;
6354         struct map_lookup *map;
6355         u64 stripe_offset;
6356         u64 stripe_nr;
6357         u64 stripe_len;
6358         u32 stripe_index;
6359         int data_stripes;
6360         int i;
6361         int ret = 0;
6362         int num_stripes;
6363         int max_errors = 0;
6364         int tgtdev_indexes = 0;
6365         struct btrfs_io_context *bioc = NULL;
6366         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
6367         int dev_replace_is_ongoing = 0;
6368         int num_alloc_stripes;
6369         int patch_the_first_stripe_for_dev_replace = 0;
6370         u64 physical_to_patch_in_first_stripe = 0;
6371         u64 raid56_full_stripe_start = (u64)-1;
6372         struct btrfs_io_geometry geom;
6373
6374         ASSERT(bioc_ret);
6375         ASSERT(op != BTRFS_MAP_DISCARD);
6376
6377         em = btrfs_get_chunk_map(fs_info, logical, *length);
6378         ASSERT(!IS_ERR(em));
6379
6380         ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom);
6381         if (ret < 0)
6382                 return ret;
6383
6384         map = em->map_lookup;
6385
6386         *length = geom.len;
6387         stripe_len = geom.stripe_len;
6388         stripe_nr = geom.stripe_nr;
6389         stripe_offset = geom.stripe_offset;
6390         raid56_full_stripe_start = geom.raid56_stripe_offset;
6391         data_stripes = nr_data_stripes(map);
6392
6393         down_read(&dev_replace->rwsem);
6394         dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
6395         /*
6396          * Hold the semaphore for read during the whole operation, write is
6397          * requested at commit time but must wait.
6398          */
6399         if (!dev_replace_is_ongoing)
6400                 up_read(&dev_replace->rwsem);
6401
6402         if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
6403             !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
6404                 ret = get_extra_mirror_from_replace(fs_info, logical, *length,
6405                                                     dev_replace->srcdev->devid,
6406                                                     &mirror_num,
6407                                             &physical_to_patch_in_first_stripe);
6408                 if (ret)
6409                         goto out;
6410                 else
6411                         patch_the_first_stripe_for_dev_replace = 1;
6412         } else if (mirror_num > map->num_stripes) {
6413                 mirror_num = 0;
6414         }
6415
6416         num_stripes = 1;
6417         stripe_index = 0;
6418         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
6419                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
6420                                 &stripe_index);
6421                 if (!need_full_stripe(op))
6422                         mirror_num = 1;
6423         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
6424                 if (need_full_stripe(op))
6425                         num_stripes = map->num_stripes;
6426                 else if (mirror_num)
6427                         stripe_index = mirror_num - 1;
6428                 else {
6429                         stripe_index = find_live_mirror(fs_info, map, 0,
6430                                             dev_replace_is_ongoing);
6431                         mirror_num = stripe_index + 1;
6432                 }
6433
6434         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
6435                 if (need_full_stripe(op)) {
6436                         num_stripes = map->num_stripes;
6437                 } else if (mirror_num) {
6438                         stripe_index = mirror_num - 1;
6439                 } else {
6440                         mirror_num = 1;
6441                 }
6442
6443         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
6444                 u32 factor = map->num_stripes / map->sub_stripes;
6445
6446                 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
6447                 stripe_index *= map->sub_stripes;
6448
6449                 if (need_full_stripe(op))
6450                         num_stripes = map->sub_stripes;
6451                 else if (mirror_num)
6452                         stripe_index += mirror_num - 1;
6453                 else {
6454                         int old_stripe_index = stripe_index;
6455                         stripe_index = find_live_mirror(fs_info, map,
6456                                               stripe_index,
6457                                               dev_replace_is_ongoing);
6458                         mirror_num = stripe_index - old_stripe_index + 1;
6459                 }
6460
6461         } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
6462                 if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
6463                         /* push stripe_nr back to the start of the full stripe */
6464                         stripe_nr = div64_u64(raid56_full_stripe_start,
6465                                         stripe_len * data_stripes);
6466
6467                         /* RAID[56] write or recovery. Return all stripes */
6468                         num_stripes = map->num_stripes;
6469                         max_errors = nr_parity_stripes(map);
6470
6471                         *length = map->stripe_len;
6472                         stripe_index = 0;
6473                         stripe_offset = 0;
6474                 } else {
6475                         /*
6476                          * Mirror #0 or #1 means the original data block.
6477                          * Mirror #2 is RAID5 parity block.
6478                          * Mirror #3 is RAID6 Q block.
6479                          */
6480                         stripe_nr = div_u64_rem(stripe_nr,
6481                                         data_stripes, &stripe_index);
6482                         if (mirror_num > 1)
6483                                 stripe_index = data_stripes + mirror_num - 2;
6484
6485                         /* We distribute the parity blocks across stripes */
6486                         div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
6487                                         &stripe_index);
6488                         if (!need_full_stripe(op) && mirror_num <= 1)
6489                                 mirror_num = 1;
6490                 }
6491         } else {
6492                 /*
6493                  * after this, stripe_nr is the number of stripes on this
6494                  * device we have to walk to find the data, and stripe_index is
6495                  * the number of our device in the stripe array
6496                  */
6497                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
6498                                 &stripe_index);
6499                 mirror_num = stripe_index + 1;
6500         }
6501         if (stripe_index >= map->num_stripes) {
6502                 btrfs_crit(fs_info,
6503                            "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
6504                            stripe_index, map->num_stripes);
6505                 ret = -EINVAL;
6506                 goto out;
6507         }
6508
6509         num_alloc_stripes = num_stripes;
6510         if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
6511                 if (op == BTRFS_MAP_WRITE)
6512                         num_alloc_stripes <<= 1;
6513                 if (op == BTRFS_MAP_GET_READ_MIRRORS)
6514                         num_alloc_stripes++;
6515                 tgtdev_indexes = num_stripes;
6516         }
6517
6518         bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes);
6519         if (!bioc) {
6520                 ret = -ENOMEM;
6521                 goto out;
6522         }
6523
6524         for (i = 0; i < num_stripes; i++) {
6525                 bioc->stripes[i].physical = map->stripes[stripe_index].physical +
6526                         stripe_offset + stripe_nr * map->stripe_len;
6527                 bioc->stripes[i].dev = map->stripes[stripe_index].dev;
6528                 stripe_index++;
6529         }
6530
6531         /* Build raid_map */
6532         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
6533             (need_full_stripe(op) || mirror_num > 1)) {
6534                 u64 tmp;
6535                 unsigned rot;
6536
6537                 /* Work out the disk rotation on this stripe-set */
6538                 div_u64_rem(stripe_nr, num_stripes, &rot);
6539
6540                 /* Fill in the logical address of each stripe */
6541                 tmp = stripe_nr * data_stripes;
6542                 for (i = 0; i < data_stripes; i++)
6543                         bioc->raid_map[(i + rot) % num_stripes] =
6544                                 em->start + (tmp + i) * map->stripe_len;
6545
6546                 bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE;
6547                 if (map->type & BTRFS_BLOCK_GROUP_RAID6)
6548                         bioc->raid_map[(i + rot + 1) % num_stripes] =
6549                                 RAID6_Q_STRIPE;
6550
6551                 sort_parity_stripes(bioc, num_stripes);
6552         }
6553
6554         if (need_full_stripe(op))
6555                 max_errors = btrfs_chunk_max_errors(map);
6556
6557         if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
6558             need_full_stripe(op)) {
6559                 handle_ops_on_dev_replace(op, &bioc, dev_replace, logical,
6560                                           &num_stripes, &max_errors);
6561         }
6562
6563         *bioc_ret = bioc;
6564         bioc->map_type = map->type;
6565         bioc->num_stripes = num_stripes;
6566         bioc->max_errors = max_errors;
6567         bioc->mirror_num = mirror_num;
6568
6569         /*
6570          * this is the case that REQ_READ && dev_replace_is_ongoing &&
6571          * mirror_num == num_stripes + 1 && dev_replace target drive is
6572          * available as a mirror
6573          */
6574         if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
6575                 WARN_ON(num_stripes > 1);
6576                 bioc->stripes[0].dev = dev_replace->tgtdev;
6577                 bioc->stripes[0].physical = physical_to_patch_in_first_stripe;
6578                 bioc->mirror_num = map->num_stripes + 1;
6579         }
6580 out:
6581         if (dev_replace_is_ongoing) {
6582                 lockdep_assert_held(&dev_replace->rwsem);
6583                 /* Unlock and let waiting writers proceed */
6584                 up_read(&dev_replace->rwsem);
6585         }
6586         free_extent_map(em);
6587         return ret;
6588 }
6589
6590 int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6591                       u64 logical, u64 *length,
6592                       struct btrfs_io_context **bioc_ret, int mirror_num)
6593 {
6594         return __btrfs_map_block(fs_info, op, logical, length, bioc_ret,
6595                                  mirror_num, 0);
6596 }
6597
6598 /* For Scrub/replace */
6599 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6600                      u64 logical, u64 *length,
6601                      struct btrfs_io_context **bioc_ret)
6602 {
6603         return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1);
6604 }
6605
6606 static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_io_context *bioc)
6607 {
6608         if (bioc->orig_bio->bi_opf & REQ_META)
6609                 return bioc->fs_info->endio_meta_workers;
6610         return bioc->fs_info->endio_workers;
6611 }
6612
6613 static void btrfs_end_bio_work(struct work_struct *work)
6614 {
6615         struct btrfs_bio *bbio =
6616                 container_of(work, struct btrfs_bio, end_io_work);
6617
6618         bio_endio(&bbio->bio);
6619 }
6620
6621 static void btrfs_end_bioc(struct btrfs_io_context *bioc, bool async)
6622 {
6623         struct bio *orig_bio = bioc->orig_bio;
6624         struct btrfs_bio *bbio = btrfs_bio(orig_bio);
6625
6626         bbio->mirror_num = bioc->mirror_num;
6627         orig_bio->bi_private = bioc->private;
6628         orig_bio->bi_end_io = bioc->end_io;
6629
6630         /*
6631          * Only send an error to the higher layers if it is beyond the tolerance
6632          * threshold.
6633          */
6634         if (atomic_read(&bioc->error) > bioc->max_errors)
6635                 orig_bio->bi_status = BLK_STS_IOERR;
6636         else
6637                 orig_bio->bi_status = BLK_STS_OK;
6638
6639         if (btrfs_op(orig_bio) == BTRFS_MAP_READ && async) {
6640                 INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
6641                 queue_work(btrfs_end_io_wq(bioc), &bbio->end_io_work);
6642         } else {
6643                 bio_endio(orig_bio);
6644         }
6645
6646         btrfs_put_bioc(bioc);
6647 }
6648
6649 static void btrfs_end_bio(struct bio *bio)
6650 {
6651         struct btrfs_io_stripe *stripe = bio->bi_private;
6652         struct btrfs_io_context *bioc = stripe->bioc;
6653
6654         if (bio->bi_status) {
6655                 atomic_inc(&bioc->error);
6656                 if (bio->bi_status == BLK_STS_IOERR ||
6657                     bio->bi_status == BLK_STS_TARGET) {
6658                         if (btrfs_op(bio) == BTRFS_MAP_WRITE)
6659                                 btrfs_dev_stat_inc_and_print(stripe->dev,
6660                                                 BTRFS_DEV_STAT_WRITE_ERRS);
6661                         else if (!(bio->bi_opf & REQ_RAHEAD))
6662                                 btrfs_dev_stat_inc_and_print(stripe->dev,
6663                                                 BTRFS_DEV_STAT_READ_ERRS);
6664                         if (bio->bi_opf & REQ_PREFLUSH)
6665                                 btrfs_dev_stat_inc_and_print(stripe->dev,
6666                                                 BTRFS_DEV_STAT_FLUSH_ERRS);
6667                 }
6668         }
6669
6670         if (bio != bioc->orig_bio)
6671                 bio_put(bio);
6672
6673         btrfs_bio_counter_dec(bioc->fs_info);
6674         if (atomic_dec_and_test(&bioc->stripes_pending))
6675                 btrfs_end_bioc(bioc, true);
6676 }
6677
6678 static void submit_stripe_bio(struct btrfs_io_context *bioc,
6679                               struct bio *orig_bio, int dev_nr, bool clone)
6680 {
6681         struct btrfs_fs_info *fs_info = bioc->fs_info;
6682         struct btrfs_device *dev = bioc->stripes[dev_nr].dev;
6683         u64 physical = bioc->stripes[dev_nr].physical;
6684         struct bio *bio;
6685
6686         if (!dev || !dev->bdev ||
6687             test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
6688             (btrfs_op(orig_bio) == BTRFS_MAP_WRITE &&
6689              !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
6690                 atomic_inc(&bioc->error);
6691                 if (atomic_dec_and_test(&bioc->stripes_pending))
6692                         btrfs_end_bioc(bioc, false);
6693                 return;
6694         }
6695
6696         if (clone) {
6697                 bio = bio_alloc_clone(dev->bdev, orig_bio, GFP_NOFS, &fs_bio_set);
6698         } else {
6699                 bio = orig_bio;
6700                 bio_set_dev(bio, dev->bdev);
6701                 btrfs_bio(bio)->device = dev;
6702         }
6703
6704         bioc->stripes[dev_nr].bioc = bioc;
6705         bio->bi_private = &bioc->stripes[dev_nr];
6706         bio->bi_end_io = btrfs_end_bio;
6707         bio->bi_iter.bi_sector = physical >> 9;
6708         /*
6709          * For zone append writing, bi_sector must point the beginning of the
6710          * zone
6711          */
6712         if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
6713                 if (btrfs_dev_is_sequential(dev, physical)) {
6714                         u64 zone_start = round_down(physical, fs_info->zone_size);
6715
6716                         bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
6717                 } else {
6718                         bio->bi_opf &= ~REQ_OP_ZONE_APPEND;
6719                         bio->bi_opf |= REQ_OP_WRITE;
6720                 }
6721         }
6722         btrfs_debug_in_rcu(fs_info,
6723         "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
6724                 bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
6725                 (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
6726                 dev->devid, bio->bi_iter.bi_size);
6727
6728         btrfs_bio_counter_inc_noblocked(fs_info);
6729
6730         btrfsic_check_bio(bio);
6731         submit_bio(bio);
6732 }
6733
6734 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
6735                            int mirror_num)
6736 {
6737         u64 logical = bio->bi_iter.bi_sector << 9;
6738         u64 length = bio->bi_iter.bi_size;
6739         u64 map_length = length;
6740         int ret;
6741         int dev_nr;
6742         int total_devs;
6743         struct btrfs_io_context *bioc = NULL;
6744
6745         btrfs_bio_counter_inc_blocked(fs_info);
6746         ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
6747                                 &map_length, &bioc, mirror_num, 1);
6748         if (ret)
6749                 goto out_dec;
6750
6751         total_devs = bioc->num_stripes;
6752         bioc->orig_bio = bio;
6753         bioc->private = bio->bi_private;
6754         bioc->end_io = bio->bi_end_io;
6755         atomic_set(&bioc->stripes_pending, total_devs);
6756
6757         if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
6758             ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) {
6759                 /* In this case, map_length has been set to the length of
6760                    a single stripe; not the whole write */
6761                 if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
6762                         ret = raid56_parity_write(bio, bioc, map_length);
6763                 } else {
6764                         ret = raid56_parity_recover(bio, bioc, map_length,
6765                                                     mirror_num, 1);
6766                 }
6767                 goto out_dec;
6768         }
6769
6770         if (map_length < length) {
6771                 btrfs_crit(fs_info,
6772                            "mapping failed logical %llu bio len %llu len %llu",
6773                            logical, length, map_length);
6774                 BUG();
6775         }
6776
6777         for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
6778                 const bool should_clone = (dev_nr < total_devs - 1);
6779
6780                 submit_stripe_bio(bioc, bio, dev_nr, should_clone);
6781         }
6782 out_dec:
6783         btrfs_bio_counter_dec(fs_info);
6784         return errno_to_blk_status(ret);
6785 }
6786
6787 static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
6788                                       const struct btrfs_fs_devices *fs_devices)
6789 {
6790         if (args->fsid == NULL)
6791                 return true;
6792         if (memcmp(fs_devices->metadata_uuid, args->fsid, BTRFS_FSID_SIZE) == 0)
6793                 return true;
6794         return false;
6795 }
6796
6797 static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args,
6798                                   const struct btrfs_device *device)
6799 {
6800         ASSERT((args->devid != (u64)-1) || args->missing);
6801
6802         if ((args->devid != (u64)-1) && device->devid != args->devid)
6803                 return false;
6804         if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0)
6805                 return false;
6806         if (!args->missing)
6807                 return true;
6808         if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) &&
6809             !device->bdev)
6810                 return true;
6811         return false;
6812 }
6813
6814 /*
6815  * Find a device specified by @devid or @uuid in the list of @fs_devices, or
6816  * return NULL.
6817  *
6818  * If devid and uuid are both specified, the match must be exact, otherwise
6819  * only devid is used.
6820  */
6821 struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices,
6822                                        const struct btrfs_dev_lookup_args *args)
6823 {
6824         struct btrfs_device *device;
6825         struct btrfs_fs_devices *seed_devs;
6826
6827         if (dev_args_match_fs_devices(args, fs_devices)) {
6828                 list_for_each_entry(device, &fs_devices->devices, dev_list) {
6829                         if (dev_args_match_device(args, device))
6830                                 return device;
6831                 }
6832         }
6833
6834         list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
6835                 if (!dev_args_match_fs_devices(args, seed_devs))
6836                         continue;
6837                 list_for_each_entry(device, &seed_devs->devices, dev_list) {
6838                         if (dev_args_match_device(args, device))
6839                                 return device;
6840                 }
6841         }
6842
6843         return NULL;
6844 }
6845
6846 static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
6847                                             u64 devid, u8 *dev_uuid)
6848 {
6849         struct btrfs_device *device;
6850         unsigned int nofs_flag;
6851
6852         /*
6853          * We call this under the chunk_mutex, so we want to use NOFS for this
6854          * allocation, however we don't want to change btrfs_alloc_device() to
6855          * always do NOFS because we use it in a lot of other GFP_KERNEL safe
6856          * places.
6857          */
6858         nofs_flag = memalloc_nofs_save();
6859         device = btrfs_alloc_device(NULL, &devid, dev_uuid);
6860         memalloc_nofs_restore(nofs_flag);
6861         if (IS_ERR(device))
6862                 return device;
6863
6864         list_add(&device->dev_list, &fs_devices->devices);
6865         device->fs_devices = fs_devices;
6866         fs_devices->num_devices++;
6867
6868         set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
6869         fs_devices->missing_devices++;
6870
6871         return device;
6872 }
6873
6874 /**
6875  * btrfs_alloc_device - allocate struct btrfs_device
6876  * @fs_info:    used only for generating a new devid, can be NULL if
6877  *              devid is provided (i.e. @devid != NULL).
6878  * @devid:      a pointer to devid for this device.  If NULL a new devid
6879  *              is generated.
6880  * @uuid:       a pointer to UUID for this device.  If NULL a new UUID
6881  *              is generated.
6882  *
6883  * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
6884  * on error.  Returned struct is not linked onto any lists and must be
6885  * destroyed with btrfs_free_device.
6886  */
6887 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
6888                                         const u64 *devid,
6889                                         const u8 *uuid)
6890 {
6891         struct btrfs_device *dev;
6892         u64 tmp;
6893
6894         if (WARN_ON(!devid && !fs_info))
6895                 return ERR_PTR(-EINVAL);
6896
6897         dev = kzalloc(sizeof(*dev), GFP_KERNEL);
6898         if (!dev)
6899                 return ERR_PTR(-ENOMEM);
6900
6901         INIT_LIST_HEAD(&dev->dev_list);
6902         INIT_LIST_HEAD(&dev->dev_alloc_list);
6903         INIT_LIST_HEAD(&dev->post_commit_list);
6904
6905         atomic_set(&dev->dev_stats_ccnt, 0);
6906         btrfs_device_data_ordered_init(dev);
6907         extent_io_tree_init(fs_info, &dev->alloc_state,
6908                             IO_TREE_DEVICE_ALLOC_STATE, NULL);
6909
6910         if (devid)
6911                 tmp = *devid;
6912         else {
6913                 int ret;
6914
6915                 ret = find_next_devid(fs_info, &tmp);
6916                 if (ret) {
6917                         btrfs_free_device(dev);
6918                         return ERR_PTR(ret);
6919                 }
6920         }
6921         dev->devid = tmp;
6922
6923         if (uuid)
6924                 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
6925         else
6926                 generate_random_uuid(dev->uuid);
6927
6928         return dev;
6929 }
6930
6931 static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
6932                                         u64 devid, u8 *uuid, bool error)
6933 {
6934         if (error)
6935                 btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
6936                               devid, uuid);
6937         else
6938                 btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
6939                               devid, uuid);
6940 }
6941
6942 static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
6943 {
6944         const int data_stripes = calc_data_stripes(type, num_stripes);
6945
6946         return div_u64(chunk_len, data_stripes);
6947 }
6948
6949 #if BITS_PER_LONG == 32
6950 /*
6951  * Due to page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE
6952  * can't be accessed on 32bit systems.
6953  *
6954  * This function do mount time check to reject the fs if it already has
6955  * metadata chunk beyond that limit.
6956  */
6957 static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
6958                                   u64 logical, u64 length, u64 type)
6959 {
6960         if (!(type & BTRFS_BLOCK_GROUP_METADATA))
6961                 return 0;
6962
6963         if (logical + length < MAX_LFS_FILESIZE)
6964                 return 0;
6965
6966         btrfs_err_32bit_limit(fs_info);
6967         return -EOVERFLOW;
6968 }
6969
6970 /*
6971  * This is to give early warning for any metadata chunk reaching
6972  * BTRFS_32BIT_EARLY_WARN_THRESHOLD.
6973  * Although we can still access the metadata, it's not going to be possible
6974  * once the limit is reached.
6975  */
6976 static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
6977                                   u64 logical, u64 length, u64 type)
6978 {
6979         if (!(type & BTRFS_BLOCK_GROUP_METADATA))
6980                 return;
6981
6982         if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD)
6983                 return;
6984
6985         btrfs_warn_32bit_limit(fs_info);
6986 }
6987 #endif
6988
6989 static struct btrfs_device *handle_missing_device(struct btrfs_fs_info *fs_info,
6990                                                   u64 devid, u8 *uuid)
6991 {
6992         struct btrfs_device *dev;
6993
6994         if (!btrfs_test_opt(fs_info, DEGRADED)) {
6995                 btrfs_report_missing_device(fs_info, devid, uuid, true);
6996                 return ERR_PTR(-ENOENT);
6997         }
6998
6999         dev = add_missing_dev(fs_info->fs_devices, devid, uuid);
7000         if (IS_ERR(dev)) {
7001                 btrfs_err(fs_info, "failed to init missing device %llu: %ld",
7002                           devid, PTR_ERR(dev));
7003                 return dev;
7004         }
7005         btrfs_report_missing_device(fs_info, devid, uuid, false);
7006
7007         return dev;
7008 }
7009
7010 static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
7011                           struct btrfs_chunk *chunk)
7012 {
7013         BTRFS_DEV_LOOKUP_ARGS(args);
7014         struct btrfs_fs_info *fs_info = leaf->fs_info;
7015         struct extent_map_tree *map_tree = &fs_info->mapping_tree;
7016         struct map_lookup *map;
7017         struct extent_map *em;
7018         u64 logical;
7019         u64 length;
7020         u64 devid;
7021         u64 type;
7022         u8 uuid[BTRFS_UUID_SIZE];
7023         int num_stripes;
7024         int ret;
7025         int i;
7026
7027         logical = key->offset;
7028         length = btrfs_chunk_length(leaf, chunk);
7029         type = btrfs_chunk_type(leaf, chunk);
7030         num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
7031
7032 #if BITS_PER_LONG == 32
7033         ret = check_32bit_meta_chunk(fs_info, logical, length, type);
7034         if (ret < 0)
7035                 return ret;
7036         warn_32bit_meta_chunk(fs_info, logical, length, type);
7037 #endif
7038
7039         /*
7040          * Only need to verify chunk item if we're reading from sys chunk array,
7041          * as chunk item in tree block is already verified by tree-checker.
7042          */
7043         if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
7044                 ret = btrfs_check_chunk_valid(leaf, chunk, logical);
7045                 if (ret)
7046                         return ret;
7047         }
7048
7049         read_lock(&map_tree->lock);
7050         em = lookup_extent_mapping(map_tree, logical, 1);
7051         read_unlock(&map_tree->lock);
7052
7053         /* already mapped? */
7054         if (em && em->start <= logical && em->start + em->len > logical) {
7055                 free_extent_map(em);
7056                 return 0;
7057         } else if (em) {
7058                 free_extent_map(em);
7059         }
7060
7061         em = alloc_extent_map();
7062         if (!em)
7063                 return -ENOMEM;
7064         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
7065         if (!map) {
7066                 free_extent_map(em);
7067                 return -ENOMEM;
7068         }
7069
7070         set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
7071         em->map_lookup = map;
7072         em->start = logical;
7073         em->len = length;
7074         em->orig_start = 0;
7075         em->block_start = 0;
7076         em->block_len = em->len;
7077
7078         map->num_stripes = num_stripes;
7079         map->io_width = btrfs_chunk_io_width(leaf, chunk);
7080         map->io_align = btrfs_chunk_io_align(leaf, chunk);
7081         map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
7082         map->type = type;
7083         map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
7084         map->verified_stripes = 0;
7085         em->orig_block_len = calc_stripe_length(type, em->len,
7086                                                 map->num_stripes);
7087         for (i = 0; i < num_stripes; i++) {
7088                 map->stripes[i].physical =
7089                         btrfs_stripe_offset_nr(leaf, chunk, i);
7090                 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
7091                 args.devid = devid;
7092                 read_extent_buffer(leaf, uuid, (unsigned long)
7093                                    btrfs_stripe_dev_uuid_nr(chunk, i),
7094                                    BTRFS_UUID_SIZE);
7095                 args.uuid = uuid;
7096                 map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args);
7097                 if (!map->stripes[i].dev) {
7098                         map->stripes[i].dev = handle_missing_device(fs_info,
7099                                                                     devid, uuid);
7100                         if (IS_ERR(map->stripes[i].dev)) {
7101                                 free_extent_map(em);
7102                                 return PTR_ERR(map->stripes[i].dev);
7103                         }
7104                 }
7105
7106                 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
7107                                 &(map->stripes[i].dev->dev_state));
7108         }
7109
7110         write_lock(&map_tree->lock);
7111         ret = add_extent_mapping(map_tree, em, 0);
7112         write_unlock(&map_tree->lock);
7113         if (ret < 0) {
7114                 btrfs_err(fs_info,
7115                           "failed to add chunk map, start=%llu len=%llu: %d",
7116                           em->start, em->len, ret);
7117         }
7118         free_extent_map(em);
7119
7120         return ret;
7121 }
7122
7123 static void fill_device_from_item(struct extent_buffer *leaf,
7124                                  struct btrfs_dev_item *dev_item,
7125                                  struct btrfs_device *device)
7126 {
7127         unsigned long ptr;
7128
7129         device->devid = btrfs_device_id(leaf, dev_item);
7130         device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
7131         device->total_bytes = device->disk_total_bytes;
7132         device->commit_total_bytes = device->disk_total_bytes;
7133         device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
7134         device->commit_bytes_used = device->bytes_used;
7135         device->type = btrfs_device_type(leaf, dev_item);
7136         device->io_align = btrfs_device_io_align(leaf, dev_item);
7137         device->io_width = btrfs_device_io_width(leaf, dev_item);
7138         device->sector_size = btrfs_device_sector_size(leaf, dev_item);
7139         WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
7140         clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
7141
7142         ptr = btrfs_device_uuid(dev_item);
7143         read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
7144 }
7145
7146 static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
7147                                                   u8 *fsid)
7148 {
7149         struct btrfs_fs_devices *fs_devices;
7150         int ret;
7151
7152         lockdep_assert_held(&uuid_mutex);
7153         ASSERT(fsid);
7154
7155         /* This will match only for multi-device seed fs */
7156         list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list)
7157                 if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
7158                         return fs_devices;
7159
7160
7161         fs_devices = find_fsid(fsid, NULL);
7162         if (!fs_devices) {
7163                 if (!btrfs_test_opt(fs_info, DEGRADED))
7164                         return ERR_PTR(-ENOENT);
7165
7166                 fs_devices = alloc_fs_devices(fsid, NULL);
7167                 if (IS_ERR(fs_devices))
7168                         return fs_devices;
7169
7170                 fs_devices->seeding = true;
7171                 fs_devices->opened = 1;
7172                 return fs_devices;
7173         }
7174
7175         /*
7176          * Upon first call for a seed fs fsid, just create a private copy of the
7177          * respective fs_devices and anchor it at fs_info->fs_devices->seed_list
7178          */
7179         fs_devices = clone_fs_devices(fs_devices);
7180         if (IS_ERR(fs_devices))
7181                 return fs_devices;
7182
7183         ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
7184         if (ret) {
7185                 free_fs_devices(fs_devices);
7186                 return ERR_PTR(ret);
7187         }
7188
7189         if (!fs_devices->seeding) {
7190                 close_fs_devices(fs_devices);
7191                 free_fs_devices(fs_devices);
7192                 return ERR_PTR(-EINVAL);
7193         }
7194
7195         list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);
7196
7197         return fs_devices;
7198 }
7199
7200 static int read_one_dev(struct extent_buffer *leaf,
7201                         struct btrfs_dev_item *dev_item)
7202 {
7203         BTRFS_DEV_LOOKUP_ARGS(args);
7204         struct btrfs_fs_info *fs_info = leaf->fs_info;
7205         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7206         struct btrfs_device *device;
7207         u64 devid;
7208         int ret;
7209         u8 fs_uuid[BTRFS_FSID_SIZE];
7210         u8 dev_uuid[BTRFS_UUID_SIZE];
7211
7212         devid = args.devid = btrfs_device_id(leaf, dev_item);
7213         read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
7214                            BTRFS_UUID_SIZE);
7215         read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
7216                            BTRFS_FSID_SIZE);
7217         args.uuid = dev_uuid;
7218         args.fsid = fs_uuid;
7219
7220         if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
7221                 fs_devices = open_seed_devices(fs_info, fs_uuid);
7222                 if (IS_ERR(fs_devices))
7223                         return PTR_ERR(fs_devices);
7224         }
7225
7226         device = btrfs_find_device(fs_info->fs_devices, &args);
7227         if (!device) {
7228                 if (!btrfs_test_opt(fs_info, DEGRADED)) {
7229                         btrfs_report_missing_device(fs_info, devid,
7230                                                         dev_uuid, true);
7231                         return -ENOENT;
7232                 }
7233
7234                 device = add_missing_dev(fs_devices, devid, dev_uuid);
7235                 if (IS_ERR(device)) {
7236                         btrfs_err(fs_info,
7237                                 "failed to add missing dev %llu: %ld",
7238                                 devid, PTR_ERR(device));
7239                         return PTR_ERR(device);
7240                 }
7241                 btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
7242         } else {
7243                 if (!device->bdev) {
7244                         if (!btrfs_test_opt(fs_info, DEGRADED)) {
7245                                 btrfs_report_missing_device(fs_info,
7246                                                 devid, dev_uuid, true);
7247                                 return -ENOENT;
7248                         }
7249                         btrfs_report_missing_device(fs_info, devid,
7250                                                         dev_uuid, false);
7251                 }
7252
7253                 if (!device->bdev &&
7254                     !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
7255                         /*
7256                          * this happens when a device that was properly setup
7257                          * in the device info lists suddenly goes bad.
7258                          * device->bdev is NULL, and so we have to set
7259                          * device->missing to one here
7260                          */
7261                         device->fs_devices->missing_devices++;
7262                         set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
7263                 }
7264
7265                 /* Move the device to its own fs_devices */
7266                 if (device->fs_devices != fs_devices) {
7267                         ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
7268                                                         &device->dev_state));
7269
7270                         list_move(&device->dev_list, &fs_devices->devices);
7271                         device->fs_devices->num_devices--;
7272                         fs_devices->num_devices++;
7273
7274                         device->fs_devices->missing_devices--;
7275                         fs_devices->missing_devices++;
7276
7277                         device->fs_devices = fs_devices;
7278                 }
7279         }
7280
7281         if (device->fs_devices != fs_info->fs_devices) {
7282                 BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
7283                 if (device->generation !=
7284                     btrfs_device_generation(leaf, dev_item))
7285                         return -EINVAL;
7286         }
7287
7288         fill_device_from_item(leaf, dev_item, device);
7289         if (device->bdev) {
7290                 u64 max_total_bytes = bdev_nr_bytes(device->bdev);
7291
7292                 if (device->total_bytes > max_total_bytes) {
7293                         btrfs_err(fs_info,
7294                         "device total_bytes should be at most %llu but found %llu",
7295                                   max_total_bytes, device->total_bytes);
7296                         return -EINVAL;
7297                 }
7298         }
7299         set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
7300         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
7301            !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
7302                 device->fs_devices->total_rw_bytes += device->total_bytes;
7303                 atomic64_add(device->total_bytes - device->bytes_used,
7304                                 &fs_info->free_chunk_space);
7305         }
7306         ret = 0;
7307         return ret;
7308 }
7309
7310 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
7311 {
7312         struct btrfs_super_block *super_copy = fs_info->super_copy;
7313         struct extent_buffer *sb;
7314         struct btrfs_disk_key *disk_key;
7315         struct btrfs_chunk *chunk;
7316         u8 *array_ptr;
7317         unsigned long sb_array_offset;
7318         int ret = 0;
7319         u32 num_stripes;
7320         u32 array_size;
7321         u32 len = 0;
7322         u32 cur_offset;
7323         u64 type;
7324         struct btrfs_key key;
7325
7326         ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
7327
7328         /*
7329          * We allocated a dummy extent, just to use extent buffer accessors.
7330          * There will be unused space after BTRFS_SUPER_INFO_SIZE, but
7331          * that's fine, we will not go beyond system chunk array anyway.
7332          */
7333         sb = alloc_dummy_extent_buffer(fs_info, BTRFS_SUPER_INFO_OFFSET);
7334         if (!sb)
7335                 return -ENOMEM;
7336         set_extent_buffer_uptodate(sb);
7337
7338         write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
7339         array_size = btrfs_super_sys_array_size(super_copy);
7340
7341         array_ptr = super_copy->sys_chunk_array;
7342         sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
7343         cur_offset = 0;
7344
7345         while (cur_offset < array_size) {
7346                 disk_key = (struct btrfs_disk_key *)array_ptr;
7347                 len = sizeof(*disk_key);
7348                 if (cur_offset + len > array_size)
7349                         goto out_short_read;
7350
7351                 btrfs_disk_key_to_cpu(&key, disk_key);
7352
7353                 array_ptr += len;
7354                 sb_array_offset += len;
7355                 cur_offset += len;
7356
7357                 if (key.type != BTRFS_CHUNK_ITEM_KEY) {
7358                         btrfs_err(fs_info,
7359                             "unexpected item type %u in sys_array at offset %u",
7360                                   (u32)key.type, cur_offset);
7361                         ret = -EIO;
7362                         break;
7363                 }
7364
7365                 chunk = (struct btrfs_chunk *)sb_array_offset;
7366                 /*
7367                  * At least one btrfs_chunk with one stripe must be present,
7368                  * exact stripe count check comes afterwards
7369                  */
7370                 len = btrfs_chunk_item_size(1);
7371                 if (cur_offset + len > array_size)
7372                         goto out_short_read;
7373
7374                 num_stripes = btrfs_chunk_num_stripes(sb, chunk);
7375                 if (!num_stripes) {
7376                         btrfs_err(fs_info,
7377                         "invalid number of stripes %u in sys_array at offset %u",
7378                                   num_stripes, cur_offset);
7379                         ret = -EIO;
7380                         break;
7381                 }
7382
7383                 type = btrfs_chunk_type(sb, chunk);
7384                 if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
7385                         btrfs_err(fs_info,
7386                         "invalid chunk type %llu in sys_array at offset %u",
7387                                   type, cur_offset);
7388                         ret = -EIO;
7389                         break;
7390                 }
7391
7392                 len = btrfs_chunk_item_size(num_stripes);
7393                 if (cur_offset + len > array_size)
7394                         goto out_short_read;
7395
7396                 ret = read_one_chunk(&key, sb, chunk);
7397                 if (ret)
7398                         break;
7399
7400                 array_ptr += len;
7401                 sb_array_offset += len;
7402                 cur_offset += len;
7403         }
7404         clear_extent_buffer_uptodate(sb);
7405         free_extent_buffer_stale(sb);
7406         return ret;
7407
7408 out_short_read:
7409         btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
7410                         len, cur_offset);
7411         clear_extent_buffer_uptodate(sb);
7412         free_extent_buffer_stale(sb);
7413         return -EIO;
7414 }
7415
7416 /*
7417  * Check if all chunks in the fs are OK for read-write degraded mount
7418  *
7419  * If the @failing_dev is specified, it's accounted as missing.
7420  *
7421  * Return true if all chunks meet the minimal RW mount requirements.
7422  * Return false if any chunk doesn't meet the minimal RW mount requirements.
7423  */
7424 bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
7425                                         struct btrfs_device *failing_dev)
7426 {
7427         struct extent_map_tree *map_tree = &fs_info->mapping_tree;
7428         struct extent_map *em;
7429         u64 next_start = 0;
7430         bool ret = true;
7431
7432         read_lock(&map_tree->lock);
7433         em = lookup_extent_mapping(map_tree, 0, (u64)-1);
7434         read_unlock(&map_tree->lock);
7435         /* No chunk at all? Return false anyway */
7436         if (!em) {
7437                 ret = false;
7438                 goto out;
7439         }
7440         while (em) {
7441                 struct map_lookup *map;
7442                 int missing = 0;
7443                 int max_tolerated;
7444                 int i;
7445
7446                 map = em->map_lookup;
7447                 max_tolerated =
7448                         btrfs_get_num_tolerated_disk_barrier_failures(
7449                                         map->type);
7450                 for (i = 0; i < map->num_stripes; i++) {
7451                         struct btrfs_device *dev = map->stripes[i].dev;
7452
7453                         if (!dev || !dev->bdev ||
7454                             test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
7455                             dev->last_flush_error)
7456                                 missing++;
7457                         else if (failing_dev && failing_dev == dev)
7458                                 missing++;
7459                 }
7460                 if (missing > max_tolerated) {
7461                         if (!failing_dev)
7462                                 btrfs_warn(fs_info,
7463         "chunk %llu missing %d devices, max tolerance is %d for writable mount",
7464                                    em->start, missing, max_tolerated);
7465                         free_extent_map(em);
7466                         ret = false;
7467                         goto out;
7468                 }
7469                 next_start = extent_map_end(em);
7470                 free_extent_map(em);
7471
7472                 read_lock(&map_tree->lock);
7473                 em = lookup_extent_mapping(map_tree, next_start,
7474                                            (u64)(-1) - next_start);
7475                 read_unlock(&map_tree->lock);
7476         }
7477 out:
7478         return ret;
7479 }
7480
/*
 * Call btrfs_readahead_node_child() for every slot of @node, as counted by
 * its header's item count.
 */
static void readahead_tree_node_children(struct extent_buffer *node)
{
	const int nr_items = btrfs_header_nritems(node);
	int slot = 0;

	while (slot < nr_items) {
		btrfs_readahead_node_child(node, slot);
		slot++;
	}
}
7489
7490 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
7491 {
7492         struct btrfs_root *root = fs_info->chunk_root;
7493         struct btrfs_path *path;
7494         struct extent_buffer *leaf;
7495         struct btrfs_key key;
7496         struct btrfs_key found_key;
7497         int ret;
7498         int slot;
7499         int iter_ret = 0;
7500         u64 total_dev = 0;
7501         u64 last_ra_node = 0;
7502
7503         path = btrfs_alloc_path();
7504         if (!path)
7505                 return -ENOMEM;
7506
7507         /*
7508          * uuid_mutex is needed only if we are mounting a sprout FS
7509          * otherwise we don't need it.
7510          */
7511         mutex_lock(&uuid_mutex);
7512
7513         /*
7514          * It is possible for mount and umount to race in such a way that
7515          * we execute this code path, but open_fs_devices failed to clear
7516          * total_rw_bytes. We certainly want it cleared before reading the
7517          * device items, so clear it here.
7518          */
7519         fs_info->fs_devices->total_rw_bytes = 0;
7520
7521         /*
7522          * Lockdep complains about possible circular locking dependency between
7523          * a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores
7524          * used for freeze procection of a fs (struct super_block.s_writers),
7525          * which we take when starting a transaction, and extent buffers of the
7526          * chunk tree if we call read_one_dev() while holding a lock on an
7527          * extent buffer of the chunk tree. Since we are mounting the filesystem
7528          * and at this point there can't be any concurrent task modifying the
7529          * chunk tree, to keep it simple, just skip locking on the chunk tree.
7530          */
7531         ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
7532         path->skip_locking = 1;
7533
7534         /*
7535          * Read all device items, and then all the chunk items. All
7536          * device items are found before any chunk item (their object id
7537          * is smaller than the lowest possible object id for a chunk
7538          * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
7539          */
7540         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
7541         key.offset = 0;
7542         key.type = 0;
7543         btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
7544                 struct extent_buffer *node = path->nodes[1];
7545
7546                 leaf = path->nodes[0];
7547                 slot = path->slots[0];
7548
7549                 if (node) {
7550                         if (last_ra_node != node->start) {
7551                                 readahead_tree_node_children(node);
7552                                 last_ra_node = node->start;
7553                         }
7554                 }
7555                 if (found_key.type == BTRFS_DEV_ITEM_KEY) {
7556                         struct btrfs_dev_item *dev_item;
7557                         dev_item = btrfs_item_ptr(leaf, slot,
7558                                                   struct btrfs_dev_item);
7559                         ret = read_one_dev(leaf, dev_item);
7560                         if (ret)
7561                                 goto error;
7562                         total_dev++;
7563                 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
7564                         struct btrfs_chunk *chunk;
7565
7566                         /*
7567                          * We are only called at mount time, so no need to take
7568                          * fs_info->chunk_mutex. Plus, to avoid lockdep warnings,
7569                          * we always lock first fs_info->chunk_mutex before
7570                          * acquiring any locks on the chunk tree. This is a
7571                          * requirement for chunk allocation, see the comment on
7572                          * top of btrfs_chunk_alloc() for details.
7573                          */
7574                         chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
7575                         ret = read_one_chunk(&found_key, leaf, chunk);
7576                         if (ret)
7577                                 goto error;
7578                 }
7579         }
7580         /* Catch error found during iteration */
7581         if (iter_ret < 0) {
7582                 ret = iter_ret;
7583                 goto error;
7584         }
7585
7586         /*
7587          * After loading chunk tree, we've got all device information,
7588          * do another round of validation checks.
7589          */
7590         if (total_dev != fs_info->fs_devices->total_devices) {
7591                 btrfs_warn(fs_info,
7592 "super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit",
7593                           btrfs_super_num_devices(fs_info->super_copy),
7594                           total_dev);
7595                 fs_info->fs_devices->total_devices = total_dev;
7596                 btrfs_set_super_num_devices(fs_info->super_copy, total_dev);
7597         }
7598         if (btrfs_super_total_bytes(fs_info->super_copy) <
7599             fs_info->fs_devices->total_rw_bytes) {
7600                 btrfs_err(fs_info,
7601         "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
7602                           btrfs_super_total_bytes(fs_info->super_copy),
7603                           fs_info->fs_devices->total_rw_bytes);
7604                 ret = -EINVAL;
7605                 goto error;
7606         }
7607         ret = 0;
7608 error:
7609         mutex_unlock(&uuid_mutex);
7610
7611         btrfs_free_path(path);
7612         return ret;
7613 }
7614
7615 void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
7616 {
7617         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
7618         struct btrfs_device *device;
7619
7620         fs_devices->fs_info = fs_info;
7621
7622         mutex_lock(&fs_devices->device_list_mutex);
7623         list_for_each_entry(device, &fs_devices->devices, dev_list)
7624                 device->fs_info = fs_info;
7625
7626         list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
7627                 list_for_each_entry(device, &seed_devs->devices, dev_list)
7628                         device->fs_info = fs_info;
7629
7630                 seed_devs->fs_info = fs_info;
7631         }
7632         mutex_unlock(&fs_devices->device_list_mutex);
7633 }
7634
7635 static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
7636                                  const struct btrfs_dev_stats_item *ptr,
7637                                  int index)
7638 {
7639         u64 val;
7640
7641         read_extent_buffer(eb, &val,
7642                            offsetof(struct btrfs_dev_stats_item, values) +
7643                             ((unsigned long)ptr) + (index * sizeof(u64)),
7644                            sizeof(val));
7645         return val;
7646 }
7647
7648 static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
7649                                       struct btrfs_dev_stats_item *ptr,
7650                                       int index, u64 val)
7651 {
7652         write_extent_buffer(eb, &val,
7653                             offsetof(struct btrfs_dev_stats_item, values) +
7654                              ((unsigned long)ptr) + (index * sizeof(u64)),
7655                             sizeof(val));
7656 }
7657
7658 static int btrfs_device_init_dev_stats(struct btrfs_device *device,
7659                                        struct btrfs_path *path)
7660 {
7661         struct btrfs_dev_stats_item *ptr;
7662         struct extent_buffer *eb;
7663         struct btrfs_key key;
7664         int item_size;
7665         int i, ret, slot;
7666
7667         if (!device->fs_info->dev_root)
7668                 return 0;
7669
7670         key.objectid = BTRFS_DEV_STATS_OBJECTID;
7671         key.type = BTRFS_PERSISTENT_ITEM_KEY;
7672         key.offset = device->devid;
7673         ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
7674         if (ret) {
7675                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7676                         btrfs_dev_stat_set(device, i, 0);
7677                 device->dev_stats_valid = 1;
7678                 btrfs_release_path(path);
7679                 return ret < 0 ? ret : 0;
7680         }
7681         slot = path->slots[0];
7682         eb = path->nodes[0];
7683         item_size = btrfs_item_size(eb, slot);
7684
7685         ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);
7686
7687         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7688                 if (item_size >= (1 + i) * sizeof(__le64))
7689                         btrfs_dev_stat_set(device, i,
7690                                            btrfs_dev_stats_value(eb, ptr, i));
7691                 else
7692                         btrfs_dev_stat_set(device, i, 0);
7693         }
7694
7695         device->dev_stats_valid = 1;
7696         btrfs_dev_stat_print_on_load(device);
7697         btrfs_release_path(path);
7698
7699         return 0;
7700 }
7701
7702 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
7703 {
7704         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
7705         struct btrfs_device *device;
7706         struct btrfs_path *path = NULL;
7707         int ret = 0;
7708
7709         path = btrfs_alloc_path();
7710         if (!path)
7711                 return -ENOMEM;
7712
7713         mutex_lock(&fs_devices->device_list_mutex);
7714         list_for_each_entry(device, &fs_devices->devices, dev_list) {
7715                 ret = btrfs_device_init_dev_stats(device, path);
7716                 if (ret)
7717                         goto out;
7718         }
7719         list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
7720                 list_for_each_entry(device, &seed_devs->devices, dev_list) {
7721                         ret = btrfs_device_init_dev_stats(device, path);
7722                         if (ret)
7723                                 goto out;
7724                 }
7725         }
7726 out:
7727         mutex_unlock(&fs_devices->device_list_mutex);
7728
7729         btrfs_free_path(path);
7730         return ret;
7731 }
7732
7733 static int update_dev_stat_item(struct btrfs_trans_handle *trans,
7734                                 struct btrfs_device *device)
7735 {
7736         struct btrfs_fs_info *fs_info = trans->fs_info;
7737         struct btrfs_root *dev_root = fs_info->dev_root;
7738         struct btrfs_path *path;
7739         struct btrfs_key key;
7740         struct extent_buffer *eb;
7741         struct btrfs_dev_stats_item *ptr;
7742         int ret;
7743         int i;
7744
7745         key.objectid = BTRFS_DEV_STATS_OBJECTID;
7746         key.type = BTRFS_PERSISTENT_ITEM_KEY;
7747         key.offset = device->devid;
7748
7749         path = btrfs_alloc_path();
7750         if (!path)
7751                 return -ENOMEM;
7752         ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
7753         if (ret < 0) {
7754                 btrfs_warn_in_rcu(fs_info,
7755                         "error %d while searching for dev_stats item for device %s",
7756                               ret, rcu_str_deref(device->name));
7757                 goto out;
7758         }
7759
7760         if (ret == 0 &&
7761             btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
7762                 /* need to delete old one and insert a new one */
7763                 ret = btrfs_del_item(trans, dev_root, path);
7764                 if (ret != 0) {
7765                         btrfs_warn_in_rcu(fs_info,
7766                                 "delete too small dev_stats item for device %s failed %d",
7767                                       rcu_str_deref(device->name), ret);
7768                         goto out;
7769                 }
7770                 ret = 1;
7771         }
7772
7773         if (ret == 1) {
7774                 /* need to insert a new item */
7775                 btrfs_release_path(path);
7776                 ret = btrfs_insert_empty_item(trans, dev_root, path,
7777                                               &key, sizeof(*ptr));
7778                 if (ret < 0) {
7779                         btrfs_warn_in_rcu(fs_info,
7780                                 "insert dev_stats item for device %s failed %d",
7781                                 rcu_str_deref(device->name), ret);
7782                         goto out;
7783                 }
7784         }
7785
7786         eb = path->nodes[0];
7787         ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
7788         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7789                 btrfs_set_dev_stats_value(eb, ptr, i,
7790                                           btrfs_dev_stat_read(device, i));
7791         btrfs_mark_buffer_dirty(eb);
7792
7793 out:
7794         btrfs_free_path(path);
7795         return ret;
7796 }
7797
7798 /*
7799  * called from commit_transaction. Writes all changed device stats to disk.
7800  */
7801 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
7802 {
7803         struct btrfs_fs_info *fs_info = trans->fs_info;
7804         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7805         struct btrfs_device *device;
7806         int stats_cnt;
7807         int ret = 0;
7808
7809         mutex_lock(&fs_devices->device_list_mutex);
7810         list_for_each_entry(device, &fs_devices->devices, dev_list) {
7811                 stats_cnt = atomic_read(&device->dev_stats_ccnt);
7812                 if (!device->dev_stats_valid || stats_cnt == 0)
7813                         continue;
7814
7815
7816                 /*
7817                  * There is a LOAD-LOAD control dependency between the value of
7818                  * dev_stats_ccnt and updating the on-disk values which requires
7819                  * reading the in-memory counters. Such control dependencies
7820                  * require explicit read memory barriers.
7821                  *
7822                  * This memory barriers pairs with smp_mb__before_atomic in
7823                  * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
7824                  * barrier implied by atomic_xchg in
7825                  * btrfs_dev_stats_read_and_reset
7826                  */
7827                 smp_rmb();
7828
7829                 ret = update_dev_stat_item(trans, device);
7830                 if (!ret)
7831                         atomic_sub(stats_cnt, &device->dev_stats_ccnt);
7832         }
7833         mutex_unlock(&fs_devices->device_list_mutex);
7834
7835         return ret;
7836 }
7837
/*
 * Increment the stat counter @index of @dev and then report the device's
 * counters through btrfs_dev_stat_print_on_error().
 */
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
	btrfs_dev_stat_inc(dev, index);
	btrfs_dev_stat_print_on_error(dev);
}
7843
7844 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
7845 {
7846         if (!dev->dev_stats_valid)
7847                 return;
7848         btrfs_err_rl_in_rcu(dev->fs_info,
7849                 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7850                            rcu_str_deref(dev->name),
7851                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
7852                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
7853                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7854                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
7855                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7856 }
7857
7858 static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
7859 {
7860         int i;
7861
7862         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7863                 if (btrfs_dev_stat_read(dev, i) != 0)
7864                         break;
7865         if (i == BTRFS_DEV_STAT_VALUES_MAX)
7866                 return; /* all values == 0, suppress message */
7867
7868         btrfs_info_in_rcu(dev->fs_info,
7869                 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7870                rcu_str_deref(dev->name),
7871                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
7872                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
7873                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7874                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
7875                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7876 }
7877
7878 int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
7879                         struct btrfs_ioctl_get_dev_stats *stats)
7880 {
7881         BTRFS_DEV_LOOKUP_ARGS(args);
7882         struct btrfs_device *dev;
7883         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7884         int i;
7885
7886         mutex_lock(&fs_devices->device_list_mutex);
7887         args.devid = stats->devid;
7888         dev = btrfs_find_device(fs_info->fs_devices, &args);
7889         mutex_unlock(&fs_devices->device_list_mutex);
7890
7891         if (!dev) {
7892                 btrfs_warn(fs_info, "get dev_stats failed, device not found");
7893                 return -ENODEV;
7894         } else if (!dev->dev_stats_valid) {
7895                 btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
7896                 return -ENODEV;
7897         } else if (stats->flags & BTRFS_DEV_STATS_RESET) {
7898                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7899                         if (stats->nr_items > i)
7900                                 stats->values[i] =
7901                                         btrfs_dev_stat_read_and_reset(dev, i);
7902                         else
7903                                 btrfs_dev_stat_set(dev, i, 0);
7904                 }
7905                 btrfs_info(fs_info, "device stats zeroed by %s (%d)",
7906                            current->comm, task_pid_nr(current));
7907         } else {
7908                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7909                         if (stats->nr_items > i)
7910                                 stats->values[i] = btrfs_dev_stat_read(dev, i);
7911         }
7912         if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
7913                 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
7914         return 0;
7915 }
7916
7917 /*
7918  * Update the size and bytes used for each device where it changed.  This is
7919  * delayed since we would otherwise get errors while writing out the
7920  * superblocks.
7921  *
7922  * Must be invoked during transaction commit.
7923  */
7924 void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
7925 {
7926         struct btrfs_device *curr, *next;
7927
7928         ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);
7929
7930         if (list_empty(&trans->dev_update_list))
7931                 return;
7932
7933         /*
7934          * We don't need the device_list_mutex here.  This list is owned by the
7935          * transaction and the transaction must complete before the device is
7936          * released.
7937          */
7938         mutex_lock(&trans->fs_info->chunk_mutex);
7939         list_for_each_entry_safe(curr, next, &trans->dev_update_list,
7940                                  post_commit_list) {
7941                 list_del_init(&curr->post_commit_list);
7942                 curr->commit_total_bytes = curr->disk_total_bytes;
7943                 curr->commit_bytes_used = curr->bytes_used;
7944         }
7945         mutex_unlock(&trans->fs_info->chunk_mutex);
7946 }
7947
7948 /*
7949  * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
7950  */
7951 int btrfs_bg_type_to_factor(u64 flags)
7952 {
7953         const int index = btrfs_bg_flags_to_raid_index(flags);
7954
7955         return btrfs_raid_array[index].ncopies;
7956 }
7957
7958
7959
7960 static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
7961                                  u64 chunk_offset, u64 devid,
7962                                  u64 physical_offset, u64 physical_len)
7963 {
7964         struct btrfs_dev_lookup_args args = { .devid = devid };
7965         struct extent_map_tree *em_tree = &fs_info->mapping_tree;
7966         struct extent_map *em;
7967         struct map_lookup *map;
7968         struct btrfs_device *dev;
7969         u64 stripe_len;
7970         bool found = false;
7971         int ret = 0;
7972         int i;
7973
7974         read_lock(&em_tree->lock);
7975         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
7976         read_unlock(&em_tree->lock);
7977
7978         if (!em) {
7979                 btrfs_err(fs_info,
7980 "dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
7981                           physical_offset, devid);
7982                 ret = -EUCLEAN;
7983                 goto out;
7984         }
7985
7986         map = em->map_lookup;
7987         stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
7988         if (physical_len != stripe_len) {
7989                 btrfs_err(fs_info,
7990 "dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
7991                           physical_offset, devid, em->start, physical_len,
7992                           stripe_len);
7993                 ret = -EUCLEAN;
7994                 goto out;
7995         }
7996
7997         for (i = 0; i < map->num_stripes; i++) {
7998                 if (map->stripes[i].dev->devid == devid &&
7999                     map->stripes[i].physical == physical_offset) {
8000                         found = true;
8001                         if (map->verified_stripes >= map->num_stripes) {
8002                                 btrfs_err(fs_info,
8003                                 "too many dev extents for chunk %llu found",
8004                                           em->start);
8005                                 ret = -EUCLEAN;
8006                                 goto out;
8007                         }
8008                         map->verified_stripes++;
8009                         break;
8010                 }
8011         }
8012         if (!found) {
8013                 btrfs_err(fs_info,
8014         "dev extent physical offset %llu devid %llu has no corresponding chunk",
8015                         physical_offset, devid);
8016                 ret = -EUCLEAN;
8017         }
8018
8019         /* Make sure no dev extent is beyond device boundary */
8020         dev = btrfs_find_device(fs_info->fs_devices, &args);
8021         if (!dev) {
8022                 btrfs_err(fs_info, "failed to find devid %llu", devid);
8023                 ret = -EUCLEAN;
8024                 goto out;
8025         }
8026
8027         if (physical_offset + physical_len > dev->disk_total_bytes) {
8028                 btrfs_err(fs_info,
8029 "dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
8030                           devid, physical_offset, physical_len,
8031                           dev->disk_total_bytes);
8032                 ret = -EUCLEAN;
8033                 goto out;
8034         }
8035
8036         if (dev->zone_info) {
8037                 u64 zone_size = dev->zone_info->zone_size;
8038
8039                 if (!IS_ALIGNED(physical_offset, zone_size) ||
8040                     !IS_ALIGNED(physical_len, zone_size)) {
8041                         btrfs_err(fs_info,
8042 "zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
8043                                   devid, physical_offset, physical_len);
8044                         ret = -EUCLEAN;
8045                         goto out;
8046                 }
8047         }
8048
8049 out:
8050         free_extent_map(em);
8051         return ret;
8052 }
8053
8054 static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
8055 {
8056         struct extent_map_tree *em_tree = &fs_info->mapping_tree;
8057         struct extent_map *em;
8058         struct rb_node *node;
8059         int ret = 0;
8060
8061         read_lock(&em_tree->lock);
8062         for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
8063                 em = rb_entry(node, struct extent_map, rb_node);
8064                 if (em->map_lookup->num_stripes !=
8065                     em->map_lookup->verified_stripes) {
8066                         btrfs_err(fs_info,
8067                         "chunk %llu has missing dev extent, have %d expect %d",
8068                                   em->start, em->map_lookup->verified_stripes,
8069                                   em->map_lookup->num_stripes);
8070                         ret = -EUCLEAN;
8071                         goto out;
8072                 }
8073         }
8074 out:
8075         read_unlock(&em_tree->lock);
8076         return ret;
8077 }
8078
8079 /*
8080  * Ensure that all dev extents are mapped to correct chunk, otherwise
8081  * later chunk allocation/free would cause unexpected behavior.
8082  *
8083  * NOTE: This will iterate through the whole device tree, which should be of
8084  * the same size level as the chunk tree.  This slightly increases mount time.
8085  */
8086 int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
8087 {
8088         struct btrfs_path *path;
8089         struct btrfs_root *root = fs_info->dev_root;
8090         struct btrfs_key key;
8091         u64 prev_devid = 0;
8092         u64 prev_dev_ext_end = 0;
8093         int ret = 0;
8094
8095         /*
8096          * We don't have a dev_root because we mounted with ignorebadroots and
8097          * failed to load the root, so we want to skip the verification in this
8098          * case for sure.
8099          *
8100          * However if the dev root is fine, but the tree itself is corrupted
8101          * we'd still fail to mount.  This verification is only to make sure
8102          * writes can happen safely, so instead just bypass this check
8103          * completely in the case of IGNOREBADROOTS.
8104          */
8105         if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
8106                 return 0;
8107
8108         key.objectid = 1;
8109         key.type = BTRFS_DEV_EXTENT_KEY;
8110         key.offset = 0;
8111
8112         path = btrfs_alloc_path();
8113         if (!path)
8114                 return -ENOMEM;
8115
8116         path->reada = READA_FORWARD;
8117         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
8118         if (ret < 0)
8119                 goto out;
8120
8121         if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
8122                 ret = btrfs_next_leaf(root, path);
8123                 if (ret < 0)
8124                         goto out;
8125                 /* No dev extents at all? Not good */
8126                 if (ret > 0) {
8127                         ret = -EUCLEAN;
8128                         goto out;
8129                 }
8130         }
8131         while (1) {
8132                 struct extent_buffer *leaf = path->nodes[0];
8133                 struct btrfs_dev_extent *dext;
8134                 int slot = path->slots[0];
8135                 u64 chunk_offset;
8136                 u64 physical_offset;
8137                 u64 physical_len;
8138                 u64 devid;
8139
8140                 btrfs_item_key_to_cpu(leaf, &key, slot);
8141                 if (key.type != BTRFS_DEV_EXTENT_KEY)
8142                         break;
8143                 devid = key.objectid;
8144                 physical_offset = key.offset;
8145
8146                 dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
8147                 chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
8148                 physical_len = btrfs_dev_extent_length(leaf, dext);
8149
8150                 /* Check if this dev extent overlaps with the previous one */
8151                 if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
8152                         btrfs_err(fs_info,
8153 "dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
8154                                   devid, physical_offset, prev_dev_ext_end);
8155                         ret = -EUCLEAN;
8156                         goto out;
8157                 }
8158
8159                 ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
8160                                             physical_offset, physical_len);
8161                 if (ret < 0)
8162                         goto out;
8163                 prev_devid = devid;
8164                 prev_dev_ext_end = physical_offset + physical_len;
8165
8166                 ret = btrfs_next_item(root, path);
8167                 if (ret < 0)
8168                         goto out;
8169                 if (ret > 0) {
8170                         ret = 0;
8171                         break;
8172                 }
8173         }
8174
8175         /* Ensure all chunks have corresponding dev extents */
8176         ret = verify_chunk_dev_extent_mapping(fs_info);
8177 out:
8178         btrfs_free_path(path);
8179         return ret;
8180 }
8181
8182 /*
8183  * Check whether the given block group or device is pinned by any inode being
8184  * used as a swapfile.
8185  */
8186 bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
8187 {
8188         struct btrfs_swapfile_pin *sp;
8189         struct rb_node *node;
8190
8191         spin_lock(&fs_info->swapfile_pins_lock);
8192         node = fs_info->swapfile_pins.rb_node;
8193         while (node) {
8194                 sp = rb_entry(node, struct btrfs_swapfile_pin, node);
8195                 if (ptr < sp->ptr)
8196                         node = node->rb_left;
8197                 else if (ptr > sp->ptr)
8198                         node = node->rb_right;
8199                 else
8200                         break;
8201         }
8202         spin_unlock(&fs_info->swapfile_pins_lock);
8203         return node != NULL;
8204 }
8205
8206 static int relocating_repair_kthread(void *data)
8207 {
8208         struct btrfs_block_group *cache = data;
8209         struct btrfs_fs_info *fs_info = cache->fs_info;
8210         u64 target;
8211         int ret = 0;
8212
8213         target = cache->start;
8214         btrfs_put_block_group(cache);
8215
8216         sb_start_write(fs_info->sb);
8217         if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
8218                 btrfs_info(fs_info,
8219                            "zoned: skip relocating block group %llu to repair: EBUSY",
8220                            target);
8221                 sb_end_write(fs_info->sb);
8222                 return -EBUSY;
8223         }
8224
8225         mutex_lock(&fs_info->reclaim_bgs_lock);
8226
8227         /* Ensure block group still exists */
8228         cache = btrfs_lookup_block_group(fs_info, target);
8229         if (!cache)
8230                 goto out;
8231
8232         if (!cache->relocating_repair)
8233                 goto out;
8234
8235         ret = btrfs_may_alloc_data_chunk(fs_info, target);
8236         if (ret < 0)
8237                 goto out;
8238
8239         btrfs_info(fs_info,
8240                    "zoned: relocating block group %llu to repair IO failure",
8241                    target);
8242         ret = btrfs_relocate_chunk(fs_info, target);
8243
8244 out:
8245         if (cache)
8246                 btrfs_put_block_group(cache);
8247         mutex_unlock(&fs_info->reclaim_bgs_lock);
8248         btrfs_exclop_finish(fs_info);
8249         sb_end_write(fs_info->sb);
8250
8251         return ret;
8252 }
8253
8254 bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
8255 {
8256         struct btrfs_block_group *cache;
8257
8258         if (!btrfs_is_zoned(fs_info))
8259                 return false;
8260
8261         /* Do not attempt to repair in degraded state */
8262         if (btrfs_test_opt(fs_info, DEGRADED))
8263                 return true;
8264
8265         cache = btrfs_lookup_block_group(fs_info, logical);
8266         if (!cache)
8267                 return true;
8268
8269         spin_lock(&cache->lock);
8270         if (cache->relocating_repair) {
8271                 spin_unlock(&cache->lock);
8272                 btrfs_put_block_group(cache);
8273                 return true;
8274         }
8275         cache->relocating_repair = 1;
8276         spin_unlock(&cache->lock);
8277
8278         kthread_run(relocating_repair_kthread, cache,
8279                     "btrfs-relocating-repair");
8280
8281         return true;
8282 }