fs/btrfs/volumes.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (C) 2007 Oracle.  All rights reserved.
   4  */
   5
   6 #include <linux/sched.h>
   7 #include <linux/sched/mm.h>
   8 #include <linux/bio.h>
   9 #include <linux/slab.h>
  10 #include <linux/blkdev.h>
  11 #include <linux/ratelimit.h>
  12 #include <linux/kthread.h>
  13 #include <linux/raid/pq.h>
  14 #include <linux/semaphore.h>
  15 #include <linux/uuid.h>
  16 #include <linux/list_sort.h>
  17 #include <linux/namei.h>
  18 #include "misc.h"
  19 #include "ctree.h"
  20 #include "extent_map.h"
  21 #include "disk-io.h"
  22 #include "transaction.h"
  23 #include "print-tree.h"
  24 #include "volumes.h"
  25 #include "raid56.h"
  26 #include "async-thread.h"
  27 #include "check-integrity.h"
  28 #include "rcu-string.h"
  29 #include "dev-replace.h"
  30 #include "sysfs.h"
  31 #include "tree-checker.h"
  32 #include "space-info.h"
  33 #include "block-group.h"
  34 #include "discard.h"
  35 #include "zoned.h"
  36
  37 #define BTRFS_BLOCK_GROUP_STRIPE_MASK   (BTRFS_BLOCK_GROUP_RAID0 | \
  38                                          BTRFS_BLOCK_GROUP_RAID10 | \
  39                                          BTRFS_BLOCK_GROUP_RAID56_MASK)
  40
  41 const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
  42         [BTRFS_RAID_RAID10] = {
  43                 .sub_stripes    = 2,
  44                 .dev_stripes    = 1,
  45                 .devs_max       = 0,    /* 0 == as many as possible */
  46                 .devs_min       = 2,
  47                 .tolerated_failures = 1,
  48                 .devs_increment = 2,
  49                 .ncopies        = 2,
  50                 .nparity        = 0,
  51                 .raid_name      = "raid10",
  52                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID10,
  53                 .mindev_error   = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
  54         },
  55         [BTRFS_RAID_RAID1] = {
  56                 .sub_stripes    = 1,
  57                 .dev_stripes    = 1,
  58                 .devs_max       = 2,
  59                 .devs_min       = 2,
  60                 .tolerated_failures = 1,
  61                 .devs_increment = 2,
  62                 .ncopies        = 2,
  63                 .nparity        = 0,
  64                 .raid_name      = "raid1",
  65                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID1,
  66                 .mindev_error   = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
  67         },
  68         [BTRFS_RAID_RAID1C3] = {
  69                 .sub_stripes    = 1,
  70                 .dev_stripes    = 1,
  71                 .devs_max       = 3,
  72                 .devs_min       = 3,
  73                 .tolerated_failures = 2,
  74                 .devs_increment = 3,
  75                 .ncopies        = 3,
  76                 .nparity        = 0,
  77                 .raid_name      = "raid1c3",
  78                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID1C3,
  79                 .mindev_error   = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
  80         },
  81         [BTRFS_RAID_RAID1C4] = {
  82                 .sub_stripes    = 1,
  83                 .dev_stripes    = 1,
  84                 .devs_max       = 4,
  85                 .devs_min       = 4,
  86                 .tolerated_failures = 3,
  87                 .devs_increment = 4,
  88                 .ncopies        = 4,
  89                 .nparity        = 0,
  90                 .raid_name      = "raid1c4",
  91                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID1C4,
  92                 .mindev_error   = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
  93         },
  94         [BTRFS_RAID_DUP] = {
  95                 .sub_stripes    = 1,
  96                 .dev_stripes    = 2,
  97                 .devs_max       = 1,
  98                 .devs_min       = 1,
  99                 .tolerated_failures = 0,
 100                 .devs_increment = 1,
 101                 .ncopies        = 2,
 102                 .nparity        = 0,
 103                 .raid_name      = "dup",
 104                 .bg_flag        = BTRFS_BLOCK_GROUP_DUP,
 105                 .mindev_error   = 0,
 106         },
 107         [BTRFS_RAID_RAID0] = {
 108                 .sub_stripes    = 1,
 109                 .dev_stripes    = 1,
 110                 .devs_max       = 0,
 111                 .devs_min       = 1,
 112                 .tolerated_failures = 0,
 113                 .devs_increment = 1,
 114                 .ncopies        = 1,
 115                 .nparity        = 0,
 116                 .raid_name      = "raid0",
 117                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID0,
 118                 .mindev_error   = 0,
 119         },
 120         [BTRFS_RAID_SINGLE] = {
 121                 .sub_stripes    = 1,
 122                 .dev_stripes    = 1,
 123                 .devs_max       = 1,
 124                 .devs_min       = 1,
 125                 .tolerated_failures = 0,
 126                 .devs_increment = 1,
 127                 .ncopies        = 1,
 128                 .nparity        = 0,
 129                 .raid_name      = "single",
 130                 .bg_flag        = 0,
 131                 .mindev_error   = 0,
 132         },
 133         [BTRFS_RAID_RAID5] = {
 134                 .sub_stripes    = 1,
 135                 .dev_stripes    = 1,
 136                 .devs_max       = 0,
 137                 .devs_min       = 2,
 138                 .tolerated_failures = 1,
 139                 .devs_increment = 1,
 140                 .ncopies        = 1,
 141                 .nparity        = 1,
 142                 .raid_name      = "raid5",
 143                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID5,
 144                 .mindev_error   = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
 145         },
 146         [BTRFS_RAID_RAID6] = {
 147                 .sub_stripes    = 1,
 148                 .dev_stripes    = 1,
 149                 .devs_max       = 0,
 150                 .devs_min       = 3,
 151                 .tolerated_failures = 2,
 152                 .devs_increment = 1,
 153                 .ncopies        = 1,
 154                 .nparity        = 2,
 155                 .raid_name      = "raid6",
 156                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID6,
 157                 .mindev_error   = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
 158         },
 159 };
 160
 161 /*
 162  * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
 163  * can be used as index to access btrfs_raid_array[].
 164  */
 165 enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
 166 {
 167         const u64 profile = (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK);
 168
 169         if (!profile)
 170                 return BTRFS_RAID_SINGLE;
 171
 172         return BTRFS_BG_FLAG_TO_INDEX(profile);
 173 }
 174
 175 const char *btrfs_bg_type_to_raid_name(u64 flags)
 176 {
 177         const int index = btrfs_bg_flags_to_raid_index(flags);
 178
 179         if (index >= BTRFS_NR_RAID_TYPES)
 180                 return NULL;
 181
 182         return btrfs_raid_array[index].raid_name;
 183 }
 184
 185 /*
 186  * Fill @buf with textual description of @bg_flags, no more than @size_buf
 187  * bytes including terminating null byte.
 188  */
 189 void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
 190 {
 191         int i;
 192         int ret;
 193         char *bp = buf;
 194         u64 flags = bg_flags;
 195         u32 size_bp = size_buf;
 196
 197         if (!flags) {
 198                 strcpy(bp, "NONE");
 199                 return;
 200         }
 201
 202 #define DESCRIBE_FLAG(flag, desc)                                               \
 203         do {                                                            \
 204                 if (flags & (flag)) {                                   \
 205                         ret = snprintf(bp, size_bp, "%s|", (desc));     \
 206                         if (ret < 0 || ret >= size_bp)                  \
 207                                 goto out_overflow;                      \
 208                         size_bp -= ret;                                 \
 209                         bp += ret;                                      \
 210                         flags &= ~(flag);                               \
 211                 }                                                       \
 212         } while (0)
 213
 214         DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
 215         DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
 216         DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
 217
 218         DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
 219         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
 220                 DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
 221                               btrfs_raid_array[i].raid_name);
 222 #undef DESCRIBE_FLAG
 223
 224         if (flags) {
 225                 ret = snprintf(bp, size_bp, "0x%llx|", flags);
 226                 size_bp -= ret;
 227         }
 228
 229         if (size_bp < size_buf)
 230                 buf[size_buf - size_bp - 1] = '\0'; /* remove last | */
 231
 232         /*
 233          * The text is trimmed, it's up to the caller to provide sufficiently
 234          * large buffer
 235          */
 236 out_overflow:;
 237 }
 238
 239 static int init_first_rw_device(struct btrfs_trans_handle *trans);
 240 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
 241 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
 242 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
 243 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
 244                              enum btrfs_map_op op,
 245                              u64 logical, u64 *length,
 246                              struct btrfs_io_context **bioc_ret,
 247                              int mirror_num, int need_raid_map);
 248
 249 /*
 250  * Device locking
 251  * ==============
 252  *
 253  * There are several mutexes that protect manipulation of devices and low-level
 254  * structures like chunks but not block groups, extents or files
 255  *
 256  * uuid_mutex (global lock)
 257  * ------------------------
 258  * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 259  * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 260  * device) or requested by the device= mount option
 261  *
 262  * the mutex can be very coarse and can cover long-running operations
 263  *
 264  * protects: updates to fs_devices counters like missing devices, rw devices,
 265  * seeding, structure cloning, opening/closing devices at mount/umount time
 266  *
 267  * global::fs_devs - add, remove, updates to the global list
 268  *
 269  * does not protect: manipulation of the fs_devices::devices list in general
 270  * but in mount context it could be used to exclude list modifications by eg.
 271  * scan ioctl
 272  *
 273  * btrfs_device::name - renames (write side), read is RCU
 274  *
 275  * fs_devices::device_list_mutex (per-fs, with RCU)
 276  * ------------------------------------------------
 277  * protects updates to fs_devices::devices, ie. adding and deleting
 278  *
 279  * simple list traversal with read-only actions can be done with RCU protection
 280  *
 281  * may be used to exclude some operations from running concurrently without any
 282  * modifications to the list (see write_all_supers)
 283  *
 284  * Is not required at mount and close times, because our device list is
 285  * protected by the uuid_mutex at that point.
 286  *
 287  * balance_mutex
 288  * -------------
 289  * protects balance structures (status, state) and context accessed from
 290  * several places (internally, ioctl)
 291  *
 292  * chunk_mutex
 293  * -----------
 294  * protects chunks, adding or removing during allocation, trim or when a new
 295  * device is added/removed. Additionally it also protects post_commit_list of
 296  * individual devices, since they can be added to the transaction's
 297  * post_commit_list only with chunk_mutex held.
 298  *
 299  * cleaner_mutex
 300  * -------------
 301  * a big lock that is held by the cleaner thread and prevents running subvolume
 302  * cleaning together with relocation or delayed iputs
 303  *
 304  *
 305  * Lock nesting
 306  * ============
 307  *
 308  * uuid_mutex
 309  *   device_list_mutex
 310  *     chunk_mutex
 311  *   balance_mutex
 312  *
 313  *
 314  * Exclusive operations
 315  * ====================
 316  *
 317  * Maintains the exclusivity of the following operations that apply to the
 318  * whole filesystem and cannot run in parallel.
 319  *
 320  * - Balance (*)
 321  * - Device add
 322  * - Device remove
 323  * - Device replace (*)
 324  * - Resize
 325  *
 326  * The device operations (as above) can be in one of the following states:
 327  *
 328  * - Running state
 329  * - Paused state
 330  * - Completed state
 331  *
 332  * Only device operations marked with (*) can go into the Paused state for the
 333  * following reasons:
 334  *
 335  * - ioctl (only Balance can be Paused through ioctl)
 336  * - filesystem remounted as read-only
 337  * - filesystem unmounted and mounted as read-only
 338  * - system power-cycle and filesystem mounted as read-only
 339  * - filesystem or device errors leading to forced read-only
 340  *
 341  * The status of exclusive operation is set and cleared atomically.
 342  * During the course of Paused state, fs_info::exclusive_operation remains set.
 343  * A device operation in Paused or Running state can be canceled or resumed
 344  * either by ioctl (Balance only) or when remounted as read-write.
 345  * The exclusive status is cleared when the device operation is canceled or
 346  * completed.
 347  */
 348
/* Global lock, see the "Device locking" comment above for what it covers. */
DEFINE_MUTEX(uuid_mutex);
/* List of all struct btrfs_fs_devices, linked through their fs_list member. */
static LIST_HEAD(fs_uuids);

/* Return the head of the file-local fs_uuids list. */
struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
{
        return &fs_uuids;
}
 355
 356 /*
 357  * alloc_fs_devices - allocate struct btrfs_fs_devices
 358  * @fsid:               if not NULL, copy the UUID to fs_devices::fsid
 359  * @metadata_fsid:      if not NULL, copy the UUID to fs_devices::metadata_fsid
 360  *
 361  * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 362  * The returned struct is not linked onto any lists and can be destroyed with
 363  * kfree() right away.
 364  */
 365 static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
 366                                                  const u8 *metadata_fsid)
 367 {
 368         struct btrfs_fs_devices *fs_devs;
 369
 370         fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
 371         if (!fs_devs)
 372                 return ERR_PTR(-ENOMEM);
 373
 374         mutex_init(&fs_devs->device_list_mutex);
 375
 376         INIT_LIST_HEAD(&fs_devs->devices);
 377         INIT_LIST_HEAD(&fs_devs->alloc_list);
 378         INIT_LIST_HEAD(&fs_devs->fs_list);
 379         INIT_LIST_HEAD(&fs_devs->seed_list);
 380         if (fsid)
 381                 memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
 382
 383         if (metadata_fsid)
 384                 memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
 385         else if (fsid)
 386                 memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);
 387
 388         return fs_devs;
 389 }
 390
/*
 * Release everything owned by @device and free the structure itself.
 *
 * Warns if the device is still linked on a post_commit_list, then frees the
 * device name, releases the alloc_state extent io tree and destroys the zone
 * info before the final kfree().
 */
void btrfs_free_device(struct btrfs_device *device)
{
        WARN_ON(!list_empty(&device->post_commit_list));
        rcu_string_free(device->name);
        extent_io_tree_release(&device->alloc_state);
        btrfs_destroy_dev_zone_info(device);
        kfree(device);
}
 399
 400 static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
 401 {
 402         struct btrfs_device *device;
 403         WARN_ON(fs_devices->opened);
 404         while (!list_empty(&fs_devices->devices)) {
 405                 device = list_entry(fs_devices->devices.next,
 406                                     struct btrfs_device, dev_list);
 407                 list_del(&device->dev_list);
 408                 btrfs_free_device(device);
 409         }
 410         kfree(fs_devices);
 411 }
 412
 413 void __exit btrfs_cleanup_fs_uuids(void)
 414 {
 415         struct btrfs_fs_devices *fs_devices;
 416
 417         while (!list_empty(&fs_uuids)) {
 418                 fs_devices = list_entry(fs_uuids.next,
 419                                         struct btrfs_fs_devices, fs_list);
 420                 list_del(&fs_devices->fs_list);
 421                 free_fs_devices(fs_devices);
 422         }
 423 }
 424
 425 static noinline struct btrfs_fs_devices *find_fsid(
 426                 const u8 *fsid, const u8 *metadata_fsid)
 427 {
 428         struct btrfs_fs_devices *fs_devices;
 429
 430         ASSERT(fsid);
 431
 432         /* Handle non-split brain cases */
 433         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
 434                 if (metadata_fsid) {
 435                         if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
 436                             && memcmp(metadata_fsid, fs_devices->metadata_uuid,
 437                                       BTRFS_FSID_SIZE) == 0)
 438                                 return fs_devices;
 439                 } else {
 440                         if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
 441                                 return fs_devices;
 442                 }
 443         }
 444         return NULL;
 445 }
 446
 447 static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
 448                                 struct btrfs_super_block *disk_super)
 449 {
 450
 451         struct btrfs_fs_devices *fs_devices;
 452
 453         /*
 454          * Handle scanned device having completed its fsid change but
 455          * belonging to a fs_devices that was created by first scanning
 456          * a device which didn't have its fsid/metadata_uuid changed
 457          * at all and the CHANGING_FSID_V2 flag set.
 458          */
 459         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
 460                 if (fs_devices->fsid_change &&
 461                     memcmp(disk_super->metadata_uuid, fs_devices->fsid,
 462                            BTRFS_FSID_SIZE) == 0 &&
 463                     memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
 464                            BTRFS_FSID_SIZE) == 0) {
 465                         return fs_devices;
 466                 }
 467         }
 468         /*
 469          * Handle scanned device having completed its fsid change but
 470          * belonging to a fs_devices that was created by a device that
 471          * has an outdated pair of fsid/metadata_uuid and
 472          * CHANGING_FSID_V2 flag set.
 473          */
 474         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
 475                 if (fs_devices->fsid_change &&
 476                     memcmp(fs_devices->metadata_uuid,
 477                            fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
 478                     memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
 479                            BTRFS_FSID_SIZE) == 0) {
 480                         return fs_devices;
 481                 }
 482         }
 483
 484         return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
 485 }
 486
 487
 488 static int
 489 btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
 490                       int flush, struct block_device **bdev,
 491                       struct btrfs_super_block **disk_super)
 492 {
 493         int ret;
 494
 495         *bdev = blkdev_get_by_path(device_path, flags, holder);
 496
 497         if (IS_ERR(*bdev)) {
 498                 ret = PTR_ERR(*bdev);
 499                 goto error;
 500         }
 501
 502         if (flush)
 503                 sync_blockdev(*bdev);
 504         ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
 505         if (ret) {
 506                 blkdev_put(*bdev, flags);
 507                 goto error;
 508         }
 509         invalidate_bdev(*bdev);
 510         *disk_super = btrfs_read_dev_super(*bdev);
 511         if (IS_ERR(*disk_super)) {
 512                 ret = PTR_ERR(*disk_super);
 513                 blkdev_put(*bdev, flags);
 514                 goto error;
 515         }
 516
 517         return 0;
 518
 519 error:
 520         *bdev = NULL;
 521         return ret;
 522 }
 523
 524 /**
 525  *  Search and remove all stale devices (which are not mounted).
 526  *  When both inputs are NULL, it will search and release all stale devices.
 527  *
 528  *  @devt:      Optional. When provided will it release all unmounted devices
 529  *              matching this devt only.
 530  *  @skip_device:  Optional. Will skip this device when searching for the stale
 531  *              devices.
 532  *
 533  *  Return:     0 for success or if @devt is 0.
 534  *              -EBUSY if @devt is a mounted device.
 535  *              -ENOENT if @devt does not match any device in the list.
 536  */
 537 static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device)
 538 {
 539         struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
 540         struct btrfs_device *device, *tmp_device;
 541         int ret = 0;
 542
 543         lockdep_assert_held(&uuid_mutex);
 544
 545         if (devt)
 546                 ret = -ENOENT;
 547
 548         list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
 549
 550                 mutex_lock(&fs_devices->device_list_mutex);
 551                 list_for_each_entry_safe(device, tmp_device,
 552                                          &fs_devices->devices, dev_list) {
 553                         if (skip_device && skip_device == device)
 554                                 continue;
 555                         if (devt && devt != device->devt)
 556                                 continue;
 557                         if (fs_devices->opened) {
 558                                 /* for an already deleted device return 0 */
 559                                 if (devt && ret != 0)
 560                                         ret = -EBUSY;
 561                                 break;
 562                         }
 563
 564                         /* delete the stale device */
 565                         fs_devices->num_devices--;
 566                         list_del(&device->dev_list);
 567                         btrfs_free_device(device);
 568
 569                         ret = 0;
 570                 }
 571                 mutex_unlock(&fs_devices->device_list_mutex);
 572
 573                 if (fs_devices->num_devices == 0) {
 574                         btrfs_sysfs_remove_fsid(fs_devices);
 575                         list_del(&fs_devices->fs_list);
 576                         free_fs_devices(fs_devices);
 577                 }
 578         }
 579
 580         return ret;
 581 }
 582
 583 /*
 584  * This is only used on mount, and we are protected from competing things
 585  * messing with our fs_devices by the uuid_mutex, thus we do not need the
 586  * fs_devices->device_list_mutex here.
 587  */
 588 static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
 589                         struct btrfs_device *device, fmode_t flags,
 590                         void *holder)
 591 {
 592         struct block_device *bdev;
 593         struct btrfs_super_block *disk_super;
 594         u64 devid;
 595         int ret;
 596
 597         if (device->bdev)
 598                 return -EINVAL;
 599         if (!device->name)
 600                 return -EINVAL;
 601
 602         ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
 603                                     &bdev, &disk_super);
 604         if (ret)
 605                 return ret;
 606
 607         devid = btrfs_stack_device_id(&disk_super->dev_item);
 608         if (devid != device->devid)
 609                 goto error_free_page;
 610
 611         if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
 612                 goto error_free_page;
 613
 614         device->generation = btrfs_super_generation(disk_super);
 615
 616         if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
 617                 if (btrfs_super_incompat_flags(disk_super) &
 618                     BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
 619                         pr_err(
 620                 "BTRFS: Invalid seeding and uuid-changed device detected\n");
 621                         goto error_free_page;
 622                 }
 623
 624                 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
 625                 fs_devices->seeding = true;
 626         } else {
 627                 if (bdev_read_only(bdev))
 628                         clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
 629                 else
 630                         set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
 631         }
 632
 633         if (!bdev_nonrot(bdev))
 634                 fs_devices->rotating = true;
 635
 636         device->bdev = bdev;
 637         clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
 638         device->mode = flags;
 639
 640         fs_devices->open_devices++;
 641         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
 642             device->devid != BTRFS_DEV_REPLACE_DEVID) {
 643                 fs_devices->rw_devices++;
 644                 list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
 645         }
 646         btrfs_release_disk_super(disk_super);
 647
 648         return 0;
 649
 650 error_free_page:
 651         btrfs_release_disk_super(disk_super);
 652         blkdev_put(bdev, flags);
 653
 654         return -EINVAL;
 655 }
 656
 657 /*
 658  * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
 659  * being created with a disk that has already completed its fsid change. Such
 660  * disk can belong to an fs which has its FSID changed or to one which doesn't.
 661  * Handle both cases here.
 662  */
 663 static struct btrfs_fs_devices *find_fsid_inprogress(
 664                                         struct btrfs_super_block *disk_super)
 665 {
 666         struct btrfs_fs_devices *fs_devices;
 667
 668         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
 669                 if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
 670                            BTRFS_FSID_SIZE) != 0 &&
 671                     memcmp(fs_devices->metadata_uuid, disk_super->fsid,
 672                            BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
 673                         return fs_devices;
 674                 }
 675         }
 676
 677         return find_fsid(disk_super->fsid, NULL);
 678 }
 679
 680
 681 static struct btrfs_fs_devices *find_fsid_changed(
 682                                         struct btrfs_super_block *disk_super)
 683 {
 684         struct btrfs_fs_devices *fs_devices;
 685
 686         /*
 687          * Handles the case where scanned device is part of an fs that had
 688          * multiple successful changes of FSID but currently device didn't
 689          * observe it. Meaning our fsid will be different than theirs. We need
 690          * to handle two subcases :
 691          *  1 - The fs still continues to have different METADATA/FSID uuids.
 692          *  2 - The fs is switched back to its original FSID (METADATA/FSID
 693          *  are equal).
 694          */
 695         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
 696                 /* Changed UUIDs */
 697                 if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
 698                            BTRFS_FSID_SIZE) != 0 &&
 699                     memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
 700                            BTRFS_FSID_SIZE) == 0 &&
 701                     memcmp(fs_devices->fsid, disk_super->fsid,
 702                            BTRFS_FSID_SIZE) != 0)
 703                         return fs_devices;
 704
 705                 /* Unchanged UUIDs */
 706                 if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
 707                            BTRFS_FSID_SIZE) == 0 &&
 708                     memcmp(fs_devices->fsid, disk_super->metadata_uuid,
 709                            BTRFS_FSID_SIZE) == 0)
 710                         return fs_devices;
 711         }
 712
 713         return NULL;
 714 }
 715
 716 static struct btrfs_fs_devices *find_fsid_reverted_metadata(
 717                                 struct btrfs_super_block *disk_super)
 718 {
 719         struct btrfs_fs_devices *fs_devices;
 720
 721         /*
 722          * Handle the case where the scanned device is part of an fs whose last
 723          * metadata UUID change reverted it to the original FSID. At the same
 724          * time * fs_devices was first created by another constitutent device
 725          * which didn't fully observe the operation. This results in an
 726          * btrfs_fs_devices created with metadata/fsid different AND
 727          * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
 728          * fs_devices equal to the FSID of the disk.
 729          */
 730         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
 731                 if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
 732                            BTRFS_FSID_SIZE) != 0 &&
 733                     memcmp(fs_devices->metadata_uuid, disk_super->fsid,
 734                            BTRFS_FSID_SIZE) == 0 &&
 735                     fs_devices->fsid_change)
 736                         return fs_devices;
 737         }
 738
 739         return NULL;
 740 }
 741 /*
 742  * Add new device to list of registered devices
 743  *
 744  * Returns:
 745  * device pointer which was just added or updated when successful
 746  * error pointer when failed
 747  */
 748 static noinline struct btrfs_device *device_list_add(const char *path,
 749                            struct btrfs_super_block *disk_super,
 750                            bool *new_device_added)
 751 {
 752         struct btrfs_device *device;
 753         struct btrfs_fs_devices *fs_devices = NULL;
 754         struct rcu_string *name;
 755         u64 found_transid = btrfs_super_generation(disk_super);
 756         u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
 757         dev_t path_devt;
 758         int error;
 759         bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
 760                 BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
 761         bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
 762                                         BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
 763
 764         error = lookup_bdev(path, &path_devt);
 765         if (error)
 766                 return ERR_PTR(error);
 767
 768         if (fsid_change_in_progress) {
 769                 if (!has_metadata_uuid)
 770                         fs_devices = find_fsid_inprogress(disk_super);
 771                 else
 772                         fs_devices = find_fsid_changed(disk_super);
 773         } else if (has_metadata_uuid) {
 774                 fs_devices = find_fsid_with_metadata_uuid(disk_super);
 775         } else {
 776                 fs_devices = find_fsid_reverted_metadata(disk_super);
 777                 if (!fs_devices)
 778                         fs_devices = find_fsid(disk_super->fsid, NULL);
 779         }
 780
 781
 782         if (!fs_devices) {
 783                 if (has_metadata_uuid)
 784                         fs_devices = alloc_fs_devices(disk_super->fsid,
 785                                                       disk_super->metadata_uuid);
 786                 else
 787                         fs_devices = alloc_fs_devices(disk_super->fsid, NULL);
 788
 789                 if (IS_ERR(fs_devices))
 790                         return ERR_CAST(fs_devices);
 791
 792                 fs_devices->fsid_change = fsid_change_in_progress;
 793
 794                 mutex_lock(&fs_devices->device_list_mutex);
 795                 list_add(&fs_devices->fs_list, &fs_uuids);
 796
 797                 device = NULL;
 798         } else {
 799                 struct btrfs_dev_lookup_args args = {
 800                         .devid = devid,
 801                         .uuid = disk_super->dev_item.uuid,
 802                 };
 803
 804                 mutex_lock(&fs_devices->device_list_mutex);
 805                 device = btrfs_find_device(fs_devices, &args);
 806
 807                 /*
 808                  * If this disk has been pulled into an fs devices created by
 809                  * a device which had the CHANGING_FSID_V2 flag then replace the
 810                  * metadata_uuid/fsid values of the fs_devices.
 811                  */
 812                 if (fs_devices->fsid_change &&
 813                     found_transid > fs_devices->latest_generation) {
 814                         memcpy(fs_devices->fsid, disk_super->fsid,
 815                                         BTRFS_FSID_SIZE);
 816
 817                         if (has_metadata_uuid)
 818                                 memcpy(fs_devices->metadata_uuid,
 819                                        disk_super->metadata_uuid,
 820                                        BTRFS_FSID_SIZE);
 821                         else
 822                                 memcpy(fs_devices->metadata_uuid,
 823                                        disk_super->fsid, BTRFS_FSID_SIZE);
 824
 825                         fs_devices->fsid_change = false;
 826                 }
 827         }
 828
 829         if (!device) {
 830                 if (fs_devices->opened) {
 831                         mutex_unlock(&fs_devices->device_list_mutex);
 832                         return ERR_PTR(-EBUSY);
 833                 }
 834
 835                 device = btrfs_alloc_device(NULL, &devid,
 836                                             disk_super->dev_item.uuid);
 837                 if (IS_ERR(device)) {
 838                         mutex_unlock(&fs_devices->device_list_mutex);
 839                         /* we can safely leave the fs_devices entry around */
 840                         return device;
 841                 }
 842
 843                 name = rcu_string_strdup(path, GFP_NOFS);
 844                 if (!name) {
 845                         btrfs_free_device(device);
 846                         mutex_unlock(&fs_devices->device_list_mutex);
 847                         return ERR_PTR(-ENOMEM);
 848                 }
 849                 rcu_assign_pointer(device->name, name);
 850                 device->devt = path_devt;
 851
 852                 list_add_rcu(&device->dev_list, &fs_devices->devices);
 853                 fs_devices->num_devices++;
 854
 855                 device->fs_devices = fs_devices;
 856                 *new_device_added = true;
 857
 858                 if (disk_super->label[0])
 859                         pr_info(
 860         "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
 861                                 disk_super->label, devid, found_transid, path,
 862                                 current->comm, task_pid_nr(current));
 863                 else
 864                         pr_info(
 865         "BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
 866                                 disk_super->fsid, devid, found_transid, path,
 867                                 current->comm, task_pid_nr(current));
 868
 869         } else if (!device->name || strcmp(device->name->str, path)) {
 870                 /*
 871                  * When FS is already mounted.
 872                  * 1. If you are here and if the device->name is NULL that
 873                  *    means this device was missing at time of FS mount.
 874                  * 2. If you are here and if the device->name is different
 875                  *    from 'path' that means either
 876                  *      a. The same device disappeared and reappeared with
 877                  *         different name. or
 878                  *      b. The missing-disk-which-was-replaced, has
 879                  *         reappeared now.
 880                  *
 881                  * We must allow 1 and 2a above. But 2b would be a spurious
 882                  * and unintentional.
 883                  *
 884                  * Further in case of 1 and 2a above, the disk at 'path'
 885                  * would have missed some transaction when it was away and
 886                  * in case of 2a the stale bdev has to be updated as well.
 887                  * 2b must not be allowed at all time.
 888                  */
 889
 890                 /*
 891                  * For now, we do allow update to btrfs_fs_device through the
 892                  * btrfs dev scan cli after FS has been mounted.  We're still
 893                  * tracking a problem where systems fail mount by subvolume id
 894                  * when we reject replacement on a mounted FS.
 895                  */
 896                 if (!fs_devices->opened && found_transid < device->generation) {
 897                         /*
 898                          * That is if the FS is _not_ mounted and if you
 899                          * are here, that means there is more than one
 900                          * disk with same uuid and devid.We keep the one
 901                          * with larger generation number or the last-in if
 902                          * generation are equal.
 903                          */
 904                         mutex_unlock(&fs_devices->device_list_mutex);
 905                         return ERR_PTR(-EEXIST);
 906                 }
 907
 908                 /*
 909                  * We are going to replace the device path for a given devid,
 910                  * make sure it's the same device if the device is mounted
 911                  *
 912                  * NOTE: the device->fs_info may not be reliable here so pass
 913                  * in a NULL to message helpers instead. This avoids a possible
 914                  * use-after-free when the fs_info and fs_info->sb are already
 915                  * torn down.
 916                  */
 917                 if (device->bdev) {
 918                         if (device->devt != path_devt) {
 919                                 mutex_unlock(&fs_devices->device_list_mutex);
 920                                 btrfs_warn_in_rcu(NULL,
 921         "duplicate device %s devid %llu generation %llu scanned by %s (%d)",
 922                                                   path, devid, found_transid,
 923                                                   current->comm,
 924                                                   task_pid_nr(current));
 925                                 return ERR_PTR(-EEXIST);
 926                         }
 927                         btrfs_info_in_rcu(NULL,
 928         "devid %llu device path %s changed to %s scanned by %s (%d)",
 929                                           devid, rcu_str_deref(device->name),
 930                                           path, current->comm,
 931                                           task_pid_nr(current));
 932                 }
 933
 934                 name = rcu_string_strdup(path, GFP_NOFS);
 935                 if (!name) {
 936                         mutex_unlock(&fs_devices->device_list_mutex);
 937                         return ERR_PTR(-ENOMEM);
 938                 }
 939                 rcu_string_free(device->name);
 940                 rcu_assign_pointer(device->name, name);
 941                 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
 942                         fs_devices->missing_devices--;
 943                         clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
 944                 }
 945                 device->devt = path_devt;
 946         }
 947
 948         /*
 949          * Unmount does not free the btrfs_device struct but would zero
 950          * generation along with most of the other members. So just update
 951          * it back. We need it to pick the disk with largest generation
 952          * (as above).
 953          */
 954         if (!fs_devices->opened) {
 955                 device->generation = found_transid;
 956                 fs_devices->latest_generation = max_t(u64, found_transid,
 957                                                 fs_devices->latest_generation);
 958         }
 959
 960         fs_devices->total_devices = btrfs_super_num_devices(disk_super);
 961
 962         mutex_unlock(&fs_devices->device_list_mutex);
 963         return device;
 964 }
 965
 966 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
 967 {
 968         struct btrfs_fs_devices *fs_devices;
 969         struct btrfs_device *device;
 970         struct btrfs_device *orig_dev;
 971         int ret = 0;
 972
 973         lockdep_assert_held(&uuid_mutex);
 974
 975         fs_devices = alloc_fs_devices(orig->fsid, NULL);
 976         if (IS_ERR(fs_devices))
 977                 return fs_devices;
 978
 979         fs_devices->total_devices = orig->total_devices;
 980
 981         list_for_each_entry(orig_dev, &orig->devices, dev_list) {
 982                 struct rcu_string *name;
 983
 984                 device = btrfs_alloc_device(NULL, &orig_dev->devid,
 985                                             orig_dev->uuid);
 986                 if (IS_ERR(device)) {
 987                         ret = PTR_ERR(device);
 988                         goto error;
 989                 }
 990
 991                 /*
 992                  * This is ok to do without rcu read locked because we hold the
 993                  * uuid mutex so nothing we touch in here is going to disappear.
 994                  */
 995                 if (orig_dev->name) {
 996                         name = rcu_string_strdup(orig_dev->name->str,
 997                                         GFP_KERNEL);
 998                         if (!name) {
 999                                 btrfs_free_device(device);
1000                                 ret = -ENOMEM;
1001                                 goto error;
1002                         }
1003                         rcu_assign_pointer(device->name, name);
1004                 }
1005
1006                 list_add(&device->dev_list, &fs_devices->devices);
1007                 device->fs_devices = fs_devices;
1008                 fs_devices->num_devices++;
1009         }
1010         return fs_devices;
1011 error:
1012         free_fs_devices(fs_devices);
1013         return ERR_PTR(ret);
1014 }
1015
1016 static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
1017                                       struct btrfs_device **latest_dev)
1018 {
1019         struct btrfs_device *device, *next;
1020
1021         /* This is the initialized path, it is safe to release the devices. */
1022         list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
1023                 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
1024                         if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
1025                                       &device->dev_state) &&
1026                             !test_bit(BTRFS_DEV_STATE_MISSING,
1027                                       &device->dev_state) &&
1028                             (!*latest_dev ||
1029                              device->generation > (*latest_dev)->generation)) {
1030                                 *latest_dev = device;
1031                         }
1032                         continue;
1033                 }
1034
1035                 /*
1036                  * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID,
1037                  * in btrfs_init_dev_replace() so just continue.
1038                  */
1039                 if (device->devid == BTRFS_DEV_REPLACE_DEVID)
1040                         continue;
1041
1042                 if (device->bdev) {
1043                         blkdev_put(device->bdev, device->mode);
1044                         device->bdev = NULL;
1045                         fs_devices->open_devices--;
1046                 }
1047                 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1048                         list_del_init(&device->dev_alloc_list);
1049                         clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
1050                         fs_devices->rw_devices--;
1051                 }
1052                 list_del_init(&device->dev_list);
1053                 fs_devices->num_devices--;
1054                 btrfs_free_device(device);
1055         }
1056
1057 }
1058
1059 /*
1060  * After we have read the system tree and know devids belonging to this
1061  * filesystem, remove the device which does not belong there.
1062  */
1063 void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
1064 {
1065         struct btrfs_device *latest_dev = NULL;
1066         struct btrfs_fs_devices *seed_dev;
1067
1068         mutex_lock(&uuid_mutex);
1069         __btrfs_free_extra_devids(fs_devices, &latest_dev);
1070
1071         list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
1072                 __btrfs_free_extra_devids(seed_dev, &latest_dev);
1073
1074         fs_devices->latest_dev = latest_dev;
1075
1076         mutex_unlock(&uuid_mutex);
1077 }
1078
1079 static void btrfs_close_bdev(struct btrfs_device *device)
1080 {
1081         if (!device->bdev)
1082                 return;
1083
1084         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1085                 sync_blockdev(device->bdev);
1086                 invalidate_bdev(device->bdev);
1087         }
1088
1089         blkdev_put(device->bdev, device->mode);
1090 }
1091
1092 static void btrfs_close_one_device(struct btrfs_device *device)
1093 {
1094         struct btrfs_fs_devices *fs_devices = device->fs_devices;
1095
1096         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
1097             device->devid != BTRFS_DEV_REPLACE_DEVID) {
1098                 list_del_init(&device->dev_alloc_list);
1099                 fs_devices->rw_devices--;
1100         }
1101
1102         if (device->devid == BTRFS_DEV_REPLACE_DEVID)
1103                 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
1104
1105         if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
1106                 clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
1107                 fs_devices->missing_devices--;
1108         }
1109
1110         btrfs_close_bdev(device);
1111         if (device->bdev) {
1112                 fs_devices->open_devices--;
1113                 device->bdev = NULL;
1114         }
1115         clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
1116         btrfs_destroy_dev_zone_info(device);
1117
1118         device->fs_info = NULL;
1119         atomic_set(&device->dev_stats_ccnt, 0);
1120         extent_io_tree_release(&device->alloc_state);
1121
1122         /*
1123          * Reset the flush error record. We might have a transient flush error
1124          * in this mount, and if so we aborted the current transaction and set
1125          * the fs to an error state, guaranteeing no super blocks can be further
1126          * committed. However that error might be transient and if we unmount the
1127          * filesystem and mount it again, we should allow the mount to succeed
1128          * (btrfs_check_rw_degradable() should not fail) - if after mounting the
1129          * filesystem again we still get flush errors, then we will again abort
1130          * any transaction and set the error state, guaranteeing no commits of
1131          * unsafe super blocks.
1132          */
1133         device->last_flush_error = 0;
1134
1135         /* Verify the device is back in a pristine state  */
1136         ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
1137         ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
1138         ASSERT(list_empty(&device->dev_alloc_list));
1139         ASSERT(list_empty(&device->post_commit_list));
1140 }
1141
1142 static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
1143 {
1144         struct btrfs_device *device, *tmp;
1145
1146         lockdep_assert_held(&uuid_mutex);
1147
1148         if (--fs_devices->opened > 0)
1149                 return;
1150
1151         list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
1152                 btrfs_close_one_device(device);
1153
1154         WARN_ON(fs_devices->open_devices);
1155         WARN_ON(fs_devices->rw_devices);
1156         fs_devices->opened = 0;
1157         fs_devices->seeding = false;
1158         fs_devices->fs_info = NULL;
1159 }
1160
1161 void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
1162 {
1163         LIST_HEAD(list);
1164         struct btrfs_fs_devices *tmp;
1165
1166         mutex_lock(&uuid_mutex);
1167         close_fs_devices(fs_devices);
1168         if (!fs_devices->opened)
1169                 list_splice_init(&fs_devices->seed_list, &list);
1170
1171         list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
1172                 close_fs_devices(fs_devices);
1173                 list_del(&fs_devices->seed_list);
1174                 free_fs_devices(fs_devices);
1175         }
1176         mutex_unlock(&uuid_mutex);
1177 }
1178
1179 static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
1180                                 fmode_t flags, void *holder)
1181 {
1182         struct btrfs_device *device;
1183         struct btrfs_device *latest_dev = NULL;
1184         struct btrfs_device *tmp_device;
1185
1186         flags |= FMODE_EXCL;
1187
1188         list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
1189                                  dev_list) {
1190                 int ret;
1191
1192                 ret = btrfs_open_one_device(fs_devices, device, flags, holder);
1193                 if (ret == 0 &&
1194                     (!latest_dev || device->generation > latest_dev->generation)) {
1195                         latest_dev = device;
1196                 } else if (ret == -ENODATA) {
1197                         fs_devices->num_devices--;
1198                         list_del(&device->dev_list);
1199                         btrfs_free_device(device);
1200                 }
1201         }
1202         if (fs_devices->open_devices == 0)
1203                 return -EINVAL;
1204
1205         fs_devices->opened = 1;
1206         fs_devices->latest_dev = latest_dev;
1207         fs_devices->total_rw_bytes = 0;
1208         fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
1209         fs_devices->read_policy = BTRFS_READ_POLICY_PID;
1210
1211         return 0;
1212 }
1213
1214 static int devid_cmp(void *priv, const struct list_head *a,
1215                      const struct list_head *b)
1216 {
1217         const struct btrfs_device *dev1, *dev2;
1218
1219         dev1 = list_entry(a, struct btrfs_device, dev_list);
1220         dev2 = list_entry(b, struct btrfs_device, dev_list);
1221
1222         if (dev1->devid < dev2->devid)
1223                 return -1;
1224         else if (dev1->devid > dev2->devid)
1225                 return 1;
1226         return 0;
1227 }
1228
1229 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
1230                        fmode_t flags, void *holder)
1231 {
1232         int ret;
1233
1234         lockdep_assert_held(&uuid_mutex);
1235         /*
1236          * The device_list_mutex cannot be taken here in case opening the
1237          * underlying device takes further locks like open_mutex.
1238          *
1239          * We also don't need the lock here as this is called during mount and
1240          * exclusion is provided by uuid_mutex
1241          */
1242
1243         if (fs_devices->opened) {
1244                 fs_devices->opened++;
1245                 ret = 0;
1246         } else {
1247                 list_sort(NULL, &fs_devices->devices, devid_cmp);
1248                 ret = open_fs_devices(fs_devices, flags, holder);
1249         }
1250
1251         return ret;
1252 }
1253
/* Drop the reference on the page backing the address @super. */
void btrfs_release_disk_super(struct btrfs_super_block *super)
{
	put_page(virt_to_page(super));
}
1260
1261 static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
1262                                                        u64 bytenr, u64 bytenr_orig)
1263 {
1264         struct btrfs_super_block *disk_super;
1265         struct page *page;
1266         void *p;
1267         pgoff_t index;
1268
1269         /* make sure our super fits in the device */
1270         if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev))
1271                 return ERR_PTR(-EINVAL);
1272
1273         /* make sure our super fits in the page */
1274         if (sizeof(*disk_super) > PAGE_SIZE)
1275                 return ERR_PTR(-EINVAL);
1276
1277         /* make sure our super doesn't straddle pages on disk */
1278         index = bytenr >> PAGE_SHIFT;
1279         if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
1280                 return ERR_PTR(-EINVAL);
1281
1282         /* pull in the page with our super */
1283         page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);
1284
1285         if (IS_ERR(page))
1286                 return ERR_CAST(page);
1287
1288         p = page_address(page);
1289
1290         /* align our pointer to the offset of the super block */
1291         disk_super = p + offset_in_page(bytenr);
1292
1293         if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
1294             btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
1295                 btrfs_release_disk_super(p);
1296                 return ERR_PTR(-EINVAL);
1297         }
1298
1299         if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
1300                 disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;
1301
1302         return disk_super;
1303 }
1304
1305 int btrfs_forget_devices(dev_t devt)
1306 {
1307         int ret;
1308
1309         mutex_lock(&uuid_mutex);
1310         ret = btrfs_free_stale_devices(devt, NULL);
1311         mutex_unlock(&uuid_mutex);
1312
1313         return ret;
1314 }
1315
1316 /*
1317  * Look for a btrfs signature on a device. This may be called out of the mount path
1318  * and we are not allowed to call set_blocksize during the scan. The superblock
1319  * is read via pagecache
1320  */
1321 struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
1322                                            void *holder)
1323 {
1324         struct btrfs_super_block *disk_super;
1325         bool new_device_added = false;
1326         struct btrfs_device *device = NULL;
1327         struct block_device *bdev;
1328         u64 bytenr, bytenr_orig;
1329         int ret;
1330
1331         lockdep_assert_held(&uuid_mutex);
1332
1333         /*
1334          * we would like to check all the supers, but that would make
1335          * a btrfs mount succeed after a mkfs from a different FS.
1336          * So, we need to add a special mount option to scan for
1337          * later supers, using BTRFS_SUPER_MIRROR_MAX instead
1338          */
1339         flags |= FMODE_EXCL;
1340
1341         bdev = blkdev_get_by_path(path, flags, holder);
1342         if (IS_ERR(bdev))
1343                 return ERR_CAST(bdev);
1344
1345         bytenr_orig = btrfs_sb_offset(0);
1346         ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
1347         if (ret) {
1348                 device = ERR_PTR(ret);
1349                 goto error_bdev_put;
1350         }
1351
1352         disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
1353         if (IS_ERR(disk_super)) {
1354                 device = ERR_CAST(disk_super);
1355                 goto error_bdev_put;
1356         }
1357
1358         device = device_list_add(path, disk_super, &new_device_added);
1359         if (!IS_ERR(device) && new_device_added)
1360                 btrfs_free_stale_devices(device->devt, device);
1361
1362         btrfs_release_disk_super(disk_super);
1363
1364 error_bdev_put:
1365         blkdev_put(bdev, flags);
1366
1367         return device;
1368 }
1369
1370 /*
1371  * Try to find a chunk that intersects [start, start + len] range and when one
1372  * such is found, record the end of it in *start
1373  */
1374 static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
1375                                     u64 len)
1376 {
1377         u64 physical_start, physical_end;
1378
1379         lockdep_assert_held(&device->fs_info->chunk_mutex);
1380
1381         if (!find_first_extent_bit(&device->alloc_state, *start,
1382                                    &physical_start, &physical_end,
1383                                    CHUNK_ALLOCATED, NULL)) {
1384
1385                 if (in_range(physical_start, *start, len) ||
1386                     in_range(*start, physical_start,
1387                              physical_end - physical_start)) {
1388                         *start = physical_end + 1;
1389                         return true;
1390                 }
1391         }
1392         return false;
1393 }
1394
1395 static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
1396 {
1397         switch (device->fs_devices->chunk_alloc_policy) {
1398         case BTRFS_CHUNK_ALLOC_REGULAR:
1399                 /*
1400                  * We don't want to overwrite the superblock on the drive nor
1401                  * any area used by the boot loader (grub for example), so we
1402                  * make sure to start at an offset of at least 1MB.
1403                  */
1404                 return max_t(u64, start, SZ_1M);
1405         case BTRFS_CHUNK_ALLOC_ZONED:
1406                 /*
1407                  * We don't care about the starting region like regular
1408                  * allocator, because we anyway use/reserve the first two zones
1409                  * for superblock logging.
1410                  */
1411                 return ALIGN(start, device->zone_info->zone_size);
1412         default:
1413                 BUG();
1414         }
1415 }
1416
1417 static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
1418                                         u64 *hole_start, u64 *hole_size,
1419                                         u64 num_bytes)
1420 {
1421         u64 zone_size = device->zone_info->zone_size;
1422         u64 pos;
1423         int ret;
1424         bool changed = false;
1425
1426         ASSERT(IS_ALIGNED(*hole_start, zone_size));
1427
1428         while (*hole_size > 0) {
1429                 pos = btrfs_find_allocatable_zones(device, *hole_start,
1430                                                    *hole_start + *hole_size,
1431                                                    num_bytes);
1432                 if (pos != *hole_start) {
1433                         *hole_size = *hole_start + *hole_size - pos;
1434                         *hole_start = pos;
1435                         changed = true;
1436                         if (*hole_size < num_bytes)
1437                                 break;
1438                 }
1439
1440                 ret = btrfs_ensure_empty_zones(device, pos, num_bytes);
1441
1442                 /* Range is ensured to be empty */
1443                 if (!ret)
1444                         return changed;
1445
1446                 /* Given hole range was invalid (outside of device) */
1447                 if (ret == -ERANGE) {
1448                         *hole_start += *hole_size;
1449                         *hole_size = 0;
1450                         return true;
1451                 }
1452
1453                 *hole_start += zone_size;
1454                 *hole_size -= zone_size;
1455                 changed = true;
1456         }
1457
1458         return changed;
1459 }
1460
1461 /**
1462  * dev_extent_hole_check - check if specified hole is suitable for allocation
1463  * @device:     the device which we have the hole
1464  * @hole_start: starting position of the hole
1465  * @hole_size:  the size of the hole
1466  * @num_bytes:  the size of the free space that we need
1467  *
1468  * This function may modify @hole_start and @hole_size to reflect the suitable
1469  * position for allocation. Returns 1 if hole position is updated, 0 otherwise.
1470  */
1471 static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
1472                                   u64 *hole_size, u64 num_bytes)
1473 {
1474         bool changed = false;
1475         u64 hole_end = *hole_start + *hole_size;
1476
1477         for (;;) {
1478                 /*
1479                  * Check before we set max_hole_start, otherwise we could end up
1480                  * sending back this offset anyway.
1481                  */
1482                 if (contains_pending_extent(device, hole_start, *hole_size)) {
1483                         if (hole_end >= *hole_start)
1484                                 *hole_size = hole_end - *hole_start;
1485                         else
1486                                 *hole_size = 0;
1487                         changed = true;
1488                 }
1489
1490                 switch (device->fs_devices->chunk_alloc_policy) {
1491                 case BTRFS_CHUNK_ALLOC_REGULAR:
1492                         /* No extra check */
1493                         break;
1494                 case BTRFS_CHUNK_ALLOC_ZONED:
1495                         if (dev_extent_hole_check_zoned(device, hole_start,
1496                                                         hole_size, num_bytes)) {
1497                                 changed = true;
1498                                 /*
1499                                  * The changed hole can contain pending extent.
1500                                  * Loop again to check that.
1501                                  */
1502                                 continue;
1503                         }
1504                         break;
1505                 default:
1506                         BUG();
1507                 }
1508
1509                 break;
1510         }
1511
1512         return changed;
1513 }
1514
1515 /*
1516  * find_free_dev_extent_start - find free space in the specified device
1517  * @device:       the device which we search the free space in
1518  * @num_bytes:    the size of the free space that we need
1519  * @search_start: the position from which to begin the search
1520  * @start:        store the start of the free space.
1521  * @len:          the size of the free space. that we find, or the size
1522  *                of the max free space if we don't find suitable free space
1523  *
1524  * this uses a pretty simple search, the expectation is that it is
1525  * called very infrequently and that a given device has a small number
1526  * of extents
1527  *
1528  * @start is used to store the start of the free space if we find. But if we
1529  * don't find suitable free space, it will be used to store the start position
1530  * of the max free space.
1531  *
1532  * @len is used to store the size of the free space that we find.
1533  * But if we don't find suitable free space, it is used to store the size of
1534  * the max free space.
1535  *
1536  * NOTE: This function will search *commit* root of device tree, and does extra
1537  * check to ensure dev extents are not double allocated.
1538  * This makes the function safe to allocate dev extents but may not report
1539  * correct usable device space, as device extent freed in current transaction
1540  * is not reported as available.
1541  */
1542 static int find_free_dev_extent_start(struct btrfs_device *device,
1543                                 u64 num_bytes, u64 search_start, u64 *start,
1544                                 u64 *len)
1545 {
1546         struct btrfs_fs_info *fs_info = device->fs_info;
1547         struct btrfs_root *root = fs_info->dev_root;
1548         struct btrfs_key key;
1549         struct btrfs_dev_extent *dev_extent;
1550         struct btrfs_path *path;
1551         u64 hole_size;
1552         u64 max_hole_start;
1553         u64 max_hole_size;
1554         u64 extent_end;
1555         u64 search_end = device->total_bytes;
1556         int ret;
1557         int slot;
1558         struct extent_buffer *l;
1559
1560         search_start = dev_extent_search_start(device, search_start);
1561
1562         WARN_ON(device->zone_info &&
1563                 !IS_ALIGNED(num_bytes, device->zone_info->zone_size));
1564
1565         path = btrfs_alloc_path();
1566         if (!path)
1567                 return -ENOMEM;
1568
1569         max_hole_start = search_start;
1570         max_hole_size = 0;
1571
1572 again:
1573         if (search_start >= search_end ||
1574                 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
1575                 ret = -ENOSPC;
1576                 goto out;
1577         }
1578
1579         path->reada = READA_FORWARD;
1580         path->search_commit_root = 1;
1581         path->skip_locking = 1;
1582
1583         key.objectid = device->devid;
1584         key.offset = search_start;
1585         key.type = BTRFS_DEV_EXTENT_KEY;
1586
1587         ret = btrfs_search_backwards(root, &key, path);
1588         if (ret < 0)
1589                 goto out;
1590
1591         while (1) {
1592                 l = path->nodes[0];
1593                 slot = path->slots[0];
1594                 if (slot >= btrfs_header_nritems(l)) {
1595                         ret = btrfs_next_leaf(root, path);
1596                         if (ret == 0)
1597                                 continue;
1598                         if (ret < 0)
1599                                 goto out;
1600
1601                         break;
1602                 }
1603                 btrfs_item_key_to_cpu(l, &key, slot);
1604
1605                 if (key.objectid < device->devid)
1606                         goto next;
1607
1608                 if (key.objectid > device->devid)
1609                         break;
1610
1611                 if (key.type != BTRFS_DEV_EXTENT_KEY)
1612                         goto next;
1613
1614                 if (key.offset > search_start) {
1615                         hole_size = key.offset - search_start;
1616                         dev_extent_hole_check(device, &search_start, &hole_size,
1617                                               num_bytes);
1618
1619                         if (hole_size > max_hole_size) {
1620                                 max_hole_start = search_start;
1621                                 max_hole_size = hole_size;
1622                         }
1623
1624                         /*
1625                          * If this free space is greater than which we need,
1626                          * it must be the max free space that we have found
1627                          * until now, so max_hole_start must point to the start
1628                          * of this free space and the length of this free space
1629                          * is stored in max_hole_size. Thus, we return
1630                          * max_hole_start and max_hole_size and go back to the
1631                          * caller.
1632                          */
1633                         if (hole_size >= num_bytes) {
1634                                 ret = 0;
1635                                 goto out;
1636                         }
1637                 }
1638
1639                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1640                 extent_end = key.offset + btrfs_dev_extent_length(l,
1641                                                                   dev_extent);
1642                 if (extent_end > search_start)
1643                         search_start = extent_end;
1644 next:
1645                 path->slots[0]++;
1646                 cond_resched();
1647         }
1648
1649         /*
1650          * At this point, search_start should be the end of
1651          * allocated dev extents, and when shrinking the device,
1652          * search_end may be smaller than search_start.
1653          */
1654         if (search_end > search_start) {
1655                 hole_size = search_end - search_start;
1656                 if (dev_extent_hole_check(device, &search_start, &hole_size,
1657                                           num_bytes)) {
1658                         btrfs_release_path(path);
1659                         goto again;
1660                 }
1661
1662                 if (hole_size > max_hole_size) {
1663                         max_hole_start = search_start;
1664                         max_hole_size = hole_size;
1665                 }
1666         }
1667
1668         /* See above. */
1669         if (max_hole_size < num_bytes)
1670                 ret = -ENOSPC;
1671         else
1672                 ret = 0;
1673
1674 out:
1675         btrfs_free_path(path);
1676         *start = max_hole_start;
1677         if (len)
1678                 *len = max_hole_size;
1679         return ret;
1680 }
1681
1682 int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
1683                          u64 *start, u64 *len)
1684 {
1685         /* FIXME use last free of some kind */
1686         return find_free_dev_extent_start(device, num_bytes, 0, start, len);
1687 }
1688
1689 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1690                           struct btrfs_device *device,
1691                           u64 start, u64 *dev_extent_len)
1692 {
1693         struct btrfs_fs_info *fs_info = device->fs_info;
1694         struct btrfs_root *root = fs_info->dev_root;
1695         int ret;
1696         struct btrfs_path *path;
1697         struct btrfs_key key;
1698         struct btrfs_key found_key;
1699         struct extent_buffer *leaf = NULL;
1700         struct btrfs_dev_extent *extent = NULL;
1701
1702         path = btrfs_alloc_path();
1703         if (!path)
1704                 return -ENOMEM;
1705
1706         key.objectid = device->devid;
1707         key.offset = start;
1708         key.type = BTRFS_DEV_EXTENT_KEY;
1709 again:
1710         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1711         if (ret > 0) {
1712                 ret = btrfs_previous_item(root, path, key.objectid,
1713                                           BTRFS_DEV_EXTENT_KEY);
1714                 if (ret)
1715                         goto out;
1716                 leaf = path->nodes[0];
1717                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1718                 extent = btrfs_item_ptr(leaf, path->slots[0],
1719                                         struct btrfs_dev_extent);
1720                 BUG_ON(found_key.offset > start || found_key.offset +
1721                        btrfs_dev_extent_length(leaf, extent) < start);
1722                 key = found_key;
1723                 btrfs_release_path(path);
1724                 goto again;
1725         } else if (ret == 0) {
1726                 leaf = path->nodes[0];
1727                 extent = btrfs_item_ptr(leaf, path->slots[0],
1728                                         struct btrfs_dev_extent);
1729         } else {
1730                 goto out;
1731         }
1732
1733         *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
1734
1735         ret = btrfs_del_item(trans, root, path);
1736         if (ret == 0)
1737                 set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
1738 out:
1739         btrfs_free_path(path);
1740         return ret;
1741 }
1742
1743 static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1744 {
1745         struct extent_map_tree *em_tree;
1746         struct extent_map *em;
1747         struct rb_node *n;
1748         u64 ret = 0;
1749
1750         em_tree = &fs_info->mapping_tree;
1751         read_lock(&em_tree->lock);
1752         n = rb_last(&em_tree->map.rb_root);
1753         if (n) {
1754                 em = rb_entry(n, struct extent_map, rb_node);
1755                 ret = em->start + em->len;
1756         }
1757         read_unlock(&em_tree->lock);
1758
1759         return ret;
1760 }
1761
1762 static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
1763                                     u64 *devid_ret)
1764 {
1765         int ret;
1766         struct btrfs_key key;
1767         struct btrfs_key found_key;
1768         struct btrfs_path *path;
1769
1770         path = btrfs_alloc_path();
1771         if (!path)
1772                 return -ENOMEM;
1773
1774         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1775         key.type = BTRFS_DEV_ITEM_KEY;
1776         key.offset = (u64)-1;
1777
1778         ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
1779         if (ret < 0)
1780                 goto error;
1781
1782         if (ret == 0) {
1783                 /* Corruption */
1784                 btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
1785                 ret = -EUCLEAN;
1786                 goto error;
1787         }
1788
1789         ret = btrfs_previous_item(fs_info->chunk_root, path,
1790                                   BTRFS_DEV_ITEMS_OBJECTID,
1791                                   BTRFS_DEV_ITEM_KEY);
1792         if (ret) {
1793                 *devid_ret = 1;
1794         } else {
1795                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1796                                       path->slots[0]);
1797                 *devid_ret = found_key.offset + 1;
1798         }
1799         ret = 0;
1800 error:
1801         btrfs_free_path(path);
1802         return ret;
1803 }
1804
1805 /*
1806  * the device information is stored in the chunk root
1807  * the btrfs_device struct should be fully filled in
1808  */
1809 static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
1810                             struct btrfs_device *device)
1811 {
1812         int ret;
1813         struct btrfs_path *path;
1814         struct btrfs_dev_item *dev_item;
1815         struct extent_buffer *leaf;
1816         struct btrfs_key key;
1817         unsigned long ptr;
1818
1819         path = btrfs_alloc_path();
1820         if (!path)
1821                 return -ENOMEM;
1822
1823         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1824         key.type = BTRFS_DEV_ITEM_KEY;
1825         key.offset = device->devid;
1826
1827         btrfs_reserve_chunk_metadata(trans, true);
1828         ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
1829                                       &key, sizeof(*dev_item));
1830         btrfs_trans_release_chunk_metadata(trans);
1831         if (ret)
1832                 goto out;
1833
1834         leaf = path->nodes[0];
1835         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1836
1837         btrfs_set_device_id(leaf, dev_item, device->devid);
1838         btrfs_set_device_generation(leaf, dev_item, 0);
1839         btrfs_set_device_type(leaf, dev_item, device->type);
1840         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1841         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1842         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1843         btrfs_set_device_total_bytes(leaf, dev_item,
1844                                      btrfs_device_get_disk_total_bytes(device));
1845         btrfs_set_device_bytes_used(leaf, dev_item,
1846                                     btrfs_device_get_bytes_used(device));
1847         btrfs_set_device_group(leaf, dev_item, 0);
1848         btrfs_set_device_seek_speed(leaf, dev_item, 0);
1849         btrfs_set_device_bandwidth(leaf, dev_item, 0);
1850         btrfs_set_device_start_offset(leaf, dev_item, 0);
1851
1852         ptr = btrfs_device_uuid(dev_item);
1853         write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1854         ptr = btrfs_device_fsid(dev_item);
1855         write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
1856                             ptr, BTRFS_FSID_SIZE);
1857         btrfs_mark_buffer_dirty(leaf);
1858
1859         ret = 0;
1860 out:
1861         btrfs_free_path(path);
1862         return ret;
1863 }
1864
1865 /*
1866  * Function to update ctime/mtime for a given device path.
1867  * Mainly used for ctime/mtime based probe like libblkid.
1868  *
1869  * We don't care about errors here, this is just to be kind to userspace.
1870  */
1871 static void update_dev_time(const char *device_path)
1872 {
1873         struct path path;
1874         struct timespec64 now;
1875         int ret;
1876
1877         ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
1878         if (ret)
1879                 return;
1880
1881         now = current_time(d_inode(path.dentry));
1882         inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME);
1883         path_put(&path);
1884 }
1885
1886 static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans,
1887                              struct btrfs_device *device)
1888 {
1889         struct btrfs_root *root = device->fs_info->chunk_root;
1890         int ret;
1891         struct btrfs_path *path;
1892         struct btrfs_key key;
1893
1894         path = btrfs_alloc_path();
1895         if (!path)
1896                 return -ENOMEM;
1897
1898         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1899         key.type = BTRFS_DEV_ITEM_KEY;
1900         key.offset = device->devid;
1901
1902         btrfs_reserve_chunk_metadata(trans, false);
1903         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1904         btrfs_trans_release_chunk_metadata(trans);
1905         if (ret) {
1906                 if (ret > 0)
1907                         ret = -ENOENT;
1908                 goto out;
1909         }
1910
1911         ret = btrfs_del_item(trans, root, path);
1912 out:
1913         btrfs_free_path(path);
1914         return ret;
1915 }
1916
1917 /*
1918  * Verify that @num_devices satisfies the RAID profile constraints in the whole
1919  * filesystem. It's up to the caller to adjust that number regarding eg. device
1920  * replace.
1921  */
1922 static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
1923                 u64 num_devices)
1924 {
1925         u64 all_avail;
1926         unsigned seq;
1927         int i;
1928
1929         do {
1930                 seq = read_seqbegin(&fs_info->profiles_lock);
1931
1932                 all_avail = fs_info->avail_data_alloc_bits |
1933                             fs_info->avail_system_alloc_bits |
1934                             fs_info->avail_metadata_alloc_bits;
1935         } while (read_seqretry(&fs_info->profiles_lock, seq));
1936
1937         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
1938                 if (!(all_avail & btrfs_raid_array[i].bg_flag))
1939                         continue;
1940
1941                 if (num_devices < btrfs_raid_array[i].devs_min)
1942                         return btrfs_raid_array[i].mindev_error;
1943         }
1944
1945         return 0;
1946 }
1947
1948 static struct btrfs_device * btrfs_find_next_active_device(
1949                 struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
1950 {
1951         struct btrfs_device *next_device;
1952
1953         list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
1954                 if (next_device != device &&
1955                     !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
1956                     && next_device->bdev)
1957                         return next_device;
1958         }
1959
1960         return NULL;
1961 }
1962
1963 /*
1964  * Helper function to check if the given device is part of s_bdev / latest_dev
1965  * and replace it with the provided or the next active device, in the context
1966  * where this function called, there should be always be another device (or
1967  * this_dev) which is active.
1968  */
1969 void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
1970                                             struct btrfs_device *next_device)
1971 {
1972         struct btrfs_fs_info *fs_info = device->fs_info;
1973
1974         if (!next_device)
1975                 next_device = btrfs_find_next_active_device(fs_info->fs_devices,
1976                                                             device);
1977         ASSERT(next_device);
1978
1979         if (fs_info->sb->s_bdev &&
1980                         (fs_info->sb->s_bdev == device->bdev))
1981                 fs_info->sb->s_bdev = next_device->bdev;
1982
1983         if (fs_info->fs_devices->latest_dev->bdev == device->bdev)
1984                 fs_info->fs_devices->latest_dev = next_device;
1985 }
1986
1987 /*
1988  * Return btrfs_fs_devices::num_devices excluding the device that's being
1989  * currently replaced.
1990  */
1991 static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
1992 {
1993         u64 num_devices = fs_info->fs_devices->num_devices;
1994
1995         down_read(&fs_info->dev_replace.rwsem);
1996         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
1997                 ASSERT(num_devices > 1);
1998                 num_devices--;
1999         }
2000         up_read(&fs_info->dev_replace.rwsem);
2001
2002         return num_devices;
2003 }
2004
2005 void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
2006                                struct block_device *bdev,
2007                                const char *device_path)
2008 {
2009         struct btrfs_super_block *disk_super;
2010         int copy_num;
2011
2012         if (!bdev)
2013                 return;
2014
2015         for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
2016                 struct page *page;
2017                 int ret;
2018
2019                 disk_super = btrfs_read_dev_one_super(bdev, copy_num);
2020                 if (IS_ERR(disk_super))
2021                         continue;
2022
2023                 if (bdev_is_zoned(bdev)) {
2024                         btrfs_reset_sb_log_zones(bdev, copy_num);
2025                         continue;
2026                 }
2027
2028                 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
2029
2030                 page = virt_to_page(disk_super);
2031                 set_page_dirty(page);
2032                 lock_page(page);
2033                 /* write_on_page() unlocks the page */
2034                 ret = write_one_page(page);
2035                 if (ret)
2036                         btrfs_warn(fs_info,
2037                                 "error clearing superblock number %d (%d)",
2038                                 copy_num, ret);
2039                 btrfs_release_disk_super(disk_super);
2040
2041         }
2042
2043         /* Notify udev that device has changed */
2044         btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
2045
2046         /* Update ctime/mtime for device path for libblkid */
2047         update_dev_time(device_path);
2048 }
2049
2050 int btrfs_rm_device(struct btrfs_fs_info *fs_info,
2051                     struct btrfs_dev_lookup_args *args,
2052                     struct block_device **bdev, fmode_t *mode)
2053 {
2054         struct btrfs_trans_handle *trans;
2055         struct btrfs_device *device;
2056         struct btrfs_fs_devices *cur_devices;
2057         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2058         u64 num_devices;
2059         int ret = 0;
2060
2061         if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
2062                 btrfs_err(fs_info, "device remove not supported on extent tree v2 yet");
2063                 return -EINVAL;
2064         }
2065
2066         /*
2067          * The device list in fs_devices is accessed without locks (neither
2068          * uuid_mutex nor device_list_mutex) as it won't change on a mounted
2069          * filesystem and another device rm cannot run.
2070          */
2071         num_devices = btrfs_num_devices(fs_info);
2072
2073         ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
2074         if (ret)
2075                 return ret;
2076
2077         device = btrfs_find_device(fs_info->fs_devices, args);
2078         if (!device) {
2079                 if (args->missing)
2080                         ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
2081                 else
2082                         ret = -ENOENT;
2083                 return ret;
2084         }
2085
2086         if (btrfs_pinned_by_swapfile(fs_info, device)) {
2087                 btrfs_warn_in_rcu(fs_info,
2088                   "cannot remove device %s (devid %llu) due to active swapfile",
2089                                   rcu_str_deref(device->name), device->devid);
2090                 return -ETXTBSY;
2091         }
2092
2093         if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
2094                 return BTRFS_ERROR_DEV_TGT_REPLACE;
2095
2096         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
2097             fs_info->fs_devices->rw_devices == 1)
2098                 return BTRFS_ERROR_DEV_ONLY_WRITABLE;
2099
2100         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2101                 mutex_lock(&fs_info->chunk_mutex);
2102                 list_del_init(&device->dev_alloc_list);
2103                 device->fs_devices->rw_devices--;
2104                 mutex_unlock(&fs_info->chunk_mutex);
2105         }
2106
2107         ret = btrfs_shrink_device(device, 0);
2108         if (ret)
2109                 goto error_undo;
2110
2111         trans = btrfs_start_transaction(fs_info->chunk_root, 0);
2112         if (IS_ERR(trans)) {
2113                 ret = PTR_ERR(trans);
2114                 goto error_undo;
2115         }
2116
2117         ret = btrfs_rm_dev_item(trans, device);
2118         if (ret) {
2119                 /* Any error in dev item removal is critical */
2120                 btrfs_crit(fs_info,
2121                            "failed to remove device item for devid %llu: %d",
2122                            device->devid, ret);
2123                 btrfs_abort_transaction(trans, ret);
2124                 btrfs_end_transaction(trans);
2125                 return ret;
2126         }
2127
2128         clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2129         btrfs_scrub_cancel_dev(device);
2130
2131         /*
2132          * the device list mutex makes sure that we don't change
2133          * the device list while someone else is writing out all
2134          * the device supers. Whoever is writing all supers, should
2135          * lock the device list mutex before getting the number of
2136          * devices in the super block (super_copy). Conversely,
2137          * whoever updates the number of devices in the super block
2138          * (super_copy) should hold the device list mutex.
2139          */
2140
2141         /*
2142          * In normal cases the cur_devices == fs_devices. But in case
2143          * of deleting a seed device, the cur_devices should point to
2144          * its own fs_devices listed under the fs_devices->seed_list.
2145          */
2146         cur_devices = device->fs_devices;
2147         mutex_lock(&fs_devices->device_list_mutex);
2148         list_del_rcu(&device->dev_list);
2149
2150         cur_devices->num_devices--;
2151         cur_devices->total_devices--;
2152         /* Update total_devices of the parent fs_devices if it's seed */
2153         if (cur_devices != fs_devices)
2154                 fs_devices->total_devices--;
2155
2156         if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
2157                 cur_devices->missing_devices--;
2158
2159         btrfs_assign_next_active_device(device, NULL);
2160
2161         if (device->bdev) {
2162                 cur_devices->open_devices--;
2163                 /* remove sysfs entry */
2164                 btrfs_sysfs_remove_device(device);
2165         }
2166
2167         num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
2168         btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
2169         mutex_unlock(&fs_devices->device_list_mutex);
2170
2171         /*
2172          * At this point, the device is zero sized and detached from the
2173          * devices list.  All that's left is to zero out the old supers and
2174          * free the device.
2175          *
2176          * We cannot call btrfs_close_bdev() here because we're holding the sb
2177          * write lock, and blkdev_put() will pull in the ->open_mutex on the
2178          * block device and it's dependencies.  Instead just flush the device
2179          * and let the caller do the final blkdev_put.
2180          */
2181         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2182                 btrfs_scratch_superblocks(fs_info, device->bdev,
2183                                           device->name->str);
2184                 if (device->bdev) {
2185                         sync_blockdev(device->bdev);
2186                         invalidate_bdev(device->bdev);
2187                 }
2188         }
2189
2190         *bdev = device->bdev;
2191         *mode = device->mode;
2192         synchronize_rcu();
2193         btrfs_free_device(device);
2194
2195         /*
2196          * This can happen if cur_devices is the private seed devices list.  We
2197          * cannot call close_fs_devices() here because it expects the uuid_mutex
2198          * to be held, but in fact we don't need that for the private
2199          * seed_devices, we can simply decrement cur_devices->opened and then
2200          * remove it from our list and free the fs_devices.
2201          */
2202         if (cur_devices->num_devices == 0) {
2203                 list_del_init(&cur_devices->seed_list);
2204                 ASSERT(cur_devices->opened == 1);
2205                 cur_devices->opened--;
2206                 free_fs_devices(cur_devices);
2207         }
2208
2209         ret = btrfs_commit_transaction(trans);
2210
2211         return ret;
2212
2213 error_undo:
2214         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2215                 mutex_lock(&fs_info->chunk_mutex);
2216                 list_add(&device->dev_alloc_list,
2217                          &fs_devices->alloc_list);
2218                 device->fs_devices->rw_devices++;
2219                 mutex_unlock(&fs_info->chunk_mutex);
2220         }
2221         return ret;
2222 }
2223
2224 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
2225 {
2226         struct btrfs_fs_devices *fs_devices;
2227
2228         lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);
2229
2230         /*
2231          * in case of fs with no seed, srcdev->fs_devices will point
2232          * to fs_devices of fs_info. However when the dev being replaced is
2233          * a seed dev it will point to the seed's local fs_devices. In short
2234          * srcdev will have its correct fs_devices in both the cases.
2235          */
2236         fs_devices = srcdev->fs_devices;
2237
2238         list_del_rcu(&srcdev->dev_list);
2239         list_del(&srcdev->dev_alloc_list);
2240         fs_devices->num_devices--;
2241         if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
2242                 fs_devices->missing_devices--;
2243
2244         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
2245                 fs_devices->rw_devices--;
2246
2247         if (srcdev->bdev)
2248                 fs_devices->open_devices--;
2249 }
2250
2251 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
2252 {
2253         struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
2254
2255         mutex_lock(&uuid_mutex);
2256
2257         btrfs_close_bdev(srcdev);
2258         synchronize_rcu();
2259         btrfs_free_device(srcdev);
2260
2261         /* if this is no devs we rather delete the fs_devices */
2262         if (!fs_devices->num_devices) {
2263                 /*
2264                  * On a mounted FS, num_devices can't be zero unless it's a
2265                  * seed. In case of a seed device being replaced, the replace
2266                  * target added to the sprout FS, so there will be no more
2267                  * device left under the seed FS.
2268                  */
2269                 ASSERT(fs_devices->seeding);
2270
2271                 list_del_init(&fs_devices->seed_list);
2272                 close_fs_devices(fs_devices);
2273                 free_fs_devices(fs_devices);
2274         }
2275         mutex_unlock(&uuid_mutex);
2276 }
2277
2278 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
2279 {
2280         struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;
2281
2282         mutex_lock(&fs_devices->device_list_mutex);
2283
2284         btrfs_sysfs_remove_device(tgtdev);
2285
2286         if (tgtdev->bdev)
2287                 fs_devices->open_devices--;
2288
2289         fs_devices->num_devices--;
2290
2291         btrfs_assign_next_active_device(tgtdev, NULL);
2292
2293         list_del_rcu(&tgtdev->dev_list);
2294
2295         mutex_unlock(&fs_devices->device_list_mutex);
2296
2297         btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
2298                                   tgtdev->name->str);
2299
2300         btrfs_close_bdev(tgtdev);
2301         synchronize_rcu();
2302         btrfs_free_device(tgtdev);
2303 }
2304
2305 /**
2306  * Populate args from device at path
2307  *
2308  * @fs_info:    the filesystem
2309  * @args:       the args to populate
2310  * @path:       the path to the device
2311  *
2312  * This will read the super block of the device at @path and populate @args with
2313  * the devid, fsid, and uuid.  This is meant to be used for ioctls that need to
2314  * lookup a device to operate on, but need to do it before we take any locks.
2315  * This properly handles the special case of "missing" that a user may pass in,
2316  * and does some basic sanity checks.  The caller must make sure that @path is
2317  * properly NUL terminated before calling in, and must call
2318  * btrfs_put_dev_args_from_path() in order to free up the temporary fsid and
2319  * uuid buffers.
2320  *
2321  * Return: 0 for success, -errno for failure
2322  */
2323 int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
2324                                  struct btrfs_dev_lookup_args *args,
2325                                  const char *path)
2326 {
2327         struct btrfs_super_block *disk_super;
2328         struct block_device *bdev;
2329         int ret;
2330
2331         if (!path || !path[0])
2332                 return -EINVAL;
2333         if (!strcmp(path, "missing")) {
2334                 args->missing = true;
2335                 return 0;
2336         }
2337
2338         args->uuid = kzalloc(BTRFS_UUID_SIZE, GFP_KERNEL);
2339         args->fsid = kzalloc(BTRFS_FSID_SIZE, GFP_KERNEL);
2340         if (!args->uuid || !args->fsid) {
2341                 btrfs_put_dev_args_from_path(args);
2342                 return -ENOMEM;
2343         }
2344
2345         ret = btrfs_get_bdev_and_sb(path, FMODE_READ, fs_info->bdev_holder, 0,
2346                                     &bdev, &disk_super);
2347         if (ret)
2348                 return ret;
2349         args->devid = btrfs_stack_device_id(&disk_super->dev_item);
2350         memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE);
2351         if (btrfs_fs_incompat(fs_info, METADATA_UUID))
2352                 memcpy(args->fsid, disk_super->metadata_uuid, BTRFS_FSID_SIZE);
2353         else
2354                 memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
2355         btrfs_release_disk_super(disk_super);
2356         blkdev_put(bdev, FMODE_READ);
2357         return 0;
2358 }
2359
2360 /*
2361  * Only use this jointly with btrfs_get_dev_args_from_path() because we will
2362  * allocate our ->uuid and ->fsid pointers, everybody else uses local variables
2363  * that don't need to be freed.
2364  */
2365 void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args)
2366 {
2367         kfree(args->uuid);
2368         kfree(args->fsid);
2369         args->uuid = NULL;
2370         args->fsid = NULL;
2371 }
2372
2373 struct btrfs_device *btrfs_find_device_by_devspec(
2374                 struct btrfs_fs_info *fs_info, u64 devid,
2375                 const char *device_path)
2376 {
2377         BTRFS_DEV_LOOKUP_ARGS(args);
2378         struct btrfs_device *device;
2379         int ret;
2380
2381         if (devid) {
2382                 args.devid = devid;
2383                 device = btrfs_find_device(fs_info->fs_devices, &args);
2384                 if (!device)
2385                         return ERR_PTR(-ENOENT);
2386                 return device;
2387         }
2388
2389         ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path);
2390         if (ret)
2391                 return ERR_PTR(ret);
2392         device = btrfs_find_device(fs_info->fs_devices, &args);
2393         btrfs_put_dev_args_from_path(&args);
2394         if (!device)
2395                 return ERR_PTR(-ENOENT);
2396         return device;
2397 }
2398
2399 static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info)
2400 {
2401         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2402         struct btrfs_fs_devices *old_devices;
2403         struct btrfs_fs_devices *seed_devices;
2404
2405         lockdep_assert_held(&uuid_mutex);
2406         if (!fs_devices->seeding)
2407                 return ERR_PTR(-EINVAL);
2408
2409         /*
2410          * Private copy of the seed devices, anchored at
2411          * fs_info->fs_devices->seed_list
2412          */
2413         seed_devices = alloc_fs_devices(NULL, NULL);
2414         if (IS_ERR(seed_devices))
2415                 return seed_devices;
2416
2417         /*
2418          * It's necessary to retain a copy of the original seed fs_devices in
2419          * fs_uuids so that filesystems which have been seeded can successfully
2420          * reference the seed device from open_seed_devices. This also supports
2421          * multiple fs seed.
2422          */
2423         old_devices = clone_fs_devices(fs_devices);
2424         if (IS_ERR(old_devices)) {
2425                 kfree(seed_devices);
2426                 return old_devices;
2427         }
2428
2429         list_add(&old_devices->fs_list, &fs_uuids);
2430
2431         memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
2432         seed_devices->opened = 1;
2433         INIT_LIST_HEAD(&seed_devices->devices);
2434         INIT_LIST_HEAD(&seed_devices->alloc_list);
2435         mutex_init(&seed_devices->device_list_mutex);
2436
2437         return seed_devices;
2438 }
2439
2440 /*
2441  * Splice seed devices into the sprout fs_devices.
2442  * Generate a new fsid for the sprouted read-write filesystem.
2443  */
2444 static void btrfs_setup_sprout(struct btrfs_fs_info *fs_info,
2445                                struct btrfs_fs_devices *seed_devices)
2446 {
2447         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2448         struct btrfs_super_block *disk_super = fs_info->super_copy;
2449         struct btrfs_device *device;
2450         u64 super_flags;
2451
2452         /*
2453          * We are updating the fsid, the thread leading to device_list_add()
2454          * could race, so uuid_mutex is needed.
2455          */
2456         lockdep_assert_held(&uuid_mutex);
2457
2458         /*
2459          * The threads listed below may traverse dev_list but can do that without
2460          * device_list_mutex:
2461          * - All device ops and balance - as we are in btrfs_exclop_start.
2462          * - Various dev_list readers - are using RCU.
2463          * - btrfs_ioctl_fitrim() - is using RCU.
2464          *
2465          * For-read threads as below are using device_list_mutex:
2466          * - Readonly scrub btrfs_scrub_dev()
2467          * - Readonly scrub btrfs_scrub_progress()
2468          * - btrfs_get_dev_stats()
2469          */
2470         lockdep_assert_held(&fs_devices->device_list_mutex);
2471
2472         list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
2473                               synchronize_rcu);
2474         list_for_each_entry(device, &seed_devices->devices, dev_list)
2475                 device->fs_devices = seed_devices;
2476
2477         fs_devices->seeding = false;
2478         fs_devices->num_devices = 0;
2479         fs_devices->open_devices = 0;
2480         fs_devices->missing_devices = 0;
2481         fs_devices->rotating = false;
2482         list_add(&seed_devices->seed_list, &fs_devices->seed_list);
2483
2484         generate_random_uuid(fs_devices->fsid);
2485         memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
2486         memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2487
2488         super_flags = btrfs_super_flags(disk_super) &
2489                       ~BTRFS_SUPER_FLAG_SEEDING;
2490         btrfs_set_super_flags(disk_super, super_flags);
2491 }
2492
2493 /*
2494  * Store the expected generation for seed devices in device items.
2495  */
2496 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
2497 {
2498         BTRFS_DEV_LOOKUP_ARGS(args);
2499         struct btrfs_fs_info *fs_info = trans->fs_info;
2500         struct btrfs_root *root = fs_info->chunk_root;
2501         struct btrfs_path *path;
2502         struct extent_buffer *leaf;
2503         struct btrfs_dev_item *dev_item;
2504         struct btrfs_device *device;
2505         struct btrfs_key key;
2506         u8 fs_uuid[BTRFS_FSID_SIZE];
2507         u8 dev_uuid[BTRFS_UUID_SIZE];
2508         int ret;
2509
2510         path = btrfs_alloc_path();
2511         if (!path)
2512                 return -ENOMEM;
2513
2514         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2515         key.offset = 0;
2516         key.type = BTRFS_DEV_ITEM_KEY;
2517
2518         while (1) {
2519                 btrfs_reserve_chunk_metadata(trans, false);
2520                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2521                 btrfs_trans_release_chunk_metadata(trans);
2522                 if (ret < 0)
2523                         goto error;
2524
2525                 leaf = path->nodes[0];
2526 next_slot:
2527                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2528                         ret = btrfs_next_leaf(root, path);
2529                         if (ret > 0)
2530                                 break;
2531                         if (ret < 0)
2532                                 goto error;
2533                         leaf = path->nodes[0];
2534                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2535                         btrfs_release_path(path);
2536                         continue;
2537                 }
2538
2539                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2540                 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
2541                     key.type != BTRFS_DEV_ITEM_KEY)
2542                         break;
2543
2544                 dev_item = btrfs_item_ptr(leaf, path->slots[0],
2545                                           struct btrfs_dev_item);
2546                 args.devid = btrfs_device_id(leaf, dev_item);
2547                 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
2548                                    BTRFS_UUID_SIZE);
2549                 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
2550                                    BTRFS_FSID_SIZE);
2551                 args.uuid = dev_uuid;
2552                 args.fsid = fs_uuid;
2553                 device = btrfs_find_device(fs_info->fs_devices, &args);
2554                 BUG_ON(!device); /* Logic error */
2555
2556                 if (device->fs_devices->seeding) {
2557                         btrfs_set_device_generation(leaf, dev_item,
2558                                                     device->generation);
2559                         btrfs_mark_buffer_dirty(leaf);
2560                 }
2561
2562                 path->slots[0]++;
2563                 goto next_slot;
2564         }
2565         ret = 0;
2566 error:
2567         btrfs_free_path(path);
2568         return ret;
2569 }
2570
/*
 * Add the block device at @device_path to the filesystem as a new writeable
 * device.
 *
 * The device is opened exclusively, a btrfs_device is set up for it and linked
 * into fs_info->fs_devices, total_bytes and num_devices of the super block copy
 * are bumped, and a dev item is added before the transaction is committed.
 * If fs_devices->seeding is set, the filesystem is additionally sprouted
 * (btrfs_init_sprout(), btrfs_setup_sprout(), btrfs_finish_sprout()) while
 * sb->s_umount is held for writing together with uuid_mutex.
 *
 * Return: 0 on success, -EROFS if the filesystem is read-only and not seeding,
 * -EEXIST if the block device is already on this filesystem's device list, or
 * the negative errno of the step that failed.
 */
2571 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
2572 {
2573         struct btrfs_root *root = fs_info->dev_root;
2574         struct btrfs_trans_handle *trans;
2575         struct btrfs_device *device;
2576         struct block_device *bdev;
2577         struct super_block *sb = fs_info->sb;
2578         struct rcu_string *name;
2579         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2580         struct btrfs_fs_devices *seed_devices;
2581         u64 orig_super_total_bytes;
2582         u64 orig_super_num_devices;
2583         int ret = 0;
2584         bool seeding_dev = false;
2585         bool locked = false;
2586
2587         if (sb_rdonly(sb) && !fs_devices->seeding)
2588                 return -EROFS;
2589
2590         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2591                                   fs_info->bdev_holder);
2592         if (IS_ERR(bdev))
2593                 return PTR_ERR(bdev);
2594
2595         if (!btrfs_check_device_zone_type(fs_info, bdev)) {
2596                 ret = -EINVAL;
2597                 goto error;
2598         }
2599
             /*
              * Sprouting a seed filesystem: s_umount and uuid_mutex stay held
              * until after the transaction commit below.  'locked' tells the
              * error path whether it still has to release them.
              */
2600         if (fs_devices->seeding) {
2601                 seeding_dev = true;
2602                 down_write(&sb->s_umount);
2603                 mutex_lock(&uuid_mutex);
2604                 locked = true;
2605         }
2606
2607         sync_blockdev(bdev);
2608
             /* Refuse a block device that is already on this fs's device list. */
2609         rcu_read_lock();
2610         list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
2611                 if (device->bdev == bdev) {
2612                         ret = -EEXIST;
2613                         rcu_read_unlock();
2614                         goto error;
2615                 }
2616         }
2617         rcu_read_unlock();
2618
2619         device = btrfs_alloc_device(fs_info, NULL, NULL);
2620         if (IS_ERR(device)) {
2621                 /* we can safely leave the fs_devices entry around */
2622                 ret = PTR_ERR(device);
2623                 goto error;
2624         }
2625
2626         name = rcu_string_strdup(device_path, GFP_KERNEL);
2627         if (!name) {
2628                 ret = -ENOMEM;
2629                 goto error_free_device;
2630         }
2631         rcu_assign_pointer(device->name, name);
2632
2633         device->fs_info = fs_info;
2634         device->bdev = bdev;
2635         ret = lookup_bdev(device_path, &device->devt);
2636         if (ret)
2637                 goto error_free_device;
2638
2639         ret = btrfs_get_dev_zone_info(device, false);
2640         if (ret)
2641                 goto error_free_device;
2642
2643         trans = btrfs_start_transaction(root, 0);
2644         if (IS_ERR(trans)) {
2645                 ret = PTR_ERR(trans);
2646                 goto error_free_zone;
2647         }
2648
             /* Fill in the in-memory device; sizes are based on the fs sectorsize. */
2649         set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
2650         device->generation = trans->transid;
2651         device->io_width = fs_info->sectorsize;
2652         device->io_align = fs_info->sectorsize;
2653         device->sector_size = fs_info->sectorsize;
2654         device->total_bytes =
2655                 round_down(bdev_nr_bytes(bdev), fs_info->sectorsize);
2656         device->disk_total_bytes = device->total_bytes;
2657         device->commit_total_bytes = device->total_bytes;
2658         set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2659         clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
2660         device->mode = FMODE_EXCL;
2661         device->dev_stats_valid = 1;
2662         set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
2663
2664         if (seeding_dev) {
2665                 btrfs_clear_sb_rdonly(sb);
2666
2667                 /* GFP_KERNEL allocation must not be under device_list_mutex */
2668                 seed_devices = btrfs_init_sprout(fs_info);
2669                 if (IS_ERR(seed_devices)) {
2670                         ret = PTR_ERR(seed_devices);
2671                         btrfs_abort_transaction(trans, ret);
2672                         goto error_trans;
2673                 }
2674         }
2675
2676         mutex_lock(&fs_devices->device_list_mutex);
2677         if (seeding_dev) {
2678                 btrfs_setup_sprout(fs_info, seed_devices);
2679                 btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev,
2680                                                 device);
2681         }
2682
2683         device->fs_devices = fs_devices;
2684
             /*
              * Link the device in and account for it.  The original super block
              * totals are saved so that error_sysfs can restore them.
              */
2685         mutex_lock(&fs_info->chunk_mutex);
2686         list_add_rcu(&device->dev_list, &fs_devices->devices);
2687         list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
2688         fs_devices->num_devices++;
2689         fs_devices->open_devices++;
2690         fs_devices->rw_devices++;
2691         fs_devices->total_devices++;
2692         fs_devices->total_rw_bytes += device->total_bytes;
2693
2694         atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
2695
2696         if (!bdev_nonrot(bdev))
2697                 fs_devices->rotating = true;
2698
2699         orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
2700         btrfs_set_super_total_bytes(fs_info->super_copy,
2701                 round_down(orig_super_total_bytes + device->total_bytes,
2702                            fs_info->sectorsize));
2703
2704         orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
2705         btrfs_set_super_num_devices(fs_info->super_copy,
2706                                     orig_super_num_devices + 1);
2707
2708         /*
2709          * we've got more storage, clear any full flags on the space
2710          * infos
2711          */
2712         btrfs_clear_space_info_full(fs_info);
2713
2714         mutex_unlock(&fs_info->chunk_mutex);
2715
2716         /* Add sysfs device entry */
2717         btrfs_sysfs_add_device(device);
2718
2719         mutex_unlock(&fs_devices->device_list_mutex);
2720
2721         if (seeding_dev) {
2722                 mutex_lock(&fs_info->chunk_mutex);
2723                 ret = init_first_rw_device(trans);
2724                 mutex_unlock(&fs_info->chunk_mutex);
2725                 if (ret) {
2726                         btrfs_abort_transaction(trans, ret);
2727                         goto error_sysfs;
2728                 }
2729         }
2730
2731         ret = btrfs_add_dev_item(trans, device);
2732         if (ret) {
2733                 btrfs_abort_transaction(trans, ret);
2734                 goto error_sysfs;
2735         }
2736
2737         if (seeding_dev) {
2738                 ret = btrfs_finish_sprout(trans);
2739                 if (ret) {
2740                         btrfs_abort_transaction(trans, ret);
2741                         goto error_sysfs;
2742                 }
2743
2744                 /*
2745                  * fs_devices now represents the newly sprouted filesystem and
2746                  * its fsid has been changed by btrfs_setup_sprout().
2747                  */
2748                 btrfs_sysfs_update_sprout_fsid(fs_devices);
2749         }
2750
2751         ret = btrfs_commit_transaction(trans);
2752
2753         if (seeding_dev) {
                     /* From here on the error path must not unlock again. */
2754                 mutex_unlock(&uuid_mutex);
2755                 up_write(&sb->s_umount);
2756                 locked = false;
2757
2758                 if (ret) /* transaction commit */
2759                         return ret;
2760
2761                 ret = btrfs_relocate_sys_chunks(fs_info);
2762                 if (ret < 0)
2763                         btrfs_handle_fs_error(fs_info, ret,
2764                                     "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
2765                 trans = btrfs_attach_transaction(root);
2766                 if (IS_ERR(trans)) {
                             /* -ENOENT is not treated as a failure here. */
2767                         if (PTR_ERR(trans) == -ENOENT)
2768                                 return 0;
2769                         ret = PTR_ERR(trans);
2770                         trans = NULL;
2771                         goto error_sysfs;
2772                 }
2773                 ret = btrfs_commit_transaction(trans);
2774         }
2775
2776         /*
2777          * Now that we have written a new super block to this device, check all
2778          * other fs_devices list if device_path alienates any other scanned
2779          * device.
2780          * We can ignore the return value as it typically returns -EINVAL and
2781          * only succeeds if the device was an alien.
2782          */
2783         btrfs_forget_devices(device->devt);
2784
2785         /* Update ctime/mtime for blkid or udev */
2786         update_dev_time(device_path);
2787
2788         return ret;
2789
     /*
      * Error unwinding.  Each label undoes one more step and falls through to
      * the labels below it.
      */
2790 error_sysfs:
             /* Unlink the device and revert the accounting and super block totals. */
2791         btrfs_sysfs_remove_device(device);
2792         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2793         mutex_lock(&fs_info->chunk_mutex);
2794         list_del_rcu(&device->dev_list);
2795         list_del(&device->dev_alloc_list);
2796         fs_info->fs_devices->num_devices--;
2797         fs_info->fs_devices->open_devices--;
2798         fs_info->fs_devices->rw_devices--;
2799         fs_info->fs_devices->total_devices--;
2800         fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
2801         atomic64_sub(device->total_bytes, &fs_info->free_chunk_space);
2802         btrfs_set_super_total_bytes(fs_info->super_copy,
2803                                     orig_super_total_bytes);
2804         btrfs_set_super_num_devices(fs_info->super_copy,
2805                                     orig_super_num_devices);
2806         mutex_unlock(&fs_info->chunk_mutex);
2807         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2808 error_trans:
             /* Reverts the btrfs_clear_sb_rdonly() done for the seeding case. */
2809         if (seeding_dev)
2810                 btrfs_set_sb_rdonly(sb);
2811         if (trans)
2812                 btrfs_end_transaction(trans);
2813 error_free_zone:
2814         btrfs_destroy_dev_zone_info(device);
2815 error_free_device:
2816         btrfs_free_device(device);
2817 error:
2818         blkdev_put(bdev, FMODE_EXCL);
             /* Only true on the seeding path while both locks are still held. */
2819         if (locked) {
2820                 mutex_unlock(&uuid_mutex);
2821                 up_write(&sb->s_umount);
2822         }
2823         return ret;
2824 }
2825
2826 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2827                                         struct btrfs_device *device)
2828 {
2829         int ret;
2830         struct btrfs_path *path;
2831         struct btrfs_root *root = device->fs_info->chunk_root;
2832         struct btrfs_dev_item *dev_item;
2833         struct extent_buffer *leaf;
2834         struct btrfs_key key;
2835
2836         path = btrfs_alloc_path();
2837         if (!path)
2838                 return -ENOMEM;
2839
2840         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2841         key.type = BTRFS_DEV_ITEM_KEY;
2842         key.offset = device->devid;
2843
2844         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2845         if (ret < 0)
2846                 goto out;
2847
2848         if (ret > 0) {
2849                 ret = -ENOENT;
2850                 goto out;
2851         }
2852
2853         leaf = path->nodes[0];
2854         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2855
2856         btrfs_set_device_id(leaf, dev_item, device->devid);
2857         btrfs_set_device_type(leaf, dev_item, device->type);
2858         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2859         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2860         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2861         btrfs_set_device_total_bytes(leaf, dev_item,
2862                                      btrfs_device_get_disk_total_bytes(device));
2863         btrfs_set_device_bytes_used(leaf, dev_item,
2864                                     btrfs_device_get_bytes_used(device));
2865         btrfs_mark_buffer_dirty(leaf);
2866
2867 out:
2868         btrfs_free_path(path);
2869         return ret;
2870 }
2871
2872 int btrfs_grow_device(struct btrfs_trans_handle *trans,
2873                       struct btrfs_device *device, u64 new_size)
2874 {
2875         struct btrfs_fs_info *fs_info = device->fs_info;
2876         struct btrfs_super_block *super_copy = fs_info->super_copy;
2877         u64 old_total;
2878         u64 diff;
2879         int ret;
2880
2881         if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2882                 return -EACCES;
2883
2884         new_size = round_down(new_size, fs_info->sectorsize);
2885
2886         mutex_lock(&fs_info->chunk_mutex);
2887         old_total = btrfs_super_total_bytes(super_copy);
2888         diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
2889
2890         if (new_size <= device->total_bytes ||
2891             test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2892                 mutex_unlock(&fs_info->chunk_mutex);
2893                 return -EINVAL;
2894         }
2895
2896         btrfs_set_super_total_bytes(super_copy,
2897                         round_down(old_total + diff, fs_info->sectorsize));
2898         device->fs_devices->total_rw_bytes += diff;
2899
2900         btrfs_device_set_total_bytes(device, new_size);
2901         btrfs_device_set_disk_total_bytes(device, new_size);
2902         btrfs_clear_space_info_full(device->fs_info);
2903         if (list_empty(&device->post_commit_list))
2904                 list_add_tail(&device->post_commit_list,
2905                               &trans->transaction->dev_update_list);
2906         mutex_unlock(&fs_info->chunk_mutex);
2907
2908         btrfs_reserve_chunk_metadata(trans, false);
2909         ret = btrfs_update_device(trans, device);
2910         btrfs_trans_release_chunk_metadata(trans);
2911
2912         return ret;
2913 }
2914
2915 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
2916 {
2917         struct btrfs_fs_info *fs_info = trans->fs_info;
2918         struct btrfs_root *root = fs_info->chunk_root;
2919         int ret;
2920         struct btrfs_path *path;
2921         struct btrfs_key key;
2922
2923         path = btrfs_alloc_path();
2924         if (!path)
2925                 return -ENOMEM;
2926
2927         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2928         key.offset = chunk_offset;
2929         key.type = BTRFS_CHUNK_ITEM_KEY;
2930
2931         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2932         if (ret < 0)
2933                 goto out;
2934         else if (ret > 0) { /* Logic error or corruption */
2935                 btrfs_handle_fs_error(fs_info, -ENOENT,
2936                                       "Failed lookup while freeing chunk.");
2937                 ret = -ENOENT;
2938                 goto out;
2939         }
2940
2941         ret = btrfs_del_item(trans, root, path);
2942         if (ret < 0)
2943                 btrfs_handle_fs_error(fs_info, ret,
2944                                       "Failed to delete chunk item.");
2945 out:
2946         btrfs_free_path(path);
2947         return ret;
2948 }
2949
2950 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2951 {
2952         struct btrfs_super_block *super_copy = fs_info->super_copy;
2953         struct btrfs_disk_key *disk_key;
2954         struct btrfs_chunk *chunk;
2955         u8 *ptr;
2956         int ret = 0;
2957         u32 num_stripes;
2958         u32 array_size;
2959         u32 len = 0;
2960         u32 cur;
2961         struct btrfs_key key;
2962
2963         lockdep_assert_held(&fs_info->chunk_mutex);
2964         array_size = btrfs_super_sys_array_size(super_copy);
2965
2966         ptr = super_copy->sys_chunk_array;
2967         cur = 0;
2968
2969         while (cur < array_size) {
2970                 disk_key = (struct btrfs_disk_key *)ptr;
2971                 btrfs_disk_key_to_cpu(&key, disk_key);
2972
2973                 len = sizeof(*disk_key);
2974
2975                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
2976                         chunk = (struct btrfs_chunk *)(ptr + len);
2977                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
2978                         len += btrfs_chunk_item_size(num_stripes);
2979                 } else {
2980                         ret = -EIO;
2981                         break;
2982                 }
2983                 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
2984                     key.offset == chunk_offset) {
2985                         memmove(ptr, ptr + len, array_size - (cur + len));
2986                         array_size -= len;
2987                         btrfs_set_super_sys_array_size(super_copy, array_size);
2988                 } else {
2989                         ptr += len;
2990                         cur += len;
2991                 }
2992         }
2993         return ret;
2994 }
2995
2996 /*
2997  * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
2998  * @logical: Logical block offset in bytes.
2999  * @length: Length of extent in bytes.
3000  *
3001  * Return: Chunk mapping or ERR_PTR.
3002  */
3003 struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
3004                                        u64 logical, u64 length)
3005 {
3006         struct extent_map_tree *em_tree;
3007         struct extent_map *em;
3008
3009         em_tree = &fs_info->mapping_tree;
3010         read_lock(&em_tree->lock);
3011         em = lookup_extent_mapping(em_tree, logical, length);
3012         read_unlock(&em_tree->lock);
3013
3014         if (!em) {
3015                 btrfs_crit(fs_info, "unable to find logical %llu length %llu",
3016                            logical, length);
3017                 return ERR_PTR(-EINVAL);
3018         }
3019
3020         if (em->start > logical || em->start + em->len < logical) {
3021                 btrfs_crit(fs_info,
3022                            "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
3023                            logical, length, em->start, em->start + em->len);
3024                 free_extent_map(em);
3025                 return ERR_PTR(-EINVAL);
3026         }
3027
3028         /* callers are responsible for dropping em's ref. */
3029         return em;
3030 }
3031
3032 static int remove_chunk_item(struct btrfs_trans_handle *trans,
3033                              struct map_lookup *map, u64 chunk_offset)
3034 {
3035         int i;
3036
3037         /*
3038          * Removing chunk items and updating the device items in the chunks btree
3039          * requires holding the chunk_mutex.
3040          * See the comment at btrfs_chunk_alloc() for the details.
3041          */
3042         lockdep_assert_held(&trans->fs_info->chunk_mutex);
3043
3044         for (i = 0; i < map->num_stripes; i++) {
3045                 int ret;
3046
3047                 ret = btrfs_update_device(trans, map->stripes[i].dev);
3048                 if (ret)
3049                         return ret;
3050         }
3051
3052         return btrfs_free_chunk(trans, chunk_offset);
3053 }
3054
3055 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
3056 {
3057         struct btrfs_fs_info *fs_info = trans->fs_info;
3058         struct extent_map *em;
3059         struct map_lookup *map;
3060         u64 dev_extent_len = 0;
3061         int i, ret = 0;
3062         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
3063
3064         em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
3065         if (IS_ERR(em)) {
3066                 /*
3067                  * This is a logic error, but we don't want to just rely on the
3068                  * user having built with ASSERT enabled, so if ASSERT doesn't
3069                  * do anything we still error out.
3070                  */
3071                 ASSERT(0);
3072                 return PTR_ERR(em);
3073         }
3074         map = em->map_lookup;
3075
3076         /*
3077          * First delete the device extent items from the devices btree.
3078          * We take the device_list_mutex to avoid racing with the finishing phase
3079          * of a device replace operation. See the comment below before acquiring
3080          * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex
3081          * because that can result in a deadlock when deleting the device extent
3082          * items from the devices btree - COWing an extent buffer from the btree
3083          * may result in allocating a new metadata chunk, which would attempt to
3084          * lock again fs_info->chunk_mutex.
3085          */
3086         mutex_lock(&fs_devices->device_list_mutex);
3087         for (i = 0; i < map->num_stripes; i++) {
3088                 struct btrfs_device *device = map->stripes[i].dev;
3089                 ret = btrfs_free_dev_extent(trans, device,
3090                                             map->stripes[i].physical,
3091                                             &dev_extent_len);
3092                 if (ret) {
3093                         mutex_unlock(&fs_devices->device_list_mutex);
3094                         btrfs_abort_transaction(trans, ret);
3095                         goto out;
3096                 }
3097
3098                 if (device->bytes_used > 0) {
3099                         mutex_lock(&fs_info->chunk_mutex);
3100                         btrfs_device_set_bytes_used(device,
3101                                         device->bytes_used - dev_extent_len);
3102                         atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
3103                         btrfs_clear_space_info_full(fs_info);
3104                         mutex_unlock(&fs_info->chunk_mutex);
3105                 }
3106         }
3107         mutex_unlock(&fs_devices->device_list_mutex);
3108
3109         /*
3110          * We acquire fs_info->chunk_mutex for 2 reasons:
3111          *
3112          * 1) Just like with the first phase of the chunk allocation, we must
3113          *    reserve system space, do all chunk btree updates and deletions, and
3114          *    update the system chunk array in the superblock while holding this
3115          *    mutex. This is for similar reasons as explained on the comment at
3116          *    the top of btrfs_chunk_alloc();
3117          *
3118          * 2) Prevent races with the final phase of a device replace operation
3119          *    that replaces the device object associated with the map's stripes,
3120          *    because the device object's id can change at any time during that
3121          *    final phase of the device replace operation
3122          *    (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
3123          *    replaced device and then see it with an ID of
3124          *    BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating
3125          *    the device item, which does not exist on the chunk btree.
3126          *    The finishing phase of device replace acquires both the
3127          *    device_list_mutex and the chunk_mutex, in that order, so we are
3128          *    safe by just acquiring the chunk_mutex.
3129          */
3130         trans->removing_chunk = true;
3131         mutex_lock(&fs_info->chunk_mutex);
3132
3133         check_system_chunk(trans, map->type);
3134
3135         ret = remove_chunk_item(trans, map, chunk_offset);
3136         /*
3137          * Normally we should not get -ENOSPC since we reserved space before
3138          * through the call to check_system_chunk().
3139          *
3140          * Despite our system space_info having enough free space, we may not
3141          * be able to allocate extents from its block groups, because all have
3142          * an incompatible profile, which will force us to allocate a new system
3143          * block group with the right profile, or right after we called
3144          * check_system_chunk() above, a scrub turned the only system block group
3145          * with enough free space into RO mode.
3146          * This is explained with more detail at do_chunk_alloc().
3147          *
3148          * So if we get -ENOSPC, allocate a new system chunk and retry once.
3149          */
3150         if (ret == -ENOSPC) {
3151                 const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
3152                 struct btrfs_block_group *sys_bg;
3153
3154                 sys_bg = btrfs_create_chunk(trans, sys_flags);
3155                 if (IS_ERR(sys_bg)) {
3156                         ret = PTR_ERR(sys_bg);
3157                         btrfs_abort_transaction(trans, ret);
3158                         goto out;
3159                 }
3160
3161                 ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
3162                 if (ret) {
3163                         btrfs_abort_transaction(trans, ret);
3164                         goto out;
3165                 }
3166
3167                 ret = remove_chunk_item(trans, map, chunk_offset);
3168                 if (ret) {
3169                         btrfs_abort_transaction(trans, ret);
3170                         goto out;
3171                 }
3172         } else if (ret) {
3173                 btrfs_abort_transaction(trans, ret);
3174                 goto out;
3175         }
3176
3177         trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
3178
3179         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
3180                 ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
3181                 if (ret) {
3182                         btrfs_abort_transaction(trans, ret);
3183                         goto out;
3184                 }
3185         }
3186
3187         mutex_unlock(&fs_info->chunk_mutex);
3188         trans->removing_chunk = false;
3189
3190         /*
3191          * We are done with chunk btree updates and deletions, so release the
3192          * system space we previously reserved (with check_system_chunk()).
3193          */
3194         btrfs_trans_release_chunk_metadata(trans);
3195
3196         ret = btrfs_remove_block_group(trans, chunk_offset, em);
3197         if (ret) {
3198                 btrfs_abort_transaction(trans, ret);
3199                 goto out;
3200         }
3201
3202 out:
3203         if (trans->removing_chunk) {
3204                 mutex_unlock(&fs_info->chunk_mutex);
3205                 trans->removing_chunk = false;
3206         }
3207         /* once for us */
3208         free_extent_map(em);
3209         return ret;
3210 }
3211
3212 int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
3213 {
3214         struct btrfs_root *root = fs_info->chunk_root;
3215         struct btrfs_trans_handle *trans;
3216         struct btrfs_block_group *block_group;
3217         u64 length;
3218         int ret;
3219
3220         if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
3221                 btrfs_err(fs_info,
3222                           "relocate: not supported on extent tree v2 yet");
3223                 return -EINVAL;
3224         }
3225
3226         /*
3227          * Prevent races with automatic removal of unused block groups.
3228          * After we relocate and before we remove the chunk with offset
3229          * chunk_offset, automatic removal of the block group can kick in,
3230          * resulting in a failure when calling btrfs_remove_chunk() below.
3231          *
3232          * Make sure to acquire this mutex before doing a tree search (dev
3233          * or chunk trees) to find chunks. Otherwise the cleaner kthread might
3234          * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
3235          * we release the path used to search the chunk/dev tree and before
3236          * the current task acquires this mutex and calls us.
3237          */
3238         lockdep_assert_held(&fs_info->reclaim_bgs_lock);
3239
3240         /* step one, relocate all the extents inside this chunk */
3241         btrfs_scrub_pause(fs_info);
3242         ret = btrfs_relocate_block_group(fs_info, chunk_offset);
3243         btrfs_scrub_continue(fs_info);
3244         if (ret)
3245                 return ret;
3246
3247         block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
3248         if (!block_group)
3249                 return -ENOENT;
3250         btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
3251         length = block_group->length;
3252         btrfs_put_block_group(block_group);
3253
3254         /*
3255          * On a zoned file system, discard the whole block group, this will
3256          * trigger a REQ_OP_ZONE_RESET operation on the device zone. If
3257          * resetting the zone fails, don't treat it as a fatal problem from the
3258          * filesystem's point of view.
3259          */
3260         if (btrfs_is_zoned(fs_info)) {
3261                 ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL);
3262                 if (ret)
3263                         btrfs_info(fs_info,
3264                                 "failed to reset zone %llu after relocation",
3265                                 chunk_offset);
3266         }
3267
3268         trans = btrfs_start_trans_remove_block_group(root->fs_info,
3269                                                      chunk_offset);
3270         if (IS_ERR(trans)) {
3271                 ret = PTR_ERR(trans);
3272                 btrfs_handle_fs_error(root->fs_info, ret, NULL);
3273                 return ret;
3274         }
3275
3276         /*
3277          * step two, delete the device extents and the
3278          * chunk tree entries
3279          */
3280         ret = btrfs_remove_chunk(trans, chunk_offset);
3281         btrfs_end_transaction(trans);
3282         return ret;
3283 }
3284
3285 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
3286 {
3287         struct btrfs_root *chunk_root = fs_info->chunk_root;
3288         struct btrfs_path *path;
3289         struct extent_buffer *leaf;
3290         struct btrfs_chunk *chunk;
3291         struct btrfs_key key;
3292         struct btrfs_key found_key;
3293         u64 chunk_type;
3294         bool retried = false;
3295         int failed = 0;
3296         int ret;
3297
3298         path = btrfs_alloc_path();
3299         if (!path)
3300                 return -ENOMEM;
3301
3302 again:
3303         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3304         key.offset = (u64)-1;
3305         key.type = BTRFS_CHUNK_ITEM_KEY;
3306
3307         while (1) {
3308                 mutex_lock(&fs_info->reclaim_bgs_lock);
3309                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3310                 if (ret < 0) {
3311                         mutex_unlock(&fs_info->reclaim_bgs_lock);
3312                         goto error;
3313                 }
3314                 BUG_ON(ret == 0); /* Corruption */
3315
3316                 ret = btrfs_previous_item(chunk_root, path, key.objectid,
3317                                           key.type);
3318                 if (ret)
3319                         mutex_unlock(&fs_info->reclaim_bgs_lock);
3320                 if (ret < 0)
3321                         goto error;
3322                 if (ret > 0)
3323                         break;
3324
3325                 leaf = path->nodes[0];
3326                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3327
3328                 chunk = btrfs_item_ptr(leaf, path->slots[0],
3329                                        struct btrfs_chunk);
3330                 chunk_type = btrfs_chunk_type(leaf, chunk);
3331                 btrfs_release_path(path);
3332
3333                 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
3334                         ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3335                         if (ret == -ENOSPC)
3336                                 failed++;
3337                         else
3338                                 BUG_ON(ret);
3339                 }
3340                 mutex_unlock(&fs_info->reclaim_bgs_lock);
3341
3342                 if (found_key.offset == 0)
3343                         break;
3344                 key.offset = found_key.offset - 1;
3345         }
3346         ret = 0;
3347         if (failed && !retried) {
3348                 failed = 0;
3349                 retried = true;
3350                 goto again;
3351         } else if (WARN_ON(failed && retried)) {
3352                 ret = -ENOSPC;
3353         }
3354 error:
3355         btrfs_free_path(path);
3356         return ret;
3357 }
3358
3359 /*
3360  * return 1 : allocate a data chunk successfully,
3361  * return <0: errors during allocating a data chunk,
3362  * return 0 : no need to allocate a data chunk.
3363  */
3364 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
3365                                       u64 chunk_offset)
3366 {
3367         struct btrfs_block_group *cache;
3368         u64 bytes_used;
3369         u64 chunk_type;
3370
3371         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3372         ASSERT(cache);
3373         chunk_type = cache->flags;
3374         btrfs_put_block_group(cache);
3375
3376         if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA))
3377                 return 0;
3378
3379         spin_lock(&fs_info->data_sinfo->lock);
3380         bytes_used = fs_info->data_sinfo->bytes_used;
3381         spin_unlock(&fs_info->data_sinfo->lock);
3382
3383         if (!bytes_used) {
3384                 struct btrfs_trans_handle *trans;
3385                 int ret;
3386
3387                 trans = btrfs_join_transaction(fs_info->tree_root);
3388                 if (IS_ERR(trans))
3389                         return PTR_ERR(trans);
3390
3391                 ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
3392                 btrfs_end_transaction(trans);
3393                 if (ret < 0)
3394                         return ret;
3395                 return 1;
3396         }
3397
3398         return 0;
3399 }
3400
3401 static int insert_balance_item(struct btrfs_fs_info *fs_info,
3402                                struct btrfs_balance_control *bctl)
3403 {
3404         struct btrfs_root *root = fs_info->tree_root;
3405         struct btrfs_trans_handle *trans;
3406         struct btrfs_balance_item *item;
3407         struct btrfs_disk_balance_args disk_bargs;
3408         struct btrfs_path *path;
3409         struct extent_buffer *leaf;
3410         struct btrfs_key key;
3411         int ret, err;
3412
3413         path = btrfs_alloc_path();
3414         if (!path)
3415                 return -ENOMEM;
3416
3417         trans = btrfs_start_transaction(root, 0);
3418         if (IS_ERR(trans)) {
3419                 btrfs_free_path(path);
3420                 return PTR_ERR(trans);
3421         }
3422
3423         key.objectid = BTRFS_BALANCE_OBJECTID;
3424         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3425         key.offset = 0;
3426
3427         ret = btrfs_insert_empty_item(trans, root, path, &key,
3428                                       sizeof(*item));
3429         if (ret)
3430                 goto out;
3431
3432         leaf = path->nodes[0];
3433         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3434
3435         memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
3436
3437         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
3438         btrfs_set_balance_data(leaf, item, &disk_bargs);
3439         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
3440         btrfs_set_balance_meta(leaf, item, &disk_bargs);
3441         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
3442         btrfs_set_balance_sys(leaf, item, &disk_bargs);
3443
3444         btrfs_set_balance_flags(leaf, item, bctl->flags);
3445
3446         btrfs_mark_buffer_dirty(leaf);
3447 out:
3448         btrfs_free_path(path);
3449         err = btrfs_commit_transaction(trans);
3450         if (err && !ret)
3451                 ret = err;
3452         return ret;
3453 }
3454
3455 static int del_balance_item(struct btrfs_fs_info *fs_info)
3456 {
3457         struct btrfs_root *root = fs_info->tree_root;
3458         struct btrfs_trans_handle *trans;
3459         struct btrfs_path *path;
3460         struct btrfs_key key;
3461         int ret, err;
3462
3463         path = btrfs_alloc_path();
3464         if (!path)
3465                 return -ENOMEM;
3466
3467         trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
3468         if (IS_ERR(trans)) {
3469                 btrfs_free_path(path);
3470                 return PTR_ERR(trans);
3471         }
3472
3473         key.objectid = BTRFS_BALANCE_OBJECTID;
3474         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3475         key.offset = 0;
3476
3477         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3478         if (ret < 0)
3479                 goto out;
3480         if (ret > 0) {
3481                 ret = -ENOENT;
3482                 goto out;
3483         }
3484
3485         ret = btrfs_del_item(trans, root, path);
3486 out:
3487         btrfs_free_path(path);
3488         err = btrfs_commit_transaction(trans);
3489         if (err && !ret)
3490                 ret = err;
3491         return ret;
3492 }
3493
3494 /*
3495  * This is a heuristic used to reduce the number of chunks balanced on
3496  * resume after balance was interrupted.
3497  */
3498 static void update_balance_args(struct btrfs_balance_control *bctl)
3499 {
3500         /*
3501          * Turn on soft mode for chunk types that were being converted.
3502          */
3503         if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
3504                 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
3505         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
3506                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
3507         if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
3508                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
3509
3510         /*
3511          * Turn on usage filter if is not already used.  The idea is
3512          * that chunks that we have already balanced should be
3513          * reasonably full.  Don't do it for chunks that are being
3514          * converted - that will keep us from relocating unconverted
3515          * (albeit full) chunks.
3516          */
3517         if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3518             !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3519             !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3520                 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
3521                 bctl->data.usage = 90;
3522         }
3523         if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3524             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3525             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3526                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
3527                 bctl->sys.usage = 90;
3528         }
3529         if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3530             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3531             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3532                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
3533                 bctl->meta.usage = 90;
3534         }
3535 }
3536
3537 /*
3538  * Clear the balance status in fs_info and delete the balance item from disk.
3539  */
3540 static void reset_balance_state(struct btrfs_fs_info *fs_info)
3541 {
3542         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3543         int ret;
3544
3545         BUG_ON(!fs_info->balance_ctl);
3546
3547         spin_lock(&fs_info->balance_lock);
3548         fs_info->balance_ctl = NULL;
3549         spin_unlock(&fs_info->balance_lock);
3550
3551         kfree(bctl);
3552         ret = del_balance_item(fs_info);
3553         if (ret)
3554                 btrfs_handle_fs_error(fs_info, ret, NULL);
3555 }
3556
3557 /*
3558  * Balance filters.  Return 1 if chunk should be filtered out
3559  * (should not be balanced).
3560  */
3561 static int chunk_profiles_filter(u64 chunk_type,
3562                                  struct btrfs_balance_args *bargs)
3563 {
3564         chunk_type = chunk_to_extended(chunk_type) &
3565                                 BTRFS_EXTENDED_PROFILE_MASK;
3566
3567         if (bargs->profiles & chunk_type)
3568                 return 0;
3569
3570         return 1;
3571 }
3572
3573 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
3574                               struct btrfs_balance_args *bargs)
3575 {
3576         struct btrfs_block_group *cache;
3577         u64 chunk_used;
3578         u64 user_thresh_min;
3579         u64 user_thresh_max;
3580         int ret = 1;
3581
3582         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3583         chunk_used = cache->used;
3584
3585         if (bargs->usage_min == 0)
3586                 user_thresh_min = 0;
3587         else
3588                 user_thresh_min = div_factor_fine(cache->length,
3589                                                   bargs->usage_min);
3590
3591         if (bargs->usage_max == 0)
3592                 user_thresh_max = 1;
3593         else if (bargs->usage_max > 100)
3594                 user_thresh_max = cache->length;
3595         else
3596                 user_thresh_max = div_factor_fine(cache->length,
3597                                                   bargs->usage_max);
3598
3599         if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
3600                 ret = 0;
3601
3602         btrfs_put_block_group(cache);
3603         return ret;
3604 }
3605
3606 static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
3607                 u64 chunk_offset, struct btrfs_balance_args *bargs)
3608 {
3609         struct btrfs_block_group *cache;
3610         u64 chunk_used, user_thresh;
3611         int ret = 1;
3612
3613         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3614         chunk_used = cache->used;
3615
3616         if (bargs->usage_min == 0)
3617                 user_thresh = 1;
3618         else if (bargs->usage > 100)
3619                 user_thresh = cache->length;
3620         else
3621                 user_thresh = div_factor_fine(cache->length, bargs->usage);
3622
3623         if (chunk_used < user_thresh)
3624                 ret = 0;
3625
3626         btrfs_put_block_group(cache);
3627         return ret;
3628 }
3629
3630 static int chunk_devid_filter(struct extent_buffer *leaf,
3631                               struct btrfs_chunk *chunk,
3632                               struct btrfs_balance_args *bargs)
3633 {
3634         struct btrfs_stripe *stripe;
3635         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3636         int i;
3637
3638         for (i = 0; i < num_stripes; i++) {
3639                 stripe = btrfs_stripe_nr(chunk, i);
3640                 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
3641                         return 0;
3642         }
3643
3644         return 1;
3645 }
3646
3647 static u64 calc_data_stripes(u64 type, int num_stripes)
3648 {
3649         const int index = btrfs_bg_flags_to_raid_index(type);
3650         const int ncopies = btrfs_raid_array[index].ncopies;
3651         const int nparity = btrfs_raid_array[index].nparity;
3652
3653         return (num_stripes - nparity) / ncopies;
3654 }
3655
3656 /* [pstart, pend) */
3657 static int chunk_drange_filter(struct extent_buffer *leaf,
3658                                struct btrfs_chunk *chunk,
3659                                struct btrfs_balance_args *bargs)
3660 {
3661         struct btrfs_stripe *stripe;
3662         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3663         u64 stripe_offset;
3664         u64 stripe_length;
3665         u64 type;
3666         int factor;
3667         int i;
3668
3669         if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
3670                 return 0;
3671
3672         type = btrfs_chunk_type(leaf, chunk);
3673         factor = calc_data_stripes(type, num_stripes);
3674
3675         for (i = 0; i < num_stripes; i++) {
3676                 stripe = btrfs_stripe_nr(chunk, i);
3677                 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
3678                         continue;
3679
3680                 stripe_offset = btrfs_stripe_offset(leaf, stripe);
3681                 stripe_length = btrfs_chunk_length(leaf, chunk);
3682                 stripe_length = div_u64(stripe_length, factor);
3683
3684                 if (stripe_offset < bargs->pend &&
3685                     stripe_offset + stripe_length > bargs->pstart)
3686                         return 0;
3687         }
3688
3689         return 1;
3690 }
3691
3692 /* [vstart, vend) */
3693 static int chunk_vrange_filter(struct extent_buffer *leaf,
3694                                struct btrfs_chunk *chunk,
3695                                u64 chunk_offset,
3696                                struct btrfs_balance_args *bargs)
3697 {
3698         if (chunk_offset < bargs->vend &&
3699             chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
3700                 /* at least part of the chunk is inside this vrange */
3701                 return 0;
3702
3703         return 1;
3704 }
3705
3706 static int chunk_stripes_range_filter(struct extent_buffer *leaf,
3707                                struct btrfs_chunk *chunk,
3708                                struct btrfs_balance_args *bargs)
3709 {
3710         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3711
3712         if (bargs->stripes_min <= num_stripes
3713                         && num_stripes <= bargs->stripes_max)
3714                 return 0;
3715
3716         return 1;
3717 }
3718
3719 static int chunk_soft_convert_filter(u64 chunk_type,
3720                                      struct btrfs_balance_args *bargs)
3721 {
3722         if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3723                 return 0;
3724
3725         chunk_type = chunk_to_extended(chunk_type) &
3726                                 BTRFS_EXTENDED_PROFILE_MASK;
3727
3728         if (bargs->target == chunk_type)
3729                 return 1;
3730
3731         return 0;
3732 }
3733
3734 static int should_balance_chunk(struct extent_buffer *leaf,
3735                                 struct btrfs_chunk *chunk, u64 chunk_offset)
3736 {
3737         struct btrfs_fs_info *fs_info = leaf->fs_info;
3738         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3739         struct btrfs_balance_args *bargs = NULL;
3740         u64 chunk_type = btrfs_chunk_type(leaf, chunk);
3741
3742         /* type filter */
3743         if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
3744               (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
3745                 return 0;
3746         }
3747
3748         if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3749                 bargs = &bctl->data;
3750         else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3751                 bargs = &bctl->sys;
3752         else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3753                 bargs = &bctl->meta;
3754
3755         /* profiles filter */
3756         if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
3757             chunk_profiles_filter(chunk_type, bargs)) {
3758                 return 0;
3759         }
3760
3761         /* usage filter */
3762         if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
3763             chunk_usage_filter(fs_info, chunk_offset, bargs)) {
3764                 return 0;
3765         } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3766             chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
3767                 return 0;
3768         }
3769
3770         /* devid filter */
3771         if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
3772             chunk_devid_filter(leaf, chunk, bargs)) {
3773                 return 0;
3774         }
3775
3776         /* drange filter, makes sense only with devid filter */
3777         if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
3778             chunk_drange_filter(leaf, chunk, bargs)) {
3779                 return 0;
3780         }
3781
3782         /* vrange filter */
3783         if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
3784             chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
3785                 return 0;
3786         }
3787
3788         /* stripes filter */
3789         if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
3790             chunk_stripes_range_filter(leaf, chunk, bargs)) {
3791                 return 0;
3792         }
3793
3794         /* soft profile changing mode */
3795         if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
3796             chunk_soft_convert_filter(chunk_type, bargs)) {
3797                 return 0;
3798         }
3799
3800         /*
3801          * limited by count, must be the last filter
3802          */
3803         if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
3804                 if (bargs->limit == 0)
3805                         return 0;
3806                 else
3807                         bargs->limit--;
3808         } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
3809                 /*
3810                  * Same logic as the 'limit' filter; the minimum cannot be
3811                  * determined here because we do not have the global information
3812                  * about the count of all chunks that satisfy the filters.
3813                  */
3814                 if (bargs->limit_max == 0)
3815                         return 0;
3816                 else
3817                         bargs->limit_max--;
3818         }
3819
3820         return 1;
3821 }
3822
3823 static int __btrfs_balance(struct btrfs_fs_info *fs_info)
3824 {
3825         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3826         struct btrfs_root *chunk_root = fs_info->chunk_root;
3827         u64 chunk_type;
3828         struct btrfs_chunk *chunk;
3829         struct btrfs_path *path = NULL;
3830         struct btrfs_key key;
3831         struct btrfs_key found_key;
3832         struct extent_buffer *leaf;
3833         int slot;
3834         int ret;
3835         int enospc_errors = 0;
3836         bool counting = true;
3837         /* The single value limit and min/max limits share bytes in the balance args */
3838         u64 limit_data = bctl->data.limit;
3839         u64 limit_meta = bctl->meta.limit;
3840         u64 limit_sys = bctl->sys.limit;
3841         u32 count_data = 0;
3842         u32 count_meta = 0;
3843         u32 count_sys = 0;
3844         int chunk_reserved = 0;
3845
3846         path = btrfs_alloc_path();
3847         if (!path) {
3848                 ret = -ENOMEM;
3849                 goto error;
3850         }
3851
3852         /* zero out stat counters */
3853         spin_lock(&fs_info->balance_lock);
3854         memset(&bctl->stat, 0, sizeof(bctl->stat));
3855         spin_unlock(&fs_info->balance_lock);
3856 again:
3857         if (!counting) {
3858                 /*
3859                  * The single value limit and min/max limits use the same bytes
3860                  * in the balance args, so restore the values saved on entry.
3861                  */
3862                 bctl->data.limit = limit_data;
3863                 bctl->meta.limit = limit_meta;
3864                 bctl->sys.limit = limit_sys;
3865         }
3866         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3867         key.offset = (u64)-1;
3868         key.type = BTRFS_CHUNK_ITEM_KEY;
3869
3870         while (1) {
3871                 if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
3872                     atomic_read(&fs_info->balance_cancel_req)) {
3873                         ret = -ECANCELED;
3874                         goto error;
3875                 }
3876
3877                 mutex_lock(&fs_info->reclaim_bgs_lock);
3878                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3879                 if (ret < 0) {
3880                         mutex_unlock(&fs_info->reclaim_bgs_lock);
3881                         goto error;
3882                 }
3883
3884                 /*
3885                  * this shouldn't happen, it means the last relocate
3886                  * failed
3887                  */
3888                 if (ret == 0)
3889                         BUG(); /* FIXME break ? */
3890
3891                 ret = btrfs_previous_item(chunk_root, path, 0,
3892                                           BTRFS_CHUNK_ITEM_KEY);
3893                 if (ret) {
3894                         mutex_unlock(&fs_info->reclaim_bgs_lock);
3895                         ret = 0;
3896                         break;
3897                 }
3898
3899                 leaf = path->nodes[0];
3900                 slot = path->slots[0];
3901                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3902
3903                 if (found_key.objectid != key.objectid) {
3904                         mutex_unlock(&fs_info->reclaim_bgs_lock);
3905                         break;
3906                 }
3907
3908                 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3909                 chunk_type = btrfs_chunk_type(leaf, chunk);
3910
3911                 if (!counting) {
3912                         spin_lock(&fs_info->balance_lock);
3913                         bctl->stat.considered++;
3914                         spin_unlock(&fs_info->balance_lock);
3915                 }
3916
3917                 ret = should_balance_chunk(leaf, chunk, found_key.offset);
3918
3919                 btrfs_release_path(path);
3920                 if (!ret) {
3921                         mutex_unlock(&fs_info->reclaim_bgs_lock);
3922                         goto loop;
3923                 }
3924
3925                 if (counting) {
3926                         mutex_unlock(&fs_info->reclaim_bgs_lock);
3927                         spin_lock(&fs_info->balance_lock);
3928                         bctl->stat.expected++;
3929                         spin_unlock(&fs_info->balance_lock);
3930
3931                         if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3932                                 count_data++;
3933                         else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3934                                 count_sys++;
3935                         else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3936                                 count_meta++;
3937
3938                         goto loop;
3939                 }
3940
3941                 /*
3942                  * Apply limit_min filter, no need to check if the LIMITS
3943                  * filter is used, limit_min is 0 by default
3944                  */
3945                 if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
3946                                         count_data < bctl->data.limit_min)
3947                                 || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
3948                                         count_meta < bctl->meta.limit_min)
3949                                 || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
3950                                         count_sys < bctl->sys.limit_min)) {
3951                         mutex_unlock(&fs_info->reclaim_bgs_lock);
3952                         goto loop;
3953                 }
3954
3955                 if (!chunk_reserved) {
3956                         /*
3957                          * We may be relocating the only data chunk we have,
3958                          * which could potentially end up with losing data's
3959                          * raid profile, so lets allocate an empty one in
3960                          * advance.
3961                          */
3962                         ret = btrfs_may_alloc_data_chunk(fs_info,
3963                                                          found_key.offset);
3964                         if (ret < 0) {
3965                                 mutex_unlock(&fs_info->reclaim_bgs_lock);
3966                                 goto error;
3967                         } else if (ret == 1) {
3968                                 chunk_reserved = 1;
3969                         }
3970                 }
3971
3972                 ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3973                 mutex_unlock(&fs_info->reclaim_bgs_lock);
3974                 if (ret == -ENOSPC) {
3975                         enospc_errors++;
3976                 } else if (ret == -ETXTBSY) {
3977                         btrfs_info(fs_info,
3978            "skipping relocation of block group %llu due to active swapfile",
3979                                    found_key.offset);
3980                         ret = 0;
3981                 } else if (ret) {
3982                         goto error;
3983                 } else {
3984                         spin_lock(&fs_info->balance_lock);
3985                         bctl->stat.completed++;
3986                         spin_unlock(&fs_info->balance_lock);
3987                 }
3988 loop:
3989                 if (found_key.offset == 0)
3990                         break;
3991                 key.offset = found_key.offset - 1;
3992         }
3993
3994         if (counting) {
3995                 btrfs_release_path(path);
3996                 counting = false;
3997                 goto again;
3998         }
3999 error:
4000         btrfs_free_path(path);
4001         if (enospc_errors) {
4002                 btrfs_info(fs_info, "%d enospc errors during balance",
4003                            enospc_errors);
4004                 if (!ret)
4005                         ret = -ENOSPC;
4006         }
4007
4008         return ret;
4009 }
4010
4011 /**
4012  * alloc_profile_is_valid - see if a given profile is valid and reduced
4013  * @flags: profile to validate
4014  * @extended: if true @flags is treated as an extended profile
4015  */
4016 static int alloc_profile_is_valid(u64 flags, int extended)
4017 {
4018         u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
4019                                BTRFS_BLOCK_GROUP_PROFILE_MASK);
4020
4021         flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
4022
4023         /* 1) check that all other bits are zeroed */
4024         if (flags & ~mask)
4025                 return 0;
4026
4027         /* 2) see if profile is reduced */
4028         if (flags == 0)
4029                 return !extended; /* "0" is valid for usual profiles */
4030
4031         return has_single_bit_set(flags);
4032 }
4033
4034 static inline int balance_need_close(struct btrfs_fs_info *fs_info)
4035 {
4036         /* cancel requested || normal exit path */
4037         return atomic_read(&fs_info->balance_cancel_req) ||
4038                 (atomic_read(&fs_info->balance_pause_req) == 0 &&
4039                  atomic_read(&fs_info->balance_cancel_req) == 0);
4040 }
4041
4042 /*
4043  * Validate target profile against allowed profiles and return true if it's OK.
4044  * Otherwise print the error message and return false.
4045  */
4046 static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
4047                 const struct btrfs_balance_args *bargs,
4048                 u64 allowed, const char *type)
4049 {
4050         if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
4051                 return true;
4052
4053         /* Profile is valid and does not have bits outside of the allowed set */
4054         if (alloc_profile_is_valid(bargs->target, 1) &&
4055             (bargs->target & ~allowed) == 0)
4056                 return true;
4057
4058         btrfs_err(fs_info, "balance: invalid convert %s profile %s",
4059                         type, btrfs_bg_type_to_raid_name(bargs->target));
4060         return false;
4061 }
4062
4063 /*
4064  * Fill @buf with textual description of balance filter flags @bargs, up to
4065  * @size_buf including the terminating null. The output may be trimmed if it
4066  * does not fit into the provided buffer.
4067  */
4068 static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
4069                                  u32 size_buf)
4070 {
4071         int ret;
4072         u32 size_bp = size_buf;
4073         char *bp = buf;
4074         u64 flags = bargs->flags;
4075         char tmp_buf[128] = {'\0'};
4076
4077         if (!flags)
4078                 return;
4079
4080 #define CHECK_APPEND_NOARG(a)                                           \
4081         do {                                                            \
4082                 ret = snprintf(bp, size_bp, (a));                       \
4083                 if (ret < 0 || ret >= size_bp)                          \
4084                         goto out_overflow;                              \
4085                 size_bp -= ret;                                         \
4086                 bp += ret;                                              \
4087         } while (0)
4088
4089 #define CHECK_APPEND_1ARG(a, v1)                                        \
4090         do {                                                            \
4091                 ret = snprintf(bp, size_bp, (a), (v1));                 \
4092                 if (ret < 0 || ret >= size_bp)                          \
4093                         goto out_overflow;                              \
4094                 size_bp -= ret;                                         \
4095                 bp += ret;                                              \
4096         } while (0)
4097
4098 #define CHECK_APPEND_2ARG(a, v1, v2)                                    \
4099         do {                                                            \
4100                 ret = snprintf(bp, size_bp, (a), (v1), (v2));           \
4101                 if (ret < 0 || ret >= size_bp)                          \
4102                         goto out_overflow;                              \
4103                 size_bp -= ret;                                         \
4104                 bp += ret;                                              \
4105         } while (0)
4106
4107         if (flags & BTRFS_BALANCE_ARGS_CONVERT)
4108                 CHECK_APPEND_1ARG("convert=%s,",
4109                                   btrfs_bg_type_to_raid_name(bargs->target));
4110
4111         if (flags & BTRFS_BALANCE_ARGS_SOFT)
4112                 CHECK_APPEND_NOARG("soft,");
4113
4114         if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
4115                 btrfs_describe_block_groups(bargs->profiles, tmp_buf,
4116                                             sizeof(tmp_buf));
4117                 CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
4118         }
4119
4120         if (flags & BTRFS_BALANCE_ARGS_USAGE)
4121                 CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);
4122
4123         if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
4124                 CHECK_APPEND_2ARG("usage=%u..%u,",
4125                                   bargs->usage_min, bargs->usage_max);
4126
4127         if (flags & BTRFS_BALANCE_ARGS_DEVID)
4128                 CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);
4129
4130         if (flags & BTRFS_BALANCE_ARGS_DRANGE)
4131                 CHECK_APPEND_2ARG("drange=%llu..%llu,",
4132                                   bargs->pstart, bargs->pend);
4133
4134         if (flags & BTRFS_BALANCE_ARGS_VRANGE)
4135                 CHECK_APPEND_2ARG("vrange=%llu..%llu,",
4136                                   bargs->vstart, bargs->vend);
4137
4138         if (flags & BTRFS_BALANCE_ARGS_LIMIT)
4139                 CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);
4140
4141         if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
4142                 CHECK_APPEND_2ARG("limit=%u..%u,",
4143                                 bargs->limit_min, bargs->limit_max);
4144
4145         if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
4146                 CHECK_APPEND_2ARG("stripes=%u..%u,",
4147                                   bargs->stripes_min, bargs->stripes_max);
4148
4149 #undef CHECK_APPEND_2ARG
4150 #undef CHECK_APPEND_1ARG
4151 #undef CHECK_APPEND_NOARG
4152
4153 out_overflow:
4154
4155         if (size_bp < size_buf)
4156                 buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
4157         else
4158                 buf[0] = '\0';
4159 }
4160
4161 static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
4162 {
4163         u32 size_buf = 1024;
4164         char tmp_buf[192] = {'\0'};
4165         char *buf;
4166         char *bp;
4167         u32 size_bp = size_buf;
4168         int ret;
4169         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
4170
4171         buf = kzalloc(size_buf, GFP_KERNEL);
4172         if (!buf)
4173                 return;
4174
4175         bp = buf;
4176
4177 #define CHECK_APPEND_1ARG(a, v1)                                        \
4178         do {                                                            \
4179                 ret = snprintf(bp, size_bp, (a), (v1));                 \
4180                 if (ret < 0 || ret >= size_bp)                          \
4181                         goto out_overflow;                              \
4182                 size_bp -= ret;                                         \
4183                 bp += ret;                                              \
4184         } while (0)
4185
4186         if (bctl->flags & BTRFS_BALANCE_FORCE)
4187                 CHECK_APPEND_1ARG("%s", "-f ");
4188
4189         if (bctl->flags & BTRFS_BALANCE_DATA) {
4190                 describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
4191                 CHECK_APPEND_1ARG("-d%s ", tmp_buf);
4192         }
4193
4194         if (bctl->flags & BTRFS_BALANCE_METADATA) {
4195                 describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
4196                 CHECK_APPEND_1ARG("-m%s ", tmp_buf);
4197         }
4198
4199         if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
4200                 describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
4201                 CHECK_APPEND_1ARG("-s%s ", tmp_buf);
4202         }
4203
4204 #undef CHECK_APPEND_1ARG
4205
4206 out_overflow:
4207
4208         if (size_bp < size_buf)
4209                 buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
4210         btrfs_info(fs_info, "balance: %s %s",
4211                    (bctl->flags & BTRFS_BALANCE_RESUME) ?
4212                    "resume" : "start", buf);
4213
4214         kfree(buf);
4215 }
4216
/*
 * Validate the balance request in @bctl, record it and run the balance.
 *
 * @fs_info: filesystem the balance runs on
 * @bctl:    balance control describing what to balance; BTRFS_BALANCE_RESUME
 *           in @bctl->flags selects the resume path
 * @bargs:   optional (may be NULL); when set it is zeroed and refilled via
 *           btrfs_update_ioctl_balance_args() after the balance stopped
 *
 * Should be called with balance mutex held.  The mutex is dropped while
 * __btrfs_balance() runs and is held again when this function returns.
 *
 * If the request is rejected before the balance is started (label "out"),
 * @bctl is freed for a new balance, or the balance state is reset for a
 * resumed one, and the exclusive operation is finished.
 *
 * Returns the status of __btrfs_balance(), or a negative errno if the request
 * was rejected or the balance item could not be inserted.
 */
int btrfs_balance(struct btrfs_fs_info *fs_info,
                  struct btrfs_balance_control *bctl,
                  struct btrfs_ioctl_balance_args *bargs)
{
        u64 meta_target, data_target;
        u64 allowed;
        int mixed = 0;
        int ret;
        u64 num_devices;
        unsigned seq;
        bool reducing_redundancy;
        int i;

        /* Do not start if the fs is closing or a pause/cancel is pending. */
        if (btrfs_fs_closing(fs_info) ||
            atomic_read(&fs_info->balance_pause_req) ||
            btrfs_should_cancel_balance(fs_info)) {
                ret = -EINVAL;
                goto out;
        }

        /* 'allowed' is reused as a scratch variable several times below. */
        allowed = btrfs_super_incompat_flags(fs_info->super_copy);
        if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
                mixed = 1;

        /*
         * In case of mixed groups both data and meta should be picked,
         * and identical options should be given for both of them.
         */
        allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
        if (mixed && (bctl->flags & allowed)) {
                if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
                    !(bctl->flags & BTRFS_BALANCE_METADATA) ||
                    memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
                        btrfs_err(fs_info,
          "balance: mixed groups data and metadata options must be the same");
                        ret = -EINVAL;
                        goto out;
                }
        }

        /*
         * rw_devices will not change at the moment, device add/delete/replace
         * are exclusive
         */
        num_devices = fs_info->fs_devices->rw_devices;

        /*
         * SINGLE profile on-disk has no profile bit, but in-memory we have a
         * special bit for it, to make it easier to distinguish.  Thus we need
         * to set it manually, or balance would refuse the profile.
         */
        allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
        /* Add every profile whose minimum device count is satisfied. */
        for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
                if (num_devices >= btrfs_raid_array[i].devs_min)
                        allowed |= btrfs_raid_array[i].bg_flag;

        if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") ||
            !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") ||
            !validate_convert_profile(fs_info, &bctl->sys,  allowed, "system")) {
                ret = -EINVAL;
                goto out;
        }

        /*
         * Allow to reduce metadata or system integrity only if force set for
         * profiles with redundancy (copies, parity)
         */
        allowed = 0;
        for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
                if (btrfs_raid_array[i].ncopies >= 2 ||
                    btrfs_raid_array[i].tolerated_failures >= 1)
                        allowed |= btrfs_raid_array[i].bg_flag;
        }
        /* Sample the avail_*_alloc_bits under profiles_lock, retry on change. */
        do {
                seq = read_seqbegin(&fs_info->profiles_lock);

                /*
                 * Redundancy is reduced when a system or metadata conversion
                 * starts from a redundant profile and targets one that is not.
                 */
                if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
                     (fs_info->avail_system_alloc_bits & allowed) &&
                     !(bctl->sys.target & allowed)) ||
                    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
                     (fs_info->avail_metadata_alloc_bits & allowed) &&
                     !(bctl->meta.target & allowed)))
                        reducing_redundancy = true;
                else
                        reducing_redundancy = false;

                /* if we're not converting, the target field is uninitialized */
                meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
                        bctl->meta.target : fs_info->avail_metadata_alloc_bits;
                data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
                        bctl->data.target : fs_info->avail_data_alloc_bits;
        } while (read_seqretry(&fs_info->profiles_lock, seq));

        if (reducing_redundancy) {
                if (bctl->flags & BTRFS_BALANCE_FORCE) {
                        btrfs_info(fs_info,
                           "balance: force reducing metadata redundancy");
                } else {
                        btrfs_err(fs_info,
        "balance: reduces metadata redundancy, use --force if you want this");
                        ret = -EINVAL;
                        goto out;
                }
        }

        /* Only a warning: metadata tolerating fewer failures than data. */
        if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
                btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
                btrfs_warn(fs_info,
        "balance: metadata profile %s has lower redundancy than data profile %s",
                                btrfs_bg_type_to_raid_name(meta_target),
                                btrfs_bg_type_to_raid_name(data_target));
        }

        /* -EEXIST is not fatal here, it is checked against the resume flag. */
        ret = insert_balance_item(fs_info, bctl);
        if (ret && ret != -EEXIST)
                goto out;

        if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
                /* New balance: no item and no balance_ctl may exist yet. */
                BUG_ON(ret == -EEXIST);
                BUG_ON(fs_info->balance_ctl);
                spin_lock(&fs_info->balance_lock);
                fs_info->balance_ctl = bctl;
                spin_unlock(&fs_info->balance_lock);
        } else {
                /* Resume: the item must already be there. */
                BUG_ON(ret != -EEXIST);
                spin_lock(&fs_info->balance_lock);
                update_balance_args(bctl);
                spin_unlock(&fs_info->balance_lock);
        }

        ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
        set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
        describe_balance_start_or_resume(fs_info);
        /* The mutex is not held while the balance itself is running. */
        mutex_unlock(&fs_info->balance_mutex);

        ret = __btrfs_balance(fs_info);

        mutex_lock(&fs_info->balance_mutex);
        if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) {
                btrfs_info(fs_info, "balance: paused");
                btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED);
        }
        /*
         * Balance can be canceled by:
         *
         * - Regular cancel request
         *   Then ret == -ECANCELED and balance_cancel_req > 0
         *
         * - Fatal signal to "btrfs" process
         *   Either the signal caught by wait_reserve_ticket() and callers
         *   got -EINTR, or caught by btrfs_should_cancel_balance() and
         *   got -ECANCELED.
         *   Either way, in this case balance_cancel_req = 0, and
         *   ret == -EINTR or ret == -ECANCELED.
         *
         * So here we only check the return value to catch canceled balance.
         */
        else if (ret == -ECANCELED || ret == -EINTR)
                btrfs_info(fs_info, "balance: canceled");
        else
                btrfs_info(fs_info, "balance: ended with status: %d", ret);

        clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);

        if (bargs) {
                memset(bargs, 0, sizeof(*bargs));
                btrfs_update_ioctl_balance_args(fs_info, bargs);
        }

        /*
         * Reset the balance state on errors other than -ECANCELED/-ENOSPC, or
         * when balance_need_close() reports a cancel or a normal exit.
         */
        if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
            balance_need_close(fs_info)) {
                reset_balance_state(fs_info);
                btrfs_exclop_finish(fs_info);
        }

        /* Let btrfs_pause_balance()/btrfs_cancel_balance() waiters proceed. */
        wake_up(&fs_info->balance_wait_q);

        return ret;
out:
        /* Rejected before start: drop @bctl or the resumed balance state. */
        if (bctl->flags & BTRFS_BALANCE_RESUME)
                reset_balance_state(fs_info);
        else
                kfree(bctl);
        btrfs_exclop_finish(fs_info);

        return ret;
}
4407
4408 static int balance_kthread(void *data)
4409 {
4410         struct btrfs_fs_info *fs_info = data;
4411         int ret = 0;
4412
4413         sb_start_write(fs_info->sb);
4414         mutex_lock(&fs_info->balance_mutex);
4415         if (fs_info->balance_ctl)
4416                 ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
4417         mutex_unlock(&fs_info->balance_mutex);
4418         sb_end_write(fs_info->sb);
4419
4420         return ret;
4421 }
4422
4423 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
4424 {
4425         struct task_struct *tsk;
4426
4427         mutex_lock(&fs_info->balance_mutex);
4428         if (!fs_info->balance_ctl) {
4429                 mutex_unlock(&fs_info->balance_mutex);
4430                 return 0;
4431         }
4432         mutex_unlock(&fs_info->balance_mutex);
4433
4434         if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
4435                 btrfs_info(fs_info, "balance: resume skipped");
4436                 return 0;
4437         }
4438
4439         spin_lock(&fs_info->super_lock);
4440         ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
4441         fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE;
4442         spin_unlock(&fs_info->super_lock);
4443         /*
4444          * A ro->rw remount sequence should continue with the paused balance
4445          * regardless of who pauses it, system or the user as of now, so set
4446          * the resume flag.
4447          */
4448         spin_lock(&fs_info->balance_lock);
4449         fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
4450         spin_unlock(&fs_info->balance_lock);
4451
4452         tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
4453         return PTR_ERR_OR_ZERO(tsk);
4454 }
4455
4456 int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
4457 {
4458         struct btrfs_balance_control *bctl;
4459         struct btrfs_balance_item *item;
4460         struct btrfs_disk_balance_args disk_bargs;
4461         struct btrfs_path *path;
4462         struct extent_buffer *leaf;
4463         struct btrfs_key key;
4464         int ret;
4465
4466         path = btrfs_alloc_path();
4467         if (!path)
4468                 return -ENOMEM;
4469
4470         key.objectid = BTRFS_BALANCE_OBJECTID;
4471         key.type = BTRFS_TEMPORARY_ITEM_KEY;
4472         key.offset = 0;
4473
4474         ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
4475         if (ret < 0)
4476                 goto out;
4477         if (ret > 0) { /* ret = -ENOENT; */
4478                 ret = 0;
4479                 goto out;
4480         }
4481
4482         bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
4483         if (!bctl) {
4484                 ret = -ENOMEM;
4485                 goto out;
4486         }
4487
4488         leaf = path->nodes[0];
4489         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
4490
4491         bctl->flags = btrfs_balance_flags(leaf, item);
4492         bctl->flags |= BTRFS_BALANCE_RESUME;
4493
4494         btrfs_balance_data(leaf, item, &disk_bargs);
4495         btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
4496         btrfs_balance_meta(leaf, item, &disk_bargs);
4497         btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
4498         btrfs_balance_sys(leaf, item, &disk_bargs);
4499         btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
4500
4501         /*
4502          * This should never happen, as the paused balance state is recovered
4503          * during mount without any chance of other exclusive ops to collide.
4504          *
4505          * This gives the exclusive op status to balance and keeps in paused
4506          * state until user intervention (cancel or umount). If the ownership
4507          * cannot be assigned, show a message but do not fail. The balance
4508          * is in a paused state and must have fs_info::balance_ctl properly
4509          * set up.
4510          */
4511         if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED))
4512                 btrfs_warn(fs_info,
4513         "balance: cannot set exclusive op status, resume manually");
4514
4515         btrfs_release_path(path);
4516
4517         mutex_lock(&fs_info->balance_mutex);
4518         BUG_ON(fs_info->balance_ctl);
4519         spin_lock(&fs_info->balance_lock);
4520         fs_info->balance_ctl = bctl;
4521         spin_unlock(&fs_info->balance_lock);
4522         mutex_unlock(&fs_info->balance_mutex);
4523 out:
4524         btrfs_free_path(path);
4525         return ret;
4526 }
4527
4528 int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
4529 {
4530         int ret = 0;
4531
4532         mutex_lock(&fs_info->balance_mutex);
4533         if (!fs_info->balance_ctl) {
4534                 mutex_unlock(&fs_info->balance_mutex);
4535                 return -ENOTCONN;
4536         }
4537
4538         if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4539                 atomic_inc(&fs_info->balance_pause_req);
4540                 mutex_unlock(&fs_info->balance_mutex);
4541
4542                 wait_event(fs_info->balance_wait_q,
4543                            !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4544
4545                 mutex_lock(&fs_info->balance_mutex);
4546                 /* we are good with balance_ctl ripped off from under us */
4547                 BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4548                 atomic_dec(&fs_info->balance_pause_req);
4549         } else {
4550                 ret = -ENOTCONN;
4551         }
4552
4553         mutex_unlock(&fs_info->balance_mutex);
4554         return ret;
4555 }
4556
4557 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
4558 {
4559         mutex_lock(&fs_info->balance_mutex);
4560         if (!fs_info->balance_ctl) {
4561                 mutex_unlock(&fs_info->balance_mutex);
4562                 return -ENOTCONN;
4563         }
4564
4565         /*
4566          * A paused balance with the item stored on disk can be resumed at
4567          * mount time if the mount is read-write. Otherwise it's still paused
4568          * and we must not allow cancelling as it deletes the item.
4569          */
4570         if (sb_rdonly(fs_info->sb)) {
4571                 mutex_unlock(&fs_info->balance_mutex);
4572                 return -EROFS;
4573         }
4574
4575         atomic_inc(&fs_info->balance_cancel_req);
4576         /*
4577          * if we are running just wait and return, balance item is
4578          * deleted in btrfs_balance in this case
4579          */
4580         if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4581                 mutex_unlock(&fs_info->balance_mutex);
4582                 wait_event(fs_info->balance_wait_q,
4583                            !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4584                 mutex_lock(&fs_info->balance_mutex);
4585         } else {
4586                 mutex_unlock(&fs_info->balance_mutex);
4587                 /*
4588                  * Lock released to allow other waiters to continue, we'll
4589                  * reexamine the status again.
4590                  */
4591                 mutex_lock(&fs_info->balance_mutex);
4592
4593                 if (fs_info->balance_ctl) {
4594                         reset_balance_state(fs_info);
4595                         btrfs_exclop_finish(fs_info);
4596                         btrfs_info(fs_info, "balance: canceled");
4597                 }
4598         }
4599
4600         BUG_ON(fs_info->balance_ctl ||
4601                 test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4602         atomic_dec(&fs_info->balance_cancel_req);
4603         mutex_unlock(&fs_info->balance_mutex);
4604         return 0;
4605 }
4606
4607 int btrfs_uuid_scan_kthread(void *data)
4608 {
4609         struct btrfs_fs_info *fs_info = data;
4610         struct btrfs_root *root = fs_info->tree_root;
4611         struct btrfs_key key;
4612         struct btrfs_path *path = NULL;
4613         int ret = 0;
4614         struct extent_buffer *eb;
4615         int slot;
4616         struct btrfs_root_item root_item;
4617         u32 item_size;
4618         struct btrfs_trans_handle *trans = NULL;
4619         bool closing = false;
4620
4621         path = btrfs_alloc_path();
4622         if (!path) {
4623                 ret = -ENOMEM;
4624                 goto out;
4625         }
4626
4627         key.objectid = 0;
4628         key.type = BTRFS_ROOT_ITEM_KEY;
4629         key.offset = 0;
4630
4631         while (1) {
4632                 if (btrfs_fs_closing(fs_info)) {
4633                         closing = true;
4634                         break;
4635                 }
4636                 ret = btrfs_search_forward(root, &key, path,
4637                                 BTRFS_OLDEST_GENERATION);
4638                 if (ret) {
4639                         if (ret > 0)
4640                                 ret = 0;
4641                         break;
4642                 }
4643
4644                 if (key.type != BTRFS_ROOT_ITEM_KEY ||
4645                     (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
4646                      key.objectid != BTRFS_FS_TREE_OBJECTID) ||
4647                     key.objectid > BTRFS_LAST_FREE_OBJECTID)
4648                         goto skip;
4649
4650                 eb = path->nodes[0];
4651                 slot = path->slots[0];
4652                 item_size = btrfs_item_size(eb, slot);
4653                 if (item_size < sizeof(root_item))
4654                         goto skip;
4655
4656                 read_extent_buffer(eb, &root_item,
4657                                    btrfs_item_ptr_offset(eb, slot),
4658                                    (int)sizeof(root_item));
4659                 if (btrfs_root_refs(&root_item) == 0)
4660                         goto skip;
4661
4662                 if (!btrfs_is_empty_uuid(root_item.uuid) ||
4663                     !btrfs_is_empty_uuid(root_item.received_uuid)) {
4664                         if (trans)
4665                                 goto update_tree;
4666
4667                         btrfs_release_path(path);
4668                         /*
4669                          * 1 - subvol uuid item
4670                          * 1 - received_subvol uuid item
4671                          */
4672                         trans = btrfs_start_transaction(fs_info->uuid_root, 2);
4673                         if (IS_ERR(trans)) {
4674                                 ret = PTR_ERR(trans);
4675                                 break;
4676                         }
4677                         continue;
4678                 } else {
4679                         goto skip;
4680                 }
4681 update_tree:
4682                 btrfs_release_path(path);
4683                 if (!btrfs_is_empty_uuid(root_item.uuid)) {
4684                         ret = btrfs_uuid_tree_add(trans, root_item.uuid,
4685                                                   BTRFS_UUID_KEY_SUBVOL,
4686                                                   key.objectid);
4687                         if (ret < 0) {
4688                                 btrfs_warn(fs_info, "uuid_tree_add failed %d",
4689                                         ret);
4690                                 break;
4691                         }
4692                 }
4693
4694                 if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
4695                         ret = btrfs_uuid_tree_add(trans,
4696                                                   root_item.received_uuid,
4697                                                  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4698                                                   key.objectid);
4699                         if (ret < 0) {
4700                                 btrfs_warn(fs_info, "uuid_tree_add failed %d",
4701                                         ret);
4702                                 break;
4703                         }
4704                 }
4705
4706 skip:
4707                 btrfs_release_path(path);
4708                 if (trans) {
4709                         ret = btrfs_end_transaction(trans);
4710                         trans = NULL;
4711                         if (ret)
4712                                 break;
4713                 }
4714
4715                 if (key.offset < (u64)-1) {
4716                         key.offset++;
4717                 } else if (key.type < BTRFS_ROOT_ITEM_KEY) {
4718                         key.offset = 0;
4719                         key.type = BTRFS_ROOT_ITEM_KEY;
4720                 } else if (key.objectid < (u64)-1) {
4721                         key.offset = 0;
4722                         key.type = BTRFS_ROOT_ITEM_KEY;
4723                         key.objectid++;
4724                 } else {
4725                         break;
4726                 }
4727                 cond_resched();
4728         }
4729
4730 out:
4731         btrfs_free_path(path);
4732         if (trans && !IS_ERR(trans))
4733                 btrfs_end_transaction(trans);
4734         if (ret)
4735                 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
4736         else if (!closing)
4737                 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
4738         up(&fs_info->uuid_tree_rescan_sem);
4739         return 0;
4740 }
4741
4742 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
4743 {
4744         struct btrfs_trans_handle *trans;
4745         struct btrfs_root *tree_root = fs_info->tree_root;
4746         struct btrfs_root *uuid_root;
4747         struct task_struct *task;
4748         int ret;
4749
4750         /*
4751          * 1 - root node
4752          * 1 - root item
4753          */
4754         trans = btrfs_start_transaction(tree_root, 2);
4755         if (IS_ERR(trans))
4756                 return PTR_ERR(trans);
4757
4758         uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
4759         if (IS_ERR(uuid_root)) {
4760                 ret = PTR_ERR(uuid_root);
4761                 btrfs_abort_transaction(trans, ret);
4762                 btrfs_end_transaction(trans);
4763                 return ret;
4764         }
4765
4766         fs_info->uuid_root = uuid_root;
4767
4768         ret = btrfs_commit_transaction(trans);
4769         if (ret)
4770                 return ret;
4771
4772         down(&fs_info->uuid_tree_rescan_sem);
4773         task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
4774         if (IS_ERR(task)) {
4775                 /* fs_info->update_uuid_tree_gen remains 0 in all error case */
4776                 btrfs_warn(fs_info, "failed to start uuid_scan task");
4777                 up(&fs_info->uuid_tree_rescan_sem);
4778                 return PTR_ERR(task);
4779         }
4780
4781         return 0;
4782 }
4783
4784 /*
4785  * shrinking a device means finding all of the device extents past
4786  * the new size, and then following the back refs to the chunks.
4787  * The chunk relocation code actually frees the device extent
4788  */
4789 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
4790 {
4791         struct btrfs_fs_info *fs_info = device->fs_info;
4792         struct btrfs_root *root = fs_info->dev_root;
4793         struct btrfs_trans_handle *trans;
4794         struct btrfs_dev_extent *dev_extent = NULL;
4795         struct btrfs_path *path;
4796         u64 length;
4797         u64 chunk_offset;
4798         int ret;
4799         int slot;
4800         int failed = 0;
4801         bool retried = false;
4802         struct extent_buffer *l;
4803         struct btrfs_key key;
4804         struct btrfs_super_block *super_copy = fs_info->super_copy;
4805         u64 old_total = btrfs_super_total_bytes(super_copy);
4806         u64 old_size = btrfs_device_get_total_bytes(device);
4807         u64 diff;
4808         u64 start;
4809
4810         new_size = round_down(new_size, fs_info->sectorsize);
4811         start = new_size;
4812         diff = round_down(old_size - new_size, fs_info->sectorsize);
4813
4814         if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
4815                 return -EINVAL;
4816
4817         path = btrfs_alloc_path();
4818         if (!path)
4819                 return -ENOMEM;
4820
4821         path->reada = READA_BACK;
4822
4823         trans = btrfs_start_transaction(root, 0);
4824         if (IS_ERR(trans)) {
4825                 btrfs_free_path(path);
4826                 return PTR_ERR(trans);
4827         }
4828
4829         mutex_lock(&fs_info->chunk_mutex);
4830
4831         btrfs_device_set_total_bytes(device, new_size);
4832         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
4833                 device->fs_devices->total_rw_bytes -= diff;
4834                 atomic64_sub(diff, &fs_info->free_chunk_space);
4835         }
4836
4837         /*
4838          * Once the device's size has been set to the new size, ensure all
4839          * in-memory chunks are synced to disk so that the loop below sees them
4840          * and relocates them accordingly.
4841          */
4842         if (contains_pending_extent(device, &start, diff)) {
4843                 mutex_unlock(&fs_info->chunk_mutex);
4844                 ret = btrfs_commit_transaction(trans);
4845                 if (ret)
4846                         goto done;
4847         } else {
4848                 mutex_unlock(&fs_info->chunk_mutex);
4849                 btrfs_end_transaction(trans);
4850         }
4851
4852 again:
4853         key.objectid = device->devid;
4854         key.offset = (u64)-1;
4855         key.type = BTRFS_DEV_EXTENT_KEY;
4856
4857         do {
4858                 mutex_lock(&fs_info->reclaim_bgs_lock);
4859                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4860                 if (ret < 0) {
4861                         mutex_unlock(&fs_info->reclaim_bgs_lock);
4862                         goto done;
4863                 }
4864
4865                 ret = btrfs_previous_item(root, path, 0, key.type);
4866                 if (ret) {
4867                         mutex_unlock(&fs_info->reclaim_bgs_lock);
4868                         if (ret < 0)
4869                                 goto done;
4870                         ret = 0;
4871                         btrfs_release_path(path);
4872                         break;
4873                 }
4874
4875                 l = path->nodes[0];
4876                 slot = path->slots[0];
4877                 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
4878
4879                 if (key.objectid != device->devid) {
4880                         mutex_unlock(&fs_info->reclaim_bgs_lock);
4881                         btrfs_release_path(path);
4882                         break;
4883                 }
4884
4885                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
4886                 length = btrfs_dev_extent_length(l, dev_extent);
4887
4888                 if (key.offset + length <= new_size) {
4889                         mutex_unlock(&fs_info->reclaim_bgs_lock);
4890                         btrfs_release_path(path);
4891                         break;
4892                 }
4893
4894                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
4895                 btrfs_release_path(path);
4896
4897                 /*
4898                  * We may be relocating the only data chunk we have,
4899                  * which could potentially end up with losing data's
4900                  * raid profile, so lets allocate an empty one in
4901                  * advance.
4902                  */
4903                 ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
4904                 if (ret < 0) {
4905                         mutex_unlock(&fs_info->reclaim_bgs_lock);
4906                         goto done;
4907                 }
4908
4909                 ret = btrfs_relocate_chunk(fs_info, chunk_offset);
4910                 mutex_unlock(&fs_info->reclaim_bgs_lock);
4911                 if (ret == -ENOSPC) {
4912                         failed++;
4913                 } else if (ret) {
4914                         if (ret == -ETXTBSY) {
4915                                 btrfs_warn(fs_info,
4916                    "could not shrink block group %llu due to active swapfile",
4917                                            chunk_offset);
4918                         }
4919                         goto done;
4920                 }
4921         } while (key.offset-- > 0);
4922
4923         if (failed && !retried) {
4924                 failed = 0;
4925                 retried = true;
4926                 goto again;
4927         } else if (failed && retried) {
4928                 ret = -ENOSPC;
4929                 goto done;
4930         }
4931
4932         /* Shrinking succeeded, else we would be at "done". */
4933         trans = btrfs_start_transaction(root, 0);
4934         if (IS_ERR(trans)) {
4935                 ret = PTR_ERR(trans);
4936                 goto done;
4937         }
4938
4939         mutex_lock(&fs_info->chunk_mutex);
4940         /* Clear all state bits beyond the shrunk device size */
4941         clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
4942                           CHUNK_STATE_MASK);
4943
4944         btrfs_device_set_disk_total_bytes(device, new_size);
4945         if (list_empty(&device->post_commit_list))
4946                 list_add_tail(&device->post_commit_list,
4947                               &trans->transaction->dev_update_list);
4948
4949         WARN_ON(diff > old_total);
4950         btrfs_set_super_total_bytes(super_copy,
4951                         round_down(old_total - diff, fs_info->sectorsize));
4952         mutex_unlock(&fs_info->chunk_mutex);
4953
4954         btrfs_reserve_chunk_metadata(trans, false);
4955         /* Now btrfs_update_device() will change the on-disk size. */
4956         ret = btrfs_update_device(trans, device);
4957         btrfs_trans_release_chunk_metadata(trans);
4958         if (ret < 0) {
4959                 btrfs_abort_transaction(trans, ret);
4960                 btrfs_end_transaction(trans);
4961         } else {
4962                 ret = btrfs_commit_transaction(trans);
4963         }
4964 done:
4965         btrfs_free_path(path);
4966         if (ret) {
4967                 mutex_lock(&fs_info->chunk_mutex);
4968                 btrfs_device_set_total_bytes(device, old_size);
4969                 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
4970                         device->fs_devices->total_rw_bytes += diff;
4971                 atomic64_add(diff, &fs_info->free_chunk_space);
4972                 mutex_unlock(&fs_info->chunk_mutex);
4973         }
4974         return ret;
4975 }
4976
4977 static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
4978                            struct btrfs_key *key,
4979                            struct btrfs_chunk *chunk, int item_size)
4980 {
4981         struct btrfs_super_block *super_copy = fs_info->super_copy;
4982         struct btrfs_disk_key disk_key;
4983         u32 array_size;
4984         u8 *ptr;
4985
4986         lockdep_assert_held(&fs_info->chunk_mutex);
4987
4988         array_size = btrfs_super_sys_array_size(super_copy);
4989         if (array_size + item_size + sizeof(disk_key)
4990                         > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
4991                 return -EFBIG;
4992
4993         ptr = super_copy->sys_chunk_array + array_size;
4994         btrfs_cpu_key_to_disk(&disk_key, key);
4995         memcpy(ptr, &disk_key, sizeof(disk_key));
4996         ptr += sizeof(disk_key);
4997         memcpy(ptr, chunk, item_size);
4998         item_size += sizeof(disk_key);
4999         btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
5000
5001         return 0;
5002 }
5003
5004 /*
5005  * sort the devices in descending order by max_avail, total_avail
5006  */
5007 static int btrfs_cmp_device_info(const void *a, const void *b)
5008 {
5009         const struct btrfs_device_info *di_a = a;
5010         const struct btrfs_device_info *di_b = b;
5011
5012         if (di_a->max_avail > di_b->max_avail)
5013                 return -1;
5014         if (di_a->max_avail < di_b->max_avail)
5015                 return 1;
5016         if (di_a->total_avail > di_b->total_avail)
5017                 return -1;
5018         if (di_a->total_avail < di_b->total_avail)
5019                 return 1;
5020         return 0;
5021 }
5022
5023 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
5024 {
5025         if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
5026                 return;
5027
5028         btrfs_set_fs_incompat(info, RAID56);
5029 }
5030
5031 static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
5032 {
5033         if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4)))
5034                 return;
5035
5036         btrfs_set_fs_incompat(info, RAID1C34);
5037 }
5038
5039 /*
5040  * Structure used internally for btrfs_create_chunk() function.
5041  * Wraps needed parameters.
5042  */
5043 struct alloc_chunk_ctl {
5044         u64 start;
5045         u64 type;
5046         /* Total number of stripes to allocate */
5047         int num_stripes;
5048         /* sub_stripes info for map */
5049         int sub_stripes;
5050         /* Stripes per device */
5051         int dev_stripes;
5052         /* Maximum number of devices to use */
5053         int devs_max;
5054         /* Minimum number of devices to use */
5055         int devs_min;
5056         /* ndevs has to be a multiple of this */
5057         int devs_increment;
5058         /* Number of copies */
5059         int ncopies;
5060         /* Number of stripes worth of bytes to store parity information */
5061         int nparity;
5062         u64 max_stripe_size;
5063         u64 max_chunk_size;
5064         u64 dev_extent_min;
5065         u64 stripe_size;
5066         u64 chunk_size;
5067         int ndevs;
5068 };
5069
5070 static void init_alloc_chunk_ctl_policy_regular(
5071                                 struct btrfs_fs_devices *fs_devices,
5072                                 struct alloc_chunk_ctl *ctl)
5073 {
5074         u64 type = ctl->type;
5075
5076         if (type & BTRFS_BLOCK_GROUP_DATA) {
5077                 ctl->max_stripe_size = SZ_1G;
5078                 ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
5079         } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
5080                 /* For larger filesystems, use larger metadata chunks */
5081                 if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
5082                         ctl->max_stripe_size = SZ_1G;
5083                 else
5084                         ctl->max_stripe_size = SZ_256M;
5085                 ctl->max_chunk_size = ctl->max_stripe_size;
5086         } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
5087                 ctl->max_stripe_size = SZ_32M;
5088                 ctl->max_chunk_size = 2 * ctl->max_stripe_size;
5089                 ctl->devs_max = min_t(int, ctl->devs_max,
5090                                       BTRFS_MAX_DEVS_SYS_CHUNK);
5091         } else {
5092                 BUG();
5093         }
5094
5095         /* We don't want a chunk larger than 10% of writable space */
5096         ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
5097                                   ctl->max_chunk_size);
5098         ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
5099 }
5100
5101 static void init_alloc_chunk_ctl_policy_zoned(
5102                                       struct btrfs_fs_devices *fs_devices,
5103                                       struct alloc_chunk_ctl *ctl)
5104 {
5105         u64 zone_size = fs_devices->fs_info->zone_size;
5106         u64 limit;
5107         int min_num_stripes = ctl->devs_min * ctl->dev_stripes;
5108         int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies;
5109         u64 min_chunk_size = min_data_stripes * zone_size;
5110         u64 type = ctl->type;
5111
5112         ctl->max_stripe_size = zone_size;
5113         if (type & BTRFS_BLOCK_GROUP_DATA) {
5114                 ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE,
5115                                                  zone_size);
5116         } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
5117                 ctl->max_chunk_size = ctl->max_stripe_size;
5118         } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
5119                 ctl->max_chunk_size = 2 * ctl->max_stripe_size;
5120                 ctl->devs_max = min_t(int, ctl->devs_max,
5121                                       BTRFS_MAX_DEVS_SYS_CHUNK);
5122         } else {
5123                 BUG();
5124         }
5125
5126         /* We don't want a chunk larger than 10% of writable space */
5127         limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1),
5128                                zone_size),
5129                     min_chunk_size);
5130         ctl->max_chunk_size = min(limit, ctl->max_chunk_size);
5131         ctl->dev_extent_min = zone_size * ctl->dev_stripes;
5132 }
5133
5134 static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
5135                                  struct alloc_chunk_ctl *ctl)
5136 {
5137         int index = btrfs_bg_flags_to_raid_index(ctl->type);
5138
5139         ctl->sub_stripes = btrfs_raid_array[index].sub_stripes;
5140         ctl->dev_stripes = btrfs_raid_array[index].dev_stripes;
5141         ctl->devs_max = btrfs_raid_array[index].devs_max;
5142         if (!ctl->devs_max)
5143                 ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info);
5144         ctl->devs_min = btrfs_raid_array[index].devs_min;
5145         ctl->devs_increment = btrfs_raid_array[index].devs_increment;
5146         ctl->ncopies = btrfs_raid_array[index].ncopies;
5147         ctl->nparity = btrfs_raid_array[index].nparity;
5148         ctl->ndevs = 0;
5149
5150         switch (fs_devices->chunk_alloc_policy) {
5151         case BTRFS_CHUNK_ALLOC_REGULAR:
5152                 init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
5153                 break;
5154         case BTRFS_CHUNK_ALLOC_ZONED:
5155                 init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl);
5156                 break;
5157         default:
5158                 BUG();
5159         }
5160 }
5161
5162 static int gather_device_info(struct btrfs_fs_devices *fs_devices,
5163                               struct alloc_chunk_ctl *ctl,
5164                               struct btrfs_device_info *devices_info)
5165 {
5166         struct btrfs_fs_info *info = fs_devices->fs_info;
5167         struct btrfs_device *device;
5168         u64 total_avail;
5169         u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
5170         int ret;
5171         int ndevs = 0;
5172         u64 max_avail;
5173         u64 dev_offset;
5174
5175         /*
5176          * in the first pass through the devices list, we gather information
5177          * about the available holes on each device.
5178          */
5179         list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
5180                 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
5181                         WARN(1, KERN_ERR
5182                                "BTRFS: read-only device in alloc_list\n");
5183                         continue;
5184                 }
5185
5186                 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
5187                                         &device->dev_state) ||
5188                     test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
5189                         continue;
5190
5191                 if (device->total_bytes > device->bytes_used)
5192                         total_avail = device->total_bytes - device->bytes_used;
5193                 else
5194                         total_avail = 0;
5195
5196                 /* If there is no space on this device, skip it. */
5197                 if (total_avail < ctl->dev_extent_min)
5198                         continue;
5199
5200                 ret = find_free_dev_extent(device, dev_extent_want, &dev_offset,
5201                                            &max_avail);
5202                 if (ret && ret != -ENOSPC)
5203                         return ret;
5204
5205                 if (ret == 0)
5206                         max_avail = dev_extent_want;
5207
5208                 if (max_avail < ctl->dev_extent_min) {
5209                         if (btrfs_test_opt(info, ENOSPC_DEBUG))
5210                                 btrfs_debug(info,
5211                         "%s: devid %llu has no free space, have=%llu want=%llu",
5212                                             __func__, device->devid, max_avail,
5213                                             ctl->dev_extent_min);
5214                         continue;
5215                 }
5216
5217                 if (ndevs == fs_devices->rw_devices) {
5218                         WARN(1, "%s: found more than %llu devices\n",
5219                              __func__, fs_devices->rw_devices);
5220                         break;
5221                 }
5222                 devices_info[ndevs].dev_offset = dev_offset;
5223                 devices_info[ndevs].max_avail = max_avail;
5224                 devices_info[ndevs].total_avail = total_avail;
5225                 devices_info[ndevs].dev = device;
5226                 ++ndevs;
5227         }
5228         ctl->ndevs = ndevs;
5229
5230         /*
5231          * now sort the devices by hole size / available space
5232          */
5233         sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
5234              btrfs_cmp_device_info, NULL);
5235
5236         return 0;
5237 }
5238
5239 static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
5240                                       struct btrfs_device_info *devices_info)
5241 {
5242         /* Number of stripes that count for block group size */
5243         int data_stripes;
5244
5245         /*
5246          * The primary goal is to maximize the number of stripes, so use as
5247          * many devices as possible, even if the stripes are not maximum sized.
5248          *
5249          * The DUP profile stores more than one stripe per device, the
5250          * max_avail is the total size so we have to adjust.
5251          */
5252         ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
5253                                    ctl->dev_stripes);
5254         ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5255
5256         /* This will have to be fixed for RAID1 and RAID10 over more drives */
5257         data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
5258
5259         /*
5260          * Use the number of data stripes to figure out how big this chunk is
5261          * really going to be in terms of logical address space, and compare
5262          * that answer with the max chunk size. If it's higher, we try to
5263          * reduce stripe_size.
5264          */
5265         if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
5266                 /*
5267                  * Reduce stripe_size, round it up to a 16MB boundary again and
5268                  * then use it, unless it ends up being even bigger than the
5269                  * previous value we had already.
5270                  */
5271                 ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
5272                                                         data_stripes), SZ_16M),
5273                                        ctl->stripe_size);
5274         }
5275
5276         /* Align to BTRFS_STRIPE_LEN */
5277         ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
5278         ctl->chunk_size = ctl->stripe_size * data_stripes;
5279
5280         return 0;
5281 }
5282
5283 static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
5284                                     struct btrfs_device_info *devices_info)
5285 {
5286         u64 zone_size = devices_info[0].dev->zone_info->zone_size;
5287         /* Number of stripes that count for block group size */
5288         int data_stripes;
5289
5290         /*
5291          * It should hold because:
5292          *    dev_extent_min == dev_extent_want == zone_size * dev_stripes
5293          */
5294         ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min);
5295
5296         ctl->stripe_size = zone_size;
5297         ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5298         data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
5299
5300         /* stripe_size is fixed in zoned filesysmte. Reduce ndevs instead. */
5301         if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
5302                 ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies,
5303                                              ctl->stripe_size) + ctl->nparity,
5304                                      ctl->dev_stripes);
5305                 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5306                 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
5307                 ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size);
5308         }
5309
5310         ctl->chunk_size = ctl->stripe_size * data_stripes;
5311
5312         return 0;
5313 }
5314
5315 static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
5316                               struct alloc_chunk_ctl *ctl,
5317                               struct btrfs_device_info *devices_info)
5318 {
5319         struct btrfs_fs_info *info = fs_devices->fs_info;
5320
5321         /*
5322          * Round down to number of usable stripes, devs_increment can be any
5323          * number so we can't use round_down() that requires power of 2, while
5324          * rounddown is safe.
5325          */
5326         ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment);
5327
5328         if (ctl->ndevs < ctl->devs_min) {
5329                 if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
5330                         btrfs_debug(info,
5331         "%s: not enough devices with free space: have=%d minimum required=%d",
5332                                     __func__, ctl->ndevs, ctl->devs_min);
5333                 }
5334                 return -ENOSPC;
5335         }
5336
5337         ctl->ndevs = min(ctl->ndevs, ctl->devs_max);
5338
5339         switch (fs_devices->chunk_alloc_policy) {
5340         case BTRFS_CHUNK_ALLOC_REGULAR:
5341                 return decide_stripe_size_regular(ctl, devices_info);
5342         case BTRFS_CHUNK_ALLOC_ZONED:
5343                 return decide_stripe_size_zoned(ctl, devices_info);
5344         default:
5345                 BUG();
5346         }
5347 }
5348
5349 static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
5350                         struct alloc_chunk_ctl *ctl,
5351                         struct btrfs_device_info *devices_info)
5352 {
5353         struct btrfs_fs_info *info = trans->fs_info;
5354         struct map_lookup *map = NULL;
5355         struct extent_map_tree *em_tree;
5356         struct btrfs_block_group *block_group;
5357         struct extent_map *em;
5358         u64 start = ctl->start;
5359         u64 type = ctl->type;
5360         int ret;
5361         int i;
5362         int j;
5363
5364         map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
5365         if (!map)
5366                 return ERR_PTR(-ENOMEM);
5367         map->num_stripes = ctl->num_stripes;
5368
5369         for (i = 0; i < ctl->ndevs; ++i) {
5370                 for (j = 0; j < ctl->dev_stripes; ++j) {
5371                         int s = i * ctl->dev_stripes + j;
5372                         map->stripes[s].dev = devices_info[i].dev;
5373                         map->stripes[s].physical = devices_info[i].dev_offset +
5374                                                    j * ctl->stripe_size;
5375                 }
5376         }
5377         map->stripe_len = BTRFS_STRIPE_LEN;
5378         map->io_align = BTRFS_STRIPE_LEN;
5379         map->io_width = BTRFS_STRIPE_LEN;
5380         map->type = type;
5381         map->sub_stripes = ctl->sub_stripes;
5382
5383         trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);
5384
5385         em = alloc_extent_map();
5386         if (!em) {
5387                 kfree(map);
5388                 return ERR_PTR(-ENOMEM);
5389         }
5390         set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
5391         em->map_lookup = map;
5392         em->start = start;
5393         em->len = ctl->chunk_size;
5394         em->block_start = 0;
5395         em->block_len = em->len;
5396         em->orig_block_len = ctl->stripe_size;
5397
5398         em_tree = &info->mapping_tree;
5399         write_lock(&em_tree->lock);
5400         ret = add_extent_mapping(em_tree, em, 0);
5401         if (ret) {
5402                 write_unlock(&em_tree->lock);
5403                 free_extent_map(em);
5404                 return ERR_PTR(ret);
5405         }
5406         write_unlock(&em_tree->lock);
5407
5408         block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
5409         if (IS_ERR(block_group))
5410                 goto error_del_extent;
5411
5412         for (i = 0; i < map->num_stripes; i++) {
5413                 struct btrfs_device *dev = map->stripes[i].dev;
5414
5415                 btrfs_device_set_bytes_used(dev,
5416                                             dev->bytes_used + ctl->stripe_size);
5417                 if (list_empty(&dev->post_commit_list))
5418                         list_add_tail(&dev->post_commit_list,
5419                                       &trans->transaction->dev_update_list);
5420         }
5421
5422         atomic64_sub(ctl->stripe_size * map->num_stripes,
5423                      &info->free_chunk_space);
5424
5425         free_extent_map(em);
5426         check_raid56_incompat_flag(info, type);
5427         check_raid1c34_incompat_flag(info, type);
5428
5429         return block_group;
5430
5431 error_del_extent:
5432         write_lock(&em_tree->lock);
5433         remove_extent_mapping(em_tree, em);
5434         write_unlock(&em_tree->lock);
5435
5436         /* One for our allocation */
5437         free_extent_map(em);
5438         /* One for the tree reference */
5439         free_extent_map(em);
5440
5441         return block_group;
5442 }
5443
5444 struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
5445                                             u64 type)
5446 {
5447         struct btrfs_fs_info *info = trans->fs_info;
5448         struct btrfs_fs_devices *fs_devices = info->fs_devices;
5449         struct btrfs_device_info *devices_info = NULL;
5450         struct alloc_chunk_ctl ctl;
5451         struct btrfs_block_group *block_group;
5452         int ret;
5453
5454         lockdep_assert_held(&info->chunk_mutex);
5455
5456         if (!alloc_profile_is_valid(type, 0)) {
5457                 ASSERT(0);
5458                 return ERR_PTR(-EINVAL);
5459         }
5460
5461         if (list_empty(&fs_devices->alloc_list)) {
5462                 if (btrfs_test_opt(info, ENOSPC_DEBUG))
5463                         btrfs_debug(info, "%s: no writable device", __func__);
5464                 return ERR_PTR(-ENOSPC);
5465         }
5466
5467         if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
5468                 btrfs_err(info, "invalid chunk type 0x%llx requested", type);
5469                 ASSERT(0);
5470                 return ERR_PTR(-EINVAL);
5471         }
5472
5473         ctl.start = find_next_chunk(info);
5474         ctl.type = type;
5475         init_alloc_chunk_ctl(fs_devices, &ctl);
5476
5477         devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
5478                                GFP_NOFS);
5479         if (!devices_info)
5480                 return ERR_PTR(-ENOMEM);
5481
5482         ret = gather_device_info(fs_devices, &ctl, devices_info);
5483         if (ret < 0) {
5484                 block_group = ERR_PTR(ret);
5485                 goto out;
5486         }
5487
5488         ret = decide_stripe_size(fs_devices, &ctl, devices_info);
5489         if (ret < 0) {
5490                 block_group = ERR_PTR(ret);
5491                 goto out;
5492         }
5493
5494         block_group = create_chunk(trans, &ctl, devices_info);
5495
5496 out:
5497         kfree(devices_info);
5498         return block_group;
5499 }
5500
5501 /*
5502  * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the
5503  * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system
5504  * chunks.
5505  *
5506  * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
5507  * phases.
5508  */
5509 int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
5510                                      struct btrfs_block_group *bg)
5511 {
5512         struct btrfs_fs_info *fs_info = trans->fs_info;
5513         struct btrfs_root *chunk_root = fs_info->chunk_root;
5514         struct btrfs_key key;
5515         struct btrfs_chunk *chunk;
5516         struct btrfs_stripe *stripe;
5517         struct extent_map *em;
5518         struct map_lookup *map;
5519         size_t item_size;
5520         int i;
5521         int ret;
5522
5523         /*
5524          * We take the chunk_mutex for 2 reasons:
5525          *
5526          * 1) Updates and insertions in the chunk btree must be done while holding
5527          *    the chunk_mutex, as well as updating the system chunk array in the
5528          *    superblock. See the comment on top of btrfs_chunk_alloc() for the
5529          *    details;
5530          *
5531          * 2) To prevent races with the final phase of a device replace operation
5532          *    that replaces the device object associated with the map's stripes,
5533          *    because the device object's id can change at any time during that
5534          *    final phase of the device replace operation
5535          *    (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
5536          *    replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
5537          *    which would cause a failure when updating the device item, which does
5538          *    not exists, or persisting a stripe of the chunk item with such ID.
5539          *    Here we can't use the device_list_mutex because our caller already
5540          *    has locked the chunk_mutex, and the final phase of device replace
5541          *    acquires both mutexes - first the device_list_mutex and then the
5542          *    chunk_mutex. Using any of those two mutexes protects us from a
5543          *    concurrent device replace.
5544          */
5545         lockdep_assert_held(&fs_info->chunk_mutex);
5546
5547         em = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
5548         if (IS_ERR(em)) {
5549                 ret = PTR_ERR(em);
5550                 btrfs_abort_transaction(trans, ret);
5551                 return ret;
5552         }
5553
5554         map = em->map_lookup;
5555         item_size = btrfs_chunk_item_size(map->num_stripes);
5556
5557         chunk = kzalloc(item_size, GFP_NOFS);
5558         if (!chunk) {
5559                 ret = -ENOMEM;
5560                 btrfs_abort_transaction(trans, ret);
5561                 goto out;
5562         }
5563
5564         for (i = 0; i < map->num_stripes; i++) {
5565                 struct btrfs_device *device = map->stripes[i].dev;
5566
5567                 ret = btrfs_update_device(trans, device);
5568                 if (ret)
5569                         goto out;
5570         }
5571
5572         stripe = &chunk->stripe;
5573         for (i = 0; i < map->num_stripes; i++) {
5574                 struct btrfs_device *device = map->stripes[i].dev;
5575                 const u64 dev_offset = map->stripes[i].physical;
5576
5577                 btrfs_set_stack_stripe_devid(stripe, device->devid);
5578                 btrfs_set_stack_stripe_offset(stripe, dev_offset);
5579                 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
5580                 stripe++;
5581         }
5582
5583         btrfs_set_stack_chunk_length(chunk, bg->length);
5584         btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID);
5585         btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
5586         btrfs_set_stack_chunk_type(chunk, map->type);
5587         btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
5588         btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
5589         btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
5590         btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
5591         btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
5592
5593         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
5594         key.type = BTRFS_CHUNK_ITEM_KEY;
5595         key.offset = bg->start;
5596
5597         ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
5598         if (ret)
5599                 goto out;
5600
5601         bg->chunk_item_inserted = 1;
5602
5603         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
5604                 ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
5605                 if (ret)
5606                         goto out;
5607         }
5608
5609 out:
5610         kfree(chunk);
5611         free_extent_map(em);
5612         return ret;
5613 }
5614
5615 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
5616 {
5617         struct btrfs_fs_info *fs_info = trans->fs_info;
5618         u64 alloc_profile;
5619         struct btrfs_block_group *meta_bg;
5620         struct btrfs_block_group *sys_bg;
5621
5622         /*
5623          * When adding a new device for sprouting, the seed device is read-only
5624          * so we must first allocate a metadata and a system chunk. But before
5625          * adding the block group items to the extent, device and chunk btrees,
5626          * we must first:
5627          *
5628          * 1) Create both chunks without doing any changes to the btrees, as
5629          *    otherwise we would get -ENOSPC since the block groups from the
5630          *    seed device are read-only;
5631          *
5632          * 2) Add the device item for the new sprout device - finishing the setup
5633          *    of a new block group requires updating the device item in the chunk
5634          *    btree, so it must exist when we attempt to do it. The previous step
5635          *    ensures this does not fail with -ENOSPC.
5636          *
5637          * After that we can add the block group items to their btrees:
5638          * update existing device item in the chunk btree, add a new block group
5639          * item to the extent btree, add a new chunk item to the chunk btree and
5640          * finally add the new device extent items to the devices btree.
5641          */
5642
5643         alloc_profile = btrfs_metadata_alloc_profile(fs_info);
5644         meta_bg = btrfs_create_chunk(trans, alloc_profile);
5645         if (IS_ERR(meta_bg))
5646                 return PTR_ERR(meta_bg);
5647
5648         alloc_profile = btrfs_system_alloc_profile(fs_info);
5649         sys_bg = btrfs_create_chunk(trans, alloc_profile);
5650         if (IS_ERR(sys_bg))
5651                 return PTR_ERR(sys_bg);
5652
5653         return 0;
5654 }
5655
5656 static inline int btrfs_chunk_max_errors(struct map_lookup *map)
5657 {
5658         const int index = btrfs_bg_flags_to_raid_index(map->type);
5659
5660         return btrfs_raid_array[index].tolerated_failures;
5661 }
5662
5663 bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset)
5664 {
5665         struct extent_map *em;
5666         struct map_lookup *map;
5667         int miss_ndevs = 0;
5668         int i;
5669         bool ret = true;
5670
5671         em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
5672         if (IS_ERR(em))
5673                 return false;
5674
5675         map = em->map_lookup;
5676         for (i = 0; i < map->num_stripes; i++) {
5677                 if (test_bit(BTRFS_DEV_STATE_MISSING,
5678                                         &map->stripes[i].dev->dev_state)) {
5679                         miss_ndevs++;
5680                         continue;
5681                 }
5682                 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
5683                                         &map->stripes[i].dev->dev_state)) {
5684                         ret = false;
5685                         goto end;
5686                 }
5687         }
5688
5689         /*
5690          * If the number of missing devices is larger than max errors, we can
5691          * not write the data into that chunk successfully.
5692          */
5693         if (miss_ndevs > btrfs_chunk_max_errors(map))
5694                 ret = false;
5695 end:
5696         free_extent_map(em);
5697         return ret;
5698 }
5699
5700 void btrfs_mapping_tree_free(struct extent_map_tree *tree)
5701 {
5702         struct extent_map *em;
5703
5704         while (1) {
5705                 write_lock(&tree->lock);
5706                 em = lookup_extent_mapping(tree, 0, (u64)-1);
5707                 if (em)
5708                         remove_extent_mapping(tree, em);
5709                 write_unlock(&tree->lock);
5710                 if (!em)
5711                         break;
5712                 /* once for us */
5713                 free_extent_map(em);
5714                 /* once for the tree */
5715                 free_extent_map(em);
5716         }
5717 }
5718
5719 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5720 {
5721         struct extent_map *em;
5722         struct map_lookup *map;
5723         int ret;
5724
5725         em = btrfs_get_chunk_map(fs_info, logical, len);
5726         if (IS_ERR(em))
5727                 /*
5728                  * We could return errors for these cases, but that could get
5729                  * ugly and we'd probably do the same thing which is just not do
5730                  * anything else and exit, so return 1 so the callers don't try
5731                  * to use other copies.
5732                  */
5733                 return 1;
5734
5735         map = em->map_lookup;
5736         if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK))
5737                 ret = map->num_stripes;
5738         else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5739                 ret = map->sub_stripes;
5740         else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
5741                 ret = 2;
5742         else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
5743                 /*
5744                  * There could be two corrupted data stripes, we need
5745                  * to loop retry in order to rebuild the correct data.
5746                  *
5747                  * Fail a stripe at a time on every retry except the
5748                  * stripe under reconstruction.
5749                  */
5750                 ret = map->num_stripes;
5751         else
5752                 ret = 1;
5753         free_extent_map(em);
5754
5755         down_read(&fs_info->dev_replace.rwsem);
5756         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
5757             fs_info->dev_replace.tgtdev)
5758                 ret++;
5759         up_read(&fs_info->dev_replace.rwsem);
5760
5761         return ret;
5762 }
5763
5764 unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
5765                                     u64 logical)
5766 {
5767         struct extent_map *em;
5768         struct map_lookup *map;
5769         unsigned long len = fs_info->sectorsize;
5770
5771         em = btrfs_get_chunk_map(fs_info, logical, len);
5772
5773         if (!WARN_ON(IS_ERR(em))) {
5774                 map = em->map_lookup;
5775                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5776                         len = map->stripe_len * nr_data_stripes(map);
5777                 free_extent_map(em);
5778         }
5779         return len;
5780 }
5781
5782 int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5783 {
5784         struct extent_map *em;
5785         struct map_lookup *map;
5786         int ret = 0;
5787
5788         em = btrfs_get_chunk_map(fs_info, logical, len);
5789
5790         if(!WARN_ON(IS_ERR(em))) {
5791                 map = em->map_lookup;
5792                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5793                         ret = 1;
5794                 free_extent_map(em);
5795         }
5796         return ret;
5797 }
5798
5799 static int find_live_mirror(struct btrfs_fs_info *fs_info,
5800                             struct map_lookup *map, int first,
5801                             int dev_replace_is_ongoing)
5802 {
5803         int i;
5804         int num_stripes;
5805         int preferred_mirror;
5806         int tolerance;
5807         struct btrfs_device *srcdev;
5808
5809         ASSERT((map->type &
5810                  (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));
5811
5812         if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5813                 num_stripes = map->sub_stripes;
5814         else
5815                 num_stripes = map->num_stripes;
5816
5817         switch (fs_info->fs_devices->read_policy) {
5818         default:
5819                 /* Shouldn't happen, just warn and use pid instead of failing */
5820                 btrfs_warn_rl(fs_info,
5821                               "unknown read_policy type %u, reset to pid",
5822                               fs_info->fs_devices->read_policy);
5823                 fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID;
5824                 fallthrough;
5825         case BTRFS_READ_POLICY_PID:
5826                 preferred_mirror = first + (current->pid % num_stripes);
5827                 break;
5828         }
5829
5830         if (dev_replace_is_ongoing &&
5831             fs_info->dev_replace.cont_reading_from_srcdev_mode ==
5832              BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
5833                 srcdev = fs_info->dev_replace.srcdev;
5834         else
5835                 srcdev = NULL;
5836
5837         /*
5838          * try to avoid the drive that is the source drive for a
5839          * dev-replace procedure, only choose it if no other non-missing
5840          * mirror is available
5841          */
5842         for (tolerance = 0; tolerance < 2; tolerance++) {
5843                 if (map->stripes[preferred_mirror].dev->bdev &&
5844                     (tolerance || map->stripes[preferred_mirror].dev != srcdev))
5845                         return preferred_mirror;
5846                 for (i = first; i < first + num_stripes; i++) {
5847                         if (map->stripes[i].dev->bdev &&
5848                             (tolerance || map->stripes[i].dev != srcdev))
5849                                 return i;
5850                 }
5851         }
5852
5853         /* we couldn't find one that doesn't fail.  Just return something
5854          * and the io error handling code will clean up eventually
5855          */
5856         return preferred_mirror;
5857 }
5858
5859 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */
5860 static void sort_parity_stripes(struct btrfs_io_context *bioc, int num_stripes)
5861 {
5862         int i;
5863         int again = 1;
5864
5865         while (again) {
5866                 again = 0;
5867                 for (i = 0; i < num_stripes - 1; i++) {
5868                         /* Swap if parity is on a smaller index */
5869                         if (bioc->raid_map[i] > bioc->raid_map[i + 1]) {
5870                                 swap(bioc->stripes[i], bioc->stripes[i + 1]);
5871                                 swap(bioc->raid_map[i], bioc->raid_map[i + 1]);
5872                                 again = 1;
5873                         }
5874                 }
5875         }
5876 }
5877
5878 static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
5879                                                        int total_stripes,
5880                                                        int real_stripes)
5881 {
5882         struct btrfs_io_context *bioc = kzalloc(
5883                  /* The size of btrfs_io_context */
5884                 sizeof(struct btrfs_io_context) +
5885                 /* Plus the variable array for the stripes */
5886                 sizeof(struct btrfs_io_stripe) * (total_stripes) +
5887                 /* Plus the variable array for the tgt dev */
5888                 sizeof(int) * (real_stripes) +
5889                 /*
5890                  * Plus the raid_map, which includes both the tgt dev
5891                  * and the stripes.
5892                  */
5893                 sizeof(u64) * (total_stripes),
5894                 GFP_NOFS|__GFP_NOFAIL);
5895
5896         atomic_set(&bioc->error, 0);
5897         refcount_set(&bioc->refs, 1);
5898
5899         bioc->fs_info = fs_info;
5900         bioc->tgtdev_map = (int *)(bioc->stripes + total_stripes);
5901         bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes);
5902
5903         return bioc;
5904 }
5905
/*
 * Take an additional reference on @bioc.
 *
 * Emits a warning if the reference count reads as zero when called; the
 * increment is performed regardless.
 */
void btrfs_get_bioc(struct btrfs_io_context *bioc)
{
        WARN_ON(!refcount_read(&bioc->refs));
        refcount_inc(&bioc->refs);
}
5911
5912 void btrfs_put_bioc(struct btrfs_io_context *bioc)
5913 {
5914         if (!bioc)
5915                 return;
5916         if (refcount_dec_and_test(&bioc->refs))
5917                 kfree(bioc);
5918 }
5919
/*
 * Map the logical range [@logical, @logical + *@length_ret) to the per-device
 * stripes that a discard has to be sent to.
 *
 * The range is clamped to the end of the chunk containing @logical and the
 * clamped length is written back to *@length_ret. On success *@bioc_ret is
 * set to a newly allocated btrfs_io_context holding one entry (device,
 * physical offset, length) per stripe.
 *
 * Please note that discard won't be sent to the target device of a device
 * replace.
 *
 * Open question left by the original author: can REQ_OP_DISCARD be sent with
 * other REQ like REQ_OP_WRITE?
 *
 * Returns 0 on success, the error from the chunk map lookup, -EOPNOTSUPP for
 * RAID5/6 chunks, or -ENOMEM.
 */
static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
                                         u64 logical, u64 *length_ret,
                                         struct btrfs_io_context **bioc_ret)
{
        struct extent_map *em;
        struct map_lookup *map;
        struct btrfs_io_context *bioc;
        u64 length = *length_ret;
        u64 offset;
        u64 stripe_nr;
        u64 stripe_nr_end;
        u64 stripe_end_offset;
        u64 stripe_cnt;
        u64 stripe_len;
        u64 stripe_offset;
        u64 num_stripes;
        u32 stripe_index;
        u32 factor = 0;
        u32 sub_stripes = 0;
        u64 stripes_per_dev = 0;
        u32 remaining_stripes = 0;
        u32 last_stripe = 0;
        int ret = 0;
        int i;

        /* Discard always returns a bioc. */
        ASSERT(bioc_ret);

        em = btrfs_get_chunk_map(fs_info, logical, length);
        if (IS_ERR(em))
                return PTR_ERR(em);

        map = em->map_lookup;
        /* Discard is not supported on RAID5/6 chunks yet. */
        if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
                ret = -EOPNOTSUPP;
                goto out;
        }

        /* Offset inside the chunk; never map past the end of the chunk. */
        offset = logical - em->start;
        length = min_t(u64, em->start + em->len - logical, length);
        *length_ret = length;

        stripe_len = map->stripe_len;
        /*
         * stripe_nr counts the total number of stripes we have to stride
         * to get to this block
         */
        stripe_nr = div64_u64(offset, stripe_len);

        /* stripe_offset is the offset of this block in its stripe */
        stripe_offset = offset - stripe_nr * stripe_len;

        /*
         * stripe_nr_end is the number of the first stripe past the range,
         * stripe_cnt the number of stripes the range touches, and
         * stripe_end_offset the unused tail of the last touched stripe.
         */
        stripe_nr_end = round_up(offset + length, map->stripe_len);
        stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
        stripe_cnt = stripe_nr_end - stripe_nr;
        stripe_end_offset = stripe_nr_end * map->stripe_len -
                            (offset + length);
        /*
         * after this, stripe_nr is the number of stripes on this
         * device we have to walk to find the data, and stripe_index is
         * the number of our device in the stripe array
         */
        num_stripes = 1;
        stripe_index = 0;
        if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
                         BTRFS_BLOCK_GROUP_RAID10)) {
                /* RAID0 is handled like RAID10 with a single sub stripe. */
                if (map->type & BTRFS_BLOCK_GROUP_RAID0)
                        sub_stripes = 1;
                else
                        sub_stripes = map->sub_stripes;

                /* factor is the number of stripe groups data is spread over. */
                factor = map->num_stripes / sub_stripes;
                num_stripes = min_t(u64, map->num_stripes,
                                    sub_stripes * stripe_cnt);
                stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
                stripe_index *= sub_stripes;
                /*
                 * Every group gets stripes_per_dev full stripes, and
                 * remaining_stripes groups get one stripe more.
                 */
                stripes_per_dev = div_u64_rem(stripe_cnt, factor,
                                              &remaining_stripes);
                /* last_stripe is the first index of the group holding the end. */
                div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
                last_stripe *= sub_stripes;
        } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
                                BTRFS_BLOCK_GROUP_DUP)) {
                /* Mirrored profiles: the range is discarded on every stripe. */
                num_stripes = map->num_stripes;
        } else {
                stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
                                        &stripe_index);
        }

        /*
         * alloc_btrfs_io_context() allocates with __GFP_NOFAIL, so the NULL
         * check below is purely defensive.
         */
        bioc = alloc_btrfs_io_context(fs_info, num_stripes, 0);
        if (!bioc) {
                ret = -ENOMEM;
                goto out;
        }

        for (i = 0; i < num_stripes; i++) {
                bioc->stripes[i].physical =
                        map->stripes[stripe_index].physical +
                        stripe_offset + stripe_nr * map->stripe_len;
                bioc->stripes[i].dev = map->stripes[stripe_index].dev;

                if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
                                 BTRFS_BLOCK_GROUP_RAID10)) {
                        bioc->stripes[i].length = stripes_per_dev *
                                map->stripe_len;

                        /* The first remaining_stripes groups get one more. */
                        if (i / sub_stripes < remaining_stripes)
                                bioc->stripes[i].length += map->stripe_len;

                        /*
                         * Special for the first stripe and
                         * the last stripe:
                         *
                         * |-------|...|-------|
                         *     |----------|
                         *    off     end_off
                         */
                        if (i < sub_stripes)
                                bioc->stripes[i].length -= stripe_offset;

                        if (stripe_index >= last_stripe &&
                            stripe_index <= (last_stripe +
                                             sub_stripes - 1))
                                bioc->stripes[i].length -= stripe_end_offset;

                        /*
                         * Only the first group starts inside a stripe, all
                         * following groups start at a stripe boundary.
                         */
                        if (i == sub_stripes - 1)
                                stripe_offset = 0;
                } else {
                        bioc->stripes[i].length = length;
                }

                /* Advance to the next device, wrapping to the next row. */
                stripe_index++;
                if (stripe_index == map->num_stripes) {
                        stripe_index = 0;
                        stripe_nr++;
                }
        }

        *bioc_ret = bioc;
        bioc->map_type = map->type;
        bioc->num_stripes = num_stripes;
out:
        free_extent_map(em);
        return ret;
}
6070
6071 /*
6072  * In dev-replace case, for repair case (that's the only case where the mirror
6073  * is selected explicitly when calling btrfs_map_block), blocks left of the
6074  * left cursor can also be read from the target drive.
6075  *
6076  * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
6077  * array of stripes.
6078  * For READ, it also needs to be supported using the same mirror number.
6079  *
6080  * If the requested block is not left of the left cursor, EIO is returned. This
6081  * can happen because btrfs_num_copies() returns one more in the dev-replace
6082  * case.
6083  */
6084 static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
6085                                          u64 logical, u64 length,
6086                                          u64 srcdev_devid, int *mirror_num,
6087                                          u64 *physical)
6088 {
6089         struct btrfs_io_context *bioc = NULL;
6090         int num_stripes;
6091         int index_srcdev = 0;
6092         int found = 0;
6093         u64 physical_of_found = 0;
6094         int i;
6095         int ret = 0;
6096
6097         ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
6098                                 logical, &length, &bioc, 0, 0);
6099         if (ret) {
6100                 ASSERT(bioc == NULL);
6101                 return ret;
6102         }
6103
6104         num_stripes = bioc->num_stripes;
6105         if (*mirror_num > num_stripes) {
6106                 /*
6107                  * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
6108                  * that means that the requested area is not left of the left
6109                  * cursor
6110                  */
6111                 btrfs_put_bioc(bioc);
6112                 return -EIO;
6113         }
6114
6115         /*
6116          * process the rest of the function using the mirror_num of the source
6117          * drive. Therefore look it up first.  At the end, patch the device
6118          * pointer to the one of the target drive.
6119          */
6120         for (i = 0; i < num_stripes; i++) {
6121                 if (bioc->stripes[i].dev->devid != srcdev_devid)
6122                         continue;
6123
6124                 /*
6125                  * In case of DUP, in order to keep it simple, only add the
6126                  * mirror with the lowest physical address
6127                  */
6128                 if (found &&
6129                     physical_of_found <= bioc->stripes[i].physical)
6130                         continue;
6131
6132                 index_srcdev = i;
6133                 found = 1;
6134                 physical_of_found = bioc->stripes[i].physical;
6135         }
6136
6137         btrfs_put_bioc(bioc);
6138
6139         ASSERT(found);
6140         if (!found)
6141                 return -EIO;
6142
6143         *mirror_num = index_srcdev + 1;
6144         *physical = physical_of_found;
6145         return ret;
6146 }
6147
6148 static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
6149 {
6150         struct btrfs_block_group *cache;
6151         bool ret;
6152
6153         /* Non zoned filesystem does not use "to_copy" flag */
6154         if (!btrfs_is_zoned(fs_info))
6155                 return false;
6156
6157         cache = btrfs_lookup_block_group(fs_info, logical);
6158
6159         spin_lock(&cache->lock);
6160         ret = cache->to_copy;
6161         spin_unlock(&cache->lock);
6162
6163         btrfs_put_block_group(cache);
6164         return ret;
6165 }
6166
6167 static void handle_ops_on_dev_replace(enum btrfs_map_op op,
6168                                       struct btrfs_io_context **bioc_ret,
6169                                       struct btrfs_dev_replace *dev_replace,
6170                                       u64 logical,
6171                                       int *num_stripes_ret, int *max_errors_ret)
6172 {
6173         struct btrfs_io_context *bioc = *bioc_ret;
6174         u64 srcdev_devid = dev_replace->srcdev->devid;
6175         int tgtdev_indexes = 0;
6176         int num_stripes = *num_stripes_ret;
6177         int max_errors = *max_errors_ret;
6178         int i;
6179
6180         if (op == BTRFS_MAP_WRITE) {
6181                 int index_where_to_add;
6182
6183                 /*
6184                  * A block group which have "to_copy" set will eventually
6185                  * copied by dev-replace process. We can avoid cloning IO here.
6186                  */
6187                 if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical))
6188                         return;
6189
6190                 /*
6191                  * duplicate the write operations while the dev replace
6192                  * procedure is running. Since the copying of the old disk to
6193                  * the new disk takes place at run time while the filesystem is
6194                  * mounted writable, the regular write operations to the old
6195                  * disk have to be duplicated to go to the new disk as well.
6196                  *
6197                  * Note that device->missing is handled by the caller, and that
6198                  * the write to the old disk is already set up in the stripes
6199                  * array.
6200                  */
6201                 index_where_to_add = num_stripes;
6202                 for (i = 0; i < num_stripes; i++) {
6203                         if (bioc->stripes[i].dev->devid == srcdev_devid) {
6204                                 /* write to new disk, too */
6205                                 struct btrfs_io_stripe *new =
6206                                         bioc->stripes + index_where_to_add;
6207                                 struct btrfs_io_stripe *old =
6208                                         bioc->stripes + i;
6209
6210                                 new->physical = old->physical;
6211                                 new->length = old->length;
6212                                 new->dev = dev_replace->tgtdev;
6213                                 bioc->tgtdev_map[i] = index_where_to_add;
6214                                 index_where_to_add++;
6215                                 max_errors++;
6216                                 tgtdev_indexes++;
6217                         }
6218                 }
6219                 num_stripes = index_where_to_add;
6220         } else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
6221                 int index_srcdev = 0;
6222                 int found = 0;
6223                 u64 physical_of_found = 0;
6224
6225                 /*
6226                  * During the dev-replace procedure, the target drive can also
6227                  * be used to read data in case it is needed to repair a corrupt
6228                  * block elsewhere. This is possible if the requested area is
6229                  * left of the left cursor. In this area, the target drive is a
6230                  * full copy of the source drive.
6231                  */
6232                 for (i = 0; i < num_stripes; i++) {
6233                         if (bioc->stripes[i].dev->devid == srcdev_devid) {
6234                                 /*
6235                                  * In case of DUP, in order to keep it simple,
6236                                  * only add the mirror with the lowest physical
6237                                  * address
6238                                  */
6239                                 if (found &&
6240                                     physical_of_found <= bioc->stripes[i].physical)
6241                                         continue;
6242                                 index_srcdev = i;
6243                                 found = 1;
6244                                 physical_of_found = bioc->stripes[i].physical;
6245                         }
6246                 }
6247                 if (found) {
6248                         struct btrfs_io_stripe *tgtdev_stripe =
6249                                 bioc->stripes + num_stripes;
6250
6251                         tgtdev_stripe->physical = physical_of_found;
6252                         tgtdev_stripe->length =
6253                                 bioc->stripes[index_srcdev].length;
6254                         tgtdev_stripe->dev = dev_replace->tgtdev;
6255                         bioc->tgtdev_map[index_srcdev] = num_stripes;
6256
6257                         tgtdev_indexes++;
6258                         num_stripes++;
6259                 }
6260         }
6261
6262         *num_stripes_ret = num_stripes;
6263         *max_errors_ret = max_errors;
6264         bioc->num_tgtdevs = tgtdev_indexes;
6265         *bioc_ret = bioc;
6266 }
6267
6268 static bool need_full_stripe(enum btrfs_map_op op)
6269 {
6270         return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
6271 }
6272
6273 /*
6274  * Calculate the geometry of a particular (address, len) tuple. This
6275  * information is used to calculate how big a particular bio can get before it
6276  * straddles a stripe.
6277  *
6278  * @fs_info: the filesystem
6279  * @em:      mapping containing the logical extent
6280  * @op:      type of operation - write or read
6281  * @logical: address that we want to figure out the geometry of
6282  * @io_geom: pointer used to return values
6283  *
6284  * Returns < 0 in case a chunk for the given logical address cannot be found,
6285  * usually shouldn't happen unless @logical is corrupted, 0 otherwise.
6286  */
6287 int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
6288                           enum btrfs_map_op op, u64 logical,
6289                           struct btrfs_io_geometry *io_geom)
6290 {
6291         struct map_lookup *map;
6292         u64 len;
6293         u64 offset;
6294         u64 stripe_offset;
6295         u64 stripe_nr;
6296         u32 stripe_len;
6297         u64 raid56_full_stripe_start = (u64)-1;
6298         int data_stripes;
6299
6300         ASSERT(op != BTRFS_MAP_DISCARD);
6301
6302         map = em->map_lookup;
6303         /* Offset of this logical address in the chunk */
6304         offset = logical - em->start;
6305         /* Len of a stripe in a chunk */
6306         stripe_len = map->stripe_len;
6307         /*
6308          * Stripe_nr is where this block falls in
6309          * stripe_offset is the offset of this block in its stripe.
6310          */
6311         stripe_nr = div64_u64_rem(offset, stripe_len, &stripe_offset);
6312         ASSERT(stripe_offset < U32_MAX);
6313
6314         data_stripes = nr_data_stripes(map);
6315
6316         /* Only stripe based profiles needs to check against stripe length. */
6317         if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) {
6318                 u64 max_len = stripe_len - stripe_offset;
6319
6320                 /*
6321                  * In case of raid56, we need to know the stripe aligned start
6322                  */
6323                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
6324                         unsigned long full_stripe_len = stripe_len * data_stripes;
6325                         raid56_full_stripe_start = offset;
6326
6327                         /*
6328                          * Allow a write of a full stripe, but make sure we
6329                          * don't allow straddling of stripes
6330                          */
6331                         raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
6332                                         full_stripe_len);
6333                         raid56_full_stripe_start *= full_stripe_len;
6334
6335                         /*
6336                          * For writes to RAID[56], allow a full stripeset across
6337                          * all disks. For other RAID types and for RAID[56]
6338                          * reads, just allow a single stripe (on a single disk).
6339                          */
6340                         if (op == BTRFS_MAP_WRITE) {
6341                                 max_len = stripe_len * data_stripes -
6342                                           (offset - raid56_full_stripe_start);
6343                         }
6344                 }
6345                 len = min_t(u64, em->len - offset, max_len);
6346         } else {
6347                 len = em->len - offset;
6348         }
6349
6350         io_geom->len = len;
6351         io_geom->offset = offset;
6352         io_geom->stripe_len = stripe_len;
6353         io_geom->stripe_nr = stripe_nr;
6354         io_geom->stripe_offset = stripe_offset;
6355         io_geom->raid56_stripe_offset = raid56_full_stripe_start;
6356
6357         return 0;
6358 }
6359
6360 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
6361                              enum btrfs_map_op op,
6362                              u64 logical, u64 *length,
6363                              struct btrfs_io_context **bioc_ret,
6364                              int mirror_num, int need_raid_map)
6365 {
6366         struct extent_map *em;
6367         struct map_lookup *map;
6368         u64 stripe_offset;
6369         u64 stripe_nr;
6370         u64 stripe_len;
6371         u32 stripe_index;
6372         int data_stripes;
6373         int i;
6374         int ret = 0;
6375         int num_stripes;
6376         int max_errors = 0;
6377         int tgtdev_indexes = 0;
6378         struct btrfs_io_context *bioc = NULL;
6379         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
6380         int dev_replace_is_ongoing = 0;
6381         int num_alloc_stripes;
6382         int patch_the_first_stripe_for_dev_replace = 0;
6383         u64 physical_to_patch_in_first_stripe = 0;
6384         u64 raid56_full_stripe_start = (u64)-1;
6385         struct btrfs_io_geometry geom;
6386
6387         ASSERT(bioc_ret);
6388         ASSERT(op != BTRFS_MAP_DISCARD);
6389
6390         em = btrfs_get_chunk_map(fs_info, logical, *length);
6391         ASSERT(!IS_ERR(em));
6392
6393         ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom);
6394         if (ret < 0)
6395                 return ret;
6396
6397         map = em->map_lookup;
6398
6399         *length = geom.len;
6400         stripe_len = geom.stripe_len;
6401         stripe_nr = geom.stripe_nr;
6402         stripe_offset = geom.stripe_offset;
6403         raid56_full_stripe_start = geom.raid56_stripe_offset;
6404         data_stripes = nr_data_stripes(map);
6405
6406         down_read(&dev_replace->rwsem);
6407         dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
6408         /*
6409          * Hold the semaphore for read during the whole operation, write is
6410          * requested at commit time but must wait.
6411          */
6412         if (!dev_replace_is_ongoing)
6413                 up_read(&dev_replace->rwsem);
6414
6415         if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
6416             !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
6417                 ret = get_extra_mirror_from_replace(fs_info, logical, *length,
6418                                                     dev_replace->srcdev->devid,
6419                                                     &mirror_num,
6420                                             &physical_to_patch_in_first_stripe);
6421                 if (ret)
6422                         goto out;
6423                 else
6424                         patch_the_first_stripe_for_dev_replace = 1;
6425         } else if (mirror_num > map->num_stripes) {
6426                 mirror_num = 0;
6427         }
6428
6429         num_stripes = 1;
6430         stripe_index = 0;
6431         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
6432                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
6433                                 &stripe_index);
6434                 if (!need_full_stripe(op))
6435                         mirror_num = 1;
6436         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
6437                 if (need_full_stripe(op))
6438                         num_stripes = map->num_stripes;
6439                 else if (mirror_num)
6440                         stripe_index = mirror_num - 1;
6441                 else {
6442                         stripe_index = find_live_mirror(fs_info, map, 0,
6443                                             dev_replace_is_ongoing);
6444                         mirror_num = stripe_index + 1;
6445                 }
6446
6447         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
6448                 if (need_full_stripe(op)) {
6449                         num_stripes = map->num_stripes;
6450                 } else if (mirror_num) {
6451                         stripe_index = mirror_num - 1;
6452                 } else {
6453                         mirror_num = 1;
6454                 }
6455
6456         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
6457                 u32 factor = map->num_stripes / map->sub_stripes;
6458
6459                 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
6460                 stripe_index *= map->sub_stripes;
6461
6462                 if (need_full_stripe(op))
6463                         num_stripes = map->sub_stripes;
6464                 else if (mirror_num)
6465                         stripe_index += mirror_num - 1;
6466                 else {
6467                         int old_stripe_index = stripe_index;
6468                         stripe_index = find_live_mirror(fs_info, map,
6469                                               stripe_index,
6470                                               dev_replace_is_ongoing);
6471                         mirror_num = stripe_index - old_stripe_index + 1;
6472                 }
6473
6474         } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
6475                 if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
6476                         /* push stripe_nr back to the start of the full stripe */
6477                         stripe_nr = div64_u64(raid56_full_stripe_start,
6478                                         stripe_len * data_stripes);
6479
6480                         /* RAID[56] write or recovery. Return all stripes */
6481                         num_stripes = map->num_stripes;
6482                         max_errors = nr_parity_stripes(map);
6483
6484                         *length = map->stripe_len;
6485                         stripe_index = 0;
6486                         stripe_offset = 0;
6487                 } else {
6488                         /*
6489                          * Mirror #0 or #1 means the original data block.
6490                          * Mirror #2 is RAID5 parity block.
6491                          * Mirror #3 is RAID6 Q block.
6492                          */
6493                         stripe_nr = div_u64_rem(stripe_nr,
6494                                         data_stripes, &stripe_index);
6495                         if (mirror_num > 1)
6496                                 stripe_index = data_stripes + mirror_num - 2;
6497
6498                         /* We distribute the parity blocks across stripes */
6499                         div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
6500                                         &stripe_index);
6501                         if (!need_full_stripe(op) && mirror_num <= 1)
6502                                 mirror_num = 1;
6503                 }
6504         } else {
6505                 /*
6506                  * after this, stripe_nr is the number of stripes on this
6507                  * device we have to walk to find the data, and stripe_index is
6508                  * the number of our device in the stripe array
6509                  */
6510                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
6511                                 &stripe_index);
6512                 mirror_num = stripe_index + 1;
6513         }
6514         if (stripe_index >= map->num_stripes) {
6515                 btrfs_crit(fs_info,
6516                            "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
6517                            stripe_index, map->num_stripes);
6518                 ret = -EINVAL;
6519                 goto out;
6520         }
6521
6522         num_alloc_stripes = num_stripes;
6523         if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
6524                 if (op == BTRFS_MAP_WRITE)
6525                         num_alloc_stripes <<= 1;
6526                 if (op == BTRFS_MAP_GET_READ_MIRRORS)
6527                         num_alloc_stripes++;
6528                 tgtdev_indexes = num_stripes;
6529         }
6530
6531         bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes);
6532         if (!bioc) {
6533                 ret = -ENOMEM;
6534                 goto out;
6535         }
6536
6537         for (i = 0; i < num_stripes; i++) {
6538                 bioc->stripes[i].physical = map->stripes[stripe_index].physical +
6539                         stripe_offset + stripe_nr * map->stripe_len;
6540                 bioc->stripes[i].dev = map->stripes[stripe_index].dev;
6541                 stripe_index++;
6542         }
6543
6544         /* Build raid_map */
6545         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
6546             (need_full_stripe(op) || mirror_num > 1)) {
6547                 u64 tmp;
6548                 unsigned rot;
6549
6550                 /* Work out the disk rotation on this stripe-set */
6551                 div_u64_rem(stripe_nr, num_stripes, &rot);
6552
6553                 /* Fill in the logical address of each stripe */
6554                 tmp = stripe_nr * data_stripes;
6555                 for (i = 0; i < data_stripes; i++)
6556                         bioc->raid_map[(i + rot) % num_stripes] =
6557                                 em->start + (tmp + i) * map->stripe_len;
6558
6559                 bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE;
6560                 if (map->type & BTRFS_BLOCK_GROUP_RAID6)
6561                         bioc->raid_map[(i + rot + 1) % num_stripes] =
6562                                 RAID6_Q_STRIPE;
6563
6564                 sort_parity_stripes(bioc, num_stripes);
6565         }
6566
6567         if (need_full_stripe(op))
6568                 max_errors = btrfs_chunk_max_errors(map);
6569
6570         if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
6571             need_full_stripe(op)) {
6572                 handle_ops_on_dev_replace(op, &bioc, dev_replace, logical,
6573                                           &num_stripes, &max_errors);
6574         }
6575
6576         *bioc_ret = bioc;
6577         bioc->map_type = map->type;
6578         bioc->num_stripes = num_stripes;
6579         bioc->max_errors = max_errors;
6580         bioc->mirror_num = mirror_num;
6581
6582         /*
6583          * this is the case that REQ_READ && dev_replace_is_ongoing &&
6584          * mirror_num == num_stripes + 1 && dev_replace target drive is
6585          * available as a mirror
6586          */
6587         if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
6588                 WARN_ON(num_stripes > 1);
6589                 bioc->stripes[0].dev = dev_replace->tgtdev;
6590                 bioc->stripes[0].physical = physical_to_patch_in_first_stripe;
6591                 bioc->mirror_num = map->num_stripes + 1;
6592         }
6593 out:
6594         if (dev_replace_is_ongoing) {
6595                 lockdep_assert_held(&dev_replace->rwsem);
6596                 /* Unlock and let waiting writers proceed */
6597                 up_read(&dev_replace->rwsem);
6598         }
6599         free_extent_map(em);
6600         return ret;
6601 }
6602
6603 int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6604                       u64 logical, u64 *length,
6605                       struct btrfs_io_context **bioc_ret, int mirror_num)
6606 {
6607         if (op == BTRFS_MAP_DISCARD)
6608                 return __btrfs_map_block_for_discard(fs_info, logical,
6609                                                      length, bioc_ret);
6610
6611         return __btrfs_map_block(fs_info, op, logical, length, bioc_ret,
6612                                  mirror_num, 0);
6613 }
6614
6615 /* For Scrub/replace */
6616 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6617                      u64 logical, u64 *length,
6618                      struct btrfs_io_context **bioc_ret)
6619 {
6620         return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1);
6621 }
6622
6623 static inline void btrfs_end_bioc(struct btrfs_io_context *bioc, struct bio *bio)
6624 {
6625         bio->bi_private = bioc->private;
6626         bio->bi_end_io = bioc->end_io;
6627         bio_endio(bio);
6628
6629         btrfs_put_bioc(bioc);
6630 }
6631
6632 static void btrfs_end_bio(struct bio *bio)
6633 {
6634         struct btrfs_io_context *bioc = bio->bi_private;
6635         int is_orig_bio = 0;
6636
6637         if (bio->bi_status) {
6638                 atomic_inc(&bioc->error);
6639                 if (bio->bi_status == BLK_STS_IOERR ||
6640                     bio->bi_status == BLK_STS_TARGET) {
6641                         struct btrfs_device *dev = btrfs_bio(bio)->device;
6642
6643                         ASSERT(dev->bdev);
6644                         if (btrfs_op(bio) == BTRFS_MAP_WRITE)
6645                                 btrfs_dev_stat_inc_and_print(dev,
6646                                                 BTRFS_DEV_STAT_WRITE_ERRS);
6647                         else if (!(bio->bi_opf & REQ_RAHEAD))
6648                                 btrfs_dev_stat_inc_and_print(dev,
6649                                                 BTRFS_DEV_STAT_READ_ERRS);
6650                         if (bio->bi_opf & REQ_PREFLUSH)
6651                                 btrfs_dev_stat_inc_and_print(dev,
6652                                                 BTRFS_DEV_STAT_FLUSH_ERRS);
6653                 }
6654         }
6655
6656         if (bio == bioc->orig_bio)
6657                 is_orig_bio = 1;
6658
6659         btrfs_bio_counter_dec(bioc->fs_info);
6660
6661         if (atomic_dec_and_test(&bioc->stripes_pending)) {
6662                 if (!is_orig_bio) {
6663                         bio_put(bio);
6664                         bio = bioc->orig_bio;
6665                 }
6666
6667                 btrfs_bio(bio)->mirror_num = bioc->mirror_num;
6668                 /* only send an error to the higher layers if it is
6669                  * beyond the tolerance of the btrfs bio
6670                  */
6671                 if (atomic_read(&bioc->error) > bioc->max_errors) {
6672                         bio->bi_status = BLK_STS_IOERR;
6673                 } else {
6674                         /*
6675                          * this bio is actually up to date, we didn't
6676                          * go over the max number of errors
6677                          */
6678                         bio->bi_status = BLK_STS_OK;
6679                 }
6680
6681                 btrfs_end_bioc(bioc, bio);
6682         } else if (!is_orig_bio) {
6683                 bio_put(bio);
6684         }
6685 }
6686
6687 static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
6688                               u64 physical, struct btrfs_device *dev)
6689 {
6690         struct btrfs_fs_info *fs_info = bioc->fs_info;
6691
6692         bio->bi_private = bioc;
6693         btrfs_bio(bio)->device = dev;
6694         bio->bi_end_io = btrfs_end_bio;
6695         bio->bi_iter.bi_sector = physical >> 9;
6696         /*
6697          * For zone append writing, bi_sector must point the beginning of the
6698          * zone
6699          */
6700         if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
6701                 if (btrfs_dev_is_sequential(dev, physical)) {
6702                         u64 zone_start = round_down(physical, fs_info->zone_size);
6703
6704                         bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
6705                 } else {
6706                         bio->bi_opf &= ~REQ_OP_ZONE_APPEND;
6707                         bio->bi_opf |= REQ_OP_WRITE;
6708                 }
6709         }
6710         btrfs_debug_in_rcu(fs_info,
6711         "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
6712                 bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
6713                 (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
6714                 dev->devid, bio->bi_iter.bi_size);
6715
6716         btrfs_bio_counter_inc_noblocked(fs_info);
6717
6718         btrfsic_check_bio(bio);
6719         submit_bio(bio);
6720 }
6721
6722 static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical)
6723 {
6724         atomic_inc(&bioc->error);
6725         if (atomic_dec_and_test(&bioc->stripes_pending)) {
6726                 /* Should be the original bio. */
6727                 WARN_ON(bio != bioc->orig_bio);
6728
6729                 btrfs_bio(bio)->mirror_num = bioc->mirror_num;
6730                 bio->bi_iter.bi_sector = logical >> 9;
6731                 if (atomic_read(&bioc->error) > bioc->max_errors)
6732                         bio->bi_status = BLK_STS_IOERR;
6733                 else
6734                         bio->bi_status = BLK_STS_OK;
6735                 btrfs_end_bioc(bioc, bio);
6736         }
6737 }
6738
6739 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
6740                            int mirror_num)
6741 {
6742         struct btrfs_device *dev;
6743         struct bio *first_bio = bio;
6744         u64 logical = bio->bi_iter.bi_sector << 9;
6745         u64 length = 0;
6746         u64 map_length;
6747         int ret;
6748         int dev_nr;
6749         int total_devs;
6750         struct btrfs_io_context *bioc = NULL;
6751
6752         length = bio->bi_iter.bi_size;
6753         map_length = length;
6754
6755         btrfs_bio_counter_inc_blocked(fs_info);
6756         ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
6757                                 &map_length, &bioc, mirror_num, 1);
6758         if (ret) {
6759                 btrfs_bio_counter_dec(fs_info);
6760                 return errno_to_blk_status(ret);
6761         }
6762
6763         total_devs = bioc->num_stripes;
6764         bioc->orig_bio = first_bio;
6765         bioc->private = first_bio->bi_private;
6766         bioc->end_io = first_bio->bi_end_io;
6767         atomic_set(&bioc->stripes_pending, bioc->num_stripes);
6768
6769         if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
6770             ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) {
6771                 /* In this case, map_length has been set to the length of
6772                    a single stripe; not the whole write */
6773                 if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
6774                         ret = raid56_parity_write(bio, bioc, map_length);
6775                 } else {
6776                         ret = raid56_parity_recover(bio, bioc, map_length,
6777                                                     mirror_num, 1);
6778                 }
6779
6780                 btrfs_bio_counter_dec(fs_info);
6781                 return errno_to_blk_status(ret);
6782         }
6783
6784         if (map_length < length) {
6785                 btrfs_crit(fs_info,
6786                            "mapping failed logical %llu bio len %llu len %llu",
6787                            logical, length, map_length);
6788                 BUG();
6789         }
6790
6791         for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
6792                 dev = bioc->stripes[dev_nr].dev;
6793                 if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
6794                                                    &dev->dev_state) ||
6795                     (btrfs_op(first_bio) == BTRFS_MAP_WRITE &&
6796                     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
6797                         bioc_error(bioc, first_bio, logical);
6798                         continue;
6799                 }
6800
6801                 if (dev_nr < total_devs - 1) {
6802                         bio = btrfs_bio_clone(dev->bdev, first_bio);
6803                 } else {
6804                         bio = first_bio;
6805                         bio_set_dev(bio, dev->bdev);
6806                 }
6807
6808                 submit_stripe_bio(bioc, bio, bioc->stripes[dev_nr].physical, dev);
6809         }
6810         btrfs_bio_counter_dec(fs_info);
6811         return BLK_STS_OK;
6812 }
6813
6814 static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
6815                                       const struct btrfs_fs_devices *fs_devices)
6816 {
6817         if (args->fsid == NULL)
6818                 return true;
6819         if (memcmp(fs_devices->metadata_uuid, args->fsid, BTRFS_FSID_SIZE) == 0)
6820                 return true;
6821         return false;
6822 }
6823
6824 static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args,
6825                                   const struct btrfs_device *device)
6826 {
6827         ASSERT((args->devid != (u64)-1) || args->missing);
6828
6829         if ((args->devid != (u64)-1) && device->devid != args->devid)
6830                 return false;
6831         if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0)
6832                 return false;
6833         if (!args->missing)
6834                 return true;
6835         if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) &&
6836             !device->bdev)
6837                 return true;
6838         return false;
6839 }
6840
6841 /*
6842  * Find a device specified by @devid or @uuid in the list of @fs_devices, or
6843  * return NULL.
6844  *
6845  * If devid and uuid are both specified, the match must be exact, otherwise
6846  * only devid is used.
6847  */
6848 struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices,
6849                                        const struct btrfs_dev_lookup_args *args)
6850 {
6851         struct btrfs_device *device;
6852         struct btrfs_fs_devices *seed_devs;
6853
6854         if (dev_args_match_fs_devices(args, fs_devices)) {
6855                 list_for_each_entry(device, &fs_devices->devices, dev_list) {
6856                         if (dev_args_match_device(args, device))
6857                                 return device;
6858                 }
6859         }
6860
6861         list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
6862                 if (!dev_args_match_fs_devices(args, seed_devs))
6863                         continue;
6864                 list_for_each_entry(device, &seed_devs->devices, dev_list) {
6865                         if (dev_args_match_device(args, device))
6866                                 return device;
6867                 }
6868         }
6869
6870         return NULL;
6871 }
6872
6873 static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
6874                                             u64 devid, u8 *dev_uuid)
6875 {
6876         struct btrfs_device *device;
6877         unsigned int nofs_flag;
6878
6879         /*
6880          * We call this under the chunk_mutex, so we want to use NOFS for this
6881          * allocation, however we don't want to change btrfs_alloc_device() to
6882          * always do NOFS because we use it in a lot of other GFP_KERNEL safe
6883          * places.
6884          */
6885         nofs_flag = memalloc_nofs_save();
6886         device = btrfs_alloc_device(NULL, &devid, dev_uuid);
6887         memalloc_nofs_restore(nofs_flag);
6888         if (IS_ERR(device))
6889                 return device;
6890
6891         list_add(&device->dev_list, &fs_devices->devices);
6892         device->fs_devices = fs_devices;
6893         fs_devices->num_devices++;
6894
6895         set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
6896         fs_devices->missing_devices++;
6897
6898         return device;
6899 }
6900
6901 /**
6902  * btrfs_alloc_device - allocate struct btrfs_device
6903  * @fs_info:    used only for generating a new devid, can be NULL if
6904  *              devid is provided (i.e. @devid != NULL).
6905  * @devid:      a pointer to devid for this device.  If NULL a new devid
6906  *              is generated.
6907  * @uuid:       a pointer to UUID for this device.  If NULL a new UUID
6908  *              is generated.
6909  *
6910  * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
6911  * on error.  Returned struct is not linked onto any lists and must be
6912  * destroyed with btrfs_free_device.
6913  */
6914 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
6915                                         const u64 *devid,
6916                                         const u8 *uuid)
6917 {
6918         struct btrfs_device *dev;
6919         u64 tmp;
6920
6921         if (WARN_ON(!devid && !fs_info))
6922                 return ERR_PTR(-EINVAL);
6923
6924         dev = kzalloc(sizeof(*dev), GFP_KERNEL);
6925         if (!dev)
6926                 return ERR_PTR(-ENOMEM);
6927
6928         INIT_LIST_HEAD(&dev->dev_list);
6929         INIT_LIST_HEAD(&dev->dev_alloc_list);
6930         INIT_LIST_HEAD(&dev->post_commit_list);
6931
6932         atomic_set(&dev->dev_stats_ccnt, 0);
6933         btrfs_device_data_ordered_init(dev);
6934         extent_io_tree_init(fs_info, &dev->alloc_state,
6935                             IO_TREE_DEVICE_ALLOC_STATE, NULL);
6936
6937         if (devid)
6938                 tmp = *devid;
6939         else {
6940                 int ret;
6941
6942                 ret = find_next_devid(fs_info, &tmp);
6943                 if (ret) {
6944                         btrfs_free_device(dev);
6945                         return ERR_PTR(ret);
6946                 }
6947         }
6948         dev->devid = tmp;
6949
6950         if (uuid)
6951                 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
6952         else
6953                 generate_random_uuid(dev->uuid);
6954
6955         return dev;
6956 }
6957
6958 static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
6959                                         u64 devid, u8 *uuid, bool error)
6960 {
6961         if (error)
6962                 btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
6963                               devid, uuid);
6964         else
6965                 btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
6966                               devid, uuid);
6967 }
6968
6969 static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
6970 {
6971         const int data_stripes = calc_data_stripes(type, num_stripes);
6972
6973         return div_u64(chunk_len, data_stripes);
6974 }
6975
6976 #if BITS_PER_LONG == 32
6977 /*
6978  * Due to page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE
6979  * can't be accessed on 32bit systems.
6980  *
6981  * This function do mount time check to reject the fs if it already has
6982  * metadata chunk beyond that limit.
6983  */
6984 static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
6985                                   u64 logical, u64 length, u64 type)
6986 {
6987         if (!(type & BTRFS_BLOCK_GROUP_METADATA))
6988                 return 0;
6989
6990         if (logical + length < MAX_LFS_FILESIZE)
6991                 return 0;
6992
6993         btrfs_err_32bit_limit(fs_info);
6994         return -EOVERFLOW;
6995 }
6996
6997 /*
6998  * This is to give early warning for any metadata chunk reaching
6999  * BTRFS_32BIT_EARLY_WARN_THRESHOLD.
7000  * Although we can still access the metadata, it's not going to be possible
7001  * once the limit is reached.
7002  */
7003 static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
7004                                   u64 logical, u64 length, u64 type)
7005 {
7006         if (!(type & BTRFS_BLOCK_GROUP_METADATA))
7007                 return;
7008
7009         if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD)
7010                 return;
7011
7012         btrfs_warn_32bit_limit(fs_info);
7013 }
7014 #endif
7015
7016 static struct btrfs_device *handle_missing_device(struct btrfs_fs_info *fs_info,
7017                                                   u64 devid, u8 *uuid)
7018 {
7019         struct btrfs_device *dev;
7020
7021         if (!btrfs_test_opt(fs_info, DEGRADED)) {
7022                 btrfs_report_missing_device(fs_info, devid, uuid, true);
7023                 return ERR_PTR(-ENOENT);
7024         }
7025
7026         dev = add_missing_dev(fs_info->fs_devices, devid, uuid);
7027         if (IS_ERR(dev)) {
7028                 btrfs_err(fs_info, "failed to init missing device %llu: %ld",
7029                           devid, PTR_ERR(dev));
7030                 return dev;
7031         }
7032         btrfs_report_missing_device(fs_info, devid, uuid, false);
7033
7034         return dev;
7035 }
7036
7037 static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
7038                           struct btrfs_chunk *chunk)
7039 {
7040         BTRFS_DEV_LOOKUP_ARGS(args);
7041         struct btrfs_fs_info *fs_info = leaf->fs_info;
7042         struct extent_map_tree *map_tree = &fs_info->mapping_tree;
7043         struct map_lookup *map;
7044         struct extent_map *em;
7045         u64 logical;
7046         u64 length;
7047         u64 devid;
7048         u64 type;
7049         u8 uuid[BTRFS_UUID_SIZE];
7050         int num_stripes;
7051         int ret;
7052         int i;
7053
7054         logical = key->offset;
7055         length = btrfs_chunk_length(leaf, chunk);
7056         type = btrfs_chunk_type(leaf, chunk);
7057         num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
7058
7059 #if BITS_PER_LONG == 32
7060         ret = check_32bit_meta_chunk(fs_info, logical, length, type);
7061         if (ret < 0)
7062                 return ret;
7063         warn_32bit_meta_chunk(fs_info, logical, length, type);
7064 #endif
7065
7066         /*
7067          * Only need to verify chunk item if we're reading from sys chunk array,
7068          * as chunk item in tree block is already verified by tree-checker.
7069          */
7070         if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
7071                 ret = btrfs_check_chunk_valid(leaf, chunk, logical);
7072                 if (ret)
7073                         return ret;
7074         }
7075
7076         read_lock(&map_tree->lock);
7077         em = lookup_extent_mapping(map_tree, logical, 1);
7078         read_unlock(&map_tree->lock);
7079
7080         /* already mapped? */
7081         if (em && em->start <= logical && em->start + em->len > logical) {
7082                 free_extent_map(em);
7083                 return 0;
7084         } else if (em) {
7085                 free_extent_map(em);
7086         }
7087
7088         em = alloc_extent_map();
7089         if (!em)
7090                 return -ENOMEM;
7091         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
7092         if (!map) {
7093                 free_extent_map(em);
7094                 return -ENOMEM;
7095         }
7096
7097         set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
7098         em->map_lookup = map;
7099         em->start = logical;
7100         em->len = length;
7101         em->orig_start = 0;
7102         em->block_start = 0;
7103         em->block_len = em->len;
7104
7105         map->num_stripes = num_stripes;
7106         map->io_width = btrfs_chunk_io_width(leaf, chunk);
7107         map->io_align = btrfs_chunk_io_align(leaf, chunk);
7108         map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
7109         map->type = type;
7110         map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
7111         map->verified_stripes = 0;
7112         em->orig_block_len = calc_stripe_length(type, em->len,
7113                                                 map->num_stripes);
7114         for (i = 0; i < num_stripes; i++) {
7115                 map->stripes[i].physical =
7116                         btrfs_stripe_offset_nr(leaf, chunk, i);
7117                 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
7118                 args.devid = devid;
7119                 read_extent_buffer(leaf, uuid, (unsigned long)
7120                                    btrfs_stripe_dev_uuid_nr(chunk, i),
7121                                    BTRFS_UUID_SIZE);
7122                 args.uuid = uuid;
7123                 map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args);
7124                 if (!map->stripes[i].dev) {
7125                         map->stripes[i].dev = handle_missing_device(fs_info,
7126                                                                     devid, uuid);
7127                         if (IS_ERR(map->stripes[i].dev)) {
7128                                 free_extent_map(em);
7129                                 return PTR_ERR(map->stripes[i].dev);
7130                         }
7131                 }
7132
7133                 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
7134                                 &(map->stripes[i].dev->dev_state));
7135         }
7136
7137         write_lock(&map_tree->lock);
7138         ret = add_extent_mapping(map_tree, em, 0);
7139         write_unlock(&map_tree->lock);
7140         if (ret < 0) {
7141                 btrfs_err(fs_info,
7142                           "failed to add chunk map, start=%llu len=%llu: %d",
7143                           em->start, em->len, ret);
7144         }
7145         free_extent_map(em);
7146
7147         return ret;
7148 }
7149
7150 static void fill_device_from_item(struct extent_buffer *leaf,
7151                                  struct btrfs_dev_item *dev_item,
7152                                  struct btrfs_device *device)
7153 {
7154         unsigned long ptr;
7155
7156         device->devid = btrfs_device_id(leaf, dev_item);
7157         device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
7158         device->total_bytes = device->disk_total_bytes;
7159         device->commit_total_bytes = device->disk_total_bytes;
7160         device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
7161         device->commit_bytes_used = device->bytes_used;
7162         device->type = btrfs_device_type(leaf, dev_item);
7163         device->io_align = btrfs_device_io_align(leaf, dev_item);
7164         device->io_width = btrfs_device_io_width(leaf, dev_item);
7165         device->sector_size = btrfs_device_sector_size(leaf, dev_item);
7166         WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
7167         clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
7168
7169         ptr = btrfs_device_uuid(dev_item);
7170         read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
7171 }
7172
7173 static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
7174                                                   u8 *fsid)
7175 {
7176         struct btrfs_fs_devices *fs_devices;
7177         int ret;
7178
7179         lockdep_assert_held(&uuid_mutex);
7180         ASSERT(fsid);
7181
7182         /* This will match only for multi-device seed fs */
7183         list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list)
7184                 if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
7185                         return fs_devices;
7186
7187
7188         fs_devices = find_fsid(fsid, NULL);
7189         if (!fs_devices) {
7190                 if (!btrfs_test_opt(fs_info, DEGRADED))
7191                         return ERR_PTR(-ENOENT);
7192
7193                 fs_devices = alloc_fs_devices(fsid, NULL);
7194                 if (IS_ERR(fs_devices))
7195                         return fs_devices;
7196
7197                 fs_devices->seeding = true;
7198                 fs_devices->opened = 1;
7199                 return fs_devices;
7200         }
7201
7202         /*
7203          * Upon first call for a seed fs fsid, just create a private copy of the
7204          * respective fs_devices and anchor it at fs_info->fs_devices->seed_list
7205          */
7206         fs_devices = clone_fs_devices(fs_devices);
7207         if (IS_ERR(fs_devices))
7208                 return fs_devices;
7209
7210         ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
7211         if (ret) {
7212                 free_fs_devices(fs_devices);
7213                 return ERR_PTR(ret);
7214         }
7215
7216         if (!fs_devices->seeding) {
7217                 close_fs_devices(fs_devices);
7218                 free_fs_devices(fs_devices);
7219                 return ERR_PTR(-EINVAL);
7220         }
7221
7222         list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);
7223
7224         return fs_devices;
7225 }
7226
7227 static int read_one_dev(struct extent_buffer *leaf,
7228                         struct btrfs_dev_item *dev_item)
7229 {
7230         BTRFS_DEV_LOOKUP_ARGS(args);
7231         struct btrfs_fs_info *fs_info = leaf->fs_info;
7232         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7233         struct btrfs_device *device;
7234         u64 devid;
7235         int ret;
7236         u8 fs_uuid[BTRFS_FSID_SIZE];
7237         u8 dev_uuid[BTRFS_UUID_SIZE];
7238
7239         devid = args.devid = btrfs_device_id(leaf, dev_item);
7240         read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
7241                            BTRFS_UUID_SIZE);
7242         read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
7243                            BTRFS_FSID_SIZE);
7244         args.uuid = dev_uuid;
7245         args.fsid = fs_uuid;
7246
7247         if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
7248                 fs_devices = open_seed_devices(fs_info, fs_uuid);
7249                 if (IS_ERR(fs_devices))
7250                         return PTR_ERR(fs_devices);
7251         }
7252
7253         device = btrfs_find_device(fs_info->fs_devices, &args);
7254         if (!device) {
7255                 if (!btrfs_test_opt(fs_info, DEGRADED)) {
7256                         btrfs_report_missing_device(fs_info, devid,
7257                                                         dev_uuid, true);
7258                         return -ENOENT;
7259                 }
7260
7261                 device = add_missing_dev(fs_devices, devid, dev_uuid);
7262                 if (IS_ERR(device)) {
7263                         btrfs_err(fs_info,
7264                                 "failed to add missing dev %llu: %ld",
7265                                 devid, PTR_ERR(device));
7266                         return PTR_ERR(device);
7267                 }
7268                 btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
7269         } else {
7270                 if (!device->bdev) {
7271                         if (!btrfs_test_opt(fs_info, DEGRADED)) {
7272                                 btrfs_report_missing_device(fs_info,
7273                                                 devid, dev_uuid, true);
7274                                 return -ENOENT;
7275                         }
7276                         btrfs_report_missing_device(fs_info, devid,
7277                                                         dev_uuid, false);
7278                 }
7279
7280                 if (!device->bdev &&
7281                     !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
7282                         /*
7283                          * this happens when a device that was properly setup
7284                          * in the device info lists suddenly goes bad.
7285                          * device->bdev is NULL, and so we have to set
7286                          * device->missing to one here
7287                          */
7288                         device->fs_devices->missing_devices++;
7289                         set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
7290                 }
7291
7292                 /* Move the device to its own fs_devices */
7293                 if (device->fs_devices != fs_devices) {
7294                         ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
7295                                                         &device->dev_state));
7296
7297                         list_move(&device->dev_list, &fs_devices->devices);
7298                         device->fs_devices->num_devices--;
7299                         fs_devices->num_devices++;
7300
7301                         device->fs_devices->missing_devices--;
7302                         fs_devices->missing_devices++;
7303
7304                         device->fs_devices = fs_devices;
7305                 }
7306         }
7307
7308         if (device->fs_devices != fs_info->fs_devices) {
7309                 BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
7310                 if (device->generation !=
7311                     btrfs_device_generation(leaf, dev_item))
7312                         return -EINVAL;
7313         }
7314
7315         fill_device_from_item(leaf, dev_item, device);
7316         if (device->bdev) {
7317                 u64 max_total_bytes = bdev_nr_bytes(device->bdev);
7318
7319                 if (device->total_bytes > max_total_bytes) {
7320                         btrfs_err(fs_info,
7321                         "device total_bytes should be at most %llu but found %llu",
7322                                   max_total_bytes, device->total_bytes);
7323                         return -EINVAL;
7324                 }
7325         }
7326         set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
7327         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
7328            !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
7329                 device->fs_devices->total_rw_bytes += device->total_bytes;
7330                 atomic64_add(device->total_bytes - device->bytes_used,
7331                                 &fs_info->free_chunk_space);
7332         }
7333         ret = 0;
7334         return ret;
7335 }
7336
7337 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
7338 {
7339         struct btrfs_super_block *super_copy = fs_info->super_copy;
7340         struct extent_buffer *sb;
7341         struct btrfs_disk_key *disk_key;
7342         struct btrfs_chunk *chunk;
7343         u8 *array_ptr;
7344         unsigned long sb_array_offset;
7345         int ret = 0;
7346         u32 num_stripes;
7347         u32 array_size;
7348         u32 len = 0;
7349         u32 cur_offset;
7350         u64 type;
7351         struct btrfs_key key;
7352
7353         ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
7354
7355         /*
7356          * We allocated a dummy extent, just to use extent buffer accessors.
7357          * There will be unused space after BTRFS_SUPER_INFO_SIZE, but
7358          * that's fine, we will not go beyond system chunk array anyway.
7359          */
7360         sb = alloc_dummy_extent_buffer(fs_info, BTRFS_SUPER_INFO_OFFSET);
7361         if (!sb)
7362                 return -ENOMEM;
7363         set_extent_buffer_uptodate(sb);
7364
7365         write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
7366         array_size = btrfs_super_sys_array_size(super_copy);
7367
7368         array_ptr = super_copy->sys_chunk_array;
7369         sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
7370         cur_offset = 0;
7371
7372         while (cur_offset < array_size) {
7373                 disk_key = (struct btrfs_disk_key *)array_ptr;
7374                 len = sizeof(*disk_key);
7375                 if (cur_offset + len > array_size)
7376                         goto out_short_read;
7377
7378                 btrfs_disk_key_to_cpu(&key, disk_key);
7379
7380                 array_ptr += len;
7381                 sb_array_offset += len;
7382                 cur_offset += len;
7383
7384                 if (key.type != BTRFS_CHUNK_ITEM_KEY) {
7385                         btrfs_err(fs_info,
7386                             "unexpected item type %u in sys_array at offset %u",
7387                                   (u32)key.type, cur_offset);
7388                         ret = -EIO;
7389                         break;
7390                 }
7391
7392                 chunk = (struct btrfs_chunk *)sb_array_offset;
7393                 /*
7394                  * At least one btrfs_chunk with one stripe must be present,
7395                  * exact stripe count check comes afterwards
7396                  */
7397                 len = btrfs_chunk_item_size(1);
7398                 if (cur_offset + len > array_size)
7399                         goto out_short_read;
7400
7401                 num_stripes = btrfs_chunk_num_stripes(sb, chunk);
7402                 if (!num_stripes) {
7403                         btrfs_err(fs_info,
7404                         "invalid number of stripes %u in sys_array at offset %u",
7405                                   num_stripes, cur_offset);
7406                         ret = -EIO;
7407                         break;
7408                 }
7409
7410                 type = btrfs_chunk_type(sb, chunk);
7411                 if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
7412                         btrfs_err(fs_info,
7413                         "invalid chunk type %llu in sys_array at offset %u",
7414                                   type, cur_offset);
7415                         ret = -EIO;
7416                         break;
7417                 }
7418
7419                 len = btrfs_chunk_item_size(num_stripes);
7420                 if (cur_offset + len > array_size)
7421                         goto out_short_read;
7422
7423                 ret = read_one_chunk(&key, sb, chunk);
7424                 if (ret)
7425                         break;
7426
7427                 array_ptr += len;
7428                 sb_array_offset += len;
7429                 cur_offset += len;
7430         }
7431         clear_extent_buffer_uptodate(sb);
7432         free_extent_buffer_stale(sb);
7433         return ret;
7434
7435 out_short_read:
7436         btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
7437                         len, cur_offset);
7438         clear_extent_buffer_uptodate(sb);
7439         free_extent_buffer_stale(sb);
7440         return -EIO;
7441 }
7442
7443 /*
7444  * Check if all chunks in the fs are OK for read-write degraded mount
7445  *
7446  * If the @failing_dev is specified, it's accounted as missing.
7447  *
7448  * Return true if all chunks meet the minimal RW mount requirements.
7449  * Return false if any chunk doesn't meet the minimal RW mount requirements.
7450  */
7451 bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
7452                                         struct btrfs_device *failing_dev)
7453 {
7454         struct extent_map_tree *map_tree = &fs_info->mapping_tree;
7455         struct extent_map *em;
7456         u64 next_start = 0;
7457         bool ret = true;
7458
7459         read_lock(&map_tree->lock);
7460         em = lookup_extent_mapping(map_tree, 0, (u64)-1);
7461         read_unlock(&map_tree->lock);
7462         /* No chunk at all? Return false anyway */
7463         if (!em) {
7464                 ret = false;
7465                 goto out;
7466         }
7467         while (em) {
7468                 struct map_lookup *map;
7469                 int missing = 0;
7470                 int max_tolerated;
7471                 int i;
7472
7473                 map = em->map_lookup;
7474                 max_tolerated =
7475                         btrfs_get_num_tolerated_disk_barrier_failures(
7476                                         map->type);
7477                 for (i = 0; i < map->num_stripes; i++) {
7478                         struct btrfs_device *dev = map->stripes[i].dev;
7479
7480                         if (!dev || !dev->bdev ||
7481                             test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
7482                             dev->last_flush_error)
7483                                 missing++;
7484                         else if (failing_dev && failing_dev == dev)
7485                                 missing++;
7486                 }
7487                 if (missing > max_tolerated) {
7488                         if (!failing_dev)
7489                                 btrfs_warn(fs_info,
7490         "chunk %llu missing %d devices, max tolerance is %d for writable mount",
7491                                    em->start, missing, max_tolerated);
7492                         free_extent_map(em);
7493                         ret = false;
7494                         goto out;
7495                 }
7496                 next_start = extent_map_end(em);
7497                 free_extent_map(em);
7498
7499                 read_lock(&map_tree->lock);
7500                 em = lookup_extent_mapping(map_tree, next_start,
7501                                            (u64)(-1) - next_start);
7502                 read_unlock(&map_tree->lock);
7503         }
7504 out:
7505         return ret;
7506 }
7507
/* Issue btrfs_readahead_node_child() for every item slot of @node. */
static void readahead_tree_node_children(struct extent_buffer *node)
{
	const int nr_items = btrfs_header_nritems(node);
	int slot = 0;

	while (slot < nr_items) {
		btrfs_readahead_node_child(node, slot);
		slot++;
	}
}
7516
7517 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
7518 {
7519         struct btrfs_root *root = fs_info->chunk_root;
7520         struct btrfs_path *path;
7521         struct extent_buffer *leaf;
7522         struct btrfs_key key;
7523         struct btrfs_key found_key;
7524         int ret;
7525         int slot;
7526         int iter_ret = 0;
7527         u64 total_dev = 0;
7528         u64 last_ra_node = 0;
7529
7530         path = btrfs_alloc_path();
7531         if (!path)
7532                 return -ENOMEM;
7533
7534         /*
7535          * uuid_mutex is needed only if we are mounting a sprout FS
7536          * otherwise we don't need it.
7537          */
7538         mutex_lock(&uuid_mutex);
7539
7540         /*
7541          * It is possible for mount and umount to race in such a way that
7542          * we execute this code path, but open_fs_devices failed to clear
7543          * total_rw_bytes. We certainly want it cleared before reading the
7544          * device items, so clear it here.
7545          */
7546         fs_info->fs_devices->total_rw_bytes = 0;
7547
7548         /*
7549          * Lockdep complains about possible circular locking dependency between
7550          * a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores
7551          * used for freeze procection of a fs (struct super_block.s_writers),
7552          * which we take when starting a transaction, and extent buffers of the
7553          * chunk tree if we call read_one_dev() while holding a lock on an
7554          * extent buffer of the chunk tree. Since we are mounting the filesystem
7555          * and at this point there can't be any concurrent task modifying the
7556          * chunk tree, to keep it simple, just skip locking on the chunk tree.
7557          */
7558         ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
7559         path->skip_locking = 1;
7560
7561         /*
7562          * Read all device items, and then all the chunk items. All
7563          * device items are found before any chunk item (their object id
7564          * is smaller than the lowest possible object id for a chunk
7565          * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
7566          */
7567         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
7568         key.offset = 0;
7569         key.type = 0;
7570         btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
7571                 struct extent_buffer *node = path->nodes[1];
7572
7573                 leaf = path->nodes[0];
7574                 slot = path->slots[0];
7575
7576                 if (node) {
7577                         if (last_ra_node != node->start) {
7578                                 readahead_tree_node_children(node);
7579                                 last_ra_node = node->start;
7580                         }
7581                 }
7582                 if (found_key.type == BTRFS_DEV_ITEM_KEY) {
7583                         struct btrfs_dev_item *dev_item;
7584                         dev_item = btrfs_item_ptr(leaf, slot,
7585                                                   struct btrfs_dev_item);
7586                         ret = read_one_dev(leaf, dev_item);
7587                         if (ret)
7588                                 goto error;
7589                         total_dev++;
7590                 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
7591                         struct btrfs_chunk *chunk;
7592
7593                         /*
7594                          * We are only called at mount time, so no need to take
7595                          * fs_info->chunk_mutex. Plus, to avoid lockdep warnings,
7596                          * we always lock first fs_info->chunk_mutex before
7597                          * acquiring any locks on the chunk tree. This is a
7598                          * requirement for chunk allocation, see the comment on
7599                          * top of btrfs_chunk_alloc() for details.
7600                          */
7601                         chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
7602                         ret = read_one_chunk(&found_key, leaf, chunk);
7603                         if (ret)
7604                                 goto error;
7605                 }
7606         }
7607         /* Catch error found during iteration */
7608         if (iter_ret < 0) {
7609                 ret = iter_ret;
7610                 goto error;
7611         }
7612
7613         /*
7614          * After loading chunk tree, we've got all device information,
7615          * do another round of validation checks.
7616          */
7617         if (total_dev != fs_info->fs_devices->total_devices) {
7618                 btrfs_warn(fs_info,
7619 "super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit",
7620                           btrfs_super_num_devices(fs_info->super_copy),
7621                           total_dev);
7622                 fs_info->fs_devices->total_devices = total_dev;
7623                 btrfs_set_super_num_devices(fs_info->super_copy, total_dev);
7624         }
7625         if (btrfs_super_total_bytes(fs_info->super_copy) <
7626             fs_info->fs_devices->total_rw_bytes) {
7627                 btrfs_err(fs_info,
7628         "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
7629                           btrfs_super_total_bytes(fs_info->super_copy),
7630                           fs_info->fs_devices->total_rw_bytes);
7631                 ret = -EINVAL;
7632                 goto error;
7633         }
7634         ret = 0;
7635 error:
7636         mutex_unlock(&uuid_mutex);
7637
7638         btrfs_free_path(path);
7639         return ret;
7640 }
7641
7642 void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
7643 {
7644         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
7645         struct btrfs_device *device;
7646
7647         fs_devices->fs_info = fs_info;
7648
7649         mutex_lock(&fs_devices->device_list_mutex);
7650         list_for_each_entry(device, &fs_devices->devices, dev_list)
7651                 device->fs_info = fs_info;
7652
7653         list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
7654                 list_for_each_entry(device, &seed_devs->devices, dev_list)
7655                         device->fs_info = fs_info;
7656
7657                 seed_devs->fs_info = fs_info;
7658         }
7659         mutex_unlock(&fs_devices->device_list_mutex);
7660 }
7661
7662 static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
7663                                  const struct btrfs_dev_stats_item *ptr,
7664                                  int index)
7665 {
7666         u64 val;
7667
7668         read_extent_buffer(eb, &val,
7669                            offsetof(struct btrfs_dev_stats_item, values) +
7670                             ((unsigned long)ptr) + (index * sizeof(u64)),
7671                            sizeof(val));
7672         return val;
7673 }
7674
7675 static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
7676                                       struct btrfs_dev_stats_item *ptr,
7677                                       int index, u64 val)
7678 {
7679         write_extent_buffer(eb, &val,
7680                             offsetof(struct btrfs_dev_stats_item, values) +
7681                              ((unsigned long)ptr) + (index * sizeof(u64)),
7682                             sizeof(val));
7683 }
7684
7685 static int btrfs_device_init_dev_stats(struct btrfs_device *device,
7686                                        struct btrfs_path *path)
7687 {
7688         struct btrfs_dev_stats_item *ptr;
7689         struct extent_buffer *eb;
7690         struct btrfs_key key;
7691         int item_size;
7692         int i, ret, slot;
7693
7694         if (!device->fs_info->dev_root)
7695                 return 0;
7696
7697         key.objectid = BTRFS_DEV_STATS_OBJECTID;
7698         key.type = BTRFS_PERSISTENT_ITEM_KEY;
7699         key.offset = device->devid;
7700         ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
7701         if (ret) {
7702                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7703                         btrfs_dev_stat_set(device, i, 0);
7704                 device->dev_stats_valid = 1;
7705                 btrfs_release_path(path);
7706                 return ret < 0 ? ret : 0;
7707         }
7708         slot = path->slots[0];
7709         eb = path->nodes[0];
7710         item_size = btrfs_item_size(eb, slot);
7711
7712         ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);
7713
7714         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7715                 if (item_size >= (1 + i) * sizeof(__le64))
7716                         btrfs_dev_stat_set(device, i,
7717                                            btrfs_dev_stats_value(eb, ptr, i));
7718                 else
7719                         btrfs_dev_stat_set(device, i, 0);
7720         }
7721
7722         device->dev_stats_valid = 1;
7723         btrfs_dev_stat_print_on_load(device);
7724         btrfs_release_path(path);
7725
7726         return 0;
7727 }
7728
7729 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
7730 {
7731         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
7732         struct btrfs_device *device;
7733         struct btrfs_path *path = NULL;
7734         int ret = 0;
7735
7736         path = btrfs_alloc_path();
7737         if (!path)
7738                 return -ENOMEM;
7739
7740         mutex_lock(&fs_devices->device_list_mutex);
7741         list_for_each_entry(device, &fs_devices->devices, dev_list) {
7742                 ret = btrfs_device_init_dev_stats(device, path);
7743                 if (ret)
7744                         goto out;
7745         }
7746         list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
7747                 list_for_each_entry(device, &seed_devs->devices, dev_list) {
7748                         ret = btrfs_device_init_dev_stats(device, path);
7749                         if (ret)
7750                                 goto out;
7751                 }
7752         }
7753 out:
7754         mutex_unlock(&fs_devices->device_list_mutex);
7755
7756         btrfs_free_path(path);
7757         return ret;
7758 }
7759
7760 static int update_dev_stat_item(struct btrfs_trans_handle *trans,
7761                                 struct btrfs_device *device)
7762 {
7763         struct btrfs_fs_info *fs_info = trans->fs_info;
7764         struct btrfs_root *dev_root = fs_info->dev_root;
7765         struct btrfs_path *path;
7766         struct btrfs_key key;
7767         struct extent_buffer *eb;
7768         struct btrfs_dev_stats_item *ptr;
7769         int ret;
7770         int i;
7771
7772         key.objectid = BTRFS_DEV_STATS_OBJECTID;
7773         key.type = BTRFS_PERSISTENT_ITEM_KEY;
7774         key.offset = device->devid;
7775
7776         path = btrfs_alloc_path();
7777         if (!path)
7778                 return -ENOMEM;
7779         ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
7780         if (ret < 0) {
7781                 btrfs_warn_in_rcu(fs_info,
7782                         "error %d while searching for dev_stats item for device %s",
7783                               ret, rcu_str_deref(device->name));
7784                 goto out;
7785         }
7786
7787         if (ret == 0 &&
7788             btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
7789                 /* need to delete old one and insert a new one */
7790                 ret = btrfs_del_item(trans, dev_root, path);
7791                 if (ret != 0) {
7792                         btrfs_warn_in_rcu(fs_info,
7793                                 "delete too small dev_stats item for device %s failed %d",
7794                                       rcu_str_deref(device->name), ret);
7795                         goto out;
7796                 }
7797                 ret = 1;
7798         }
7799
7800         if (ret == 1) {
7801                 /* need to insert a new item */
7802                 btrfs_release_path(path);
7803                 ret = btrfs_insert_empty_item(trans, dev_root, path,
7804                                               &key, sizeof(*ptr));
7805                 if (ret < 0) {
7806                         btrfs_warn_in_rcu(fs_info,
7807                                 "insert dev_stats item for device %s failed %d",
7808                                 rcu_str_deref(device->name), ret);
7809                         goto out;
7810                 }
7811         }
7812
7813         eb = path->nodes[0];
7814         ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
7815         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7816                 btrfs_set_dev_stats_value(eb, ptr, i,
7817                                           btrfs_dev_stat_read(device, i));
7818         btrfs_mark_buffer_dirty(eb);
7819
7820 out:
7821         btrfs_free_path(path);
7822         return ret;
7823 }
7824
7825 /*
7826  * called from commit_transaction. Writes all changed device stats to disk.
7827  */
7828 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
7829 {
7830         struct btrfs_fs_info *fs_info = trans->fs_info;
7831         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7832         struct btrfs_device *device;
7833         int stats_cnt;
7834         int ret = 0;
7835
7836         mutex_lock(&fs_devices->device_list_mutex);
7837         list_for_each_entry(device, &fs_devices->devices, dev_list) {
7838                 stats_cnt = atomic_read(&device->dev_stats_ccnt);
7839                 if (!device->dev_stats_valid || stats_cnt == 0)
7840                         continue;
7841
7842
7843                 /*
7844                  * There is a LOAD-LOAD control dependency between the value of
7845                  * dev_stats_ccnt and updating the on-disk values which requires
7846                  * reading the in-memory counters. Such control dependencies
7847                  * require explicit read memory barriers.
7848                  *
7849                  * This memory barriers pairs with smp_mb__before_atomic in
7850                  * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
7851                  * barrier implied by atomic_xchg in
7852                  * btrfs_dev_stats_read_and_reset
7853                  */
7854                 smp_rmb();
7855
7856                 ret = update_dev_stat_item(trans, device);
7857                 if (!ret)
7858                         atomic_sub(stats_cnt, &device->dev_stats_ccnt);
7859         }
7860         mutex_unlock(&fs_devices->device_list_mutex);
7861
7862         return ret;
7863 }
7864
/*
 * Bump the error counter selected by @index on @dev, then report the
 * device's current counters through btrfs_dev_stat_print_on_error().
 */
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
	/* Count first so that the message already includes this error. */
	btrfs_dev_stat_inc(dev, index);

	btrfs_dev_stat_print_on_error(dev);
}
7870
7871 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
7872 {
7873         if (!dev->dev_stats_valid)
7874                 return;
7875         btrfs_err_rl_in_rcu(dev->fs_info,
7876                 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7877                            rcu_str_deref(dev->name),
7878                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
7879                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
7880                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7881                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
7882                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7883 }
7884
7885 static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
7886 {
7887         int i;
7888
7889         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7890                 if (btrfs_dev_stat_read(dev, i) != 0)
7891                         break;
7892         if (i == BTRFS_DEV_STAT_VALUES_MAX)
7893                 return; /* all values == 0, suppress message */
7894
7895         btrfs_info_in_rcu(dev->fs_info,
7896                 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7897                rcu_str_deref(dev->name),
7898                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
7899                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
7900                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7901                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
7902                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7903 }
7904
7905 int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
7906                         struct btrfs_ioctl_get_dev_stats *stats)
7907 {
7908         BTRFS_DEV_LOOKUP_ARGS(args);
7909         struct btrfs_device *dev;
7910         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7911         int i;
7912
7913         mutex_lock(&fs_devices->device_list_mutex);
7914         args.devid = stats->devid;
7915         dev = btrfs_find_device(fs_info->fs_devices, &args);
7916         mutex_unlock(&fs_devices->device_list_mutex);
7917
7918         if (!dev) {
7919                 btrfs_warn(fs_info, "get dev_stats failed, device not found");
7920                 return -ENODEV;
7921         } else if (!dev->dev_stats_valid) {
7922                 btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
7923                 return -ENODEV;
7924         } else if (stats->flags & BTRFS_DEV_STATS_RESET) {
7925                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7926                         if (stats->nr_items > i)
7927                                 stats->values[i] =
7928                                         btrfs_dev_stat_read_and_reset(dev, i);
7929                         else
7930                                 btrfs_dev_stat_set(dev, i, 0);
7931                 }
7932                 btrfs_info(fs_info, "device stats zeroed by %s (%d)",
7933                            current->comm, task_pid_nr(current));
7934         } else {
7935                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7936                         if (stats->nr_items > i)
7937                                 stats->values[i] = btrfs_dev_stat_read(dev, i);
7938         }
7939         if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
7940                 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
7941         return 0;
7942 }
7943
7944 /*
7945  * Update the size and bytes used for each device where it changed.  This is
7946  * delayed since we would otherwise get errors while writing out the
7947  * superblocks.
7948  *
7949  * Must be invoked during transaction commit.
7950  */
7951 void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
7952 {
7953         struct btrfs_device *curr, *next;
7954
7955         ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);
7956
7957         if (list_empty(&trans->dev_update_list))
7958                 return;
7959
7960         /*
7961          * We don't need the device_list_mutex here.  This list is owned by the
7962          * transaction and the transaction must complete before the device is
7963          * released.
7964          */
7965         mutex_lock(&trans->fs_info->chunk_mutex);
7966         list_for_each_entry_safe(curr, next, &trans->dev_update_list,
7967                                  post_commit_list) {
7968                 list_del_init(&curr->post_commit_list);
7969                 curr->commit_total_bytes = curr->disk_total_bytes;
7970                 curr->commit_bytes_used = curr->bytes_used;
7971         }
7972         mutex_unlock(&trans->fs_info->chunk_mutex);
7973 }
7974
7975 /*
7976  * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
7977  */
7978 int btrfs_bg_type_to_factor(u64 flags)
7979 {
7980         const int index = btrfs_bg_flags_to_raid_index(flags);
7981
7982         return btrfs_raid_array[index].ncopies;
7983 }
7984
7985
7986
7987 static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
7988                                  u64 chunk_offset, u64 devid,
7989                                  u64 physical_offset, u64 physical_len)
7990 {
7991         struct btrfs_dev_lookup_args args = { .devid = devid };
7992         struct extent_map_tree *em_tree = &fs_info->mapping_tree;
7993         struct extent_map *em;
7994         struct map_lookup *map;
7995         struct btrfs_device *dev;
7996         u64 stripe_len;
7997         bool found = false;
7998         int ret = 0;
7999         int i;
8000
8001         read_lock(&em_tree->lock);
8002         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
8003         read_unlock(&em_tree->lock);
8004
8005         if (!em) {
8006                 btrfs_err(fs_info,
8007 "dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
8008                           physical_offset, devid);
8009                 ret = -EUCLEAN;
8010                 goto out;
8011         }
8012
8013         map = em->map_lookup;
8014         stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
8015         if (physical_len != stripe_len) {
8016                 btrfs_err(fs_info,
8017 "dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
8018                           physical_offset, devid, em->start, physical_len,
8019                           stripe_len);
8020                 ret = -EUCLEAN;
8021                 goto out;
8022         }
8023
8024         for (i = 0; i < map->num_stripes; i++) {
8025                 if (map->stripes[i].dev->devid == devid &&
8026                     map->stripes[i].physical == physical_offset) {
8027                         found = true;
8028                         if (map->verified_stripes >= map->num_stripes) {
8029                                 btrfs_err(fs_info,
8030                                 "too many dev extents for chunk %llu found",
8031                                           em->start);
8032                                 ret = -EUCLEAN;
8033                                 goto out;
8034                         }
8035                         map->verified_stripes++;
8036                         break;
8037                 }
8038         }
8039         if (!found) {
8040                 btrfs_err(fs_info,
8041         "dev extent physical offset %llu devid %llu has no corresponding chunk",
8042                         physical_offset, devid);
8043                 ret = -EUCLEAN;
8044         }
8045
8046         /* Make sure no dev extent is beyond device boundary */
8047         dev = btrfs_find_device(fs_info->fs_devices, &args);
8048         if (!dev) {
8049                 btrfs_err(fs_info, "failed to find devid %llu", devid);
8050                 ret = -EUCLEAN;
8051                 goto out;
8052         }
8053
8054         if (physical_offset + physical_len > dev->disk_total_bytes) {
8055                 btrfs_err(fs_info,
8056 "dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
8057                           devid, physical_offset, physical_len,
8058                           dev->disk_total_bytes);
8059                 ret = -EUCLEAN;
8060                 goto out;
8061         }
8062
8063         if (dev->zone_info) {
8064                 u64 zone_size = dev->zone_info->zone_size;
8065
8066                 if (!IS_ALIGNED(physical_offset, zone_size) ||
8067                     !IS_ALIGNED(physical_len, zone_size)) {
8068                         btrfs_err(fs_info,
8069 "zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
8070                                   devid, physical_offset, physical_len);
8071                         ret = -EUCLEAN;
8072                         goto out;
8073                 }
8074         }
8075
8076 out:
8077         free_extent_map(em);
8078         return ret;
8079 }
8080
8081 static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
8082 {
8083         struct extent_map_tree *em_tree = &fs_info->mapping_tree;
8084         struct extent_map *em;
8085         struct rb_node *node;
8086         int ret = 0;
8087
8088         read_lock(&em_tree->lock);
8089         for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
8090                 em = rb_entry(node, struct extent_map, rb_node);
8091                 if (em->map_lookup->num_stripes !=
8092                     em->map_lookup->verified_stripes) {
8093                         btrfs_err(fs_info,
8094                         "chunk %llu has missing dev extent, have %d expect %d",
8095                                   em->start, em->map_lookup->verified_stripes,
8096                                   em->map_lookup->num_stripes);
8097                         ret = -EUCLEAN;
8098                         goto out;
8099                 }
8100         }
8101 out:
8102         read_unlock(&em_tree->lock);
8103         return ret;
8104 }
8105
8106 /*
8107  * Ensure that all dev extents are mapped to correct chunk, otherwise
8108  * later chunk allocation/free would cause unexpected behavior.
8109  *
8110  * NOTE: This will iterate through the whole device tree, which should be of
8111  * the same size level as the chunk tree.  This slightly increases mount time.
8112  */
8113 int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
8114 {
8115         struct btrfs_path *path;
8116         struct btrfs_root *root = fs_info->dev_root;
8117         struct btrfs_key key;
8118         u64 prev_devid = 0;
8119         u64 prev_dev_ext_end = 0;
8120         int ret = 0;
8121
8122         /*
8123          * We don't have a dev_root because we mounted with ignorebadroots and
8124          * failed to load the root, so we want to skip the verification in this
8125          * case for sure.
8126          *
8127          * However if the dev root is fine, but the tree itself is corrupted
8128          * we'd still fail to mount.  This verification is only to make sure
8129          * writes can happen safely, so instead just bypass this check
8130          * completely in the case of IGNOREBADROOTS.
8131          */
8132         if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
8133                 return 0;
8134
8135         key.objectid = 1;
8136         key.type = BTRFS_DEV_EXTENT_KEY;
8137         key.offset = 0;
8138
8139         path = btrfs_alloc_path();
8140         if (!path)
8141                 return -ENOMEM;
8142
8143         path->reada = READA_FORWARD;
8144         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
8145         if (ret < 0)
8146                 goto out;
8147
8148         if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
8149                 ret = btrfs_next_leaf(root, path);
8150                 if (ret < 0)
8151                         goto out;
8152                 /* No dev extents at all? Not good */
8153                 if (ret > 0) {
8154                         ret = -EUCLEAN;
8155                         goto out;
8156                 }
8157         }
8158         while (1) {
8159                 struct extent_buffer *leaf = path->nodes[0];
8160                 struct btrfs_dev_extent *dext;
8161                 int slot = path->slots[0];
8162                 u64 chunk_offset;
8163                 u64 physical_offset;
8164                 u64 physical_len;
8165                 u64 devid;
8166
8167                 btrfs_item_key_to_cpu(leaf, &key, slot);
8168                 if (key.type != BTRFS_DEV_EXTENT_KEY)
8169                         break;
8170                 devid = key.objectid;
8171                 physical_offset = key.offset;
8172
8173                 dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
8174                 chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
8175                 physical_len = btrfs_dev_extent_length(leaf, dext);
8176
8177                 /* Check if this dev extent overlaps with the previous one */
8178                 if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
8179                         btrfs_err(fs_info,
8180 "dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
8181                                   devid, physical_offset, prev_dev_ext_end);
8182                         ret = -EUCLEAN;
8183                         goto out;
8184                 }
8185
8186                 ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
8187                                             physical_offset, physical_len);
8188                 if (ret < 0)
8189                         goto out;
8190                 prev_devid = devid;
8191                 prev_dev_ext_end = physical_offset + physical_len;
8192
8193                 ret = btrfs_next_item(root, path);
8194                 if (ret < 0)
8195                         goto out;
8196                 if (ret > 0) {
8197                         ret = 0;
8198                         break;
8199                 }
8200         }
8201
8202         /* Ensure all chunks have corresponding dev extents */
8203         ret = verify_chunk_dev_extent_mapping(fs_info);
8204 out:
8205         btrfs_free_path(path);
8206         return ret;
8207 }
8208
8209 /*
8210  * Check whether the given block group or device is pinned by any inode being
8211  * used as a swapfile.
8212  */
8213 bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
8214 {
8215         struct btrfs_swapfile_pin *sp;
8216         struct rb_node *node;
8217
8218         spin_lock(&fs_info->swapfile_pins_lock);
8219         node = fs_info->swapfile_pins.rb_node;
8220         while (node) {
8221                 sp = rb_entry(node, struct btrfs_swapfile_pin, node);
8222                 if (ptr < sp->ptr)
8223                         node = node->rb_left;
8224                 else if (ptr > sp->ptr)
8225                         node = node->rb_right;
8226                 else
8227                         break;
8228         }
8229         spin_unlock(&fs_info->swapfile_pins_lock);
8230         return node != NULL;
8231 }
8232
8233 static int relocating_repair_kthread(void *data)
8234 {
8235         struct btrfs_block_group *cache = data;
8236         struct btrfs_fs_info *fs_info = cache->fs_info;
8237         u64 target;
8238         int ret = 0;
8239
8240         target = cache->start;
8241         btrfs_put_block_group(cache);
8242
8243         sb_start_write(fs_info->sb);
8244         if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
8245                 btrfs_info(fs_info,
8246                            "zoned: skip relocating block group %llu to repair: EBUSY",
8247                            target);
8248                 sb_end_write(fs_info->sb);
8249                 return -EBUSY;
8250         }
8251
8252         mutex_lock(&fs_info->reclaim_bgs_lock);
8253
8254         /* Ensure block group still exists */
8255         cache = btrfs_lookup_block_group(fs_info, target);
8256         if (!cache)
8257                 goto out;
8258
8259         if (!cache->relocating_repair)
8260                 goto out;
8261
8262         ret = btrfs_may_alloc_data_chunk(fs_info, target);
8263         if (ret < 0)
8264                 goto out;
8265
8266         btrfs_info(fs_info,
8267                    "zoned: relocating block group %llu to repair IO failure",
8268                    target);
8269         ret = btrfs_relocate_chunk(fs_info, target);
8270
8271 out:
8272         if (cache)
8273                 btrfs_put_block_group(cache);
8274         mutex_unlock(&fs_info->reclaim_bgs_lock);
8275         btrfs_exclop_finish(fs_info);
8276         sb_end_write(fs_info->sb);
8277
8278         return ret;
8279 }
8280
8281 bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
8282 {
8283         struct btrfs_block_group *cache;
8284
8285         if (!btrfs_is_zoned(fs_info))
8286                 return false;
8287
8288         /* Do not attempt to repair in degraded state */
8289         if (btrfs_test_opt(fs_info, DEGRADED))
8290                 return true;
8291
8292         cache = btrfs_lookup_block_group(fs_info, logical);
8293         if (!cache)
8294                 return true;
8295
8296         spin_lock(&cache->lock);
8297         if (cache->relocating_repair) {
8298                 spin_unlock(&cache->lock);
8299                 btrfs_put_block_group(cache);
8300                 return true;
8301         }
8302         cache->relocating_repair = 1;
8303         spin_unlock(&cache->lock);
8304
8305         kthread_run(relocating_repair_kthread, cache,
8306                     "btrfs-relocating-repair");
8307
8308         return true;
8309 }