block: remove i_bdev
[linux-2.6-microblaze.git] / fs / btrfs / volumes.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007 Oracle.  All rights reserved.
4  */
5
6 #include <linux/sched.h>
7 #include <linux/sched/mm.h>
8 #include <linux/bio.h>
9 #include <linux/slab.h>
10 #include <linux/blkdev.h>
11 #include <linux/ratelimit.h>
12 #include <linux/kthread.h>
13 #include <linux/raid/pq.h>
14 #include <linux/semaphore.h>
15 #include <linux/uuid.h>
16 #include <linux/list_sort.h>
17 #include "misc.h"
18 #include "ctree.h"
19 #include "extent_map.h"
20 #include "disk-io.h"
21 #include "transaction.h"
22 #include "print-tree.h"
23 #include "volumes.h"
24 #include "raid56.h"
25 #include "async-thread.h"
26 #include "check-integrity.h"
27 #include "rcu-string.h"
28 #include "dev-replace.h"
29 #include "sysfs.h"
30 #include "tree-checker.h"
31 #include "space-info.h"
32 #include "block-group.h"
33 #include "discard.h"
34
/*
 * Per-profile constants for every supported block group (RAID) type,
 * indexed by the BTRFS_RAID_* enum.  Notes on the fields as used below:
 * devs_max == 0 means "as many devices as possible"; nparity is the
 * number of parity stripes (non-zero only for raid5/raid6); mindev_error
 * is the error code reported when the profile's minimum device count is
 * not met (0 when there is no dedicated error for the profile).
 */
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = {
		.sub_stripes	= 2,
		.dev_stripes	= 1,
		.devs_max	= 0,	/* 0 == as many as possible */
		.devs_min	= 4,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
		.nparity        = 0,
		.raid_name	= "raid10",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID10,
		.mindev_error	= BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 2,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
		.nparity        = 0,
		.raid_name	= "raid1",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1C3] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 3,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 3,
		.ncopies	= 3,
		.nparity        = 0,
		.raid_name	= "raid1c3",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1C3,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1C4] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 4,
		.devs_min	= 4,
		.tolerated_failures = 3,
		.devs_increment	= 4,
		.ncopies	= 4,
		.nparity        = 0,
		.raid_name	= "raid1c4",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1C4,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
	},
	[BTRFS_RAID_DUP] = {
		.sub_stripes	= 1,
		.dev_stripes	= 2,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 2,
		.nparity        = 0,
		.raid_name	= "dup",
		.bg_flag	= BTRFS_BLOCK_GROUP_DUP,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_RAID0] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity        = 0,
		.raid_name	= "raid0",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID0,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_SINGLE] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity        = 0,
		.raid_name	= "single",
		.bg_flag	= 0,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_RAID5] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity        = 1,
		.raid_name	= "raid5",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID5,
		.mindev_error	= BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID6] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity        = 2,
		.raid_name	= "raid6",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID6,
		.mindev_error	= BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
	},
};
154
155 const char *btrfs_bg_type_to_raid_name(u64 flags)
156 {
157         const int index = btrfs_bg_flags_to_raid_index(flags);
158
159         if (index >= BTRFS_NR_RAID_TYPES)
160                 return NULL;
161
162         return btrfs_raid_array[index].raid_name;
163 }
164
165 /*
166  * Fill @buf with textual description of @bg_flags, no more than @size_buf
167  * bytes including terminating null byte.
168  */
169 void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
170 {
171         int i;
172         int ret;
173         char *bp = buf;
174         u64 flags = bg_flags;
175         u32 size_bp = size_buf;
176
177         if (!flags) {
178                 strcpy(bp, "NONE");
179                 return;
180         }
181
182 #define DESCRIBE_FLAG(flag, desc)                                               \
183         do {                                                            \
184                 if (flags & (flag)) {                                   \
185                         ret = snprintf(bp, size_bp, "%s|", (desc));     \
186                         if (ret < 0 || ret >= size_bp)                  \
187                                 goto out_overflow;                      \
188                         size_bp -= ret;                                 \
189                         bp += ret;                                      \
190                         flags &= ~(flag);                               \
191                 }                                                       \
192         } while (0)
193
194         DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
195         DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
196         DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
197
198         DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
199         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
200                 DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
201                               btrfs_raid_array[i].raid_name);
202 #undef DESCRIBE_FLAG
203
204         if (flags) {
205                 ret = snprintf(bp, size_bp, "0x%llx|", flags);
206                 size_bp -= ret;
207         }
208
209         if (size_bp < size_buf)
210                 buf[size_buf - size_bp - 1] = '\0'; /* remove last | */
211
212         /*
213          * The text is trimmed, it's up to the caller to provide sufficiently
214          * large buffer
215          */
216 out_overflow:;
217 }
218
219 static int init_first_rw_device(struct btrfs_trans_handle *trans);
220 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
221 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
222 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
223 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
224                              enum btrfs_map_op op,
225                              u64 logical, u64 *length,
226                              struct btrfs_bio **bbio_ret,
227                              int mirror_num, int need_raid_map);
228
229 /*
230  * Device locking
231  * ==============
232  *
233  * There are several mutexes that protect manipulation of devices and low-level
234  * structures like chunks but not block groups, extents or files
235  *
236  * uuid_mutex (global lock)
237  * ------------------------
238  * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
239  * the SCAN_DEV ioctl registration or from mount either implicitly (the first
240  * device) or requested by the device= mount option
241  *
242  * the mutex can be very coarse and can cover long-running operations
243  *
244  * protects: updates to fs_devices counters like missing devices, rw devices,
245  * seeding, structure cloning, opening/closing devices at mount/umount time
246  *
247  * global::fs_devs - add, remove, updates to the global list
248  *
249  * does not protect: manipulation of the fs_devices::devices list in general
250  * but in mount context it could be used to exclude list modifications by eg.
251  * scan ioctl
252  *
253  * btrfs_device::name - renames (write side), read is RCU
254  *
255  * fs_devices::device_list_mutex (per-fs, with RCU)
256  * ------------------------------------------------
257  * protects updates to fs_devices::devices, ie. adding and deleting
258  *
259  * simple list traversal with read-only actions can be done with RCU protection
260  *
261  * may be used to exclude some operations from running concurrently without any
262  * modifications to the list (see write_all_supers)
263  *
264  * Is not required at mount and close times, because our device list is
265  * protected by the uuid_mutex at that point.
266  *
267  * balance_mutex
268  * -------------
269  * protects balance structures (status, state) and context accessed from
270  * several places (internally, ioctl)
271  *
272  * chunk_mutex
273  * -----------
274  * protects chunks, adding or removing during allocation, trim or when a new
275  * device is added/removed. Additionally it also protects post_commit_list of
276  * individual devices, since they can be added to the transaction's
277  * post_commit_list only with chunk_mutex held.
278  *
279  * cleaner_mutex
280  * -------------
281  * a big lock that is held by the cleaner thread and prevents running subvolume
282  * cleaning together with relocation or delayed iputs
283  *
284  *
285  * Lock nesting
286  * ============
287  *
288  * uuid_mutex
289  *   device_list_mutex
290  *     chunk_mutex
291  *   balance_mutex
292  *
293  *
294  * Exclusive operations
295  * ====================
296  *
297  * Maintains the exclusivity of the following operations that apply to the
298  * whole filesystem and cannot run in parallel.
299  *
300  * - Balance (*)
301  * - Device add
302  * - Device remove
303  * - Device replace (*)
304  * - Resize
305  *
306  * The device operations (as above) can be in one of the following states:
307  *
308  * - Running state
309  * - Paused state
310  * - Completed state
311  *
312  * Only device operations marked with (*) can go into the Paused state for the
313  * following reasons:
314  *
315  * - ioctl (only Balance can be Paused through ioctl)
316  * - filesystem remounted as read-only
317  * - filesystem unmounted and mounted as read-only
318  * - system power-cycle and filesystem mounted as read-only
319  * - filesystem or device errors leading to forced read-only
320  *
321  * The status of exclusive operation is set and cleared atomically.
322  * During the course of Paused state, fs_info::exclusive_operation remains set.
323  * A device operation in Paused or Running state can be canceled or resumed
324  * either by ioctl (Balance only) or when remounted as read-write.
325  * The exclusive status is cleared when the device operation is canceled or
326  * completed.
327  */
328
DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
/*
 * Return the global list of all registered per-filesystem fs_devices.
 * Per the locking documentation above, traversal is protected by
 * uuid_mutex.
 */
struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
{
	return &fs_uuids;
}
335
336 /*
337  * alloc_fs_devices - allocate struct btrfs_fs_devices
338  * @fsid:               if not NULL, copy the UUID to fs_devices::fsid
339  * @metadata_fsid:      if not NULL, copy the UUID to fs_devices::metadata_fsid
340  *
341  * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
342  * The returned struct is not linked onto any lists and can be destroyed with
343  * kfree() right away.
344  */
345 static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
346                                                  const u8 *metadata_fsid)
347 {
348         struct btrfs_fs_devices *fs_devs;
349
350         fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
351         if (!fs_devs)
352                 return ERR_PTR(-ENOMEM);
353
354         mutex_init(&fs_devs->device_list_mutex);
355
356         INIT_LIST_HEAD(&fs_devs->devices);
357         INIT_LIST_HEAD(&fs_devs->alloc_list);
358         INIT_LIST_HEAD(&fs_devs->fs_list);
359         INIT_LIST_HEAD(&fs_devs->seed_list);
360         if (fsid)
361                 memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
362
363         if (metadata_fsid)
364                 memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
365         else if (fsid)
366                 memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);
367
368         return fs_devs;
369 }
370
/*
 * Free a btrfs_device together with the resources it owns: the rcu-string
 * name, the per-device allocation state io tree and the preallocated flush
 * bio.  The device is expected to be off the post_commit_list already.
 */
void btrfs_free_device(struct btrfs_device *device)
{
	WARN_ON(!list_empty(&device->post_commit_list));
	rcu_string_free(device->name);
	extent_io_tree_release(&device->alloc_state);
	bio_put(device->flush_bio);
	kfree(device);
}
379
380 static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
381 {
382         struct btrfs_device *device;
383         WARN_ON(fs_devices->opened);
384         while (!list_empty(&fs_devices->devices)) {
385                 device = list_entry(fs_devices->devices.next,
386                                     struct btrfs_device, dev_list);
387                 list_del(&device->dev_list);
388                 btrfs_free_device(device);
389         }
390         kfree(fs_devices);
391 }
392
393 void __exit btrfs_cleanup_fs_uuids(void)
394 {
395         struct btrfs_fs_devices *fs_devices;
396
397         while (!list_empty(&fs_uuids)) {
398                 fs_devices = list_entry(fs_uuids.next,
399                                         struct btrfs_fs_devices, fs_list);
400                 list_del(&fs_devices->fs_list);
401                 free_fs_devices(fs_devices);
402         }
403 }
404
405 /*
406  * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
407  * Returned struct is not linked onto any lists and must be destroyed using
408  * btrfs_free_device.
409  */
410 static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info)
411 {
412         struct btrfs_device *dev;
413
414         dev = kzalloc(sizeof(*dev), GFP_KERNEL);
415         if (!dev)
416                 return ERR_PTR(-ENOMEM);
417
418         /*
419          * Preallocate a bio that's always going to be used for flushing device
420          * barriers and matches the device lifespan
421          */
422         dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
423         if (!dev->flush_bio) {
424                 kfree(dev);
425                 return ERR_PTR(-ENOMEM);
426         }
427
428         INIT_LIST_HEAD(&dev->dev_list);
429         INIT_LIST_HEAD(&dev->dev_alloc_list);
430         INIT_LIST_HEAD(&dev->post_commit_list);
431
432         atomic_set(&dev->reada_in_flight, 0);
433         atomic_set(&dev->dev_stats_ccnt, 0);
434         btrfs_device_data_ordered_init(dev, fs_info);
435         INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
436         INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
437         extent_io_tree_init(fs_info, &dev->alloc_state,
438                             IO_TREE_DEVICE_ALLOC_STATE, NULL);
439
440         return dev;
441 }
442
443 static noinline struct btrfs_fs_devices *find_fsid(
444                 const u8 *fsid, const u8 *metadata_fsid)
445 {
446         struct btrfs_fs_devices *fs_devices;
447
448         ASSERT(fsid);
449
450         /* Handle non-split brain cases */
451         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
452                 if (metadata_fsid) {
453                         if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
454                             && memcmp(metadata_fsid, fs_devices->metadata_uuid,
455                                       BTRFS_FSID_SIZE) == 0)
456                                 return fs_devices;
457                 } else {
458                         if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
459                                 return fs_devices;
460                 }
461         }
462         return NULL;
463 }
464
/*
 * Look up the fs_devices that a scanned device carrying a separate
 * metadata_uuid belongs to.  Two leftover split-brain situations from an
 * interrupted fsid change are tried first; otherwise fall through to a
 * plain fsid + metadata_uuid lookup.
 */
static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
				struct btrfs_super_block *disk_super)
{

	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handle scanned device having completed its fsid change but
	 * belonging to a fs_devices that was created by first scanning
	 * a device which didn't have its fsid/metadata_uuid changed
	 * at all and the CHANGING_FSID_V2 flag set.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (fs_devices->fsid_change &&
		    memcmp(disk_super->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0) {
			return fs_devices;
		}
	}
	/*
	 * Handle scanned device having completed its fsid change but
	 * belonging to a fs_devices that was created by a device that
	 * has an outdated pair of fsid/metadata_uuid and
	 * CHANGING_FSID_V2 flag set.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (fs_devices->fsid_change &&
		    memcmp(fs_devices->metadata_uuid,
			   fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
		    memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0) {
			return fs_devices;
		}
	}

	return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
}
504
505
/*
 * Open the block device at @device_path and read its btrfs super block.
 *
 * @flags and @holder are passed through to blkdev_get_by_path().  When
 * @flush is set, dirty pages of the device's page cache are written back
 * before the super block is read.  On success *bdev and *disk_super are
 * valid and the caller owns both; on any failure the device is released,
 * *bdev is set to NULL and a negative errno is returned.
 */
static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
		      int flush, struct block_device **bdev,
		      struct btrfs_super_block **disk_super)
{
	int ret;

	*bdev = blkdev_get_by_path(device_path, flags, holder);

	if (IS_ERR(*bdev)) {
		ret = PTR_ERR(*bdev);
		goto error;
	}

	if (flush)
		filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
	ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
	if (ret) {
		blkdev_put(*bdev, flags);
		goto error;
	}
	/* Drop stale cached pages so the super block read hits the disk. */
	invalidate_bdev(*bdev);
	*disk_super = btrfs_read_dev_super(*bdev);
	if (IS_ERR(*disk_super)) {
		ret = PTR_ERR(*disk_super);
		blkdev_put(*bdev, flags);
		goto error;
	}

	return 0;

error:
	*bdev = NULL;
	return ret;
}
541
542 static bool device_path_matched(const char *path, struct btrfs_device *device)
543 {
544         int found;
545
546         rcu_read_lock();
547         found = strcmp(rcu_str_deref(device->name), path);
548         rcu_read_unlock();
549
550         return found == 0;
551 }
552
553 /*
554  *  Search and remove all stale (devices which are not mounted) devices.
555  *  When both inputs are NULL, it will search and release all stale devices.
556  *  path:       Optional. When provided will it release all unmounted devices
557  *              matching this path only.
558  *  skip_dev:   Optional. Will skip this device when searching for the stale
559  *              devices.
560  *  Return:     0 for success or if @path is NULL.
561  *              -EBUSY if @path is a mounted device.
562  *              -ENOENT if @path does not match any device in the list.
563  */
564 static int btrfs_free_stale_devices(const char *path,
565                                      struct btrfs_device *skip_device)
566 {
567         struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
568         struct btrfs_device *device, *tmp_device;
569         int ret = 0;
570
571         if (path)
572                 ret = -ENOENT;
573
574         list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
575
576                 mutex_lock(&fs_devices->device_list_mutex);
577                 list_for_each_entry_safe(device, tmp_device,
578                                          &fs_devices->devices, dev_list) {
579                         if (skip_device && skip_device == device)
580                                 continue;
581                         if (path && !device->name)
582                                 continue;
583                         if (path && !device_path_matched(path, device))
584                                 continue;
585                         if (fs_devices->opened) {
586                                 /* for an already deleted device return 0 */
587                                 if (path && ret != 0)
588                                         ret = -EBUSY;
589                                 break;
590                         }
591
592                         /* delete the stale device */
593                         fs_devices->num_devices--;
594                         list_del(&device->dev_list);
595                         btrfs_free_device(device);
596
597                         ret = 0;
598                 }
599                 mutex_unlock(&fs_devices->device_list_mutex);
600
601                 if (fs_devices->num_devices == 0) {
602                         btrfs_sysfs_remove_fsid(fs_devices);
603                         list_del(&fs_devices->fs_list);
604                         free_fs_devices(fs_devices);
605                 }
606         }
607
608         return ret;
609 }
610
611 /*
612  * This is only used on mount, and we are protected from competing things
613  * messing with our fs_devices by the uuid_mutex, thus we do not need the
614  * fs_devices->device_list_mutex here.
615  */
616 static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
617                         struct btrfs_device *device, fmode_t flags,
618                         void *holder)
619 {
620         struct request_queue *q;
621         struct block_device *bdev;
622         struct btrfs_super_block *disk_super;
623         u64 devid;
624         int ret;
625
626         if (device->bdev)
627                 return -EINVAL;
628         if (!device->name)
629                 return -EINVAL;
630
631         ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
632                                     &bdev, &disk_super);
633         if (ret)
634                 return ret;
635
636         devid = btrfs_stack_device_id(&disk_super->dev_item);
637         if (devid != device->devid)
638                 goto error_free_page;
639
640         if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
641                 goto error_free_page;
642
643         device->generation = btrfs_super_generation(disk_super);
644
645         if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
646                 if (btrfs_super_incompat_flags(disk_super) &
647                     BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
648                         pr_err(
649                 "BTRFS: Invalid seeding and uuid-changed device detected\n");
650                         goto error_free_page;
651                 }
652
653                 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
654                 fs_devices->seeding = true;
655         } else {
656                 if (bdev_read_only(bdev))
657                         clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
658                 else
659                         set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
660         }
661
662         q = bdev_get_queue(bdev);
663         if (!blk_queue_nonrot(q))
664                 fs_devices->rotating = true;
665
666         device->bdev = bdev;
667         clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
668         device->mode = flags;
669
670         fs_devices->open_devices++;
671         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
672             device->devid != BTRFS_DEV_REPLACE_DEVID) {
673                 fs_devices->rw_devices++;
674                 list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
675         }
676         btrfs_release_disk_super(disk_super);
677
678         return 0;
679
680 error_free_page:
681         btrfs_release_disk_super(disk_super);
682         blkdev_put(bdev, flags);
683
684         return -EINVAL;
685 }
686
687 /*
688  * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
689  * being created with a disk that has already completed its fsid change. Such
690  * disk can belong to an fs which has its FSID changed or to one which doesn't.
691  * Handle both cases here.
692  */
693 static struct btrfs_fs_devices *find_fsid_inprogress(
694                                         struct btrfs_super_block *disk_super)
695 {
696         struct btrfs_fs_devices *fs_devices;
697
698         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
699                 if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
700                            BTRFS_FSID_SIZE) != 0 &&
701                     memcmp(fs_devices->metadata_uuid, disk_super->fsid,
702                            BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
703                         return fs_devices;
704                 }
705         }
706
707         return find_fsid(disk_super->fsid, NULL);
708 }
709
710
static struct btrfs_fs_devices *find_fsid_changed(
					struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handles the case where scanned device is part of an fs that had
	 * multiple successful changes of FSID but currently device didn't
	 * observe it. Meaning our fsid will be different than theirs. We need
	 * to handle two subcases :
	 *  1 - The fs still continues to have different METADATA/FSID uuids.
	 *  2 - The fs is switched back to its original FSID (METADATA/FSID
	 *  are equal).
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		/* Changed UUIDs */
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, disk_super->fsid,
			   BTRFS_FSID_SIZE) != 0)
			return fs_devices;

		/* Unchanged UUIDs */
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, disk_super->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0)
			return fs_devices;
	}

	return NULL;
}
745
static struct btrfs_fs_devices *find_fsid_reverted_metadata(
				struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handle the case where the scanned device is part of an fs whose last
	 * metadata UUID change reverted it to the original FSID. At the same
	 * time fs_devices was first created by another constituent device
	 * which didn't fully observe the operation. This results in an
	 * btrfs_fs_devices created with metadata/fsid different AND
	 * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
	 * fs_devices equal to the FSID of the disk.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    fs_devices->fsid_change)
			return fs_devices;
	}

	return NULL;
}
771 /*
772  * Add new device to list of registered devices
773  *
774  * Returns:
775  * device pointer which was just added or updated when successful
776  * error pointer when failed
777  */
778 static noinline struct btrfs_device *device_list_add(const char *path,
779                            struct btrfs_super_block *disk_super,
780                            bool *new_device_added)
781 {
782         struct btrfs_device *device;
783         struct btrfs_fs_devices *fs_devices = NULL;
784         struct rcu_string *name;
785         u64 found_transid = btrfs_super_generation(disk_super);
786         u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
787         bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
788                 BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
789         bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
790                                         BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
791
792         if (fsid_change_in_progress) {
793                 if (!has_metadata_uuid)
794                         fs_devices = find_fsid_inprogress(disk_super);
795                 else
796                         fs_devices = find_fsid_changed(disk_super);
797         } else if (has_metadata_uuid) {
798                 fs_devices = find_fsid_with_metadata_uuid(disk_super);
799         } else {
800                 fs_devices = find_fsid_reverted_metadata(disk_super);
801                 if (!fs_devices)
802                         fs_devices = find_fsid(disk_super->fsid, NULL);
803         }
804
805
806         if (!fs_devices) {
807                 if (has_metadata_uuid)
808                         fs_devices = alloc_fs_devices(disk_super->fsid,
809                                                       disk_super->metadata_uuid);
810                 else
811                         fs_devices = alloc_fs_devices(disk_super->fsid, NULL);
812
813                 if (IS_ERR(fs_devices))
814                         return ERR_CAST(fs_devices);
815
816                 fs_devices->fsid_change = fsid_change_in_progress;
817
818                 mutex_lock(&fs_devices->device_list_mutex);
819                 list_add(&fs_devices->fs_list, &fs_uuids);
820
821                 device = NULL;
822         } else {
823                 mutex_lock(&fs_devices->device_list_mutex);
824                 device = btrfs_find_device(fs_devices, devid,
825                                 disk_super->dev_item.uuid, NULL, false);
826
827                 /*
828                  * If this disk has been pulled into an fs devices created by
829                  * a device which had the CHANGING_FSID_V2 flag then replace the
830                  * metadata_uuid/fsid values of the fs_devices.
831                  */
832                 if (fs_devices->fsid_change &&
833                     found_transid > fs_devices->latest_generation) {
834                         memcpy(fs_devices->fsid, disk_super->fsid,
835                                         BTRFS_FSID_SIZE);
836
837                         if (has_metadata_uuid)
838                                 memcpy(fs_devices->metadata_uuid,
839                                        disk_super->metadata_uuid,
840                                        BTRFS_FSID_SIZE);
841                         else
842                                 memcpy(fs_devices->metadata_uuid,
843                                        disk_super->fsid, BTRFS_FSID_SIZE);
844
845                         fs_devices->fsid_change = false;
846                 }
847         }
848
849         if (!device) {
850                 if (fs_devices->opened) {
851                         mutex_unlock(&fs_devices->device_list_mutex);
852                         return ERR_PTR(-EBUSY);
853                 }
854
855                 device = btrfs_alloc_device(NULL, &devid,
856                                             disk_super->dev_item.uuid);
857                 if (IS_ERR(device)) {
858                         mutex_unlock(&fs_devices->device_list_mutex);
859                         /* we can safely leave the fs_devices entry around */
860                         return device;
861                 }
862
863                 name = rcu_string_strdup(path, GFP_NOFS);
864                 if (!name) {
865                         btrfs_free_device(device);
866                         mutex_unlock(&fs_devices->device_list_mutex);
867                         return ERR_PTR(-ENOMEM);
868                 }
869                 rcu_assign_pointer(device->name, name);
870
871                 list_add_rcu(&device->dev_list, &fs_devices->devices);
872                 fs_devices->num_devices++;
873
874                 device->fs_devices = fs_devices;
875                 *new_device_added = true;
876
877                 if (disk_super->label[0])
878                         pr_info(
879         "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
880                                 disk_super->label, devid, found_transid, path,
881                                 current->comm, task_pid_nr(current));
882                 else
883                         pr_info(
884         "BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
885                                 disk_super->fsid, devid, found_transid, path,
886                                 current->comm, task_pid_nr(current));
887
888         } else if (!device->name || strcmp(device->name->str, path)) {
889                 /*
890                  * When FS is already mounted.
891                  * 1. If you are here and if the device->name is NULL that
892                  *    means this device was missing at time of FS mount.
893                  * 2. If you are here and if the device->name is different
894                  *    from 'path' that means either
895                  *      a. The same device disappeared and reappeared with
896                  *         different name. or
897                  *      b. The missing-disk-which-was-replaced, has
898                  *         reappeared now.
899                  *
900                  * We must allow 1 and 2a above. But 2b would be a spurious
901                  * and unintentional.
902                  *
903                  * Further in case of 1 and 2a above, the disk at 'path'
904                  * would have missed some transaction when it was away and
905                  * in case of 2a the stale bdev has to be updated as well.
906                  * 2b must not be allowed at all time.
907                  */
908
909                 /*
910                  * For now, we do allow update to btrfs_fs_device through the
911                  * btrfs dev scan cli after FS has been mounted.  We're still
912                  * tracking a problem where systems fail mount by subvolume id
913                  * when we reject replacement on a mounted FS.
914                  */
915                 if (!fs_devices->opened && found_transid < device->generation) {
916                         /*
917                          * That is if the FS is _not_ mounted and if you
918                          * are here, that means there is more than one
919                          * disk with same uuid and devid.We keep the one
920                          * with larger generation number or the last-in if
921                          * generation are equal.
922                          */
923                         mutex_unlock(&fs_devices->device_list_mutex);
924                         return ERR_PTR(-EEXIST);
925                 }
926
927                 /*
928                  * We are going to replace the device path for a given devid,
929                  * make sure it's the same device if the device is mounted
930                  */
931                 if (device->bdev) {
932                         int error;
933                         dev_t path_dev;
934
935                         error = lookup_bdev(path, &path_dev);
936                         if (error) {
937                                 mutex_unlock(&fs_devices->device_list_mutex);
938                                 return ERR_PTR(error);
939                         }
940
941                         if (device->bdev->bd_dev != path_dev) {
942                                 mutex_unlock(&fs_devices->device_list_mutex);
943                                 btrfs_warn_in_rcu(device->fs_info,
944         "duplicate device %s devid %llu generation %llu scanned by %s (%d)",
945                                                   path, devid, found_transid,
946                                                   current->comm,
947                                                   task_pid_nr(current));
948                                 return ERR_PTR(-EEXIST);
949                         }
950                         btrfs_info_in_rcu(device->fs_info,
951         "devid %llu device path %s changed to %s scanned by %s (%d)",
952                                           devid, rcu_str_deref(device->name),
953                                           path, current->comm,
954                                           task_pid_nr(current));
955                 }
956
957                 name = rcu_string_strdup(path, GFP_NOFS);
958                 if (!name) {
959                         mutex_unlock(&fs_devices->device_list_mutex);
960                         return ERR_PTR(-ENOMEM);
961                 }
962                 rcu_string_free(device->name);
963                 rcu_assign_pointer(device->name, name);
964                 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
965                         fs_devices->missing_devices--;
966                         clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
967                 }
968         }
969
970         /*
971          * Unmount does not free the btrfs_device struct but would zero
972          * generation along with most of the other members. So just update
973          * it back. We need it to pick the disk with largest generation
974          * (as above).
975          */
976         if (!fs_devices->opened) {
977                 device->generation = found_transid;
978                 fs_devices->latest_generation = max_t(u64, found_transid,
979                                                 fs_devices->latest_generation);
980         }
981
982         fs_devices->total_devices = btrfs_super_num_devices(disk_super);
983
984         mutex_unlock(&fs_devices->device_list_mutex);
985         return device;
986 }
987
988 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
989 {
990         struct btrfs_fs_devices *fs_devices;
991         struct btrfs_device *device;
992         struct btrfs_device *orig_dev;
993         int ret = 0;
994
995         fs_devices = alloc_fs_devices(orig->fsid, NULL);
996         if (IS_ERR(fs_devices))
997                 return fs_devices;
998
999         mutex_lock(&orig->device_list_mutex);
1000         fs_devices->total_devices = orig->total_devices;
1001
1002         list_for_each_entry(orig_dev, &orig->devices, dev_list) {
1003                 struct rcu_string *name;
1004
1005                 device = btrfs_alloc_device(NULL, &orig_dev->devid,
1006                                             orig_dev->uuid);
1007                 if (IS_ERR(device)) {
1008                         ret = PTR_ERR(device);
1009                         goto error;
1010                 }
1011
1012                 /*
1013                  * This is ok to do without rcu read locked because we hold the
1014                  * uuid mutex so nothing we touch in here is going to disappear.
1015                  */
1016                 if (orig_dev->name) {
1017                         name = rcu_string_strdup(orig_dev->name->str,
1018                                         GFP_KERNEL);
1019                         if (!name) {
1020                                 btrfs_free_device(device);
1021                                 ret = -ENOMEM;
1022                                 goto error;
1023                         }
1024                         rcu_assign_pointer(device->name, name);
1025                 }
1026
1027                 list_add(&device->dev_list, &fs_devices->devices);
1028                 device->fs_devices = fs_devices;
1029                 fs_devices->num_devices++;
1030         }
1031         mutex_unlock(&orig->device_list_mutex);
1032         return fs_devices;
1033 error:
1034         mutex_unlock(&orig->device_list_mutex);
1035         free_fs_devices(fs_devices);
1036         return ERR_PTR(ret);
1037 }
1038
/*
 * Drop devices that are not part of the filesystem metadata and track the
 * device with the highest generation in *latest_dev.
 *
 * NOTE: @step is currently unused by this helper; it is kept to match the
 * caller's signature.
 */
static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
				      int step, struct btrfs_device **latest_dev)
{
	struct btrfs_device *device, *next;

	/* This is the initialized path, it is safe to release the devices. */
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
			/*
			 * Device belongs to the fs: keep it, and remember it
			 * as the latest candidate unless it is a replace
			 * target or missing.
			 */
			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
				      &device->dev_state) &&
			    !test_bit(BTRFS_DEV_STATE_MISSING,
				      &device->dev_state) &&
			    (!*latest_dev ||
			     device->generation > (*latest_dev)->generation)) {
				*latest_dev = device;
			}
			continue;
		}

		/*
		 * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID,
		 * in btrfs_init_dev_replace() so just continue.
		 */
		if (device->devid == BTRFS_DEV_REPLACE_DEVID)
			continue;

		/* Extra device: close its bdev and drop it from all lists. */
		if (device->bdev) {
			blkdev_put(device->bdev, device->mode);
			device->bdev = NULL;
			fs_devices->open_devices--;
		}
		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
			list_del_init(&device->dev_alloc_list);
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		}
		list_del_init(&device->dev_list);
		fs_devices->num_devices--;
		btrfs_free_device(device);
	}

}
1080
1081 /*
1082  * After we have read the system tree and know devids belonging to this
1083  * filesystem, remove the device which does not belong there.
1084  */
1085 void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
1086 {
1087         struct btrfs_device *latest_dev = NULL;
1088         struct btrfs_fs_devices *seed_dev;
1089
1090         mutex_lock(&uuid_mutex);
1091         __btrfs_free_extra_devids(fs_devices, step, &latest_dev);
1092
1093         list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
1094                 __btrfs_free_extra_devids(seed_dev, step, &latest_dev);
1095
1096         fs_devices->latest_bdev = latest_dev->bdev;
1097
1098         mutex_unlock(&uuid_mutex);
1099 }
1100
1101 static void btrfs_close_bdev(struct btrfs_device *device)
1102 {
1103         if (!device->bdev)
1104                 return;
1105
1106         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1107                 sync_blockdev(device->bdev);
1108                 invalidate_bdev(device->bdev);
1109         }
1110
1111         blkdev_put(device->bdev, device->mode);
1112 }
1113
/*
 * Close one device and reset its in-memory state so the btrfs_device struct
 * can be reused across unmount/mount cycles.  The struct itself stays on
 * fs_devices->devices; only the runtime state is torn down.
 */
static void btrfs_close_one_device(struct btrfs_device *device)
{
	struct btrfs_fs_devices *fs_devices = device->fs_devices;

	/* Writeable non-replace-target devices sit on the alloc list too. */
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		list_del_init(&device->dev_alloc_list);
		fs_devices->rw_devices--;
	}

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
		fs_devices->missing_devices--;

	btrfs_close_bdev(device);
	/* Only devices that actually had a bdev count as open. */
	if (device->bdev) {
		fs_devices->open_devices--;
		device->bdev = NULL;
	}
	clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);

	device->fs_info = NULL;
	atomic_set(&device->dev_stats_ccnt, 0);
	extent_io_tree_release(&device->alloc_state);

	/* Verify the device is back in a pristine state  */
	ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
	ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
	ASSERT(list_empty(&device->dev_alloc_list));
	ASSERT(list_empty(&device->post_commit_list));
	ASSERT(atomic_read(&device->reada_in_flight) == 0);
}
1145
1146 static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
1147 {
1148         struct btrfs_device *device, *tmp;
1149
1150         lockdep_assert_held(&uuid_mutex);
1151
1152         if (--fs_devices->opened > 0)
1153                 return;
1154
1155         list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
1156                 btrfs_close_one_device(device);
1157
1158         WARN_ON(fs_devices->open_devices);
1159         WARN_ON(fs_devices->rw_devices);
1160         fs_devices->opened = 0;
1161         fs_devices->seeding = false;
1162         fs_devices->fs_info = NULL;
1163 }
1164
1165 void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
1166 {
1167         LIST_HEAD(list);
1168         struct btrfs_fs_devices *tmp;
1169
1170         mutex_lock(&uuid_mutex);
1171         close_fs_devices(fs_devices);
1172         if (!fs_devices->opened)
1173                 list_splice_init(&fs_devices->seed_list, &list);
1174
1175         list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
1176                 close_fs_devices(fs_devices);
1177                 list_del(&fs_devices->seed_list);
1178                 free_fs_devices(fs_devices);
1179         }
1180         mutex_unlock(&uuid_mutex);
1181 }
1182
/*
 * Open every device of @fs_devices exclusively and record the device with
 * the highest generation as latest_bdev.  Devices whose superblock does not
 * match (-ENODATA) are dropped from the list; other open failures leave the
 * device in place.  Returns -EINVAL when no device could be opened.
 */
static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
				fmode_t flags, void *holder)
{
	struct btrfs_device *device;
	struct btrfs_device *latest_dev = NULL;
	struct btrfs_device *tmp_device;

	flags |= FMODE_EXCL;

	/* _safe iteration: -ENODATA entries are deleted while walking. */
	list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
				 dev_list) {
		int ret;

		ret = btrfs_open_one_device(fs_devices, device, flags, holder);
		if (ret == 0 &&
		    (!latest_dev || device->generation > latest_dev->generation)) {
			latest_dev = device;
		} else if (ret == -ENODATA) {
			fs_devices->num_devices--;
			list_del(&device->dev_list);
			btrfs_free_device(device);
		}
	}
	if (fs_devices->open_devices == 0)
		return -EINVAL;

	/* open_devices != 0 implies at least one success, so latest_dev is set. */
	fs_devices->opened = 1;
	fs_devices->latest_bdev = latest_dev->bdev;
	fs_devices->total_rw_bytes = 0;
	fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;

	return 0;
}
1216
1217 static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
1218 {
1219         struct btrfs_device *dev1, *dev2;
1220
1221         dev1 = list_entry(a, struct btrfs_device, dev_list);
1222         dev2 = list_entry(b, struct btrfs_device, dev_list);
1223
1224         if (dev1->devid < dev2->devid)
1225                 return -1;
1226         else if (dev1->devid > dev2->devid)
1227                 return 1;
1228         return 0;
1229 }
1230
1231 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
1232                        fmode_t flags, void *holder)
1233 {
1234         int ret;
1235
1236         lockdep_assert_held(&uuid_mutex);
1237         /*
1238          * The device_list_mutex cannot be taken here in case opening the
1239          * underlying device takes further locks like bd_mutex.
1240          *
1241          * We also don't need the lock here as this is called during mount and
1242          * exclusion is provided by uuid_mutex
1243          */
1244
1245         if (fs_devices->opened) {
1246                 fs_devices->opened++;
1247                 ret = 0;
1248         } else {
1249                 list_sort(NULL, &fs_devices->devices, devid_cmp);
1250                 ret = open_fs_devices(fs_devices, flags, holder);
1251         }
1252
1253         return ret;
1254 }
1255
/* Drop the page cache reference taken by btrfs_read_disk_super(). */
void btrfs_release_disk_super(struct btrfs_super_block *super)
{
	put_page(virt_to_page(super));
}
1262
/*
 * Read the superblock at @bytenr of @bdev through the page cache and return
 * a pointer into the cached page.  The caller must release the page with
 * btrfs_release_disk_super().  Returns an ERR_PTR when the super does not
 * fit, straddles a page, or fails validation (bytenr/magic).
 */
static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
						       u64 bytenr)
{
	struct btrfs_super_block *disk_super;
	struct page *page;
	void *p;
	pgoff_t index;

	/* make sure our super fits in the device */
	if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
		return ERR_PTR(-EINVAL);

	/* make sure our super fits in the page */
	if (sizeof(*disk_super) > PAGE_SIZE)
		return ERR_PTR(-EINVAL);

	/* make sure our super doesn't straddle pages on disk */
	index = bytenr >> PAGE_SHIFT;
	if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
		return ERR_PTR(-EINVAL);

	/* pull in the page with our super */
	page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);

	if (IS_ERR(page))
		return ERR_CAST(page);

	p = page_address(page);

	/* align our pointer to the offset of the super block */
	disk_super = p + offset_in_page(bytenr);

	/* Reject anything that is not a btrfs super at the expected offset. */
	if (btrfs_super_bytenr(disk_super) != bytenr ||
	    btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
		btrfs_release_disk_super(p);
		return ERR_PTR(-EINVAL);
	}

	/* Force NUL termination of the label before anyone prints it. */
	if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
		disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;

	return disk_super;
}
1306
1307 int btrfs_forget_devices(const char *path)
1308 {
1309         int ret;
1310
1311         mutex_lock(&uuid_mutex);
1312         ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
1313         mutex_unlock(&uuid_mutex);
1314
1315         return ret;
1316 }
1317
1318 /*
1319  * Look for a btrfs signature on a device. This may be called out of the mount path
1320  * and we are not allowed to call set_blocksize during the scan. The superblock
1321  * is read via pagecache
1322  */
1323 struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
1324                                            void *holder)
1325 {
1326         struct btrfs_super_block *disk_super;
1327         bool new_device_added = false;
1328         struct btrfs_device *device = NULL;
1329         struct block_device *bdev;
1330         u64 bytenr;
1331
1332         lockdep_assert_held(&uuid_mutex);
1333
1334         /*
1335          * we would like to check all the supers, but that would make
1336          * a btrfs mount succeed after a mkfs from a different FS.
1337          * So, we need to add a special mount option to scan for
1338          * later supers, using BTRFS_SUPER_MIRROR_MAX instead
1339          */
1340         bytenr = btrfs_sb_offset(0);
1341         flags |= FMODE_EXCL;
1342
1343         bdev = blkdev_get_by_path(path, flags, holder);
1344         if (IS_ERR(bdev))
1345                 return ERR_CAST(bdev);
1346
1347         disk_super = btrfs_read_disk_super(bdev, bytenr);
1348         if (IS_ERR(disk_super)) {
1349                 device = ERR_CAST(disk_super);
1350                 goto error_bdev_put;
1351         }
1352
1353         device = device_list_add(path, disk_super, &new_device_added);
1354         if (!IS_ERR(device)) {
1355                 if (new_device_added)
1356                         btrfs_free_stale_devices(path, device);
1357         }
1358
1359         btrfs_release_disk_super(disk_super);
1360
1361 error_bdev_put:
1362         blkdev_put(bdev, flags);
1363
1364         return device;
1365 }
1366
1367 /*
1368  * Try to find a chunk that intersects [start, start + len] range and when one
1369  * such is found, record the end of it in *start
1370  */
1371 static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
1372                                     u64 len)
1373 {
1374         u64 physical_start, physical_end;
1375
1376         lockdep_assert_held(&device->fs_info->chunk_mutex);
1377
1378         if (!find_first_extent_bit(&device->alloc_state, *start,
1379                                    &physical_start, &physical_end,
1380                                    CHUNK_ALLOCATED, NULL)) {
1381
1382                 if (in_range(physical_start, *start, len) ||
1383                     in_range(*start, physical_start,
1384                              physical_end - physical_start)) {
1385                         *start = physical_end + 1;
1386                         return true;
1387                 }
1388         }
1389         return false;
1390 }
1391
1392 static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
1393 {
1394         switch (device->fs_devices->chunk_alloc_policy) {
1395         case BTRFS_CHUNK_ALLOC_REGULAR:
1396                 /*
1397                  * We don't want to overwrite the superblock on the drive nor
1398                  * any area used by the boot loader (grub for example), so we
1399                  * make sure to start at an offset of at least 1MB.
1400                  */
1401                 return max_t(u64, start, SZ_1M);
1402         default:
1403                 BUG();
1404         }
1405 }
1406
1407 /**
1408  * dev_extent_hole_check - check if specified hole is suitable for allocation
1409  * @device:     the device which we have the hole
1410  * @hole_start: starting position of the hole
1411  * @hole_size:  the size of the hole
1412  * @num_bytes:  the size of the free space that we need
1413  *
1414  * This function may modify @hole_start and @hole_end to reflect the suitable
1415  * position for allocation. Returns 1 if hole position is updated, 0 otherwise.
1416  */
1417 static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
1418                                   u64 *hole_size, u64 num_bytes)
1419 {
1420         bool changed = false;
1421         u64 hole_end = *hole_start + *hole_size;
1422
1423         /*
1424          * Check before we set max_hole_start, otherwise we could end up
1425          * sending back this offset anyway.
1426          */
1427         if (contains_pending_extent(device, hole_start, *hole_size)) {
1428                 if (hole_end >= *hole_start)
1429                         *hole_size = hole_end - *hole_start;
1430                 else
1431                         *hole_size = 0;
1432                 changed = true;
1433         }
1434
1435         switch (device->fs_devices->chunk_alloc_policy) {
1436         case BTRFS_CHUNK_ALLOC_REGULAR:
1437                 /* No extra check */
1438                 break;
1439         default:
1440                 BUG();
1441         }
1442
1443         return changed;
1444 }
1445
1446 /*
1447  * find_free_dev_extent_start - find free space in the specified device
1448  * @device:       the device which we search the free space in
1449  * @num_bytes:    the size of the free space that we need
1450  * @search_start: the position from which to begin the search
1451  * @start:        store the start of the free space.
1452  * @len:          the size of the free space. that we find, or the size
1453  *                of the max free space if we don't find suitable free space
1454  *
1455  * this uses a pretty simple search, the expectation is that it is
1456  * called very infrequently and that a given device has a small number
1457  * of extents
1458  *
1459  * @start is used to store the start of the free space if we find. But if we
1460  * don't find suitable free space, it will be used to store the start position
1461  * of the max free space.
1462  *
1463  * @len is used to store the size of the free space that we find.
1464  * But if we don't find suitable free space, it is used to store the size of
1465  * the max free space.
1466  *
1467  * NOTE: This function will search *commit* root of device tree, and does extra
1468  * check to ensure dev extents are not double allocated.
1469  * This makes the function safe to allocate dev extents but may not report
1470  * correct usable device space, as device extent freed in current transaction
1471  * is not reported as avaiable.
1472  */
1473 static int find_free_dev_extent_start(struct btrfs_device *device,
1474                                 u64 num_bytes, u64 search_start, u64 *start,
1475                                 u64 *len)
1476 {
1477         struct btrfs_fs_info *fs_info = device->fs_info;
1478         struct btrfs_root *root = fs_info->dev_root;
1479         struct btrfs_key key;
1480         struct btrfs_dev_extent *dev_extent;
1481         struct btrfs_path *path;
1482         u64 hole_size;
1483         u64 max_hole_start;
1484         u64 max_hole_size;
1485         u64 extent_end;
1486         u64 search_end = device->total_bytes;
1487         int ret;
1488         int slot;
1489         struct extent_buffer *l;
1490
1491         search_start = dev_extent_search_start(device, search_start);
1492
1493         path = btrfs_alloc_path();
1494         if (!path)
1495                 return -ENOMEM;
1496
1497         max_hole_start = search_start;
1498         max_hole_size = 0;
1499
1500 again:
1501         if (search_start >= search_end ||
1502                 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
1503                 ret = -ENOSPC;
1504                 goto out;
1505         }
1506
1507         path->reada = READA_FORWARD;
1508         path->search_commit_root = 1;
1509         path->skip_locking = 1;
1510
1511         key.objectid = device->devid;
1512         key.offset = search_start;
1513         key.type = BTRFS_DEV_EXTENT_KEY;
1514
1515         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1516         if (ret < 0)
1517                 goto out;
1518         if (ret > 0) {
1519                 ret = btrfs_previous_item(root, path, key.objectid, key.type);
1520                 if (ret < 0)
1521                         goto out;
1522         }
1523
1524         while (1) {
1525                 l = path->nodes[0];
1526                 slot = path->slots[0];
1527                 if (slot >= btrfs_header_nritems(l)) {
1528                         ret = btrfs_next_leaf(root, path);
1529                         if (ret == 0)
1530                                 continue;
1531                         if (ret < 0)
1532                                 goto out;
1533
1534                         break;
1535                 }
1536                 btrfs_item_key_to_cpu(l, &key, slot);
1537
1538                 if (key.objectid < device->devid)
1539                         goto next;
1540
1541                 if (key.objectid > device->devid)
1542                         break;
1543
1544                 if (key.type != BTRFS_DEV_EXTENT_KEY)
1545                         goto next;
1546
1547                 if (key.offset > search_start) {
1548                         hole_size = key.offset - search_start;
1549                         dev_extent_hole_check(device, &search_start, &hole_size,
1550                                               num_bytes);
1551
1552                         if (hole_size > max_hole_size) {
1553                                 max_hole_start = search_start;
1554                                 max_hole_size = hole_size;
1555                         }
1556
1557                         /*
1558                          * If this free space is greater than which we need,
1559                          * it must be the max free space that we have found
1560                          * until now, so max_hole_start must point to the start
1561                          * of this free space and the length of this free space
1562                          * is stored in max_hole_size. Thus, we return
1563                          * max_hole_start and max_hole_size and go back to the
1564                          * caller.
1565                          */
1566                         if (hole_size >= num_bytes) {
1567                                 ret = 0;
1568                                 goto out;
1569                         }
1570                 }
1571
1572                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1573                 extent_end = key.offset + btrfs_dev_extent_length(l,
1574                                                                   dev_extent);
1575                 if (extent_end > search_start)
1576                         search_start = extent_end;
1577 next:
1578                 path->slots[0]++;
1579                 cond_resched();
1580         }
1581
1582         /*
1583          * At this point, search_start should be the end of
1584          * allocated dev extents, and when shrinking the device,
1585          * search_end may be smaller than search_start.
1586          */
1587         if (search_end > search_start) {
1588                 hole_size = search_end - search_start;
1589                 if (dev_extent_hole_check(device, &search_start, &hole_size,
1590                                           num_bytes)) {
1591                         btrfs_release_path(path);
1592                         goto again;
1593                 }
1594
1595                 if (hole_size > max_hole_size) {
1596                         max_hole_start = search_start;
1597                         max_hole_size = hole_size;
1598                 }
1599         }
1600
1601         /* See above. */
1602         if (max_hole_size < num_bytes)
1603                 ret = -ENOSPC;
1604         else
1605                 ret = 0;
1606
1607 out:
1608         btrfs_free_path(path);
1609         *start = max_hole_start;
1610         if (len)
1611                 *len = max_hole_size;
1612         return ret;
1613 }
1614
/*
 * Find a free extent of at least @num_bytes on @device, searching from
 * physical offset 0.
 *
 * Thin wrapper around find_free_dev_extent_start(); see that function
 * for the meaning of @start (hole start returned) and optional @len
 * (hole length returned).
 */
int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
			 u64 *start, u64 *len)
{
	/* FIXME use last free of some kind */
	return find_free_dev_extent_start(device, num_bytes, 0, start, len);
}
1621
/*
 * Delete the dev extent item covering physical offset @start on
 * @device and return that extent's length in @dev_extent_len.
 *
 * The initial search may land past the wanted key; in that case we
 * step back to the previous dev extent item (which must contain
 * @start) and redo the search with its exact key before deleting.
 * Returns 0 on success or a negative errno.
 */
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
			  struct btrfs_device *device,
			  u64 start, u64 *dev_extent_len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf = NULL;
	struct btrfs_dev_extent *extent = NULL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
again:
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0) {
		/*
		 * No exact key match: the previous dev extent item must be
		 * the one containing @start.  Pick up its key and retry so
		 * the deletion below works on an exact match.
		 */
		ret = btrfs_previous_item(root, path, key.objectid,
					  BTRFS_DEV_EXTENT_KEY);
		if (ret)
			goto out;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
		/* The located extent must actually cover @start. */
		BUG_ON(found_key.offset > start || found_key.offset +
		       btrfs_dev_extent_length(leaf, extent) < start);
		key = found_key;
		btrfs_release_path(path);
		goto again;
	} else if (ret == 0) {
		leaf = path->nodes[0];
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
	} else {
		btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
		goto out;
	}

	*dev_extent_len = btrfs_dev_extent_length(leaf, extent);

	ret = btrfs_del_item(trans, root, path);
	if (ret) {
		btrfs_handle_fs_error(fs_info, ret,
				      "Failed to remove dev extent item");
	} else {
		set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
	}
out:
	btrfs_free_path(path);
	return ret;
}
1680
1681 static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1682                                   struct btrfs_device *device,
1683                                   u64 chunk_offset, u64 start, u64 num_bytes)
1684 {
1685         int ret;
1686         struct btrfs_path *path;
1687         struct btrfs_fs_info *fs_info = device->fs_info;
1688         struct btrfs_root *root = fs_info->dev_root;
1689         struct btrfs_dev_extent *extent;
1690         struct extent_buffer *leaf;
1691         struct btrfs_key key;
1692
1693         WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
1694         WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
1695         path = btrfs_alloc_path();
1696         if (!path)
1697                 return -ENOMEM;
1698
1699         key.objectid = device->devid;
1700         key.offset = start;
1701         key.type = BTRFS_DEV_EXTENT_KEY;
1702         ret = btrfs_insert_empty_item(trans, root, path, &key,
1703                                       sizeof(*extent));
1704         if (ret)
1705                 goto out;
1706
1707         leaf = path->nodes[0];
1708         extent = btrfs_item_ptr(leaf, path->slots[0],
1709                                 struct btrfs_dev_extent);
1710         btrfs_set_dev_extent_chunk_tree(leaf, extent,
1711                                         BTRFS_CHUNK_TREE_OBJECTID);
1712         btrfs_set_dev_extent_chunk_objectid(leaf, extent,
1713                                             BTRFS_FIRST_CHUNK_TREE_OBJECTID);
1714         btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
1715
1716         btrfs_set_dev_extent_length(leaf, extent, num_bytes);
1717         btrfs_mark_buffer_dirty(leaf);
1718 out:
1719         btrfs_free_path(path);
1720         return ret;
1721 }
1722
1723 static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1724 {
1725         struct extent_map_tree *em_tree;
1726         struct extent_map *em;
1727         struct rb_node *n;
1728         u64 ret = 0;
1729
1730         em_tree = &fs_info->mapping_tree;
1731         read_lock(&em_tree->lock);
1732         n = rb_last(&em_tree->map.rb_root);
1733         if (n) {
1734                 em = rb_entry(n, struct extent_map, rb_node);
1735                 ret = em->start + em->len;
1736         }
1737         read_unlock(&em_tree->lock);
1738
1739         return ret;
1740 }
1741
1742 static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
1743                                     u64 *devid_ret)
1744 {
1745         int ret;
1746         struct btrfs_key key;
1747         struct btrfs_key found_key;
1748         struct btrfs_path *path;
1749
1750         path = btrfs_alloc_path();
1751         if (!path)
1752                 return -ENOMEM;
1753
1754         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1755         key.type = BTRFS_DEV_ITEM_KEY;
1756         key.offset = (u64)-1;
1757
1758         ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
1759         if (ret < 0)
1760                 goto error;
1761
1762         if (ret == 0) {
1763                 /* Corruption */
1764                 btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
1765                 ret = -EUCLEAN;
1766                 goto error;
1767         }
1768
1769         ret = btrfs_previous_item(fs_info->chunk_root, path,
1770                                   BTRFS_DEV_ITEMS_OBJECTID,
1771                                   BTRFS_DEV_ITEM_KEY);
1772         if (ret) {
1773                 *devid_ret = 1;
1774         } else {
1775                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1776                                       path->slots[0]);
1777                 *devid_ret = found_key.offset + 1;
1778         }
1779         ret = 0;
1780 error:
1781         btrfs_free_path(path);
1782         return ret;
1783 }
1784
1785 /*
1786  * the device information is stored in the chunk root
1787  * the btrfs_device struct should be fully filled in
1788  */
1789 static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
1790                             struct btrfs_device *device)
1791 {
1792         int ret;
1793         struct btrfs_path *path;
1794         struct btrfs_dev_item *dev_item;
1795         struct extent_buffer *leaf;
1796         struct btrfs_key key;
1797         unsigned long ptr;
1798
1799         path = btrfs_alloc_path();
1800         if (!path)
1801                 return -ENOMEM;
1802
1803         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1804         key.type = BTRFS_DEV_ITEM_KEY;
1805         key.offset = device->devid;
1806
1807         ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
1808                                       &key, sizeof(*dev_item));
1809         if (ret)
1810                 goto out;
1811
1812         leaf = path->nodes[0];
1813         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1814
1815         btrfs_set_device_id(leaf, dev_item, device->devid);
1816         btrfs_set_device_generation(leaf, dev_item, 0);
1817         btrfs_set_device_type(leaf, dev_item, device->type);
1818         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1819         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1820         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1821         btrfs_set_device_total_bytes(leaf, dev_item,
1822                                      btrfs_device_get_disk_total_bytes(device));
1823         btrfs_set_device_bytes_used(leaf, dev_item,
1824                                     btrfs_device_get_bytes_used(device));
1825         btrfs_set_device_group(leaf, dev_item, 0);
1826         btrfs_set_device_seek_speed(leaf, dev_item, 0);
1827         btrfs_set_device_bandwidth(leaf, dev_item, 0);
1828         btrfs_set_device_start_offset(leaf, dev_item, 0);
1829
1830         ptr = btrfs_device_uuid(dev_item);
1831         write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1832         ptr = btrfs_device_fsid(dev_item);
1833         write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
1834                             ptr, BTRFS_FSID_SIZE);
1835         btrfs_mark_buffer_dirty(leaf);
1836
1837         ret = 0;
1838 out:
1839         btrfs_free_path(path);
1840         return ret;
1841 }
1842
1843 /*
1844  * Function to update ctime/mtime for a given device path.
1845  * Mainly used for ctime/mtime based probe like libblkid.
1846  */
1847 static void update_dev_time(const char *path_name)
1848 {
1849         struct file *filp;
1850
1851         filp = filp_open(path_name, O_RDWR, 0);
1852         if (IS_ERR(filp))
1853                 return;
1854         file_update_time(filp);
1855         filp_close(filp, NULL);
1856 }
1857
/*
 * Delete the dev item of @device from the chunk tree, in its own
 * transaction.  The transaction is committed on success and aborted
 * (and ended) on failure.  Returns 0, -ENOENT when the dev item does
 * not exist, or another negative errno.
 */
static int btrfs_rm_dev_item(struct btrfs_device *device)
{
	struct btrfs_root *root = device->fs_info->chunk_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_trans_handle *trans;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret) {
		/* ret > 0 means the key was not found. */
		if (ret > 0)
			ret = -ENOENT;
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
	}

out:
	btrfs_free_path(path);
	/* Only commit on success; error paths already ended the handle. */
	if (!ret)
		ret = btrfs_commit_transaction(trans);
	return ret;
}
1900
1901 /*
1902  * Verify that @num_devices satisfies the RAID profile constraints in the whole
1903  * filesystem. It's up to the caller to adjust that number regarding eg. device
1904  * replace.
1905  */
1906 static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
1907                 u64 num_devices)
1908 {
1909         u64 all_avail;
1910         unsigned seq;
1911         int i;
1912
1913         do {
1914                 seq = read_seqbegin(&fs_info->profiles_lock);
1915
1916                 all_avail = fs_info->avail_data_alloc_bits |
1917                             fs_info->avail_system_alloc_bits |
1918                             fs_info->avail_metadata_alloc_bits;
1919         } while (read_seqretry(&fs_info->profiles_lock, seq));
1920
1921         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
1922                 if (!(all_avail & btrfs_raid_array[i].bg_flag))
1923                         continue;
1924
1925                 if (num_devices < btrfs_raid_array[i].devs_min) {
1926                         int ret = btrfs_raid_array[i].mindev_error;
1927
1928                         if (ret)
1929                                 return ret;
1930                 }
1931         }
1932
1933         return 0;
1934 }
1935
1936 static struct btrfs_device * btrfs_find_next_active_device(
1937                 struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
1938 {
1939         struct btrfs_device *next_device;
1940
1941         list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
1942                 if (next_device != device &&
1943                     !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
1944                     && next_device->bdev)
1945                         return next_device;
1946         }
1947
1948         return NULL;
1949 }
1950
1951 /*
1952  * Helper function to check if the given device is part of s_bdev / latest_bdev
1953  * and replace it with the provided or the next active device, in the context
1954  * where this function called, there should be always be another device (or
1955  * this_dev) which is active.
1956  */
1957 void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
1958                                             struct btrfs_device *next_device)
1959 {
1960         struct btrfs_fs_info *fs_info = device->fs_info;
1961
1962         if (!next_device)
1963                 next_device = btrfs_find_next_active_device(fs_info->fs_devices,
1964                                                             device);
1965         ASSERT(next_device);
1966
1967         if (fs_info->sb->s_bdev &&
1968                         (fs_info->sb->s_bdev == device->bdev))
1969                 fs_info->sb->s_bdev = next_device->bdev;
1970
1971         if (fs_info->fs_devices->latest_bdev == device->bdev)
1972                 fs_info->fs_devices->latest_bdev = next_device->bdev;
1973 }
1974
1975 /*
1976  * Return btrfs_fs_devices::num_devices excluding the device that's being
1977  * currently replaced.
1978  */
1979 static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
1980 {
1981         u64 num_devices = fs_info->fs_devices->num_devices;
1982
1983         down_read(&fs_info->dev_replace.rwsem);
1984         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
1985                 ASSERT(num_devices > 1);
1986                 num_devices--;
1987         }
1988         up_read(&fs_info->dev_replace.rwsem);
1989
1990         return num_devices;
1991 }
1992
1993 void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
1994                                struct block_device *bdev,
1995                                const char *device_path)
1996 {
1997         struct btrfs_super_block *disk_super;
1998         int copy_num;
1999
2000         if (!bdev)
2001                 return;
2002
2003         for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
2004                 struct page *page;
2005                 int ret;
2006
2007                 disk_super = btrfs_read_dev_one_super(bdev, copy_num);
2008                 if (IS_ERR(disk_super))
2009                         continue;
2010
2011                 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
2012
2013                 page = virt_to_page(disk_super);
2014                 set_page_dirty(page);
2015                 lock_page(page);
2016                 /* write_on_page() unlocks the page */
2017                 ret = write_one_page(page);
2018                 if (ret)
2019                         btrfs_warn(fs_info,
2020                                 "error clearing superblock number %d (%d)",
2021                                 copy_num, ret);
2022                 btrfs_release_disk_super(disk_super);
2023
2024         }
2025
2026         /* Notify udev that device has changed */
2027         btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
2028
2029         /* Update ctime/mtime for device path for libblkid */
2030         update_dev_time(device_path);
2031 }
2032
/*
 * Remove a device (selected by @devid, or by @device_path when @devid
 * is 0) from a mounted filesystem: shrink it to zero size, delete its
 * dev item, drop it from the in-memory device lists and counters, and
 * wipe its superblocks.
 *
 * Returns 0 on success, a negative errno, or one of the positive
 * BTRFS_ERROR_DEV_* codes.
 */
int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
		    u64 devid)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *cur_devices;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	u64 num_devices;
	int ret = 0;

	mutex_lock(&uuid_mutex);

	/* The remaining devices must still satisfy every RAID profile. */
	num_devices = btrfs_num_devices(fs_info);

	ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
	if (ret)
		goto out;

	device = btrfs_find_device_by_devspec(fs_info, devid, device_path);

	if (IS_ERR(device)) {
		if (PTR_ERR(device) == -ENOENT &&
		    strcmp(device_path, "missing") == 0)
			ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
		else
			ret = PTR_ERR(device);
		goto out;
	}

	if (btrfs_pinned_by_swapfile(fs_info, device)) {
		btrfs_warn_in_rcu(fs_info,
		  "cannot remove device %s (devid %llu) due to active swapfile",
				  rcu_str_deref(device->name), device->devid);
		ret = -ETXTBSY;
		goto out;
	}

	/* A replace target cannot be removed directly. */
	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		ret = BTRFS_ERROR_DEV_TGT_REPLACE;
		goto out;
	}

	/* Refuse to remove the only writeable device. */
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    fs_info->fs_devices->rw_devices == 1) {
		ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
		goto out;
	}

	/* Stop new chunk allocations from landing on this device. */
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		mutex_lock(&fs_info->chunk_mutex);
		list_del_init(&device->dev_alloc_list);
		device->fs_devices->rw_devices--;
		mutex_unlock(&fs_info->chunk_mutex);
	}

	/*
	 * uuid_mutex is dropped across the shrink-to-zero and reacquired
	 * before the result is checked; failure takes the undo path.
	 */
	mutex_unlock(&uuid_mutex);
	ret = btrfs_shrink_device(device, 0);
	if (!ret)
		btrfs_reada_remove_dev(device);
	mutex_lock(&uuid_mutex);
	if (ret)
		goto error_undo;

	/*
	 * TODO: the superblock still includes this device in its num_devices
	 * counter although write_all_supers() is not locked out. This
	 * could give a filesystem state which requires a degraded mount.
	 */
	ret = btrfs_rm_dev_item(device);
	if (ret)
		goto error_undo;

	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	btrfs_scrub_cancel_dev(device);

	/*
	 * the device list mutex makes sure that we don't change
	 * the device list while someone else is writing out all
	 * the device supers. Whoever is writing all supers, should
	 * lock the device list mutex before getting the number of
	 * devices in the super block (super_copy). Conversely,
	 * whoever updates the number of devices in the super block
	 * (super_copy) should hold the device list mutex.
	 */

	/*
	 * In normal cases the cur_devices == fs_devices. But in case
	 * of deleting a seed device, the cur_devices should point to
	 * its own fs_devices listed under the fs_devices->seed.
	 */
	cur_devices = device->fs_devices;
	mutex_lock(&fs_devices->device_list_mutex);
	list_del_rcu(&device->dev_list);

	cur_devices->num_devices--;
	cur_devices->total_devices--;
	/* Update total_devices of the parent fs_devices if it's seed */
	if (cur_devices != fs_devices)
		fs_devices->total_devices--;

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
		cur_devices->missing_devices--;

	/* Make sure s_bdev/latest_bdev no longer point at this device. */
	btrfs_assign_next_active_device(device, NULL);

	if (device->bdev) {
		cur_devices->open_devices--;
		/* remove sysfs entry */
		btrfs_sysfs_remove_device(device);
	}

	num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
	btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
	mutex_unlock(&fs_devices->device_list_mutex);

	/*
	 * at this point, the device is zero sized and detached from
	 * the devices list.  All that's left is to zero out the old
	 * supers and free the device.
	 */
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
		btrfs_scratch_superblocks(fs_info, device->bdev,
					  device->name->str);

	btrfs_close_bdev(device);
	synchronize_rcu();
	btrfs_free_device(device);

	/* Removed the last open device of a seed: tear its fs_devices down. */
	if (cur_devices->open_devices == 0) {
		list_del_init(&cur_devices->seed_list);
		close_fs_devices(cur_devices);
		free_fs_devices(cur_devices);
	}

out:
	mutex_unlock(&uuid_mutex);
	return ret;

error_undo:
	/* Put the device back on the allocation list and undo counters. */
	btrfs_reada_undo_remove_dev(device);
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		mutex_lock(&fs_info->chunk_mutex);
		list_add(&device->dev_alloc_list,
			 &fs_devices->alloc_list);
		device->fs_devices->rw_devices++;
		mutex_unlock(&fs_info->chunk_mutex);
	}
	goto out;
}
2181
2182 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
2183 {
2184         struct btrfs_fs_devices *fs_devices;
2185
2186         lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);
2187
2188         /*
2189          * in case of fs with no seed, srcdev->fs_devices will point
2190          * to fs_devices of fs_info. However when the dev being replaced is
2191          * a seed dev it will point to the seed's local fs_devices. In short
2192          * srcdev will have its correct fs_devices in both the cases.
2193          */
2194         fs_devices = srcdev->fs_devices;
2195
2196         list_del_rcu(&srcdev->dev_list);
2197         list_del(&srcdev->dev_alloc_list);
2198         fs_devices->num_devices--;
2199         if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
2200                 fs_devices->missing_devices--;
2201
2202         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
2203                 fs_devices->rw_devices--;
2204
2205         if (srcdev->bdev)
2206                 fs_devices->open_devices--;
2207 }
2208
2209 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
2210 {
2211         struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
2212
2213         mutex_lock(&uuid_mutex);
2214
2215         btrfs_close_bdev(srcdev);
2216         synchronize_rcu();
2217         btrfs_free_device(srcdev);
2218
2219         /* if this is no devs we rather delete the fs_devices */
2220         if (!fs_devices->num_devices) {
2221                 /*
2222                  * On a mounted FS, num_devices can't be zero unless it's a
2223                  * seed. In case of a seed device being replaced, the replace
2224                  * target added to the sprout FS, so there will be no more
2225                  * device left under the seed FS.
2226                  */
2227                 ASSERT(fs_devices->seeding);
2228
2229                 list_del_init(&fs_devices->seed_list);
2230                 close_fs_devices(fs_devices);
2231                 free_fs_devices(fs_devices);
2232         }
2233         mutex_unlock(&uuid_mutex);
2234 }
2235
/*
 * Tear down a replace target device: remove its sysfs entry, drop it
 * from the device list and counters, point s_bdev/latest_bdev away
 * from it, wipe its superblocks and free it.
 */
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
{
	struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;

	mutex_lock(&fs_devices->device_list_mutex);

	btrfs_sysfs_remove_device(tgtdev);

	if (tgtdev->bdev)
		fs_devices->open_devices--;

	fs_devices->num_devices--;

	btrfs_assign_next_active_device(tgtdev, NULL);

	list_del_rcu(&tgtdev->dev_list);

	mutex_unlock(&fs_devices->device_list_mutex);

	/*
	 * The update_dev_time() with in btrfs_scratch_superblocks()
	 * may lead to a call to btrfs_show_devname() which will try
	 * to hold device_list_mutex. And here this device
	 * is already out of device list, so we don't have to hold
	 * the device_list_mutex lock.
	 */
	btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
				  tgtdev->name->str);

	btrfs_close_bdev(tgtdev);
	synchronize_rcu();
	btrfs_free_device(tgtdev);
}
2269
2270 static struct btrfs_device *btrfs_find_device_by_path(
2271                 struct btrfs_fs_info *fs_info, const char *device_path)
2272 {
2273         int ret = 0;
2274         struct btrfs_super_block *disk_super;
2275         u64 devid;
2276         u8 *dev_uuid;
2277         struct block_device *bdev;
2278         struct btrfs_device *device;
2279
2280         ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
2281                                     fs_info->bdev_holder, 0, &bdev, &disk_super);
2282         if (ret)
2283                 return ERR_PTR(ret);
2284
2285         devid = btrfs_stack_device_id(&disk_super->dev_item);
2286         dev_uuid = disk_super->dev_item.uuid;
2287         if (btrfs_fs_incompat(fs_info, METADATA_UUID))
2288                 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2289                                            disk_super->metadata_uuid, true);
2290         else
2291                 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2292                                            disk_super->fsid, true);
2293
2294         btrfs_release_disk_super(disk_super);
2295         if (!device)
2296                 device = ERR_PTR(-ENOENT);
2297         blkdev_put(bdev, FMODE_READ);
2298         return device;
2299 }
2300
2301 /*
2302  * Lookup a device given by device id, or the path if the id is 0.
2303  */
2304 struct btrfs_device *btrfs_find_device_by_devspec(
2305                 struct btrfs_fs_info *fs_info, u64 devid,
2306                 const char *device_path)
2307 {
2308         struct btrfs_device *device;
2309
2310         if (devid) {
2311                 device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
2312                                            NULL, true);
2313                 if (!device)
2314                         return ERR_PTR(-ENOENT);
2315                 return device;
2316         }
2317
2318         if (!device_path || !device_path[0])
2319                 return ERR_PTR(-EINVAL);
2320
2321         if (strcmp(device_path, "missing") == 0) {
2322                 /* Find first missing device */
2323                 list_for_each_entry(device, &fs_info->fs_devices->devices,
2324                                     dev_list) {
2325                         if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
2326                                      &device->dev_state) && !device->bdev)
2327                                 return device;
2328                 }
2329                 return ERR_PTR(-ENOENT);
2330         }
2331
2332         return btrfs_find_device_by_path(fs_info, device_path);
2333 }
2334
/*
 * Does all the dirty work required for changing file system's UUID.
 *
 * The filesystem must currently be seeding (-EINVAL otherwise).  The
 * existing devices are moved onto a private seed fs_devices anchored
 * at fs_info->fs_devices->seed_list, a clone of the original
 * fs_devices is kept on fs_uuids for later seed lookups, a fresh fsid
 * is generated for this filesystem and the SEEDING flag is cleared
 * from the superblock copy.
 */
static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_fs_devices *old_devices;
	struct btrfs_fs_devices *seed_devices;
	struct btrfs_super_block *disk_super = fs_info->super_copy;
	struct btrfs_device *device;
	u64 super_flags;

	lockdep_assert_held(&uuid_mutex);
	if (!fs_devices->seeding)
		return -EINVAL;

	/*
	 * Private copy of the seed devices, anchored at
	 * fs_info->fs_devices->seed_list
	 */
	seed_devices = alloc_fs_devices(NULL, NULL);
	if (IS_ERR(seed_devices))
		return PTR_ERR(seed_devices);

	/*
	 * It's necessary to retain a copy of the original seed fs_devices in
	 * fs_uuids so that filesystems which have been seeded can successfully
	 * reference the seed device from open_seed_devices. This also supports
	 * multiple fs seed.
	 */
	old_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(old_devices)) {
		kfree(seed_devices);
		return PTR_ERR(old_devices);
	}

	list_add(&old_devices->fs_list, &fs_uuids);

	/* Move every current device over to the seed fs_devices. */
	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
	seed_devices->opened = 1;
	INIT_LIST_HEAD(&seed_devices->devices);
	INIT_LIST_HEAD(&seed_devices->alloc_list);
	mutex_init(&seed_devices->device_list_mutex);

	mutex_lock(&fs_devices->device_list_mutex);
	list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
			      synchronize_rcu);
	list_for_each_entry(device, &seed_devices->devices, dev_list)
		device->fs_devices = seed_devices;

	/* The sprout starts out with no devices of its own. */
	fs_devices->seeding = false;
	fs_devices->num_devices = 0;
	fs_devices->open_devices = 0;
	fs_devices->missing_devices = 0;
	fs_devices->rotating = false;
	list_add(&seed_devices->seed_list, &fs_devices->seed_list);

	/* Give the sprout a brand new filesystem identity. */
	generate_random_uuid(fs_devices->fsid);
	memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	mutex_unlock(&fs_devices->device_list_mutex);

	super_flags = btrfs_super_flags(disk_super) &
		      ~BTRFS_SUPER_FLAG_SEEDING;
	btrfs_set_super_flags(disk_super, super_flags);

	return 0;
}
2403
2404 /*
2405  * Store the expected generation for seed devices in device items.
2406  */
2407 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
2408 {
2409         struct btrfs_fs_info *fs_info = trans->fs_info;
2410         struct btrfs_root *root = fs_info->chunk_root;
2411         struct btrfs_path *path;
2412         struct extent_buffer *leaf;
2413         struct btrfs_dev_item *dev_item;
2414         struct btrfs_device *device;
2415         struct btrfs_key key;
2416         u8 fs_uuid[BTRFS_FSID_SIZE];
2417         u8 dev_uuid[BTRFS_UUID_SIZE];
2418         u64 devid;
2419         int ret;
2420
2421         path = btrfs_alloc_path();
2422         if (!path)
2423                 return -ENOMEM;
2424
2425         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2426         key.offset = 0;
2427         key.type = BTRFS_DEV_ITEM_KEY;
2428
2429         while (1) {
2430                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2431                 if (ret < 0)
2432                         goto error;
2433
2434                 leaf = path->nodes[0];
2435 next_slot:
2436                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2437                         ret = btrfs_next_leaf(root, path);
2438                         if (ret > 0)
2439                                 break;
2440                         if (ret < 0)
2441                                 goto error;
2442                         leaf = path->nodes[0];
2443                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2444                         btrfs_release_path(path);
2445                         continue;
2446                 }
2447
2448                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2449                 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
2450                     key.type != BTRFS_DEV_ITEM_KEY)
2451                         break;
2452
2453                 dev_item = btrfs_item_ptr(leaf, path->slots[0],
2454                                           struct btrfs_dev_item);
2455                 devid = btrfs_device_id(leaf, dev_item);
2456                 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
2457                                    BTRFS_UUID_SIZE);
2458                 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
2459                                    BTRFS_FSID_SIZE);
2460                 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2461                                            fs_uuid, true);
2462                 BUG_ON(!device); /* Logic error */
2463
2464                 if (device->fs_devices->seeding) {
2465                         btrfs_set_device_generation(leaf, dev_item,
2466                                                     device->generation);
2467                         btrfs_mark_buffer_dirty(leaf);
2468                 }
2469
2470                 path->slots[0]++;
2471                 goto next_slot;
2472         }
2473         ret = 0;
2474 error:
2475         btrfs_free_path(path);
2476         return ret;
2477 }
2478
/*
 * Add a new device at @device_path to the mounted filesystem @fs_info.
 *
 * Opens the block device exclusively, allocates and initializes a new
 * btrfs_device, publishes it on the in-memory device lists, writes it to
 * the chunk tree and commits the transaction.  If the mounted filesystem
 * is a seed, this also sprouts a new writable filesystem (fresh fsid) on
 * top of the read-only seed and relocates SYSTEM chunks off the seed.
 *
 * Returns 0 on success or a negative errno; on failure all in-memory,
 * sysfs and superblock changes made here are rolled back.
 */
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
{
	struct btrfs_root *root = fs_info->dev_root;
	struct request_queue *q;
	struct btrfs_trans_handle *trans;
	struct btrfs_device *device;
	struct block_device *bdev;
	struct super_block *sb = fs_info->sb;
	struct rcu_string *name;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	u64 orig_super_total_bytes;	/* saved for rollback on error */
	u64 orig_super_num_devices;	/* saved for rollback on error */
	int seeding_dev = 0;
	int ret = 0;
	bool locked = false;

	/* A read-only fs may only gain a device as part of sprouting a seed */
	if (sb_rdonly(sb) && !fs_devices->seeding)
		return -EROFS;

	bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
				  fs_info->bdev_holder);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	if (seeding_dev == 0 && fs_devices->seeding) {
		seeding_dev = 1;
		down_write(&sb->s_umount);
		mutex_lock(&uuid_mutex);
		locked = true;
	}

	sync_blockdev(bdev);

	/* Refuse to add a device that is already part of this filesystem */
	rcu_read_lock();
	list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
		if (device->bdev == bdev) {
			ret = -EEXIST;
			rcu_read_unlock();
			goto error;
		}
	}
	rcu_read_unlock();

	device = btrfs_alloc_device(fs_info, NULL, NULL);
	if (IS_ERR(device)) {
		/* we can safely leave the fs_devices entry around */
		ret = PTR_ERR(device);
		goto error;
	}

	name = rcu_string_strdup(device_path, GFP_KERNEL);
	if (!name) {
		ret = -ENOMEM;
		goto error_free_device;
	}
	rcu_assign_pointer(device->name, name);

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto error_free_device;
	}

	/* Initialize the new device's in-memory state */
	q = bdev_get_queue(bdev);
	set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	device->generation = trans->transid;
	device->io_width = fs_info->sectorsize;
	device->io_align = fs_info->sectorsize;
	device->sector_size = fs_info->sectorsize;
	device->total_bytes = round_down(i_size_read(bdev->bd_inode),
					 fs_info->sectorsize);
	device->disk_total_bytes = device->total_bytes;
	device->commit_total_bytes = device->total_bytes;
	device->fs_info = fs_info;
	device->bdev = bdev;
	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
	device->mode = FMODE_EXCL;
	device->dev_stats_valid = 1;
	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);

	if (seeding_dev) {
		/* Sprout: flip the fs writable and split off the seed devices */
		sb->s_flags &= ~SB_RDONLY;
		ret = btrfs_prepare_sprout(fs_info);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_trans;
		}
	}

	device->fs_devices = fs_devices;

	/* Publish the device on all lists and bump the counters */
	mutex_lock(&fs_devices->device_list_mutex);
	mutex_lock(&fs_info->chunk_mutex);
	list_add_rcu(&device->dev_list, &fs_devices->devices);
	list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
	fs_devices->num_devices++;
	fs_devices->open_devices++;
	fs_devices->rw_devices++;
	fs_devices->total_devices++;
	fs_devices->total_rw_bytes += device->total_bytes;

	atomic64_add(device->total_bytes, &fs_info->free_chunk_space);

	/* One rotational device makes the whole fs "rotating" */
	if (!blk_queue_nonrot(q))
		fs_devices->rotating = true;

	orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
	btrfs_set_super_total_bytes(fs_info->super_copy,
		round_down(orig_super_total_bytes + device->total_bytes,
			   fs_info->sectorsize));

	orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
	btrfs_set_super_num_devices(fs_info->super_copy,
				    orig_super_num_devices + 1);

	/*
	 * we've got more storage, clear any full flags on the space
	 * infos
	 */
	btrfs_clear_space_info_full(fs_info);

	mutex_unlock(&fs_info->chunk_mutex);

	/* Add sysfs device entry */
	btrfs_sysfs_add_device(device);

	mutex_unlock(&fs_devices->device_list_mutex);

	if (seeding_dev) {
		/* The sprout needs its own initial chunks on the new device */
		mutex_lock(&fs_info->chunk_mutex);
		ret = init_first_rw_device(trans);
		mutex_unlock(&fs_info->chunk_mutex);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_sysfs;
		}
	}

	ret = btrfs_add_dev_item(trans, device);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto error_sysfs;
	}

	if (seeding_dev) {
		ret = btrfs_finish_sprout(trans);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_sysfs;
		}

		/*
		 * fs_devices now represents the newly sprouted filesystem and
		 * its fsid has been changed by btrfs_prepare_sprout
		 */
		btrfs_sysfs_update_sprout_fsid(fs_devices);
	}

	ret = btrfs_commit_transaction(trans);

	if (seeding_dev) {
		/* Drop the sprout locks before the (possibly long) relocation */
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);
		locked = false;

		if (ret) /* transaction commit */
			return ret;

		/* Move SYSTEM chunks off the read-only seed device */
		ret = btrfs_relocate_sys_chunks(fs_info);
		if (ret < 0)
			btrfs_handle_fs_error(fs_info, ret,
				    "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
		trans = btrfs_attach_transaction(root);
		if (IS_ERR(trans)) {
			if (PTR_ERR(trans) == -ENOENT)
				return 0;
			ret = PTR_ERR(trans);
			trans = NULL;
			goto error_sysfs;
		}
		ret = btrfs_commit_transaction(trans);
	}

	/*
	 * Now that we have written a new super block to this device, check all
	 * other fs_devices list if device_path alienates any other scanned
	 * device.
	 * We can ignore the return value as it typically returns -EINVAL and
	 * only succeeds if the device was an alien.
	 */
	btrfs_forget_devices(device_path);

	/* Update ctime/mtime for blkid or udev */
	update_dev_time(device_path);

	return ret;

error_sysfs:
	btrfs_sysfs_remove_device(device);
	/* Undo the list insertions, counter bumps and superblock updates */
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	mutex_lock(&fs_info->chunk_mutex);
	list_del_rcu(&device->dev_list);
	list_del(&device->dev_alloc_list);
	fs_info->fs_devices->num_devices--;
	fs_info->fs_devices->open_devices--;
	fs_info->fs_devices->rw_devices--;
	fs_info->fs_devices->total_devices--;
	fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
	atomic64_sub(device->total_bytes, &fs_info->free_chunk_space);
	btrfs_set_super_total_bytes(fs_info->super_copy,
				    orig_super_total_bytes);
	btrfs_set_super_num_devices(fs_info->super_copy,
				    orig_super_num_devices);
	mutex_unlock(&fs_info->chunk_mutex);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
error_trans:
	if (seeding_dev)
		sb->s_flags |= SB_RDONLY;
	if (trans)
		btrfs_end_transaction(trans);
error_free_device:
	btrfs_free_device(device);
error:
	blkdev_put(bdev, FMODE_EXCL);
	if (locked) {
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);
	}
	return ret;
}
2710
2711 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2712                                         struct btrfs_device *device)
2713 {
2714         int ret;
2715         struct btrfs_path *path;
2716         struct btrfs_root *root = device->fs_info->chunk_root;
2717         struct btrfs_dev_item *dev_item;
2718         struct extent_buffer *leaf;
2719         struct btrfs_key key;
2720
2721         path = btrfs_alloc_path();
2722         if (!path)
2723                 return -ENOMEM;
2724
2725         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2726         key.type = BTRFS_DEV_ITEM_KEY;
2727         key.offset = device->devid;
2728
2729         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2730         if (ret < 0)
2731                 goto out;
2732
2733         if (ret > 0) {
2734                 ret = -ENOENT;
2735                 goto out;
2736         }
2737
2738         leaf = path->nodes[0];
2739         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2740
2741         btrfs_set_device_id(leaf, dev_item, device->devid);
2742         btrfs_set_device_type(leaf, dev_item, device->type);
2743         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2744         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2745         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2746         btrfs_set_device_total_bytes(leaf, dev_item,
2747                                      btrfs_device_get_disk_total_bytes(device));
2748         btrfs_set_device_bytes_used(leaf, dev_item,
2749                                     btrfs_device_get_bytes_used(device));
2750         btrfs_mark_buffer_dirty(leaf);
2751
2752 out:
2753         btrfs_free_path(path);
2754         return ret;
2755 }
2756
2757 int btrfs_grow_device(struct btrfs_trans_handle *trans,
2758                       struct btrfs_device *device, u64 new_size)
2759 {
2760         struct btrfs_fs_info *fs_info = device->fs_info;
2761         struct btrfs_super_block *super_copy = fs_info->super_copy;
2762         u64 old_total;
2763         u64 diff;
2764
2765         if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2766                 return -EACCES;
2767
2768         new_size = round_down(new_size, fs_info->sectorsize);
2769
2770         mutex_lock(&fs_info->chunk_mutex);
2771         old_total = btrfs_super_total_bytes(super_copy);
2772         diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
2773
2774         if (new_size <= device->total_bytes ||
2775             test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2776                 mutex_unlock(&fs_info->chunk_mutex);
2777                 return -EINVAL;
2778         }
2779
2780         btrfs_set_super_total_bytes(super_copy,
2781                         round_down(old_total + diff, fs_info->sectorsize));
2782         device->fs_devices->total_rw_bytes += diff;
2783
2784         btrfs_device_set_total_bytes(device, new_size);
2785         btrfs_device_set_disk_total_bytes(device, new_size);
2786         btrfs_clear_space_info_full(device->fs_info);
2787         if (list_empty(&device->post_commit_list))
2788                 list_add_tail(&device->post_commit_list,
2789                               &trans->transaction->dev_update_list);
2790         mutex_unlock(&fs_info->chunk_mutex);
2791
2792         return btrfs_update_device(trans, device);
2793 }
2794
2795 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
2796 {
2797         struct btrfs_fs_info *fs_info = trans->fs_info;
2798         struct btrfs_root *root = fs_info->chunk_root;
2799         int ret;
2800         struct btrfs_path *path;
2801         struct btrfs_key key;
2802
2803         path = btrfs_alloc_path();
2804         if (!path)
2805                 return -ENOMEM;
2806
2807         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2808         key.offset = chunk_offset;
2809         key.type = BTRFS_CHUNK_ITEM_KEY;
2810
2811         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2812         if (ret < 0)
2813                 goto out;
2814         else if (ret > 0) { /* Logic error or corruption */
2815                 btrfs_handle_fs_error(fs_info, -ENOENT,
2816                                       "Failed lookup while freeing chunk.");
2817                 ret = -ENOENT;
2818                 goto out;
2819         }
2820
2821         ret = btrfs_del_item(trans, root, path);
2822         if (ret < 0)
2823                 btrfs_handle_fs_error(fs_info, ret,
2824                                       "Failed to delete chunk item.");
2825 out:
2826         btrfs_free_path(path);
2827         return ret;
2828 }
2829
/*
 * Remove the chunk at @chunk_offset from the superblock's sys_chunk_array
 * (the packed copy of SYSTEM chunks embedded in the super block).
 *
 * Returns 0 on success (also when the offset is not present), or -EIO if
 * the array contains a non-chunk key (corruption).
 */
static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct btrfs_super_block *super_copy = fs_info->super_copy;
	struct btrfs_disk_key *disk_key;
	struct btrfs_chunk *chunk;
	u8 *ptr;
	int ret = 0;
	u32 num_stripes;
	u32 array_size;
	u32 len = 0;
	u32 cur;
	struct btrfs_key key;

	mutex_lock(&fs_info->chunk_mutex);
	array_size = btrfs_super_sys_array_size(super_copy);

	ptr = super_copy->sys_chunk_array;
	cur = 0;

	/* The array is a packed sequence of (disk key, chunk item) pairs */
	while (cur < array_size) {
		disk_key = (struct btrfs_disk_key *)ptr;
		btrfs_disk_key_to_cpu(&key, disk_key);

		len = sizeof(*disk_key);

		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
			chunk = (struct btrfs_chunk *)(ptr + len);
			/* Entry length depends on the chunk's stripe count */
			num_stripes = btrfs_stack_chunk_num_stripes(chunk);
			len += btrfs_chunk_item_size(num_stripes);
		} else {
			/* Only chunk items may live in sys_chunk_array */
			ret = -EIO;
			break;
		}
		if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
		    key.offset == chunk_offset) {
			/*
			 * Shift the array tail over the removed entry.  Do
			 * NOT advance ptr/cur here: the next entry has just
			 * been moved into the current position.
			 */
			memmove(ptr, ptr + len, array_size - (cur + len));
			array_size -= len;
			btrfs_set_super_sys_array_size(super_copy, array_size);
		} else {
			ptr += len;
			cur += len;
		}
	}
	mutex_unlock(&fs_info->chunk_mutex);
	return ret;
}
2876
2877 /*
2878  * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
2879  * @logical: Logical block offset in bytes.
2880  * @length: Length of extent in bytes.
2881  *
2882  * Return: Chunk mapping or ERR_PTR.
2883  */
2884 struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
2885                                        u64 logical, u64 length)
2886 {
2887         struct extent_map_tree *em_tree;
2888         struct extent_map *em;
2889
2890         em_tree = &fs_info->mapping_tree;
2891         read_lock(&em_tree->lock);
2892         em = lookup_extent_mapping(em_tree, logical, length);
2893         read_unlock(&em_tree->lock);
2894
2895         if (!em) {
2896                 btrfs_crit(fs_info, "unable to find logical %llu length %llu",
2897                            logical, length);
2898                 return ERR_PTR(-EINVAL);
2899         }
2900
2901         if (em->start > logical || em->start + em->len < logical) {
2902                 btrfs_crit(fs_info,
2903                            "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
2904                            logical, length, em->start, em->start + em->len);
2905                 free_extent_map(em);
2906                 return ERR_PTR(-EINVAL);
2907         }
2908
2909         /* callers are responsible for dropping em's ref. */
2910         return em;
2911 }
2912
/*
 * Remove the (already relocated, empty) chunk at @chunk_offset: free its
 * device extents, delete the chunk item (plus its sys_chunk_array copy for
 * SYSTEM chunks) and remove the block group.
 *
 * Returns 0 or a negative errno; any failure aborts the transaction.
 */
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct extent_map *em;
	struct map_lookup *map;
	u64 dev_extent_len = 0;
	int i, ret = 0;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;

	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
	if (IS_ERR(em)) {
		/*
		 * This is a logic error, but we don't want to just rely on the
		 * user having built with ASSERT enabled, so if ASSERT doesn't
		 * do anything we still error out.
		 */
		ASSERT(0);
		return PTR_ERR(em);
	}
	map = em->map_lookup;
	/* Make sure system space can accommodate the chunk tree changes */
	mutex_lock(&fs_info->chunk_mutex);
	check_system_chunk(trans, map->type);
	mutex_unlock(&fs_info->chunk_mutex);

	/*
	 * Take the device list mutex to prevent races with the final phase of
	 * a device replace operation that replaces the device object associated
	 * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
	 */
	mutex_lock(&fs_devices->device_list_mutex);
	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *device = map->stripes[i].dev;
		/* Drop the dev extent backing this stripe */
		ret = btrfs_free_dev_extent(trans, device,
					    map->stripes[i].physical,
					    &dev_extent_len);
		if (ret) {
			mutex_unlock(&fs_devices->device_list_mutex);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		if (device->bytes_used > 0) {
			/* Return the freed space to the device accounting */
			mutex_lock(&fs_info->chunk_mutex);
			btrfs_device_set_bytes_used(device,
					device->bytes_used - dev_extent_len);
			atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
			btrfs_clear_space_info_full(fs_info);
			mutex_unlock(&fs_info->chunk_mutex);
		}

		ret = btrfs_update_device(trans, device);
		if (ret) {
			mutex_unlock(&fs_devices->device_list_mutex);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	ret = btrfs_free_chunk(trans, chunk_offset);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);

	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
		/* SYSTEM chunks are duplicated in the superblock array */
		ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	}

	ret = btrfs_remove_block_group(trans, chunk_offset, em);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

out:
	/* once for us */
	free_extent_map(em);
	return ret;
}
2999
/*
 * Relocate all data out of the chunk at @chunk_offset, then delete the
 * now-empty chunk.  Caller must hold fs_info->delete_unused_bgs_mutex.
 *
 * Returns 0 on success, -ENOENT if the block group vanished, or another
 * negative errno.
 */
static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_block_group *block_group;
	int ret;

	/*
	 * Prevent races with automatic removal of unused block groups.
	 * After we relocate and before we remove the chunk with offset
	 * chunk_offset, automatic removal of the block group can kick in,
	 * resulting in a failure when calling btrfs_remove_chunk() below.
	 *
	 * Make sure to acquire this mutex before doing a tree search (dev
	 * or chunk trees) to find chunks. Otherwise the cleaner kthread might
	 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
	 * we release the path used to search the chunk/dev tree and before
	 * the current task acquires this mutex and calls us.
	 */
	lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);

	/* step one, relocate all the extents inside this chunk */
	btrfs_scrub_pause(fs_info);
	ret = btrfs_relocate_block_group(fs_info, chunk_offset);
	btrfs_scrub_continue(fs_info);
	if (ret)
		return ret;

	/* Cancel pending discard work before the block group goes away */
	block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
	if (!block_group)
		return -ENOENT;
	btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
	btrfs_put_block_group(block_group);

	trans = btrfs_start_trans_remove_block_group(root->fs_info,
						     chunk_offset);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		btrfs_handle_fs_error(root->fs_info, ret, NULL);
		return ret;
	}

	/*
	 * step two, delete the device extents and the
	 * chunk tree entries
	 */
	ret = btrfs_remove_chunk(trans, chunk_offset);
	btrfs_end_transaction(trans);
	return ret;
}
3050
/*
 * Relocate every SYSTEM chunk in the filesystem (used after sprouting a
 * seed so no SYSTEM chunks remain on the read-only seed device).
 *
 * Walks the chunk tree from the highest offset downwards.  Chunks that
 * fail with -ENOSPC are retried in one additional full pass, since
 * relocating other chunks may free the needed space.
 *
 * Returns 0 on success, -ENOSPC if chunks still fail after the retry, or
 * another negative errno.
 */
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_chunk *chunk;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u64 chunk_type;
	bool retried = false;
	int failed = 0;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

again:
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	while (1) {
		/*
		 * Must be held before searching the tree; see the comment in
		 * btrfs_relocate_chunk().  Dropped again before each search.
		 */
		mutex_lock(&fs_info->delete_unused_bgs_mutex);
		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
		if (ret < 0) {
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
			goto error;
		}
		BUG_ON(ret == 0); /* Corruption */

		ret = btrfs_previous_item(chunk_root, path, key.objectid,
					  key.type);
		if (ret)
			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
		if (ret < 0)
			goto error;
		if (ret > 0)
			break;

		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		chunk = btrfs_item_ptr(leaf, path->slots[0],
				       struct btrfs_chunk);
		chunk_type = btrfs_chunk_type(leaf, chunk);
		/* Release the path; relocation takes its own tree paths */
		btrfs_release_path(path);

		if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
			ret = btrfs_relocate_chunk(fs_info, found_key.offset);
			if (ret == -ENOSPC)
				failed++;
			else
				BUG_ON(ret);
		}
		mutex_unlock(&fs_info->delete_unused_bgs_mutex);

		if (found_key.offset == 0)
			break;
		/* Continue the scan just below the chunk we processed */
		key.offset = found_key.offset - 1;
	}
	ret = 0;
	if (failed && !retried) {
		/* One more pass: other relocations may have freed space */
		failed = 0;
		retried = true;
		goto again;
	} else if (WARN_ON(failed && retried)) {
		ret = -ENOSPC;
	}
error:
	btrfs_free_path(path);
	return ret;
}
3124
3125 /*
3126  * return 1 : allocate a data chunk successfully,
3127  * return <0: errors during allocating a data chunk,
3128  * return 0 : no need to allocate a data chunk.
3129  */
3130 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
3131                                       u64 chunk_offset)
3132 {
3133         struct btrfs_block_group *cache;
3134         u64 bytes_used;
3135         u64 chunk_type;
3136
3137         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3138         ASSERT(cache);
3139         chunk_type = cache->flags;
3140         btrfs_put_block_group(cache);
3141
3142         if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA))
3143                 return 0;
3144
3145         spin_lock(&fs_info->data_sinfo->lock);
3146         bytes_used = fs_info->data_sinfo->bytes_used;
3147         spin_unlock(&fs_info->data_sinfo->lock);
3148
3149         if (!bytes_used) {
3150                 struct btrfs_trans_handle *trans;
3151                 int ret;
3152
3153                 trans = btrfs_join_transaction(fs_info->tree_root);
3154                 if (IS_ERR(trans))
3155                         return PTR_ERR(trans);
3156
3157                 ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
3158                 btrfs_end_transaction(trans);
3159                 if (ret < 0)
3160                         return ret;
3161                 return 1;
3162         }
3163
3164         return 0;
3165 }
3166
/*
 * Write @bctl to the balance item (BTRFS_BALANCE_OBJECTID /
 * BTRFS_TEMPORARY_ITEM_KEY) in the tree root, converting the per-type
 * balance args to their on-disk representation.
 *
 * Returns 0 on success or a negative errno (from item insertion or the
 * transaction commit) on failure.
 */
static int insert_balance_item(struct btrfs_fs_info *fs_info,
			       struct btrfs_balance_control *bctl)
{
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_balance_item *item;
	struct btrfs_disk_balance_args disk_bargs;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	int ret, err;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

	/* There is at most one balance item, always at this fixed key. */
	key.objectid = BTRFS_BALANCE_OBJECTID;
	key.type = BTRFS_TEMPORARY_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*item));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);

	memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));

	/* Convert each of the three per-type args to the disk format. */
	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
	btrfs_set_balance_data(leaf, item, &disk_bargs);
	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
	btrfs_set_balance_meta(leaf, item, &disk_bargs);
	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
	btrfs_set_balance_sys(leaf, item, &disk_bargs);

	btrfs_set_balance_flags(leaf, item, bctl->flags);

	btrfs_mark_buffer_dirty(leaf);
out:
	btrfs_free_path(path);
	/* Commit even after an error; the first error (if any) wins. */
	err = btrfs_commit_transaction(trans);
	if (err && !ret)
		ret = err;
	return ret;
}
3220
/*
 * Delete the balance item (BTRFS_BALANCE_OBJECTID /
 * BTRFS_TEMPORARY_ITEM_KEY) from the tree root.
 *
 * Returns 0 on success, -ENOENT if the item does not exist, or another
 * negative errno from the search/delete/commit.
 */
static int del_balance_item(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret, err;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

	key.objectid = BTRFS_BALANCE_OBJECTID;
	key.type = BTRFS_TEMPORARY_ITEM_KEY;
	key.offset = 0;

	/* -1 for deletion, 1 to lock the path for the modification. */
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		/* Positive means the key was not found. */
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
out:
	btrfs_free_path(path);
	/* Commit even after an error; the first error (if any) wins. */
	err = btrfs_commit_transaction(trans);
	if (err && !ret)
		ret = err;
	return ret;
}
3259
3260 /*
3261  * This is a heuristic used to reduce the number of chunks balanced on
3262  * resume after balance was interrupted.
3263  */
3264 static void update_balance_args(struct btrfs_balance_control *bctl)
3265 {
3266         /*
3267          * Turn on soft mode for chunk types that were being converted.
3268          */
3269         if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
3270                 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
3271         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
3272                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
3273         if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
3274                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
3275
3276         /*
3277          * Turn on usage filter if is not already used.  The idea is
3278          * that chunks that we have already balanced should be
3279          * reasonably full.  Don't do it for chunks that are being
3280          * converted - that will keep us from relocating unconverted
3281          * (albeit full) chunks.
3282          */
3283         if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3284             !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3285             !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3286                 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
3287                 bctl->data.usage = 90;
3288         }
3289         if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3290             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3291             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3292                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
3293                 bctl->sys.usage = 90;
3294         }
3295         if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3296             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3297             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3298                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
3299                 bctl->meta.usage = 90;
3300         }
3301 }
3302
/*
 * Clear the balance status in fs_info and delete the balance item from disk.
 */
static void reset_balance_state(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	int ret;

	BUG_ON(!fs_info->balance_ctl);

	/* Detach the control under the lock so readers see NULL atomically. */
	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl = NULL;
	spin_unlock(&fs_info->balance_lock);

	/* Safe to free now: no new reader can reach bctl via fs_info. */
	kfree(bctl);
	ret = del_balance_item(fs_info);
	if (ret)
		btrfs_handle_fs_error(fs_info, ret, NULL);
}
3322
3323 /*
3324  * Balance filters.  Return 1 if chunk should be filtered out
3325  * (should not be balanced).
3326  */
3327 static int chunk_profiles_filter(u64 chunk_type,
3328                                  struct btrfs_balance_args *bargs)
3329 {
3330         chunk_type = chunk_to_extended(chunk_type) &
3331                                 BTRFS_EXTENDED_PROFILE_MASK;
3332
3333         if (bargs->profiles & chunk_type)
3334                 return 0;
3335
3336         return 1;
3337 }
3338
3339 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
3340                               struct btrfs_balance_args *bargs)
3341 {
3342         struct btrfs_block_group *cache;
3343         u64 chunk_used;
3344         u64 user_thresh_min;
3345         u64 user_thresh_max;
3346         int ret = 1;
3347
3348         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3349         chunk_used = cache->used;
3350
3351         if (bargs->usage_min == 0)
3352                 user_thresh_min = 0;
3353         else
3354                 user_thresh_min = div_factor_fine(cache->length,
3355                                                   bargs->usage_min);
3356
3357         if (bargs->usage_max == 0)
3358                 user_thresh_max = 1;
3359         else if (bargs->usage_max > 100)
3360                 user_thresh_max = cache->length;
3361         else
3362                 user_thresh_max = div_factor_fine(cache->length,
3363                                                   bargs->usage_max);
3364
3365         if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
3366                 ret = 0;
3367
3368         btrfs_put_block_group(cache);
3369         return ret;
3370 }
3371
/*
 * Single-value usage filter: filter out (return 1) chunks whose used byte
 * count is at or above the threshold derived from bargs->usage, keep
 * (return 0) chunks below it.
 *
 * NOTE(review): the zero check reads bargs->usage_min while the threshold
 * uses bargs->usage; the single value and the min/max range apparently
 * share the same bytes (see the "same bytes" comment in __btrfs_balance) -
 * confirm before "fixing" this asymmetry.
 */
static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
		u64 chunk_offset, struct btrfs_balance_args *bargs)
{
	struct btrfs_block_group *cache;
	u64 chunk_used, user_thresh;
	int ret = 1;

	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
	chunk_used = cache->used;

	/* usage == 0 matches only completely empty chunks (used < 1). */
	if (bargs->usage_min == 0)
		user_thresh = 1;
	else if (bargs->usage > 100)
		user_thresh = cache->length;
	else
		user_thresh = div_factor_fine(cache->length, bargs->usage);

	if (chunk_used < user_thresh)
		ret = 0;

	btrfs_put_block_group(cache);
	return ret;
}
3395
3396 static int chunk_devid_filter(struct extent_buffer *leaf,
3397                               struct btrfs_chunk *chunk,
3398                               struct btrfs_balance_args *bargs)
3399 {
3400         struct btrfs_stripe *stripe;
3401         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3402         int i;
3403
3404         for (i = 0; i < num_stripes; i++) {
3405                 stripe = btrfs_stripe_nr(chunk, i);
3406                 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
3407                         return 0;
3408         }
3409
3410         return 1;
3411 }
3412
3413 static u64 calc_data_stripes(u64 type, int num_stripes)
3414 {
3415         const int index = btrfs_bg_flags_to_raid_index(type);
3416         const int ncopies = btrfs_raid_array[index].ncopies;
3417         const int nparity = btrfs_raid_array[index].nparity;
3418
3419         if (nparity)
3420                 return num_stripes - nparity;
3421         else
3422                 return num_stripes / ncopies;
3423 }
3424
3425 /* [pstart, pend) */
3426 static int chunk_drange_filter(struct extent_buffer *leaf,
3427                                struct btrfs_chunk *chunk,
3428                                struct btrfs_balance_args *bargs)
3429 {
3430         struct btrfs_stripe *stripe;
3431         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3432         u64 stripe_offset;
3433         u64 stripe_length;
3434         u64 type;
3435         int factor;
3436         int i;
3437
3438         if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
3439                 return 0;
3440
3441         type = btrfs_chunk_type(leaf, chunk);
3442         factor = calc_data_stripes(type, num_stripes);
3443
3444         for (i = 0; i < num_stripes; i++) {
3445                 stripe = btrfs_stripe_nr(chunk, i);
3446                 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
3447                         continue;
3448
3449                 stripe_offset = btrfs_stripe_offset(leaf, stripe);
3450                 stripe_length = btrfs_chunk_length(leaf, chunk);
3451                 stripe_length = div_u64(stripe_length, factor);
3452
3453                 if (stripe_offset < bargs->pend &&
3454                     stripe_offset + stripe_length > bargs->pstart)
3455                         return 0;
3456         }
3457
3458         return 1;
3459 }
3460
3461 /* [vstart, vend) */
3462 static int chunk_vrange_filter(struct extent_buffer *leaf,
3463                                struct btrfs_chunk *chunk,
3464                                u64 chunk_offset,
3465                                struct btrfs_balance_args *bargs)
3466 {
3467         if (chunk_offset < bargs->vend &&
3468             chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
3469                 /* at least part of the chunk is inside this vrange */
3470                 return 0;
3471
3472         return 1;
3473 }
3474
3475 static int chunk_stripes_range_filter(struct extent_buffer *leaf,
3476                                struct btrfs_chunk *chunk,
3477                                struct btrfs_balance_args *bargs)
3478 {
3479         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3480
3481         if (bargs->stripes_min <= num_stripes
3482                         && num_stripes <= bargs->stripes_max)
3483                 return 0;
3484
3485         return 1;
3486 }
3487
3488 static int chunk_soft_convert_filter(u64 chunk_type,
3489                                      struct btrfs_balance_args *bargs)
3490 {
3491         if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3492                 return 0;
3493
3494         chunk_type = chunk_to_extended(chunk_type) &
3495                                 BTRFS_EXTENDED_PROFILE_MASK;
3496
3497         if (bargs->target == chunk_type)
3498                 return 1;
3499
3500         return 0;
3501 }
3502
/*
 * Run all enabled balance filters against one chunk.
 *
 * Returns 1 if the chunk should be relocated, 0 if it is filtered out.
 *
 * Note: the limit filters at the end decrement counters inside the balance
 * args as a side effect, so this function is stateful across calls.
 */
static int should_balance_chunk(struct extent_buffer *leaf,
				struct btrfs_chunk *chunk, u64 chunk_offset)
{
	struct btrfs_fs_info *fs_info = leaf->fs_info;
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	struct btrfs_balance_args *bargs = NULL;
	u64 chunk_type = btrfs_chunk_type(leaf, chunk);

	/* type filter */
	if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
	      (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
		return 0;
	}

	/*
	 * Pick the args for this chunk's type; the type filter above
	 * guarantees one of the three branches matches.
	 */
	if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
		bargs = &bctl->data;
	else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
		bargs = &bctl->sys;
	else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
		bargs = &bctl->meta;

	/* profiles filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
	    chunk_profiles_filter(chunk_type, bargs)) {
		return 0;
	}

	/* usage filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    chunk_usage_filter(fs_info, chunk_offset, bargs)) {
		return 0;
	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
		return 0;
	}

	/* devid filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
	    chunk_devid_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* drange filter, makes sense only with devid filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
	    chunk_drange_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* vrange filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
	    chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
		return 0;
	}

	/* stripes filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
	    chunk_stripes_range_filter(leaf, chunk, bargs)) {
		return 0;
	}

	/* soft profile changing mode */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
	    chunk_soft_convert_filter(chunk_type, bargs)) {
		return 0;
	}

	/*
	 * limited by count, must be the last filter
	 */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
		if (bargs->limit == 0)
			return 0;
		else
			bargs->limit--;
	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
		/*
		 * Same logic as the 'limit' filter; the minimum cannot be
		 * determined here because we do not have the global information
		 * about the count of all chunks that satisfy the filters.
		 */
		if (bargs->limit_max == 0)
			return 0;
		else
			bargs->limit_max--;
	}

	return 1;
}
3591
3592 static int __btrfs_balance(struct btrfs_fs_info *fs_info)
3593 {
3594         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3595         struct btrfs_root *chunk_root = fs_info->chunk_root;
3596         u64 chunk_type;
3597         struct btrfs_chunk *chunk;
3598         struct btrfs_path *path = NULL;
3599         struct btrfs_key key;
3600         struct btrfs_key found_key;
3601         struct extent_buffer *leaf;
3602         int slot;
3603         int ret;
3604         int enospc_errors = 0;
3605         bool counting = true;
3606         /* The single value limit and min/max limits use the same bytes in the */
3607         u64 limit_data = bctl->data.limit;
3608         u64 limit_meta = bctl->meta.limit;
3609         u64 limit_sys = bctl->sys.limit;
3610         u32 count_data = 0;
3611         u32 count_meta = 0;
3612         u32 count_sys = 0;
3613         int chunk_reserved = 0;
3614
3615         path = btrfs_alloc_path();
3616         if (!path) {
3617                 ret = -ENOMEM;
3618                 goto error;
3619         }
3620
3621         /* zero out stat counters */
3622         spin_lock(&fs_info->balance_lock);
3623         memset(&bctl->stat, 0, sizeof(bctl->stat));
3624         spin_unlock(&fs_info->balance_lock);
3625 again:
3626         if (!counting) {
3627                 /*
3628                  * The single value limit and min/max limits use the same bytes
3629                  * in the
3630                  */
3631                 bctl->data.limit = limit_data;
3632                 bctl->meta.limit = limit_meta;
3633                 bctl->sys.limit = limit_sys;
3634         }
3635         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3636         key.offset = (u64)-1;
3637         key.type = BTRFS_CHUNK_ITEM_KEY;
3638
3639         while (1) {
3640                 if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
3641                     atomic_read(&fs_info->balance_cancel_req)) {
3642                         ret = -ECANCELED;
3643                         goto error;
3644  &nbs