// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
#include "misc.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "sysfs.h"
#include "tree-checker.h"
#include "space-info.h"
#include "block-group.h"
#include "discard.h"
#include "zoned.h"

const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
        [BTRFS_RAID_RAID10] = {
                .sub_stripes    = 2,
                .dev_stripes    = 1,
                .devs_max       = 0,    /* 0 == as many as possible */
                .devs_min       = 4,
                .tolerated_failures = 1,
                .devs_increment = 2,
                .ncopies        = 2,
                .nparity        = 0,
                .raid_name      = "raid10",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID10,
                .mindev_error   = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
        },
        [BTRFS_RAID_RAID1] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 2,
                .devs_min       = 2,
                .tolerated_failures = 1,
                .devs_increment = 2,
                .ncopies        = 2,
                .nparity        = 0,
                .raid_name      = "raid1",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID1,
                .mindev_error   = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
        },
        [BTRFS_RAID_RAID1C3] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 3,
                .devs_min       = 3,
                .tolerated_failures = 2,
                .devs_increment = 3,
                .ncopies        = 3,
                .nparity        = 0,
                .raid_name      = "raid1c3",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID1C3,
                .mindev_error   = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
        },
        [BTRFS_RAID_RAID1C4] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 4,
                .devs_min       = 4,
                .tolerated_failures = 3,
                .devs_increment = 4,
                .ncopies        = 4,
                .nparity        = 0,
                .raid_name      = "raid1c4",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID1C4,
                .mindev_error   = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
        },
        [BTRFS_RAID_DUP] = {
                .sub_stripes    = 1,
                .dev_stripes    = 2,
                .devs_max       = 1,
                .devs_min       = 1,
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 2,
                .nparity        = 0,
                .raid_name      = "dup",
                .bg_flag        = BTRFS_BLOCK_GROUP_DUP,
                .mindev_error   = 0,
        },
        [BTRFS_RAID_RAID0] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 0,
                .devs_min       = 2,
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 1,
                .nparity        = 0,
                .raid_name      = "raid0",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID0,
                .mindev_error   = 0,
        },
        [BTRFS_RAID_SINGLE] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 1,
                .devs_min       = 1,
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 1,
                .nparity        = 0,
                .raid_name      = "single",
                .bg_flag        = 0,
                .mindev_error   = 0,
        },
        [BTRFS_RAID_RAID5] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 0,
                .devs_min       = 2,
                .tolerated_failures = 1,
                .devs_increment = 1,
                .ncopies        = 1,
                .nparity        = 1,
                .raid_name      = "raid5",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID5,
                .mindev_error   = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
        },
        [BTRFS_RAID_RAID6] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 0,
                .devs_min       = 3,
                .tolerated_failures = 2,
                .devs_increment = 1,
                .ncopies        = 1,
                .nparity        = 2,
                .raid_name      = "raid6",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID6,
                .mindev_error   = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
        },
};
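
/*
 * Editorial worked example (not kernel code): the attributes above are
 * enough to estimate the usable fraction of raw space for a profile.  For
 * a chunk striped across N devices, nparity devices' worth of stripes hold
 * parity and every logical byte is stored ncopies times, so roughly:
 *
 *	usable = raw_size * (N - nparity) / (N * ncopies)
 *
 * E.g. raid6 (ncopies=1, nparity=2) across 6 devices keeps 4/6 of the raw
 * space, while raid1c3 (ncopies=3, nparity=0) keeps 1/3 regardless of N.
 */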

const char *btrfs_bg_type_to_raid_name(u64 flags)
{
        const int index = btrfs_bg_flags_to_raid_index(flags);

        if (index >= BTRFS_NR_RAID_TYPES)
                return NULL;

        return btrfs_raid_array[index].raid_name;
}

/*
 * Fill @buf with textual description of @bg_flags, no more than @size_buf
 * bytes including terminating null byte.
 */
void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
{
        int i;
        int ret;
        char *bp = buf;
        u64 flags = bg_flags;
        u32 size_bp = size_buf;

        if (!flags) {
                strcpy(bp, "NONE");
                return;
        }

#define DESCRIBE_FLAG(flag, desc)                                       \
        do {                                                            \
                if (flags & (flag)) {                                   \
                        ret = snprintf(bp, size_bp, "%s|", (desc));     \
                        if (ret < 0 || ret >= size_bp)                  \
                                goto out_overflow;                      \
                        size_bp -= ret;                                 \
                        bp += ret;                                      \
                        flags &= ~(flag);                               \
                }                                                       \
        } while (0)

        DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
        DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
        DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");

        DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
        for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
                DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
                              btrfs_raid_array[i].raid_name);
#undef DESCRIBE_FLAG

        if (flags) {
                ret = snprintf(bp, size_bp, "0x%llx|", flags);
                size_bp -= ret;
        }

        if (size_bp < size_buf)
                buf[size_buf - size_bp - 1] = '\0'; /* remove last | */
        /*
         * The text is trimmed, it's up to the caller to provide a
         * sufficiently large buffer.
         */
out_overflow:;
}
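
/*
 * Illustrative usage (hypothetical values, not an actual call site): the
 * known flags are emitted '|'-separated and any leftover bits are printed
 * as a trailing hex value:
 *
 *	char buf[64];
 *
 *	btrfs_describe_block_groups(BTRFS_BLOCK_GROUP_DATA |
 *				    BTRFS_BLOCK_GROUP_RAID1, buf, sizeof(buf));
 *	// buf now holds "data|raid1"; an unknown bit such as (1ULL << 40)
 *	// would instead appear as "data|raid1|0x10000000000"
 */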

static int init_first_rw_device(struct btrfs_trans_handle *trans);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
                             enum btrfs_map_op op,
                             u64 logical, u64 *length,
                             struct btrfs_bio **bbio_ret,
                             int mirror_num, int need_raid_map);

/*
 * Device locking
 * ==============
 *
 * There are several mutexes that protect manipulation of devices and low-level
 * structures like chunks but not block groups, extents or files
 *
 * uuid_mutex (global lock)
 * ------------------------
 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 * device) or requested by the device= mount option
 *
 * the mutex can be very coarse and can cover long-running operations
 *
 * protects: updates to fs_devices counters like missing devices, rw devices,
 * seeding, structure cloning, opening/closing devices at mount/umount time
 *
 * global::fs_devs - add, remove, updates to the global list
 *
 * does not protect: manipulation of the fs_devices::devices list in general
 * but in mount context it could be used to exclude list modifications by eg.
 * scan ioctl
 *
 * btrfs_device::name - renames (write side), read is RCU
 *
 * fs_devices::device_list_mutex (per-fs, with RCU)
 * ------------------------------------------------
 * protects updates to fs_devices::devices, ie. adding and deleting
 *
 * simple list traversal with read-only actions can be done with RCU protection
 *
 * may be used to exclude some operations from running concurrently without any
 * modifications to the list (see write_all_supers)
 *
 * Is not required at mount and close times, because our device list is
 * protected by the uuid_mutex at that point.
 *
 * balance_mutex
 * -------------
 * protects balance structures (status, state) and context accessed from
 * several places (internally, ioctl)
 *
 * chunk_mutex
 * -----------
 * protects chunks, adding or removing during allocation, trim or when a new
 * device is added/removed. Additionally it also protects post_commit_list of
 * individual devices, since they can be added to the transaction's
 * post_commit_list only with chunk_mutex held.
 *
 * cleaner_mutex
 * -------------
 * a big lock that is held by the cleaner thread and prevents running subvolume
 * cleaning together with relocation or delayed iputs
 *
 *
 * Lock nesting
 * ============
 *
 * uuid_mutex
 *   device_list_mutex
 *     chunk_mutex
 *   balance_mutex
 *
 *
 * Exclusive operations
 * ====================
 *
 * Maintains the exclusivity of the following operations that apply to the
 * whole filesystem and cannot run in parallel.
 *
 * - Balance (*)
 * - Device add
 * - Device remove
 * - Device replace (*)
 * - Resize
 *
 * The device operations (as above) can be in one of the following states:
 *
 * - Running state
 * - Paused state
 * - Completed state
 *
 * Only device operations marked with (*) can go into the Paused state for the
 * following reasons:
 *
 * - ioctl (only Balance can be Paused through ioctl)
 * - filesystem remounted as read-only
 * - filesystem unmounted and mounted as read-only
 * - system power-cycle and filesystem mounted as read-only
 * - filesystem or device errors leading to forced read-only
 *
 * The status of exclusive operation is set and cleared atomically.
 * During the course of Paused state, fs_info::exclusive_operation remains set.
 * A device operation in Paused or Running state can be canceled or resumed
 * either by ioctl (Balance only) or when remounted as read-write.
 * The exclusive status is cleared when the device operation is canceled or
 * completed.
 */
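
/*
 * Illustrative sketch of the nesting above (not a real call path): locks
 * are taken outermost-first and dropped in reverse order, e.g.
 *
 *	mutex_lock(&uuid_mutex);
 *	mutex_lock(&fs_devices->device_list_mutex);
 *	mutex_lock(&fs_info->chunk_mutex);
 *	... manipulate devices/chunks ...
 *	mutex_unlock(&fs_info->chunk_mutex);
 *	mutex_unlock(&fs_devices->device_list_mutex);
 *	mutex_unlock(&uuid_mutex);
 *
 * Acquiring e.g. device_list_mutex while already holding chunk_mutex would
 * invert the documented order and risk an ABBA deadlock.
 */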

DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
{
        return &fs_uuids;
}

/*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid:               if not NULL, copy the UUID to fs_devices::fsid
 * @metadata_fsid:      if not NULL, copy the UUID to fs_devices::metadata_fsid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
                                                 const u8 *metadata_fsid)
{
        struct btrfs_fs_devices *fs_devs;

        fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
        if (!fs_devs)
                return ERR_PTR(-ENOMEM);

        mutex_init(&fs_devs->device_list_mutex);

        INIT_LIST_HEAD(&fs_devs->devices);
        INIT_LIST_HEAD(&fs_devs->alloc_list);
        INIT_LIST_HEAD(&fs_devs->fs_list);
        INIT_LIST_HEAD(&fs_devs->seed_list);
        if (fsid)
                memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);

        if (metadata_fsid)
                memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
        else if (fsid)
                memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);

        return fs_devs;
}
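
/*
 * Callers follow the usual ERR_PTR() convention; a minimal sketch:
 *
 *	fs_devices = alloc_fs_devices(fsid, NULL);
 *	if (IS_ERR(fs_devices))
 *		return ERR_CAST(fs_devices);
 *
 * i.e. the return value is never NULL, failure is encoded in the pointer.
 */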

void btrfs_free_device(struct btrfs_device *device)
{
        WARN_ON(!list_empty(&device->post_commit_list));
        rcu_string_free(device->name);
        extent_io_tree_release(&device->alloc_state);
        bio_put(device->flush_bio);
        btrfs_destroy_dev_zone_info(device);
        kfree(device);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device;
        WARN_ON(fs_devices->opened);
        while (!list_empty(&fs_devices->devices)) {
                device = list_entry(fs_devices->devices.next,
                                    struct btrfs_device, dev_list);
                list_del(&device->dev_list);
                btrfs_free_device(device);
        }
        kfree(fs_devices);
}

void __exit btrfs_cleanup_fs_uuids(void)
{
        struct btrfs_fs_devices *fs_devices;

        while (!list_empty(&fs_uuids)) {
                fs_devices = list_entry(fs_uuids.next,
                                        struct btrfs_fs_devices, fs_list);
                list_del(&fs_devices->fs_list);
                free_fs_devices(fs_devices);
        }
}

/*
 * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
 * Returned struct is not linked onto any lists and must be destroyed using
 * btrfs_free_device.
 */
static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info)
{
        struct btrfs_device *dev;

        dev = kzalloc(sizeof(*dev), GFP_KERNEL);
        if (!dev)
                return ERR_PTR(-ENOMEM);

        /*
         * Preallocate a bio that's always going to be used for flushing device
         * barriers and matches the device lifespan
         */
        dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
        if (!dev->flush_bio) {
                kfree(dev);
                return ERR_PTR(-ENOMEM);
        }

        INIT_LIST_HEAD(&dev->dev_list);
        INIT_LIST_HEAD(&dev->dev_alloc_list);
        INIT_LIST_HEAD(&dev->post_commit_list);

        atomic_set(&dev->reada_in_flight, 0);
        atomic_set(&dev->dev_stats_ccnt, 0);
        btrfs_device_data_ordered_init(dev, fs_info);
        INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
        INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
        extent_io_tree_init(fs_info, &dev->alloc_state,
                            IO_TREE_DEVICE_ALLOC_STATE, NULL);

        return dev;
}

static noinline struct btrfs_fs_devices *find_fsid(
                const u8 *fsid, const u8 *metadata_fsid)
{
        struct btrfs_fs_devices *fs_devices;

        ASSERT(fsid);

        /* Handle non-split-brain cases */
        list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
                if (metadata_fsid) {
                        if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
                            && memcmp(metadata_fsid, fs_devices->metadata_uuid,
                                      BTRFS_FSID_SIZE) == 0)
                                return fs_devices;
                } else {
                        if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
                                return fs_devices;
                }
        }
        return NULL;
}

static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
                                struct btrfs_super_block *disk_super)
{
        struct btrfs_fs_devices *fs_devices;

        /*
         * Handle scanned device having completed its fsid change but
         * belonging to a fs_devices that was created by first scanning
         * a device which didn't have its fsid/metadata_uuid changed
         * at all and the CHANGING_FSID_V2 flag set.
         */
        list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
                if (fs_devices->fsid_change &&
                    memcmp(disk_super->metadata_uuid, fs_devices->fsid,
                           BTRFS_FSID_SIZE) == 0 &&
                    memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
                           BTRFS_FSID_SIZE) == 0) {
                        return fs_devices;
                }
        }
        /*
         * Handle scanned device having completed its fsid change but
         * belonging to a fs_devices that was created by a device that
         * has an outdated pair of fsid/metadata_uuid and
         * CHANGING_FSID_V2 flag set.
         */
        list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
                if (fs_devices->fsid_change &&
                    memcmp(fs_devices->metadata_uuid,
                           fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
                    memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
                           BTRFS_FSID_SIZE) == 0) {
                        return fs_devices;
                }
        }

        return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
}

static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
                      int flush, struct block_device **bdev,
                      struct btrfs_super_block **disk_super)
{
        int ret;

        *bdev = blkdev_get_by_path(device_path, flags, holder);

        if (IS_ERR(*bdev)) {
                ret = PTR_ERR(*bdev);
                goto error;
        }

        if (flush)
                filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
        ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
        if (ret) {
                blkdev_put(*bdev, flags);
                goto error;
        }
        invalidate_bdev(*bdev);
        *disk_super = btrfs_read_dev_super(*bdev);
        if (IS_ERR(*disk_super)) {
                ret = PTR_ERR(*disk_super);
                blkdev_put(*bdev, flags);
                goto error;
        }

        return 0;

error:
        *bdev = NULL;
        return ret;
}

static bool device_path_matched(const char *path, struct btrfs_device *device)
{
        int found;

        rcu_read_lock();
        found = strcmp(rcu_str_deref(device->name), path);
        rcu_read_unlock();

        return found == 0;
}
/*
 * Search and remove all stale devices (devices which are not mounted).
 * When both inputs are NULL, it will search and release all stale devices.
 *
 * @path:       Optional. When provided, it will release all unmounted
 *              devices matching this path only.
 * @skip_dev:   Optional. Will skip this device when searching for the
 *              stale devices.
 *
 * Return:      0 for success or if @path is NULL.
 *              -EBUSY if @path is a mounted device.
 *              -ENOENT if @path does not match any device in the list.
 */
static int btrfs_free_stale_devices(const char *path,
                                     struct btrfs_device *skip_device)
{
        struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
        struct btrfs_device *device, *tmp_device;
        int ret = 0;

        if (path)
                ret = -ENOENT;

        list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {

                mutex_lock(&fs_devices->device_list_mutex);
                list_for_each_entry_safe(device, tmp_device,
                                         &fs_devices->devices, dev_list) {
                        if (skip_device && skip_device == device)
                                continue;
                        if (path && !device->name)
                                continue;
                        if (path && !device_path_matched(path, device))
                                continue;
                        if (fs_devices->opened) {
                                /* for an already deleted device return 0 */
                                if (path && ret != 0)
                                        ret = -EBUSY;
                                break;
                        }

                        /* delete the stale device */
                        fs_devices->num_devices--;
                        list_del(&device->dev_list);
                        btrfs_free_device(device);

                        ret = 0;
                }
                mutex_unlock(&fs_devices->device_list_mutex);

                if (fs_devices->num_devices == 0) {
                        btrfs_sysfs_remove_fsid(fs_devices);
                        list_del(&fs_devices->fs_list);
                        free_fs_devices(fs_devices);
                }
        }

        return ret;
}
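
/*
 * Example of the return contract (sketch with a placeholder path, not an
 * actual call site):
 *
 *	ret = btrfs_free_stale_devices("/dev/sdX", NULL);
 *	if (ret == -EBUSY)
 *		... the device at that path belongs to a mounted fs ...
 *	else if (ret == -ENOENT)
 *		... no registered device matched the path ...
 */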

/*
 * This is only used on mount, and we are protected from competing things
 * messing with our fs_devices by the uuid_mutex, thus we do not need the
 * fs_devices->device_list_mutex here.
 */
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
                        struct btrfs_device *device, fmode_t flags,
                        void *holder)
{
        struct request_queue *q;
        struct block_device *bdev;
        struct btrfs_super_block *disk_super;
        u64 devid;
        int ret;

        if (device->bdev)
                return -EINVAL;
        if (!device->name)
                return -EINVAL;

        ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
                                    &bdev, &disk_super);
        if (ret)
                return ret;

        devid = btrfs_stack_device_id(&disk_super->dev_item);
        if (devid != device->devid)
                goto error_free_page;

        if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
                goto error_free_page;

        device->generation = btrfs_super_generation(disk_super);

        if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
                if (btrfs_super_incompat_flags(disk_super) &
                    BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
                        pr_err(
                "BTRFS: Invalid seeding and uuid-changed device detected\n");
                        goto error_free_page;
                }

                clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
                fs_devices->seeding = true;
        } else {
                if (bdev_read_only(bdev))
                        clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
                else
                        set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
        }

        q = bdev_get_queue(bdev);
        if (!blk_queue_nonrot(q))
                fs_devices->rotating = true;

        device->bdev = bdev;
        clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
        device->mode = flags;

        ret = btrfs_get_dev_zone_info(device);
        if (ret != 0)
                goto error_free_page;

        fs_devices->open_devices++;
        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
            device->devid != BTRFS_DEV_REPLACE_DEVID) {
                fs_devices->rw_devices++;
                list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
        }
        btrfs_release_disk_super(disk_super);

        return 0;

error_free_page:
        btrfs_release_disk_super(disk_super);
        blkdev_put(bdev, flags);

        return -EINVAL;
}

/*
 * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
 * being created with a disk that has already completed its fsid change. Such
 * disk can belong to an fs which has its FSID changed or to one which doesn't.
 * Handle both cases here.
 */
static struct btrfs_fs_devices *find_fsid_inprogress(
                                        struct btrfs_super_block *disk_super)
{
        struct btrfs_fs_devices *fs_devices;

        list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
                if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
                           BTRFS_FSID_SIZE) != 0 &&
                    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
                           BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
                        return fs_devices;
                }
        }

        return find_fsid(disk_super->fsid, NULL);
}

static struct btrfs_fs_devices *find_fsid_changed(
                                        struct btrfs_super_block *disk_super)
{
        struct btrfs_fs_devices *fs_devices;

        /*
         * Handle the case where the scanned device is part of an fs that had
         * multiple successful changes of FSID but currently the device didn't
         * observe it, meaning its fsid will be different from theirs. We need
         * to handle two subcases:
         *  1 - The fs still continues to have different METADATA/FSID uuids.
         *  2 - The fs is switched back to its original FSID (METADATA/FSID
         *  are equal).
         */
        list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
                /* Changed UUIDs */
                if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
                           BTRFS_FSID_SIZE) != 0 &&
                    memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
                           BTRFS_FSID_SIZE) == 0 &&
                    memcmp(fs_devices->fsid, disk_super->fsid,
                           BTRFS_FSID_SIZE) != 0)
                        return fs_devices;

                /* Unchanged UUIDs */
                if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
                           BTRFS_FSID_SIZE) == 0 &&
                    memcmp(fs_devices->fsid, disk_super->metadata_uuid,
                           BTRFS_FSID_SIZE) == 0)
                        return fs_devices;
        }

        return NULL;
}

static struct btrfs_fs_devices *find_fsid_reverted_metadata(
                                struct btrfs_super_block *disk_super)
{
        struct btrfs_fs_devices *fs_devices;

        /*
         * Handle the case where the scanned device is part of an fs whose last
         * metadata UUID change reverted it to the original FSID. At the same
         * time fs_devices was first created by another constituent device
         * which didn't fully observe the operation. This results in a
         * btrfs_fs_devices created with metadata/fsid different AND
         * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
         * fs_devices equal to the FSID of the disk.
         */
        list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
                if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
                           BTRFS_FSID_SIZE) != 0 &&
                    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
                           BTRFS_FSID_SIZE) == 0 &&
                    fs_devices->fsid_change)
                        return fs_devices;
        }

        return NULL;
}
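
/*
 * Editorial summary of how device_list_add() below dispatches between the
 * find_fsid*() helpers, keyed on the scanned super block's flags:
 *
 *	CHANGING_FSID_V2   METADATA_UUID   helper used
 *	----------------   -------------   ------------------------------
 *	set                clear           find_fsid_inprogress()
 *	set                set             find_fsid_changed()
 *	clear              set             find_fsid_with_metadata_uuid()
 *	clear              clear           find_fsid_reverted_metadata(),
 *	                                   falling back to find_fsid()
 */
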
/*
 * Add new device to list of registered devices
 *
 * Returns:
 * device pointer which was just added or updated when successful
 * error pointer when failed
 */
static noinline struct btrfs_device *device_list_add(const char *path,
                           struct btrfs_super_block *disk_super,
                           bool *new_device_added)
{
        struct btrfs_device *device;
        struct btrfs_fs_devices *fs_devices = NULL;
        struct rcu_string *name;
        u64 found_transid = btrfs_super_generation(disk_super);
        u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
        bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
                BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
        bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
                                        BTRFS_SUPER_FLAG_CHANGING_FSID_V2);

        if (fsid_change_in_progress) {
                if (!has_metadata_uuid)
                        fs_devices = find_fsid_inprogress(disk_super);
                else
                        fs_devices = find_fsid_changed(disk_super);
        } else if (has_metadata_uuid) {
                fs_devices = find_fsid_with_metadata_uuid(disk_super);
        } else {
                fs_devices = find_fsid_reverted_metadata(disk_super);
                if (!fs_devices)
                        fs_devices = find_fsid(disk_super->fsid, NULL);
        }

        if (!fs_devices) {
                if (has_metadata_uuid)
                        fs_devices = alloc_fs_devices(disk_super->fsid,
                                                      disk_super->metadata_uuid);
                else
                        fs_devices = alloc_fs_devices(disk_super->fsid, NULL);

                if (IS_ERR(fs_devices))
                        return ERR_CAST(fs_devices);

                fs_devices->fsid_change = fsid_change_in_progress;

                mutex_lock(&fs_devices->device_list_mutex);
                list_add(&fs_devices->fs_list, &fs_uuids);

                device = NULL;
        } else {
                mutex_lock(&fs_devices->device_list_mutex);
                device = btrfs_find_device(fs_devices, devid,
                                disk_super->dev_item.uuid, NULL);

                /*
                 * If this disk has been pulled into an fs devices created by
                 * a device which had the CHANGING_FSID_V2 flag then replace the
                 * metadata_uuid/fsid values of the fs_devices.
                 */
                if (fs_devices->fsid_change &&
                    found_transid > fs_devices->latest_generation) {
                        memcpy(fs_devices->fsid, disk_super->fsid,
                                        BTRFS_FSID_SIZE);

                        if (has_metadata_uuid)
                                memcpy(fs_devices->metadata_uuid,
                                       disk_super->metadata_uuid,
                                       BTRFS_FSID_SIZE);
                        else
                                memcpy(fs_devices->metadata_uuid,
                                       disk_super->fsid, BTRFS_FSID_SIZE);

                        fs_devices->fsid_change = false;
                }
        }

        if (!device) {
                if (fs_devices->opened) {
                        mutex_unlock(&fs_devices->device_list_mutex);
                        return ERR_PTR(-EBUSY);
                }

                device = btrfs_alloc_device(NULL, &devid,
                                            disk_super->dev_item.uuid);
                if (IS_ERR(device)) {
                        mutex_unlock(&fs_devices->device_list_mutex);
                        /* we can safely leave the fs_devices entry around */
                        return device;
                }

                name = rcu_string_strdup(path, GFP_NOFS);
                if (!name) {
                        btrfs_free_device(device);
                        mutex_unlock(&fs_devices->device_list_mutex);
                        return ERR_PTR(-ENOMEM);
                }
                rcu_assign_pointer(device->name, name);

                list_add_rcu(&device->dev_list, &fs_devices->devices);
                fs_devices->num_devices++;

                device->fs_devices = fs_devices;
                *new_device_added = true;

                if (disk_super->label[0])
                        pr_info(
        "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
                                disk_super->label, devid, found_transid, path,
                                current->comm, task_pid_nr(current));
                else
                        pr_info(
        "BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
                                disk_super->fsid, devid, found_transid, path,
                                current->comm, task_pid_nr(current));

        } else if (!device->name || strcmp(device->name->str, path)) {
                /*
                 * When the FS is already mounted:
                 * 1. If you are here and if the device->name is NULL, that
                 *    means this device was missing at the time of FS mount.
                 * 2. If you are here and if the device->name is different
                 *    from 'path', that means either
                 *      a. The same device disappeared and reappeared with a
                 *         different name, or
                 *      b. The missing-disk-which-was-replaced has
                 *         reappeared now.
                 *
                 * We must allow 1 and 2a above. But 2b would be spurious
                 * and unintentional.
                 *
                 * Further, in case of 1 and 2a above, the disk at 'path'
                 * would have missed some transactions when it was away, and
                 * in case of 2a the stale bdev has to be updated as well.
                 * 2b must not be allowed at any time.
                 */

                /*
                 * For now, we do allow update to btrfs_fs_device through the
                 * btrfs dev scan cli after FS has been mounted.  We're still
                 * tracking a problem where systems fail mount by subvolume id
                 * when we reject replacement on a mounted FS.
                 */
                if (!fs_devices->opened && found_transid < device->generation) {
                        /*
                         * That is, if the FS is _not_ mounted and if you
                         * are here, that means there is more than one
                         * disk with the same uuid and devid. We keep the
                         * one with the larger generation number or the
                         * last-in if the generations are equal.
                         */
                        mutex_unlock(&fs_devices->device_list_mutex);
                        return ERR_PTR(-EEXIST);
                }

                /*
                 * We are going to replace the device path for a given devid,
                 * make sure it's the same device if the device is mounted
                 */
                if (device->bdev) {
                        struct block_device *path_bdev;

                        path_bdev = lookup_bdev(path);
                        if (IS_ERR(path_bdev)) {
                                mutex_unlock(&fs_devices->device_list_mutex);
                                return ERR_CAST(path_bdev);
                        }

                        if (device->bdev != path_bdev) {
                                bdput(path_bdev);
                                mutex_unlock(&fs_devices->device_list_mutex);
                                /*
                                 * device->fs_info may not be reliable here, so
                                 * pass in a NULL instead. This avoids a
                                 * possible use-after-free when the fs_info and
                                 * fs_info->sb are already torn down.
                                 */
                                btrfs_warn_in_rcu(NULL,
        "duplicate device %s devid %llu generation %llu scanned by %s (%d)",
                                                  path, devid, found_transid,
                                                  current->comm,
                                                  task_pid_nr(current));
                                return ERR_PTR(-EEXIST);
                        }
                        bdput(path_bdev);
                        btrfs_info_in_rcu(device->fs_info,
        "devid %llu device path %s changed to %s scanned by %s (%d)",
                                          devid, rcu_str_deref(device->name),
                                          path, current->comm,
                                          task_pid_nr(current));
                }

                name = rcu_string_strdup(path, GFP_NOFS);
                if (!name) {
                        mutex_unlock(&fs_devices->device_list_mutex);
                        return ERR_PTR(-ENOMEM);
                }
                rcu_string_free(device->name);
                rcu_assign_pointer(device->name, name);
                if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
                        fs_devices->missing_devices--;
                        clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
                }
        }

        /*
         * Unmount does not free the btrfs_device struct but would zero
         * generation along with most of the other members. So just update
         * it back. We need it to pick the disk with largest generation
         * (as above).
         */
        if (!fs_devices->opened) {
                device->generation = found_transid;
                fs_devices->latest_generation = max_t(u64, found_transid,
                                                fs_devices->latest_generation);
        }

        fs_devices->total_devices = btrfs_super_num_devices(disk_super);

        mutex_unlock(&fs_devices->device_list_mutex);
        return device;
}

static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
        struct btrfs_fs_devices *fs_devices;
        struct btrfs_device *device;
        struct btrfs_device *orig_dev;
        int ret = 0;

        fs_devices = alloc_fs_devices(orig->fsid, NULL);
        if (IS_ERR(fs_devices))
                return fs_devices;

        mutex_lock(&orig->device_list_mutex);
        fs_devices->total_devices = orig->total_devices;

        list_for_each_entry(orig_dev, &orig->devices, dev_list) {
                struct rcu_string *name;

                device = btrfs_alloc_device(NULL, &orig_dev->devid,
                                            orig_dev->uuid);
                if (IS_ERR(device)) {
                        ret = PTR_ERR(device);
                        goto error;
                }

                /*
                 * This is ok to do without rcu read locked because we hold the
                 * uuid mutex so nothing we touch in here is going to disappear.
                 */
                if (orig_dev->name) {
                        name = rcu_string_strdup(orig_dev->name->str,
                                        GFP_KERNEL);
                        if (!name) {
                                btrfs_free_device(device);
                                ret = -ENOMEM;
                                goto error;
                        }
                        rcu_assign_pointer(device->name, name);
                }

                list_add(&device->dev_list, &fs_devices->devices);
                device->fs_devices = fs_devices;
                fs_devices->num_devices++;
        }
        mutex_unlock(&orig->device_list_mutex);
        return fs_devices;
error:
        mutex_unlock(&orig->device_list_mutex);
        free_fs_devices(fs_devices);
        return ERR_PTR(ret);
}

static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
                                      struct btrfs_device **latest_dev)
{
        struct btrfs_device *device, *next;

        /* This is the initialized path, it is safe to release the devices. */
        list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
                if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
                        if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
                                      &device->dev_state) &&
                            !test_bit(BTRFS_DEV_STATE_MISSING,
                                      &device->dev_state) &&
                            (!*latest_dev ||
                             device->generation > (*latest_dev)->generation)) {
                                *latest_dev = device;
                        }
                        continue;
                }

                /*
                 * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID,
                 * in btrfs_init_dev_replace() so just continue.
                 */
                if (device->devid == BTRFS_DEV_REPLACE_DEVID)
                        continue;

                if (device->bdev) {
                        blkdev_put(device->bdev, device->mode);
                        device->bdev = NULL;
                        fs_devices->open_devices--;
                }
                if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
                        list_del_init(&device->dev_alloc_list);
                        clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
                }
                list_del_init(&device->dev_list);
                fs_devices->num_devices--;
                btrfs_free_device(device);
        }
}

/*
 * After we have read the system tree and know the devids belonging to this
 * filesystem, remove any device which does not belong there.
 */
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *latest_dev = NULL;
        struct btrfs_fs_devices *seed_dev;

        mutex_lock(&uuid_mutex);
        __btrfs_free_extra_devids(fs_devices, &latest_dev);

        list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
                __btrfs_free_extra_devids(seed_dev, &latest_dev);

        fs_devices->latest_bdev = latest_dev->bdev;

        mutex_unlock(&uuid_mutex);
}

static void btrfs_close_bdev(struct btrfs_device *device)
{
        if (!device->bdev)
                return;

        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
                sync_blockdev(device->bdev);
                invalidate_bdev(device->bdev);
        }

        blkdev_put(device->bdev, device->mode);
}

static void btrfs_close_one_device(struct btrfs_device *device)
{
        struct btrfs_fs_devices *fs_devices = device->fs_devices;

        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
            device->devid != BTRFS_DEV_REPLACE_DEVID) {
                list_del_init(&device->dev_alloc_list);
                fs_devices->rw_devices--;
        }

        if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
                fs_devices->missing_devices--;

        btrfs_close_bdev(device);
        if (device->bdev) {
                fs_devices->open_devices--;
                device->bdev = NULL;
        }
        clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
        btrfs_destroy_dev_zone_info(device);

        device->fs_info = NULL;
        atomic_set(&device->dev_stats_ccnt, 0);
        extent_io_tree_release(&device->alloc_state);

        /* Verify the device is back in a pristine state */
        ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
        ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
        ASSERT(list_empty(&device->dev_alloc_list));
        ASSERT(list_empty(&device->post_commit_list));
        ASSERT(atomic_read(&device->reada_in_flight) == 0);
}

static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device, *tmp;

        lockdep_assert_held(&uuid_mutex);

        if (--fs_devices->opened > 0)
                return;

        list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
                btrfs_close_one_device(device);

        WARN_ON(fs_devices->open_devices);
        WARN_ON(fs_devices->rw_devices);
        fs_devices->opened = 0;
        fs_devices->seeding = false;
        fs_devices->fs_info = NULL;
}

void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
        LIST_HEAD(list);
        struct btrfs_fs_devices *tmp;

        mutex_lock(&uuid_mutex);
        close_fs_devices(fs_devices);
        if (!fs_devices->opened)
                list_splice_init(&fs_devices->seed_list, &list);

        list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
                close_fs_devices(fs_devices);
                list_del(&fs_devices->seed_list);
                free_fs_devices(fs_devices);
        }
        mutex_unlock(&uuid_mutex);
}

static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
                                fmode_t flags, void *holder)
{
        struct btrfs_device *device;
        struct btrfs_device *latest_dev = NULL;
        struct btrfs_device *tmp_device;

        flags |= FMODE_EXCL;

        list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
                                 dev_list) {
                int ret;

                ret = btrfs_open_one_device(fs_devices, device, flags, holder);
                if (ret == 0 &&
                    (!latest_dev || device->generation > latest_dev->generation)) {
                        latest_dev = device;
                } else if (ret == -ENODATA) {
                        fs_devices->num_devices--;
                        list_del(&device->dev_list);
                        btrfs_free_device(device);
                }
        }
        if (fs_devices->open_devices == 0)
                return -EINVAL;

        fs_devices->opened = 1;
        fs_devices->latest_bdev = latest_dev->bdev;
        fs_devices->total_rw_bytes = 0;
        fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
        fs_devices->read_policy = BTRFS_READ_POLICY_PID;

        return 0;
}

static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
{
        struct btrfs_device *dev1, *dev2;

        dev1 = list_entry(a, struct btrfs_device, dev_list);
        dev2 = list_entry(b, struct btrfs_device, dev_list);

        if (dev1->devid < dev2->devid)
                return -1;
        else if (dev1->devid > dev2->devid)
                return 1;
        return 0;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                       fmode_t flags, void *holder)
{
        int ret;

        lockdep_assert_held(&uuid_mutex);
        /*
         * The device_list_mutex cannot be taken here in case opening the
         * underlying device takes further locks like bd_mutex.
         *
         * We also don't need the lock here as this is called during mount and
         * exclusion is provided by uuid_mutex
         */

        if (fs_devices->opened) {
                fs_devices->opened++;
                ret = 0;
        } else {
                list_sort(NULL, &fs_devices->devices, devid_cmp);
                ret = open_fs_devices(fs_devices, flags, holder);
        }

        return ret;
}

void btrfs_release_disk_super(struct btrfs_super_block *super)
{
        struct page *page = virt_to_page(super);

        put_page(page);
}

static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
                                                       u64 bytenr, u64 bytenr_orig)
{
        struct btrfs_super_block *disk_super;
        struct page *page;
        void *p;
        pgoff_t index;

        /* make sure our super fits in the device */
        if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
                return ERR_PTR(-EINVAL);

        /* make sure our super fits in the page */
        if (sizeof(*disk_super) > PAGE_SIZE)
                return ERR_PTR(-EINVAL);

        /* make sure our super doesn't straddle pages on disk */
        index = bytenr >> PAGE_SHIFT;
        if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
                return ERR_PTR(-EINVAL);

        /* pull in the page with our super */
        page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);

        if (IS_ERR(page))
                return ERR_CAST(page);

        p = page_address(page);

        /* align our pointer to the offset of the super block */
        disk_super = p + offset_in_page(bytenr);

        if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
            btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
                btrfs_release_disk_super(p);
                return ERR_PTR(-EINVAL);
        }

        if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
                disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;

        return disk_super;
}
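
/*
 * Worked example of the offset math above (assuming 4K pages): the primary
 * super block lives at bytenr 65536, so index = 65536 >> PAGE_SHIFT = 16
 * and offset_in_page(65536) = 0. The 4096-byte super block then ends at
 * byte 69631, which is still inside page 16, so the straddle check passes.
 */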

int btrfs_forget_devices(const char *path)
{
        int ret;

        mutex_lock(&uuid_mutex);
        ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
        mutex_unlock(&uuid_mutex);

        return ret;
}

/*
 * Look for a btrfs signature on a device. This may be called out of the mount
 * path and we are not allowed to call set_blocksize during the scan. The
 * superblock is read via pagecache.
 */
struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
                                           void *holder)
{
        struct btrfs_super_block *disk_super;
        bool new_device_added = false;
        struct btrfs_device *device = NULL;
        struct block_device *bdev;
        u64 bytenr, bytenr_orig;
        int ret;

        lockdep_assert_held(&uuid_mutex);

        /*
         * we would like to check all the supers, but that would make
         * a btrfs mount succeed after a mkfs from a different FS.
         * So, we need to add a special mount option to scan for
         * later supers, using BTRFS_SUPER_MIRROR_MAX instead
         */
        flags |= FMODE_EXCL;

        bdev = blkdev_get_by_path(path, flags, holder);
        if (IS_ERR(bdev))
                return ERR_CAST(bdev);

        bytenr_orig = btrfs_sb_offset(0);
        ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
        if (ret)
                return ERR_PTR(ret);

        disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
        if (IS_ERR(disk_super)) {
                device = ERR_CAST(disk_super);
                goto error_bdev_put;
        }

        device = device_list_add(path, disk_super, &new_device_added);
        if (!IS_ERR(device)) {
                if (new_device_added)
                        btrfs_free_stale_devices(path, device);
        }

        btrfs_release_disk_super(disk_super);

error_bdev_put:
        blkdev_put(bdev, flags);

        return device;
}

/*
 * Try to find a chunk that intersects the [start, start + len] range and when
 * one such is found, record the end of it in *start.
 */
1391 static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
1392                                     u64 len)
1393 {
1394         u64 physical_start, physical_end;
1395
1396         lockdep_assert_held(&device->fs_info->chunk_mutex);
1397
1398         if (!find_first_extent_bit(&device->alloc_state, *start,
1399                                    &physical_start, &physical_end,
1400                                    CHUNK_ALLOCATED, NULL)) {
1401
1402                 if (in_range(physical_start, *start, len) ||
1403                     in_range(*start, physical_start,
1404                              physical_end - physical_start)) {
1405                         *start = physical_end + 1;
1406                         return true;
1407                 }
1408         }
1409         return false;
1410 }
1411
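/*
 * Return the first offset on @device at which a new dev extent may be
 * allocated, according to the chunk allocation policy.
 */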
1412 static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
1413 {
1414         switch (device->fs_devices->chunk_alloc_policy) {
1415         case BTRFS_CHUNK_ALLOC_REGULAR:
1416                 /*
1417                  * We don't want to overwrite the superblock on the drive nor
1418                  * any area used by the boot loader (grub for example), so we
1419                  * make sure to start at an offset of at least 1MB.
1420                  */
1421                 return max_t(u64, start, SZ_1M);
1422         default:
1423                 BUG();
1424         }
1425 }
1426
/**
 * dev_extent_hole_check - check if specified hole is suitable for allocation
 * @device:     the device which has the hole
 * @hole_start: starting position of the hole
 * @hole_size:  the size of the hole
 * @num_bytes:  the size of the free space that we need
 *
 * This function may modify @hole_start and @hole_size to reflect the suitable
 * position for allocation. Returns true if the hole position is updated,
 * false otherwise.
 */
1437 static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
1438                                   u64 *hole_size, u64 num_bytes)
1439 {
1440         bool changed = false;
1441         u64 hole_end = *hole_start + *hole_size;
1442
1443         /*
1444          * Check before we set max_hole_start, otherwise we could end up
1445          * sending back this offset anyway.
1446          */
1447         if (contains_pending_extent(device, hole_start, *hole_size)) {
1448                 if (hole_end >= *hole_start)
1449                         *hole_size = hole_end - *hole_start;
1450                 else
1451                         *hole_size = 0;
1452                 changed = true;
1453         }
1454
1455         switch (device->fs_devices->chunk_alloc_policy) {
1456         case BTRFS_CHUNK_ALLOC_REGULAR:
1457                 /* No extra check */
1458                 break;
1459         default:
1460                 BUG();
1461         }
1462
1463         return changed;
1464 }
1465
/*
 * find_free_dev_extent_start - find free space in the specified device
 * @device:       the device in which we search for free space
 * @num_bytes:    the size of the free space that we need
 * @search_start: the position from which to begin the search
 * @start:        store the start of the free space
 * @len:          the size of the free space that we find, or the size of the
 *                largest free space if we don't find any suitable one
 *
 * This uses a pretty simple search, the expectation is that it is called
 * very infrequently and that a given device has a small number of extents.
 *
 * @start is used to store the start of the free space if we find one. But if
 * we don't find suitable free space, it will be used to store the start
 * position of the largest free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the largest free space.
 *
 * NOTE: This function searches the *commit* root of the device tree and does
 * an extra check to ensure that dev extents are not double allocated.
 * This makes it safe to use for allocating dev extents, but it may not report
 * correct usable device space, as a device extent freed in the current
 * transaction is not reported as available.
 */
1493 static int find_free_dev_extent_start(struct btrfs_device *device,
1494                                 u64 num_bytes, u64 search_start, u64 *start,
1495                                 u64 *len)
1496 {
1497         struct btrfs_fs_info *fs_info = device->fs_info;
1498         struct btrfs_root *root = fs_info->dev_root;
1499         struct btrfs_key key;
1500         struct btrfs_dev_extent *dev_extent;
1501         struct btrfs_path *path;
1502         u64 hole_size;
1503         u64 max_hole_start;
1504         u64 max_hole_size;
1505         u64 extent_end;
1506         u64 search_end = device->total_bytes;
1507         int ret;
1508         int slot;
1509         struct extent_buffer *l;
1510
1511         search_start = dev_extent_search_start(device, search_start);
1512
1513         path = btrfs_alloc_path();
1514         if (!path)
1515                 return -ENOMEM;
1516
1517         max_hole_start = search_start;
1518         max_hole_size = 0;
1519
1520 again:
1521         if (search_start >= search_end ||
1522                 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
1523                 ret = -ENOSPC;
1524                 goto out;
1525         }
1526
1527         path->reada = READA_FORWARD;
1528         path->search_commit_root = 1;
1529         path->skip_locking = 1;
1530
1531         key.objectid = device->devid;
1532         key.offset = search_start;
1533         key.type = BTRFS_DEV_EXTENT_KEY;
1534
1535         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1536         if (ret < 0)
1537                 goto out;
1538         if (ret > 0) {
1539                 ret = btrfs_previous_item(root, path, key.objectid, key.type);
1540                 if (ret < 0)
1541                         goto out;
1542         }
1543
1544         while (1) {
1545                 l = path->nodes[0];
1546                 slot = path->slots[0];
1547                 if (slot >= btrfs_header_nritems(l)) {
1548                         ret = btrfs_next_leaf(root, path);
1549                         if (ret == 0)
1550                                 continue;
1551                         if (ret < 0)
1552                                 goto out;
1553
1554                         break;
1555                 }
1556                 btrfs_item_key_to_cpu(l, &key, slot);
1557
1558                 if (key.objectid < device->devid)
1559                         goto next;
1560
1561                 if (key.objectid > device->devid)
1562                         break;
1563
1564                 if (key.type != BTRFS_DEV_EXTENT_KEY)
1565                         goto next;
1566
1567                 if (key.offset > search_start) {
1568                         hole_size = key.offset - search_start;
1569                         dev_extent_hole_check(device, &search_start, &hole_size,
1570                                               num_bytes);
1571
1572                         if (hole_size > max_hole_size) {
1573                                 max_hole_start = search_start;
1574                                 max_hole_size = hole_size;
1575                         }
1576
                        /*
                         * If this free space is greater than what we need, it
                         * must be the max free space that we have found until
                         * now, so max_hole_start must point to the start of
                         * this free space and the length of this free space
                         * is stored in max_hole_size. Thus, we return
                         * max_hole_start and max_hole_size and go back to the
                         * caller.
                         */
1586                         if (hole_size >= num_bytes) {
1587                                 ret = 0;
1588                                 goto out;
1589                         }
1590                 }
1591
1592                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1593                 extent_end = key.offset + btrfs_dev_extent_length(l,
1594                                                                   dev_extent);
1595                 if (extent_end > search_start)
1596                         search_start = extent_end;
1597 next:
1598                 path->slots[0]++;
1599                 cond_resched();
1600         }
1601
1602         /*
1603          * At this point, search_start should be the end of
1604          * allocated dev extents, and when shrinking the device,
1605          * search_end may be smaller than search_start.
1606          */
1607         if (search_end > search_start) {
1608                 hole_size = search_end - search_start;
1609                 if (dev_extent_hole_check(device, &search_start, &hole_size,
1610                                           num_bytes)) {
1611                         btrfs_release_path(path);
1612                         goto again;
1613                 }
1614
1615                 if (hole_size > max_hole_size) {
1616                         max_hole_start = search_start;
1617                         max_hole_size = hole_size;
1618                 }
1619         }
1620
1621         /* See above. */
1622         if (max_hole_size < num_bytes)
1623                 ret = -ENOSPC;
1624         else
1625                 ret = 0;
1626
1627 out:
1628         btrfs_free_path(path);
1629         *start = max_hole_start;
1630         if (len)
1631                 *len = max_hole_size;
1632         return ret;
1633 }
1634
1635 int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
1636                          u64 *start, u64 *len)
1637 {
1638         /* FIXME use last free of some kind */
1639         return find_free_dev_extent_start(device, num_bytes, 0, start, len);
1640 }
1641
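/*
 * Find the dev extent item in the device tree that covers @start on @device,
 * delete it and return its on-disk length in @dev_extent_len.
 */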
1642 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1643                           struct btrfs_device *device,
1644                           u64 start, u64 *dev_extent_len)
1645 {
1646         struct btrfs_fs_info *fs_info = device->fs_info;
1647         struct btrfs_root *root = fs_info->dev_root;
1648         int ret;
1649         struct btrfs_path *path;
1650         struct btrfs_key key;
1651         struct btrfs_key found_key;
1652         struct extent_buffer *leaf = NULL;
1653         struct btrfs_dev_extent *extent = NULL;
1654
1655         path = btrfs_alloc_path();
1656         if (!path)
1657                 return -ENOMEM;
1658
1659         key.objectid = device->devid;
1660         key.offset = start;
1661         key.type = BTRFS_DEV_EXTENT_KEY;
1662 again:
1663         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1664         if (ret > 0) {
1665                 ret = btrfs_previous_item(root, path, key.objectid,
1666                                           BTRFS_DEV_EXTENT_KEY);
1667                 if (ret)
1668                         goto out;
1669                 leaf = path->nodes[0];
1670                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1671                 extent = btrfs_item_ptr(leaf, path->slots[0],
1672                                         struct btrfs_dev_extent);
1673                 BUG_ON(found_key.offset > start || found_key.offset +
1674                        btrfs_dev_extent_length(leaf, extent) < start);
1675                 key = found_key;
1676                 btrfs_release_path(path);
1677                 goto again;
1678         } else if (ret == 0) {
1679                 leaf = path->nodes[0];
1680                 extent = btrfs_item_ptr(leaf, path->slots[0],
1681                                         struct btrfs_dev_extent);
1682         } else {
1683                 btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
1684                 goto out;
1685         }
1686
1687         *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
1688
1689         ret = btrfs_del_item(trans, root, path);
1690         if (ret) {
1691                 btrfs_handle_fs_error(fs_info, ret,
1692                                       "Failed to remove dev extent item");
1693         } else {
1694                 set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
1695         }
1696 out:
1697         btrfs_free_path(path);
1698         return ret;
1699 }
1700
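/*
 * Insert a dev extent item into the device tree, mapping the physical range
 * [start, start + num_bytes) on @device to the chunk at @chunk_offset.
 */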
1701 static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1702                                   struct btrfs_device *device,
1703                                   u64 chunk_offset, u64 start, u64 num_bytes)
1704 {
1705         int ret;
1706         struct btrfs_path *path;
1707         struct btrfs_fs_info *fs_info = device->fs_info;
1708         struct btrfs_root *root = fs_info->dev_root;
1709         struct btrfs_dev_extent *extent;
1710         struct extent_buffer *leaf;
1711         struct btrfs_key key;
1712
1713         WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
1714         WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
1715         path = btrfs_alloc_path();
1716         if (!path)
1717                 return -ENOMEM;
1718
1719         key.objectid = device->devid;
1720         key.offset = start;
1721         key.type = BTRFS_DEV_EXTENT_KEY;
1722         ret = btrfs_insert_empty_item(trans, root, path, &key,
1723                                       sizeof(*extent));
1724         if (ret)
1725                 goto out;
1726
1727         leaf = path->nodes[0];
1728         extent = btrfs_item_ptr(leaf, path->slots[0],
1729                                 struct btrfs_dev_extent);
1730         btrfs_set_dev_extent_chunk_tree(leaf, extent,
1731                                         BTRFS_CHUNK_TREE_OBJECTID);
1732         btrfs_set_dev_extent_chunk_objectid(leaf, extent,
1733                                             BTRFS_FIRST_CHUNK_TREE_OBJECTID);
1734         btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
1735
1736         btrfs_set_dev_extent_length(leaf, extent, num_bytes);
1737         btrfs_mark_buffer_dirty(leaf);
1738 out:
1739         btrfs_free_path(path);
1740         return ret;
1741 }
1742
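/*
 * Return the logical offset right past the highest chunk mapping currently
 * cached (or 0 if there is none); new chunks are allocated at this offset.
 */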
1743 static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1744 {
1745         struct extent_map_tree *em_tree;
1746         struct extent_map *em;
1747         struct rb_node *n;
1748         u64 ret = 0;
1749
1750         em_tree = &fs_info->mapping_tree;
1751         read_lock(&em_tree->lock);
1752         n = rb_last(&em_tree->map.rb_root);
1753         if (n) {
1754                 em = rb_entry(n, struct extent_map, rb_node);
1755                 ret = em->start + em->len;
1756         }
1757         read_unlock(&em_tree->lock);
1758
1759         return ret;
1760 }
1761
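/*
 * Find the next available devid: look up the dev item with the highest devid
 * in the chunk tree and return that devid plus one, or 1 if there is none.
 */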
1762 static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
1763                                     u64 *devid_ret)
1764 {
1765         int ret;
1766         struct btrfs_key key;
1767         struct btrfs_key found_key;
1768         struct btrfs_path *path;
1769
1770         path = btrfs_alloc_path();
1771         if (!path)
1772                 return -ENOMEM;
1773
1774         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1775         key.type = BTRFS_DEV_ITEM_KEY;
1776         key.offset = (u64)-1;
1777
1778         ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
1779         if (ret < 0)
1780                 goto error;
1781
1782         if (ret == 0) {
1783                 /* Corruption */
1784                 btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
1785                 ret = -EUCLEAN;
1786                 goto error;
1787         }
1788
1789         ret = btrfs_previous_item(fs_info->chunk_root, path,
1790                                   BTRFS_DEV_ITEMS_OBJECTID,
1791                                   BTRFS_DEV_ITEM_KEY);
1792         if (ret) {
1793                 *devid_ret = 1;
1794         } else {
1795                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1796                                       path->slots[0]);
1797                 *devid_ret = found_key.offset + 1;
1798         }
1799         ret = 0;
1800 error:
1801         btrfs_free_path(path);
1802         return ret;
1803 }
1804
/*
 * The device information is stored in the chunk root.
 * The btrfs_device struct should be fully filled in.
 */
1809 static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
1810                             struct btrfs_device *device)
1811 {
1812         int ret;
1813         struct btrfs_path *path;
1814         struct btrfs_dev_item *dev_item;
1815         struct extent_buffer *leaf;
1816         struct btrfs_key key;
1817         unsigned long ptr;
1818
1819         path = btrfs_alloc_path();
1820         if (!path)
1821                 return -ENOMEM;
1822
1823         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1824         key.type = BTRFS_DEV_ITEM_KEY;
1825         key.offset = device->devid;
1826
1827         ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
1828                                       &key, sizeof(*dev_item));
1829         if (ret)
1830                 goto out;
1831
1832         leaf = path->nodes[0];
1833         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1834
1835         btrfs_set_device_id(leaf, dev_item, device->devid);
1836         btrfs_set_device_generation(leaf, dev_item, 0);
1837         btrfs_set_device_type(leaf, dev_item, device->type);
1838         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1839         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1840         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1841         btrfs_set_device_total_bytes(leaf, dev_item,
1842                                      btrfs_device_get_disk_total_bytes(device));
1843         btrfs_set_device_bytes_used(leaf, dev_item,
1844                                     btrfs_device_get_bytes_used(device));
1845         btrfs_set_device_group(leaf, dev_item, 0);
1846         btrfs_set_device_seek_speed(leaf, dev_item, 0);
1847         btrfs_set_device_bandwidth(leaf, dev_item, 0);
1848         btrfs_set_device_start_offset(leaf, dev_item, 0);
1849
1850         ptr = btrfs_device_uuid(dev_item);
1851         write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1852         ptr = btrfs_device_fsid(dev_item);
1853         write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
1854                             ptr, BTRFS_FSID_SIZE);
1855         btrfs_mark_buffer_dirty(leaf);
1856
1857         ret = 0;
1858 out:
1859         btrfs_free_path(path);
1860         return ret;
1861 }
1862
/*
 * Function to update ctime/mtime for a given device path.
 * Mainly used for ctime/mtime based probes like libblkid.
 */
1867 static void update_dev_time(const char *path_name)
1868 {
1869         struct file *filp;
1870
1871         filp = filp_open(path_name, O_RDWR, 0);
1872         if (IS_ERR(filp))
1873                 return;
1874         file_update_time(filp);
1875         filp_close(filp, NULL);
1876 }
1877
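/*
 * Delete the dev item of @device from the chunk tree. This runs in its own
 * transaction, which is committed on success.
 */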
1878 static int btrfs_rm_dev_item(struct btrfs_device *device)
1879 {
1880         struct btrfs_root *root = device->fs_info->chunk_root;
1881         int ret;
1882         struct btrfs_path *path;
1883         struct btrfs_key key;
1884         struct btrfs_trans_handle *trans;
1885
1886         path = btrfs_alloc_path();
1887         if (!path)
1888                 return -ENOMEM;
1889
1890         trans = btrfs_start_transaction(root, 0);
1891         if (IS_ERR(trans)) {
1892                 btrfs_free_path(path);
1893                 return PTR_ERR(trans);
1894         }
1895         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1896         key.type = BTRFS_DEV_ITEM_KEY;
1897         key.offset = device->devid;
1898
1899         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1900         if (ret) {
1901                 if (ret > 0)
1902                         ret = -ENOENT;
1903                 btrfs_abort_transaction(trans, ret);
1904                 btrfs_end_transaction(trans);
1905                 goto out;
1906         }
1907
1908         ret = btrfs_del_item(trans, root, path);
1909         if (ret) {
1910                 btrfs_abort_transaction(trans, ret);
1911                 btrfs_end_transaction(trans);
1912         }
1913
1914 out:
1915         btrfs_free_path(path);
1916         if (!ret)
1917                 ret = btrfs_commit_transaction(trans);
1918         return ret;
1919 }
1920
/*
 * Verify that @num_devices satisfies the RAID profile constraints in the whole
 * filesystem. It's up to the caller to adjust that number regarding, e.g.,
 * device replace.
 */
1926 static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
1927                 u64 num_devices)
1928 {
1929         u64 all_avail;
1930         unsigned seq;
1931         int i;
1932
1933         do {
1934                 seq = read_seqbegin(&fs_info->profiles_lock);
1935
1936                 all_avail = fs_info->avail_data_alloc_bits |
1937                             fs_info->avail_system_alloc_bits |
1938                             fs_info->avail_metadata_alloc_bits;
1939         } while (read_seqretry(&fs_info->profiles_lock, seq));
1940
1941         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
1942                 if (!(all_avail & btrfs_raid_array[i].bg_flag))
1943                         continue;
1944
1945                 if (num_devices < btrfs_raid_array[i].devs_min) {
1946                         int ret = btrfs_raid_array[i].mindev_error;
1947
1948                         if (ret)
1949                                 return ret;
1950                 }
1951         }
1952
1953         return 0;
1954 }
1955
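/*
 * Return any device in @fs_devs other than @device that is not missing and
 * has a block device attached, or NULL if there is none.
 */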
static struct btrfs_device *btrfs_find_next_active_device(
1957                 struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
1958 {
1959         struct btrfs_device *next_device;
1960
1961         list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
1962                 if (next_device != device &&
1963                     !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
1964                     && next_device->bdev)
1965                         return next_device;
1966         }
1967
1968         return NULL;
1969 }
1970
/*
 * Helper function to check if the given device is part of s_bdev / latest_bdev
 * and replace it with the provided or the next active device. In the context
 * where this function is called, there should always be another device (or
 * this_dev) which is active.
 */
1977 void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
1978                                             struct btrfs_device *next_device)
1979 {
1980         struct btrfs_fs_info *fs_info = device->fs_info;
1981
1982         if (!next_device)
1983                 next_device = btrfs_find_next_active_device(fs_info->fs_devices,
1984                                                             device);
1985         ASSERT(next_device);
1986
1987         if (fs_info->sb->s_bdev &&
1988                         (fs_info->sb->s_bdev == device->bdev))
1989                 fs_info->sb->s_bdev = next_device->bdev;
1990
1991         if (fs_info->fs_devices->latest_bdev == device->bdev)
1992                 fs_info->fs_devices->latest_bdev = next_device->bdev;
1993 }
1994
/*
 * Return btrfs_fs_devices::num_devices excluding the device that's currently
 * being replaced.
 */
1999 static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
2000 {
2001         u64 num_devices = fs_info->fs_devices->num_devices;
2002
2003         down_read(&fs_info->dev_replace.rwsem);
2004         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
2005                 ASSERT(num_devices > 1);
2006                 num_devices--;
2007         }
2008         up_read(&fs_info->dev_replace.rwsem);
2009
2010         return num_devices;
2011 }
2012
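/*
 * Wipe the magic of all superblock copies on @bdev (or reset the superblock
 * log zones on zoned devices) so the device is no longer detected as btrfs,
 * then notify udev and update the path timestamps for libblkid.
 */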
2013 void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
2014                                struct block_device *bdev,
2015                                const char *device_path)
2016 {
2017         struct btrfs_super_block *disk_super;
2018         int copy_num;
2019
2020         if (!bdev)
2021                 return;
2022
2023         for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
2024                 struct page *page;
2025                 int ret;
2026
2027                 disk_super = btrfs_read_dev_one_super(bdev, copy_num);
2028                 if (IS_ERR(disk_super))
2029                         continue;
2030
2031                 if (bdev_is_zoned(bdev)) {
2032                         btrfs_reset_sb_log_zones(bdev, copy_num);
2033                         continue;
2034                 }
2035
2036                 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
2037
2038                 page = virt_to_page(disk_super);
2039                 set_page_dirty(page);
2040                 lock_page(page);
                /* write_one_page() unlocks the page */
2042                 ret = write_one_page(page);
2043                 if (ret)
2044                         btrfs_warn(fs_info,
2045                                 "error clearing superblock number %d (%d)",
2046                                 copy_num, ret);
2047                 btrfs_release_disk_super(disk_super);
2048
2049         }
2050
2051         /* Notify udev that device has changed */
2052         btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
2053
2054         /* Update ctime/mtime for device path for libblkid */
2055         update_dev_time(device_path);
2056 }
2057
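/*
 * Remove a device (identified by @devid, or by @device_path if @devid is 0)
 * from a mounted filesystem: shrink it to zero so its chunks are relocated,
 * delete its dev item, drop it from the device lists and wipe its superblock
 * copies.
 */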
2058 int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
2059                     u64 devid)
2060 {
2061         struct btrfs_device *device;
2062         struct btrfs_fs_devices *cur_devices;
2063         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2064         u64 num_devices;
2065         int ret = 0;
2066
2067         mutex_lock(&uuid_mutex);
2068
2069         num_devices = btrfs_num_devices(fs_info);
2070
2071         ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
2072         if (ret)
2073                 goto out;
2074
2075         device = btrfs_find_device_by_devspec(fs_info, devid, device_path);
2076
2077         if (IS_ERR(device)) {
2078                 if (PTR_ERR(device) == -ENOENT &&
2079                     strcmp(device_path, "missing") == 0)
2080                         ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
2081                 else
2082                         ret = PTR_ERR(device);
2083                 goto out;
2084         }
2085
2086         if (btrfs_pinned_by_swapfile(fs_info, device)) {
2087                 btrfs_warn_in_rcu(fs_info,
2088                   "cannot remove device %s (devid %llu) due to active swapfile",
2089                                   rcu_str_deref(device->name), device->devid);
2090                 ret = -ETXTBSY;
2091                 goto out;
2092         }
2093
2094         if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2095                 ret = BTRFS_ERROR_DEV_TGT_REPLACE;
2096                 goto out;
2097         }
2098
2099         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
2100             fs_info->fs_devices->rw_devices == 1) {
2101                 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
2102                 goto out;
2103         }
2104
2105         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2106                 mutex_lock(&fs_info->chunk_mutex);
2107                 list_del_init(&device->dev_alloc_list);
2108                 device->fs_devices->rw_devices--;
2109                 mutex_unlock(&fs_info->chunk_mutex);
2110         }
2111
2112         mutex_unlock(&uuid_mutex);
2113         ret = btrfs_shrink_device(device, 0);
2114         if (!ret)
2115                 btrfs_reada_remove_dev(device);
2116         mutex_lock(&uuid_mutex);
2117         if (ret)
2118                 goto error_undo;
2119
2120         /*
2121          * TODO: the superblock still includes this device in its num_devices
2122          * counter although write_all_supers() is not locked out. This
2123          * could give a filesystem state which requires a degraded mount.
2124          */
2125         ret = btrfs_rm_dev_item(device);
2126         if (ret)
2127                 goto error_undo;
2128
2129         clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2130         btrfs_scrub_cancel_dev(device);
2131
        /*
         * The device list mutex makes sure that we don't change the device
         * list while someone else is writing out all the device supers.
         * Whoever is writing all supers should lock the device list mutex
         * before getting the number of devices in the super block
         * (super_copy). Conversely, whoever updates the number of devices in
         * the super block (super_copy) should hold the device list mutex.
         */
2141
        /*
         * In normal cases cur_devices == fs_devices. But when deleting a seed
         * device, cur_devices should point to the seed's own fs_devices,
         * which is listed under fs_devices->seed_list.
         */
2147         cur_devices = device->fs_devices;
2148         mutex_lock(&fs_devices->device_list_mutex);
2149         list_del_rcu(&device->dev_list);
2150
2151         cur_devices->num_devices--;
2152         cur_devices->total_devices--;
2153         /* Update total_devices of the parent fs_devices if it's seed */
2154         if (cur_devices != fs_devices)
2155                 fs_devices->total_devices--;
2156
2157         if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
2158                 cur_devices->missing_devices--;
2159
2160         btrfs_assign_next_active_device(device, NULL);
2161
2162         if (device->bdev) {
2163                 cur_devices->open_devices--;
2164                 /* remove sysfs entry */
2165                 btrfs_sysfs_remove_device(device);
2166         }
2167
2168         num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
2169         btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
2170         mutex_unlock(&fs_devices->device_list_mutex);
2171
2172         /*
2173          * at this point, the device is zero sized and detached from
2174          * the devices list.  All that's left is to zero out the old
2175          * supers and free the device.
2176          */
2177         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2178                 btrfs_scratch_superblocks(fs_info, device->bdev,
2179                                           device->name->str);
2180
2181         btrfs_close_bdev(device);
2182         synchronize_rcu();
2183         btrfs_free_device(device);
2184
2185         if (cur_devices->open_devices == 0) {
2186                 list_del_init(&cur_devices->seed_list);
2187                 close_fs_devices(cur_devices);
2188                 free_fs_devices(cur_devices);
2189         }
2190
2191 out:
2192         mutex_unlock(&uuid_mutex);
2193         return ret;
2194
2195 error_undo:
2196         btrfs_reada_undo_remove_dev(device);
2197         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2198                 mutex_lock(&fs_info->chunk_mutex);
2199                 list_add(&device->dev_alloc_list,
2200                          &fs_devices->alloc_list);
2201                 device->fs_devices->rw_devices++;
2202                 mutex_unlock(&fs_info->chunk_mutex);
2203         }
2204         goto out;
2205 }
2206
2207 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
2208 {
2209         struct btrfs_fs_devices *fs_devices;
2210
2211         lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);
2212
        /*
         * In case of an fs with no seed, srcdev->fs_devices will point to the
         * fs_devices of fs_info. However, when the dev being replaced is a
         * seed dev, it will point to the seed's local fs_devices. In short,
         * srcdev will have its correct fs_devices in both cases.
         */
2219         fs_devices = srcdev->fs_devices;
2220
2221         list_del_rcu(&srcdev->dev_list);
2222         list_del(&srcdev->dev_alloc_list);
2223         fs_devices->num_devices--;
2224         if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
2225                 fs_devices->missing_devices--;
2226
2227         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
2228                 fs_devices->rw_devices--;
2229
2230         if (srcdev->bdev)
2231                 fs_devices->open_devices--;
2232 }
2233
2234 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
2235 {
2236         struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
2237
2238         mutex_lock(&uuid_mutex);
2239
2240         btrfs_close_bdev(srcdev);
2241         synchronize_rcu();
2242         btrfs_free_device(srcdev);
2243
        /* If there are no devices left we'd rather delete the fs_devices */
2245         if (!fs_devices->num_devices) {
                /*
                 * On a mounted FS, num_devices can't be zero unless it's a
                 * seed. In case of a seed device being replaced, the replace
                 * target is added to the sprout FS, so there will be no more
                 * devices left under the seed FS.
                 */
2252                 ASSERT(fs_devices->seeding);
2253
2254                 list_del_init(&fs_devices->seed_list);
2255                 close_fs_devices(fs_devices);
2256                 free_fs_devices(fs_devices);
2257         }
2258         mutex_unlock(&uuid_mutex);
2259 }
2260
2261 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
2262 {
2263         struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;
2264
2265         mutex_lock(&fs_devices->device_list_mutex);
2266
2267         btrfs_sysfs_remove_device(tgtdev);
2268
2269         if (tgtdev->bdev)
2270                 fs_devices->open_devices--;
2271
2272         fs_devices->num_devices--;
2273
2274         btrfs_assign_next_active_device(tgtdev, NULL);
2275
2276         list_del_rcu(&tgtdev->dev_list);
2277
2278         mutex_unlock(&fs_devices->device_list_mutex);
2279
        /*
         * The update_dev_time() within btrfs_scratch_superblocks() may lead
         * to a call to btrfs_show_devname() which will try to hold
         * device_list_mutex. Here this device is already out of the device
         * list, so we don't have to hold the device_list_mutex lock.
         */
2287         btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
2288                                   tgtdev->name->str);
2289
2290         btrfs_close_bdev(tgtdev);
2291         synchronize_rcu();
2292         btrfs_free_device(tgtdev);
2293 }
2294
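/*
 * Read the superblock from @device_path and look up the matching
 * btrfs_device by devid and device uuid (matching the metadata_uuid when
 * that incompat feature is set).
 */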
2295 static struct btrfs_device *btrfs_find_device_by_path(
2296                 struct btrfs_fs_info *fs_info, const char *device_path)
2297 {
2298         int ret = 0;
2299         struct btrfs_super_block *disk_super;
2300         u64 devid;
2301         u8 *dev_uuid;
2302         struct block_device *bdev;
2303         struct btrfs_device *device;
2304
2305         ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
2306                                     fs_info->bdev_holder, 0, &bdev, &disk_super);
2307         if (ret)
2308                 return ERR_PTR(ret);
2309
2310         devid = btrfs_stack_device_id(&disk_super->dev_item);
2311         dev_uuid = disk_super->dev_item.uuid;
2312         if (btrfs_fs_incompat(fs_info, METADATA_UUID))
2313                 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2314                                            disk_super->metadata_uuid);
2315         else
2316                 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2317                                            disk_super->fsid);
2318
2319         btrfs_release_disk_super(disk_super);
2320         if (!device)
2321                 device = ERR_PTR(-ENOENT);
2322         blkdev_put(bdev, FMODE_READ);
2323         return device;
2324 }
2325
2326 /*
2327  * Lookup a device given by device id, or the path if the id is 0.
2328  */
2329 struct btrfs_device *btrfs_find_device_by_devspec(
2330                 struct btrfs_fs_info *fs_info, u64 devid,
2331                 const char *device_path)
2332 {
2333         struct btrfs_device *device;
2334
2335         if (devid) {
2336                 device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
2337                                            NULL);
2338                 if (!device)
2339                         return ERR_PTR(-ENOENT);
2340                 return device;
2341         }
2342
2343         if (!device_path || !device_path[0])
2344                 return ERR_PTR(-EINVAL);
2345
2346         if (strcmp(device_path, "missing") == 0) {
2347                 /* Find first missing device */
2348                 list_for_each_entry(device, &fs_info->fs_devices->devices,
2349                                     dev_list) {
2350                         if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
2351                                      &device->dev_state) && !device->bdev)
2352                                 return device;
2353                 }
2354                 return ERR_PTR(-ENOENT);
2355         }
2356
2357         return btrfs_find_device_by_path(fs_info, device_path);
2358 }
2359
/*
 * Does all the dirty work required for changing the file system's UUID.
 */
2363 static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
2364 {
2365         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2366         struct btrfs_fs_devices *old_devices;
2367         struct btrfs_fs_devices *seed_devices;
2368         struct btrfs_super_block *disk_super = fs_info->super_copy;
2369         struct btrfs_device *device;
2370         u64 super_flags;
2371
2372         lockdep_assert_held(&uuid_mutex);
2373         if (!fs_devices->seeding)
2374                 return -EINVAL;
2375
2376         /*
2377          * Private copy of the seed devices, anchored at
2378          * fs_info->fs_devices->seed_list
2379          */
2380         seed_devices = alloc_fs_devices(NULL, NULL);
2381         if (IS_ERR(seed_devices))
2382                 return PTR_ERR(seed_devices);
2383
        /*
         * It's necessary to retain a copy of the original seed fs_devices in
         * fs_uuids so that filesystems which have been seeded can successfully
         * reference the seed device from open_seed_devices. This also supports
         * multiple seed filesystems.
         */
2390         old_devices = clone_fs_devices(fs_devices);
2391         if (IS_ERR(old_devices)) {
2392                 kfree(seed_devices);
2393                 return PTR_ERR(old_devices);
2394         }
2395
2396         list_add(&old_devices->fs_list, &fs_uuids);
2397
2398         memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
2399         seed_devices->opened = 1;
2400         INIT_LIST_HEAD(&seed_devices->devices);
2401         INIT_LIST_HEAD(&seed_devices->alloc_list);
2402         mutex_init(&seed_devices->device_list_mutex);
2403
2404         mutex_lock(&fs_devices->device_list_mutex);
2405         list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
2406                               synchronize_rcu);
2407         list_for_each_entry(device, &seed_devices->devices, dev_list)
2408                 device->fs_devices = seed_devices;
2409
2410         fs_devices->seeding = false;
2411         fs_devices->num_devices = 0;
2412         fs_devices->open_devices = 0;
2413         fs_devices->missing_devices = 0;
2414         fs_devices->rotating = false;
2415         list_add(&seed_devices->seed_list, &fs_devices->seed_list);
2416
2417         generate_random_uuid(fs_devices->fsid);
2418         memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
2419         memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2420         mutex_unlock(&fs_devices->device_list_mutex);
2421
2422         super_flags = btrfs_super_flags(disk_super) &
2423                       ~BTRFS_SUPER_FLAG_SEEDING;
2424         btrfs_set_super_flags(disk_super, super_flags);
2425
2426         return 0;
2427 }
2428
2429 /*
2430  * Store the expected generation for seed devices in device items.
2431  */
2432 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
2433 {
2434         struct btrfs_fs_info *fs_info = trans->fs_info;
2435         struct btrfs_root *root = fs_info->chunk_root;
2436         struct btrfs_path *path;
2437         struct extent_buffer *leaf;
2438         struct btrfs_dev_item *dev_item;
2439         struct btrfs_device *device;
2440         struct btrfs_key key;
2441         u8 fs_uuid[BTRFS_FSID_SIZE];
2442         u8 dev_uuid[BTRFS_UUID_SIZE];
2443         u64 devid;
2444         int ret;
2445
2446         path = btrfs_alloc_path();
2447         if (!path)
2448                 return -ENOMEM;
2449
2450         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2451         key.offset = 0;
2452         key.type = BTRFS_DEV_ITEM_KEY;
2453
2454         while (1) {
2455                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2456                 if (ret < 0)
2457                         goto error;
2458
2459                 leaf = path->nodes[0];
2460 next_slot:
2461                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2462                         ret = btrfs_next_leaf(root, path);
2463                         if (ret > 0)
2464                                 break;
2465                         if (ret < 0)
2466                                 goto error;
2467                         leaf = path->nodes[0];
2468                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2469                         btrfs_release_path(path);
2470                         continue;
2471                 }
2472
2473                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2474                 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
2475                     key.type != BTRFS_DEV_ITEM_KEY)
2476                         break;
2477
2478                 dev_item = btrfs_item_ptr(leaf, path->slots[0],
2479                                           struct btrfs_dev_item);
2480                 devid = btrfs_device_id(leaf, dev_item);
2481                 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
2482                                    BTRFS_UUID_SIZE);
2483                 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
2484                                    BTRFS_FSID_SIZE);
2485                 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2486                                            fs_uuid);
2487                 BUG_ON(!device); /* Logic error */
2488
2489                 if (device->fs_devices->seeding) {
2490                         btrfs_set_device_generation(leaf, dev_item,
2491                                                     device->generation);
2492                         btrfs_mark_buffer_dirty(leaf);
2493                 }
2494
2495                 path->slots[0]++;
2496                 goto next_slot;
2497         }
2498         ret = 0;
2499 error:
2500         btrfs_free_path(path);
2501         return ret;
2502 }
2503
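/*
 * Add the block device at @device_path as a new device of a mounted
 * filesystem. If the filesystem is a seed, this sprouts a new writable
 * filesystem on top of it with a fresh fsid.
 */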
2504 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
2505 {
2506         struct btrfs_root *root = fs_info->dev_root;
2507         struct request_queue *q;
2508         struct btrfs_trans_handle *trans;
2509         struct btrfs_device *device;
2510         struct block_device *bdev;
2511         struct super_block *sb = fs_info->sb;
2512         struct rcu_string *name;
2513         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2514         u64 orig_super_total_bytes;
2515         u64 orig_super_num_devices;
2516         int seeding_dev = 0;
2517         int ret = 0;
2518         bool locked = false;
2519
2520         if (sb_rdonly(sb) && !fs_devices->seeding)
2521                 return -EROFS;
2522
2523         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2524                                   fs_info->bdev_holder);
2525         if (IS_ERR(bdev))
2526                 return PTR_ERR(bdev);
2527
2528         if (!btrfs_check_device_zone_type(fs_info, bdev)) {
2529                 ret = -EINVAL;
2530                 goto error;
2531         }
2532
2533         if (fs_devices->seeding) {
2534                 seeding_dev = 1;
2535                 down_write(&sb->s_umount);
2536                 mutex_lock(&uuid_mutex);
2537                 locked = true;
2538         }
2539
2540         sync_blockdev(bdev);
2541
2542         rcu_read_lock();
2543         list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
2544                 if (device->bdev == bdev) {
2545                         ret = -EEXIST;
2546                         rcu_read_unlock();
2547                         goto error;
2548                 }
2549         }
2550         rcu_read_unlock();
2551
2552         device = btrfs_alloc_device(fs_info, NULL, NULL);
2553         if (IS_ERR(device)) {
2554                 /* we can safely leave the fs_devices entry around */
2555                 ret = PTR_ERR(device);
2556                 goto error;
2557         }
2558
2559         name = rcu_string_strdup(device_path, GFP_KERNEL);
2560         if (!name) {
2561                 ret = -ENOMEM;
2562                 goto error_free_device;
2563         }
2564         rcu_assign_pointer(device->name, name);
2565
2566         device->fs_info = fs_info;
2567         device->bdev = bdev;
2568
2569         ret = btrfs_get_dev_zone_info(device);
2570         if (ret)
2571                 goto error_free_device;
2572
2573         trans = btrfs_start_transaction(root, 0);
2574         if (IS_ERR(trans)) {
2575                 ret = PTR_ERR(trans);
2576                 goto error_free_zone;
2577         }
2578
2579         q = bdev_get_queue(bdev);
2580         set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
2581         device->generation = trans->transid;
2582         device->io_width = fs_info->sectorsize;
2583         device->io_align = fs_info->sectorsize;
2584         device->sector_size = fs_info->sectorsize;
2585         device->total_bytes = round_down(i_size_read(bdev->bd_inode),
2586                                          fs_info->sectorsize);
2587         device->disk_total_bytes = device->total_bytes;
2588         device->commit_total_bytes = device->total_bytes;
2589         set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2590         clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
2591         device->mode = FMODE_EXCL;
2592         device->dev_stats_valid = 1;
2593         set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
2594
2595         if (seeding_dev) {
2596                 sb->s_flags &= ~SB_RDONLY;
2597                 ret = btrfs_prepare_sprout(fs_info);
2598                 if (ret) {
2599                         btrfs_abort_transaction(trans, ret);
2600                         goto error_trans;
2601                 }
2602         }
2603
2604         device->fs_devices = fs_devices;
2605
2606         mutex_lock(&fs_devices->device_list_mutex);
2607         mutex_lock(&fs_info->chunk_mutex);
2608         list_add_rcu(&device->dev_list, &fs_devices->devices);
2609         list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
2610         fs_devices->num_devices++;
2611         fs_devices->open_devices++;
2612         fs_devices->rw_devices++;
2613         fs_devices->total_devices++;
2614         fs_devices->total_rw_bytes += device->total_bytes;
2615
2616         atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
2617
2618         if (!blk_queue_nonrot(q))
2619                 fs_devices->rotating = true;
2620
2621         orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
2622         btrfs_set_super_total_bytes(fs_info->super_copy,
2623                 round_down(orig_super_total_bytes + device->total_bytes,
2624                            fs_info->sectorsize));
2625
2626         orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
2627         btrfs_set_super_num_devices(fs_info->super_copy,
2628                                     orig_super_num_devices + 1);
2629
2630         /*
2631          * we've got more storage, clear any full flags on the space
2632          * infos
2633          */
2634         btrfs_clear_space_info_full(fs_info);
2635
2636         mutex_unlock(&fs_info->chunk_mutex);
2637
2638         /* Add sysfs device entry */
2639         btrfs_sysfs_add_device(device);
2640
2641         mutex_unlock(&fs_devices->device_list_mutex);
2642
2643         if (seeding_dev) {
2644                 mutex_lock(&fs_info->chunk_mutex);
2645                 ret = init_first_rw_device(trans);
2646                 mutex_unlock(&fs_info->chunk_mutex);
2647                 if (ret) {
2648                         btrfs_abort_transaction(trans, ret);
2649                         goto error_sysfs;
2650                 }
2651         }
2652
2653         ret = btrfs_add_dev_item(trans, device);
2654         if (ret) {
2655                 btrfs_abort_transaction(trans, ret);
2656                 goto error_sysfs;
2657         }
2658
2659         if (seeding_dev) {
2660                 ret = btrfs_finish_sprout(trans);
2661                 if (ret) {
2662                         btrfs_abort_transaction(trans, ret);
2663                         goto error_sysfs;
2664                 }
2665
                /*
                 * fs_devices now represents the newly sprouted filesystem and
                 * its fsid has been changed by btrfs_prepare_sprout().
                 */
2670                 btrfs_sysfs_update_sprout_fsid(fs_devices);
2671         }
2672
2673         ret = btrfs_commit_transaction(trans);
2674
2675         if (seeding_dev) {
2676                 mutex_unlock(&uuid_mutex);
2677                 up_write(&sb->s_umount);
2678                 locked = false;
2679
2680                 if (ret) /* transaction commit */
2681                         return ret;
2682
2683                 ret = btrfs_relocate_sys_chunks(fs_info);
2684                 if (ret < 0)
2685                         btrfs_handle_fs_error(fs_info, ret,
2686                                     "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
2687                 trans = btrfs_attach_transaction(root);
2688                 if (IS_ERR(trans)) {
2689                         if (PTR_ERR(trans) == -ENOENT)
2690                                 return 0;
2691                         ret = PTR_ERR(trans);
2692                         trans = NULL;
2693                         goto error_sysfs;
2694                 }
2695                 ret = btrfs_commit_transaction(trans);
2696         }
2697
        /*
         * Now that we have written a new super block to this device, check
         * all the other fs_devices lists to see if device_path alienates any
         * other scanned device.
         * We can ignore the return value as it typically returns -EINVAL and
         * only succeeds if the device was an alien.
         */
2705         btrfs_forget_devices(device_path);
2706
2707         /* Update ctime/mtime for blkid or udev */
2708         update_dev_time(device_path);
2709
2710         return ret;
2711
2712 error_sysfs:
2713         btrfs_sysfs_remove_device(device);
2714         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2715         mutex_lock(&fs_info->chunk_mutex);
2716         list_del_rcu(&device->dev_list);
2717         list_del(&device->dev_alloc_list);
2718         fs_info->fs_devices->num_devices--;
2719         fs_info->fs_devices->open_devices--;
2720         fs_info->fs_devices->rw_devices--;
2721         fs_info->fs_devices->total_devices--;
2722         fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
2723         atomic64_sub(device->total_bytes, &fs_info->free_chunk_space);
2724         btrfs_set_super_total_bytes(fs_info->super_copy,
2725                                     orig_super_total_bytes);
2726         btrfs_set_super_num_devices(fs_info->super_copy,
2727                                     orig_super_num_devices);
2728         mutex_unlock(&fs_info->chunk_mutex);
2729         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2730 error_trans:
2731         if (seeding_dev)
2732                 sb->s_flags |= SB_RDONLY;
2733         if (trans)
2734                 btrfs_end_transaction(trans);
2735 error_free_zone:
2736         btrfs_destroy_dev_zone_info(device);
2737 error_free_device:
2738         btrfs_free_device(device);
2739 error:
2740         blkdev_put(bdev, FMODE_EXCL);
2741         if (locked) {
2742                 mutex_unlock(&uuid_mutex);
2743                 up_write(&sb->s_umount);
2744         }
2745         return ret;
2746 }
2747
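/*
 * Write the current in-memory state of @device (type, geometry and sizes)
 * back to its dev item in the chunk tree.
 */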
2748 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2749                                         struct btrfs_device *device)
2750 {
2751         int ret;
2752         struct btrfs_path *path;
2753         struct btrfs_root *root = device->fs_info->chunk_root;
2754         struct btrfs_dev_item *dev_item;
2755         struct extent_buffer *leaf;
2756         struct btrfs_key key;
2757
2758         path = btrfs_alloc_path();
2759         if (!path)
2760                 return -ENOMEM;
2761
2762         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2763         key.type = BTRFS_DEV_ITEM_KEY;
2764         key.offset = device->devid;
2765
2766         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2767         if (ret < 0)
2768                 goto out;
2769
2770         if (ret > 0) {
2771                 ret = -ENOENT;
2772                 goto out;
2773         }
2774
2775         leaf = path->nodes[0];
2776         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2777
2778         btrfs_set_device_id(leaf, dev_item, device->devid);
2779         btrfs_set_device_type(leaf, dev_item, device->type);
2780         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2781         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2782         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2783         btrfs_set_device_total_bytes(leaf, dev_item,
2784                                      btrfs_device_get_disk_total_bytes(device));
2785         btrfs_set_device_bytes_used(leaf, dev_item,
2786                                     btrfs_device_get_bytes_used(device));
2787         btrfs_mark_buffer_dirty(leaf);
2788
2789 out:
2790         btrfs_free_path(path);
2791         return ret;
2792 }
2793
2794 int btrfs_grow_device(struct btrfs_trans_handle *trans,
2795                       struct btrfs_device *device, u64 new_size)
2796 {
2797         struct btrfs_fs_info *fs_info = device->fs_info;
2798         struct btrfs_super_block *super_copy = fs_info->super_copy;
2799         u64 old_total;
2800         u64 diff;
2801
2802         if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2803                 return -EACCES;
2804
2805         new_size = round_down(new_size, fs_info->sectorsize);
2806
2807         mutex_lock(&fs_info->chunk_mutex);
2808         old_total = btrfs_super_total_bytes(super_copy);
2809         diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
2810
2811         if (new_size <= device->total_bytes ||
2812             test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2813                 mutex_unlock(&fs_info->chunk_mutex);
2814                 return -EINVAL;
2815         }
2816
2817         btrfs_set_super_total_bytes(super_copy,
2818                         round_down(old_total + diff, fs_info->sectorsize));
2819         device->fs_devices->total_rw_bytes += diff;
2820
2821         btrfs_device_set_total_bytes(device, new_size);
2822         btrfs_device_set_disk_total_bytes(device, new_size);
2823         btrfs_clear_space_info_full(device->fs_info);
2824         if (list_empty(&device->post_commit_list))
2825                 list_add_tail(&device->post_commit_list,
2826                               &trans->transaction->dev_update_list);
2827         mutex_unlock(&fs_info->chunk_mutex);
2828
2829         return btrfs_update_device(trans, device);
2830 }
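
/*
 * Illustrative sketch (not part of the build): the round_down() calls above
 * clamp sizes to the filesystem sector size.  For a power-of-two alignment
 * this is just masking off the low bits, e.g. round_down(10000, 4096) ==
 * 8192.  A minimal equivalent, assuming @align is a power of two:
 */
static inline u64 demo_round_down_pow2(u64 value, u64 align)
{
	return value & ~(align - 1);	/* drop the unaligned remainder */
}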
2831
2832 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
2833 {
2834         struct btrfs_fs_info *fs_info = trans->fs_info;
2835         struct btrfs_root *root = fs_info->chunk_root;
2836         int ret;
2837         struct btrfs_path *path;
2838         struct btrfs_key key;
2839
2840         path = btrfs_alloc_path();
2841         if (!path)
2842                 return -ENOMEM;
2843
2844         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2845         key.offset = chunk_offset;
2846         key.type = BTRFS_CHUNK_ITEM_KEY;
2847
2848         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2849         if (ret < 0)
2850                 goto out;
2851         else if (ret > 0) { /* Logic error or corruption */
2852                 btrfs_handle_fs_error(fs_info, -ENOENT,
2853                                       "Failed lookup while freeing chunk.");
2854                 ret = -ENOENT;
2855                 goto out;
2856         }
2857
2858         ret = btrfs_del_item(trans, root, path);
2859         if (ret < 0)
2860                 btrfs_handle_fs_error(fs_info, ret,
2861                                       "Failed to delete chunk item.");
2862 out:
2863         btrfs_free_path(path);
2864         return ret;
2865 }
2866
2867 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2868 {
2869         struct btrfs_super_block *super_copy = fs_info->super_copy;
2870         struct btrfs_disk_key *disk_key;
2871         struct btrfs_chunk *chunk;
2872         u8 *ptr;
2873         int ret = 0;
2874         u32 num_stripes;
2875         u32 array_size;
2876         u32 len = 0;
2877         u32 cur;
2878         struct btrfs_key key;
2879
2880         mutex_lock(&fs_info->chunk_mutex);
2881         array_size = btrfs_super_sys_array_size(super_copy);
2882
2883         ptr = super_copy->sys_chunk_array;
2884         cur = 0;
2885
2886         while (cur < array_size) {
2887                 disk_key = (struct btrfs_disk_key *)ptr;
2888                 btrfs_disk_key_to_cpu(&key, disk_key);
2889
2890                 len = sizeof(*disk_key);
2891
2892                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
2893                         chunk = (struct btrfs_chunk *)(ptr + len);
2894                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
2895                         len += btrfs_chunk_item_size(num_stripes);
2896                 } else {
2897                         ret = -EIO;
2898                         break;
2899                 }
2900                 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
2901                     key.offset == chunk_offset) {
2902                         memmove(ptr, ptr + len, array_size - (cur + len));
2903                         array_size -= len;
2904                         btrfs_set_super_sys_array_size(super_copy, array_size);
2905                 } else {
2906                         ptr += len;
2907                         cur += len;
2908                 }
2909         }
2910         mutex_unlock(&fs_info->chunk_mutex);
2911         return ret;
2912 }
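
/*
 * Illustrative sketch (not part of the build): the superblock sys_chunk_array
 * is a packed sequence of variable-length records, each a btrfs_disk_key
 * followed by a btrfs_chunk whose size depends on its stripe count.  Deleting
 * one record compacts the array in place, exactly like the memmove() above.
 * A generic version of that compaction, with hypothetical parameters:
 */
static inline u32 demo_del_packed_record(u8 *array, u32 array_size,
					 u32 rec_off, u32 rec_len)
{
	/* Shift everything after the record over the record itself. */
	memmove(array + rec_off, array + rec_off + rec_len,
		array_size - (rec_off + rec_len));
	return array_size - rec_len;	/* new size of the packed array */
}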
2913
2914 /*
2915  * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
2916  * @logical: Logical block offset in bytes.
2917  * @length: Length of extent in bytes.
2918  *
2919  * Return: Chunk mapping or ERR_PTR.
2920  */
2921 struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
2922                                        u64 logical, u64 length)
2923 {
2924         struct extent_map_tree *em_tree;
2925         struct extent_map *em;
2926
2927         em_tree = &fs_info->mapping_tree;
2928         read_lock(&em_tree->lock);
2929         em = lookup_extent_mapping(em_tree, logical, length);
2930         read_unlock(&em_tree->lock);
2931
2932         if (!em) {
2933                 btrfs_crit(fs_info, "unable to find logical %llu length %llu",
2934                            logical, length);
2935                 return ERR_PTR(-EINVAL);
2936         }
2937
2938         if (em->start > logical || em->start + em->len < logical) {
2939                 btrfs_crit(fs_info,
2940                            "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
2941                            logical, length, em->start, em->start + em->len);
2942                 free_extent_map(em);
2943                 return ERR_PTR(-EINVAL);
2944         }
2945
2946         /* callers are responsible for dropping em's ref. */
2947         return em;
2948 }
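
/*
 * Illustrative sketch (not part of the build): the sanity check above rejects
 * a mapping that does not cover the requested logical offset.  For a mapping
 * spanning [start, start + len), coverage of an offset reduces to the
 * following predicate:
 */
static inline bool demo_map_covers(u64 start, u64 len, u64 logical)
{
	return start <= logical && logical < start + len;
}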
2949
2950 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
2951 {
2952         struct btrfs_fs_info *fs_info = trans->fs_info;
2953         struct extent_map *em;
2954         struct map_lookup *map;
2955         u64 dev_extent_len = 0;
2956         int i, ret = 0;
2957         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2958
2959         em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
2960         if (IS_ERR(em)) {
2961                 /*
2962                  * This is a logic error, but we don't want to just rely on the
2963                  * user having built with ASSERT enabled, so if ASSERT doesn't
2964                  * do anything we still error out.
2965                  */
2966                 ASSERT(0);
2967                 return PTR_ERR(em);
2968         }
2969         map = em->map_lookup;
2970         mutex_lock(&fs_info->chunk_mutex);
2971         check_system_chunk(trans, map->type);
2972         mutex_unlock(&fs_info->chunk_mutex);
2973
2974         /*
2975          * Take the device list mutex to prevent races with the final phase of
2976          * a device replace operation that replaces the device object associated
2977          * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
2978          */
2979         mutex_lock(&fs_devices->device_list_mutex);
2980         for (i = 0; i < map->num_stripes; i++) {
2981                 struct btrfs_device *device = map->stripes[i].dev;
2982                 ret = btrfs_free_dev_extent(trans, device,
2983                                             map->stripes[i].physical,
2984                                             &dev_extent_len);
2985                 if (ret) {
2986                         mutex_unlock(&fs_devices->device_list_mutex);
2987                         btrfs_abort_transaction(trans, ret);
2988                         goto out;
2989                 }
2990
2991                 if (device->bytes_used > 0) {
2992                         mutex_lock(&fs_info->chunk_mutex);
2993                         btrfs_device_set_bytes_used(device,
2994                                         device->bytes_used - dev_extent_len);
2995                         atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
2996                         btrfs_clear_space_info_full(fs_info);
2997                         mutex_unlock(&fs_info->chunk_mutex);
2998                 }
2999
3000                 ret = btrfs_update_device(trans, device);
3001                 if (ret) {
3002                         mutex_unlock(&fs_devices->device_list_mutex);
3003                         btrfs_abort_transaction(trans, ret);
3004                         goto out;
3005                 }
3006         }
3007         mutex_unlock(&fs_devices->device_list_mutex);
3008
3009         ret = btrfs_free_chunk(trans, chunk_offset);
3010         if (ret) {
3011                 btrfs_abort_transaction(trans, ret);
3012                 goto out;
3013         }
3014
3015         trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
3016
3017         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
3018                 ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
3019                 if (ret) {
3020                         btrfs_abort_transaction(trans, ret);
3021                         goto out;
3022                 }
3023         }
3024
3025         ret = btrfs_remove_block_group(trans, chunk_offset, em);
3026         if (ret) {
3027                 btrfs_abort_transaction(trans, ret);
3028                 goto out;
3029         }
3030
3031 out:
3032         /* once for us */
3033         free_extent_map(em);
3034         return ret;
3035 }
3036
3037 static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
3038 {
3039         struct btrfs_root *root = fs_info->chunk_root;
3040         struct btrfs_trans_handle *trans;
3041         struct btrfs_block_group *block_group;
3042         int ret;
3043
3044         /*
3045          * Prevent races with automatic removal of unused block groups.
3046          * After we relocate and before we remove the chunk with offset
3047          * chunk_offset, automatic removal of the block group can kick in,
3048          * resulting in a failure when calling btrfs_remove_chunk() below.
3049          *
3050          * Make sure to acquire this mutex before doing a tree search (dev
3051          * or chunk trees) to find chunks. Otherwise the cleaner kthread might
3052          * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
3053          * we release the path used to search the chunk/dev tree and before
3054          * the current task acquires this mutex and calls us.
3055          */
3056         lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);
3057
3058         /* step one, relocate all the extents inside this chunk */
3059         btrfs_scrub_pause(fs_info);
3060         ret = btrfs_relocate_block_group(fs_info, chunk_offset);
3061         btrfs_scrub_continue(fs_info);
3062         if (ret)
3063                 return ret;
3064
3065         block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
3066         if (!block_group)
3067                 return -ENOENT;
3068         btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
3069         btrfs_put_block_group(block_group);
3070
3071         trans = btrfs_start_trans_remove_block_group(root->fs_info,
3072                                                      chunk_offset);
3073         if (IS_ERR(trans)) {
3074                 ret = PTR_ERR(trans);
3075                 btrfs_handle_fs_error(root->fs_info, ret, NULL);
3076                 return ret;
3077         }
3078
3079         /*
3080          * step two, delete the device extents and the
3081          * chunk tree entries
3082          */
3083         ret = btrfs_remove_chunk(trans, chunk_offset);
3084         btrfs_end_transaction(trans);
3085         return ret;
3086 }
3087
3088 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
3089 {
3090         struct btrfs_root *chunk_root = fs_info->chunk_root;
3091         struct btrfs_path *path;
3092         struct extent_buffer *leaf;
3093         struct btrfs_chunk *chunk;
3094         struct btrfs_key key;
3095         struct btrfs_key found_key;
3096         u64 chunk_type;
3097         bool retried = false;
3098         int failed = 0;
3099         int ret;
3100
3101         path = btrfs_alloc_path();
3102         if (!path)
3103                 return -ENOMEM;
3104
3105 again:
3106         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3107         key.offset = (u64)-1;
3108         key.type = BTRFS_CHUNK_ITEM_KEY;
3109
3110         while (1) {
3111                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
3112                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3113                 if (ret < 0) {
3114                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3115                         goto error;
3116                 }
3117                 BUG_ON(ret == 0); /* Corruption */
3118
3119                 ret = btrfs_previous_item(chunk_root, path, key.objectid,
3120                                           key.type);
3121                 if (ret)
3122                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3123                 if (ret < 0)
3124                         goto error;
3125                 if (ret > 0)
3126                         break;
3127
3128                 leaf = path->nodes[0];
3129                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3130
3131                 chunk = btrfs_item_ptr(leaf, path->slots[0],
3132                                        struct btrfs_chunk);
3133                 chunk_type = btrfs_chunk_type(leaf, chunk);
3134                 btrfs_release_path(path);
3135
3136                 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
3137                         ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3138                         if (ret == -ENOSPC)
3139                                 failed++;
3140                         else
3141                                 BUG_ON(ret);
3142                 }
3143                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3144
3145                 if (found_key.offset == 0)
3146                         break;
3147                 key.offset = found_key.offset - 1;
3148         }
3149         ret = 0;
3150         if (failed && !retried) {
3151                 failed = 0;
3152                 retried = true;
3153                 goto again;
3154         } else if (WARN_ON(failed && retried)) {
3155                 ret = -ENOSPC;
3156         }
3157 error:
3158         btrfs_free_path(path);
3159         return ret;
3160 }
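
/*
 * Illustrative sketch (not part of the build): the failed/retried pair above
 * implements exactly one extra pass, on the theory that earlier relocations
 * may have freed enough space for the chunks that first returned -ENOSPC.
 * The control flow, reduced to a skeleton with a hypothetical worker:
 */
static inline int demo_retry_once(int (*one_pass)(void *arg), void *arg)
{
	int ret = one_pass(arg);

	if (ret == -ENOSPC)
		ret = one_pass(arg);	/* one retry, then give up */
	return ret;
}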
3161
3162 /*
3163  * Return 1 : a data chunk was allocated successfully,
3164  * Return <0: an error occurred while allocating a data chunk,
3165  * Return 0 : no need to allocate a data chunk.
3166  */
3167 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
3168                                       u64 chunk_offset)
3169 {
3170         struct btrfs_block_group *cache;
3171         u64 bytes_used;
3172         u64 chunk_type;
3173
3174         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3175         ASSERT(cache);
3176         chunk_type = cache->flags;
3177         btrfs_put_block_group(cache);
3178
3179         if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA))
3180                 return 0;
3181
3182         spin_lock(&fs_info->data_sinfo->lock);
3183         bytes_used = fs_info->data_sinfo->bytes_used;
3184         spin_unlock(&fs_info->data_sinfo->lock);
3185
3186         if (!bytes_used) {
3187                 struct btrfs_trans_handle *trans;
3188                 int ret;
3189
3190                 trans = btrfs_join_transaction(fs_info->tree_root);
3191                 if (IS_ERR(trans))
3192                         return PTR_ERR(trans);
3193
3194                 ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
3195                 btrfs_end_transaction(trans);
3196                 if (ret < 0)
3197                         return ret;
3198                 return 1;
3199         }
3200
3201         return 0;
3202 }
3203
3204 static int insert_balance_item(struct btrfs_fs_info *fs_info,
3205                                struct btrfs_balance_control *bctl)
3206 {
3207         struct btrfs_root *root = fs_info->tree_root;
3208         struct btrfs_trans_handle *trans;
3209         struct btrfs_balance_item *item;
3210         struct btrfs_disk_balance_args disk_bargs;
3211         struct btrfs_path *path;
3212         struct extent_buffer *leaf;
3213         struct btrfs_key key;
3214         int ret, err;
3215
3216         path = btrfs_alloc_path();
3217         if (!path)
3218                 return -ENOMEM;
3219
3220         trans = btrfs_start_transaction(root, 0);
3221         if (IS_ERR(trans)) {
3222                 btrfs_free_path(path);
3223                 return PTR_ERR(trans);
3224         }
3225
3226         key.objectid = BTRFS_BALANCE_OBJECTID;
3227         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3228         key.offset = 0;
3229
3230         ret = btrfs_insert_empty_item(trans, root, path, &key,
3231                                       sizeof(*item));
3232         if (ret)
3233                 goto out;
3234
3235         leaf = path->nodes[0];
3236         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3237
3238         memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
3239
3240         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
3241         btrfs_set_balance_data(leaf, item, &disk_bargs);
3242         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
3243         btrfs_set_balance_meta(leaf, item, &disk_bargs);
3244         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
3245         btrfs_set_balance_sys(leaf, item, &disk_bargs);
3246
3247         btrfs_set_balance_flags(leaf, item, bctl->flags);
3248
3249         btrfs_mark_buffer_dirty(leaf);
3250 out:
3251         btrfs_free_path(path);
3252         err = btrfs_commit_transaction(trans);
3253         if (err && !ret)
3254                 ret = err;
3255         return ret;
3256 }
3257
3258 static int del_balance_item(struct btrfs_fs_info *fs_info)
3259 {
3260         struct btrfs_root *root = fs_info->tree_root;
3261         struct btrfs_trans_handle *trans;
3262         struct btrfs_path *path;
3263         struct btrfs_key key;
3264         int ret, err;
3265
3266         path = btrfs_alloc_path();
3267         if (!path)
3268                 return -ENOMEM;
3269
3270         trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
3271         if (IS_ERR(trans)) {
3272                 btrfs_free_path(path);
3273                 return PTR_ERR(trans);
3274         }
3275
3276         key.objectid = BTRFS_BALANCE_OBJECTID;
3277         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3278         key.offset = 0;
3279
3280         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3281         if (ret < 0)
3282                 goto out;
3283         if (ret > 0) {
3284                 ret = -ENOENT;
3285                 goto out;
3286         }
3287
3288         ret = btrfs_del_item(trans, root, path);
3289 out:
3290         btrfs_free_path(path);
3291         err = btrfs_commit_transaction(trans);
3292         if (err && !ret)
3293                 ret = err;
3294         return ret;
3295 }
3296
3297 /*
3298  * This is a heuristic used to reduce the number of chunks balanced on
3299  * resume after balance was interrupted.
3300  */
3301 static void update_balance_args(struct btrfs_balance_control *bctl)
3302 {
3303         /*
3304          * Turn on soft mode for chunk types that were being converted.
3305          */
3306         if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
3307                 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
3308         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
3309                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
3310         if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
3311                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
3312
3313         /*
3314          * Turn on usage filter if it is not already used.  The idea is
3315          * that chunks that we have already balanced should be
3316          * reasonably full.  Don't do it for chunks that are being
3317          * converted - that will keep us from relocating unconverted
3318          * (albeit full) chunks.
3319          */
3320         if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3321             !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3322             !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3323                 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
3324                 bctl->data.usage = 90;
3325         }
3326         if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3327             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3328             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3329                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
3330                 bctl->sys.usage = 90;
3331         }
3332         if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3333             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3334             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3335                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
3336                 bctl->meta.usage = 90;
3337         }
3338 }
3339
3340 /*
3341  * Clear the balance status in fs_info and delete the balance item from disk.
3342  */
3343 static void reset_balance_state(struct btrfs_fs_info *fs_info)
3344 {
3345         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3346         int ret;
3347
3348         BUG_ON(!fs_info->balance_ctl);
3349
3350         spin_lock(&fs_info->balance_lock);
3351         fs_info->balance_ctl = NULL;
3352         spin_unlock(&fs_info->balance_lock);
3353
3354         kfree(bctl);
3355         ret = del_balance_item(fs_info);
3356         if (ret)
3357                 btrfs_handle_fs_error(fs_info, ret, NULL);
3358 }
3359
3360 /*
3361  * Balance filters.  Return 1 if chunk should be filtered out
3362  * (should not be balanced).
3363  */
3364 static int chunk_profiles_filter(u64 chunk_type,
3365                                  struct btrfs_balance_args *bargs)
3366 {
3367         chunk_type = chunk_to_extended(chunk_type) &
3368                                 BTRFS_EXTENDED_PROFILE_MASK;
3369
3370         if (bargs->profiles & chunk_type)
3371                 return 0;
3372
3373         return 1;
3374 }
3375
3376 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
3377                               struct btrfs_balance_args *bargs)
3378 {
3379         struct btrfs_block_group *cache;
3380         u64 chunk_used;
3381         u64 user_thresh_min;
3382         u64 user_thresh_max;
3383         int ret = 1;
3384
3385         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3386         chunk_used = cache->used;
3387
3388         if (bargs->usage_min == 0)
3389                 user_thresh_min = 0;
3390         else
3391                 user_thresh_min = div_factor_fine(cache->length,
3392                                                   bargs->usage_min);
3393
3394         if (bargs->usage_max == 0)
3395                 user_thresh_max = 1;
3396         else if (bargs->usage_max > 100)
3397                 user_thresh_max = cache->length;
3398         else
3399                 user_thresh_max = div_factor_fine(cache->length,
3400                                                   bargs->usage_max);
3401
3402         if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
3403                 ret = 0;
3404
3405         btrfs_put_block_group(cache);
3406         return ret;
3407 }
3408
3409 static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
3410                 u64 chunk_offset, struct btrfs_balance_args *bargs)
3411 {
3412         struct btrfs_block_group *cache;
3413         u64 chunk_used, user_thresh;
3414         int ret = 1;
3415
3416         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3417         chunk_used = cache->used;
3418
3419         if (bargs->usage_min == 0)
3420                 user_thresh = 1;
3421         else if (bargs->usage > 100)
3422                 user_thresh = cache->length;
3423         else
3424                 user_thresh = div_factor_fine(cache->length, bargs->usage);
3425
3426         if (chunk_used < user_thresh)
3427                 ret = 0;
3428
3429         btrfs_put_block_group(cache);
3430         return ret;
3431 }
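
/*
 * Illustrative sketch (not part of the build): both usage filters above turn
 * a percentage into a byte threshold against the block group length via
 * div_factor_fine().  The underlying math, assuming the multiplication does
 * not overflow: a 1 GiB chunk with usage=50 yields a 512 MiB threshold, and
 * chunks used less than the threshold are balanced.
 */
static inline u64 demo_usage_threshold(u64 length, u32 percent)
{
	return div_u64(length * percent, 100);
}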
3432
3433 static int chunk_devid_filter(struct extent_buffer *leaf,
3434                               struct btrfs_chunk *chunk,
3435                               struct btrfs_balance_args *bargs)
3436 {
3437         struct btrfs_stripe *stripe;
3438         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3439         int i;
3440
3441         for (i = 0; i < num_stripes; i++) {
3442                 stripe = btrfs_stripe_nr(chunk, i);
3443                 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
3444                         return 0;
3445         }
3446
3447         return 1;
3448 }
3449
3450 static u64 calc_data_stripes(u64 type, int num_stripes)
3451 {
3452         const int index = btrfs_bg_flags_to_raid_index(type);
3453         const int ncopies = btrfs_raid_array[index].ncopies;
3454         const int nparity = btrfs_raid_array[index].nparity;
3455
3456         if (nparity)
3457                 return num_stripes - nparity;
3458         else
3459                 return num_stripes / ncopies;
3460 }
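
/*
 * Worked examples for the calculation above, using the profile parameters
 * from btrfs_raid_array:
 *
 *	RAID5  (nparity = 1): 4 stripes -> 4 - 1 = 3 data stripes
 *	RAID6  (nparity = 2): 4 stripes -> 4 - 2 = 2 data stripes
 *	RAID1  (ncopies = 2): 2 stripes -> 2 / 2 = 1 data stripe
 *	RAID10 (ncopies = 2): 4 stripes -> 4 / 2 = 2 data stripes
 */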
3461
3462 /* [pstart, pend) */
3463 static int chunk_drange_filter(struct extent_buffer *leaf,
3464                                struct btrfs_chunk *chunk,
3465                                struct btrfs_balance_args *bargs)
3466 {
3467         struct btrfs_stripe *stripe;
3468         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3469         u64 stripe_offset;
3470         u64 stripe_length;
3471         u64 type;
3472         int factor;
3473         int i;
3474
3475         if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
3476                 return 0;
3477
3478         type = btrfs_chunk_type(leaf, chunk);
3479         factor = calc_data_stripes(type, num_stripes);
3480
3481         for (i = 0; i < num_stripes; i++) {
3482                 stripe = btrfs_stripe_nr(chunk, i);
3483                 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
3484                         continue;
3485
3486                 stripe_offset = btrfs_stripe_offset(leaf, stripe);
3487                 stripe_length = btrfs_chunk_length(leaf, chunk);
3488                 stripe_length = div_u64(stripe_length, factor);
3489
3490                 if (stripe_offset < bargs->pend &&
3491                     stripe_offset + stripe_length > bargs->pstart)
3492                         return 0;
3493         }
3494
3495         return 1;
3496 }
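
/*
 * Illustrative sketch (not part of the build): the drange test above is the
 * standard overlap check for two half-open intervals [a_start, a_end) and
 * [b_start, b_end), which intersect iff each one starts before the other
 * ends:
 */
static inline bool demo_ranges_overlap(u64 a_start, u64 a_end,
				       u64 b_start, u64 b_end)
{
	return a_start < b_end && b_start < a_end;
}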
3497
3498 /* [vstart, vend) */
3499 static int chunk_vrange_filter(struct extent_buffer *leaf,
3500                                struct btrfs_chunk *chunk,
3501                                u64 chunk_offset,
3502                                struct btrfs_balance_args *bargs)
3503 {
3504         if (chunk_offset < bargs->vend &&
3505             chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
3506                 /* at least part of the chunk is inside this vrange */
3507                 return 0;
3508
3509         return 1;
3510 }
3511
3512 static int chunk_stripes_range_filter(struct extent_buffer *leaf,
3513                                struct btrfs_chunk *chunk,
3514                                struct btrfs_balance_args *bargs)
3515 {
3516         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3517
3518         if (bargs->stripes_min <= num_stripes
3519                         && num_stripes <= bargs->stripes_max)
3520                 return 0;
3521
3522         return 1;
3523 }
3524
3525 static int chunk_soft_convert_filter(u64 chunk_type,
3526                                      struct btrfs_balance_args *bargs)
3527 {
3528         if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3529                 return 0;
3530
3531         chunk_type = chunk_to_extended(chunk_type) &
3532                                 BTRFS_EXTENDED_PROFILE_MASK;
3533
3534         if (bargs->target == chunk_type)
3535                 return 1;
3536
3537         return 0;
3538 }
3539
3540 static int should_balance_chunk(struct extent_buffer *leaf,
3541                                 struct btrfs_chunk *chunk, u64 chunk_offset)
3542 {
3543         struct btrfs_fs_info *fs_info = leaf->fs_info;
3544         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3545         struct btrfs_balance_args *bargs = NULL;
3546         u64 chunk_type = btrfs_chunk_type(leaf, chunk);
3547
3548         /* type filter */
3549         if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
3550               (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
3551                 return 0;
3552         }
3553
3554         if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3555                 bargs = &bctl->data;
3556         else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3557                 bargs = &bctl->sys;
3558         else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3559                 bargs = &bctl->meta;
3560
3561         /* profiles filter */
3562         if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
3563             chunk_profiles_filter(chunk_type, bargs)) {
3564                 return 0;
3565         }
3566
3567         /* usage filter */
3568         if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
3569             chunk_usage_filter(fs_info, chunk_offset, bargs)) {
3570                 return 0;
3571         } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3572             chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
3573                 return 0;
3574         }
3575
3576         /* devid filter */
3577         if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
3578             chunk_devid_filter(leaf, chunk, bargs)) {
3579                 return 0;
3580         }
3581
3582         /* drange filter, makes sense only with devid filter */
3583         if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
3584             chunk_drange_filter(leaf, chunk, bargs)) {
3585                 return 0;
3586         }
3587
3588         /* vrange filter */
3589         if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
3590             chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
3591                 return 0;
3592         }
3593
3594         /* stripes filter */
3595         if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
3596             chunk_stripes_range_filter(leaf, chunk, bargs)) {
3597                 return 0;
3598         }
3599
3600         /* soft profile changing mode */
3601         if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
3602             chunk_soft_convert_filter(chunk_type, bargs)) {
3603                 return 0;
3604         }
3605
3606         /*
3607          * limited by count, must be the last filter
3608          */
3609         if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
3610                 if (bargs->limit == 0)
3611                         return 0;
3612                 else
3613                         bargs->limit--;
3614         } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
3615                 /*
3616                  * Same logic as the 'limit' filter; the minimum cannot be
3617                  * determined here because we do not have the global information
3618                  * about the count of all chunks that satisfy the filters.
3619                  */
3620                 if (bargs->limit_max == 0)
3621                         return 0;
3622                 else
3623                         bargs->limit_max--;
3624         }
3625
3626         return 1;
3627 }
3628
3629 static int __btrfs_balance(struct btrfs_fs_info *fs_info)
3630 {
3631         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3632         struct btrfs_root *chunk_root = fs_info->chunk_root;
3633         u64 chunk_type;
3634         struct btrfs_chunk *chunk;
3635         struct btrfs_path *path = NULL;
3636         struct btrfs_key key;
3637         struct btrfs_key found_key;
3638         struct extent_buffer *leaf;
3639         int slot;
3640         int ret;
3641         int enospc_errors = 0;
3642         bool counting = true;
3643         /* The single value limit and min/max limits use the same bytes in the args union, stash them first */
3644         u64 limit_data = bctl->data.limit;
3645         u64 limit_meta = bctl->meta.limit;
3646         u64 limit_sys = bctl->sys.limit;
3647         u32 count_data = 0;
3648         u32 count_meta = 0;
3649         u32 count_sys = 0;
3650         int chunk_reserved = 0;
3651
3652         path = btrfs_alloc_path();
3653         if (!path) {
3654                 ret = -ENOMEM;
3655                 goto error;
3656         }
3657
3658         /* zero out stat counters */
3659         spin_lock(&fs_info->balance_lock);
3660         memset(&bctl->stat, 0, sizeof(bctl->stat));
3661         spin_unlock(&fs_info->balance_lock);
3662 again:
3663         if (!counting) {
3664                 /*
3665                  * The single value limit and min/max limits use the same
3666                  * bytes in the args union; restore the saved values here.
3667                  */
3668                 bctl->data.limit = limit_data;
3669                 bctl->meta.limit = limit_meta;
3670                 bctl->sys.limit = limit_sys;
3671         }
3672         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3673         key.offset = (u64)-1;
3674         key.type = BTRFS_CHUNK_ITEM_KEY;
3675
3676         while (1) {
3677                 if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
3678                     atomic_read(&fs_info->balance_cancel_req)) {
3679                         ret = -ECANCELED;
3680                         goto error;
3681                 }
3682
3683                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
3684                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3685                 if (ret < 0) {
3686                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3687                         goto error;
3688                 }
3689
3690                 /*
3691                  * This shouldn't happen; it means the previous
3692                  * relocation failed.
3693                  */
3694                 if (ret == 0)
3695                         BUG(); /* FIXME break ? */
3696
3697                 ret = btrfs_previous_item(chunk_root, path, 0,
3698                                           BTRFS_CHUNK_ITEM_KEY);
3699                 if (ret) {
3700                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3701                         ret = 0;
3702                         break;
3703                 }
3704
3705                 leaf = path->nodes[0];
3706                 slot = path->slots[0];
3707                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3708
3709                 if (found_key.objectid != key.objectid) {
3710                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3711                         break;
3712                 }
3713
3714                 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3715                 chunk_type = btrfs_chunk_type(leaf, chunk);
3716
3717                 if (!counting) {
3718                         spin_lock(&fs_info->balance_lock);
3719                         bctl->stat.considered++;
3720                         spin_unlock(&fs_info->balance_lock);
3721                 }
3722
3723                 ret = should_balance_chunk(leaf, chunk, found_key.offset);
3724
3725                 btrfs_release_path(path);
3726                 if (!ret) {
3727                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3728                         goto loop;
3729                 }
3730
3731                 if (counting) {
3732                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3733                         spin_lock(&fs_info->balance_lock);
3734                         bctl->stat.expected++;
3735                         spin_unlock(&fs_info->balance_lock);
3736
3737                         if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3738                                 count_data++;
3739                         else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3740                                 count_sys++;
3741                         else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3742                                 count_meta++;
3743
3744                         goto loop;
3745                 }
3746
3747                 /*
3748                  * Apply the limit_min filter; no need to check whether the
3749                  * LIMITS filter is used, as limit_min is 0 by default.
3750                  */
3751                 if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
3752                                         count_data < bctl->data.limit_min)
3753                                 || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
3754                                         count_meta < bctl->meta.limit_min)
3755                                 || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
3756                                         count_sys < bctl->sys.limit_min)) {
3757                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3758                         goto loop;
3759                 }
3760
3761                 if (!chunk_reserved) {
3762                         /*
3763                          * We may be relocating the only data chunk we have,
3764                          * which could potentially end up losing the data
3765                          * raid profile, so let's allocate an empty one in
3766                          * advance.
3767                          */
3768                         ret = btrfs_may_alloc_data_chunk(fs_info,
3769                                                          found_key.offset);
3770                         if (ret < 0) {
3771                                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3772                                 goto error;
3773                         } else if (ret == 1) {
3774                                 chunk_reserved = 1;
3775                         }
3776                 }
3777
3778                 ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3779                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3780                 if (ret == -ENOSPC) {
3781                         enospc_errors++;
3782                 } else if (ret == -ETXTBSY) {
3783                         btrfs_info(fs_info,
3784            "skipping relocation of block group %llu due to active swapfile",
3785                                    found_key.offset);
3786                         ret = 0;
3787                 } else if (ret) {
3788                         goto error;
3789                 } else {
3790                         spin_lock(&fs_info->balance_lock);
3791                         bctl->stat.completed++;
3792                         spin_unlock(&fs_info->balance_lock);
3793                 }
3794 loop:
3795                 if (found_key.offset == 0)
3796                         break;
3797                 key.offset = found_key.offset - 1;
3798         }
3799
3800         if (counting) {
3801                 btrfs_release_path(path);
3802                 counting = false;
3803                 goto again;
3804         }
3805 error:
3806         btrfs_free_path(path);
3807         if (enospc_errors) {
3808                 btrfs_info(fs_info, "%d enospc errors during balance",
3809                            enospc_errors);
3810                 if (!ret)
3811                         ret = -ENOSPC;
3812         }
3813
3814         return ret;
3815 }
3816
3817 /**
3818  * alloc_profile_is_valid - see if a given profile is valid and reduced
3819  * @flags: profile to validate
3820  * @extended: if true @flags is treated as an extended profile
3821  */
3822 static int alloc_profile_is_valid(u64 flags, int extended)
3823 {
3824         u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
3825                                BTRFS_BLOCK_GROUP_PROFILE_MASK);
3826
3827         flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
3828
3829         /* 1) check that all other bits are zeroed */
3830         if (flags & ~mask)
3831                 return 0;
3832
3833         /* 2) see if profile is reduced */
3834         if (flags == 0)
3835                 return !extended; /* "0" is valid for usual profiles */
3836
3837         return has_single_bit_set(flags);
3838 }
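
/*
 * Illustrative sketch (not part of the build): checking that a reduced
 * profile has a single bit set boils down to the classic power-of-two test;
 * clearing the lowest set bit must leave zero behind:
 */
static inline bool demo_single_bit_set(u64 flags)
{
	return flags != 0 && (flags & (flags - 1)) == 0;
}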
3839
3840 static inline int balance_need_close(struct btrfs_fs_info *fs_info)
3841 {
3842         /* cancel requested || normal exit path */
3843         return atomic_read(&fs_info->balance_cancel_req) ||
3844                 (atomic_read(&fs_info->balance_pause_req) == 0 &&
3845                  atomic_read(&fs_info->balance_cancel_req) == 0);
3846 }
3847
3848 /*
3849  * Validate target profile against allowed profiles and return true if it's OK.
3850  * Otherwise print the error message and return false.
3851  */
3852 static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
3853                 const struct btrfs_balance_args *bargs,
3854                 u64 allowed, const char *type)
3855 {
3856         if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3857                 return true;
3858
3859         /* Profile is valid and does not have bits outside of the allowed set */
3860         if (alloc_profile_is_valid(bargs->target, 1) &&
3861             (bargs->target & ~allowed) == 0)
3862                 return true;
3863
3864         btrfs_err(fs_info, "balance: invalid convert %s profile %s",
3865                         type, btrfs_bg_type_to_raid_name(bargs->target));
3866         return false;
3867 }
3868
3869 /*
3870  * Fill @buf with textual description of balance filter flags @bargs, up to
3871  * @size_buf including the terminating null. The output may be trimmed if it
3872  * does not fit into the provided buffer.
3873  */
3874 static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
3875                                  u32 size_buf)
3876 {
3877         int ret;
3878         u32 size_bp = size_buf;
3879         char *bp = buf;
3880         u64 flags = bargs->flags;
3881         char tmp_buf[128] = {'\0'};
3882
3883         if (!flags)
3884                 return;
3885
3886 #define CHECK_APPEND_NOARG(a)                                           \
3887         do {                                                            \
3888                 ret = snprintf(bp, size_bp, (a));                       \
3889                 if (ret < 0 || ret >= size_bp)                          \
3890                         goto out_overflow;                              \
3891                 size_bp -= ret;                                         \
3892                 bp += ret;                                              \
3893         } while (0)
3894
3895 #define CHECK_APPEND_1ARG(a, v1)                                        \
3896         do {                                                            \
3897                 ret = snprintf(bp, size_bp, (a), (v1));                 \
3898                 if (ret < 0 || ret >= size_bp)                          \
3899                         goto out_overflow;                              \
3900                 size_bp -= ret;                                         \
3901                 bp += ret;                                              \
3902         } while (0)
3903
3904 #define CHECK_APPEND_2ARG(a, v1, v2)                                    \
3905         do {                                                            \
3906                 ret = snprintf(bp, size_bp, (a), (v1), (v2));           \
3907                 if (ret < 0 || ret >= size_bp)                          \
3908                         goto out_overflow;                              \
3909                 size_bp -= ret;                                         \
3910                 bp += ret;                                              \
3911         } while (0)
3912
3913         if (flags & BTRFS_BALANCE_ARGS_CONVERT)
3914                 CHECK_APPEND_1ARG("convert=%s,",
3915                                   btrfs_bg_type_to_raid_name(bargs->target));
3916
3917         if (flags & BTRFS_BALANCE_ARGS_SOFT)
3918                 CHECK_APPEND_NOARG("soft,");
3919
3920         if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
3921                 btrfs_describe_block_groups(bargs->profiles, tmp_buf,
3922                                             sizeof(tmp_buf));
3923                 CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
3924         }
3925
3926         if (flags & BTRFS_BALANCE_ARGS_USAGE)
3927                 CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);
3928
3929         if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
3930                 CHECK_APPEND_2ARG("usage=%u..%u,",
3931                                   bargs->usage_min, bargs->usage_max);
3932
3933         if (flags & BTRFS_BALANCE_ARGS_DEVID)
3934                 CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);
3935
3936         if (flags & BTRFS_BALANCE_ARGS_DRANGE)
3937                 CHECK_APPEND_2ARG("drange=%llu..%llu,",
3938                                   bargs->pstart, bargs->pend);
3939
3940         if (flags & BTRFS_BALANCE_ARGS_VRANGE)
3941                 CHECK_APPEND_2ARG("vrange=%llu..%llu,",
3942                                   bargs->vstart, bargs->vend);
3943
3944         if (flags & BTRFS_BALANCE_ARGS_LIMIT)
3945                 CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);
3946
3947         if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
3948                 CHECK_APPEND_2ARG("limit=%u..%u,",
3949                                 bargs->limit_min, bargs->limit_max);
3950
3951         if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
3952                 CHECK_APPEND_2ARG("stripes=%u..%u,",
3953                                   bargs->stripes_min, bargs->stripes_max);
3954
3955 #undef CHECK_APPEND_2ARG
3956 #undef CHECK_APPEND_1ARG
3957 #undef CHECK_APPEND_NOARG
3958
3959 out_overflow:
3960
3961         if (size_bp < size_buf)
3962                 buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
3963         else
3964                 buf[0] = '\0';
3965 }
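
/*
 * Illustrative sketch (not part of the build): the CHECK_APPEND_* macros
 * above implement bounded string building on top of snprintf(), whose return
 * value is the length the full output *would* have had; a value >= the
 * remaining space means truncation.  The same pattern as a hypothetical
 * helper that advances the cursor only on complete writes:
 */
static inline bool demo_buf_append(char **bp, u32 *size_bp, const char *str)
{
	int ret = snprintf(*bp, *size_bp, "%s", str);

	if (ret < 0 || ret >= *size_bp)
		return false;	/* truncated; cursor left untouched */
	*bp += ret;
	*size_bp -= ret;
	return true;
}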
3966
3967 static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
3968 {
3969         u32 size_buf = 1024;
3970         char tmp_buf[192] = {'\0'};
3971         char *buf;
3972         char *bp;
3973         u32 size_bp = size_buf;
3974         int ret;
3975         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3976
3977         buf = kzalloc(size_buf, GFP_KERNEL);
3978         if (!buf)
3979                 return;
3980
3981         bp = buf;
3982
3983 #define CHECK_APPEND_1ARG(a, v1)                                        \
3984         do {                                                            \
3985                 ret = snprintf(bp, size_bp, (a), (v1));                 \
3986                 if (ret < 0 || ret >= size_bp)                          \
3987                         goto out_overflow;                              \
3988                 size_bp -= ret;                                         \
3989                 bp += ret;                                              \
3990         } while (0)
3991
3992         if (bctl->flags & BTRFS_BALANCE_FORCE)
3993                 CHECK_APPEND_1ARG("%s", "-f ");
3994
3995         if (bctl->flags & BTRFS_BALANCE_DATA) {
3996                 describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
3997                 CHECK_APPEND_1ARG("-d%s ", tmp_buf);
3998         }
3999
4000         if (bctl->flags & BTRFS_BALANCE_METADATA) {
4001                 describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
4002                 CHECK_APPEND_1ARG("-m%s ", tmp_buf);
4003         }
4004
4005         if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
4006                 describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
4007                 CHECK_APPEND_1ARG("-s%s ", tmp_buf);
4008         }
4009
4010 #undef CHECK_APPEND_1ARG
4011
4012 out_overflow:
4013
4014         if (size_bp < size_buf)
4015                 buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
4016         btrfs_info(fs_info, "balance: %s %s",
4017                    (bctl->flags & BTRFS_BALANCE_RESUME) ?
4018                    "resume" : "start", buf);
4019
4020         kfree(buf);
4021 }
4022
4023 /*
4024  * Should be called with the balance mutex held
4025  */
4026 int btrfs_balance(struct btrfs_fs_info *fs_info,
4027                   struct btrfs_balance_control *bctl,
4028                   struct btrfs_ioctl_balance_args *bargs)
4029 {
4030         u64 meta_target, data_target;
4031         u64 allowed;
4032         int mixed = 0;
4033         int ret;
4034         u64 num_devices;
4035         unsigned seq;
4036         bool reducing_redundancy;
4037         int i;
4038
4039         if (btrfs_fs_closing(fs_info) ||
4040             atomic_read(&fs_info->balance_pause_req) ||
4041             btrfs_should_cancel_balance(fs_info)) {
4042                 ret = -EINVAL;
4043                 goto out;
4044         }
4045
4046         allowed = btrfs_super_incompat_flags(fs_info->super_copy);
4047         if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
4048                 mixed = 1;
4049
4050         /*
4051          * In case of mixed groups both data and meta should be picked,
4052          * and identical options should be given for both of them.
4053          */
4054         allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
4055         if (mixed && (bctl->flags & allowed)) {
4056                 if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
4057                     !(bctl->flags & BTRFS_BALANCE_METADATA) ||
4058                     memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
4059                         btrfs_err(fs_info,
4060           "balance: mixed groups data and metadata options must be the same");
4061                         ret = -EINVAL;
4062                         goto out;
4063                 }
4064         }
4065
4066         /*
4067          * rw_devices will not change at the moment, device add/delete/replace
4068          * are exclusive
4069          */
4070         num_devices = fs_info->fs_devices->rw_devices;
4071
4072         /*
4073          * SINGLE profile on-disk has no profile bit, but in-memory we have a
4074          * special bit for it, to make it easier to distinguish.  Thus we need
4075          * to set it manually, or balance would refuse the profile.
4076          */
4077         allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
4078         for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
4079                 if (num_devices >= btrfs_raid_array[i].devs_min)
4080                         allowed |= btrfs_raid_array[i].bg_flag;
4081
4082         if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") ||
4083             !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") ||
4084             !validate_convert_profile(fs_info, &bctl->sys,  allowed, "system")) {
4085                 ret = -EINVAL;
4086                 goto out;
4087         }
4088
4089         /*
4090          * Allow to reduce metadata or system integrity only if force set for
4091          * profiles with redundancy (copies, parity)
4092          */
4093         allowed = 0;
4094         for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
4095                 if (btrfs_raid_array[i].ncopies >= 2 ||
4096                     btrfs_raid_array[i].tolerated_failures >= 1)
4097                         allowed |= btrfs_raid_array[i].bg_flag;
4098         }
4099         do {
4100                 seq = read_seqbegin(&fs_info->profiles_lock);
4101
4102                 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4103                      (fs_info->avail_system_alloc_bits & allowed) &&
4104                      !(bctl->sys.target & allowed)) ||
4105                     ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4106                      (fs_info->avail_metadata_alloc_bits & allowed) &&
4107                      !(bctl->meta.target & allowed)))
4108                         reducing_redundancy = true;
4109                 else
4110                         reducing_redundancy = false;
4111
4112                 /* if we're not converting, the target field is uninitialized */
4113                 meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
4114                         bctl->meta.target : fs_info->avail_metadata_alloc_bits;
4115                 data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
4116                         bctl->data.target : fs_info->avail_data_alloc_bits;
4117         } while (read_seqretry(&fs_info->profiles_lock, seq));
4118
4119         if (reducing_redundancy) {
4120                 if (bctl->flags & BTRFS_BALANCE_FORCE) {
4121                         btrfs_info(fs_info,
4122                            "balance: force reducing metadata redundancy");
4123                 } else {
4124                         btrfs_err(fs_info,
4125         "balance: reduces metadata redundancy, use --force if you want this");
4126                         ret = -EINVAL;
4127                         goto out;
4128                 }
4129         }
4130
4131         if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
4132                 btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
4133                 btrfs_warn(fs_info,
4134         "balance: metadata profile %s has lower redundancy than data profile %s",
4135                                 btrfs_bg_type_to_raid_name(meta_target),
4136                                 btrfs_bg_type_to_raid_name(data_target));
4137         }
4138
4139         if (fs_info->send_in_progress) {
4140                 btrfs_warn_rl(fs_info,
4141 "cannot run balance while send operations are in progress (%d in progress)",
4142                               fs_info->send_in_progress);
4143                 ret = -EAGAIN;
4144                 goto out;
4145         }
4146
4147         ret = insert_balance_item(fs_info, bctl);
4148         if (ret && ret != -EEXIST)
4149                 goto out;
4150
4151         if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
4152                 BUG_ON(ret == -EEXIST);
4153                 BUG_ON(fs_info->balance_ctl);
4154                 spin_lock(&fs_info->balance_lock);
4155                 fs_info->balance_ctl = bctl;
4156                 spin_unlock(&fs_info->balance_lock);
4157         } else {
4158                 BUG_ON(ret != -EEXIST);
4159                 spin_lock(&fs_info->balance_lock);
4160                 update_balance_args(bctl);
4161                 spin_unlock(&fs_info->balance_lock);
4162         }
4163
4164         ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4165         set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
4166         describe_balance_start_or_resume(fs_info);
4167         mutex_unlock(&fs_info->balance_mutex);
4168
4169         ret = __btrfs_balance(fs_info);
4170
4171         mutex_lock(&fs_info->balance_mutex);
4172         if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req))
4173                 btrfs_info(fs_info, "balance: paused");
4174         /*
4175          * Balance can be canceled by:
4176          *
4177          * - Regular cancel request
4178          *   Then ret == -ECANCELED and balance_cancel_req > 0
4179          *
4180          * - Fatal signal to "btrfs" process
4181          *   Either the signal is caught by wait_reserve_ticket() and the
4182          *   callers get -EINTR, or it is caught by
4183          *   btrfs_should_cancel_balance() and they get -ECANCELED.
4184          *   Either way, in this case balance_cancel_req = 0, and
4185          *   ret == -EINTR or ret == -ECANCELED.
4186          *
4187          * So here we only check the return value to catch canceled balance.
4188          */
4189         else if (ret == -ECANCELED || ret == -EINTR)
4190                 btrfs_info(fs_info, "balance: canceled");
4191         else
4192                 btrfs_info(fs_info, "balance: ended with status: %d", ret);
4193
4194         clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
4195
4196         if (bargs) {
4197                 memset(bargs, 0, sizeof(*bargs));
4198                 btrfs_update_ioctl_balance_args(fs_info, bargs);
4199         }
4200
4201         if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
4202             balance_need_close(fs_info)) {
4203                 reset_balance_state(fs_info);
4204                 btrfs_exclop_finish(fs_info);
4205         }
4206
4207         wake_up(&fs_info->balance_wait_q);
4208
4209         return ret;
4210 out:
4211         if (bctl->flags & BTRFS_BALANCE_RESUME)
4212                 reset_balance_state(fs_info);
4213         else
4214                 kfree(bctl);
4215         btrfs_exclop_finish(fs_info);
4216
4217         return ret;
4218 }
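
/*
 * Illustrative note (not part of this file): btrfs_balance() above is
 * normally reached from user space through the BTRFS_IOC_BALANCE_V2 ioctl,
 * roughly like this (fs_fd is a hypothetical fd of any file on the fs):
 *
 *	struct btrfs_ioctl_balance_args args = { 0 };
 *
 *	args.flags = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
 *	ioctl(fs_fd, BTRFS_IOC_BALANCE_V2, &args);
 *
 * The ioctl handler builds a btrfs_balance_control from these args and
 * passes it to btrfs_balance().
 */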
4219
4220 static int balance_kthread(void *data)
4221 {
4222         struct btrfs_fs_info *fs_info = data;
4223         int ret = 0;
4224
4225         mutex_lock(&fs_info->balance_mutex);
4226         if (fs_info->balance_ctl)
4227                 ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
4228         mutex_unlock(&fs_info->balance_mutex);
4229
4230         return ret;
4231 }
4232
4233 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
4234 {
4235         struct task_struct *tsk;
4236
4237         mutex_lock(&fs_info->balance_mutex);
4238         if (!fs_info->balance_ctl) {
4239                 mutex_unlock(&fs_info->balance_mutex);
4240                 return 0;
4241         }
4242         mutex_unlock(&fs_info->balance_mutex);
4243
4244         if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
4245                 btrfs_info(fs_info, "balance: resume skipped");
4246                 return 0;
4247         }
4248
4249         /*
4250          * A ro->rw remount sequence should continue with the paused balance
4251          * regardless of who paused it (the system or, as of now, the user),
4252          * so set the resume flag.
4253          */
4254         spin_lock(&fs_info->balance_lock);
4255         fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
4256         spin_unlock(&fs_info->balance_lock);
4257
4258         tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
4259         return PTR_ERR_OR_ZERO(tsk);
4260 }
4261
4262 int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
4263 {
4264         struct btrfs_balance_control *bctl;
4265         struct btrfs_balance_item *item;
4266         struct btrfs_disk_balance_args disk_bargs;
4267         struct btrfs_path *path;
4268         struct extent_buffer *leaf;
4269         struct btrfs_key key;
4270         int ret;
4271
4272         path = btrfs_alloc_path();
4273         if (!path)
4274                 return -ENOMEM;
4275
4276         key.objectid = BTRFS_BALANCE_OBJECTID;
4277         key.type = BTRFS_TEMPORARY_ITEM_KEY;
4278         key.offset = 0;
4279
4280         ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
4281         if (ret < 0)
4282                 goto out;
4283         if (ret > 0) { /* ret = -ENOENT; */
4284                 ret = 0;
4285                 goto out;
4286         }
4287
4288         bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
4289         if (!bctl) {
4290                 ret = -ENOMEM;
4291                 goto out;
4292         }
4293
4294         leaf = path->nodes[0];
4295         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
4296
4297         bctl->flags = btrfs_balance_flags(leaf, item);
4298         bctl->flags |= BTRFS_BALANCE_RESUME;
4299
4300         btrfs_balance_data(leaf, item, &disk_bargs);
4301         btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
4302         btrfs_balance_meta(leaf, item, &disk_bargs);
4303         btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
4304         btrfs_balance_sys(leaf, item, &disk_bargs);
4305         btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
4306
4307         /*
4308          * This should never happen, as the paused balance state is recovered
4309          * during mount without any chance for other exclusive ops to collide.
4310          *
4311          * This gives the exclusive op status to balance and keeps it in a
4312          * paused state until user intervention (cancel or umount). If the
4313          * ownership cannot be assigned, show a message but do not fail. The
4314          * balance is in a paused state and must have fs_info::balance_ctl
4315          * properly set up.
4316          */
4317         if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
4318                 btrfs_warn(fs_info,
4319         "balance: cannot set exclusive op status, resume manually");
4320
4321         mutex_lock(&fs_info->balance_mutex);
4322         BUG_ON(fs_info->balance_ctl);
4323         spin_lock(&fs_info->balance_lock);
4324         fs_info->balance_ctl = bctl;
4325         spin_unlock(&fs_info->balance_lock);
4326         mutex_unlock(&fs_info->balance_mutex);
4327 out:
4328         btrfs_free_path(path);
4329         return ret;
4330 }
4331
4332 int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
4333 {
4334         int ret = 0;
4335
4336         mutex_lock(&fs_info->balance_mutex);
4337         if (!fs_info->balance_ctl) {
4338                 mutex_unlock(&fs_info->balance_mutex);
4339                 return -ENOTCONN;
4340         }
4341
4342         if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4343                 atomic_inc(&fs_info->balance_pause_req);
4344                 mutex_unlock(&fs_info->balance_mutex);
4345
4346                 wait_event(fs_info->balance_wait_q,
4347                            !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4348
4349                 mutex_lock(&fs_info->balance_mutex);
4350                 /* we are good with balance_ctl ripped off from under us */
4351                 BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4352                 atomic_dec(&fs_info->balance_pause_req);
4353         } else {
4354                 ret = -ENOTCONN;
4355         }
4356
4357         mutex_unlock(&fs_info->balance_mutex);
4358         return ret;
4359 }
4360
4361 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
4362 {
4363         mutex_lock(&fs_info->balance_mutex);
4364         if (!fs_info->balance_ctl) {
4365                 mutex_unlock(&fs_info->balance_mutex);
4366                 return -ENOTCONN;
4367         }
4368
4369         /*
4370          * A paused balance with the item stored on disk can be resumed at
4371          * mount time if the mount is read-write. Otherwise it's still paused
4372          * and we must not allow cancelling as it deletes the item.
4373          */
4374         if (sb_rdonly(fs_info->sb)) {
4375                 mutex_unlock(&fs_info->balance_mutex);
4376                 return -EROFS;
4377         }
4378
4379         atomic_inc(&fs_info->balance_cancel_req);
4380         /*
4381          * If balance is running, just wait and return; the balance item
4382          * is deleted in btrfs_balance() in this case.
4383          */
4384         if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4385                 mutex_unlock(&fs_info->balance_mutex);
4386                 wait_event(fs_info->balance_wait_q,
4387                            !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4388                 mutex_lock(&fs_info->balance_mutex);
4389         } else {
4390                 mutex_unlock(&fs_info->balance_mutex);
4391                 /*
4392                  * The lock was released to allow other waiters to continue;
4393                  * we'll reexamine the status after retaking it.
4394                  */
4395                 mutex_lock(&fs_info->balance_mutex);
4396
4397                 if (fs_info->balance_ctl) {
4398                         reset_balance_state(fs_info);
4399                         btrfs_exclop_finish(fs_info);
4400                         btrfs_info(fs_info, "balance: canceled");
4401                 }
4402         }
4403
4404         BUG_ON(fs_info->balance_ctl ||
4405                 test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4406         atomic_dec(&fs_info->balance_cancel_req);
4407         mutex_unlock(&fs_info->balance_mutex);
4408         return 0;
4409 }
4410
4411 int btrfs_uuid_scan_kthread(void *data)
4412 {
4413         struct btrfs_fs_info *fs_info = data;
4414         struct btrfs_root *root = fs_info->tree_root;
4415         struct btrfs_key key;
4416         struct btrfs_path *path = NULL;
4417         int ret = 0;
4418         struct extent_buffer *eb;
4419         int slot;
4420         struct btrfs_root_item root_item;
4421         u32 item_size;
4422         struct btrfs_trans_handle *trans = NULL;
4423         bool closing = false;
4424
4425         path = btrfs_alloc_path();
4426         if (!path) {
4427                 ret = -ENOMEM;
4428                 goto out;
4429         }
4430
4431         key.objectid = 0;
4432         key.type = BTRFS_ROOT_ITEM_KEY;
4433         key.offset = 0;
4434
4435         while (1) {
4436                 if (btrfs_fs_closing(fs_info)) {
4437                         closing = true;
4438                         break;
4439                 }
4440                 ret = btrfs_search_forward(root, &key, path,
4441                                 BTRFS_OLDEST_GENERATION);
4442                 if (ret) {
4443                         if (ret > 0)
4444                                 ret = 0;
4445                         break;
4446                 }
4447
4448                 if (key.type != BTRFS_ROOT_ITEM_KEY ||
4449                     (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
4450                      key.objectid != BTRFS_FS_TREE_OBJECTID) ||
4451                     key.objectid > BTRFS_LAST_FREE_OBJECTID)
4452                         goto skip;
4453
4454                 eb = path->nodes[0];
4455                 slot = path->slots[0];
4456                 item_size = btrfs_item_size_nr(eb, slot);
4457                 if (item_size < sizeof(root_item))
4458                         goto skip;
4459
4460                 read_extent_buffer(eb, &root_item,
4461                                    btrfs_item_ptr_offset(eb, slot),
4462                                    (int)sizeof(root_item));
4463                 if (btrfs_root_refs(&root_item) == 0)
4464                         goto skip;
4465
4466                 if (!btrfs_is_empty_uuid(root_item.uuid) ||
4467                     !btrfs_is_empty_uuid(root_item.received_uuid)) {
4468                         if (trans)
4469                                 goto update_tree;
4470
4471                         btrfs_release_path(path);
4472                         /*
4473                          * 1 - subvol uuid item
4474                          * 1 - received_subvol uuid item
4475                          */
4476                         trans = btrfs_start_transaction(fs_info->uuid_root, 2);
4477                         if (IS_ERR(trans)) {
4478                                 ret = PTR_ERR(trans);
4479                                 break;
4480                         }
4481                         continue;
4482                 } else {
4483                         goto skip;
4484                 }
4485 update_tree:
4486                 btrfs_release_path(path);
4487                 if (!btrfs_is_empty_uuid(root_item.uuid)) {
4488                         ret = btrfs_uuid_tree_add(trans, root_item.uuid,
4489                                                   BTRFS_UUID_KEY_SUBVOL,
4490                                                   key.objectid);
4491                         if (ret < 0) {
4492                                 btrfs_warn(fs_info, "uuid_tree_add failed %d",
4493                                         ret);
4494                                 break;
4495                         }
4496                 }
4497
4498                 if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
4499                         ret = btrfs_uuid_tree_add(trans,
4500                                                   root_item.received_uuid,
4501                                                  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4502                                                   key.objectid);
4503                         if (ret < 0) {
4504                                 btrfs_warn(fs_info, "uuid_tree_add failed %d",
4505                                         ret);
4506                                 break;
4507                         }
4508                 }
4509
4510 skip:
4511                 btrfs_release_path(path);
4512                 if (trans) {
4513                         ret = btrfs_end_transaction(trans);
4514                         trans = NULL;
4515                         if (ret)
4516                                 break;
4517                 }
4518
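                /*
                 * Advance the search cursor: keys are ordered by
                 * (objectid, type, offset), so bump the lowest component
                 * that still has room and reset the ones below it.
                 */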
4519                 if (key.offset < (u64)-1) {
4520                         key.offset++;
4521                 } else if (key.type < BTRFS_ROOT_ITEM_KEY) {
4522                         key.offset = 0;
4523                         key.type = BTRFS_ROOT_ITEM_KEY;
4524                 } else if (key.objectid < (u64)-1) {
4525                         key.offset = 0;
4526                         key.type = BTRFS_ROOT_ITEM_KEY;
4527                         key.objectid++;
4528                 } else {
4529                         break;
4530                 }
4531                 cond_resched();
4532         }
4533
4534 out:
4535         btrfs_free_path(path);
4536         if (trans && !IS_ERR(trans))
4537                 btrfs_end_transaction(trans);
4538         if (ret)
4539                 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
4540         else if (!closing)
4541                 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
4542         up(&fs_info->uuid_tree_rescan_sem);
4543         return 0;
4544 }
4545
4546 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
4547 {
4548         struct btrfs_trans_handle *trans;
4549         struct btrfs_root *tree_root = fs_info->tree_root;
4550         struct btrfs_root *uuid_root;
4551         struct task_struct *task;
4552         int ret;
4553
4554         /*
4555          * 1 - root node
4556          * 1 - root item
4557          */
4558         trans = btrfs_start_transaction(tree_root, 2);
4559         if (IS_ERR(trans))
4560                 return PTR_ERR(trans);
4561
4562         uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
4563         if (IS_ERR(uuid_root)) {
4564                 ret = PTR_ERR(uuid_root);
4565                 btrfs_abort_transaction(trans, ret);
4566                 btrfs_end_transaction(trans);
4567                 return ret;
4568         }
4569
4570         fs_info->uuid_root = uuid_root;
4571
4572         ret = btrfs_commit_transaction(trans);
4573         if (ret)
4574                 return ret;
4575
4576         down(&fs_info->uuid_tree_rescan_sem);
4577         task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
4578         if (IS_ERR(task)) {
4579                 /* BTRFS_FS_UPDATE_UUID_TREE_GEN remains unset in all error cases */
4580                 btrfs_warn(fs_info, "failed to start uuid_scan task");
4581                 up(&fs_info->uuid_tree_rescan_sem);
4582                 return PTR_ERR(task);
4583         }
4584
4585         return 0;
4586 }
4587
4588 /*
4589  * Shrinking a device means finding all of the device extents past
4590  * the new size, and then following the back refs to the chunks.
4591  * The chunk relocation code actually frees the device extents.
4592  */
4593 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
4594 {
4595         struct btrfs_fs_info *fs_info = device->fs_info;
4596         struct btrfs_root *root = fs_info->dev_root;
4597         struct btrfs_trans_handle *trans;
4598         struct btrfs_dev_extent *dev_extent = NULL;
4599         struct btrfs_path *path;
4600         u64 length;
4601         u64 chunk_offset;
4602         int ret;
4603         int slot;
4604         int failed = 0;
4605         bool retried = false;
4606         struct extent_buffer *l;
4607         struct btrfs_key key;
4608         struct btrfs_super_block *super_copy = fs_info->super_copy;
4609         u64 old_total = btrfs_super_total_bytes(super_copy);
4610         u64 old_size = btrfs_device_get_total_bytes(device);
4611         u64 diff;
4612         u64 start;
4613
4614         new_size = round_down(new_size, fs_info->sectorsize);
4615         start = new_size;
4616         diff = round_down(old_size - new_size, fs_info->sectorsize);
4617
4618         if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
4619                 return -EINVAL;
4620
4621         path = btrfs_alloc_path();
4622         if (!path)
4623                 return -ENOMEM;
4624
4625         path->reada = READA_BACK;
4626
4627         trans = btrfs_start_transaction(root, 0);
4628         if (IS_ERR(trans)) {
4629                 btrfs_free_path(path);
4630                 return PTR_ERR(trans);
4631         }
4632
4633         mutex_lock(&fs_info->chunk_mutex);
4634
4635         btrfs_device_set_total_bytes(device, new_size);
4636         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
4637                 device->fs_devices->total_rw_bytes -= diff;
4638                 atomic64_sub(diff, &fs_info->free_chunk_space);
4639         }
4640
4641         /*
4642          * Once the device's size has been set to the new size, ensure all
4643          * in-memory chunks are synced to disk so that the loop below sees them
4644          * and relocates them accordingly.
4645          */
4646         if (contains_pending_extent(device, &start, diff)) {
4647                 mutex_unlock(&fs_info->chunk_mutex);
4648                 ret = btrfs_commit_transaction(trans);
4649                 if (ret)
4650                         goto done;
4651         } else {
4652                 mutex_unlock(&fs_info->chunk_mutex);
4653                 btrfs_end_transaction(trans);
4654         }
4655
4656 again:
4657         key.objectid = device->devid;
4658         key.offset = (u64)-1;
4659         key.type = BTRFS_DEV_EXTENT_KEY;
4660
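        /*
         * Walk the device extents from the end of the device towards the
         * front: start the search at (devid, BTRFS_DEV_EXTENT_KEY, -1) and
         * step back one item on each iteration.
         */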
4661         do {
4662                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
4663                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4664                 if (ret < 0) {
4665                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4666                         goto done;
4667                 }
4668
4669                 ret = btrfs_previous_item(root, path, 0, key.type);
4670                 if (ret)
4671                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4672                 if (ret < 0)
4673                         goto done;
4674                 if (ret) {
4675                         ret = 0;
4676                         btrfs_release_path(path);
4677                         break;
4678                 }
4679
4680                 l = path->nodes[0];
4681                 slot = path->slots[0];
4682                 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
4683
4684                 if (key.objectid != device->devid) {
4685                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4686                         btrfs_release_path(path);
4687                         break;
4688                 }
4689
4690                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
4691                 length = btrfs_dev_extent_length(l, dev_extent);
4692
4693                 if (key.offset + length <= new_size) {
4694                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4695                         btrfs_release_path(path);
4696                         break;
4697                 }
4698
4699                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
4700                 btrfs_release_path(path);
4701
4702                 /*
4703                  * We may be relocating the only data chunk we have,
4704                  * which could potentially end up losing the data's
4705                  * raid profile, so let's allocate an empty one in
4706                  * advance.
4707                  */
4708                 ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
4709                 if (ret < 0) {
4710                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4711                         goto done;
4712                 }
4713
4714                 ret = btrfs_relocate_chunk(fs_info, chunk_offset);
4715                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4716                 if (ret == -ENOSPC) {
4717                         failed++;
4718                 } else if (ret) {
4719                         if (ret == -ETXTBSY) {
4720                                 btrfs_warn(fs_info,
4721                    "could not shrink block group %llu due to active swapfile",
4722                                            chunk_offset);
4723                         }
4724                         goto done;
4725                 }
4726         } while (key.offset-- > 0);
4727
4728         if (failed && !retried) {
4729                 failed = 0;
4730                 retried = true;
4731                 goto again;
4732         } else if (failed && retried) {
4733                 ret = -ENOSPC;
4734                 goto done;
4735         }
4736
4737         /* Shrinking succeeded, else we would be at "done". */
4738         trans = btrfs_start_transaction(root, 0);
4739         if (IS_ERR(trans)) {
4740                 ret = PTR_ERR(trans);
4741                 goto done;
4742         }
4743
4744         mutex_lock(&fs_info->chunk_mutex);
4745         /* Clear all state bits beyond the shrunk device size */
4746         clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
4747                           CHUNK_STATE_MASK);
4748
4749         btrfs_device_set_disk_total_bytes(device, new_size);
4750         if (list_empty(&device->post_commit_list))
4751                 list_add_tail(&device->post_commit_list,
4752                               &trans->transaction->dev_update_list);
4753
4754         WARN_ON(diff > old_total);
4755         btrfs_set_super_total_bytes(super_copy,
4756                         round_down(old_total - diff, fs_info->sectorsize));
4757         mutex_unlock(&fs_info->chunk_mutex);
4758
4759         /* Now btrfs_update_device() will change the on-disk size. */
4760         ret = btrfs_update_device(trans, device);
4761         if (ret < 0) {
4762                 btrfs_abort_transaction(trans, ret);
4763                 btrfs_end_transaction(trans);
4764         } else {
4765                 ret = btrfs_commit_transaction(trans);
4766         }
4767 done:
4768         btrfs_free_path(path);
4769         if (ret) {
4770                 mutex_lock(&fs_info->chunk_mutex);
4771                 btrfs_device_set_total_bytes(device, old_size);
4772                 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
4773                         device->fs_devices->total_rw_bytes += diff;
4774                 atomic64_add(diff, &fs_info->free_chunk_space);
4775                 mutex_unlock(&fs_info->chunk_mutex);
4776         }
4777         return ret;
4778 }
4779
4780 static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
4781                            struct btrfs_key *key,
4782                            struct btrfs_chunk *chunk, int item_size)
4783 {
4784         struct btrfs_super_block *super_copy = fs_info->super_copy;
4785         struct btrfs_disk_key disk_key;
4786         u32 array_size;
4787         u8 *ptr;
4788
4789         mutex_lock(&fs_info->chunk_mutex);
4790         array_size = btrfs_super_sys_array_size(super_copy);
4791         if (array_size + item_size + sizeof(disk_key)
4792                         > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
4793                 mutex_unlock(&fs_info->chunk_mutex);
4794                 return -EFBIG;
4795         }
4796
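        /*
         * sys_chunk_array is a packed sequence of (disk key, chunk item)
         * pairs; append the new pair at the current end of the array.
         */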
4797         ptr = super_copy->sys_chunk_array + array_size;
4798         btrfs_cpu_key_to_disk(&disk_key, key);
4799         memcpy(ptr, &disk_key, sizeof(disk_key));
4800         ptr += sizeof(disk_key);
4801         memcpy(ptr, chunk, item_size);
4802         item_size += sizeof(disk_key);
4803         btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
4804         mutex_unlock(&fs_info->chunk_mutex);
4805
4806         return 0;
4807 }
4808
4809 /*
4810  * sort the devices in descending order by max_avail, total_avail
4811  */
4812 static int btrfs_cmp_device_info(const void *a, const void *b)
4813 {
4814         const struct btrfs_device_info *di_a = a;
4815         const struct btrfs_device_info *di_b = b;
4816
4817         if (di_a->max_avail > di_b->max_avail)
4818                 return -1;
4819         if (di_a->max_avail < di_b->max_avail)
4820                 return 1;
4821         if (di_a->total_avail > di_b->total_avail)
4822                 return -1;
4823         if (di_a->total_avail < di_b->total_avail)
4824                 return 1;
4825         return 0;
4826 }
4827
4828 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
4829 {
4830         if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
4831                 return;
4832
4833         btrfs_set_fs_incompat(info, RAID56);
4834 }
4835
4836 static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
4837 {
4838         if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4)))
4839                 return;
4840
4841         btrfs_set_fs_incompat(info, RAID1C34);
4842 }
4843
4844 /*
4845  * Structure used internally for the btrfs_alloc_chunk() function.
4846  * Wraps needed parameters.
4847  */
4848 struct alloc_chunk_ctl {
4849         u64 start;
4850         u64 type;
4851         /* Total number of stripes to allocate */
4852         int num_stripes;
4853         /* sub_stripes info for map */
4854         int sub_stripes;
4855         /* Stripes per device */
4856         int dev_stripes;
4857         /* Maximum number of devices to use */
4858         int devs_max;
4859         /* Minimum number of devices to use */
4860         int devs_min;
4861         /* ndevs has to be a multiple of this */
4862         int devs_increment;
4863         /* Number of copies */
4864         int ncopies;
4865         /* Number of stripes worth of bytes to store parity information */
4866         int nparity;
4867         u64 max_stripe_size;
4868         u64 max_chunk_size;
4869         u64 dev_extent_min;
4870         u64 stripe_size;
4871         u64 chunk_size;
4872         int ndevs;
4873 };
4874
4875 static void init_alloc_chunk_ctl_policy_regular(
4876                                 struct btrfs_fs_devices *fs_devices,
4877                                 struct alloc_chunk_ctl *ctl)
4878 {
4879         u64 type = ctl->type;
4880
4881         if (type & BTRFS_BLOCK_GROUP_DATA) {
4882                 ctl->max_stripe_size = SZ_1G;
4883                 ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
4884         } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
4885                 /* For larger filesystems, use larger metadata chunks */
4886                 if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
4887                         ctl->max_stripe_size = SZ_1G;
4888                 else
4889                         ctl->max_stripe_size = SZ_256M;
4890                 ctl->max_chunk_size = ctl->max_stripe_size;
4891         } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
4892                 ctl->max_stripe_size = SZ_32M;
4893                 ctl->max_chunk_size = 2 * ctl->max_stripe_size;
4894                 ctl->devs_max = min_t(int, ctl->devs_max,
4895                                       BTRFS_MAX_DEVS_SYS_CHUNK);
4896         } else {
4897                 BUG();
4898         }
4899
4900         /* We don't want a chunk larger than 10% of writable space */
4901         ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
4902                                   ctl->max_chunk_size);
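        /*
         * Worked example (assuming 100GiB of writable space): a data chunk
         * is clamped to min(10GiB, BTRFS_MAX_DATA_CHUNK_SIZE), while a
         * metadata chunk (1GiB stripes on such a filesystem) is clamped to
         * min(10GiB, 1GiB) = 1GiB.
         */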
4903         ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
4904 }
4905
4906 static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
4907                                  struct alloc_chunk_ctl *ctl)
4908 {
4909         int index = btrfs_bg_flags_to_raid_index(ctl->type);
4910
4911         ctl->sub_stripes = btrfs_raid_array[index].sub_stripes;
4912         ctl->dev_stripes = btrfs_raid_array[index].dev_stripes;
4913         ctl->devs_max = btrfs_raid_array[index].devs_max;
4914         if (!ctl->devs_max)
4915                 ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info);
4916         ctl->devs_min = btrfs_raid_array[index].devs_min;
4917         ctl->devs_increment = btrfs_raid_array[index].devs_increment;
4918         ctl->ncopies = btrfs_raid_array[index].ncopies;
4919         ctl->nparity = btrfs_raid_array[index].nparity;
4920         ctl->ndevs = 0;
4921
4922         switch (fs_devices->chunk_alloc_policy) {
4923         case BTRFS_CHUNK_ALLOC_REGULAR:
4924                 init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
4925                 break;
4926         default:
4927                 BUG();
4928         }
4929 }
4930
4931 static int gather_device_info(struct btrfs_fs_devices *fs_devices,
4932                               struct alloc_chunk_ctl *ctl,
4933                               struct btrfs_device_info *devices_info)
4934 {
4935         struct btrfs_fs_info *info = fs_devices->fs_info;
4936         struct btrfs_device *device;
4937         u64 total_avail;
4938         u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
4939         int ret;
4940         int ndevs = 0;
4941         u64 max_avail;
4942         u64 dev_offset;
4943
4944         /*
4945          * in the first pass through the devices list, we gather information
4946          * about the available holes on each device.
4947          */
4948         list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
4949                 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
4950                         WARN(1, KERN_ERR
4951                                "BTRFS: read-only device in alloc_list\n");
4952                         continue;
4953                 }
4954
4955                 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
4956                                         &device->dev_state) ||
4957                     test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
4958                         continue;
4959
4960                 if (device->total_bytes > device->bytes_used)
4961                         total_avail = device->total_bytes - device->bytes_used;
4962                 else
4963                         total_avail = 0;
4964
4965                 /* If there is no space on this device, skip it. */
4966                 if (total_avail < ctl->dev_extent_min)
4967                         continue;
4968
4969                 ret = find_free_dev_extent(device, dev_extent_want, &dev_offset,
4970                                            &max_avail);
4971                 if (ret && ret != -ENOSPC)
4972                         return ret;
4973
4974                 if (ret == 0)
4975                         max_avail = dev_extent_want;
4976
4977                 if (max_avail < ctl->dev_extent_min) {
4978                         if (btrfs_test_opt(info, ENOSPC_DEBUG))
4979                                 btrfs_debug(info,
4980                         "%s: devid %llu has no free space, have=%llu want=%llu",
4981                                             __func__, device->devid, max_avail,
4982                                             ctl->dev_extent_min);
4983                         continue;
4984                 }
4985
4986                 if (ndevs == fs_devices->rw_devices) {
4987                         WARN(1, "%s: found more than %llu devices\n",
4988                              __func__, fs_devices->rw_devices);
4989                         break;
4990                 }
4991                 devices_info[ndevs].dev_offset = dev_offset;
4992                 devices_info[ndevs].max_avail = max_avail;
4993                 devices_info[ndevs].total_avail = total_avail;
4994                 devices_info[ndevs].dev = device;
4995                 ++ndevs;
4996         }
4997         ctl->ndevs = ndevs;
4998
4999         /*
5000          * now sort the devices by hole size / available space
5001          */
5002         sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
5003              btrfs_cmp_device_info, NULL);
5004
5005         return 0;
5006 }
5007
5008 static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
5009                                       struct btrfs_device_info *devices_info)
5010 {
5011         /* Number of stripes that count for block group size */
5012         int data_stripes;
5013
5014         /*
5015          * The primary goal is to maximize the number of stripes, so use as
5016          * many devices as possible, even if the stripes are not maximum sized.
5017          *
5018          * The DUP profile stores more than one stripe per device, and
5019          * max_avail is the total size, so we have to adjust.
5020          */
5021         ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
5022                                    ctl->dev_stripes);
5023         ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5024
5025         /* This will have to be fixed for RAID1 and RAID10 over more drives */
5026         data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
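        /*
         * E.g. RAID10 over 4 devices: num_stripes = 4, ncopies = 2,
         * nparity = 0 -> data_stripes = 2. RAID6 over 6 devices
         * (nparity = 2, ncopies = 1): data_stripes = 4.
         */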
5027
5028         /*
5029          * Use the number of data stripes to figure out how big this chunk is
5030          * really going to be in terms of logical address space, and compare
5031          * that answer with the max chunk size. If it's higher, we try to
5032          * reduce stripe_size.
5033          */
5034         if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
5035                 /*
5036                  * Reduce stripe_size, round it up to a 16MB boundary again and
5037                  * then use it, unless it ends up being even bigger than the
5038                  * previous value we had already.
5039                  */
5040                 ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
5041                                                         data_stripes), SZ_16M),
5042                                        ctl->stripe_size);
5043         }
5044
5045         /* Align to BTRFS_STRIPE_LEN */
5046         ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
5047         ctl->chunk_size = ctl->stripe_size * data_stripes;
5048
5049         return 0;
5050 }
5051
5052 static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
5053                               struct alloc_chunk_ctl *ctl,
5054                               struct btrfs_device_info *devices_info)
5055 {
5056         struct btrfs_fs_info *info = fs_devices->fs_info;
5057
5058         /*
5059          * Round down to the number of usable stripes. devs_increment can
5060          * be any number, so we can't use round_down(), which requires a
5061          * power of 2; rounddown() is safe here.
5062          */
5063         ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment);
5064
5065         if (ctl->ndevs < ctl->devs_min) {
5066                 if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
5067                         btrfs_debug(info,
5068         "%s: not enough devices with free space: have=%d minimum required=%d",
5069                                     __func__, ctl->ndevs, ctl->devs_min);
5070                 }
5071                 return -ENOSPC;
5072         }
5073
5074         ctl->ndevs = min(ctl->ndevs, ctl->devs_max);
5075
5076         switch (fs_devices->chunk_alloc_policy) {
5077         case BTRFS_CHUNK_ALLOC_REGULAR:
5078                 return decide_stripe_size_regular(ctl, devices_info);
5079         default:
5080                 BUG();
5081         }
5082 }
5083
5084 static int create_chunk(struct btrfs_trans_handle *trans,
5085                         struct alloc_chunk_ctl *ctl,
5086                         struct btrfs_device_info *devices_info)
5087 {
5088         struct btrfs_fs_info *info = trans->fs_info;
5089         struct map_lookup *map = NULL;
5090         struct extent_map_tree *em_tree;
5091         struct extent_map *em;
5092         u64 start = ctl->start;
5093         u64 type = ctl->type;
5094         int ret;
5095         int i;
5096         int j;
5097
5098         map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
5099         if (!map)
5100                 return -ENOMEM;
5101         map->num_stripes = ctl->num_stripes;
5102
5103         for (i = 0; i < ctl->ndevs; ++i) {
5104                 for (j = 0; j < ctl->dev_stripes; ++j) {
5105                         int s = i * ctl->dev_stripes + j;
5106                         map->stripes[s].dev = devices_info[i].dev;
5107                         map->stripes[s].physical = devices_info[i].dev_offset +
5108                                                    j * ctl->stripe_size;
5109                 }
5110         }
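        /*
         * E.g. for DUP (ndevs = 1, dev_stripes = 2) both stripes land on
         * the same device, stripe_size bytes apart.
         */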
5111         map->stripe_len = BTRFS_STRIPE_LEN;
5112         map->io_align = BTRFS_STRIPE_LEN;
5113         map->io_width = BTRFS_STRIPE_LEN;
5114         map->type = type;
5115         map->sub_stripes = ctl->sub_stripes;
5116
5117         trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);
5118
5119         em = alloc_extent_map();
5120         if (!em) {
5121                 kfree(map);
5122                 return -ENOMEM;
5123         }
5124         set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
5125         em->map_lookup = map;
5126         em->start = start;
5127         em->len = ctl->chunk_size;
5128         em->block_start = 0;
5129         em->block_len = em->len;
5130         em->orig_block_len = ctl->stripe_size;
5131
5132         em_tree = &info->mapping_tree;
5133         write_lock(&em_tree->lock);
5134         ret = add_extent_mapping(em_tree, em, 0);
5135         if (ret) {
5136                 write_unlock(&em_tree->lock);
5137                 free_extent_map(em);
5138                 return ret;
5139         }
5140         write_unlock(&em_tree->lock);
5141
5142         ret = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
5143         if (ret)
5144                 goto error_del_extent;
5145
5146         for (i = 0; i < map->num_stripes; i++) {
5147                 struct btrfs_device *dev = map->stripes[i].dev;
5148
5149                 btrfs_device_set_bytes_used(dev,
5150                                             dev->bytes_used + ctl->stripe_size);
5151                 if (list_empty(&dev->post_commit_list))
5152                         list_add_tail(&dev->post_commit_list,
5153                                       &trans->transaction->dev_update_list);
5154         }
5155
5156         atomic64_sub(ctl->stripe_size * map->num_stripes,
5157                      &info->free_chunk_space);
5158
5159         free_extent_map(em);
5160         check_raid56_incompat_flag(info, type);
5161         check_raid1c34_incompat_flag(info, type);
5162
5163         return 0;
5164
5165 error_del_extent:
5166         write_lock(&em_tree->lock);
5167         remove_extent_mapping(em_tree, em);
5168         write_unlock(&em_tree->lock);
5169
5170         /* One for our allocation */
5171         free_extent_map(em);
5172         /* One for the tree reference */
5173         free_extent_map(em);
5174
5175         return ret;
5176 }
5177
5178 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
5179 {
5180         struct btrfs_fs_info *info = trans->fs_info;
5181         struct btrfs_fs_devices *fs_devices = info->fs_devices;
5182         struct btrfs_device_info *devices_info = NULL;
5183         struct alloc_chunk_ctl ctl;
5184         int ret;
5185
5186         lockdep_assert_held(&info->chunk_mutex);
5187
5188         if (!alloc_profile_is_valid(type, 0)) {
5189                 ASSERT(0);
5190                 return -EINVAL;
5191         }
5192
5193         if (list_empty(&fs_devices->alloc_list)) {
5194                 if (btrfs_test_opt(info, ENOSPC_DEBUG))
5195                         btrfs_debug(info, "%s: no writable device", __func__);
5196                 return -ENOSPC;
5197         }
5198
5199         if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
5200                 btrfs_err(info, "invalid chunk type 0x%llx requested", type);
5201                 ASSERT(0);
5202                 return -EINVAL;
5203         }
5204
5205         ctl.start = find_next_chunk(info);
5206         ctl.type = type;
5207         init_alloc_chunk_ctl(fs_devices, &ctl);
5208
5209         devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
5210                                GFP_NOFS);
5211         if (!devices_info)
5212                 return -ENOMEM;
5213
5214         ret = gather_device_info(fs_devices, &ctl, devices_info);
5215         if (ret < 0)
5216                 goto out;
5217
5218         ret = decide_stripe_size(fs_devices, &ctl, devices_info);
5219         if (ret < 0)
5220                 goto out;
5221
5222         ret = create_chunk(trans, &ctl, devices_info);
5223
5224 out:
5225         kfree(devices_info);
5226         return ret;
5227 }
5228
5229 /*
5230  * Chunk allocation falls into two parts. The first part does the work
5231  * that makes the newly allocated chunk usable, but does not do any operation
5232  * that modifies the chunk tree. The second part does the work that
5233  * requires modifying the chunk tree. This division is important for the
5234  * bootstrap process of adding storage to a seed btrfs.
5235  */
5236 int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
5237                              u64 chunk_offset, u64 chunk_size)
5238 {
5239         struct btrfs_fs_info *fs_info = trans->fs_info;
5240         struct btrfs_root *extent_root = fs_info->extent_root;
5241         struct btrfs_root *chunk_root = fs_info->chunk_root;
5242         struct btrfs_key key;
5243         struct btrfs_device *device;
5244         struct btrfs_chunk *chunk;
5245         struct btrfs_stripe *stripe;
5246         struct extent_map *em;
5247         struct map_lookup *map;
5248         size_t item_size;
5249         u64 dev_offset;
5250         u64 stripe_size;
5251         int i = 0;
5252         int ret = 0;
5253
5254         em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
5255         if (IS_ERR(em))
5256                 return PTR_ERR(em);
5257
5258         map = em->map_lookup;
5259         item_size = btrfs_chunk_item_size(map->num_stripes);
5260         stripe_size = em->orig_block_len;
5261
5262         chunk = kzalloc(item_size, GFP_NOFS);
5263         if (!chunk) {
5264                 ret = -ENOMEM;
5265                 goto out;
5266         }
5267
5268         /*
5269          * Take the device list mutex to prevent races with the final phase of
5270          * a device replace operation that replaces the device object associated
5271          * with the map's stripes, because the device object's id can change
5272          * at any time during that final phase of the device replace operation
5273          * (dev-replace.c:btrfs_dev_replace_finishing()).
5274          */
5275         mutex_lock(&fs_info->fs_devices->device_list_mutex);
5276         for (i = 0; i < map->num_stripes; i++) {
5277                 device = map->stripes[i].dev;
5278                 dev_offset = map->stripes[i].physical;
5279
5280                 ret = btrfs_update_device(trans, device);
5281                 if (ret)
5282                         break;
5283                 ret = btrfs_alloc_dev_extent(trans, device, chunk_offset,
5284                                              dev_offset, stripe_size);
5285                 if (ret)
5286                         break;
5287         }
5288         if (ret) {
5289                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
5290                 goto out;
5291         }
5292
5293         stripe = &chunk->stripe;
5294         for (i = 0; i < map->num_stripes; i++) {
5295                 device = map->stripes[i].dev;
5296                 dev_offset = map->stripes[i].physical;
5297
5298                 btrfs_set_stack_stripe_devid(stripe, device->devid);
5299                 btrfs_set_stack_stripe_offset(stripe, dev_offset);
5300                 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
5301                 stripe++;
5302         }
5303         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
5304
5305         btrfs_set_stack_chunk_length(chunk, chunk_size);
5306         btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
5307         btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
5308         btrfs_set_stack_chunk_type(chunk, map->type);
5309         btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
5310         btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
5311         btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
5312         btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
5313         btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
5314
5315         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
5316         key.type = BTRFS_CHUNK_ITEM_KEY;
5317         key.offset = chunk_offset;
5318
5319         ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
5320         if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
5321                 /*
5322                  * TODO: Cleanup of inserted chunk root in case of
5323                  * failure.
5324                  */
5325                 ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
5326         }
5327
5328 out:
5329         kfree(chunk);
5330         free_extent_map(em);
5331         return ret;
5332 }
5333
5334 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
5335 {
5336         struct btrfs_fs_info *fs_info = trans->fs_info;
5337         u64 alloc_profile;
5338         int ret;
5339
5340         alloc_profile = btrfs_metadata_alloc_profile(fs_info);
5341         ret = btrfs_alloc_chunk(trans, alloc_profile);
5342         if (ret)
5343                 return ret;
5344
5345         alloc_profile = btrfs_system_alloc_profile(fs_info);
5346         ret = btrfs_alloc_chunk(trans, alloc_profile);
5347         return ret;
5348 }
5349
5350 static inline int btrfs_chunk_max_errors(struct map_lookup *map)
5351 {
5352         const int index = btrfs_bg_flags_to_raid_index(map->type);
5353
5354         return btrfs_raid_array[index].tolerated_failures;
5355 }
5356
5357 int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
5358 {
5359         struct extent_map *em;
5360         struct map_lookup *map;
5361         int readonly = 0;
5362         int miss_ndevs = 0;
5363         int i;
5364
5365         em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
5366         if (IS_ERR(em))
5367                 return 1;
5368
5369         map = em->map_lookup;
5370         for (i = 0; i < map->num_stripes; i++) {
5371                 if (test_bit(BTRFS_DEV_STATE_MISSING,
5372                                         &map->stripes[i].dev->dev_state)) {
5373                         miss_ndevs++;
5374                         continue;
5375                 }
5376                 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
5377                                         &map->stripes[i].dev->dev_state)) {
5378                         readonly = 1;
5379                         goto end;
5380                 }
5381         }
5382
5383         /*
5384          * If the number of missing devices is larger than max errors,
5385          * we cannot write the data into that chunk successfully, so
5386          * set it readonly.
5387          */
5388         if (miss_ndevs > btrfs_chunk_max_errors(map))
5389                 readonly = 1;
5390 end:
5391         free_extent_map(em);
5392         return readonly;
5393 }
5394
5395 void btrfs_mapping_tree_free(struct extent_map_tree *tree)
5396 {
5397         struct extent_map *em;
5398
5399         while (1) {
5400                 write_lock(&tree->lock);
5401                 em = lookup_extent_mapping(tree, 0, (u64)-1);
5402                 if (em)
5403                         remove_extent_mapping(tree, em);
5404                 write_unlock(&tree->lock);
5405                 if (!em)
5406                         break;
5407                 /* once for us */
5408                 free_extent_map(em);
5409                 /* once for the tree */
5410                 free_extent_map(em);
5411         }
5412 }
5413
5414 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5415 {
5416         struct extent_map *em;
5417         struct map_lookup *map;
5418         int ret;
5419
5420         em = btrfs_get_chunk_map(fs_info, logical, len);
5421         if (IS_ERR(em))
5422                 /*
5423                  * We could return errors for these cases, but that could get
5424                  * ugly and we'd probably end up doing the same thing anyway:
5425                  * nothing else, just exit. So return 1 so the callers don't
5426                  * try to use other copies.
5427                  */
5428                 return 1;
5429
5430         map = em->map_lookup;
5431         if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK))
5432                 ret = map->num_stripes;
5433         else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5434                 ret = map->sub_stripes;
5435         else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
5436                 ret = 2;
5437         else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
5438                 /*
5439                  * There could be two corrupted data stripes, so we need
5440                  * to retry in a loop in order to rebuild the correct data.
5441                  *
5442                  * Fail a stripe at a time on every retry except the
5443                  * stripe under reconstruction.
5444                  */
5445                 ret = map->num_stripes;
5446         else
5447                 ret = 1;
5448         free_extent_map(em);
5449
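        /*
         * An ongoing device replace duplicates all writes to the target
         * device, so it can serve as one additional copy to read from.
         */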
5450         down_read(&fs_info->dev_replace.rwsem);
5451         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
5452             fs_info->dev_replace.tgtdev)
5453                 ret++;
5454         up_read(&fs_info->dev_replace.rwsem);
5455
5456         return ret;
5457 }
5458
5459 unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
5460                                     u64 logical)
5461 {
5462         struct extent_map *em;
5463         struct map_lookup *map;
5464         unsigned long len = fs_info->sectorsize;
5465
5466         em = btrfs_get_chunk_map(fs_info, logical, len);
5467
5468         if (!WARN_ON(IS_ERR(em))) {
5469                 map = em->map_lookup;
5470                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5471                         len = map->stripe_len * nr_data_stripes(map);
5472                 free_extent_map(em);
5473         }
5474         return len;
5475 }
5476
5477 int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5478 {
5479         struct extent_map *em;
5480         struct map_lookup *map;
5481         int ret = 0;
5482
5483         em = btrfs_get_chunk_map(fs_info, logical, len);
5484
5485         if (!WARN_ON(IS_ERR(em))) {
5486                 map = em->map_lookup;
5487                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5488                         ret = 1;
5489                 free_extent_map(em);
5490         }
5491         return ret;
5492 }
5493
5494 static int find_live_mirror(struct btrfs_fs_info *fs_info,
5495                             struct map_lookup *map, int first,
5496                             int dev_replace_is_ongoing)
5497 {
5498         int i;
5499         int num_stripes;
5500         int preferred_mirror;
5501         int tolerance;
5502         struct btrfs_device *srcdev;
5503
5504         ASSERT((map->type &
5505                  (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));
5506
5507         if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5508                 num_stripes = map->sub_stripes;
5509         else
5510                 num_stripes = map->num_stripes;
5511
5512         switch (fs_info->fs_devices->read_policy) {
5513         default:
5514                 /* Shouldn't happen, just warn and use pid instead of failing */
5515                 btrfs_warn_rl(fs_info,
5516                               "unknown read_policy type %u, reset to pid",
5517                               fs_info->fs_devices->read_policy);
5518                 fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID;
5519                 fallthrough;
5520         case BTRFS_READ_POLICY_PID:
5521                 preferred_mirror = first + (current->pid % num_stripes);
5522                 break;
5523         }
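        /*
         * E.g. with the pid policy and two mirrors, odd pids read one copy
         * and even pids the other, spreading readers across the devices.
         */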
5524
5525         if (dev_replace_is_ongoing &&
5526             fs_info->dev_replace.cont_reading_from_srcdev_mode ==
5527              BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
5528                 srcdev = fs_info->dev_replace.srcdev;
5529         else
5530                 srcdev = NULL;
5531
5532         /*
5533          * Try to avoid the drive that is the source drive for a
5534          * dev-replace procedure; only choose it if no other non-missing
5535          * mirror is available.
5536          */
5537         for (tolerance = 0; tolerance < 2; tolerance++) {
5538                 if (map->stripes[preferred_mirror].dev->bdev &&
5539                     (tolerance || map->stripes[preferred_mirror].dev != srcdev))
5540                         return preferred_mirror;
5541                 for (i = first; i < first + num_stripes; i++) {
5542                         if (map->stripes[i].dev->bdev &&
5543                             (tolerance || map->stripes[i].dev != srcdev))
5544                                 return i;
5545                 }
5546         }
5547
5548         /* We couldn't find one that doesn't fail. Just return something
5549          * and the io error handling code will clean up eventually.
5550          */
5551         return preferred_mirror;
5552 }
5553
5554 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */
5555 static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
5556 {
5557         int i;
5558         int again = 1;
5559
5560         while (again) {
5561                 again = 0;
5562                 for (i = 0; i < num_stripes - 1; i++) {
5563                         /* Swap if parity is on a smaller index */
5564                         if (bbio->raid_map[i] > bbio->raid_map[i + 1]) {
5565                                 swap(bbio->stripes[i], bbio->stripes[i + 1]);
5566                                 swap(bbio->raid_map[i], bbio->raid_map[i + 1]);
5567                                 again = 1;
5568                         }
5569                 }
5570         }
5571 }
5572
5573 static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
5574 {
5575         struct btrfs_bio *bbio = kzalloc(
5576                  /* the size of the btrfs_bio */
5577                 sizeof(struct btrfs_bio) +
5578                 /* plus the variable array for the stripes */
5579                 sizeof(struct btrfs_bio_stripe) * (total_stripes) +
5580                 /* plus the variable array for the tgt dev */
5581                 sizeof(int) * (real_stripes) +
5582                 /*
5583                  * plus the raid_map, which includes both the tgt dev
5584                  * and the stripes
5585                  */
5586                 sizeof(u64) * (total_stripes),
5587                 GFP_NOFS|__GFP_NOFAIL);
5588
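        /*
         * The single allocation above is laid out as:
         *
         *   [struct btrfs_bio][stripes][tgtdev_map][raid_map]
         *
         * so carve tgtdev_map and raid_map out of the trailing space.
         */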
5589         atomic_set(&bbio->error, 0);
5590         refcount_set(&bbio->refs, 1);
5591
5592         bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes);
5593         bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes);
5594
5595         return bbio;
5596 }
5597
5598 void btrfs_get_bbio(struct btrfs_bio *bbio)
5599 {
5600         WARN_ON(!refcount_read(&bbio->refs));
5601         refcount_inc(&bbio->refs);
5602 }
5603
5604 void btrfs_put_bbio(struct btrfs_bio *bbio)
5605 {
5606         if (!bbio)
5607                 return;
5608         if (refcount_dec_and_test(&bbio->refs))
5609                 kfree(bbio);
5610 }
5611
5612 /* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
5613 /*
5614  * Please note that discard won't be sent to the target device of a
5615  * device replace.
5616  */
5617 static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
5618                                          u64 logical, u64 *length_ret,
5619                                          struct btrfs_bio **bbio_ret)
5620 {
5621         struct extent_map *em;
5622         struct map_lookup *map;
5623         struct btrfs_bio *bbio;
5624         u64 length = *length_ret;
5625         u64 offset;
5626         u64 stripe_nr;
5627         u64 stripe_nr_end;
5628         u64 stripe_end_offset;
5629         u64 stripe_cnt;
5630         u64 stripe_len;
5631         u64 stripe_offset;
5632         u64 num_stripes;
5633         u32 stripe_index;
5634         u32 factor = 0;
5635         u32 sub_stripes = 0;
5636         u64 stripes_per_dev = 0;
5637         u32 remaining_stripes = 0;
5638         u32 last_stripe = 0;
5639         int ret = 0;
5640         int i;
5641
5642         /* discard always returns a bbio */
5643         ASSERT(bbio_ret);
5644
5645         em = btrfs_get_chunk_map(fs_info, logical, length);
5646         if (IS_ERR(em))
5647                 return PTR_ERR(em);
5648
5649         map = em->map_lookup;
5650         /* we don't discard raid56 yet */
5651         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5652                 ret = -EOPNOTSUPP;
5653                 goto out;
5654         }
5655
5656         offset = logical - em->start;
5657         length = min_t(u64, em->start + em->len - logical, length);
5658         *length_ret = length;
5659
5660         stripe_len = map->stripe_len;
5661         /*
5662          * stripe_nr counts the total number of stripes we have to stride
5663          * to get to this block
5664          */
5665         stripe_nr = div64_u64(offset, stripe_len);
5666
5667         /* stripe_offset is the offset of this block in its stripe */
5668         stripe_offset = offset - stripe_nr * stripe_len;
5669
5670         stripe_nr_end = round_up(offset + length, map->stripe_len);
5671         stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
5672         stripe_cnt = stripe_nr_end - stripe_nr;
5673         stripe_end_offset = stripe_nr_end * map->stripe_len -
5674                             (offset + length);
5675         /*
5676          * after this, stripe_nr is the number of stripes on this
5677          * device we have to walk to find the data, and stripe_index is
5678          * the number of our device in the stripe array
5679          */
5680         num_stripes = 1;
5681         stripe_index = 0;
5682         if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
5683                          BTRFS_BLOCK_GROUP_RAID10)) {
5684                 if (map->type & BTRFS_BLOCK_GROUP_RAID0)
5685                         sub_stripes = 1;
5686                 else
5687                         sub_stripes = map->sub_stripes;
5688
5689                 factor = map->num_stripes / sub_stripes;
5690                 num_stripes = min_t(u64, map->num_stripes,
5691                                     sub_stripes * stripe_cnt);
5692                 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
5693                 stripe_index *= sub_stripes;
5694                 stripes_per_dev = div_u64_rem(stripe_cnt, factor,
5695                                               &remaining_stripes);
5696                 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
5697                 last_stripe *= sub_stripes;
5698         } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
5699                                 BTRFS_BLOCK_GROUP_DUP)) {
5700                 num_stripes = map->num_stripes;
5701         } else {
5702                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
5703                                         &stripe_index);
5704         }
5705
5706         bbio = alloc_btrfs_bio(num_stripes, 0);
5707         if (!bbio) {
5708                 ret = -ENOMEM;
5709                 goto out;
5710         }
5711
5712         for (i = 0; i < num_stripes; i++) {
5713                 bbio->stripes[i].physical =
5714                         map->stripes[stripe_index].physical +
5715                         stripe_offset + stripe_nr * map->stripe_len;
5716                 bbio->stripes[i].dev = map->stripes[stripe_index].dev;
5717
5718                 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
5719                                  BTRFS_BLOCK_GROUP_RAID10)) {
5720                         bbio->stripes[i].length = stripes_per_dev *
5721                                 map->stripe_len;
5722
5723                         if (i / sub_stripes < remaining_stripes)
5724                                 bbio->stripes[i].length +=
5725                                         map->stripe_len;
5726
5727                         /*
5728                          * Special for the first stripe and
5729                          * the last stripe:
5730                          *
5731                          * |-------|...|-------|
5732                          *     |----------|
5733                          *    off     end_off
5734                          */
5735                         if (i < sub_stripes)
5736                                 bbio->stripes[i].length -=
5737                                         stripe_offset;
5738
5739                         if (stripe_index >= last_stripe &&
5740                             stripe_index <= (last_stripe +
5741                                              sub_stripes - 1))
5742                                 bbio->stripes[i].length -=
5743                                         stripe_end_offset;
5744
5745                         if (i == sub_stripes - 1)
5746                                 stripe_offset = 0;
5747                 } else {
5748                         bbio->stripes[i].length = length;
5749                 }
5750
5751                 stripe_index++;
5752                 if (stripe_index == map->num_stripes) {
5753                         stripe_index = 0;
5754                         stripe_nr++;
5755                 }
5756         }
5757
5758         *bbio_ret = bbio;
5759         bbio->map_type = map->type;
5760         bbio->num_stripes = num_stripes;
5761 out:
5762         free_extent_map(em);
5763         return ret;
5764 }
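
/*
 * Worked example for the stripe math above (illustrative numbers):
 * RAID0 over 2 devices, stripe_len = 64K, discard at chunk offset 32K,
 * length 192K:
 *
 *	stripe_nr         = 32K / 64K                  = 0
 *	stripe_offset     = 32K - 0 * 64K              = 32K
 *	stripe_nr_end     = round_up(224K, 64K) / 64K  = 4
 *	stripe_cnt        = 4 - 0                      = 4
 *	stripe_end_offset = 4 * 64K - 224K             = 32K
 *
 * With factor = 2: stripes_per_dev = 2 and remaining_stripes = 0, so
 * each device starts from a 128K slice; the device holding the first
 * stripe is trimmed by stripe_offset and the one holding the last
 * stripe by stripe_end_offset, giving 96K + 96K = 192K discarded.
 */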
5765
5766 /*
5767  * In dev-replace case, for repair case (that's the only case where the mirror
5768  * is selected explicitly when calling btrfs_map_block), blocks left of the
5769  * left cursor can also be read from the target drive.
5770  *
5771  * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
5772  * array of stripes.
5773  * For READ, it also needs to be supported using the same mirror number.
5774  *
5775  * If the requested block is not left of the left cursor, EIO is returned. This
5776  * can happen because btrfs_num_copies() returns one more in the dev-replace
5777  * case.
5778  */
5779 static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
5780                                          u64 logical, u64 length,
5781                                          u64 srcdev_devid, int *mirror_num,
5782                                          u64 *physical)
5783 {
5784         struct btrfs_bio *bbio = NULL;
5785         int num_stripes;
5786         int index_srcdev = 0;
5787         int found = 0;
5788         u64 physical_of_found = 0;
5789         int i;
5790         int ret = 0;
5791
5792         ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
5793                                 logical, &length, &bbio, 0, 0);
5794         if (ret) {
5795                 ASSERT(bbio == NULL);
5796                 return ret;
5797         }
5798
5799         num_stripes = bbio->num_stripes;
5800         if (*mirror_num > num_stripes) {
5801                 /*
5802                  * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
5803                  * that means that the requested area is not left of the left
5804                  * which means the requested area is not left of the left
5805                  */
5806                 btrfs_put_bbio(bbio);
5807                 return -EIO;
5808         }
5809
5810         /*
5811          * process the rest of the function using the mirror_num of the source
5812          * drive. Therefore look it up first.  At the end, patch the device
5813          * pointer to the one of the target drive.
5814          */
5815         for (i = 0; i < num_stripes; i++) {
5816                 if (bbio->stripes[i].dev->devid != srcdev_devid)
5817                         continue;
5818
5819                 /*
5820                  * In case of DUP, in order to keep it simple, only add the
5821                  * mirror with the lowest physical address
5822                  */
5823                 if (found &&
5824                     physical_of_found <= bbio->stripes[i].physical)
5825                         continue;
5826
5827                 index_srcdev = i;
5828                 found = 1;
5829                 physical_of_found = bbio->stripes[i].physical;
5830         }
5831
5832         btrfs_put_bbio(bbio);
5833
5834         ASSERT(found);
5835         if (!found)
5836                 return -EIO;
5837
5838         *mirror_num = index_srcdev + 1;
5839         *physical = physical_of_found;
5840         return ret;
5841 }
5842
5843 static void handle_ops_on_dev_replace(enum btrfs_map_op op,
5844                                       struct btrfs_bio **bbio_ret,
5845                                       struct btrfs_dev_replace *dev_replace,
5846                                       int *num_stripes_ret, int *max_errors_ret)
5847 {
5848         struct btrfs_bio *bbio = *bbio_ret;
5849         u64 srcdev_devid = dev_replace->srcdev->devid;
5850         int tgtdev_indexes = 0;
5851         int num_stripes = *num_stripes_ret;
5852         int max_errors = *max_errors_ret;
5853         int i;
5854
5855         if (op == BTRFS_MAP_WRITE) {
5856                 int index_where_to_add;
5857
5858                 /*
5859                  * duplicate the write operations while the dev replace
5860                  * procedure is running. Since the copying of the old disk to
5861                  * the new disk takes place at run time while the filesystem is
5862                  * mounted writable, the regular write operations to the old
5863                  * disk have to be duplicated to go to the new disk as well.
5864                  *
5865                  * Note that device->missing is handled by the caller, and that
5866                  * the write to the old disk is already set up in the stripes
5867                  * array.
5868                  */
5869                 index_where_to_add = num_stripes;
5870                 for (i = 0; i < num_stripes; i++) {
5871                         if (bbio->stripes[i].dev->devid == srcdev_devid) {
5872                                 /* write to new disk, too */
5873                                 struct btrfs_bio_stripe *new =
5874                                         bbio->stripes + index_where_to_add;
5875                                 struct btrfs_bio_stripe *old =
5876                                         bbio->stripes + i;
5877
5878                                 new->physical = old->physical;
5879                                 new->length = old->length;
5880                                 new->dev = dev_replace->tgtdev;
5881                                 bbio->tgtdev_map[i] = index_where_to_add;
5882                                 index_where_to_add++;
5883                                 max_errors++;
5884                                 tgtdev_indexes++;
5885                         }
5886                 }
5887                 num_stripes = index_where_to_add;
5888         } else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
5889                 int index_srcdev = 0;
5890                 int found = 0;
5891                 u64 physical_of_found = 0;
5892
5893                 /*
5894                  * During the dev-replace procedure, the target drive can also
5895                  * be used to read data in case it is needed to repair a corrupt
5896                  * block elsewhere. This is possible if the requested area is
5897                  * left of the left cursor. In this area, the target drive is a
5898                  * full copy of the source drive.
5899                  */
5900                 for (i = 0; i < num_stripes; i++) {
5901                         if (bbio->stripes[i].dev->devid == srcdev_devid) {
5902                                 /*
5903                                  * In case of DUP, in order to keep it simple,
5904                                  * only add the mirror with the lowest physical
5905                                  * address
5906                                  */
5907                                 if (found &&
5908                                     physical_of_found <=
5909                                      bbio->stripes[i].physical)
5910                                         continue;
5911                                 index_srcdev = i;
5912                                 found = 1;
5913                                 physical_of_found = bbio->stripes[i].physical;
5914                         }
5915                 }
5916                 if (found) {
5917                         struct btrfs_bio_stripe *tgtdev_stripe =
5918                                 bbio->stripes + num_stripes;
5919
5920                         tgtdev_stripe->physical = physical_of_found;
5921                         tgtdev_stripe->length =
5922                                 bbio->stripes[index_srcdev].length;
5923                         tgtdev_stripe->dev = dev_replace->tgtdev;
5924                         bbio->tgtdev_map[index_srcdev] = num_stripes;
5925
5926                         tgtdev_indexes++;
5927                         num_stripes++;
5928                 }
5929         }
5930
5931         *num_stripes_ret = num_stripes;
5932         *max_errors_ret = max_errors;
5933         bbio->num_tgtdevs = tgtdev_indexes;
5934         *bbio_ret = bbio;
5935 }
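
/*
 * Example of the write duplication above (illustrative): a RAID1 write
 * with num_stripes = 2 whose stripe 0 lives on the replace source:
 *
 *	before:	stripes = { srcdev, other }		num_stripes = 2
 *	after:	stripes = { srcdev, other, tgtdev }	num_stripes = 3
 *		tgtdev_map[0] = 2, max_errors += 1
 *
 * so the data also reaches the partially copied target disk.
 */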
5936
5937 static bool need_full_stripe(enum btrfs_map_op op)
5938 {
5939         return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
5940 }
5941
5942 /*
5943  * btrfs_get_io_geometry - calculates the geometry of a particular (address, len)
5944  *                     tuple. This information is used to calculate how big a
5945  *                     particular bio can get before it straddles a stripe.
5946  *
5947  * @fs_info - the filesystem
5948  * @logical - address that we want to figure out the geometry of
5949  * @len     - the length of IO we are going to perform, starting at @logical
5950  * @op      - type of operation - write or read
5951  * @io_geom - pointer used to return values
5952  *
5953  * Returns < 0 in case a chunk for the given logical address cannot be found,
5954  * usually shouldn't happen unless @logical is corrupted, 0 otherwise.
5955  */
5956 int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
5957                         u64 logical, u64 len, struct btrfs_io_geometry *io_geom)
5958 {
5959         struct extent_map *em;
5960         struct map_lookup *map;
5961         u64 offset;
5962         u64 stripe_offset;
5963         u64 stripe_nr;
5964         u64 stripe_len;
5965         u64 raid56_full_stripe_start = (u64)-1;
5966         int data_stripes;
5967         int ret = 0;
5968
5969         ASSERT(op != BTRFS_MAP_DISCARD);
5970
5971         em = btrfs_get_chunk_map(fs_info, logical, len);
5972         if (IS_ERR(em))
5973                 return PTR_ERR(em);
5974
5975         map = em->map_lookup;
5976         /* Offset of this logical address in the chunk */
5977         offset = logical - em->start;
5978         /* Len of a stripe in a chunk */
5979         stripe_len = map->stripe_len;
5980         /* Stripe where this block falls in */
5981         stripe_nr = div64_u64(offset, stripe_len);
5982         /* Offset of stripe in the chunk */
5983         stripe_offset = stripe_nr * stripe_len;
5984         if (offset < stripe_offset) {
5985                 btrfs_crit(fs_info,
5986 "stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
5987                         stripe_offset, offset, em->start, logical, stripe_len);
5988                 ret = -EINVAL;
5989                 goto out;
5990         }
5991
5992         /* stripe_offset is the offset of this block in its stripe */
5993         stripe_offset = offset - stripe_offset;
5994         data_stripes = nr_data_stripes(map);
5995
5996         if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
5997                 u64 max_len = stripe_len - stripe_offset;
5998
5999                 /*
6000                  * In case of raid56, we need to know the stripe aligned start
6001                  */
6002                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
6003                         unsigned long full_stripe_len = stripe_len * data_stripes;
6004                         raid56_full_stripe_start = offset;
6005
6006                         /*
6007                          * Allow a write of a full stripe, but make sure we
6008                          * don't allow straddling of stripes
6009                          */
6010                         raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
6011                                         full_stripe_len);
6012                         raid56_full_stripe_start *= full_stripe_len;
6013
6014                         /*
6015                          * For writes to RAID[56], allow a full stripeset across
6016                          * all disks. For other RAID types and for RAID[56]
6017                          * reads, just allow a single stripe (on a single disk).
6018                          */
6019                         if (op == BTRFS_MAP_WRITE) {
6020                                 max_len = stripe_len * data_stripes -
6021                                           (offset - raid56_full_stripe_start);
6022                         }
6023                 }
6024                 len = min_t(u64, em->len - offset, max_len);
6025         } else {
6026                 len = em->len - offset;
6027         }
6028
6029         io_geom->len = len;
6030         io_geom->offset = offset;
6031         io_geom->stripe_len = stripe_len;
6032         io_geom->stripe_nr = stripe_nr;
6033         io_geom->stripe_offset = stripe_offset;
6034         io_geom->raid56_stripe_offset = raid56_full_stripe_start;
6035
6036 out:
6037         /* once for us */
6038         free_extent_map(em);
6039         return ret;
6040 }
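
/*
 * Worked example (illustrative numbers): a RAID0 chunk with
 * stripe_len = 64K, em->start = 1M, and a read at logical = 1M + 80K:
 *
 *	offset        = 80K
 *	stripe_nr     = 80K / 64K     = 1
 *	stripe_offset = 80K - 1 * 64K = 16K
 *	max_len       = 64K - 16K     = 48K
 *
 * so io_geom->len is clamped to 48K: a bigger bio would straddle into
 * the next stripe and has to be split by the caller.
 */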
6041
6042 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
6043                              enum btrfs_map_op op,
6044                              u64 logical, u64 *length,
6045                              struct btrfs_bio **bbio_ret,
6046                              int mirror_num, int need_raid_map)
6047 {
6048         struct extent_map *em;
6049         struct map_lookup *map;
6050         u64 stripe_offset;
6051         u64 stripe_nr;
6052         u64 stripe_len;
6053         u32 stripe_index;
6054         int data_stripes;
6055         int i;
6056         int ret = 0;
6057         int num_stripes;
6058         int max_errors = 0;
6059         int tgtdev_indexes = 0;
6060         struct btrfs_bio *bbio = NULL;
6061         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
6062         int dev_replace_is_ongoing = 0;
6063         int num_alloc_stripes;
6064         int patch_the_first_stripe_for_dev_replace = 0;
6065         u64 physical_to_patch_in_first_stripe = 0;
6066         u64 raid56_full_stripe_start = (u64)-1;
6067         struct btrfs_io_geometry geom;
6068
6069         ASSERT(bbio_ret);
6070         ASSERT(op != BTRFS_MAP_DISCARD);
6071
6072         ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom);
6073         if (ret < 0)
6074                 return ret;
6075
6076         em = btrfs_get_chunk_map(fs_info, logical, *length);
6077         ASSERT(!IS_ERR(em));
6078         map = em->map_lookup;
6079
6080         *length = geom.len;
6081         stripe_len = geom.stripe_len;
6082         stripe_nr = geom.stripe_nr;
6083         stripe_offset = geom.stripe_offset;
6084         raid56_full_stripe_start = geom.raid56_stripe_offset;
6085         data_stripes = nr_data_stripes(map);
6086
6087         down_read(&dev_replace->rwsem);
6088         dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
6089         /*
6090          * Hold the semaphore for read during the whole operation, write is
6091          * requested at commit time but must wait.
6092          */
6093         if (!dev_replace_is_ongoing)
6094                 up_read(&dev_replace->rwsem);
6095
6096         if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
6097             !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
6098                 ret = get_extra_mirror_from_replace(fs_info, logical, *length,
6099                                                     dev_replace->srcdev->devid,
6100                                                     &mirror_num,
6101                                             &physical_to_patch_in_first_stripe);
6102                 if (ret)
6103                         goto out;
6104                 else
6105                         patch_the_first_stripe_for_dev_replace = 1;
6106         } else if (mirror_num > map->num_stripes) {
6107                 mirror_num = 0;
6108         }
6109
6110         num_stripes = 1;
6111         stripe_index = 0;
6112         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
6113                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
6114                                 &stripe_index);
6115                 if (!need_full_stripe(op))
6116                         mirror_num = 1;
6117         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
6118                 if (need_full_stripe(op))
6119                         num_stripes = map->num_stripes;
6120                 else if (mirror_num)
6121                         stripe_index = mirror_num - 1;
6122                 else {
6123                         stripe_index = find_live_mirror(fs_info, map, 0,
6124                                             dev_replace_is_ongoing);
6125                         mirror_num = stripe_index + 1;
6126                 }
6127
6128         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
6129                 if (need_full_stripe(op)) {
6130                         num_stripes = map->num_stripes;
6131                 } else if (mirror_num) {
6132                         stripe_index = mirror_num - 1;
6133                 } else {
6134                         mirror_num = 1;
6135                 }
6136
6137         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
6138                 u32 factor = map->num_stripes / map->sub_stripes;
6139
6140                 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
6141                 stripe_index *= map->sub_stripes;
6142
6143                 if (need_full_stripe(op))
6144                         num_stripes = map->sub_stripes;
6145                 else if (mirror_num)
6146                         stripe_index += mirror_num - 1;
6147                 else {
6148                         int old_stripe_index = stripe_index;
6149                         stripe_index = find_live_mirror(fs_info, map,
6150                                               stripe_index,
6151                                               dev_replace_is_ongoing);
6152                         mirror_num = stripe_index - old_stripe_index + 1;
6153                 }
6154
6155         } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
6156                 if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
6157                         /* push stripe_nr back to the start of the full stripe */
6158                         stripe_nr = div64_u64(raid56_full_stripe_start,
6159                                         stripe_len * data_stripes);
6160
6161                         /* RAID[56] write or recovery. Return all stripes */
6162                         num_stripes = map->num_stripes;
6163                         max_errors = nr_parity_stripes(map);
6164
6165                         *length = map->stripe_len;
6166                         stripe_index = 0;
6167                         stripe_offset = 0;
6168                 } else {
6169                         /*
6170                          * Mirror #0 or #1 means the original data block.
6171                          * Mirror #2 is RAID5 parity block.
6172                          * Mirror #3 is RAID6 Q block.
6173                          */
6174                         stripe_nr = div_u64_rem(stripe_nr,
6175                                         data_stripes, &stripe_index);
6176                         if (mirror_num > 1)
6177                                 stripe_index = data_stripes + mirror_num - 2;
6178
6179                         /* We distribute the parity blocks across stripes */
6180                         div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
6181                                         &stripe_index);
6182                         if (!need_full_stripe(op) && mirror_num <= 1)
6183                                 mirror_num = 1;
6184                 }
6185         } else {
6186                 /*
6187                  * after this, stripe_nr is the number of stripes on this
6188                  * device we have to walk to find the data, and stripe_index is
6189                  * the number of our device in the stripe array
6190                  */
6191                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
6192                                 &stripe_index);
6193                 mirror_num = stripe_index + 1;
6194         }
6195         if (stripe_index >= map->num_stripes) {
6196                 btrfs_crit(fs_info,
6197                            "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
6198                            stripe_index, map->num_stripes);
6199                 ret = -EINVAL;
6200                 goto out;
6201         }
6202
6203         num_alloc_stripes = num_stripes;
6204         if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
6205                 if (op == BTRFS_MAP_WRITE)
6206                         num_alloc_stripes <<= 1;
6207                 if (op == BTRFS_MAP_GET_READ_MIRRORS)
6208                         num_alloc_stripes++;
6209                 tgtdev_indexes = num_stripes;
6210         }
6211
6212         bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
6213         if (!bbio) {
6214                 ret = -ENOMEM;
6215                 goto out;
6216         }
6217
6218         for (i = 0; i < num_stripes; i++) {
6219                 bbio->stripes[i].physical = map->stripes[stripe_index].physical +
6220                         stripe_offset + stripe_nr * map->stripe_len;
6221                 bbio->stripes[i].dev = map->stripes[stripe_index].dev;
6222                 stripe_index++;
6223         }
6224
6225         /* build raid_map */
6226         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
6227             (need_full_stripe(op) || mirror_num > 1)) {
6228                 u64 tmp;
6229                 unsigned rot;
6230
6231                 /* Work out the disk rotation on this stripe-set */
6232                 div_u64_rem(stripe_nr, num_stripes, &rot);
6233
6234                 /* Fill in the logical address of each stripe */
6235                 tmp = stripe_nr * data_stripes;
6236                 for (i = 0; i < data_stripes; i++)
6237                         bbio->raid_map[(i+rot) % num_stripes] =
6238                                 em->start + (tmp + i) * map->stripe_len;
6239
6240                 bbio->raid_map[(i+rot) % num_stripes] = RAID5_P_STRIPE;
6241                 if (map->type & BTRFS_BLOCK_GROUP_RAID6)
6242                         bbio->raid_map[(i+rot+1) % num_stripes] =
6243                                 RAID6_Q_STRIPE;
6244
6245                 sort_parity_stripes(bbio, num_stripes);
6246         }
6247
6248         if (need_full_stripe(op))
6249                 max_errors = btrfs_chunk_max_errors(map);
6250
6251         if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
6252             need_full_stripe(op)) {
6253                 handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes,
6254                                           &max_errors);
6255         }
6256
6257         *bbio_ret = bbio;
6258         bbio->map_type = map->type;
6259         bbio->num_stripes = num_stripes;
6260         bbio->max_errors = max_errors;
6261         bbio->mirror_num = mirror_num;
6262
6263         /*
6264          * this is the case that REQ_READ && dev_replace_is_ongoing &&
6265          * mirror_num == num_stripes + 1 && dev_replace target drive is
6266          * available as a mirror
6267          */
6268         if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
6269                 WARN_ON(num_stripes > 1);
6270                 bbio->stripes[0].dev = dev_replace->tgtdev;
6271                 bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
6272                 bbio->mirror_num = map->num_stripes + 1;
6273         }
6274 out:
6275         if (dev_replace_is_ongoing) {
6276                 lockdep_assert_held(&dev_replace->rwsem);
6277                 /* Unlock and let waiting writers proceed */
6278                 up_read(&dev_replace->rwsem);
6279         }
6280         free_extent_map(em);
6281         return ret;
6282 }
6283
6284 int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6285                       u64 logical, u64 *length,
6286                       struct btrfs_bio **bbio_ret, int mirror_num)
6287 {
6288         if (op == BTRFS_MAP_DISCARD)
6289                 return __btrfs_map_block_for_discard(fs_info, logical,
6290                                                      length, bbio_ret);
6291
6292         return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
6293                                  mirror_num, 0);
6294 }
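
/*
 * Minimal call sketch (illustrative, error handling trimmed; the use()
 * in the loop body is hypothetical):
 *
 *	u64 length = size_of_range;
 *	struct btrfs_bio *bbio = NULL;
 *	int i, ret;
 *
 *	ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, &length,
 *			      &bbio, 0);
 *	if (!ret) {
 *		for (i = 0; i < bbio->num_stripes; i++)
 *			use(bbio->stripes[i].dev, bbio->stripes[i].physical);
 *		btrfs_put_bbio(bbio);
 *	}
 *
 * On return, length is clamped to what the chunk map covers, so callers
 * advance logical and loop until the whole range has been mapped.
 */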
6295
6296 /* For Scrub/replace */
6297 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6298                      u64 logical, u64 *length,
6299                      struct btrfs_bio **bbio_ret)
6300 {
6301         return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
6302 }
6303
6304 static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
6305 {
6306         bio->bi_private = bbio->private;
6307         bio->bi_end_io = bbio->end_io;
6308         bio_endio(bio);
6309
6310         btrfs_put_bbio(bbio);
6311 }
6312
6313 static void btrfs_end_bio(struct bio *bio)
6314 {
6315         struct btrfs_bio *bbio = bio->bi_private;
6316         int is_orig_bio = 0;
6317
6318         if (bio->bi_status) {
6319                 atomic_inc(&bbio->error);
6320                 if (bio->bi_status == BLK_STS_IOERR ||
6321                     bio->bi_status == BLK_STS_TARGET) {
6322                         struct btrfs_device *dev = btrfs_io_bio(bio)->device;
6323
6324                         ASSERT(dev->bdev);
6325                         if (bio_op(bio) == REQ_OP_WRITE)
6326                                 btrfs_dev_stat_inc_and_print(dev,
6327                                                 BTRFS_DEV_STAT_WRITE_ERRS);
6328                         else if (!(bio->bi_opf & REQ_RAHEAD))
6329                                 btrfs_dev_stat_inc_and_print(dev,
6330                                                 BTRFS_DEV_STAT_READ_ERRS);
6331                         if (bio->bi_opf & REQ_PREFLUSH)
6332                                 btrfs_dev_stat_inc_and_print(dev,
6333                                                 BTRFS_DEV_STAT_FLUSH_ERRS);
6334                 }
6335         }
6336
6337         if (bio == bbio->orig_bio)
6338                 is_orig_bio = 1;
6339
6340         btrfs_bio_counter_dec(bbio->fs_info);
6341
6342         if (atomic_dec_and_test(&bbio->stripes_pending)) {
6343                 if (!is_orig_bio) {
6344                         bio_put(bio);
6345                         bio = bbio->orig_bio;
6346                 }
6347
6348                 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
6349                 /* only send an error to the higher layers if it is
6350                  * beyond the tolerance of the btrfs bio
6351                  */
6352                 if (atomic_read(&bbio->error) > bbio->max_errors) {
6353                         bio->bi_status = BLK_STS_IOERR;
6354                 } else {
6355                         /*
6356                          * this bio is actually up to date, we didn't
6357                          * go over the max number of errors
6358                          */
6359                         bio->bi_status = BLK_STS_OK;
6360                 }
6361
6362                 btrfs_end_bbio(bbio, bio);
6363         } else if (!is_orig_bio) {
6364                 bio_put(bio);
6365         }
6366 }
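
/*
 * Tolerance example (illustrative): a RAID1C3 write gets max_errors = 2
 * from btrfs_chunk_max_errors(). If two of the three stripe bios fail,
 * error == 2 is still within tolerance and the original bio completes
 * with BLK_STS_OK; a third failure tips it to BLK_STS_IOERR.
 */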
6367
6368 static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
6369                               u64 physical, struct btrfs_device *dev)
6370 {
6371         struct btrfs_fs_info *fs_info = bbio->fs_info;
6372
6373         bio->bi_private = bbio;
6374         btrfs_io_bio(bio)->device = dev;
6375         bio->bi_end_io = btrfs_end_bio;
6376         bio->bi_iter.bi_sector = physical >> 9;
6377         btrfs_debug_in_rcu(fs_info,
6378         "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
6379                 bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
6380                 (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
6381                 dev->devid, bio->bi_iter.bi_size);
6382         bio_set_dev(bio, dev->bdev);
6383
6384         btrfs_bio_counter_inc_noblocked(fs_info);
6385
6386         btrfsic_submit_bio(bio);
6387 }
6388
6389 static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
6390 {
6391         atomic_inc(&bbio->error);
6392         if (atomic_dec_and_test(&bbio->stripes_pending)) {
6393                 /* Should be the original bio. */
6394                 WARN_ON(bio != bbio->orig_bio);
6395
6396                 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
6397                 bio->bi_iter.bi_sector = logical >> 9;
6398                 if (atomic_read(&bbio->error) > bbio->max_errors)
6399                         bio->bi_status = BLK_STS_IOERR;
6400                 else
6401                         bio->bi_status = BLK_STS_OK;
6402                 btrfs_end_bbio(bbio, bio);
6403         }
6404 }
6405
6406 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
6407                            int mirror_num)
6408 {
6409         struct btrfs_device *dev;
6410         struct bio *first_bio = bio;
6411         u64 logical = bio->bi_iter.bi_sector << 9;
6412         u64 length = 0;
6413         u64 map_length;
6414         int ret;
6415         int dev_nr;
6416         int total_devs;
6417         struct btrfs_bio *bbio = NULL;
6418
6419         length = bio->bi_iter.bi_size;
6420         map_length = length;
6421
6422         btrfs_bio_counter_inc_blocked(fs_info);
6423         ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
6424                                 &map_length, &bbio, mirror_num, 1);
6425         if (ret) {
6426                 btrfs_bio_counter_dec(fs_info);
6427                 return errno_to_blk_status(ret);
6428         }
6429
6430         total_devs = bbio->num_stripes;
6431         bbio->orig_bio = first_bio;
6432         bbio->private = first_bio->bi_private;
6433         bbio->end_io = first_bio->bi_end_io;
6434         bbio->fs_info = fs_info;
6435         atomic_set(&bbio->stripes_pending, bbio->num_stripes);
6436
6437         if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
6438             ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) {
6439                 /* In this case, map_length has been set to the length of
6440                  * a single stripe, not the whole write */
6441                 if (bio_op(bio) == REQ_OP_WRITE) {
6442                         ret = raid56_parity_write(fs_info, bio, bbio,
6443                                                   map_length);
6444                 } else {
6445                         ret = raid56_parity_recover(fs_info, bio, bbio,
6446                                                     map_length, mirror_num, 1);
6447                 }
6448
6449                 btrfs_bio_counter_dec(fs_info);
6450                 return errno_to_blk_status(ret);
6451         }
6452
6453         if (map_length < length) {
6454                 btrfs_crit(fs_info,
6455                            "mapping failed logical %llu bio len %llu len %llu",
6456                            logical, length, map_length);
6457                 BUG();
6458         }
6459
6460         for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
6461                 dev = bbio->stripes[dev_nr].dev;
6462                 if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
6463                                                    &dev->dev_state) ||
6464                     (bio_op(first_bio) == REQ_OP_WRITE &&
6465                     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
6466                         bbio_error(bbio, first_bio, logical);
6467                         continue;
6468                 }
6469
6470                 if (dev_nr < total_devs - 1)
6471                         bio = btrfs_bio_clone(first_bio);
6472                 else
6473                         bio = first_bio;
6474
6475                 submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev);
6476         }
6477         btrfs_bio_counter_dec(fs_info);
6478         return BLK_STS_OK;
6479 }
6480
6481 /*
6482  * Find a device specified by @devid or @uuid in the list of @fs_devices, or
6483  * return NULL.
6484  *
6485  * If devid and uuid are both specified, the match must be exact, otherwise
6486  * only devid is used.
6487  *
6488  * The seed devices of @fs_devices are traversed as well.
6489  */
6490 struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
6491                                        u64 devid, u8 *uuid, u8 *fsid)
6492 {
6493         struct btrfs_device *device;
6494         struct btrfs_fs_devices *seed_devs;
6495
6496         if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
6497                 list_for_each_entry(device, &fs_devices->devices, dev_list) {
6498                         if (device->devid == devid &&
6499                             (!uuid || memcmp(device->uuid, uuid,
6500                                              BTRFS_UUID_SIZE) == 0))
6501                                 return device;
6502                 }
6503         }
6504
6505         list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
6506                 if (!fsid ||
6507                     !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
6508                         list_for_each_entry(device, &seed_devs->devices,
6509                                             dev_list) {
6510                                 if (device->devid == devid &&
6511                                     (!uuid || memcmp(device->uuid, uuid,
6512                                                      BTRFS_UUID_SIZE) == 0))
6513                                         return device;
6514                         }
6515                 }
6516         }
6517
6518         return NULL;
6519 }
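
/*
 * Example lookup (illustrative): find devid 3 anywhere in the
 * filesystem, seeds included, without constraining uuid or fsid:
 *
 *	device = btrfs_find_device(fs_info->fs_devices, 3, NULL, NULL);
 */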
6520
6521 static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
6522                                             u64 devid, u8 *dev_uuid)
6523 {
6524         struct btrfs_device *device;
6525         unsigned int nofs_flag;
6526
6527         /*
6528          * We call this under the chunk_mutex, so we want to use NOFS for this
6529          * allocation, however we don't want to change btrfs_alloc_device() to
6530          * always do NOFS because we use it in a lot of other GFP_KERNEL safe
6531          * places.
6532          */
6533         nofs_flag = memalloc_nofs_save();
6534         device = btrfs_alloc_device(NULL, &devid, dev_uuid);
6535         memalloc_nofs_restore(nofs_flag);
6536         if (IS_ERR(device))
6537                 return device;
6538
6539         list_add(&device->dev_list, &fs_devices->devices);
6540         device->fs_devices = fs_devices;
6541         fs_devices->num_devices++;
6542
6543         set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
6544         fs_devices->missing_devices++;
6545
6546         return device;
6547 }
6548
6549 /**
6550  * btrfs_alloc_device - allocate struct btrfs_device
6551  * @fs_info:    used only for generating a new devid, can be NULL if
6552  *              devid is provided (i.e. @devid != NULL).
6553  * @devid:      a pointer to devid for this device.  If NULL a new devid
6554  *              is generated.
6555  * @uuid:       a pointer to UUID for this device.  If NULL a new UUID
6556  *              is generated.
6557  *
6558  * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
6559  * on error.  Returned struct is not linked onto any lists and must be
6560  * destroyed with btrfs_free_device.
6561  */
6562 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
6563                                         const u64 *devid,
6564                                         const u8 *uuid)
6565 {
6566         struct btrfs_device *dev;
6567         u64 tmp;
6568
6569         if (WARN_ON(!devid && !fs_info))
6570                 return ERR_PTR(-EINVAL);
6571
6572         dev = __alloc_device(fs_info);
6573         if (IS_ERR(dev))
6574                 return dev;
6575
6576         if (devid)
6577                 tmp = *devid;
6578         else {
6579                 int ret;
6580
6581                 ret = find_next_devid(fs_info, &tmp);
6582                 if (ret) {
6583                         btrfs_free_device(dev);
6584                         return ERR_PTR(ret);
6585                 }
6586         }
6587         dev->devid = tmp;
6588
6589         if (uuid)
6590                 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
6591         else
6592                 generate_random_uuid(dev->uuid);
6593
6594         return dev;
6595 }
6596
6597 static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
6598                                         u64 devid, u8 *uuid, bool error)
6599 {
6600         if (error)
6601                 btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
6602                               devid, uuid);
6603         else
6604                 btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
6605                               devid, uuid);
6606 }
6607
6608 static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
6609 {
6610         int index = btrfs_bg_flags_to_raid_index(type);
6611         int ncopies = btrfs_raid_array[index].ncopies;
6612         const int nparity = btrfs_raid_array[index].nparity;
6613         int data_stripes;
6614
6615         if (nparity)
6616                 data_stripes = num_stripes - nparity;
6617         else
6618                 data_stripes = num_stripes / ncopies;
6619
6620         return div_u64(chunk_len, data_stripes);
6621 }
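
/*
 * Worked examples (illustrative): a 6G RAID6 chunk over 6 devices has
 * nparity = 2, so data_stripes = 6 - 2 = 4 and the per-device stripe
 * length is 6G / 4 = 1.5G. A 2G RAID1 chunk has ncopies = 2 and
 * nparity = 0, so data_stripes = 2 / 2 = 1 and the stripe length is the
 * full 2G.
 */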
6622
6623 static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
6624                           struct btrfs_chunk *chunk)
6625 {
6626         struct btrfs_fs_info *fs_info = leaf->fs_info;
6627         struct extent_map_tree *map_tree = &fs_info->mapping_tree;
6628         struct map_lookup *map;
6629         struct extent_map *em;
6630         u64 logical;
6631         u64 length;
6632         u64 devid;
6633         u8 uuid[BTRFS_UUID_SIZE];
6634         int num_stripes;
6635         int ret;
6636         int i;
6637
6638         logical = key->offset;
6639         length = btrfs_chunk_length(leaf, chunk);
6640         num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
6641
6642         /*
6643          * Only need to verify chunk item if we're reading from sys chunk array,
6644          * as chunk item in tree block is already verified by tree-checker.
6645          */
6646         if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
6647                 ret = btrfs_check_chunk_valid(leaf, chunk, logical);
6648                 if (ret)
6649                         return ret;
6650         }
6651
6652         read_lock(&map_tree->lock);
6653         em = lookup_extent_mapping(map_tree, logical, 1);
6654         read_unlock(&map_tree->lock);
6655
6656         /* already mapped? */
6657         if (em && em->start <= logical && em->start + em->len > logical) {
6658                 free_extent_map(em);
6659                 return 0;
6660         } else if (em) {
6661                 free_extent_map(em);
6662         }
6663
6664         em = alloc_extent_map();
6665         if (!em)
6666                 return -ENOMEM;
6667         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
6668         if (!map) {
6669                 free_extent_map(em);
6670                 return -ENOMEM;
6671         }
6672
6673         set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
6674         em->map_lookup = map;
6675         em->start = logical;
6676         em->len = length;
6677         em->orig_start = 0;
6678         em->block_start = 0;
6679         em->block_len = em->len;
6680
6681         map->num_stripes = num_stripes;
6682         map->io_width = btrfs_chunk_io_width(leaf, chunk);
6683         map->io_align = btrfs_chunk_io_align(leaf, chunk);
6684         map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
6685         map->type = btrfs_chunk_type(leaf, chunk);
6686         map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
6687         map->verified_stripes = 0;
6688         em->orig_block_len = calc_stripe_length(map->type, em->len,
6689                                                 map->num_stripes);
6690         for (i = 0; i < num_stripes; i++) {
6691                 map->stripes[i].physical =
6692                         btrfs_stripe_offset_nr(leaf, chunk, i);
6693                 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
6694                 read_extent_buffer(leaf, uuid, (unsigned long)
6695                                    btrfs_stripe_dev_uuid_nr(chunk, i),
6696                                    BTRFS_UUID_SIZE);
6697                 map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
6698                                                         devid, uuid, NULL);
6699                 if (!map->stripes[i].dev &&
6700                     !btrfs_test_opt(fs_info, DEGRADED)) {
6701                         free_extent_map(em);
6702                         btrfs_report_missing_device(fs_info, devid, uuid, true);
6703                         return -ENOENT;
6704                 }
6705                 if (!map->stripes[i].dev) {
6706                         map->stripes[i].dev =
6707                                 add_missing_dev(fs_info->fs_devices, devid,
6708                                                 uuid);
6709                         if (IS_ERR(map->stripes[i].dev)) {
6710                                 free_extent_map(em);
6711                                 btrfs_err(fs_info,
6712                                         "failed to init missing dev %llu: %ld",
6713                                         devid, PTR_ERR(map->stripes[i].dev));
6714                                 return PTR_ERR(map->stripes[i].dev);
6715                         }
6716                         btrfs_report_missing_device(fs_info, devid, uuid, false);
6717                 }
6718                 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
6719                                 &(map->stripes[i].dev->dev_state));
6720
6721         }
6722
6723         write_lock(&map_tree->lock);
6724         ret = add_extent_mapping(map_tree, em, 0);
6725         write_unlock(&map_tree->lock);
6726         if (ret < 0) {
6727                 btrfs_err(fs_info,
6728                           "failed to add chunk map, start=%llu len=%llu: %d",
6729                           em->start, em->len, ret);
6730         }
6731         free_extent_map(em);
6732
6733         return ret;
6734 }
6735
6736 static void fill_device_from_item(struct extent_buffer *leaf,
6737                                  struct btrfs_dev_item *dev_item,
6738                                  struct btrfs_device *device)
6739 {
6740         unsigned long ptr;
6741
6742         device->devid = btrfs_device_id(leaf, dev_item);
6743         device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
6744         device->total_bytes = device->disk_total_bytes;
6745         device->commit_total_bytes = device->disk_total_bytes;
6746         device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
6747         device->commit_bytes_used = device->bytes_used;
6748         device->type = btrfs_device_type(leaf, dev_item);
6749         device->io_align = btrfs_device_io_align(leaf, dev_item);
6750         device->io_width = btrfs_device_io_width(leaf, dev_item);
6751         device->sector_size = btrfs_device_sector_size(leaf, dev_item);
6752         WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
6753         clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
6754
6755         ptr = btrfs_device_uuid(dev_item);
6756         read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
6757 }
6758
6759 static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
6760                                                   u8 *fsid)
6761 {
6762         struct btrfs_fs_devices *fs_devices;
6763         int ret;
6764
6765         lockdep_assert_held(&uuid_mutex);
6766         ASSERT(fsid);
6767
6768         /* This will match only for multi-device seed fs */
6769         list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list)
6770                 if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
6771                         return fs_devices;
6772
6773
6774         fs_devices = find_fsid(fsid, NULL);
6775         if (!fs_devices) {
6776                 if (!btrfs_test_opt(fs_info, DEGRADED))
6777                         return ERR_PTR(-ENOENT);
6778
6779                 fs_devices = alloc_fs_devices(fsid, NULL);
6780                 if (IS_ERR(fs_devices))
6781                         return fs_devices;
6782
6783                 fs_devices->seeding = true;
6784                 fs_devices->opened = 1;
6785                 return fs_devices;
6786         }
6787
6788         /*
6789          * Upon first call for a seed fs fsid, just create a private copy of the
6790          * respective fs_devices and anchor it at fs_info->fs_devices->seed_list
6791          */
6792         fs_devices = clone_fs_devices(fs_devices);
6793         if (IS_ERR(fs_devices))
6794                 return fs_devices;
6795
6796         ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
6797         if (ret) {
6798                 free_fs_devices(fs_devices);
6799                 return ERR_PTR(ret);
6800         }
6801
6802         if (!fs_devices->seeding) {
6803                 close_fs_devices(fs_devices);
6804                 free_fs_devices(fs_devices);
6805                 return ERR_PTR(-EINVAL);
6806         }
6807
6808         list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);
6809
6810         return fs_devices;
6811 }
6812
6813 static int read_one_dev(struct extent_buffer *leaf,
6814                         struct btrfs_dev_item *dev_item)
6815 {
6816         struct btrfs_fs_info *fs_info = leaf->fs_info;
6817         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6818         struct btrfs_device *device;
6819         u64 devid;
6820         int ret;
6821         u8 fs_uuid[BTRFS_FSID_SIZE];
6822         u8 dev_uuid[BTRFS_UUID_SIZE];
6823
6824         devid = btrfs_device_id(leaf, dev_item);
6825         read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
6826                            BTRFS_UUID_SIZE);
6827         read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
6828                            BTRFS_FSID_SIZE);
6829
6830         if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
6831                 fs_devices = open_seed_devices(fs_info, fs_uuid);
6832                 if (IS_ERR(fs_devices))
6833                         return PTR_ERR(fs_devices);
6834         }
6835
6836         device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
6837                                    fs_uuid);
6838         if (!device) {
6839                 if (!btrfs_test_opt(fs_info, DEGRADED)) {
6840                         btrfs_report_missing_device(fs_info, devid,
6841                                                         dev_uuid, true);
6842                         return -ENOENT;
6843                 }
6844
6845                 device = add_missing_dev(fs_devices, devid, dev_uuid);
6846                 if (IS_ERR(device)) {
6847                         btrfs_err(fs_info,
6848                                 "failed to add missing dev %llu: %ld",
6849                                 devid, PTR_ERR(device));
6850                         return PTR_ERR(device);
6851                 }
6852                 btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
6853         } else {
6854                 if (!device->bdev) {
6855                         if (!btrfs_test_opt(fs_info, DEGRADED)) {
6856                                 btrfs_report_missing_device(fs_info,
6857                                                 devid, dev_uuid, true);
6858                                 return -ENOENT;
6859                         }
6860                         btrfs_report_missing_device(fs_info, devid,
6861                                                         dev_uuid, false);
6862                 }
6863
6864                 if (!device->bdev &&
6865                     !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
6866                         /*
6867                          * This happens when a device that was properly set
6868                          * up in the device info lists suddenly goes bad.
6869                          * device->bdev is NULL, and so we have to set
6870                          * the BTRFS_DEV_STATE_MISSING bit here
6871                          */
                        device->fs_devices->missing_devices++;
                        set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
                }

                /* Move the device to its own fs_devices */
                if (device->fs_devices != fs_devices) {
                        ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
                                                        &device->dev_state));

                        list_move(&device->dev_list, &fs_devices->devices);
                        device->fs_devices->num_devices--;
                        fs_devices->num_devices++;

                        device->fs_devices->missing_devices--;
                        fs_devices->missing_devices++;

                        device->fs_devices = fs_devices;
                }
        }

        if (device->fs_devices != fs_info->fs_devices) {
                BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
                if (device->generation !=
                    btrfs_device_generation(leaf, dev_item))
                        return -EINVAL;
        }

        fill_device_from_item(leaf, dev_item, device);
        if (device->bdev) {
                u64 max_total_bytes = i_size_read(device->bdev->bd_inode);

                if (device->total_bytes > max_total_bytes) {
                        btrfs_err(fs_info,
                        "device total_bytes should be at most %llu but found %llu",
                                  max_total_bytes, device->total_bytes);
                        return -EINVAL;
                }
        }
        set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
           !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
                device->fs_devices->total_rw_bytes += device->total_bytes;
                atomic64_add(device->total_bytes - device->bytes_used,
                                &fs_info->free_chunk_space);
        }
        ret = 0;
        return ret;
}

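/*
 * Illustrative sketch (not called anywhere; a hedged restatement of the
 * bound that read_one_dev() enforces above): the on-disk total_bytes
 * recorded for a device must never exceed the size of the backing block
 * device, as reported through its bd_inode.
 */
static inline bool device_total_bytes_sane(const struct btrfs_device *device)
{
        /* Devices without an opened bdev (missing/degraded) are not checked */
        return !device->bdev ||
               device->total_bytes <= (u64)i_size_read(device->bdev->bd_inode);
}
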
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
{
        struct btrfs_root *root = fs_info->tree_root;
        struct btrfs_super_block *super_copy = fs_info->super_copy;
        struct extent_buffer *sb;
        struct btrfs_disk_key *disk_key;
        struct btrfs_chunk *chunk;
        u8 *array_ptr;
        unsigned long sb_array_offset;
        int ret = 0;
        u32 num_stripes;
        u32 array_size;
        u32 len = 0;
        u32 cur_offset;
        u64 type;
        struct btrfs_key key;

        ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
        /*
         * This will create an extent buffer of nodesize; the superblock size
         * is fixed at BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this
         * will overallocate, but we can keep it as-is since only the first
         * page is used.
         */
        sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET,
                                          root->root_key.objectid, 0);
        if (IS_ERR(sb))
                return PTR_ERR(sb);
        set_extent_buffer_uptodate(sb);
        /*
         * The sb extent buffer is artificial and just used to read the
         * system array.  set_extent_buffer_uptodate() does not properly mark
         * all of its pages up-to-date when the page is larger: the extent
         * does not cover the whole page and consequently
         * check_page_uptodate() does not find all the page's extents
         * up-to-date (the hole beyond sb), and write_extent_buffer() then
         * triggers a WARN_ON().
         *
         * Regular short extents go through the
         * mark_extent_buffer_dirty()/writeback cycle, but sb spans only this
         * function.  Add an explicit SetPageUptodate() call to silence the
         * warning, e.g. on PowerPC 64.
         */
        if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
                SetPageUptodate(sb->pages[0]);

        write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
        array_size = btrfs_super_sys_array_size(super_copy);

        array_ptr = super_copy->sys_chunk_array;
        sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
        cur_offset = 0;

        while (cur_offset < array_size) {
                disk_key = (struct btrfs_disk_key *)array_ptr;
                len = sizeof(*disk_key);
                if (cur_offset + len > array_size)
                        goto out_short_read;

                btrfs_disk_key_to_cpu(&key, disk_key);

                array_ptr += len;
                sb_array_offset += len;
                cur_offset += len;

                if (key.type != BTRFS_CHUNK_ITEM_KEY) {
                        btrfs_err(fs_info,
                            "unexpected item type %u in sys_array at offset %u",
                                  (u32)key.type, cur_offset);
                        ret = -EIO;
                        break;
                }

                chunk = (struct btrfs_chunk *)sb_array_offset;
                /*
                 * At least one btrfs_chunk with one stripe must be present,
                 * exact stripe count check comes afterwards
                 */
                len = btrfs_chunk_item_size(1);
                if (cur_offset + len > array_size)
                        goto out_short_read;

                num_stripes = btrfs_chunk_num_stripes(sb, chunk);
                if (!num_stripes) {
                        btrfs_err(fs_info,
                        "invalid number of stripes %u in sys_array at offset %u",
                                  num_stripes, cur_offset);
                        ret = -EIO;
                        break;
                }

                type = btrfs_chunk_type(sb, chunk);
                if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
                        btrfs_err(fs_info,
                        "invalid chunk type %llu in sys_array at offset %u",
                                  type, cur_offset);
                        ret = -EIO;
                        break;
                }

                len = btrfs_chunk_item_size(num_stripes);
                if (cur_offset + len > array_size)
                        goto out_short_read;

                ret = read_one_chunk(&key, sb, chunk);
                if (ret)
                        break;

                array_ptr += len;
                sb_array_offset += len;
                cur_offset += len;
        }
        clear_extent_buffer_uptodate(sb);
        free_extent_buffer_stale(sb);
        return ret;

out_short_read:
        btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
                        len, cur_offset);
        clear_extent_buffer_uptodate(sb);
        free_extent_buffer_stale(sb);
        return -EIO;
}

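/*
 * Illustrative sketch, not used by the parser above: the size math the
 * sys_chunk_array walk relies on.  Each entry is a btrfs_disk_key followed
 * by a btrfs_chunk whose trailing stripe array holds num_stripes entries;
 * btrfs_chunk_item_size() already accounts for the first stripe being
 * embedded in struct btrfs_chunk.
 */
static inline unsigned long sys_array_entry_size(int num_stripes)
{
        return sizeof(struct btrfs_disk_key) +
               btrfs_chunk_item_size(num_stripes);
}
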
/*
 * Check if all chunks in the fs are OK for read-write degraded mount
 *
 * If the @failing_dev is specified, it's accounted as missing.
 *
 * Return true if all chunks meet the minimal RW mount requirements.
 * Return false if any chunk doesn't meet the minimal RW mount requirements.
 */
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
                                        struct btrfs_device *failing_dev)
{
        struct extent_map_tree *map_tree = &fs_info->mapping_tree;
        struct extent_map *em;
        u64 next_start = 0;
        bool ret = true;

        read_lock(&map_tree->lock);
        em = lookup_extent_mapping(map_tree, 0, (u64)-1);
        read_unlock(&map_tree->lock);
        /* No chunk at all? Return false anyway */
        if (!em) {
                ret = false;
                goto out;
        }
        while (em) {
                struct map_lookup *map;
                int missing = 0;
                int max_tolerated;
                int i;

                map = em->map_lookup;
                max_tolerated =
                        btrfs_get_num_tolerated_disk_barrier_failures(
                                        map->type);
                for (i = 0; i < map->num_stripes; i++) {
                        struct btrfs_device *dev = map->stripes[i].dev;

                        if (!dev || !dev->bdev ||
                            test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
                            dev->last_flush_error)
                                missing++;
                        else if (failing_dev && failing_dev == dev)
                                missing++;
                }
                if (missing > max_tolerated) {
                        if (!failing_dev)
                                btrfs_warn(fs_info,
        "chunk %llu missing %d devices, max tolerance is %d for writable mount",
                                   em->start, missing, max_tolerated);
                        free_extent_map(em);
                        ret = false;
                        goto out;
                }
                next_start = extent_map_end(em);
                free_extent_map(em);

                read_lock(&map_tree->lock);
                em = lookup_extent_mapping(map_tree, next_start,
                                           (u64)(-1) - next_start);
                read_unlock(&map_tree->lock);
        }
out:
        return ret;
}

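/*
 * Illustrative sketch, assuming the btrfs_raid_array table at the top of
 * this file: the per-profile tolerance consulted by
 * btrfs_check_rw_degradable() above.  E.g. RAID1 and RAID10 tolerate one
 * missing device, RAID1C3 two and RAID1C4 three, while SINGLE and RAID0
 * tolerate none.
 */
static inline int raid_index_tolerated_failures(enum btrfs_raid_types index)
{
        return btrfs_raid_array[index].tolerated_failures;
}
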
static void readahead_tree_node_children(struct extent_buffer *node)
{
        int i;
        const int nr_items = btrfs_header_nritems(node);

        for (i = 0; i < nr_items; i++)
                btrfs_readahead_node_child(node, i);
}

int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
{
        struct btrfs_root *root = fs_info->chunk_root;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        struct btrfs_key found_key;
        int ret;
        int slot;
        u64 total_dev = 0;
        u64 last_ra_node = 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /*
         * uuid_mutex is needed only when we are mounting a sprout FS;
         * otherwise it is not required.
         */
        mutex_lock(&uuid_mutex);

        /*
         * It is possible for mount and umount to race in such a way that
         * we execute this code path, but open_fs_devices failed to clear
         * total_rw_bytes. We certainly want it cleared before reading the
         * device items, so clear it here.
         */
        fs_info->fs_devices->total_rw_bytes = 0;

        /*
         * Read all device items, and then all the chunk items. All
         * device items are found before any chunk item (their object id
         * is smaller than the lowest possible object id for a chunk
         * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
         */
        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.offset = 0;
        key.type = 0;
        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto error;
        while (1) {
                struct extent_buffer *node;

                leaf = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(leaf)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto error;
                        break;
                }
                /*
                 * The nodes on level 1 are not locked, but we don't need
                 * locking during mount as nothing else can access the tree.
                 */
                node = path->nodes[1];
                if (node) {
                        if (last_ra_node != node->start) {
                                readahead_tree_node_children(node);
                                last_ra_node = node->start;
                        }
                }
                btrfs_item_key_to_cpu(leaf, &found_key, slot);
                if (found_key.type == BTRFS_DEV_ITEM_KEY) {
                        struct btrfs_dev_item *dev_item;
                        dev_item = btrfs_item_ptr(leaf, slot,
                                                  struct btrfs_dev_item);
                        ret = read_one_dev(leaf, dev_item);
                        if (ret)
                                goto error;
                        total_dev++;
                } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
                        struct btrfs_chunk *chunk;
                        chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
                        mutex_lock(&fs_info->chunk_mutex);
                        ret = read_one_chunk(&found_key, leaf, chunk);
                        mutex_unlock(&fs_info->chunk_mutex);
                        if (ret)
                                goto error;
                }
                path->slots[0]++;
        }

        /*
         * After loading the chunk tree we have all device information, so do
         * another round of validation checks.
         */
        if (total_dev != fs_info->fs_devices->total_devices) {
                btrfs_err(fs_info,
           "super_num_devices %llu mismatch with num_devices %llu found here",
                          btrfs_super_num_devices(fs_info->super_copy),
                          total_dev);
                ret = -EINVAL;
                goto error;
        }
        if (btrfs_super_total_bytes(fs_info->super_copy) <
            fs_info->fs_devices->total_rw_bytes) {
                btrfs_err(fs_info,
        "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
                          btrfs_super_total_bytes(fs_info->super_copy),
                          fs_info->fs_devices->total_rw_bytes);
                ret = -EINVAL;
                goto error;
        }
        ret = 0;
error:
        mutex_unlock(&uuid_mutex);

        btrfs_free_path(path);
        return ret;
}

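/*
 * Compile-time restatement of the ordering argument used in
 * btrfs_read_chunk_tree() above (an illustrative guard, not something the
 * function needs): tree keys sort by objectid first, and all device items
 * share objectid BTRFS_DEV_ITEMS_OBJECTID, which is below the lowest chunk
 * objectid, so a single forward scan sees every device item before any
 * chunk item.
 */
static_assert(BTRFS_DEV_ITEMS_OBJECTID < BTRFS_FIRST_CHUNK_TREE_OBJECTID);
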
void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
{
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
        struct btrfs_device *device;

        fs_devices->fs_info = fs_info;

        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry(device, &fs_devices->devices, dev_list)
                device->fs_info = fs_info;

        list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
                list_for_each_entry(device, &seed_devs->devices, dev_list)
                        device->fs_info = fs_info;

                seed_devs->fs_info = fs_info;
        }
        mutex_unlock(&fs_devices->device_list_mutex);
}

static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
                                 const struct btrfs_dev_stats_item *ptr,
                                 int index)
{
        u64 val;

        read_extent_buffer(eb, &val,
                           offsetof(struct btrfs_dev_stats_item, values) +
                            ((unsigned long)ptr) + (index * sizeof(u64)),
                           sizeof(val));
        return val;
}

static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
                                      struct btrfs_dev_stats_item *ptr,
                                      int index, u64 val)
{
        write_extent_buffer(eb, &val,
                            offsetof(struct btrfs_dev_stats_item, values) +
                             ((unsigned long)ptr) + (index * sizeof(u64)),
                            sizeof(val));
}

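/*
 * Illustrative sketch of the offset arithmetic in the two accessors above:
 * 'ptr' is itself a byte offset into the extent buffer (btrfs_item_ptr()
 * returns offsets cast to pointers), so slot 'index' of the values[] array
 * lives at the item start plus the offset of values[] plus the slot offset.
 */
static inline unsigned long dev_stats_value_offset(
                const struct btrfs_dev_stats_item *ptr, int index)
{
        return (unsigned long)ptr +
               offsetof(struct btrfs_dev_stats_item, values) +
               index * sizeof(u64);
}
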
static int btrfs_device_init_dev_stats(struct btrfs_device *device,
                                       struct btrfs_path *path)
{
        struct btrfs_dev_stats_item *ptr;
        struct extent_buffer *eb;
        struct btrfs_key key;
        int item_size;
        int i, ret, slot;

        key.objectid = BTRFS_DEV_STATS_OBJECTID;
        key.type = BTRFS_PERSISTENT_ITEM_KEY;
        key.offset = device->devid;
        ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
        if (ret) {
                for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
                        btrfs_dev_stat_set(device, i, 0);
                device->dev_stats_valid = 1;
                btrfs_release_path(path);
                return ret < 0 ? ret : 0;
        }
        slot = path->slots[0];
        eb = path->nodes[0];
        item_size = btrfs_item_size_nr(eb, slot);

        ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);

        for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
                if (item_size >= (1 + i) * sizeof(__le64))
                        btrfs_dev_stat_set(device, i,
                                           btrfs_dev_stats_value(eb, ptr, i));
                else
                        btrfs_dev_stat_set(device, i, 0);
        }

        device->dev_stats_valid = 1;
        btrfs_dev_stat_print_on_load(device);
        btrfs_release_path(path);

        return 0;
}

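/*
 * Illustrative sketch of the compatibility rule applied above: an older
 * kernel may have persisted a shorter dev_stats item, so counter slot i is
 * only read when the item is large enough to contain slots 0..i; anything
 * beyond the item's end is treated as zero.
 */
static inline bool dev_stats_item_has_slot(int item_size, int i)
{
        return item_size >= (i + 1) * (int)sizeof(__le64);
}
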
int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
{
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
        struct btrfs_device *device;
        struct btrfs_path *path = NULL;
        int ret = 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                ret = btrfs_device_init_dev_stats(device, path);
                if (ret)
                        goto out;
        }
        list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
                list_for_each_entry(device, &seed_devs->devices, dev_list) {
                        ret = btrfs_device_init_dev_stats(device, path);
                        if (ret)
                                goto out;
                }
        }
out:
        mutex_unlock(&fs_devices->device_list_mutex);

        btrfs_free_path(path);
        return ret;
}

static int update_dev_stat_item(struct btrfs_trans_handle *trans,
                                struct btrfs_device *device)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *dev_root = fs_info->dev_root;
        struct btrfs_path *path;
        struct btrfs_key key;
        struct extent_buffer *eb;
        struct btrfs_dev_stats_item *ptr;
        int ret;
        int i;

        key.objectid = BTRFS_DEV_STATS_OBJECTID;
        key.type = BTRFS_PERSISTENT_ITEM_KEY;
        key.offset = device->devid;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
        if (ret < 0) {
                btrfs_warn_in_rcu(fs_info,
                        "error %d while searching for dev_stats item for device %s",
                              ret, rcu_str_deref(device->name));
                goto out;
        }

        if (ret == 0 &&
            btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
                /* need to delete old one and insert a new one */
                ret = btrfs_del_item(trans, dev_root, path);
                if (ret != 0) {
                        btrfs_warn_in_rcu(fs_info,
                                "delete too small dev_stats item for device %s failed %d",
                                      rcu_str_deref(device->name), ret);
                        goto out;
                }
                ret = 1;
        }

        if (ret == 1) {
                /* need to insert a new item */
                btrfs_release_path(path);
                ret = btrfs_insert_empty_item(trans, dev_root, path,
                                              &key, sizeof(*ptr));
                if (ret < 0) {
                        btrfs_warn_in_rcu(fs_info,
                                "insert dev_stats item for device %s failed %d",
                                rcu_str_deref(device->name), ret);
                        goto out;
                }
        }

        eb = path->nodes[0];
        ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
        for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
                btrfs_set_dev_stats_value(eb, ptr, i,
                                          btrfs_dev_stat_read(device, i));
        btrfs_mark_buffer_dirty(eb);

out:
        btrfs_free_path(path);
        return ret;
}

/*
 * Called from commit_transaction().  Writes all changed device stats to
 * disk.
 */
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        struct btrfs_device *device;
        int stats_cnt;
        int ret = 0;

        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                stats_cnt = atomic_read(&device->dev_stats_ccnt);
                if (!device->dev_stats_valid || stats_cnt == 0)
                        continue;

                /*
                 * There is a LOAD-LOAD control dependency between the value of
                 * dev_stats_ccnt and updating the on-disk values, which
                 * requires reading the in-memory counters. Such control
                 * dependencies require explicit read memory barriers.
                 *
                 * This memory barrier pairs with smp_mb__before_atomic() in
                 * btrfs_dev_stat_inc()/btrfs_dev_stat_set() and with the full
                 * barrier implied by atomic_xchg() in
                 * btrfs_dev_stats_read_and_reset().
                 */
                smp_rmb();

                ret = update_dev_stat_item(trans, device);
                if (!ret)
                        atomic_sub(stats_cnt, &device->dev_stats_ccnt);
        }
        mutex_unlock(&fs_devices->device_list_mutex);

        return ret;
}

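/*
 * Illustrative sketch of the writer side that the smp_rmb() above pairs
 * with (a hedged paraphrase of the btrfs_dev_stat_inc() helper declared in
 * volumes.h, not a new primitive): the stat value is bumped first, then a
 * write barrier orders it before the dev_stats_ccnt increment that
 * btrfs_run_dev_stats() polls.
 */
static inline void dev_stat_inc_sketch(struct btrfs_device *dev, int index)
{
        atomic_inc(dev->dev_stat_values + index);
        /* Order the value store before the counter store */
        smp_mb__before_atomic();
        atomic_inc(&dev->dev_stats_ccnt);
}
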
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
        btrfs_dev_stat_inc(dev, index);
        btrfs_dev_stat_print_on_error(dev);
}

static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
{
        if (!dev->dev_stats_valid)
                return;
        btrfs_err_rl_in_rcu(dev->fs_info,
                "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
                           rcu_str_deref(dev->name),
                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
{
        int i;

        for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
                if (btrfs_dev_stat_read(dev, i) != 0)
                        break;
        if (i == BTRFS_DEV_STAT_VALUES_MAX)
                return; /* all values == 0, suppress message */

        btrfs_info_in_rcu(dev->fs_info,
                "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
               rcu_str_deref(dev->name),
               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
                        struct btrfs_ioctl_get_dev_stats *stats)
{
        struct btrfs_device *dev;
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        int i;

        mutex_lock(&fs_devices->device_list_mutex);
        dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL);
        mutex_unlock(&fs_devices->device_list_mutex);

        if (!dev) {
                btrfs_warn(fs_info, "get dev_stats failed, device not found");
                return -ENODEV;
        } else if (!dev->dev_stats_valid) {
                btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
                return -ENODEV;
        } else if (stats->flags & BTRFS_DEV_STATS_RESET) {
                for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
                        if (stats->nr_items > i)
                                stats->values[i] =
                                        btrfs_dev_stat_read_and_reset(dev, i);
                        else
                                btrfs_dev_stat_set(dev, i, 0);
                }
                btrfs_info(fs_info, "device stats zeroed by %s (%d)",
                           current->comm, task_pid_nr(current));
        } else {
                for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
                        if (stats->nr_items > i)
                                stats->values[i] = btrfs_dev_stat_read(dev, i);
        }
        if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
                stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
        return 0;
}

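/*
 * Illustrative userspace sketch (kept under #if 0 so it is never built with
 * the kernel): how a caller reaches btrfs_get_dev_stats() through the
 * BTRFS_IOC_GET_DEV_STATS ioctl.  The mount point and devid are assumptions;
 * error handling is trimmed to the minimum.
 */
#if 0
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

int main(void)
{
        struct btrfs_ioctl_get_dev_stats args;
        int fd = open("/mnt", O_RDONLY);        /* any file on the fs works */

        memset(&args, 0, sizeof(args));
        args.devid = 1;                         /* assumed device id */
        args.nr_items = BTRFS_DEV_STAT_VALUES_MAX;
        if (fd < 0 || ioctl(fd, BTRFS_IOC_GET_DEV_STATS, &args) < 0)
                return 1;
        printf("write errs: %llu\n",
               (unsigned long long)args.values[BTRFS_DEV_STAT_WRITE_ERRS]);
        return 0;
}
#endif
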
/*
 * Update the size and bytes used for each device where it changed.  This is
 * delayed since we would otherwise get errors while writing out the
 * superblocks.
 *
 * Must be invoked during transaction commit.
 */
void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
{
        struct btrfs_device *curr, *next;

        ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);

        if (list_empty(&trans->dev_update_list))
                return;

        /*
         * We don't need the device_list_mutex here.  This list is owned by the
         * transaction and the transaction must complete before the device is
         * released.
         */
        mutex_lock(&trans->fs_info->chunk_mutex);
        list_for_each_entry_safe(curr, next, &trans->dev_update_list,
                                 post_commit_list) {
                list_del_init(&curr->post_commit_list);
                curr->commit_total_bytes = curr->disk_total_bytes;
                curr->commit_bytes_used = curr->bytes_used;
        }
        mutex_unlock(&trans->fs_info->chunk_mutex);
}

/*
 * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
 */
int btrfs_bg_type_to_factor(u64 flags)
{
        const int index = btrfs_bg_flags_to_raid_index(flags);

        return btrfs_raid_array[index].ncopies;
}

static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
                                 u64 chunk_offset, u64 devid,
                                 u64 physical_offset, u64 physical_len)
{
        struct extent_map_tree *em_tree = &fs_info->mapping_tree;
        struct extent_map *em;
        struct map_lookup *map;
        struct btrfs_device *dev;
        u64 stripe_len;
        bool found = false;
        int ret = 0;
        int i;

        read_lock(&em_tree->lock);
        em = lookup_extent_mapping(em_tree, chunk_offset, 1);
        read_unlock(&em_tree->lock);

        if (!em) {
                btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
                          physical_offset, devid);
                ret = -EUCLEAN;
                goto out;
        }

        map = em->map_lookup;
        stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
        if (physical_len != stripe_len) {
                btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
                          physical_offset, devid, em->start, physical_len,
                          stripe_len);
                ret = -EUCLEAN;
                goto out;
        }

        for (i = 0; i < map->num_stripes; i++) {
                if (map->stripes[i].dev->devid == devid &&
                    map->stripes[i].physical == physical_offset) {
                        found = true;
                        if (map->verified_stripes >= map->num_stripes) {
                                btrfs_err(fs_info,
                                "too many dev extents for chunk %llu found",
                                          em->start);
                                ret = -EUCLEAN;
                                goto out;
                        }
                        map->verified_stripes++;
                        break;
                }
        }
        if (!found) {
                btrfs_err(fs_info,
        "dev extent physical offset %llu devid %llu has no corresponding chunk",
                        physical_offset, devid);
                ret = -EUCLEAN;
        }

        /* Make sure no dev extent is beyond device boundary */
        dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
        if (!dev) {
                btrfs_err(fs_info, "failed to find devid %llu", devid);
                ret = -EUCLEAN;
                goto out;
        }

        if (physical_offset + physical_len > dev->disk_total_bytes) {
                btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
                          devid, physical_offset, physical_len,
                          dev->disk_total_bytes);
                ret = -EUCLEAN;
                goto out;
        }
out:
        free_extent_map(em);
        return ret;
}

static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
{
        struct extent_map_tree *em_tree = &fs_info->mapping_tree;
        struct extent_map *em;
        struct rb_node *node;
        int ret = 0;

        read_lock(&em_tree->lock);
        for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
                em = rb_entry(node, struct extent_map, rb_node);
                if (em->map_lookup->num_stripes !=
                    em->map_lookup->verified_stripes) {
                        btrfs_err(fs_info,
                        "chunk %llu has missing dev extent, have %d expect %d",
                                  em->start, em->map_lookup->verified_stripes,
                                  em->map_lookup->num_stripes);
                        ret = -EUCLEAN;
                        goto out;
                }
        }
out:
        read_unlock(&em_tree->lock);
        return ret;
}

/*
 * Ensure that all dev extents are mapped to the correct chunk, otherwise
 * later chunk allocation/free would cause unexpected behavior.
 *
 * NOTE: This iterates through the whole device tree, which should be roughly
 * the same size as the chunk tree, so it slightly increases mount time.
 */
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
{
        struct btrfs_path *path;
        struct btrfs_root *root = fs_info->dev_root;
        struct btrfs_key key;
        u64 prev_devid = 0;
        u64 prev_dev_ext_end = 0;
        int ret = 0;

        /*
         * We don't have a dev_root because we mounted with ignorebadroots and
         * failed to load the root, so we want to skip the verification in this
         * case for sure.
         *
         * However if the dev root is fine, but the tree itself is corrupted
         * we'd still fail to mount.  This verification is only to make sure
         * writes can happen safely, so instead just bypass this check
         * completely in the case of IGNOREBADROOTS.
         */
        if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
                return 0;

        key.objectid = 1;
        key.type = BTRFS_DEV_EXTENT_KEY;
        key.offset = 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        path->reada = READA_FORWARD;
        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;

        if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
                ret = btrfs_next_item(root, path);
                if (ret < 0)
                        goto out;
                /* No dev extents at all? Not good */
                if (ret > 0) {
                        ret = -EUCLEAN;
                        goto out;
                }
        }
        while (1) {
                struct extent_buffer *leaf = path->nodes[0];
                struct btrfs_dev_extent *dext;
                int slot = path->slots[0];
                u64 chunk_offset;
                u64 physical_offset;
                u64 physical_len;
                u64 devid;

                btrfs_item_key_to_cpu(leaf, &key, slot);
                if (key.type != BTRFS_DEV_EXTENT_KEY)
                        break;
                devid = key.objectid;
                physical_offset = key.offset;

                dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
                chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
                physical_len = btrfs_dev_extent_length(leaf, dext);

                /* Check if this dev extent overlaps with the previous one */
                if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
                        btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
                                  devid, physical_offset, prev_dev_ext_end);
                        ret = -EUCLEAN;
                        goto out;
                }

                ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
                                            physical_offset, physical_len);
                if (ret < 0)
                        goto out;
                prev_devid = devid;
                prev_dev_ext_end = physical_offset + physical_len;

                ret = btrfs_next_item(root, path);
                if (ret < 0)
                        goto out;
                if (ret > 0) {
                        ret = 0;
                        break;
                }
        }

        /* Ensure all chunks have corresponding dev extents */
        ret = verify_chunk_dev_extent_mapping(fs_info);
out:
        btrfs_free_path(path);
        return ret;
}

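/*
 * Illustrative sketch of the overlap rule used in the scan above: dev
 * extents are visited in (devid, physical offset) key order, so remembering
 * where the previous extent on the same device ended is enough to detect
 * any overlap.
 */
static inline bool dev_extents_overlap(u64 prev_devid, u64 prev_end,
                                       u64 devid, u64 start)
{
        return devid == prev_devid && start < prev_end;
}
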
/*
 * Check whether the given block group or device is pinned by any inode being
 * used as a swapfile.
 */
bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
{
        struct btrfs_swapfile_pin *sp;
        struct rb_node *node;

        spin_lock(&fs_info->swapfile_pins_lock);
        node = fs_info->swapfile_pins.rb_node;
        while (node) {
                sp = rb_entry(node, struct btrfs_swapfile_pin, node);
                if (ptr < sp->ptr)
                        node = node->rb_left;
                else if (ptr > sp->ptr)
                        node = node->rb_right;
                else
                        break;
        }
        spin_unlock(&fs_info->swapfile_pins_lock);
        return node != NULL;
}