fs/btrfs/volumes.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
#include "misc.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "sysfs.h"
#include "tree-checker.h"
#include "space-info.h"
#include "block-group.h"
#include "discard.h"
#include "zoned.h"

const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = {
		.sub_stripes	= 2,
		.dev_stripes	= 1,
		.devs_max	= 0,	/* 0 == as many as possible */
		.devs_min	= 4,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
		.nparity	= 0,
		.raid_name	= "raid10",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID10,
		.mindev_error	= BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 2,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
		.nparity	= 0,
		.raid_name	= "raid1",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1C3] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 3,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 3,
		.ncopies	= 3,
		.nparity	= 0,
		.raid_name	= "raid1c3",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1C3,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1C4] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 4,
		.devs_min	= 4,
		.tolerated_failures = 3,
		.devs_increment	= 4,
		.ncopies	= 4,
		.nparity	= 0,
		.raid_name	= "raid1c4",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1C4,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
	},
	[BTRFS_RAID_DUP] = {
		.sub_stripes	= 1,
		.dev_stripes	= 2,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 2,
		.nparity	= 0,
		.raid_name	= "dup",
		.bg_flag	= BTRFS_BLOCK_GROUP_DUP,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_RAID0] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity	= 0,
		.raid_name	= "raid0",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID0,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_SINGLE] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity	= 0,
		.raid_name	= "single",
		.bg_flag	= 0,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_RAID5] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity	= 1,
		.raid_name	= "raid5",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID5,
		.mindev_error	= BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID6] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity	= 2,
		.raid_name	= "raid6",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID6,
		.mindev_error	= BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
	},
};
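
/*
 * Example reading of the table above, for RAID10: with ncopies = 2 and
 * sub_stripes = 2, every stripe element is stored twice, so usable space
 * is half of the allocated space and one device may fail
 * (tolerated_failures = 1). devs_max = 0 means a chunk may stripe across
 * as many devices as are available, in steps of devs_increment = 2.
 */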

const char *btrfs_bg_type_to_raid_name(u64 flags)
{
	const int index = btrfs_bg_flags_to_raid_index(flags);

	if (index >= BTRFS_NR_RAID_TYPES)
		return NULL;

	return btrfs_raid_array[index].raid_name;
}
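
/*
 * For example, btrfs_bg_type_to_raid_name(BTRFS_BLOCK_GROUP_RAID5) returns
 * "raid5", while a flags value that maps outside the raid table yields
 * NULL, so callers printing the name must handle that case.
 */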

/*
 * Fill @buf with textual description of @bg_flags, no more than @size_buf
 * bytes including terminating null byte.
 */
void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
{
	int i;
	int ret;
	char *bp = buf;
	u64 flags = bg_flags;
	u32 size_bp = size_buf;

	if (!flags) {
		strcpy(bp, "NONE");
		return;
	}

#define DESCRIBE_FLAG(flag, desc)					\
	do {								\
		if (flags & (flag)) {					\
			ret = snprintf(bp, size_bp, "%s|", (desc));	\
			if (ret < 0 || ret >= size_bp)			\
				goto out_overflow;			\
			size_bp -= ret;					\
			bp += ret;					\
			flags &= ~(flag);				\
		}							\
	} while (0)

	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");

	DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
		DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
			      btrfs_raid_array[i].raid_name);
#undef DESCRIBE_FLAG

	if (flags) {
		ret = snprintf(bp, size_bp, "0x%llx|", flags);
		size_bp -= ret;
	}

	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0'; /* remove last | */

	/*
	 * The text is trimmed, it's up to the caller to provide a
	 * sufficiently large buffer.
	 */
out_overflow:;
}
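
/*
 * Example: for bg_flags == (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_RAID1)
 * and a large enough buffer, the function above fills buf with "data|raid1";
 * the trailing '|' left by the last DESCRIBE_FLAG round is overwritten with
 * the terminating NUL.
 */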

static int init_first_rw_device(struct btrfs_trans_handle *trans);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
			     enum btrfs_map_op op,
			     u64 logical, u64 *length,
			     struct btrfs_bio **bbio_ret,
			     int mirror_num, int need_raid_map);

/*
 * Device locking
 * ==============
 *
 * There are several mutexes that protect manipulation of devices and low-level
 * structures like chunks but not block groups, extents or files
 *
 * uuid_mutex (global lock)
 * ------------------------
 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 * device) or requested by the device= mount option
 *
 * the mutex can be very coarse and can cover long-running operations
 *
 * protects: updates to fs_devices counters like missing devices, rw devices,
 * seeding, structure cloning, opening/closing devices at mount/umount time
 *
 * global::fs_devs - add, remove, updates to the global list
 *
 * does not protect: manipulation of the fs_devices::devices list in general
 * but in mount context it could be used to exclude list modifications by eg.
 * scan ioctl
 *
 * btrfs_device::name - renames (write side), read is RCU
 *
 * fs_devices::device_list_mutex (per-fs, with RCU)
 * ------------------------------------------------
 * protects updates to fs_devices::devices, ie. adding and deleting
 *
 * simple list traversal with read-only actions can be done with RCU protection
 *
 * may be used to exclude some operations from running concurrently without any
 * modifications to the list (see write_all_supers)
 *
 * Is not required at mount and close times, because our device list is
 * protected by the uuid_mutex at that point.
 *
 * balance_mutex
 * -------------
 * protects balance structures (status, state) and context accessed from
 * several places (internally, ioctl)
 *
 * chunk_mutex
 * -----------
 * protects chunks, adding or removing during allocation, trim or when a new
 * device is added/removed. Additionally it also protects post_commit_list of
 * individual devices, since they can be added to the transaction's
 * post_commit_list only with chunk_mutex held.
 *
 * cleaner_mutex
 * -------------
 * a big lock that is held by the cleaner thread and prevents running subvolume
 * cleaning together with relocation or delayed iputs
 *
 *
 * Lock nesting
 * ============
 *
 * uuid_mutex
 *   device_list_mutex
 *     chunk_mutex
 *   balance_mutex
 *
 *
 * Exclusive operations
 * ====================
 *
 * Maintains the exclusivity of the following operations that apply to the
 * whole filesystem and cannot run in parallel.
 *
 * - Balance (*)
 * - Device add
 * - Device remove
 * - Device replace (*)
 * - Resize
 *
 * The device operations (as above) can be in one of the following states:
 *
 * - Running state
 * - Paused state
 * - Completed state
 *
 * Only device operations marked with (*) can go into the Paused state for the
 * following reasons:
 *
 * - ioctl (only Balance can be Paused through ioctl)
 * - filesystem remounted as read-only
 * - filesystem unmounted and mounted as read-only
 * - system power-cycle and filesystem mounted as read-only
 * - filesystem or device errors leading to forced read-only
 *
 * The status of exclusive operation is set and cleared atomically.
 * During the course of Paused state, fs_info::exclusive_operation remains set.
 * A device operation in Paused or Running state can be canceled or resumed
 * either by ioctl (Balance only) or when remounted as read-write.
 * The exclusive status is cleared when the device operation is canceled or
 * completed.
 */
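
/*
 * Sketch of the nesting rules above (hypothetical caller): a thread that
 * needs both the per-fs device list and chunk state must take the locks
 * outside-in and release them inside-out, e.g.:
 *
 *	mutex_lock(&uuid_mutex);
 *	mutex_lock(&fs_devices->device_list_mutex);
 *	mutex_lock(&fs_info->chunk_mutex);
 *	... manipulate devices/chunks ...
 *	mutex_unlock(&fs_info->chunk_mutex);
 *	mutex_unlock(&fs_devices->device_list_mutex);
 *	mutex_unlock(&uuid_mutex);
 *
 * Taking them in any other order risks an ABBA deadlock.
 */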

DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
{
	return &fs_uuids;
}

/*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid:		if not NULL, copy the UUID to fs_devices::fsid
 * @metadata_fsid:	if not NULL, copy the UUID to fs_devices::metadata_fsid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
						 const u8 *metadata_fsid)
{
	struct btrfs_fs_devices *fs_devs;

	fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
	if (!fs_devs)
		return ERR_PTR(-ENOMEM);

	mutex_init(&fs_devs->device_list_mutex);

	INIT_LIST_HEAD(&fs_devs->devices);
	INIT_LIST_HEAD(&fs_devs->alloc_list);
	INIT_LIST_HEAD(&fs_devs->fs_list);
	INIT_LIST_HEAD(&fs_devs->seed_list);
	if (fsid)
		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);

	if (metadata_fsid)
		memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
	else if (fsid)
		memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);

	return fs_devs;
}

void btrfs_free_device(struct btrfs_device *device)
{
	WARN_ON(!list_empty(&device->post_commit_list));
	rcu_string_free(device->name);
	extent_io_tree_release(&device->alloc_state);
	bio_put(device->flush_bio);
	btrfs_destroy_dev_zone_info(device);
	kfree(device);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;

	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		btrfs_free_device(device);
	}
	kfree(fs_devices);
}

void __exit btrfs_cleanup_fs_uuids(void)
{
	struct btrfs_fs_devices *fs_devices;

	while (!list_empty(&fs_uuids)) {
		fs_devices = list_entry(fs_uuids.next,
					struct btrfs_fs_devices, fs_list);
		list_del(&fs_devices->fs_list);
		free_fs_devices(fs_devices);
	}
}

/*
 * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
 * Returned struct is not linked onto any lists and must be destroyed using
 * btrfs_free_device.
 */
static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info)
{
	struct btrfs_device *dev;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return ERR_PTR(-ENOMEM);

	/*
	 * Preallocate a bio that's always going to be used for flushing device
	 * barriers and matches the device lifespan
	 */
	dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0);
	if (!dev->flush_bio) {
		kfree(dev);
		return ERR_PTR(-ENOMEM);
	}

	INIT_LIST_HEAD(&dev->dev_list);
	INIT_LIST_HEAD(&dev->dev_alloc_list);
	INIT_LIST_HEAD(&dev->post_commit_list);

	atomic_set(&dev->reada_in_flight, 0);
	atomic_set(&dev->dev_stats_ccnt, 0);
	btrfs_device_data_ordered_init(dev);
	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
	extent_io_tree_init(fs_info, &dev->alloc_state,
			    IO_TREE_DEVICE_ALLOC_STATE, NULL);

	return dev;
}

static noinline struct btrfs_fs_devices *find_fsid(
		const u8 *fsid, const u8 *metadata_fsid)
{
	struct btrfs_fs_devices *fs_devices;

	ASSERT(fsid);

	/* Handle non-split brain cases */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (metadata_fsid) {
			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
			    && memcmp(metadata_fsid, fs_devices->metadata_uuid,
				      BTRFS_FSID_SIZE) == 0)
				return fs_devices;
		} else {
			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
				return fs_devices;
		}
	}
	return NULL;
}

static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
				struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handle scanned device having completed its fsid change but
	 * belonging to a fs_devices that was created by first scanning
	 * a device which didn't have its fsid/metadata_uuid changed
	 * at all and the CHANGING_FSID_V2 flag set.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (fs_devices->fsid_change &&
		    memcmp(disk_super->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0) {
			return fs_devices;
		}
	}
	/*
	 * Handle scanned device having completed its fsid change but
	 * belonging to a fs_devices that was created by a device that
	 * has an outdated pair of fsid/metadata_uuid and
	 * CHANGING_FSID_V2 flag set.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (fs_devices->fsid_change &&
		    memcmp(fs_devices->metadata_uuid,
			   fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
		    memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0) {
			return fs_devices;
		}
	}

	return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
}
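
/*
 * Concrete (hypothetical) scenario for the first loop above: a filesystem
 * with fsid A starts a change to fsid B. A device scanned before the change
 * propagated still carries fsid == metadata_uuid == A plus the
 * CHANGING_FSID_V2 flag and creates the fs_devices. A device scanned after
 * completing the change carries fsid B and metadata_uuid A, so it is matched
 * to that fs_devices by comparing its metadata_uuid against fs_devices->fsid.
 */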

static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
		      int flush, struct block_device **bdev,
		      struct btrfs_super_block **disk_super)
{
	int ret;

	*bdev = blkdev_get_by_path(device_path, flags, holder);

	if (IS_ERR(*bdev)) {
		ret = PTR_ERR(*bdev);
		goto error;
	}

	if (flush)
		filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
	ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
	if (ret) {
		blkdev_put(*bdev, flags);
		goto error;
	}
	invalidate_bdev(*bdev);
	*disk_super = btrfs_read_dev_super(*bdev);
	if (IS_ERR(*disk_super)) {
		ret = PTR_ERR(*disk_super);
		blkdev_put(*bdev, flags);
		goto error;
	}

	return 0;

error:
	*bdev = NULL;
	return ret;
}

static bool device_path_matched(const char *path, struct btrfs_device *device)
{
	int found;

	rcu_read_lock();
	found = strcmp(rcu_str_deref(device->name), path);
	rcu_read_unlock();

	return found == 0;
}

/*
 * Search and remove all stale devices (devices which are not mounted).
 * When both inputs are NULL, it will search and release all stale devices.
 *
 * @path:	Optional. When provided, it will release all unmounted devices
 *		matching this path only.
 * @skip_dev:	Optional. Will skip this device when searching for stale
 *		devices.
 *
 * Return:	0 for success or if @path is NULL.
 *		-EBUSY if @path is a mounted device.
 *		-ENOENT if @path does not match any device in the list.
 */
static int btrfs_free_stale_devices(const char *path,
				     struct btrfs_device *skip_device)
{
	struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
	struct btrfs_device *device, *tmp_device;
	int ret = 0;

	if (path)
		ret = -ENOENT;

	list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
		mutex_lock(&fs_devices->device_list_mutex);
		list_for_each_entry_safe(device, tmp_device,
					 &fs_devices->devices, dev_list) {
			if (skip_device && skip_device == device)
				continue;
			if (path && !device->name)
				continue;
			if (path && !device_path_matched(path, device))
				continue;
			if (fs_devices->opened) {
				/* for an already deleted device return 0 */
				if (path && ret != 0)
					ret = -EBUSY;
				break;
			}

			/* delete the stale device */
			fs_devices->num_devices--;
			list_del(&device->dev_list);
			btrfs_free_device(device);

			ret = 0;
		}
		mutex_unlock(&fs_devices->device_list_mutex);

		if (fs_devices->num_devices == 0) {
			btrfs_sysfs_remove_fsid(fs_devices);
			list_del(&fs_devices->fs_list);
			free_fs_devices(fs_devices);
		}
	}

	return ret;
}
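
/*
 * Example behavior of the function above: when called from
 * btrfs_forget_devices() below with a non-NULL path, a currently mounted
 * device yields -EBUSY, an unregistered path yields -ENOENT, and a
 * successfully removed stale entry yields 0.
 */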

/*
 * This is only used on mount, and we are protected from competing things
 * messing with our fs_devices by the uuid_mutex, thus we do not need the
 * fs_devices->device_list_mutex here.
 */
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
			struct btrfs_device *device, fmode_t flags,
			void *holder)
{
	struct request_queue *q;
	struct block_device *bdev;
	struct btrfs_super_block *disk_super;
	u64 devid;
	int ret;

	if (device->bdev)
		return -EINVAL;
	if (!device->name)
		return -EINVAL;

	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
				    &bdev, &disk_super);
	if (ret)
		return ret;

	devid = btrfs_stack_device_id(&disk_super->dev_item);
	if (devid != device->devid)
		goto error_free_page;

	if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
		goto error_free_page;

	device->generation = btrfs_super_generation(disk_super);

	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
		if (btrfs_super_incompat_flags(disk_super) &
		    BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
			pr_err(
		"BTRFS: Invalid seeding and uuid-changed device detected\n");
			goto error_free_page;
		}

		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		fs_devices->seeding = true;
	} else {
		if (bdev_read_only(bdev))
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		else
			set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	}

	q = bdev_get_queue(bdev);
	if (!blk_queue_nonrot(q))
		fs_devices->rotating = true;

	device->bdev = bdev;
	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	device->mode = flags;

	fs_devices->open_devices++;
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		fs_devices->rw_devices++;
		list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
	}
	btrfs_release_disk_super(disk_super);

	return 0;

error_free_page:
	btrfs_release_disk_super(disk_super);
	blkdev_put(bdev, flags);

	return -EINVAL;
}

/*
 * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
 * being created with a disk that has already completed its fsid change. Such
 * disk can belong to an fs which has its FSID changed or to one which doesn't.
 * Handle both cases here.
 */
static struct btrfs_fs_devices *find_fsid_inprogress(
					struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
			   BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
			return fs_devices;
		}
	}

	return find_fsid(disk_super->fsid, NULL);
}

static struct btrfs_fs_devices *find_fsid_changed(
					struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handles the case where the scanned device is part of an fs that had
	 * multiple successful changes of FSID but the currently scanned device
	 * didn't observe it, meaning our fsid will be different from theirs.
	 * We need to handle two subcases:
	 *  1 - The fs still continues to have different METADATA/FSID uuids.
	 *  2 - The fs is switched back to its original FSID (METADATA/FSID
	 *  are equal).
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		/* Changed UUIDs */
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, disk_super->fsid,
			   BTRFS_FSID_SIZE) != 0)
			return fs_devices;

		/* Unchanged UUIDs */
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, disk_super->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0)
			return fs_devices;
	}

	return NULL;
}

static struct btrfs_fs_devices *find_fsid_reverted_metadata(
				struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handle the case where the scanned device is part of an fs whose last
	 * metadata UUID change reverted it to the original FSID. At the same
	 * time fs_devices was first created by another constituent device
	 * which didn't fully observe the operation. This results in a
	 * btrfs_fs_devices created with metadata/fsid different AND
	 * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
	 * fs_devices equal to the FSID of the disk.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    fs_devices->fsid_change)
			return fs_devices;
	}

	return NULL;
}

/*
 * Add new device to list of registered devices
 *
 * Returns:
 * device pointer which was just added or updated when successful
 * error pointer when failed
 */
static noinline struct btrfs_device *device_list_add(const char *path,
			   struct btrfs_super_block *disk_super,
			   bool *new_device_added)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices = NULL;
	struct rcu_string *name;
	u64 found_transid = btrfs_super_generation(disk_super);
	u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
	bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
		BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
	bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
					BTRFS_SUPER_FLAG_CHANGING_FSID_V2);

	if (fsid_change_in_progress) {
		if (!has_metadata_uuid)
			fs_devices = find_fsid_inprogress(disk_super);
		else
			fs_devices = find_fsid_changed(disk_super);
	} else if (has_metadata_uuid) {
		fs_devices = find_fsid_with_metadata_uuid(disk_super);
	} else {
		fs_devices = find_fsid_reverted_metadata(disk_super);
		if (!fs_devices)
			fs_devices = find_fsid(disk_super->fsid, NULL);
	}

	if (!fs_devices) {
		if (has_metadata_uuid)
			fs_devices = alloc_fs_devices(disk_super->fsid,
						      disk_super->metadata_uuid);
		else
			fs_devices = alloc_fs_devices(disk_super->fsid, NULL);

		if (IS_ERR(fs_devices))
			return ERR_CAST(fs_devices);

		fs_devices->fsid_change = fsid_change_in_progress;

		mutex_lock(&fs_devices->device_list_mutex);
		list_add(&fs_devices->fs_list, &fs_uuids);

		device = NULL;
	} else {
		mutex_lock(&fs_devices->device_list_mutex);
		device = btrfs_find_device(fs_devices, devid,
				disk_super->dev_item.uuid, NULL);

		/*
		 * If this disk has been pulled into an fs devices created by
		 * a device which had the CHANGING_FSID_V2 flag then replace the
		 * metadata_uuid/fsid values of the fs_devices.
		 */
		if (fs_devices->fsid_change &&
		    found_transid > fs_devices->latest_generation) {
			memcpy(fs_devices->fsid, disk_super->fsid,
					BTRFS_FSID_SIZE);

			if (has_metadata_uuid)
				memcpy(fs_devices->metadata_uuid,
				       disk_super->metadata_uuid,
				       BTRFS_FSID_SIZE);
			else
				memcpy(fs_devices->metadata_uuid,
				       disk_super->fsid, BTRFS_FSID_SIZE);

			fs_devices->fsid_change = false;
		}
	}

	if (!device) {
		if (fs_devices->opened) {
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-EBUSY);
		}

		device = btrfs_alloc_device(NULL, &devid,
					    disk_super->dev_item.uuid);
		if (IS_ERR(device)) {
			mutex_unlock(&fs_devices->device_list_mutex);
			/* we can safely leave the fs_devices entry around */
			return device;
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			btrfs_free_device(device);
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-ENOMEM);
		}
		rcu_assign_pointer(device->name, name);

		list_add_rcu(&device->dev_list, &fs_devices->devices);
		fs_devices->num_devices++;

		device->fs_devices = fs_devices;
		*new_device_added = true;

		if (disk_super->label[0])
			pr_info(
	"BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
				disk_super->label, devid, found_transid, path,
				current->comm, task_pid_nr(current));
		else
			pr_info(
	"BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
				disk_super->fsid, devid, found_transid, path,
				current->comm, task_pid_nr(current));

	} else if (!device->name || strcmp(device->name->str, path)) {
		/*
		 * When FS is already mounted.
		 * 1. If you are here and if the device->name is NULL that
		 *    means this device was missing at the time of FS mount.
		 * 2. If you are here and if the device->name is different
		 *    from 'path' that means either
		 *      a. The same device disappeared and reappeared with a
		 *         different name, or
		 *      b. The missing disk which was replaced has
		 *         reappeared now.
		 *
		 * We must allow 1 and 2a above, but 2b would be spurious
		 * and unintentional.
		 *
		 * Further, in case of 1 and 2a above, the disk at 'path'
		 * would have missed some transactions while it was away, and
		 * in case of 2a the stale bdev has to be updated as well.
		 * 2b must not be allowed at any time.
		 */

		/*
		 * For now, we do allow update to btrfs_fs_device through the
		 * btrfs dev scan cli after FS has been mounted.  We're still
		 * tracking a problem where systems fail mount by subvolume id
		 * when we reject replacement on a mounted FS.
		 */
		if (!fs_devices->opened && found_transid < device->generation) {
			/*
			 * That is, if the FS is _not_ mounted and you are
			 * here, there is more than one disk with the same
			 * uuid and devid. We keep the one with the larger
			 * generation number, or the last one scanned if the
			 * generations are equal.
			 */
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-EEXIST);
		}

		/*
		 * We are going to replace the device path for a given devid,
		 * make sure it's the same device if the device is mounted
		 */
		if (device->bdev) {
			int error;
			dev_t path_dev;

			error = lookup_bdev(path, &path_dev);
			if (error) {
				mutex_unlock(&fs_devices->device_list_mutex);
				return ERR_PTR(error);
			}

			if (device->bdev->bd_dev != path_dev) {
				mutex_unlock(&fs_devices->device_list_mutex);
				/*
				 * device->fs_info may not be reliable here, so
				 * pass in a NULL instead. This avoids a
				 * possible use-after-free when the fs_info and
				 * fs_info->sb are already torn down.
				 */
				btrfs_warn_in_rcu(NULL,
	"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
						  path, devid, found_transid,
						  current->comm,
						  task_pid_nr(current));
				return ERR_PTR(-EEXIST);
			}
			btrfs_info_in_rcu(device->fs_info,
	"devid %llu device path %s changed to %s scanned by %s (%d)",
					  devid, rcu_str_deref(device->name),
					  path, current->comm,
					  task_pid_nr(current));
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-ENOMEM);
		}
		rcu_string_free(device->name);
		rcu_assign_pointer(device->name, name);
		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
			fs_devices->missing_devices--;
			clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		}
	}

	/*
	 * Unmount does not free the btrfs_device struct but would zero
	 * generation along with most of the other members. So just update
	 * it back. We need it to pick the disk with largest generation
	 * (as above).
	 */
	if (!fs_devices->opened) {
		device->generation = found_transid;
		fs_devices->latest_generation = max_t(u64, found_transid,
						fs_devices->latest_generation);
	}

	fs_devices->total_devices = btrfs_super_num_devices(disk_super);

	mutex_unlock(&fs_devices->device_list_mutex);
	return device;
}
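
/*
 * Example flow for the function above: the first scan of a device allocates
 * a new btrfs_device (and possibly the fs_devices), sets *new_device_added
 * and logs the "scanned by" message; a second scan of the same path for an
 * unmounted fs finds the existing entry, only refreshes device->generation
 * and fs_devices->latest_generation, and returns the same device.
 */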

static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_device *device;
	struct btrfs_device *orig_dev;
	int ret = 0;

	fs_devices = alloc_fs_devices(orig->fsid, NULL);
	if (IS_ERR(fs_devices))
		return fs_devices;

	mutex_lock(&orig->device_list_mutex);
	fs_devices->total_devices = orig->total_devices;

	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
		struct rcu_string *name;

		device = btrfs_alloc_device(NULL, &orig_dev->devid,
					    orig_dev->uuid);
		if (IS_ERR(device)) {
			ret = PTR_ERR(device);
			goto error;
		}

		/*
		 * This is ok to do without rcu read locked because we hold the
		 * uuid mutex so nothing we touch in here is going to disappear.
		 */
		if (orig_dev->name) {
			name = rcu_string_strdup(orig_dev->name->str,
					GFP_KERNEL);
			if (!name) {
				btrfs_free_device(device);
				ret = -ENOMEM;
				goto error;
			}
			rcu_assign_pointer(device->name, name);
		}

		list_add(&device->dev_list, &fs_devices->devices);
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
	mutex_unlock(&orig->device_list_mutex);
	return fs_devices;
error:
	mutex_unlock(&orig->device_list_mutex);
	free_fs_devices(fs_devices);
	return ERR_PTR(ret);
}

static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
				      struct btrfs_device **latest_dev)
{
	struct btrfs_device *device, *next;

	/* This is the initialized path, it is safe to release the devices. */
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
				      &device->dev_state) &&
			    !test_bit(BTRFS_DEV_STATE_MISSING,
				      &device->dev_state) &&
			    (!*latest_dev ||
			     device->generation > (*latest_dev)->generation)) {
				*latest_dev = device;
			}
			continue;
		}

		/*
		 * We have already validated the presence of
		 * BTRFS_DEV_REPLACE_DEVID in btrfs_init_dev_replace(), so just
		 * continue.
		 */
		if (device->devid == BTRFS_DEV_REPLACE_DEVID)
			continue;

		if (device->bdev) {
			blkdev_put(device->bdev, device->mode);
			device->bdev = NULL;
			fs_devices->open_devices--;
		}
		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
			list_del_init(&device->dev_alloc_list);
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		}
		list_del_init(&device->dev_list);
		fs_devices->num_devices--;
		btrfs_free_device(device);
	}
}

/*
 * After we have read the system tree and know devids belonging to this
 * filesystem, remove any device that does not belong to it.
 */
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *latest_dev = NULL;
	struct btrfs_fs_devices *seed_dev;

	mutex_lock(&uuid_mutex);
	__btrfs_free_extra_devids(fs_devices, &latest_dev);

	list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
		__btrfs_free_extra_devids(seed_dev, &latest_dev);

	fs_devices->latest_bdev = latest_dev->bdev;

	mutex_unlock(&uuid_mutex);
}

static void btrfs_close_bdev(struct btrfs_device *device)
{
	if (!device->bdev)
		return;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		sync_blockdev(device->bdev);
		invalidate_bdev(device->bdev);
	}

	blkdev_put(device->bdev, device->mode);
}

static void btrfs_close_one_device(struct btrfs_device *device)
{
	struct btrfs_fs_devices *fs_devices = device->fs_devices;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		list_del_init(&device->dev_alloc_list);
		fs_devices->rw_devices--;
	}

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
		fs_devices->missing_devices--;

	btrfs_close_bdev(device);
	if (device->bdev) {
		fs_devices->open_devices--;
		device->bdev = NULL;
	}
	clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	btrfs_destroy_dev_zone_info(device);

	device->fs_info = NULL;
	atomic_set(&device->dev_stats_ccnt, 0);
	extent_io_tree_release(&device->alloc_state);

	/* Verify the device is back in a pristine state */
	ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
	ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
	ASSERT(list_empty(&device->dev_alloc_list));
	ASSERT(list_empty(&device->post_commit_list));
	ASSERT(atomic_read(&device->reada_in_flight) == 0);
}

static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device, *tmp;

	lockdep_assert_held(&uuid_mutex);

	if (--fs_devices->opened > 0)
		return;

	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
		btrfs_close_one_device(device);

	WARN_ON(fs_devices->open_devices);
	WARN_ON(fs_devices->rw_devices);
	fs_devices->opened = 0;
	fs_devices->seeding = false;
	fs_devices->fs_info = NULL;
}

void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	LIST_HEAD(list);
	struct btrfs_fs_devices *tmp;

	mutex_lock(&uuid_mutex);
	close_fs_devices(fs_devices);
	if (!fs_devices->opened)
		list_splice_init(&fs_devices->seed_list, &list);

	list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
		close_fs_devices(fs_devices);
		list_del(&fs_devices->seed_list);
		free_fs_devices(fs_devices);
	}
	mutex_unlock(&uuid_mutex);
}

static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
				fmode_t flags, void *holder)
{
	struct btrfs_device *device;
	struct btrfs_device *latest_dev = NULL;
	struct btrfs_device *tmp_device;

	flags |= FMODE_EXCL;

	list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
				 dev_list) {
		int ret;

		ret = btrfs_open_one_device(fs_devices, device, flags, holder);
		if (ret == 0 &&
		    (!latest_dev || device->generation > latest_dev->generation)) {
			latest_dev = device;
		} else if (ret == -ENODATA) {
			fs_devices->num_devices--;
			list_del(&device->dev_list);
			btrfs_free_device(device);
		}
	}
	if (fs_devices->open_devices == 0)
		return -EINVAL;

	fs_devices->opened = 1;
	fs_devices->latest_bdev = latest_dev->bdev;
	fs_devices->total_rw_bytes = 0;
	fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
	fs_devices->read_policy = BTRFS_READ_POLICY_PID;

	return 0;
}

static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	struct btrfs_device *dev1, *dev2;

	dev1 = list_entry(a, struct btrfs_device, dev_list);
	dev2 = list_entry(b, struct btrfs_device, dev_list);

	if (dev1->devid < dev2->devid)
		return -1;
	else if (dev1->devid > dev2->devid)
		return 1;
	return 0;
}
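
/*
 * devid_cmp() is the comparator fed to list_sort() in btrfs_open_devices()
 * below, so that the devices of a filesystem are opened in ascending devid
 * order regardless of the order in which they were scanned.
 */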

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
		       fmode_t flags, void *holder)
{
	int ret;

	lockdep_assert_held(&uuid_mutex);
	/*
	 * The device_list_mutex cannot be taken here in case opening the
	 * underlying device takes further locks like bd_mutex.
	 *
	 * We also don't need the lock here as this is called during mount and
	 * exclusion is provided by uuid_mutex
	 */

	if (fs_devices->opened) {
		fs_devices->opened++;
		ret = 0;
	} else {
		list_sort(NULL, &fs_devices->devices, devid_cmp);
		ret = open_fs_devices(fs_devices, flags, holder);
	}

	return ret;
}

void btrfs_release_disk_super(struct btrfs_super_block *super)
{
	struct page *page = virt_to_page(super);

	put_page(page);
}

static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
						       u64 bytenr, u64 bytenr_orig)
{
	struct btrfs_super_block *disk_super;
	struct page *page;
	void *p;
	pgoff_t index;

	/* make sure our super fits in the device */
	if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
		return ERR_PTR(-EINVAL);

	/* make sure our super fits in the page */
	if (sizeof(*disk_super) > PAGE_SIZE)
		return ERR_PTR(-EINVAL);

	/* make sure our super doesn't straddle pages on disk */
	index = bytenr >> PAGE_SHIFT;
	if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
		return ERR_PTR(-EINVAL);

	/* pull in the page with our super */
	page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);

	if (IS_ERR(page))
		return ERR_CAST(page);

	p = page_address(page);

	/* align our pointer to the offset of the super block */
	disk_super = p + offset_in_page(bytenr);

	if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
	    btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
		btrfs_release_disk_super(p);
		return ERR_PTR(-EINVAL);
	}

	if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
		disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;

	return disk_super;
}
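
/*
 * Worked example for the checks above, assuming PAGE_SIZE == 4K: for the
 * primary mirror at bytenr 65536 the page index is 65536 >> 12 == 16, and
 * since (65536 + sizeof(*disk_super) - 1) >> 12 is also 16 the superblock
 * sits entirely inside that page; disk_super then points at offset 0 within
 * the page returned by read_cache_page_gfp().
 */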

int btrfs_forget_devices(const char *path)
{
	int ret;

	mutex_lock(&uuid_mutex);
	ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
	mutex_unlock(&uuid_mutex);

	return ret;
}

/*
 * Look for a btrfs signature on a device. This may be called out of the mount
 * path and we are not allowed to call set_blocksize during the scan. The
 * superblock is read via pagecache.
 */
struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
					   void *holder)
{
	struct btrfs_super_block *disk_super;
	bool new_device_added = false;
	struct btrfs_device *device = NULL;
	struct block_device *bdev;
	u64 bytenr, bytenr_orig;
	int ret;

	lockdep_assert_held(&uuid_mutex);

	/*
	 * we would like to check all the supers, but that would make
	 * a btrfs mount succeed after a mkfs from a different FS.
	 * So, we need to add a special mount option to scan for
	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
	 */
	flags |= FMODE_EXCL;

	bdev = blkdev_get_by_path(path, flags, holder);
	if (IS_ERR(bdev))
		return ERR_CAST(bdev);

	bytenr_orig = btrfs_sb_offset(0);
	ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
	if (ret) {
		/* Don't leak the exclusively opened bdev on error */
		device = ERR_PTR(ret);
		goto error_bdev_put;
	}

	disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
	if (IS_ERR(disk_super)) {
		device = ERR_CAST(disk_super);
		goto error_bdev_put;
	}

	device = device_list_add(path, disk_super, &new_device_added);
	if (!IS_ERR(device)) {
		if (new_device_added)
			btrfs_free_stale_devices(path, device);
	}

	btrfs_release_disk_super(disk_super);

error_bdev_put:
	blkdev_put(bdev, flags);

	return device;
}
1381
1382 /*
1383  * Try to find a chunk that intersects [start, start + len] range and when one
1384  * such is found, record the end of it in *start
1385  */
1386 static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
1387                                     u64 len)
1388 {
1389         u64 physical_start, physical_end;
1390
1391         lockdep_assert_held(&device->fs_info->chunk_mutex);
1392
1393         if (!find_first_extent_bit(&device->alloc_state, *start,
1394                                    &physical_start, &physical_end,
1395                                    CHUNK_ALLOCATED, NULL)) {
1396
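		/*
		 * The ranges overlap if either one's start falls within the
		 * other; in that case move *start past the end of the
		 * allocated extent.
		 */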
1397                 if (in_range(physical_start, *start, len) ||
1398                     in_range(*start, physical_start,
1399                              physical_end - physical_start)) {
1400                         *start = physical_end + 1;
1401                         return true;
1402                 }
1403         }
1404         return false;
1405 }
1406
1407 static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
1408 {
1409         switch (device->fs_devices->chunk_alloc_policy) {
1410         case BTRFS_CHUNK_ALLOC_REGULAR:
1411                 /*
1412                  * We don't want to overwrite the superblock on the drive nor
1413                  * any area used by the boot loader (grub for example), so we
1414                  * make sure to start at an offset of at least 1MB.
1415                  */
1416                 return max_t(u64, start, SZ_1M);
1417         case BTRFS_CHUNK_ALLOC_ZONED:
1418                 /*
		 * We don't care about the starting region like the regular
		 * allocator does, because we anyway use/reserve the first two
		 * zones for superblock logging.
1422                  */
1423                 return ALIGN(start, device->zone_info->zone_size);
1424         default:
1425                 BUG();
1426         }
1427 }
1428
1429 static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
1430                                         u64 *hole_start, u64 *hole_size,
1431                                         u64 num_bytes)
1432 {
1433         u64 zone_size = device->zone_info->zone_size;
1434         u64 pos;
1435         int ret;
1436         bool changed = false;
1437
1438         ASSERT(IS_ALIGNED(*hole_start, zone_size));
1439
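	/*
	 * Walk the hole: clamp its start to the next allocatable position,
	 * then make sure the zones backing it are empty. If a zone cannot
	 * be emptied, skip past it and retry with the shrunk hole.
	 */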
1440         while (*hole_size > 0) {
1441                 pos = btrfs_find_allocatable_zones(device, *hole_start,
1442                                                    *hole_start + *hole_size,
1443                                                    num_bytes);
1444                 if (pos != *hole_start) {
1445                         *hole_size = *hole_start + *hole_size - pos;
1446                         *hole_start = pos;
1447                         changed = true;
1448                         if (*hole_size < num_bytes)
1449                                 break;
1450                 }
1451
1452                 ret = btrfs_ensure_empty_zones(device, pos, num_bytes);
1453
1454                 /* Range is ensured to be empty */
1455                 if (!ret)
1456                         return changed;
1457
1458                 /* Given hole range was invalid (outside of device) */
1459                 if (ret == -ERANGE) {
1460                         *hole_start += *hole_size;
1461                         *hole_size = 0;
			return true;
1463                 }
1464
1465                 *hole_start += zone_size;
1466                 *hole_size -= zone_size;
1467                 changed = true;
1468         }
1469
1470         return changed;
1471 }
1472
/**
 * dev_extent_hole_check - check if specified hole is suitable for allocation
 * @device:	the device which has the hole
 * @hole_start:	starting position of the hole
 * @hole_size:	the size of the hole
 * @num_bytes:	the size of the free space that we need
 *
 * This function may modify @hole_start and @hole_size to reflect the suitable
 * position for allocation. Returns true if the hole position is updated,
 * false otherwise.
 */
1483 static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
1484                                   u64 *hole_size, u64 num_bytes)
1485 {
1486         bool changed = false;
1487         u64 hole_end = *hole_start + *hole_size;
1488
1489         for (;;) {
1490                 /*
1491                  * Check before we set max_hole_start, otherwise we could end up
1492                  * sending back this offset anyway.
1493                  */
1494                 if (contains_pending_extent(device, hole_start, *hole_size)) {
1495                         if (hole_end >= *hole_start)
1496                                 *hole_size = hole_end - *hole_start;
1497                         else
1498                                 *hole_size = 0;
1499                         changed = true;
1500                 }
1501
1502                 switch (device->fs_devices->chunk_alloc_policy) {
1503                 case BTRFS_CHUNK_ALLOC_REGULAR:
1504                         /* No extra check */
1505                         break;
1506                 case BTRFS_CHUNK_ALLOC_ZONED:
1507                         if (dev_extent_hole_check_zoned(device, hole_start,
1508                                                         hole_size, num_bytes)) {
1509                                 changed = true;
1510                                 /*
				 * The changed hole can contain a pending extent.
1512                                  * Loop again to check that.
1513                                  */
1514                                 continue;
1515                         }
1516                         break;
1517                 default:
1518                         BUG();
1519                 }
1520
1521                 break;
1522         }
1523
1524         return changed;
1525 }
1526
/*
 * find_free_dev_extent_start - find free space in the specified device
 * @device:	  the device which we search the free space in
 * @num_bytes:	  the size of the free space that we need
 * @search_start: the position from which to begin the search
 * @start:	  store the start of the free space
 * @len:	  store the size of the free space
 *
 * This uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents.
 *
 * @start is used to store the start of the free space that we find. But if we
 * don't find suitable free space, it will be used to store the start position
 * of the max free space.
 *
 * @len is used to store the size of the free space that we find. But if we
 * don't find suitable free space, it is used to store the size of the max
 * free space.
 *
 * NOTE: This function will search the *commit* root of the device tree, and
 * does an extra check to ensure dev extents are not double allocated.
 * This makes the function safe to use for allocating dev extents but it may
 * not report the correct usable device space, as a device extent freed in the
 * current transaction is not reported as available.
 */
1554 static int find_free_dev_extent_start(struct btrfs_device *device,
1555                                 u64 num_bytes, u64 search_start, u64 *start,
1556                                 u64 *len)
1557 {
1558         struct btrfs_fs_info *fs_info = device->fs_info;
1559         struct btrfs_root *root = fs_info->dev_root;
1560         struct btrfs_key key;
1561         struct btrfs_dev_extent *dev_extent;
1562         struct btrfs_path *path;
1563         u64 hole_size;
1564         u64 max_hole_start;
1565         u64 max_hole_size;
1566         u64 extent_end;
1567         u64 search_end = device->total_bytes;
1568         int ret;
1569         int slot;
1570         struct extent_buffer *l;
1571
1572         search_start = dev_extent_search_start(device, search_start);
1573
1574         WARN_ON(device->zone_info &&
1575                 !IS_ALIGNED(num_bytes, device->zone_info->zone_size));
1576
1577         path = btrfs_alloc_path();
1578         if (!path)
1579                 return -ENOMEM;
1580
1581         max_hole_start = search_start;
1582         max_hole_size = 0;
1583
1584 again:
1585         if (search_start >= search_end ||
	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
1587                 ret = -ENOSPC;
1588                 goto out;
1589         }
1590
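	/*
	 * Search the commit root without locking; see the NOTE in the
	 * function comment: dev extents freed in the running transaction
	 * are still treated as allocated here.
	 */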
1591         path->reada = READA_FORWARD;
1592         path->search_commit_root = 1;
1593         path->skip_locking = 1;
1594
1595         key.objectid = device->devid;
1596         key.offset = search_start;
1597         key.type = BTRFS_DEV_EXTENT_KEY;
1598
1599         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1600         if (ret < 0)
1601                 goto out;
1602         if (ret > 0) {
1603                 ret = btrfs_previous_item(root, path, key.objectid, key.type);
1604                 if (ret < 0)
1605                         goto out;
1606         }
1607
1608         while (1) {
1609                 l = path->nodes[0];
1610                 slot = path->slots[0];
1611                 if (slot >= btrfs_header_nritems(l)) {
1612                         ret = btrfs_next_leaf(root, path);
1613                         if (ret == 0)
1614                                 continue;
1615                         if (ret < 0)
1616                                 goto out;
1617
1618                         break;
1619                 }
1620                 btrfs_item_key_to_cpu(l, &key, slot);
1621
1622                 if (key.objectid < device->devid)
1623                         goto next;
1624
1625                 if (key.objectid > device->devid)
1626                         break;
1627
1628                 if (key.type != BTRFS_DEV_EXTENT_KEY)
1629                         goto next;
1630
1631                 if (key.offset > search_start) {
1632                         hole_size = key.offset - search_start;
1633                         dev_extent_hole_check(device, &search_start, &hole_size,
1634                                               num_bytes);
1635
1636                         if (hole_size > max_hole_size) {
1637                                 max_hole_start = search_start;
1638                                 max_hole_size = hole_size;
1639                         }
1640
1641                         /*
			 * If this free space is greater than what we need,
1643                          * it must be the max free space that we have found
1644                          * until now, so max_hole_start must point to the start
1645                          * of this free space and the length of this free space
1646                          * is stored in max_hole_size. Thus, we return
1647                          * max_hole_start and max_hole_size and go back to the
1648                          * caller.
1649                          */
1650                         if (hole_size >= num_bytes) {
1651                                 ret = 0;
1652                                 goto out;
1653                         }
1654                 }
1655
1656                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1657                 extent_end = key.offset + btrfs_dev_extent_length(l,
1658                                                                   dev_extent);
1659                 if (extent_end > search_start)
1660                         search_start = extent_end;
1661 next:
1662                 path->slots[0]++;
1663                 cond_resched();
1664         }
1665
1666         /*
1667          * At this point, search_start should be the end of
1668          * allocated dev extents, and when shrinking the device,
1669          * search_end may be smaller than search_start.
1670          */
1671         if (search_end > search_start) {
1672                 hole_size = search_end - search_start;
1673                 if (dev_extent_hole_check(device, &search_start, &hole_size,
1674                                           num_bytes)) {
1675                         btrfs_release_path(path);
1676                         goto again;
1677                 }
1678
1679                 if (hole_size > max_hole_size) {
1680                         max_hole_start = search_start;
1681                         max_hole_size = hole_size;
1682                 }
1683         }
1684
1685         /* See above. */
1686         if (max_hole_size < num_bytes)
1687                 ret = -ENOSPC;
1688         else
1689                 ret = 0;
1690
1691 out:
1692         btrfs_free_path(path);
1693         *start = max_hole_start;
1694         if (len)
1695                 *len = max_hole_size;
1696         return ret;
1697 }
1698
1699 int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
1700                          u64 *start, u64 *len)
1701 {
1702         /* FIXME use last free of some kind */
1703         return find_free_dev_extent_start(device, num_bytes, 0, start, len);
1704 }
1705
1706 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1707                           struct btrfs_device *device,
1708                           u64 start, u64 *dev_extent_len)
1709 {
1710         struct btrfs_fs_info *fs_info = device->fs_info;
1711         struct btrfs_root *root = fs_info->dev_root;
1712         int ret;
1713         struct btrfs_path *path;
1714         struct btrfs_key key;
1715         struct btrfs_key found_key;
1716         struct extent_buffer *leaf = NULL;
1717         struct btrfs_dev_extent *extent = NULL;
1718
1719         path = btrfs_alloc_path();
1720         if (!path)
1721                 return -ENOMEM;
1722
1723         key.objectid = device->devid;
1724         key.offset = start;
1725         key.type = BTRFS_DEV_EXTENT_KEY;
1726 again:
1727         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1728         if (ret > 0) {
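		/*
		 * The exact key was not found, so the dev extent that covers
		 * @start must be the previous item. Step back to it, adopt
		 * its key and restart the search to get a fresh path.
		 */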
1729                 ret = btrfs_previous_item(root, path, key.objectid,
1730                                           BTRFS_DEV_EXTENT_KEY);
1731                 if (ret)
1732                         goto out;
1733                 leaf = path->nodes[0];
1734                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1735                 extent = btrfs_item_ptr(leaf, path->slots[0],
1736                                         struct btrfs_dev_extent);
1737                 BUG_ON(found_key.offset > start || found_key.offset +
1738                        btrfs_dev_extent_length(leaf, extent) < start);
1739                 key = found_key;
1740                 btrfs_release_path(path);
1741                 goto again;
1742         } else if (ret == 0) {
1743                 leaf = path->nodes[0];
1744                 extent = btrfs_item_ptr(leaf, path->slots[0],
1745                                         struct btrfs_dev_extent);
1746         } else {
1747                 btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
1748                 goto out;
1749         }
1750
1751         *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
1752
1753         ret = btrfs_del_item(trans, root, path);
1754         if (ret) {
1755                 btrfs_handle_fs_error(fs_info, ret,
1756                                       "Failed to remove dev extent item");
1757         } else {
1758                 set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
1759         }
1760 out:
1761         btrfs_free_path(path);
1762         return ret;
1763 }
1764
1765 static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1766                                   struct btrfs_device *device,
1767                                   u64 chunk_offset, u64 start, u64 num_bytes)
1768 {
1769         int ret;
1770         struct btrfs_path *path;
1771         struct btrfs_fs_info *fs_info = device->fs_info;
1772         struct btrfs_root *root = fs_info->dev_root;
1773         struct btrfs_dev_extent *extent;
1774         struct extent_buffer *leaf;
1775         struct btrfs_key key;
1776
1777         WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
1778         WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
1779         path = btrfs_alloc_path();
1780         if (!path)
1781                 return -ENOMEM;
1782
1783         key.objectid = device->devid;
1784         key.offset = start;
1785         key.type = BTRFS_DEV_EXTENT_KEY;
1786         ret = btrfs_insert_empty_item(trans, root, path, &key,
1787                                       sizeof(*extent));
1788         if (ret)
1789                 goto out;
1790
1791         leaf = path->nodes[0];
1792         extent = btrfs_item_ptr(leaf, path->slots[0],
1793                                 struct btrfs_dev_extent);
1794         btrfs_set_dev_extent_chunk_tree(leaf, extent,
1795                                         BTRFS_CHUNK_TREE_OBJECTID);
1796         btrfs_set_dev_extent_chunk_objectid(leaf, extent,
1797                                             BTRFS_FIRST_CHUNK_TREE_OBJECTID);
1798         btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
1799
1800         btrfs_set_dev_extent_length(leaf, extent, num_bytes);
1801         btrfs_mark_buffer_dirty(leaf);
1802 out:
1803         btrfs_free_path(path);
1804         return ret;
1805 }
1806
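/*
 * Chunks are kept in the mapping tree ordered by logical start offset, so
 * the last entry ends at the highest logical address in use; the next
 * chunk can start right after it.
 */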
1807 static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1808 {
1809         struct extent_map_tree *em_tree;
1810         struct extent_map *em;
1811         struct rb_node *n;
1812         u64 ret = 0;
1813
1814         em_tree = &fs_info->mapping_tree;
1815         read_lock(&em_tree->lock);
1816         n = rb_last(&em_tree->map.rb_root);
1817         if (n) {
1818                 em = rb_entry(n, struct extent_map, rb_node);
1819                 ret = em->start + em->len;
1820         }
1821         read_unlock(&em_tree->lock);
1822
1823         return ret;
1824 }
1825
1826 static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
1827                                     u64 *devid_ret)
1828 {
1829         int ret;
1830         struct btrfs_key key;
1831         struct btrfs_key found_key;
1832         struct btrfs_path *path;
1833
1834         path = btrfs_alloc_path();
1835         if (!path)
1836                 return -ENOMEM;
1837
1838         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1839         key.type = BTRFS_DEV_ITEM_KEY;
1840         key.offset = (u64)-1;
1841
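	/*
	 * Search for the highest possible devid; the search lands past the
	 * last dev item, so the previous item (if any) holds the largest
	 * devid currently in use.
	 */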
1842         ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
1843         if (ret < 0)
1844                 goto error;
1845
1846         if (ret == 0) {
1847                 /* Corruption */
1848                 btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
1849                 ret = -EUCLEAN;
1850                 goto error;
1851         }
1852
1853         ret = btrfs_previous_item(fs_info->chunk_root, path,
1854                                   BTRFS_DEV_ITEMS_OBJECTID,
1855                                   BTRFS_DEV_ITEM_KEY);
1856         if (ret) {
1857                 *devid_ret = 1;
1858         } else {
1859                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1860                                       path->slots[0]);
1861                 *devid_ret = found_key.offset + 1;
1862         }
1863         ret = 0;
1864 error:
1865         btrfs_free_path(path);
1866         return ret;
1867 }
1868
1869 /*
 * The device information is stored in the chunk root.
 * The btrfs_device struct should be fully filled in.
1872  */
1873 static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
1874                             struct btrfs_device *device)
1875 {
1876         int ret;
1877         struct btrfs_path *path;
1878         struct btrfs_dev_item *dev_item;
1879         struct extent_buffer *leaf;
1880         struct btrfs_key key;
1881         unsigned long ptr;
1882
1883         path = btrfs_alloc_path();
1884         if (!path)
1885                 return -ENOMEM;
1886
1887         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1888         key.type = BTRFS_DEV_ITEM_KEY;
1889         key.offset = device->devid;
1890
1891         ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
1892                                       &key, sizeof(*dev_item));
1893         if (ret)
1894                 goto out;
1895
1896         leaf = path->nodes[0];
1897         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1898
1899         btrfs_set_device_id(leaf, dev_item, device->devid);
1900         btrfs_set_device_generation(leaf, dev_item, 0);
1901         btrfs_set_device_type(leaf, dev_item, device->type);
1902         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1903         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1904         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1905         btrfs_set_device_total_bytes(leaf, dev_item,
1906                                      btrfs_device_get_disk_total_bytes(device));
1907         btrfs_set_device_bytes_used(leaf, dev_item,
1908                                     btrfs_device_get_bytes_used(device));
1909         btrfs_set_device_group(leaf, dev_item, 0);
1910         btrfs_set_device_seek_speed(leaf, dev_item, 0);
1911         btrfs_set_device_bandwidth(leaf, dev_item, 0);
1912         btrfs_set_device_start_offset(leaf, dev_item, 0);
1913
1914         ptr = btrfs_device_uuid(dev_item);
1915         write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1916         ptr = btrfs_device_fsid(dev_item);
1917         write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
1918                             ptr, BTRFS_FSID_SIZE);
1919         btrfs_mark_buffer_dirty(leaf);
1920
1921         ret = 0;
1922 out:
1923         btrfs_free_path(path);
1924         return ret;
1925 }
1926
1927 /*
1928  * Function to update ctime/mtime for a given device path.
 * Mainly used for ctime/mtime based probes like libblkid.
1930  */
1931 static void update_dev_time(const char *path_name)
1932 {
1933         struct file *filp;
1934
1935         filp = filp_open(path_name, O_RDWR, 0);
1936         if (IS_ERR(filp))
1937                 return;
1938         file_update_time(filp);
1939         filp_close(filp, NULL);
1940 }
1941
1942 static int btrfs_rm_dev_item(struct btrfs_device *device)
1943 {
1944         struct btrfs_root *root = device->fs_info->chunk_root;
1945         int ret;
1946         struct btrfs_path *path;
1947         struct btrfs_key key;
1948         struct btrfs_trans_handle *trans;
1949
1950         path = btrfs_alloc_path();
1951         if (!path)
1952                 return -ENOMEM;
1953
1954         trans = btrfs_start_transaction(root, 0);
1955         if (IS_ERR(trans)) {
1956                 btrfs_free_path(path);
1957                 return PTR_ERR(trans);
1958         }
1959         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1960         key.type = BTRFS_DEV_ITEM_KEY;
1961         key.offset = device->devid;
1962
1963         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1964         if (ret) {
1965                 if (ret > 0)
1966                         ret = -ENOENT;
1967                 btrfs_abort_transaction(trans, ret);
1968                 btrfs_end_transaction(trans);
1969                 goto out;
1970         }
1971
1972         ret = btrfs_del_item(trans, root, path);
1973         if (ret) {
1974                 btrfs_abort_transaction(trans, ret);
1975                 btrfs_end_transaction(trans);
1976         }
1977
1978 out:
1979         btrfs_free_path(path);
1980         if (!ret)
1981                 ret = btrfs_commit_transaction(trans);
1982         return ret;
1983 }
1984
1985 /*
1986  * Verify that @num_devices satisfies the RAID profile constraints in the whole
 * filesystem. It's up to the caller to adjust that number regarding e.g.
 * device replace.
1989  */
1990 static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
1991                 u64 num_devices)
1992 {
1993         u64 all_avail;
1994         unsigned seq;
1995         int i;
1996
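	/*
	 * Sample the allocation profiles under the profiles seqlock so we
	 * get a consistent snapshot of all three bit masks.
	 */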
1997         do {
1998                 seq = read_seqbegin(&fs_info->profiles_lock);
1999
2000                 all_avail = fs_info->avail_data_alloc_bits |
2001                             fs_info->avail_system_alloc_bits |
2002                             fs_info->avail_metadata_alloc_bits;
2003         } while (read_seqretry(&fs_info->profiles_lock, seq));
2004
2005         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
2006                 if (!(all_avail & btrfs_raid_array[i].bg_flag))
2007                         continue;
2008
2009                 if (num_devices < btrfs_raid_array[i].devs_min) {
2010                         int ret = btrfs_raid_array[i].mindev_error;
2011
2012                         if (ret)
2013                                 return ret;
2014                 }
2015         }
2016
2017         return 0;
2018 }
2019
static struct btrfs_device *btrfs_find_next_active_device(
2021                 struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
2022 {
2023         struct btrfs_device *next_device;
2024
2025         list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
2026                 if (next_device != device &&
2027                     !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
2028                     && next_device->bdev)
2029                         return next_device;
2030         }
2031
2032         return NULL;
2033 }
2034
2035 /*
 * Helper function to check if the given device is part of s_bdev / latest_bdev
 * and replace it with the provided or the next active device. In the context
 * where this function is called, there should always be another active device
 * (or the provided @next_device).
2040  */
2041 void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
2042                                             struct btrfs_device *next_device)
2043 {
2044         struct btrfs_fs_info *fs_info = device->fs_info;
2045
2046         if (!next_device)
2047                 next_device = btrfs_find_next_active_device(fs_info->fs_devices,
2048                                                             device);
2049         ASSERT(next_device);
2050
	if (fs_info->sb->s_bdev && fs_info->sb->s_bdev == device->bdev)
2053                 fs_info->sb->s_bdev = next_device->bdev;
2054
2055         if (fs_info->fs_devices->latest_bdev == device->bdev)
2056                 fs_info->fs_devices->latest_bdev = next_device->bdev;
2057 }
2058
2059 /*
2060  * Return btrfs_fs_devices::num_devices excluding the device that's being
2061  * currently replaced.
2062  */
2063 static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
2064 {
2065         u64 num_devices = fs_info->fs_devices->num_devices;
2066
2067         down_read(&fs_info->dev_replace.rwsem);
2068         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
2069                 ASSERT(num_devices > 1);
2070                 num_devices--;
2071         }
2072         up_read(&fs_info->dev_replace.rwsem);
2073
2074         return num_devices;
2075 }
2076
2077 void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
2078                                struct block_device *bdev,
2079                                const char *device_path)
2080 {
2081         struct btrfs_super_block *disk_super;
2082         int copy_num;
2083
2084         if (!bdev)
2085                 return;
2086
2087         for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
2088                 struct page *page;
2089                 int ret;
2090
2091                 disk_super = btrfs_read_dev_one_super(bdev, copy_num);
2092                 if (IS_ERR(disk_super))
2093                         continue;
2094
2095                 if (bdev_is_zoned(bdev)) {
2096                         btrfs_reset_sb_log_zones(bdev, copy_num);
2097                         continue;
2098                 }
2099
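		/*
		 * Wiping the magic alone is enough to make this copy
		 * unrecognizable as a btrfs superblock on the next scan.
		 */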
2100                 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
2101
2102                 page = virt_to_page(disk_super);
2103                 set_page_dirty(page);
2104                 lock_page(page);
		/* write_one_page() unlocks the page */
2106                 ret = write_one_page(page);
2107                 if (ret)
2108                         btrfs_warn(fs_info,
2109                                 "error clearing superblock number %d (%d)",
2110                                 copy_num, ret);
2111                 btrfs_release_disk_super(disk_super);
2113         }
2114
2115         /* Notify udev that device has changed */
2116         btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
2117
2118         /* Update ctime/mtime for device path for libblkid */
2119         update_dev_time(device_path);
2120 }
2121
2122 int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
2123                     u64 devid)
2124 {
2125         struct btrfs_device *device;
2126         struct btrfs_fs_devices *cur_devices;
2127         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2128         u64 num_devices;
2129         int ret = 0;
2130
2131         mutex_lock(&uuid_mutex);
2132
2133         num_devices = btrfs_num_devices(fs_info);
2134
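	/*
	 * Check that the remaining devices still satisfy the minimum device
	 * count of every RAID profile in use (hence num_devices - 1).
	 */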
2135         ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
2136         if (ret)
2137                 goto out;
2138
2139         device = btrfs_find_device_by_devspec(fs_info, devid, device_path);
2140
2141         if (IS_ERR(device)) {
2142                 if (PTR_ERR(device) == -ENOENT &&
2143                     strcmp(device_path, "missing") == 0)
2144                         ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
2145                 else
2146                         ret = PTR_ERR(device);
2147                 goto out;
2148         }
2149
2150         if (btrfs_pinned_by_swapfile(fs_info, device)) {
2151                 btrfs_warn_in_rcu(fs_info,
2152                   "cannot remove device %s (devid %llu) due to active swapfile",
2153                                   rcu_str_deref(device->name), device->devid);
2154                 ret = -ETXTBSY;
2155                 goto out;
2156         }
2157
2158         if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2159                 ret = BTRFS_ERROR_DEV_TGT_REPLACE;
2160                 goto out;
2161         }
2162
2163         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
2164             fs_info->fs_devices->rw_devices == 1) {
2165                 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
2166                 goto out;
2167         }
2168
2169         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2170                 mutex_lock(&fs_info->chunk_mutex);
2171                 list_del_init(&device->dev_alloc_list);
2172                 device->fs_devices->rw_devices--;
2173                 mutex_unlock(&fs_info->chunk_mutex);
2174         }
2175
2176         mutex_unlock(&uuid_mutex);
2177         ret = btrfs_shrink_device(device, 0);
2178         if (!ret)
2179                 btrfs_reada_remove_dev(device);
2180         mutex_lock(&uuid_mutex);
2181         if (ret)
2182                 goto error_undo;
2183
2184         /*
2185          * TODO: the superblock still includes this device in its num_devices
2186          * counter although write_all_supers() is not locked out. This
2187          * could give a filesystem state which requires a degraded mount.
2188          */
2189         ret = btrfs_rm_dev_item(device);
2190         if (ret)
2191                 goto error_undo;
2192
2193         clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2194         btrfs_scrub_cancel_dev(device);
2195
2196         /*
	 * The device list mutex makes sure that we don't change
	 * the device list while someone else is writing out all
	 * the device supers. Whoever is writing all supers should
	 * lock the device list mutex before getting the number of
2201          * devices in the super block (super_copy). Conversely,
2202          * whoever updates the number of devices in the super block
2203          * (super_copy) should hold the device list mutex.
2204          */
2205
2206         /*
	 * In normal cases cur_devices == fs_devices. But when deleting a seed
	 * device, cur_devices should point to the seed's own fs_devices,
	 * listed under fs_devices->seed_list.
2210          */
2211         cur_devices = device->fs_devices;
2212         mutex_lock(&fs_devices->device_list_mutex);
2213         list_del_rcu(&device->dev_list);
2214
2215         cur_devices->num_devices--;
2216         cur_devices->total_devices--;
2217         /* Update total_devices of the parent fs_devices if it's seed */
2218         if (cur_devices != fs_devices)
2219                 fs_devices->total_devices--;
2220
2221         if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
2222                 cur_devices->missing_devices--;
2223
2224         btrfs_assign_next_active_device(device, NULL);
2225
2226         if (device->bdev) {
2227                 cur_devices->open_devices--;
2228                 /* remove sysfs entry */
2229                 btrfs_sysfs_remove_device(device);
2230         }
2231
2232         num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
2233         btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
2234         mutex_unlock(&fs_devices->device_list_mutex);
2235
2236         /*
2237          * at this point, the device is zero sized and detached from
2238          * the devices list.  All that's left is to zero out the old
2239          * supers and free the device.
2240          */
2241         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2242                 btrfs_scratch_superblocks(fs_info, device->bdev,
2243                                           device->name->str);
2244
2245         btrfs_close_bdev(device);
2246         synchronize_rcu();
2247         btrfs_free_device(device);
2248
2249         if (cur_devices->open_devices == 0) {
2250                 list_del_init(&cur_devices->seed_list);
2251                 close_fs_devices(cur_devices);
2252                 free_fs_devices(cur_devices);
2253         }
2254
2255 out:
2256         mutex_unlock(&uuid_mutex);
2257         return ret;
2258
2259 error_undo:
2260         btrfs_reada_undo_remove_dev(device);
2261         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2262                 mutex_lock(&fs_info->chunk_mutex);
2263                 list_add(&device->dev_alloc_list,
2264                          &fs_devices->alloc_list);
2265                 device->fs_devices->rw_devices++;
2266                 mutex_unlock(&fs_info->chunk_mutex);
2267         }
2268         goto out;
2269 }
2270
2271 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
2272 {
2273         struct btrfs_fs_devices *fs_devices;
2274
2275         lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);
2276
2277         /*
	 * In case of a fs with no seed, srcdev->fs_devices will point to the
	 * fs_devices of fs_info. However, when the device being replaced is
	 * a seed device, it will point to the seed's local fs_devices. In
	 * short, srcdev will have its correct fs_devices in both cases.
2282          */
2283         fs_devices = srcdev->fs_devices;
2284
2285         list_del_rcu(&srcdev->dev_list);
2286         list_del(&srcdev->dev_alloc_list);
2287         fs_devices->num_devices--;
2288         if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
2289                 fs_devices->missing_devices--;
2290
2291         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
2292                 fs_devices->rw_devices--;
2293
2294         if (srcdev->bdev)
2295                 fs_devices->open_devices--;
2296 }
2297
2298 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
2299 {
2300         struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
2301
2302         mutex_lock(&uuid_mutex);
2303
2304         btrfs_close_bdev(srcdev);
2305         synchronize_rcu();
2306         btrfs_free_device(srcdev);
2307
	/* If there are no devices left we'd rather delete the fs_devices */
2309         if (!fs_devices->num_devices) {
2310                 /*
2311                  * On a mounted FS, num_devices can't be zero unless it's a
		 * seed. In case of a seed device being replaced, the replace
		 * target is added to the sprout FS, so there will be no more
		 * devices left under the seed FS.
2315                  */
2316                 ASSERT(fs_devices->seeding);
2317
2318                 list_del_init(&fs_devices->seed_list);
2319                 close_fs_devices(fs_devices);
2320                 free_fs_devices(fs_devices);
2321         }
2322         mutex_unlock(&uuid_mutex);
2323 }
2324
2325 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
2326 {
2327         struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;
2328
2329         mutex_lock(&fs_devices->device_list_mutex);
2330
2331         btrfs_sysfs_remove_device(tgtdev);
2332
2333         if (tgtdev->bdev)
2334                 fs_devices->open_devices--;
2335
2336         fs_devices->num_devices--;
2337
2338         btrfs_assign_next_active_device(tgtdev, NULL);
2339
2340         list_del_rcu(&tgtdev->dev_list);
2341
2342         mutex_unlock(&fs_devices->device_list_mutex);
2343
2344         /*
	 * The update_dev_time() within btrfs_scratch_superblocks() may lead
	 * to a call to btrfs_show_devname() which will try to hold the
	 * device_list_mutex. As this device is already out of the device
	 * list, we don't have to hold the device_list_mutex here.
2350          */
2351         btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
2352                                   tgtdev->name->str);
2353
2354         btrfs_close_bdev(tgtdev);
2355         synchronize_rcu();
2356         btrfs_free_device(tgtdev);
2357 }
2358
2359 static struct btrfs_device *btrfs_find_device_by_path(
2360                 struct btrfs_fs_info *fs_info, const char *device_path)
2361 {
2362         int ret = 0;
2363         struct btrfs_super_block *disk_super;
2364         u64 devid;
2365         u8 *dev_uuid;
2366         struct block_device *bdev;
2367         struct btrfs_device *device;
2368
2369         ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
2370                                     fs_info->bdev_holder, 0, &bdev, &disk_super);
2371         if (ret)
2372                 return ERR_PTR(ret);
2373
2374         devid = btrfs_stack_device_id(&disk_super->dev_item);
2375         dev_uuid = disk_super->dev_item.uuid;
2376         if (btrfs_fs_incompat(fs_info, METADATA_UUID))
2377                 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2378                                            disk_super->metadata_uuid);
2379         else
2380                 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2381                                            disk_super->fsid);
2382
2383         btrfs_release_disk_super(disk_super);
2384         if (!device)
2385                 device = ERR_PTR(-ENOENT);
2386         blkdev_put(bdev, FMODE_READ);
2387         return device;
2388 }
2389
2390 /*
2391  * Lookup a device given by device id, or the path if the id is 0.
2392  */
2393 struct btrfs_device *btrfs_find_device_by_devspec(
2394                 struct btrfs_fs_info *fs_info, u64 devid,
2395                 const char *device_path)
2396 {
2397         struct btrfs_device *device;
2398
2399         if (devid) {
2400                 device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
2401                                            NULL);
2402                 if (!device)
2403                         return ERR_PTR(-ENOENT);
2404                 return device;
2405         }
2406
2407         if (!device_path || !device_path[0])
2408                 return ERR_PTR(-EINVAL);
2409
2410         if (strcmp(device_path, "missing") == 0) {
2411                 /* Find first missing device */
2412                 list_for_each_entry(device, &fs_info->fs_devices->devices,
2413                                     dev_list) {
2414                         if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
2415                                      &device->dev_state) && !device->bdev)
2416                                 return device;
2417                 }
2418                 return ERR_PTR(-ENOENT);
2419         }
2420
2421         return btrfs_find_device_by_path(fs_info, device_path);
2422 }
2423
2424 /*
 * Does all the dirty work required for changing the filesystem's UUID.
2426  */
2427 static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
2428 {
2429         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2430         struct btrfs_fs_devices *old_devices;
2431         struct btrfs_fs_devices *seed_devices;
2432         struct btrfs_super_block *disk_super = fs_info->super_copy;
2433         struct btrfs_device *device;
2434         u64 super_flags;
2435
2436         lockdep_assert_held(&uuid_mutex);
2437         if (!fs_devices->seeding)
2438                 return -EINVAL;
2439
2440         /*
2441          * Private copy of the seed devices, anchored at
2442          * fs_info->fs_devices->seed_list
2443          */
2444         seed_devices = alloc_fs_devices(NULL, NULL);
2445         if (IS_ERR(seed_devices))
2446                 return PTR_ERR(seed_devices);
2447
2448         /*
2449          * It's necessary to retain a copy of the original seed fs_devices in
2450          * fs_uuids so that filesystems which have been seeded can successfully
	 * reference the seed device from open_seed_devices. This also supports
	 * multiple seed filesystems.
2453          */
2454         old_devices = clone_fs_devices(fs_devices);
2455         if (IS_ERR(old_devices)) {
2456                 kfree(seed_devices);
2457                 return PTR_ERR(old_devices);
2458         }
2459
2460         list_add(&old_devices->fs_list, &fs_uuids);
2461
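	/*
	 * The new seed_devices takes over the current device list, while
	 * fs_devices is emptied and becomes the sprout, receiving a freshly
	 * generated fsid below.
	 */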
2462         memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
2463         seed_devices->opened = 1;
2464         INIT_LIST_HEAD(&seed_devices->devices);
2465         INIT_LIST_HEAD(&seed_devices->alloc_list);
2466         mutex_init(&seed_devices->device_list_mutex);
2467
2468         mutex_lock(&fs_devices->device_list_mutex);
2469         list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
2470                               synchronize_rcu);
2471         list_for_each_entry(device, &seed_devices->devices, dev_list)
2472                 device->fs_devices = seed_devices;
2473
2474         fs_devices->seeding = false;
2475         fs_devices->num_devices = 0;
2476         fs_devices->open_devices = 0;
2477         fs_devices->missing_devices = 0;
2478         fs_devices->rotating = false;
2479         list_add(&seed_devices->seed_list, &fs_devices->seed_list);
2480
2481         generate_random_uuid(fs_devices->fsid);
2482         memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
2483         memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2484         mutex_unlock(&fs_devices->device_list_mutex);
2485
2486         super_flags = btrfs_super_flags(disk_super) &
2487                       ~BTRFS_SUPER_FLAG_SEEDING;
2488         btrfs_set_super_flags(disk_super, super_flags);
2489
2490         return 0;
2491 }
2492
2493 /*
2494  * Store the expected generation for seed devices in device items.
2495  */
2496 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
2497 {
2498         struct btrfs_fs_info *fs_info = trans->fs_info;
2499         struct btrfs_root *root = fs_info->chunk_root;
2500         struct btrfs_path *path;
2501         struct extent_buffer *leaf;
2502         struct btrfs_dev_item *dev_item;
2503         struct btrfs_device *device;
2504         struct btrfs_key key;
2505         u8 fs_uuid[BTRFS_FSID_SIZE];
2506         u8 dev_uuid[BTRFS_UUID_SIZE];
2507         u64 devid;
2508         int ret;
2509
2510         path = btrfs_alloc_path();
2511         if (!path)
2512                 return -ENOMEM;
2513
2514         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2515         key.offset = 0;
2516         key.type = BTRFS_DEV_ITEM_KEY;
2517
2518         while (1) {
2519                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2520                 if (ret < 0)
2521                         goto error;
2522
2523                 leaf = path->nodes[0];
2524 next_slot:
2525                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2526                         ret = btrfs_next_leaf(root, path);
2527                         if (ret > 0)
2528                                 break;
2529                         if (ret < 0)
2530                                 goto error;
2531                         leaf = path->nodes[0];
2532                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2533                         btrfs_release_path(path);
2534                         continue;
2535                 }
2536
2537                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2538                 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
2539                     key.type != BTRFS_DEV_ITEM_KEY)
2540                         break;
2541
2542                 dev_item = btrfs_item_ptr(leaf, path->slots[0],
2543                                           struct btrfs_dev_item);
2544                 devid = btrfs_device_id(leaf, dev_item);
2545                 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
2546                                    BTRFS_UUID_SIZE);
2547                 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
2548                                    BTRFS_FSID_SIZE);
2549                 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2550                                            fs_uuid);
2551                 BUG_ON(!device); /* Logic error */
2552
2553                 if (device->fs_devices->seeding) {
2554                         btrfs_set_device_generation(leaf, dev_item,
2555                                                     device->generation);
2556                         btrfs_mark_buffer_dirty(leaf);
2557                 }
2558
2559                 path->slots[0]++;
2560                 goto next_slot;
2561         }
2562         ret = 0;
2563 error:
2564         btrfs_free_path(path);
2565         return ret;
2566 }
2567
2568 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
2569 {
2570         struct btrfs_root *root = fs_info->dev_root;
2571         struct request_queue *q;
2572         struct btrfs_trans_handle *trans;
2573         struct btrfs_device *device;
2574         struct block_device *bdev;
2575         struct super_block *sb = fs_info->sb;
2576         struct rcu_string *name;
2577         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2578         u64 orig_super_total_bytes;
2579         u64 orig_super_num_devices;
2580         int seeding_dev = 0;
2581         int ret = 0;
2582         bool locked = false;
2583
2584         if (sb_rdonly(sb) && !fs_devices->seeding)
2585                 return -EROFS;
2586
2587         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2588                                   fs_info->bdev_holder);
2589         if (IS_ERR(bdev))
2590                 return PTR_ERR(bdev);
2591
2592         if (!btrfs_check_device_zone_type(fs_info, bdev)) {
2593                 ret = -EINVAL;
2594                 goto error;
2595         }
2596
2597         if (fs_devices->seeding) {
2598                 seeding_dev = 1;
2599                 down_write(&sb->s_umount);
2600                 mutex_lock(&uuid_mutex);
2601                 locked = true;
2602         }
2603
2604         sync_blockdev(bdev);
2605
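	/* Reject the device if it is already part of this filesystem. */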
2606         rcu_read_lock();
2607         list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
2608                 if (device->bdev == bdev) {
2609                         ret = -EEXIST;
2610                         rcu_read_unlock();
2611                         goto error;
2612                 }
2613         }
2614         rcu_read_unlock();
2615
2616         device = btrfs_alloc_device(fs_info, NULL, NULL);
2617         if (IS_ERR(device)) {
2618                 /* we can safely leave the fs_devices entry around */
2619                 ret = PTR_ERR(device);
2620                 goto error;
2621         }
2622
2623         name = rcu_string_strdup(device_path, GFP_KERNEL);
2624         if (!name) {
2625                 ret = -ENOMEM;
2626                 goto error_free_device;
2627         }
2628         rcu_assign_pointer(device->name, name);
2629
2630         device->fs_info = fs_info;
2631         device->bdev = bdev;
2632
2633         ret = btrfs_get_dev_zone_info(device);
2634         if (ret)
2635                 goto error_free_device;
2636
2637         trans = btrfs_start_transaction(root, 0);
2638         if (IS_ERR(trans)) {
2639                 ret = PTR_ERR(trans);
2640                 goto error_free_zone;
2641         }
2642
2643         q = bdev_get_queue(bdev);
2644         set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
2645         device->generation = trans->transid;
2646         device->io_width = fs_info->sectorsize;
2647         device->io_align = fs_info->sectorsize;
2648         device->sector_size = fs_info->sectorsize;
2649         device->total_bytes = round_down(i_size_read(bdev->bd_inode),
2650                                          fs_info->sectorsize);
2651         device->disk_total_bytes = device->total_bytes;
2652         device->commit_total_bytes = device->total_bytes;
2653         set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2654         clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
2655         device->mode = FMODE_EXCL;
2656         device->dev_stats_valid = 1;
2657         set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
2658
2659         if (seeding_dev) {
2660                 btrfs_clear_sb_rdonly(sb);
2661                 ret = btrfs_prepare_sprout(fs_info);
2662                 if (ret) {
2663                         btrfs_abort_transaction(trans, ret);
2664                         goto error_trans;
2665                 }
2666         }
2667
2668         device->fs_devices = fs_devices;
2669
2670         mutex_lock(&fs_devices->device_list_mutex);
2671         mutex_lock(&fs_info->chunk_mutex);
2672         list_add_rcu(&device->dev_list, &fs_devices->devices);
2673         list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
2674         fs_devices->num_devices++;
2675         fs_devices->open_devices++;
2676         fs_devices->rw_devices++;
2677         fs_devices->total_devices++;
2678         fs_devices->total_rw_bytes += device->total_bytes;
2679
2680         atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
2681
2682         if (!blk_queue_nonrot(q))
2683                 fs_devices->rotating = true;
2684
2685         orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
2686         btrfs_set_super_total_bytes(fs_info->super_copy,
2687                 round_down(orig_super_total_bytes + device->total_bytes,
2688                            fs_info->sectorsize));
2689
2690         orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
2691         btrfs_set_super_num_devices(fs_info->super_copy,
2692                                     orig_super_num_devices + 1);
2693
2694         /*
2695          * we've got more storage, clear any full flags on the space
2696          * infos
2697          */
2698         btrfs_clear_space_info_full(fs_info);
2699
2700         mutex_unlock(&fs_info->chunk_mutex);
2701
2702         /* Add sysfs device entry */
2703         btrfs_sysfs_add_device(device);
2704
2705         mutex_unlock(&fs_devices->device_list_mutex);
2706
2707         if (seeding_dev) {
2708                 mutex_lock(&fs_info->chunk_mutex);
2709                 ret = init_first_rw_device(trans);
2710                 mutex_unlock(&fs_info->chunk_mutex);
2711                 if (ret) {
2712                         btrfs_abort_transaction(trans, ret);
2713                         goto error_sysfs;
2714                 }
2715         }
2716
2717         ret = btrfs_add_dev_item(trans, device);
2718         if (ret) {
2719                 btrfs_abort_transaction(trans, ret);
2720                 goto error_sysfs;
2721         }
2722
2723         if (seeding_dev) {
2724                 ret = btrfs_finish_sprout(trans);
2725                 if (ret) {
2726                         btrfs_abort_transaction(trans, ret);
2727                         goto error_sysfs;
2728                 }
2729
2730                 /*
2731                  * fs_devices now represents the newly sprouted filesystem and
		 * its fsid has been changed by btrfs_prepare_sprout().
2733                  */
2734                 btrfs_sysfs_update_sprout_fsid(fs_devices);
2735         }
2736
2737         ret = btrfs_commit_transaction(trans);
2738
2739         if (seeding_dev) {
2740                 mutex_unlock(&uuid_mutex);
2741                 up_write(&sb->s_umount);
2742                 locked = false;
2743
2744                 if (ret) /* transaction commit */
2745                         return ret;
2746
2747                 ret = btrfs_relocate_sys_chunks(fs_info);
2748                 if (ret < 0)
2749                         btrfs_handle_fs_error(fs_info, ret,
2750                                     "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
2751                 trans = btrfs_attach_transaction(root);
2752                 if (IS_ERR(trans)) {
2753                         if (PTR_ERR(trans) == -ENOENT)
2754                                 return 0;
2755                         ret = PTR_ERR(trans);
2756                         trans = NULL;
2757                         goto error_sysfs;
2758                 }
2759                 ret = btrfs_commit_transaction(trans);
2760         }
2761
2762         /*
	 * Now that we have written a new super block to this device, check all
	 * other fs_devices lists to see if device_path alienates any other
	 * scanned device.
2766          * We can ignore the return value as it typically returns -EINVAL and
2767          * only succeeds if the device was an alien.
2768          */
2769         btrfs_forget_devices(device_path);
2770
2771         /* Update ctime/mtime for blkid or udev */
2772         update_dev_time(device_path);
2773
2774         return ret;
2775
2776 error_sysfs:
2777         btrfs_sysfs_remove_device(device);
2778         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2779         mutex_lock(&fs_info->chunk_mutex);
2780         list_del_rcu(&device->dev_list);
2781         list_del(&device->dev_alloc_list);
2782         fs_info->fs_devices->num_devices--;
2783         fs_info->fs_devices->open_devices--;
2784         fs_info->fs_devices->rw_devices--;
2785         fs_info->fs_devices->total_devices--;
2786         fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
2787         atomic64_sub(device->total_bytes, &fs_info->free_chunk_space);
2788         btrfs_set_super_total_bytes(fs_info->super_copy,
2789                                     orig_super_total_bytes);
2790         btrfs_set_super_num_devices(fs_info->super_copy,
2791                                     orig_super_num_devices);
2792         mutex_unlock(&fs_info->chunk_mutex);
2793         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2794 error_trans:
2795         if (seeding_dev)
2796                 btrfs_set_sb_rdonly(sb);
2797         if (trans)
2798                 btrfs_end_transaction(trans);
2799 error_free_zone:
2800         btrfs_destroy_dev_zone_info(device);
2801 error_free_device:
2802         btrfs_free_device(device);
2803 error:
2804         blkdev_put(bdev, FMODE_EXCL);
2805         if (locked) {
2806                 mutex_unlock(&uuid_mutex);
2807                 up_write(&sb->s_umount);
2808         }
2809         return ret;
2810 }
2811
2812 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2813                                         struct btrfs_device *device)
2814 {
2815         int ret;
2816         struct btrfs_path *path;
2817         struct btrfs_root *root = device->fs_info->chunk_root;
2818         struct btrfs_dev_item *dev_item;
2819         struct extent_buffer *leaf;
2820         struct btrfs_key key;
2821
2822         path = btrfs_alloc_path();
2823         if (!path)
2824                 return -ENOMEM;
2825
2826         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2827         key.type = BTRFS_DEV_ITEM_KEY;
2828         key.offset = device->devid;
2829
2830         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2831         if (ret < 0)
2832                 goto out;
2833
2834         if (ret > 0) {
2835                 ret = -ENOENT;
2836                 goto out;
2837         }
2838
2839         leaf = path->nodes[0];
2840         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2841
2842         btrfs_set_device_id(leaf, dev_item, device->devid);
2843         btrfs_set_device_type(leaf, dev_item, device->type);
2844         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2845         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2846         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2847         btrfs_set_device_total_bytes(leaf, dev_item,
2848                                      btrfs_device_get_disk_total_bytes(device));
2849         btrfs_set_device_bytes_used(leaf, dev_item,
2850                                     btrfs_device_get_bytes_used(device));
2851         btrfs_mark_buffer_dirty(leaf);
2852
2853 out:
2854         btrfs_free_path(path);
2855         return ret;
2856 }
2857
2858 int btrfs_grow_device(struct btrfs_trans_handle *trans,
2859                       struct btrfs_device *device, u64 new_size)
2860 {
2861         struct btrfs_fs_info *fs_info = device->fs_info;
2862         struct btrfs_super_block *super_copy = fs_info->super_copy;
2863         u64 old_total;
2864         u64 diff;
2865
2866         if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2867                 return -EACCES;
2868
2869         new_size = round_down(new_size, fs_info->sectorsize);
2870
2871         mutex_lock(&fs_info->chunk_mutex);
2872         old_total = btrfs_super_total_bytes(super_copy);
2873         diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
2874
2875         if (new_size <= device->total_bytes ||
2876             test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2877                 mutex_unlock(&fs_info->chunk_mutex);
2878                 return -EINVAL;
2879         }
2880
2881         btrfs_set_super_total_bytes(super_copy,
2882                         round_down(old_total + diff, fs_info->sectorsize));
2883         device->fs_devices->total_rw_bytes += diff;
2884
2885         btrfs_device_set_total_bytes(device, new_size);
2886         btrfs_device_set_disk_total_bytes(device, new_size);
2887         btrfs_clear_space_info_full(device->fs_info);
2888         if (list_empty(&device->post_commit_list))
2889                 list_add_tail(&device->post_commit_list,
2890                               &trans->transaction->dev_update_list);
2891         mutex_unlock(&fs_info->chunk_mutex);
2892
2893         return btrfs_update_device(trans, device);
2894 }
2895
2896 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
2897 {
2898         struct btrfs_fs_info *fs_info = trans->fs_info;
2899         struct btrfs_root *root = fs_info->chunk_root;
2900         int ret;
2901         struct btrfs_path *path;
2902         struct btrfs_key key;
2903
2904         path = btrfs_alloc_path();
2905         if (!path)
2906                 return -ENOMEM;
2907
2908         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2909         key.offset = chunk_offset;
2910         key.type = BTRFS_CHUNK_ITEM_KEY;
2911
2912         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2913         if (ret < 0)
2914                 goto out;
2915         else if (ret > 0) { /* Logic error or corruption */
2916                 btrfs_handle_fs_error(fs_info, -ENOENT,
2917                                       "Failed lookup while freeing chunk.");
2918                 ret = -ENOENT;
2919                 goto out;
2920         }
2921
2922         ret = btrfs_del_item(trans, root, path);
2923         if (ret < 0)
2924                 btrfs_handle_fs_error(fs_info, ret,
2925                                       "Failed to delete chunk item.");
2926 out:
2927         btrfs_free_path(path);
2928         return ret;
2929 }
2930
2931 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2932 {
2933         struct btrfs_super_block *super_copy = fs_info->super_copy;
2934         struct btrfs_disk_key *disk_key;
2935         struct btrfs_chunk *chunk;
2936         u8 *ptr;
2937         int ret = 0;
2938         u32 num_stripes;
2939         u32 array_size;
2940         u32 len = 0;
2941         u32 cur;
2942         struct btrfs_key key;
2943
2944         mutex_lock(&fs_info->chunk_mutex);
2945         array_size = btrfs_super_sys_array_size(super_copy);
2946
2947         ptr = super_copy->sys_chunk_array;
2948         cur = 0;
2949
2950         while (cur < array_size) {
2951                 disk_key = (struct btrfs_disk_key *)ptr;
2952                 btrfs_disk_key_to_cpu(&key, disk_key);
2953
2954                 len = sizeof(*disk_key);
2955
2956                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
2957                         chunk = (struct btrfs_chunk *)(ptr + len);
2958                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
2959                         len += btrfs_chunk_item_size(num_stripes);
2960                 } else {
2961                         ret = -EIO;
2962                         break;
2963                 }
2964                 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
2965                     key.offset == chunk_offset) {
2966                         memmove(ptr, ptr + len, array_size - (cur + len));
2967                         array_size -= len;
2968                         btrfs_set_super_sys_array_size(super_copy, array_size);
2969                 } else {
2970                         ptr += len;
2971                         cur += len;
2972                 }
2973         }
2974         mutex_unlock(&fs_info->chunk_mutex);
2975         return ret;
2976 }
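/*
 * Illustrative sketch (editor's note, not kernel code): the superblock's
 * sys_chunk_array is a packed sequence of variable-length (disk key, chunk
 * item) pairs, which is why btrfs_del_sys_chunk() above walks it byte-wise:
 *
 *	+----------+---------------------------+----------+-------...
 *	| disk_key | chunk (num_stripes parts) | disk_key | chunk
 *	+----------+---------------------------+----------+-------...
 *
 * Each entry spans sizeof(struct btrfs_disk_key) +
 * btrfs_chunk_item_size(num_stripes) bytes; deleting an entry is a single
 * memmove() of everything after it, followed by shrinking the array size.
 */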
2977
2978 /*
2979  * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
2980  * @logical: Logical block offset in bytes.
2981  * @length: Length of extent in bytes.
2982  *
2983  * Return: Chunk mapping or ERR_PTR.
2984  */
2985 struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
2986                                        u64 logical, u64 length)
2987 {
2988         struct extent_map_tree *em_tree;
2989         struct extent_map *em;
2990
2991         em_tree = &fs_info->mapping_tree;
2992         read_lock(&em_tree->lock);
2993         em = lookup_extent_mapping(em_tree, logical, length);
2994         read_unlock(&em_tree->lock);
2995
2996         if (!em) {
2997                 btrfs_crit(fs_info, "unable to find logical %llu length %llu",
2998                            logical, length);
2999                 return ERR_PTR(-EINVAL);
3000         }
3001
3002         if (em->start > logical || em->start + em->len < logical) {
3003                 btrfs_crit(fs_info,
3004                            "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
3005                            logical, length, em->start, em->start + em->len);
3006                 free_extent_map(em);
3007                 return ERR_PTR(-EINVAL);
3008         }
3009
3010         /* callers are responsible for dropping em's ref. */
3011         return em;
3012 }
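/*
 * Usage sketch (editor's note, not part of this file): callers pair
 * btrfs_get_chunk_map() with free_extent_map(), as btrfs_remove_chunk()
 * below does:
 *
 *	em = btrfs_get_chunk_map(fs_info, logical, len);
 *	if (IS_ERR(em))
 *		return PTR_ERR(em);
 *	map = em->map_lookup;
 *	...
 *	free_extent_map(em);
 */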
3013
3014 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
3015 {
3016         struct btrfs_fs_info *fs_info = trans->fs_info;
3017         struct extent_map *em;
3018         struct map_lookup *map;
3019         u64 dev_extent_len = 0;
3020         int i, ret = 0;
3021         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
3022
3023         em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
3024         if (IS_ERR(em)) {
3025                 /*
3026                  * This is a logic error, but we don't want to just rely on the
3027                  * user having built with ASSERT enabled, so if ASSERT doesn't
3028                  * do anything we still error out.
3029                  */
3030                 ASSERT(0);
3031                 return PTR_ERR(em);
3032         }
3033         map = em->map_lookup;
3034         mutex_lock(&fs_info->chunk_mutex);
3035         check_system_chunk(trans, map->type);
3036         mutex_unlock(&fs_info->chunk_mutex);
3037
3038         /*
3039          * Take the device list mutex to prevent races with the final phase of
3040          * a device replace operation that replaces the device object associated
3041          * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
3042          */
3043         mutex_lock(&fs_devices->device_list_mutex);
3044         for (i = 0; i < map->num_stripes; i++) {
3045                 struct btrfs_device *device = map->stripes[i].dev;
3046                 ret = btrfs_free_dev_extent(trans, device,
3047                                             map->stripes[i].physical,
3048                                             &dev_extent_len);
3049                 if (ret) {
3050                         mutex_unlock(&fs_devices->device_list_mutex);
3051                         btrfs_abort_transaction(trans, ret);
3052                         goto out;
3053                 }
3054
3055                 if (device->bytes_used > 0) {
3056                         mutex_lock(&fs_info->chunk_mutex);
3057                         btrfs_device_set_bytes_used(device,
3058                                         device->bytes_used - dev_extent_len);
3059                         atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
3060                         btrfs_clear_space_info_full(fs_info);
3061                         mutex_unlock(&fs_info->chunk_mutex);
3062                 }
3063
3064                 ret = btrfs_update_device(trans, device);
3065                 if (ret) {
3066                         mutex_unlock(&fs_devices->device_list_mutex);
3067                         btrfs_abort_transaction(trans, ret);
3068                         goto out;
3069                 }
3070         }
3071         mutex_unlock(&fs_devices->device_list_mutex);
3072
3073         ret = btrfs_free_chunk(trans, chunk_offset);
3074         if (ret) {
3075                 btrfs_abort_transaction(trans, ret);
3076                 goto out;
3077         }
3078
3079         trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
3080
3081         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
3082                 ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
3083                 if (ret) {
3084                         btrfs_abort_transaction(trans, ret);
3085                         goto out;
3086                 }
3087         }
3088
3089         ret = btrfs_remove_block_group(trans, chunk_offset, em);
3090         if (ret) {
3091                 btrfs_abort_transaction(trans, ret);
3092                 goto out;
3093         }
3094
3095 out:
3096         /* once for us */
3097         free_extent_map(em);
3098         return ret;
3099 }
3100
3101 static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
3102 {
3103         struct btrfs_root *root = fs_info->chunk_root;
3104         struct btrfs_trans_handle *trans;
3105         struct btrfs_block_group *block_group;
3106         int ret;
3107
3108         /*
3109          * Prevent races with automatic removal of unused block groups.
3110          * After we relocate and before we remove the chunk with offset
3111          * chunk_offset, automatic removal of the block group can kick in,
3112          * resulting in a failure when calling btrfs_remove_chunk() below.
3113          *
3114          * Make sure to acquire this mutex before doing a tree search (dev
3115          * or chunk trees) to find chunks. Otherwise the cleaner kthread might
3116          * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
3117          * we release the path used to search the chunk/dev tree and before
3118          * the current task acquires this mutex and calls us.
3119          */
3120         lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);
3121
3122         /* step one, relocate all the extents inside this chunk */
3123         btrfs_scrub_pause(fs_info);
3124         ret = btrfs_relocate_block_group(fs_info, chunk_offset);
3125         btrfs_scrub_continue(fs_info);
3126         if (ret)
3127                 return ret;
3128
3129         block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
3130         if (!block_group)
3131                 return -ENOENT;
3132         btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
3133         btrfs_put_block_group(block_group);
3134
3135         trans = btrfs_start_trans_remove_block_group(root->fs_info,
3136                                                      chunk_offset);
3137         if (IS_ERR(trans)) {
3138                 ret = PTR_ERR(trans);
3139                 btrfs_handle_fs_error(root->fs_info, ret, NULL);
3140                 return ret;
3141         }
3142
3143         /*
3144          * step two, delete the device extents and the
3145          * chunk tree entries
3146          */
3147         ret = btrfs_remove_chunk(trans, chunk_offset);
3148         btrfs_end_transaction(trans);
3149         return ret;
3150 }
3151
3152 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
3153 {
3154         struct btrfs_root *chunk_root = fs_info->chunk_root;
3155         struct btrfs_path *path;
3156         struct extent_buffer *leaf;
3157         struct btrfs_chunk *chunk;
3158         struct btrfs_key key;
3159         struct btrfs_key found_key;
3160         u64 chunk_type;
3161         bool retried = false;
3162         int failed = 0;
3163         int ret;
3164
3165         path = btrfs_alloc_path();
3166         if (!path)
3167                 return -ENOMEM;
3168
3169 again:
3170         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3171         key.offset = (u64)-1;
3172         key.type = BTRFS_CHUNK_ITEM_KEY;
3173
3174         while (1) {
3175                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
3176                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3177                 if (ret < 0) {
3178                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3179                         goto error;
3180                 }
3181                 BUG_ON(ret == 0); /* Corruption */
3182
3183                 ret = btrfs_previous_item(chunk_root, path, key.objectid,
3184                                           key.type);
3185                 if (ret)
3186                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3187                 if (ret < 0)
3188                         goto error;
3189                 if (ret > 0)
3190                         break;
3191
3192                 leaf = path->nodes[0];
3193                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3194
3195                 chunk = btrfs_item_ptr(leaf, path->slots[0],
3196                                        struct btrfs_chunk);
3197                 chunk_type = btrfs_chunk_type(leaf, chunk);
3198                 btrfs_release_path(path);
3199
3200                 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
3201                         ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3202                         if (ret == -ENOSPC)
3203                                 failed++;
3204                         else
3205                                 BUG_ON(ret);
3206                 }
3207                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3208
3209                 if (found_key.offset == 0)
3210                         break;
3211                 key.offset = found_key.offset - 1;
3212         }
3213         ret = 0;
3214         if (failed && !retried) {
3215                 failed = 0;
3216                 retried = true;
3217                 goto again;
3218         } else if (WARN_ON(failed && retried)) {
3219                 ret = -ENOSPC;
3220         }
3221 error:
3222         btrfs_free_path(path);
3223         return ret;
3224 }
3225
3226 /*
3227  * Return 1 if a data chunk was allocated successfully,
3228  * return <0 on error while allocating a data chunk,
3229  * return 0 if there was no need to allocate a data chunk.
3230  */
3231 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
3232                                       u64 chunk_offset)
3233 {
3234         struct btrfs_block_group *cache;
3235         u64 bytes_used;
3236         u64 chunk_type;
3237
3238         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3239         ASSERT(cache);
3240         chunk_type = cache->flags;
3241         btrfs_put_block_group(cache);
3242
3243         if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA))
3244                 return 0;
3245
3246         spin_lock(&fs_info->data_sinfo->lock);
3247         bytes_used = fs_info->data_sinfo->bytes_used;
3248         spin_unlock(&fs_info->data_sinfo->lock);
3249
3250         if (!bytes_used) {
3251                 struct btrfs_trans_handle *trans;
3252                 int ret;
3253
3254                 trans = btrfs_join_transaction(fs_info->tree_root);
3255                 if (IS_ERR(trans))
3256                         return PTR_ERR(trans);
3257
3258                 ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
3259                 btrfs_end_transaction(trans);
3260                 if (ret < 0)
3261                         return ret;
3262                 return 1;
3263         }
3264
3265         return 0;
3266 }
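/*
 * Caller sketch (editor's note), matching the return contract documented
 * above and the use in __btrfs_balance() below:
 *
 *	ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
 *	if (ret < 0)
 *		goto error;          (allocation failed)
 *	else if (ret == 1)
 *		chunk_reserved = 1;  (an empty data chunk was allocated)
 *	(ret == 0: not a data chunk, or data is in use - nothing to do)
 */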
3267
3268 static int insert_balance_item(struct btrfs_fs_info *fs_info,
3269                                struct btrfs_balance_control *bctl)
3270 {
3271         struct btrfs_root *root = fs_info->tree_root;
3272         struct btrfs_trans_handle *trans;
3273         struct btrfs_balance_item *item;
3274         struct btrfs_disk_balance_args disk_bargs;
3275         struct btrfs_path *path;
3276         struct extent_buffer *leaf;
3277         struct btrfs_key key;
3278         int ret, err;
3279
3280         path = btrfs_alloc_path();
3281         if (!path)
3282                 return -ENOMEM;
3283
3284         trans = btrfs_start_transaction(root, 0);
3285         if (IS_ERR(trans)) {
3286                 btrfs_free_path(path);
3287                 return PTR_ERR(trans);
3288         }
3289
3290         key.objectid = BTRFS_BALANCE_OBJECTID;
3291         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3292         key.offset = 0;
3293
3294         ret = btrfs_insert_empty_item(trans, root, path, &key,
3295                                       sizeof(*item));
3296         if (ret)
3297                 goto out;
3298
3299         leaf = path->nodes[0];
3300         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3301
3302         memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
3303
3304         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
3305         btrfs_set_balance_data(leaf, item, &disk_bargs);
3306         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
3307         btrfs_set_balance_meta(leaf, item, &disk_bargs);
3308         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
3309         btrfs_set_balance_sys(leaf, item, &disk_bargs);
3310
3311         btrfs_set_balance_flags(leaf, item, bctl->flags);
3312
3313         btrfs_mark_buffer_dirty(leaf);
3314 out:
3315         btrfs_free_path(path);
3316         err = btrfs_commit_transaction(trans);
3317         if (err && !ret)
3318                 ret = err;
3319         return ret;
3320 }
3321
3322 static int del_balance_item(struct btrfs_fs_info *fs_info)
3323 {
3324         struct btrfs_root *root = fs_info->tree_root;
3325         struct btrfs_trans_handle *trans;
3326         struct btrfs_path *path;
3327         struct btrfs_key key;
3328         int ret, err;
3329
3330         path = btrfs_alloc_path();
3331         if (!path)
3332                 return -ENOMEM;
3333
3334         trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
3335         if (IS_ERR(trans)) {
3336                 btrfs_free_path(path);
3337                 return PTR_ERR(trans);
3338         }
3339
3340         key.objectid = BTRFS_BALANCE_OBJECTID;
3341         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3342         key.offset = 0;
3343
3344         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3345         if (ret < 0)
3346                 goto out;
3347         if (ret > 0) {
3348                 ret = -ENOENT;
3349                 goto out;
3350         }
3351
3352         ret = btrfs_del_item(trans, root, path);
3353 out:
3354         btrfs_free_path(path);
3355         err = btrfs_commit_transaction(trans);
3356         if (err && !ret)
3357                 ret = err;
3358         return ret;
3359 }
3360
3361 /*
3362  * This is a heuristic used to reduce the number of chunks balanced on
3363  * resume after balance was interrupted.
3364  */
3365 static void update_balance_args(struct btrfs_balance_control *bctl)
3366 {
3367         /*
3368          * Turn on soft mode for chunk types that were being converted.
3369          */
3370         if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
3371                 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
3372         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
3373                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
3374         if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
3375                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
3376
3377         /*
3378          * Turn on the usage filter if it is not already in use.  The idea
3379          * is that chunks that we have already balanced should be
3380          * reasonably full.  Don't do it for chunks that are being
3381          * converted - that will keep us from relocating unconverted
3382          * (albeit full) chunks.
3383          */
3384         if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3385             !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3386             !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3387                 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
3388                 bctl->data.usage = 90;
3389         }
3390         if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3391             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3392             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3393                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
3394                 bctl->sys.usage = 90;
3395         }
3396         if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3397             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3398             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3399                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
3400                 bctl->meta.usage = 90;
3401         }
3402 }
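/*
 * Example (editor's note): a balance started as "-dconvert=raid1" that is
 * interrupted resumes as if it had been "-dconvert=raid1,soft", and a plain
 * "-d" balance (no filters) resumes as "-dusage=90", so chunks that were
 * already balanced are not relocated a second time.
 */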
3403
3404 /*
3405  * Clear the balance status in fs_info and delete the balance item from disk.
3406  */
3407 static void reset_balance_state(struct btrfs_fs_info *fs_info)
3408 {
3409         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3410         int ret;
3411
3412         BUG_ON(!fs_info->balance_ctl);
3413
3414         spin_lock(&fs_info->balance_lock);
3415         fs_info->balance_ctl = NULL;
3416         spin_unlock(&fs_info->balance_lock);
3417
3418         kfree(bctl);
3419         ret = del_balance_item(fs_info);
3420         if (ret)
3421                 btrfs_handle_fs_error(fs_info, ret, NULL);
3422 }
3423
3424 /*
3425  * Balance filters.  Return 1 if chunk should be filtered out
3426  * (should not be balanced).
3427  */
3428 static int chunk_profiles_filter(u64 chunk_type,
3429                                  struct btrfs_balance_args *bargs)
3430 {
3431         chunk_type = chunk_to_extended(chunk_type) &
3432                                 BTRFS_EXTENDED_PROFILE_MASK;
3433
3434         if (bargs->profiles & chunk_type)
3435                 return 0;
3436
3437         return 1;
3438 }
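/*
 * Editor's note: chunk_to_extended() maps the on-disk SINGLE profile (no
 * profile bits set) to the in-memory BTRFS_AVAIL_ALLOC_BIT_SINGLE, so a
 * SINGLE chunk passes this filter only if bargs->profiles includes that
 * special bit.
 */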
3439
3440 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
3441                               struct btrfs_balance_args *bargs)
3442 {
3443         struct btrfs_block_group *cache;
3444         u64 chunk_used;
3445         u64 user_thresh_min;
3446         u64 user_thresh_max;
3447         int ret = 1;
3448
3449         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3450         chunk_used = cache->used;
3451
3452         if (bargs->usage_min == 0)
3453                 user_thresh_min = 0;
3454         else
3455                 user_thresh_min = div_factor_fine(cache->length,
3456                                                   bargs->usage_min);
3457
3458         if (bargs->usage_max == 0)
3459                 user_thresh_max = 1;
3460         else if (bargs->usage_max > 100)
3461                 user_thresh_max = cache->length;
3462         else
3463                 user_thresh_max = div_factor_fine(cache->length,
3464                                                   bargs->usage_max);
3465
3466         if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
3467                 ret = 0;
3468
3469         btrfs_put_block_group(cache);
3470         return ret;
3471 }
3472
3473 static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
3474                 u64 chunk_offset, struct btrfs_balance_args *bargs)
3475 {
3476         struct btrfs_block_group *cache;
3477         u64 chunk_used, user_thresh;
3478         int ret = 1;
3479
3480         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3481         chunk_used = cache->used;
3482
3483         if (bargs->usage_min == 0)
3484                 user_thresh = 1;
3485         else if (bargs->usage > 100)
3486                 user_thresh = cache->length;
3487         else
3488                 user_thresh = div_factor_fine(cache->length, bargs->usage);
3489
3490         if (chunk_used < user_thresh)
3491                 ret = 0;
3492
3493         btrfs_put_block_group(cache);
3494         return ret;
3495 }
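/*
 * Worked example (editor's note): with "usage=50" on a 1GiB block group,
 * user_thresh = div_factor_fine(1GiB, 50) = 1GiB * 50 / 100 = 512MiB, so
 * the chunk is balanced (the filter returns 0) only while less than 512MiB
 * of it is used.
 */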
3496
3497 static int chunk_devid_filter(struct extent_buffer *leaf,
3498                               struct btrfs_chunk *chunk,
3499                               struct btrfs_balance_args *bargs)
3500 {
3501         struct btrfs_stripe *stripe;
3502         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3503         int i;
3504
3505         for (i = 0; i < num_stripes; i++) {
3506                 stripe = btrfs_stripe_nr(chunk, i);
3507                 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
3508                         return 0;
3509         }
3510
3511         return 1;
3512 }
3513
3514 static u64 calc_data_stripes(u64 type, int num_stripes)
3515 {
3516         const int index = btrfs_bg_flags_to_raid_index(type);
3517         const int ncopies = btrfs_raid_array[index].ncopies;
3518         const int nparity = btrfs_raid_array[index].nparity;
3519
3520         if (nparity)
3521                 return num_stripes - nparity;
3522         else
3523                 return num_stripes / ncopies;
3524 }
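/*
 * Worked examples (editor's note): RAID5 with 4 stripes has nparity == 1,
 * giving 4 - 1 = 3 data stripes; RAID10 with 4 stripes has nparity == 0 and
 * ncopies == 2, giving 4 / 2 = 2 data stripes.
 */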
3525
3526 /* [pstart, pend) */
3527 static int chunk_drange_filter(struct extent_buffer *leaf,
3528                                struct btrfs_chunk *chunk,
3529                                struct btrfs_balance_args *bargs)
3530 {
3531         struct btrfs_stripe *stripe;
3532         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3533         u64 stripe_offset;
3534         u64 stripe_length;
3535         u64 type;
3536         int factor;
3537         int i;
3538
3539         if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
3540                 return 0;
3541
3542         type = btrfs_chunk_type(leaf, chunk);
3543         factor = calc_data_stripes(type, num_stripes);
3544
3545         for (i = 0; i < num_stripes; i++) {
3546                 stripe = btrfs_stripe_nr(chunk, i);
3547                 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
3548                         continue;
3549
3550                 stripe_offset = btrfs_stripe_offset(leaf, stripe);
3551                 stripe_length = btrfs_chunk_length(leaf, chunk);
3552                 stripe_length = div_u64(stripe_length, factor);
3553
3554                 if (stripe_offset < bargs->pend &&
3555                     stripe_offset + stripe_length > bargs->pstart)
3556                         return 0;
3557         }
3558
3559         return 1;
3560 }
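/*
 * Editor's note: the check above is the standard half-open interval overlap
 * test: [stripe_offset, stripe_offset + stripe_length) intersects
 * [pstart, pend) iff stripe_offset < pend and
 * stripe_offset + stripe_length > pstart.
 */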
3561
3562 /* [vstart, vend) */
3563 static int chunk_vrange_filter(struct extent_buffer *leaf,
3564                                struct btrfs_chunk *chunk,
3565                                u64 chunk_offset,
3566                                struct btrfs_balance_args *bargs)
3567 {
3568         if (chunk_offset < bargs->vend &&
3569             chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
3570                 /* at least part of the chunk is inside this vrange */
3571                 return 0;
3572
3573         return 1;
3574 }
3575
3576 static int chunk_stripes_range_filter(struct extent_buffer *leaf,
3577                                struct btrfs_chunk *chunk,
3578                                struct btrfs_balance_args *bargs)
3579 {
3580         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3581
3582         if (bargs->stripes_min <= num_stripes &&
3583             num_stripes <= bargs->stripes_max)
3584                 return 0;
3585
3586         return 1;
3587 }
3588
3589 static int chunk_soft_convert_filter(u64 chunk_type,
3590                                      struct btrfs_balance_args *bargs)
3591 {
3592         if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3593                 return 0;
3594
3595         chunk_type = chunk_to_extended(chunk_type) &
3596                                 BTRFS_EXTENDED_PROFILE_MASK;
3597
3598         if (bargs->target == chunk_type)
3599                 return 1;
3600
3601         return 0;
3602 }
3603
3604 static int should_balance_chunk(struct extent_buffer *leaf,
3605                                 struct btrfs_chunk *chunk, u64 chunk_offset)
3606 {
3607         struct btrfs_fs_info *fs_info = leaf->fs_info;
3608         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3609         struct btrfs_balance_args *bargs = NULL;
3610         u64 chunk_type = btrfs_chunk_type(leaf, chunk);
3611
3612         /* type filter */
3613         if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
3614               (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
3615                 return 0;
3616         }
3617
3618         if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3619                 bargs = &bctl->data;
3620         else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3621                 bargs = &bctl->sys;
3622         else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3623                 bargs = &bctl->meta;
3624
3625         /* profiles filter */
3626         if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
3627             chunk_profiles_filter(chunk_type, bargs)) {
3628                 return 0;
3629         }
3630
3631         /* usage filter */
3632         if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
3633             chunk_usage_filter(fs_info, chunk_offset, bargs)) {
3634                 return 0;
3635         } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3636             chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
3637                 return 0;
3638         }
3639
3640         /* devid filter */
3641         if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
3642             chunk_devid_filter(leaf, chunk, bargs)) {
3643                 return 0;
3644         }
3645
3646         /* drange filter, makes sense only with devid filter */
3647         if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
3648             chunk_drange_filter(leaf, chunk, bargs)) {
3649                 return 0;
3650         }
3651
3652         /* vrange filter */
3653         if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
3654             chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
3655                 return 0;
3656         }
3657
3658         /* stripes filter */
3659         if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
3660             chunk_stripes_range_filter(leaf, chunk, bargs)) {
3661                 return 0;
3662         }
3663
3664         /* soft profile changing mode */
3665         if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
3666             chunk_soft_convert_filter(chunk_type, bargs)) {
3667                 return 0;
3668         }
3669
3670         /*
3671          * Limited by count; stateful, so this must be the last filter.
3672          */
3673         if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
3674                 if (bargs->limit == 0)
3675                         return 0;
3676                 else
3677                         bargs->limit--;
3678         } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
3679                 /*
3680                  * Same logic as the 'limit' filter; the minimum cannot be
3681                  * determined here because we do not have the global information
3682                  * about the count of all chunks that satisfy the filters.
3683                  */
3684                 if (bargs->limit_max == 0)
3685                         return 0;
3686                 else
3687                         bargs->limit_max--;
3688         }
3689
3690         return 1;
3691 }
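/*
 * Editor's note: the filter order above is type, profiles, usage/usage
 * range, devid, drange, vrange, stripes range, soft convert, and finally
 * the stateful limit filters; a chunk is balanced only if no filter
 * rejects it.
 */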
3692
3693 static int __btrfs_balance(struct btrfs_fs_info *fs_info)
3694 {
3695         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3696         struct btrfs_root *chunk_root = fs_info->chunk_root;
3697         u64 chunk_type;
3698         struct btrfs_chunk *chunk;
3699         struct btrfs_path *path = NULL;
3700         struct btrfs_key key;
3701         struct btrfs_key found_key;
3702         struct extent_buffer *leaf;
3703         int slot;
3704         int ret;
3705         int enospc_errors = 0;
3706         bool counting = true;
3707         /* The single value limit and min/max limits share the same bytes in the args (a union) */
3708         u64 limit_data = bctl->data.limit;
3709         u64 limit_meta = bctl->meta.limit;
3710         u64 limit_sys = bctl->sys.limit;
3711         u32 count_data = 0;
3712         u32 count_meta = 0;
3713         u32 count_sys = 0;
3714         int chunk_reserved = 0;
3715
3716         path = btrfs_alloc_path();
3717         if (!path) {
3718                 ret = -ENOMEM;
3719                 goto error;
3720         }
3721
3722         /* zero out stat counters */
3723         spin_lock(&fs_info->balance_lock);
3724         memset(&bctl->stat, 0, sizeof(bctl->stat));
3725         spin_unlock(&fs_info->balance_lock);
3726 again:
3727         if (!counting) {
3728                 /*
3729                  * The single value limit and min/max limits use the same bytes
3730                  * in the args (a union); restore the values saved above.
3731                  */
3732                 bctl->data.limit = limit_data;
3733                 bctl->meta.limit = limit_meta;
3734                 bctl->sys.limit = limit_sys;
3735         }
3736         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3737         key.offset = (u64)-1;
3738         key.type = BTRFS_CHUNK_ITEM_KEY;
3739
3740         while (1) {
3741                 if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
3742                     atomic_read(&fs_info->balance_cancel_req)) {
3743                         ret = -ECANCELED;
3744                         goto error;
3745                 }
3746
3747                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
3748                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3749                 if (ret < 0) {
3750                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3751                         goto error;
3752                 }
3753
3754                 /*
3755                  * This shouldn't happen; it means the last relocation
3756                  * failed.
3757                  */
3758                 if (ret == 0)
3759                         BUG(); /* FIXME break ? */
3760
3761                 ret = btrfs_previous_item(chunk_root, path, 0,
3762                                           BTRFS_CHUNK_ITEM_KEY);
3763                 if (ret) {
3764                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3765                         ret = 0;
3766                         break;
3767                 }
3768
3769                 leaf = path->nodes[0];
3770                 slot = path->slots[0];
3771                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3772
3773                 if (found_key.objectid != key.objectid) {
3774                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3775                         break;
3776                 }
3777
3778                 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3779                 chunk_type = btrfs_chunk_type(leaf, chunk);
3780
3781                 if (!counting) {
3782                         spin_lock(&fs_info->balance_lock);
3783                         bctl->stat.considered++;
3784                         spin_unlock(&fs_info->balance_lock);
3785                 }
3786
3787                 ret = should_balance_chunk(leaf, chunk, found_key.offset);
3788
3789                 btrfs_release_path(path);
3790                 if (!ret) {
3791                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3792                         goto loop;
3793                 }
3794
3795                 if (counting) {
3796                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3797                         spin_lock(&fs_info->balance_lock);
3798                         bctl->stat.expected++;
3799                         spin_unlock(&fs_info->balance_lock);
3800
3801                         if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3802                                 count_data++;
3803                         else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3804                                 count_sys++;
3805                         else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3806                                 count_meta++;
3807
3808                         goto loop;
3809                 }
3810
3811                 /*
3812                  * Apply limit_min filter, no need to check if the LIMITS
3813                  * filter is used, limit_min is 0 by default
3814                  */
3815                 if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
3816                      count_data < bctl->data.limit_min) ||
3817                     ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
3818                      count_meta < bctl->meta.limit_min) ||
3819                     ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
3820                      count_sys < bctl->sys.limit_min)) {
3821                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3822                         goto loop;
3823                 }
3824
3825                 if (!chunk_reserved) {
3826                         /*
3827                          * We may be relocating the only data chunk we have,
3828                          * which could potentially end up losing the data
3829                          * raid profile, so let's allocate an empty one in
3830                          * advance.
3831                          */
3832                         ret = btrfs_may_alloc_data_chunk(fs_info,
3833                                                          found_key.offset);
3834                         if (ret < 0) {
3835                                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3836                                 goto error;
3837                         } else if (ret == 1) {
3838                                 chunk_reserved = 1;
3839                         }
3840                 }
3841
3842                 ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3843                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3844                 if (ret == -ENOSPC) {
3845                         enospc_errors++;
3846                 } else if (ret == -ETXTBSY) {
3847                         btrfs_info(fs_info,
3848            "skipping relocation of block group %llu due to active swapfile",
3849                                    found_key.offset);
3850                         ret = 0;
3851                 } else if (ret) {
3852                         goto error;
3853                 } else {
3854                         spin_lock(&fs_info->balance_lock);
3855                         bctl->stat.completed++;
3856                         spin_unlock(&fs_info->balance_lock);
3857                 }
3858 loop:
3859                 if (found_key.offset == 0)
3860                         break;
3861                 key.offset = found_key.offset - 1;
3862         }
3863
3864         if (counting) {
3865                 btrfs_release_path(path);
3866                 counting = false;
3867                 goto again;
3868         }
3869 error:
3870         btrfs_free_path(path);
3871         if (enospc_errors) {
3872                 btrfs_info(fs_info, "%d enospc errors during balance",
3873                            enospc_errors);
3874                 if (!ret)
3875                         ret = -ENOSPC;
3876         }
3877
3878         return ret;
3879 }
3880
3881 /**
3882  * alloc_profile_is_valid - see if a given profile is valid and reduced
3883  * @flags: profile to validate
3884  * @extended: if true @flags is treated as an extended profile
3885  */
3886 static int alloc_profile_is_valid(u64 flags, int extended)
3887 {
3888         u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
3889                                BTRFS_BLOCK_GROUP_PROFILE_MASK);
3890
3891         flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
3892
3893         /* 1) check that all other bits are zeroed */
3894         if (flags & ~mask)
3895                 return 0;
3896
3897         /* 2) see if profile is reduced */
3898         if (flags == 0)
3899                 return !extended; /* "0" is valid for usual profiles */
3900
3901         return has_single_bit_set(flags);
3902 }
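/*
 * Examples (editor's note): BTRFS_BLOCK_GROUP_RAID1 alone is valid;
 * BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10 has two profile bits
 * set, is not reduced, and is rejected; 0 is valid only for non-extended
 * profiles, where it means SINGLE.
 */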
3903
3904 static inline int balance_need_close(struct btrfs_fs_info *fs_info)
3905 {
3906         /* cancel requested || normal exit path */
3907         return atomic_read(&fs_info->balance_cancel_req) ||
3908                 (atomic_read(&fs_info->balance_pause_req) == 0 &&
3909                  atomic_read(&fs_info->balance_cancel_req) == 0);
3910 }
3911
3912 /*
3913  * Validate target profile against allowed profiles and return true if it's OK.
3914  * Otherwise print the error message and return false.
3915  */
3916 static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
3917                 const struct btrfs_balance_args *bargs,
3918                 u64 allowed, const char *type)
3919 {
3920         if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3921                 return true;
3922
3923         /* Profile is valid and does not have bits outside of the allowed set */
3924         if (alloc_profile_is_valid(bargs->target, 1) &&
3925             (bargs->target & ~allowed) == 0)
3926                 return true;
3927
3928         btrfs_err(fs_info, "balance: invalid convert %s profile %s",
3929                         type, btrfs_bg_type_to_raid_name(bargs->target));
3930         return false;
3931 }
3932
3933 /*
3934  * Fill @buf with textual description of balance filter flags @bargs, up to
3935  * @size_buf including the terminating null. The output may be trimmed if it
3936  * does not fit into the provided buffer.
3937  */
3938 static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
3939                                  u32 size_buf)
3940 {
3941         int ret;
3942         u32 size_bp = size_buf;
3943         char *bp = buf;
3944         u64 flags = bargs->flags;
3945         char tmp_buf[128] = {'\0'};
3946
3947         if (!flags)
3948                 return;
3949
3950 #define CHECK_APPEND_NOARG(a)                                           \
3951         do {                                                            \
3952                 ret = snprintf(bp, size_bp, (a));                       \
3953                 if (ret < 0 || ret >= size_bp)                          \
3954                         goto out_overflow;                              \
3955                 size_bp -= ret;                                         \
3956                 bp += ret;                                              \
3957         } while (0)
3958
3959 #define CHECK_APPEND_1ARG(a, v1)                                        \
3960         do {                                                            \
3961                 ret = snprintf(bp, size_bp, (a), (v1));                 \
3962                 if (ret < 0 || ret >= size_bp)                          \
3963                         goto out_overflow;                              \
3964                 size_bp -= ret;                                         \
3965                 bp += ret;                                              \
3966         } while (0)
3967
3968 #define CHECK_APPEND_2ARG(a, v1, v2)                                    \
3969         do {                                                            \
3970                 ret = snprintf(bp, size_bp, (a), (v1), (v2));           \
3971                 if (ret < 0 || ret >= size_bp)                          \
3972                         goto out_overflow;                              \
3973                 size_bp -= ret;                                         \
3974                 bp += ret;                                              \
3975         } while (0)
3976
3977         if (flags & BTRFS_BALANCE_ARGS_CONVERT)
3978                 CHECK_APPEND_1ARG("convert=%s,",
3979                                   btrfs_bg_type_to_raid_name(bargs->target));
3980
3981         if (flags & BTRFS_BALANCE_ARGS_SOFT)
3982                 CHECK_APPEND_NOARG("soft,");
3983
3984         if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
3985                 btrfs_describe_block_groups(bargs->profiles, tmp_buf,
3986                                             sizeof(tmp_buf));
3987                 CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
3988         }
3989
3990         if (flags & BTRFS_BALANCE_ARGS_USAGE)
3991                 CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);
3992
3993         if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
3994                 CHECK_APPEND_2ARG("usage=%u..%u,",
3995                                   bargs->usage_min, bargs->usage_max);
3996
3997         if (flags & BTRFS_BALANCE_ARGS_DEVID)
3998                 CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);
3999
4000         if (flags & BTRFS_BALANCE_ARGS_DRANGE)
4001                 CHECK_APPEND_2ARG("drange=%llu..%llu,",
4002                                   bargs->pstart, bargs->pend);
4003
4004         if (flags & BTRFS_BALANCE_ARGS_VRANGE)
4005                 CHECK_APPEND_2ARG("vrange=%llu..%llu,",
4006                                   bargs->vstart, bargs->vend);
4007
4008         if (flags & BTRFS_BALANCE_ARGS_LIMIT)
4009                 CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);
4010
4011         if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
4012                 CHECK_APPEND_2ARG("limit=%u..%u,",
4013                                 bargs->limit_min, bargs->limit_max);
4014
4015         if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
4016                 CHECK_APPEND_2ARG("stripes=%u..%u,",
4017                                   bargs->stripes_min, bargs->stripes_max);
4018
4019 #undef CHECK_APPEND_2ARG
4020 #undef CHECK_APPEND_1ARG
4021 #undef CHECK_APPEND_NOARG
4022
4023 out_overflow:
4024
4025         if (size_bp < size_buf)
4026                 buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
4027         else
4028                 buf[0] = '\0';
4029 }
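/*
 * Illustrative output (editor's note): for a data balance with
 * BTRFS_BALANCE_ARGS_CONVERT to raid1 plus the soft flag, this produces
 * "convert=raid1,soft" (the trailing comma is trimmed at out_overflow).
 */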
4030
4031 static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
4032 {
4033         u32 size_buf = 1024;
4034         char tmp_buf[192] = {'\0'};
4035         char *buf;
4036         char *bp;
4037         u32 size_bp = size_buf;
4038         int ret;
4039         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
4040
4041         buf = kzalloc(size_buf, GFP_KERNEL);
4042         if (!buf)
4043                 return;
4044
4045         bp = buf;
4046
4047 #define CHECK_APPEND_1ARG(a, v1)                                        \
4048         do {                                                            \
4049                 ret = snprintf(bp, size_bp, (a), (v1));                 \
4050                 if (ret < 0 || ret >= size_bp)                          \
4051                         goto out_overflow;                              \
4052                 size_bp -= ret;                                         \
4053                 bp += ret;                                              \
4054         } while (0)
4055
4056         if (bctl->flags & BTRFS_BALANCE_FORCE)
4057                 CHECK_APPEND_1ARG("%s", "-f ");
4058
4059         if (bctl->flags & BTRFS_BALANCE_DATA) {
4060                 describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
4061                 CHECK_APPEND_1ARG("-d%s ", tmp_buf);
4062         }
4063
4064         if (bctl->flags & BTRFS_BALANCE_METADATA) {
4065                 describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
4066                 CHECK_APPEND_1ARG("-m%s ", tmp_buf);
4067         }
4068
4069         if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
4070                 describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
4071                 CHECK_APPEND_1ARG("-s%s ", tmp_buf);
4072         }
4073
4074 #undef CHECK_APPEND_1ARG
4075
4076 out_overflow:
4077
4078         if (size_bp < size_buf)
4079                 buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
4080         btrfs_info(fs_info, "balance: %s %s",
4081                    (bctl->flags & BTRFS_BALANCE_RESUME) ?
4082                    "resume" : "start", buf);
4083
4084         kfree(buf);
4085 }
4086
4087 /*
4088  * Should be called with the balance mutex held.
4089  */
4090 int btrfs_balance(struct btrfs_fs_info *fs_info,
4091                   struct btrfs_balance_control *bctl,
4092                   struct btrfs_ioctl_balance_args *bargs)
4093 {
4094         u64 meta_target, data_target;
4095         u64 allowed;
4096         int mixed = 0;
4097         int ret;
4098         u64 num_devices;
4099         unsigned seq;
4100         bool reducing_redundancy;
4101         int i;
4102
4103         if (btrfs_fs_closing(fs_info) ||
4104             atomic_read(&fs_info->balance_pause_req) ||
4105             btrfs_should_cancel_balance(fs_info)) {
4106                 ret = -EINVAL;
4107                 goto out;
4108         }
4109
4110         allowed = btrfs_super_incompat_flags(fs_info->super_copy);
4111         if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
4112                 mixed = 1;
4113
4114         /*
4115          * In case of mixed groups both data and meta should be picked,
4116          * and identical options should be given for both of them.
4117          */
4118         allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
4119         if (mixed && (bctl->flags & allowed)) {
4120                 if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
4121                     !(bctl->flags & BTRFS_BALANCE_METADATA) ||
4122                     memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
4123                         btrfs_err(fs_info,
4124           "balance: mixed groups data and metadata options must be the same");
4125                         ret = -EINVAL;
4126                         goto out;
4127                 }
4128         }
4129
4130         /*
4131          * rw_devices will not change at the moment, device add/delete/replace
4132          * are exclusive
4133          */
4134         num_devices = fs_info->fs_devices->rw_devices;
4135
4136         /*
4137          * SINGLE profile on-disk has no profile bit, but in-memory we have a
4138          * special bit for it, to make it easier to distinguish.  Thus we need
4139          * to set it manually, or balance would refuse the profile.
4140          */
4141         allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
4142         for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
4143                 if (num_devices >= btrfs_raid_array[i].devs_min)
4144                         allowed |= btrfs_raid_array[i].bg_flag;
4145
4146         if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") ||
4147             !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") ||
4148             !validate_convert_profile(fs_info, &bctl->sys,  allowed, "system")) {
4149                 ret = -EINVAL;
4150                 goto out;
4151         }
4152
4153         /*
4154          * Allow reducing metadata or system integrity only if force is set
4155          * for profiles with redundancy (copies, parity).
4156          */
4157         allowed = 0;
4158         for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
4159                 if (btrfs_raid_array[i].ncopies >= 2 ||
4160                     btrfs_raid_array[i].tolerated_failures >= 1)
4161                         allowed |= btrfs_raid_array[i].bg_flag;
4162         }
4163         do {
4164                 seq = read_seqbegin(&fs_info->profiles_lock);
4165
4166                 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4167                      (fs_info->avail_system_alloc_bits & allowed) &&
4168                      !(bctl->sys.target & allowed)) ||
4169                     ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4170                      (fs_info->avail_metadata_alloc_bits & allowed) &&
4171                      !(bctl->meta.target & allowed)))
4172                         reducing_redundancy = true;
4173                 else
4174                         reducing_redundancy = false;
4175
4176                 /* if we're not converting, the target field is uninitialized */
4177                 meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
4178                         bctl->meta.target : fs_info->avail_metadata_alloc_bits;
4179                 data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
4180                         bctl->data.target : fs_info->avail_data_alloc_bits;
4181         } while (read_seqretry(&fs_info->profiles_lock, seq));
4182
4183         if (reducing_redundancy) {
4184                 if (bctl->flags & BTRFS_BALANCE_FORCE) {
4185                         btrfs_info(fs_info,
4186                            "balance: force reducing metadata redundancy");
4187                 } else {
4188                         btrfs_err(fs_info,
4189         "balance: reduces metadata redundancy, use --force if you want this");
4190                         ret = -EINVAL;
4191                         goto out;
4192                 }
4193         }
4194
4195         if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
4196                 btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
4197                 btrfs_warn(fs_info,
4198         "balance: metadata profile %s has lower redundancy than data profile %s",
4199                                 btrfs_bg_type_to_raid_name(meta_target),
4200                                 btrfs_bg_type_to_raid_name(data_target));
4201         }
4202
4203         if (fs_info->send_in_progress) {
4204                 btrfs_warn_rl(fs_info,
4205 "cannot run balance while send operations are in progress (%d in progress)",
4206                               fs_info->send_in_progress);
4207                 ret = -EAGAIN;
4208                 goto out;
4209         }
4210
4211         ret = insert_balance_item(fs_info, bctl);
4212         if (ret && ret != -EEXIST)
4213                 goto out;
4214
4215         if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
4216                 BUG_ON(ret == -EEXIST);
4217                 BUG_ON(fs_info->balance_ctl);
4218                 spin_lock(&fs_info->balance_lock);
4219                 fs_info->balance_ctl = bctl;
4220                 spin_unlock(&fs_info->balance_lock);
4221         } else {
4222                 BUG_ON(ret != -EEXIST);
4223                 spin_lock(&fs_info->balance_lock);
4224                 update_balance_args(bctl);
4225                 spin_unlock(&fs_info->balance_lock);
4226         }
4227
4228         ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4229         set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
4230         describe_balance_start_or_resume(fs_info);
4231         mutex_unlock(&fs_info->balance_mutex);
4232
4233         ret = __btrfs_balance(fs_info);
4234
4235         mutex_lock(&fs_info->balance_mutex);
4236         if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req))
4237                 btrfs_info(fs_info, "balance: paused");
4238         /*
4239          * Balance can be canceled by:
4240          *
4241          * - Regular cancel request
4242          *   Then ret == -ECANCELED and balance_cancel_req > 0
4243          *
4244          * - Fatal signal to "btrfs" process
4245          *   Either the signal is caught by wait_reserve_ticket() and the
4246          *   callers get -EINTR, or it is caught by
4247          *   btrfs_should_cancel_balance() and they get -ECANCELED.
4248          *   Either way, in this case balance_cancel_req = 0, and
4249          *   ret == -EINTR or ret == -ECANCELED.
4250          *
4251          * So here we only check the return value to catch canceled balance.
4252          */
4253         else if (ret == -ECANCELED || ret == -EINTR)
4254                 btrfs_info(fs_info, "balance: canceled");
4255         else
4256                 btrfs_info(fs_info, "balance: ended with status: %d", ret);
4257
4258         clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
4259
4260         if (bargs) {
4261                 memset(bargs, 0, sizeof(*bargs));
4262                 btrfs_update_ioctl_balance_args(fs_info, bargs);
4263         }
4264
4265         if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
4266             balance_need_close(fs_info)) {
4267                 reset_balance_state(fs_info);
4268                 btrfs_exclop_finish(fs_info);
4269         }
4270
4271         wake_up(&fs_info->balance_wait_q);
4272
4273         return ret;
4274 out:
4275         if (bctl->flags & BTRFS_BALANCE_RESUME)
4276                 reset_balance_state(fs_info);
4277         else
4278                 kfree(bctl);
4279         btrfs_exclop_finish(fs_info);
4280
4281         return ret;
4282 }
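
/*
 * A distilled sketch (illustrative only, not compiled) of the redundancy
 * check done inside btrfs_balance() above.  The helper name and the idea of
 * isolating it are hypothetical; the real code also samples the values under
 * fs_info->profiles_lock.
 */
#if 0
static bool convert_reduces_redundancy(u64 avail_alloc_bits, u64 target,
				       u64 redundant_mask)
{
	/*
	 * Redundant profiles (ncopies >= 2 or tolerated_failures >= 1) are
	 * in use, but the conversion target is not one of them.
	 */
	return (avail_alloc_bits & redundant_mask) && !(target & redundant_mask);
}
#endif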
4283
4284 static int balance_kthread(void *data)
4285 {
4286         struct btrfs_fs_info *fs_info = data;
4287         int ret = 0;
4288
4289         mutex_lock(&fs_info->balance_mutex);
4290         if (fs_info->balance_ctl)
4291                 ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
4292         mutex_unlock(&fs_info->balance_mutex);
4293
4294         return ret;
4295 }
4296
4297 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
4298 {
4299         struct task_struct *tsk;
4300
4301         mutex_lock(&fs_info->balance_mutex);
4302         if (!fs_info->balance_ctl) {
4303                 mutex_unlock(&fs_info->balance_mutex);
4304                 return 0;
4305         }
4306         mutex_unlock(&fs_info->balance_mutex);
4307
4308         if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
4309                 btrfs_info(fs_info, "balance: resume skipped");
4310                 return 0;
4311         }
4312
4313         /*
4314          * A ro->rw remount sequence should continue with the paused balance
4315          * regardless of who paused it (the system or the user, as of now),
4316          * so set the resume flag.
4317          */
4318         spin_lock(&fs_info->balance_lock);
4319         fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
4320         spin_unlock(&fs_info->balance_lock);
4321
4322         tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
4323         return PTR_ERR_OR_ZERO(tsk);
4324 }
4325
4326 int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
4327 {
4328         struct btrfs_balance_control *bctl;
4329         struct btrfs_balance_item *item;
4330         struct btrfs_disk_balance_args disk_bargs;
4331         struct btrfs_path *path;
4332         struct extent_buffer *leaf;
4333         struct btrfs_key key;
4334         int ret;
4335
4336         path = btrfs_alloc_path();
4337         if (!path)
4338                 return -ENOMEM;
4339
4340         key.objectid = BTRFS_BALANCE_OBJECTID;
4341         key.type = BTRFS_TEMPORARY_ITEM_KEY;
4342         key.offset = 0;
4343
4344         ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
4345         if (ret < 0)
4346                 goto out;
4347         if (ret > 0) { /* ret = -ENOENT; */
4348                 ret = 0;
4349                 goto out;
4350         }
4351
4352         bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
4353         if (!bctl) {
4354                 ret = -ENOMEM;
4355                 goto out;
4356         }
4357
4358         leaf = path->nodes[0];
4359         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
4360
4361         bctl->flags = btrfs_balance_flags(leaf, item);
4362         bctl->flags |= BTRFS_BALANCE_RESUME;
4363
4364         btrfs_balance_data(leaf, item, &disk_bargs);
4365         btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
4366         btrfs_balance_meta(leaf, item, &disk_bargs);
4367         btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
4368         btrfs_balance_sys(leaf, item, &disk_bargs);
4369         btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
4370
4371         /*
4372          * This should never happen, as the paused balance state is recovered
4373          * during mount without any chance of other exclusive ops colliding.
4374          *
4375          * This gives the exclusive op status to balance and keeps it in paused
4376          * state until user intervention (cancel or umount). If the ownership
4377          * cannot be assigned, show a message but do not fail. The balance
4378          * is in a paused state and must have fs_info::balance_ctl properly
4379          * set up.
4380          */
4381         if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
4382                 btrfs_warn(fs_info,
4383         "balance: cannot set exclusive op status, resume manually");
4384
4385         btrfs_release_path(path);
4386
4387         mutex_lock(&fs_info->balance_mutex);
4388         BUG_ON(fs_info->balance_ctl);
4389         spin_lock(&fs_info->balance_lock);
4390         fs_info->balance_ctl = bctl;
4391         spin_unlock(&fs_info->balance_lock);
4392         mutex_unlock(&fs_info->balance_mutex);
4393 out:
4394         btrfs_free_path(path);
4395         return ret;
4396 }
4397
4398 int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
4399 {
4400         int ret = 0;
4401
4402         mutex_lock(&fs_info->balance_mutex);
4403         if (!fs_info->balance_ctl) {
4404                 mutex_unlock(&fs_info->balance_mutex);
4405                 return -ENOTCONN;
4406         }
4407
4408         if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4409                 atomic_inc(&fs_info->balance_pause_req);
4410                 mutex_unlock(&fs_info->balance_mutex);
4411
4412                 wait_event(fs_info->balance_wait_q,
4413                            !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4414
4415                 mutex_lock(&fs_info->balance_mutex);
4416                 /* We are OK with balance_ctl being ripped out from under us. */
4417                 BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4418                 atomic_dec(&fs_info->balance_pause_req);
4419         } else {
4420                 ret = -ENOTCONN;
4421         }
4422
4423         mutex_unlock(&fs_info->balance_mutex);
4424         return ret;
4425 }
4426
4427 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
4428 {
4429         mutex_lock(&fs_info->balance_mutex);
4430         if (!fs_info->balance_ctl) {
4431                 mutex_unlock(&fs_info->balance_mutex);
4432                 return -ENOTCONN;
4433         }
4434
4435         /*
4436          * A paused balance with the item stored on disk can be resumed at
4437          * mount time if the mount is read-write. Otherwise it's still paused
4438          * and we must not allow cancelling as it deletes the item.
4439          */
4440         if (sb_rdonly(fs_info->sb)) {
4441                 mutex_unlock(&fs_info->balance_mutex);
4442                 return -EROFS;
4443         }
4444
4445         atomic_inc(&fs_info->balance_cancel_req);
4446         /*
4447          * If balance is running, just wait and return; the balance item
4448          * is deleted in btrfs_balance() in that case.
4449          */
4450         if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4451                 mutex_unlock(&fs_info->balance_mutex);
4452                 wait_event(fs_info->balance_wait_q,
4453                            !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4454                 mutex_lock(&fs_info->balance_mutex);
4455         } else {
4456                 mutex_unlock(&fs_info->balance_mutex);
4457                 /*
4458                  * The lock was released to allow other waiters to continue;
4459                  * we'll reexamine the status once we reacquire it.
4460                  */
4461                 mutex_lock(&fs_info->balance_mutex);
4462
4463                 if (fs_info->balance_ctl) {
4464                         reset_balance_state(fs_info);
4465                         btrfs_exclop_finish(fs_info);
4466                         btrfs_info(fs_info, "balance: canceled");
4467                 }
4468         }
4469
4470         BUG_ON(fs_info->balance_ctl ||
4471                 test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4472         atomic_dec(&fs_info->balance_cancel_req);
4473         mutex_unlock(&fs_info->balance_mutex);
4474         return 0;
4475 }
4476
4477 int btrfs_uuid_scan_kthread(void *data)
4478 {
4479         struct btrfs_fs_info *fs_info = data;
4480         struct btrfs_root *root = fs_info->tree_root;
4481         struct btrfs_key key;
4482         struct btrfs_path *path = NULL;
4483         int ret = 0;
4484         struct extent_buffer *eb;
4485         int slot;
4486         struct btrfs_root_item root_item;
4487         u32 item_size;
4488         struct btrfs_trans_handle *trans = NULL;
4489         bool closing = false;
4490
4491         path = btrfs_alloc_path();
4492         if (!path) {
4493                 ret = -ENOMEM;
4494                 goto out;
4495         }
4496
4497         key.objectid = 0;
4498         key.type = BTRFS_ROOT_ITEM_KEY;
4499         key.offset = 0;
4500
4501         while (1) {
4502                 if (btrfs_fs_closing(fs_info)) {
4503                         closing = true;
4504                         break;
4505                 }
4506                 ret = btrfs_search_forward(root, &key, path,
4507                                 BTRFS_OLDEST_GENERATION);
4508                 if (ret) {
4509                         if (ret > 0)
4510                                 ret = 0;
4511                         break;
4512                 }
4513
4514                 if (key.type != BTRFS_ROOT_ITEM_KEY ||
4515                     (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
4516                      key.objectid != BTRFS_FS_TREE_OBJECTID) ||
4517                     key.objectid > BTRFS_LAST_FREE_OBJECTID)
4518                         goto skip;
4519
4520                 eb = path->nodes[0];
4521                 slot = path->slots[0];
4522                 item_size = btrfs_item_size_nr(eb, slot);
4523                 if (item_size < sizeof(root_item))
4524                         goto skip;
4525
4526                 read_extent_buffer(eb, &root_item,
4527                                    btrfs_item_ptr_offset(eb, slot),
4528                                    (int)sizeof(root_item));
4529                 if (btrfs_root_refs(&root_item) == 0)
4530                         goto skip;
4531
4532                 if (!btrfs_is_empty_uuid(root_item.uuid) ||
4533                     !btrfs_is_empty_uuid(root_item.received_uuid)) {
4534                         if (trans)
4535                                 goto update_tree;
4536
4537                         btrfs_release_path(path);
4538                         /*
4539                          * 1 - subvol uuid item
4540                          * 1 - received_subvol uuid item
4541                          */
4542                         trans = btrfs_start_transaction(fs_info->uuid_root, 2);
4543                         if (IS_ERR(trans)) {
4544                                 ret = PTR_ERR(trans);
4545                                 break;
4546                         }
4547                         continue;
4548                 } else {
4549                         goto skip;
4550                 }
4551 update_tree:
4552                 btrfs_release_path(path);
4553                 if (!btrfs_is_empty_uuid(root_item.uuid)) {
4554                         ret = btrfs_uuid_tree_add(trans, root_item.uuid,
4555                                                   BTRFS_UUID_KEY_SUBVOL,
4556                                                   key.objectid);
4557                         if (ret < 0) {
4558                                 btrfs_warn(fs_info, "uuid_tree_add failed %d",
4559                                         ret);
4560                                 break;
4561                         }
4562                 }
4563
4564                 if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
4565                         ret = btrfs_uuid_tree_add(trans,
4566                                                   root_item.received_uuid,
4567                                                  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4568                                                   key.objectid);
4569                         if (ret < 0) {
4570                                 btrfs_warn(fs_info, "uuid_tree_add failed %d",
4571                                         ret);
4572                                 break;
4573                         }
4574                 }
4575
4576 skip:
4577                 btrfs_release_path(path);
4578                 if (trans) {
4579                         ret = btrfs_end_transaction(trans);
4580                         trans = NULL;
4581                         if (ret)
4582                                 break;
4583                 }
4584
4585                 if (key.offset < (u64)-1) {
4586                         key.offset++;
4587                 } else if (key.type < BTRFS_ROOT_ITEM_KEY) {
4588                         key.offset = 0;
4589                         key.type = BTRFS_ROOT_ITEM_KEY;
4590                 } else if (key.objectid < (u64)-1) {
4591                         key.offset = 0;
4592                         key.type = BTRFS_ROOT_ITEM_KEY;
4593                         key.objectid++;
4594                 } else {
4595                         break;
4596                 }
4597                 cond_resched();
4598         }
4599
4600 out:
4601         btrfs_free_path(path);
4602         if (trans && !IS_ERR(trans))
4603                 btrfs_end_transaction(trans);
4604         if (ret)
4605                 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
4606         else if (!closing)
4607                 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
4608         up(&fs_info->uuid_tree_rescan_sem);
4609         return 0;
4610 }
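
/*
 * The key stepping at the bottom of the scan loop above advances the search
 * position through (objectid, type, offset) space.  A minimal sketch of just
 * that stepping, with a hypothetical helper name (illustrative only, not
 * compiled):
 */
#if 0
static bool advance_uuid_scan_key(struct btrfs_key *key)
{
	if (key->offset < (u64)-1) {
		key->offset++;			/* Next offset of the same item. */
	} else if (key->type < BTRFS_ROOT_ITEM_KEY) {
		key->offset = 0;		/* Restart offset at the target type. */
		key->type = BTRFS_ROOT_ITEM_KEY;
	} else if (key->objectid < (u64)-1) {
		key->offset = 0;		/* Move on to the next objectid. */
		key->type = BTRFS_ROOT_ITEM_KEY;
		key->objectid++;
	} else {
		return false;			/* Key space exhausted, stop. */
	}
	return true;
}
#endif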
4611
4612 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
4613 {
4614         struct btrfs_trans_handle *trans;
4615         struct btrfs_root *tree_root = fs_info->tree_root;
4616         struct btrfs_root *uuid_root;
4617         struct task_struct *task;
4618         int ret;
4619
4620         /*
4621          * 1 - root node
4622          * 1 - root item
4623          */
4624         trans = btrfs_start_transaction(tree_root, 2);
4625         if (IS_ERR(trans))
4626                 return PTR_ERR(trans);
4627
4628         uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
4629         if (IS_ERR(uuid_root)) {
4630                 ret = PTR_ERR(uuid_root);
4631                 btrfs_abort_transaction(trans, ret);
4632                 btrfs_end_transaction(trans);
4633                 return ret;
4634         }
4635
4636         fs_info->uuid_root = uuid_root;
4637
4638         ret = btrfs_commit_transaction(trans);
4639         if (ret)
4640                 return ret;
4641
4642         down(&fs_info->uuid_tree_rescan_sem);
4643         task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
4644         if (IS_ERR(task)) {
4645                 /* fs_info->update_uuid_tree_gen remains 0 in all error cases */
4646                 btrfs_warn(fs_info, "failed to start uuid_scan task");
4647                 up(&fs_info->uuid_tree_rescan_sem);
4648                 return PTR_ERR(task);
4649         }
4650
4651         return 0;
4652 }
4653
4654 /*
4655  * Shrinking a device means finding all of the device extents past
4656  * the new size, and then following the back refs to the chunks.
4657  * The chunk relocation code actually frees the device extents.
4658  */
4659 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
4660 {
4661         struct btrfs_fs_info *fs_info = device->fs_info;
4662         struct btrfs_root *root = fs_info->dev_root;
4663         struct btrfs_trans_handle *trans;
4664         struct btrfs_dev_extent *dev_extent = NULL;
4665         struct btrfs_path *path;
4666         u64 length;
4667         u64 chunk_offset;
4668         int ret;
4669         int slot;
4670         int failed = 0;
4671         bool retried = false;
4672         struct extent_buffer *l;
4673         struct btrfs_key key;
4674         struct btrfs_super_block *super_copy = fs_info->super_copy;
4675         u64 old_total = btrfs_super_total_bytes(super_copy);
4676         u64 old_size = btrfs_device_get_total_bytes(device);
4677         u64 diff;
4678         u64 start;
4679
4680         new_size = round_down(new_size, fs_info->sectorsize);
4681         start = new_size;
4682         diff = round_down(old_size - new_size, fs_info->sectorsize);
4683
4684         if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
4685                 return -EINVAL;
4686
4687         path = btrfs_alloc_path();
4688         if (!path)
4689                 return -ENOMEM;
4690
4691         path->reada = READA_BACK;
4692
4693         trans = btrfs_start_transaction(root, 0);
4694         if (IS_ERR(trans)) {
4695                 btrfs_free_path(path);
4696                 return PTR_ERR(trans);
4697         }
4698
4699         mutex_lock(&fs_info->chunk_mutex);
4700
4701         btrfs_device_set_total_bytes(device, new_size);
4702         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
4703                 device->fs_devices->total_rw_bytes -= diff;
4704                 atomic64_sub(diff, &fs_info->free_chunk_space);
4705         }
4706
4707         /*
4708          * Once the device's size has been set to the new size, ensure all
4709          * in-memory chunks are synced to disk so that the loop below sees them
4710          * and relocates them accordingly.
4711          */
4712         if (contains_pending_extent(device, &start, diff)) {
4713                 mutex_unlock(&fs_info->chunk_mutex);
4714                 ret = btrfs_commit_transaction(trans);
4715                 if (ret)
4716                         goto done;
4717         } else {
4718                 mutex_unlock(&fs_info->chunk_mutex);
4719                 btrfs_end_transaction(trans);
4720         }
4721
4722 again:
4723         key.objectid = device->devid;
4724         key.offset = (u64)-1;
4725         key.type = BTRFS_DEV_EXTENT_KEY;
4726
4727         do {
4728                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
4729                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4730                 if (ret < 0) {
4731                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4732                         goto done;
4733                 }
4734
4735                 ret = btrfs_previous_item(root, path, 0, key.type);
4736                 if (ret) {
4737                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4738                         if (ret < 0)
4739                                 goto done;
4740                         ret = 0;
4741                         btrfs_release_path(path);
4742                         break;
4743                 }
4744
4745                 l = path->nodes[0];
4746                 slot = path->slots[0];
4747                 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
4748
4749                 if (key.objectid != device->devid) {
4750                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4751                         btrfs_release_path(path);
4752                         break;
4753                 }
4754
4755                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
4756                 length = btrfs_dev_extent_length(l, dev_extent);
4757
4758                 if (key.offset + length <= new_size) {
4759                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4760                         btrfs_release_path(path);
4761                         break;
4762                 }
4763
4764                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
4765                 btrfs_release_path(path);
4766
4767                 /*
4768                  * We may be relocating the only data chunk we have,
4769                  * which could potentially end up losing the data raid
4770                  * profile, so let's allocate an empty one in
4771                  * advance.
4772                  */
4773                 ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
4774                 if (ret < 0) {
4775                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4776                         goto done;
4777                 }
4778
4779                 ret = btrfs_relocate_chunk(fs_info, chunk_offset);
4780                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4781                 if (ret == -ENOSPC) {
4782                         failed++;
4783                 } else if (ret) {
4784                         if (ret == -ETXTBSY) {
4785                                 btrfs_warn(fs_info,
4786                    "could not shrink block group %llu due to active swapfile",
4787                                            chunk_offset);
4788                         }
4789                         goto done;
4790                 }
4791         } while (key.offset-- > 0);
4792
4793         if (failed && !retried) {
4794                 failed = 0;
4795                 retried = true;
4796                 goto again;
4797         } else if (failed && retried) {
4798                 ret = -ENOSPC;
4799                 goto done;
4800         }
4801
4802         /* Shrinking succeeded, else we would be at "done". */
4803         trans = btrfs_start_transaction(root, 0);
4804         if (IS_ERR(trans)) {
4805                 ret = PTR_ERR(trans);
4806                 goto done;
4807         }
4808
4809         mutex_lock(&fs_info->chunk_mutex);
4810         /* Clear all state bits beyond the shrunk device size */
4811         clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
4812                           CHUNK_STATE_MASK);
4813
4814         btrfs_device_set_disk_total_bytes(device, new_size);
4815         if (list_empty(&device->post_commit_list))
4816                 list_add_tail(&device->post_commit_list,
4817                               &trans->transaction->dev_update_list);
4818
4819         WARN_ON(diff > old_total);
4820         btrfs_set_super_total_bytes(super_copy,
4821                         round_down(old_total - diff, fs_info->sectorsize));
4822         mutex_unlock(&fs_info->chunk_mutex);
4823
4824         /* Now btrfs_update_device() will change the on-disk size. */
4825         ret = btrfs_update_device(trans, device);
4826         if (ret < 0) {
4827                 btrfs_abort_transaction(trans, ret);
4828                 btrfs_end_transaction(trans);
4829         } else {
4830                 ret = btrfs_commit_transaction(trans);
4831         }
4832 done:
4833         btrfs_free_path(path);
4834         if (ret) {
4835                 mutex_lock(&fs_info->chunk_mutex);
4836                 btrfs_device_set_total_bytes(device, old_size);
4837                 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
4838                         device->fs_devices->total_rw_bytes += diff;
4839                 atomic64_add(diff, &fs_info->free_chunk_space);
4840                 mutex_unlock(&fs_info->chunk_mutex);
4841         }
4842         return ret;
4843 }
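
/*
 * Worked example of the rounding at the top of btrfs_shrink_device(), with
 * hypothetical numbers and sectorsize = 4096:
 *
 *   old_size  = 10 GiB                        = 10737418240
 *   requested =  7 GiB + 1000 bytes           =  7516193768
 *   new_size  = round_down(requested, 4096)   =  7516192768 (exactly 7 GiB)
 *   diff      = round_down(old_size - new_size, 4096) = 3221225472 (3 GiB)
 *
 * Both the shrunk device size and the space subtracted from total_rw_bytes
 * and free_chunk_space are therefore sector aligned.
 */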
4844
4845 static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
4846                            struct btrfs_key *key,
4847                            struct btrfs_chunk *chunk, int item_size)
4848 {
4849         struct btrfs_super_block *super_copy = fs_info->super_copy;
4850         struct btrfs_disk_key disk_key;
4851         u32 array_size;
4852         u8 *ptr;
4853
4854         mutex_lock(&fs_info->chunk_mutex);
4855         array_size = btrfs_super_sys_array_size(super_copy);
4856         if (array_size + item_size + sizeof(disk_key)
4857                         > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
4858                 mutex_unlock(&fs_info->chunk_mutex);
4859                 return -EFBIG;
4860         }
4861
4862         ptr = super_copy->sys_chunk_array + array_size;
4863         btrfs_cpu_key_to_disk(&disk_key, key);
4864         memcpy(ptr, &disk_key, sizeof(disk_key));
4865         ptr += sizeof(disk_key);
4866         memcpy(ptr, chunk, item_size);
4867         item_size += sizeof(disk_key);
4868         btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
4869         mutex_unlock(&fs_info->chunk_mutex);
4870
4871         return 0;
4872 }
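
/*
 * Resulting layout of super_copy->sys_chunk_array as appended to by
 * btrfs_add_system_chunk() above: a packed sequence of (disk key, chunk item)
 * pairs, where each chunk item already includes its stripes:
 *
 *   +----------+-------------------+----------+-------------------+ ...
 *   | disk_key | chunk (item_size) | disk_key | chunk (item_size) |
 *   +----------+-------------------+----------+-------------------+ ...
 *
 * Each insertion grows sys_array_size by sizeof(disk_key) + item_size, and
 * -EFBIG is returned once that would exceed BTRFS_SYSTEM_CHUNK_ARRAY_SIZE.
 */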
4873
4874 /*
4875  * Sort the devices in descending order by max_avail, then by total_avail.
4876  */
4877 static int btrfs_cmp_device_info(const void *a, const void *b)
4878 {
4879         const struct btrfs_device_info *di_a = a;
4880         const struct btrfs_device_info *di_b = b;
4881
4882         if (di_a->max_avail > di_b->max_avail)
4883                 return -1;
4884         if (di_a->max_avail < di_b->max_avail)
4885                 return 1;
4886         if (di_a->total_avail > di_b->total_avail)
4887                 return -1;
4888         if (di_a->total_avail < di_b->total_avail)
4889                 return 1;
4890         return 0;
4891 }
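
/*
 * Example of the ordering the comparator above produces (hypothetical values;
 * ties on max_avail fall back to total_avail):
 *
 *   before: {max=1G, total=5G} {max=4G, total=4G} {max=4G, total=8G}
 *   after:  {max=4G, total=8G} {max=4G, total=4G} {max=1G, total=5G}
 *
 * The allocator later reads the stripe size from the last chosen entry, i.e.
 * the smallest max_avail among the devices actually used.
 */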
4892
4893 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
4894 {
4895         if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
4896                 return;
4897
4898         btrfs_set_fs_incompat(info, RAID56);
4899 }
4900
4901 static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
4902 {
4903         if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4)))
4904                 return;
4905
4906         btrfs_set_fs_incompat(info, RAID1C34);
4907 }
4908
4909 /*
4910  * Structure used internally by the btrfs_alloc_chunk() function.
4911  * Wraps the needed parameters.
4912  */
4913 struct alloc_chunk_ctl {
4914         u64 start;
4915         u64 type;
4916         /* Total number of stripes to allocate */
4917         int num_stripes;
4918         /* sub_stripes info for map */
4919         int sub_stripes;
4920         /* Stripes per device */
4921         int dev_stripes;
4922         /* Maximum number of devices to use */
4923         int devs_max;
4924         /* Minimum number of devices to use */
4925         int devs_min;
4926         /* ndevs has to be a multiple of this */
4927         int devs_increment;
4928         /* Number of copies */
4929         int ncopies;
4930         /* Number of stripes worth of bytes to store parity information */
4931         int nparity;
4932         u64 max_stripe_size;
4933         u64 max_chunk_size;
4934         u64 dev_extent_min;
4935         u64 stripe_size;
4936         u64 chunk_size;
4937         int ndevs;
4938 };
4939
4940 static void init_alloc_chunk_ctl_policy_regular(
4941                                 struct btrfs_fs_devices *fs_devices,
4942                                 struct alloc_chunk_ctl *ctl)
4943 {
4944         u64 type = ctl->type;
4945
4946         if (type & BTRFS_BLOCK_GROUP_DATA) {
4947                 ctl->max_stripe_size = SZ_1G;
4948                 ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
4949         } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
4950                 /* For larger filesystems, use larger metadata chunks */
4951                 if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
4952                         ctl->max_stripe_size = SZ_1G;
4953                 else
4954                         ctl->max_stripe_size = SZ_256M;
4955                 ctl->max_chunk_size = ctl->max_stripe_size;
4956         } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
4957                 ctl->max_stripe_size = SZ_32M;
4958                 ctl->max_chunk_size = 2 * ctl->max_stripe_size;
4959                 ctl->devs_max = min_t(int, ctl->devs_max,
4960                                       BTRFS_MAX_DEVS_SYS_CHUNK);
4961         } else {
4962                 BUG();
4963         }
4964
4965         /* We don't want a chunk larger than 10% of writable space */
4966         ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
4967                                   ctl->max_chunk_size);
4968         ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
4969 }
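
/*
 * Worked example of the 10% cap above.  div_factor(x, 1) evaluates to x / 10,
 * so on a hypothetical filesystem with 40 GiB of writable space a data chunk
 * (default max of BTRFS_MAX_DATA_CHUNK_SIZE, 10 GiB) is capped as:
 *
 *   max_chunk_size = min(div_factor(40 GiB, 1), 10 GiB)
 *                  = min(4 GiB, 10 GiB) = 4 GiB
 *
 * so small filesystems get proportionally smaller data chunks.
 */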
4970
4971 static void init_alloc_chunk_ctl_policy_zoned(
4972                                       struct btrfs_fs_devices *fs_devices,
4973                                       struct alloc_chunk_ctl *ctl)
4974 {
4975         u64 zone_size = fs_devices->fs_info->zone_size;
4976         u64 limit;
4977         int min_num_stripes = ctl->devs_min * ctl->dev_stripes;
4978         int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies;
4979         u64 min_chunk_size = min_data_stripes * zone_size;
4980         u64 type = ctl->type;
4981
4982         ctl->max_stripe_size = zone_size;
4983         if (type & BTRFS_BLOCK_GROUP_DATA) {
4984                 ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE,
4985                                                  zone_size);
4986         } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
4987                 ctl->max_chunk_size = ctl->max_stripe_size;
4988         } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
4989                 ctl->max_chunk_size = 2 * ctl->max_stripe_size;
4990                 ctl->devs_max = min_t(int, ctl->devs_max,
4991                                       BTRFS_MAX_DEVS_SYS_CHUNK);
4992         }
4993
4994         /* We don't want a chunk larger than 10% of writable space */
4995         limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1),
4996                                zone_size),
4997                     min_chunk_size);
4998         ctl->max_chunk_size = min(limit, ctl->max_chunk_size);
4999         ctl->dev_extent_min = zone_size * ctl->dev_stripes;
5000 }
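
/*
 * Worked example for the zoned limits above, assuming a hypothetical
 * zone_size of 256 MiB and a SINGLE data profile (devs_min = 1,
 * dev_stripes = 1, ncopies = 1, nparity = 0):
 *
 *   min_num_stripes  = 1 * 1 = 1
 *   min_data_stripes = (1 - 0) / 1 = 1
 *   min_chunk_size   = 1 * 256 MiB = 256 MiB
 *
 * With 100 GiB writable, limit = max(round_down(10 GiB, 256 MiB), 256 MiB)
 * = 10 GiB, so max_chunk_size keeps the (zone-aligned) data default.
 */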
5001
5002 static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
5003                                  struct alloc_chunk_ctl *ctl)
5004 {
5005         int index = btrfs_bg_flags_to_raid_index(ctl->type);
5006
5007         ctl->sub_stripes = btrfs_raid_array[index].sub_stripes;
5008         ctl->dev_stripes = btrfs_raid_array[index].dev_stripes;
5009         ctl->devs_max = btrfs_raid_array[index].devs_max;
5010         if (!ctl->devs_max)
5011                 ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info);
5012         ctl->devs_min = btrfs_raid_array[index].devs_min;
5013         ctl->devs_increment = btrfs_raid_array[index].devs_increment;
5014         ctl->ncopies = btrfs_raid_array[index].ncopies;
5015         ctl->nparity = btrfs_raid_array[index].nparity;
5016         ctl->ndevs = 0;
5017
5018         switch (fs_devices->chunk_alloc_policy) {
5019         case BTRFS_CHUNK_ALLOC_REGULAR:
5020                 init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
5021                 break;
5022         case BTRFS_CHUNK_ALLOC_ZONED:
5023                 init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl);
5024                 break;
5025         default:
5026                 BUG();
5027         }
5028 }
5029
5030 static int gather_device_info(struct btrfs_fs_devices *fs_devices,
5031                               struct alloc_chunk_ctl *ctl,
5032                               struct btrfs_device_info *devices_info)
5033 {
5034         struct btrfs_fs_info *info = fs_devices->fs_info;
5035         struct btrfs_device *device;
5036         u64 total_avail;
5037         u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
5038         int ret;
5039         int ndevs = 0;
5040         u64 max_avail;
5041         u64 dev_offset;
5042
5043         /*
5044          * In the first pass through the devices list, we gather information
5045          * about the available holes on each device.
5046          */
5047         list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
5048                 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
5049                         WARN(1, KERN_ERR
5050                                "BTRFS: read-only device in alloc_list\n");
5051                         continue;
5052                 }
5053
5054                 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
5055                                         &device->dev_state) ||
5056                     test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
5057                         continue;
5058
5059                 if (device->total_bytes > device->bytes_used)
5060                         total_avail = device->total_bytes - device->bytes_used;
5061                 else
5062                         total_avail = 0;
5063
5064                 /* If there is no space on this device, skip it. */
5065                 if (total_avail < ctl->dev_extent_min)
5066                         continue;
5067
5068                 ret = find_free_dev_extent(device, dev_extent_want, &dev_offset,
5069                                            &max_avail);
5070                 if (ret && ret != -ENOSPC)
5071                         return ret;
5072
5073                 if (ret == 0)
5074                         max_avail = dev_extent_want;
5075
5076                 if (max_avail < ctl->dev_extent_min) {
5077                         if (btrfs_test_opt(info, ENOSPC_DEBUG))
5078                                 btrfs_debug(info,
5079                         "%s: devid %llu has no free space, have=%llu want=%llu",
5080                                             __func__, device->devid, max_avail,
5081                                             ctl->dev_extent_min);
5082                         continue;
5083                 }
5084
5085                 if (ndevs == fs_devices->rw_devices) {
5086                         WARN(1, "%s: found more than %llu devices\n",
5087                              __func__, fs_devices->rw_devices);
5088                         break;
5089                 }
5090                 devices_info[ndevs].dev_offset = dev_offset;
5091                 devices_info[ndevs].max_avail = max_avail;
5092                 devices_info[ndevs].total_avail = total_avail;
5093                 devices_info[ndevs].dev = device;
5094                 ++ndevs;
5095         }
5096         ctl->ndevs = ndevs;
5097
5098         /*
5099          * Now sort the devices by hole size / available space.
5100          */
5101         sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
5102              btrfs_cmp_device_info, NULL);
5103
5104         return 0;
5105 }
5106
5107 static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
5108                                       struct btrfs_device_info *devices_info)
5109 {
5110         /* Number of stripes that count for block group size */
5111         int data_stripes;
5112
5113         /*
5114          * The primary goal is to maximize the number of stripes, so use as
5115          * many devices as possible, even if the stripes are not maximum sized.
5116          *
5117          * The DUP profile stores more than one stripe per device, and
5118          * max_avail is the total size, so we have to adjust.
5119          */
5120         ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
5121                                    ctl->dev_stripes);
5122         ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5123
5124         /* This will have to be fixed for RAID1 and RAID10 over more drives */
5125         data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
5126
5127         /*
5128          * Use the number of data stripes to figure out how big this chunk is
5129          * really going to be in terms of logical address space, and compare
5130          * that answer with the max chunk size. If it's higher, we try to
5131          * reduce stripe_size.
5132          */
5133         if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
5134                 /*
5135                  * Reduce stripe_size, round it up to a 16MB boundary again and
5136                  * then use it, unless it ends up being even bigger than the
5137                  * previous value we had already.
5138                  * previous value we already had.
5139                 ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
5140                                                         data_stripes), SZ_16M),
5141                                        ctl->stripe_size);
5142         }
5143
5144         /* Align to BTRFS_STRIPE_LEN */
5145         ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
5146         ctl->chunk_size = ctl->stripe_size * data_stripes;
5147
5148         return 0;
5149 }
5150
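
/*
 * Worked example of the sizing above for a hypothetical striped profile
 * (ncopies = 1, nparity = 0, dev_stripes = 1, max_chunk_size = 10 GiB),
 * where the smallest max_avail among the chosen devices is 1 GiB:
 *
 *   4 devices:  stripe_size = 1 GiB, data_stripes = 4,
 *               chunk = 4 GiB <= 10 GiB, no reduction needed.
 *
 *   16 devices: tentative chunk = 16 GiB > 10 GiB, so
 *               stripe_size = min(round_up(10 GiB / 16, SZ_16M), 1 GiB)
 *                           = 640 MiB
 *               chunk = 16 * 640 MiB = 10 GiB.
 */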
5151 static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
5152                                     struct btrfs_device_info *devices_info)
5153 {
5154         u64 zone_size = devices_info[0].dev->zone_info->zone_size;
5155         /* Number of stripes that count for block group size */
5156         int data_stripes;
5157
5158         /*
5159          * It should hold because:
5160          *    dev_extent_min == dev_extent_want == zone_size * dev_stripes
5161          */
5162         ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min);
5163
5164         ctl->stripe_size = zone_size;
5165         ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5166         data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
5167
5168         /* stripe_size is fixed in a zoned filesystem. Reduce ndevs instead. */
5169         if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
5170                 ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies,
5171                                              ctl->stripe_size) + ctl->nparity,
5172                                      ctl->dev_stripes);
5173                 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5174                 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
5175                 ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size);
5176         }
5177
5178         ctl->chunk_size = ctl->stripe_size * data_stripes;
5179
5180         return 0;
5181 }
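
/*
 * Worked example of the zoned reduction above (hypothetical: SINGLE data
 * profile, zone_size = 256 MiB, max_chunk_size = 10 GiB, 50 usable devices):
 *
 *   tentative chunk = 50 * 256 MiB = 12.5 GiB > 10 GiB
 *   ndevs = ((10 GiB * 1) / 256 MiB + 0) / 1 = 40
 *   chunk_size = 40 * 256 MiB = 10 GiB
 *
 * Since stripe_size is pinned to the zone size, shrinking the device count
 * is the only way to respect max_chunk_size.
 */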
5182
5183 static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
5184                               struct alloc_chunk_ctl *ctl,
5185                               struct btrfs_device_info *devices_info)
5186 {
5187         struct btrfs_fs_info *info = fs_devices->fs_info;
5188
5189         /*
5190          * Round down to a usable number of devices: devs_increment can be
5191          * any number, so we can't use round_down(), which requires a power
5192          * of 2; rounddown() is safe here.
5193          */
5194         ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment);
5195
5196         if (ctl->ndevs < ctl->devs_min) {
5197                 if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
5198                         btrfs_debug(info,
5199         "%s: not enough devices with free space: have=%d minimum required=%d",
5200                                     __func__, ctl->ndevs, ctl->devs_min);
5201                 }
5202                 return -ENOSPC;
5203         }
5204
5205         ctl->ndevs = min(ctl->ndevs, ctl->devs_max);
5206
5207         switch (fs_devices->chunk_alloc_policy) {
5208         case BTRFS_CHUNK_ALLOC_REGULAR:
5209                 return decide_stripe_size_regular(ctl, devices_info);
5210         case BTRFS_CHUNK_ALLOC_ZONED:
5211                 return decide_stripe_size_zoned(ctl, devices_info);
5212         default:
5213                 BUG();
5214         }
5215 }
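
/*
 * Example of the devs_increment rounding above: with a hypothetical
 * devs_increment of 2 (a profile that needs devices in pairs) and 5 devices
 * with enough free space, rounddown(5, 2) = 4, so one device is simply left
 * out of this chunk rather than failing the allocation.
 */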
5216
5217 static int create_chunk(struct btrfs_trans_handle *trans,
5218                         struct alloc_chunk_ctl *ctl,
5219                         struct btrfs_device_info *devices_info)
5220 {
5221         struct btrfs_fs_info *info = trans->fs_info;
5222         struct map_lookup *map = NULL;
5223         struct extent_map_tree *em_tree;
5224         struct extent_map *em;
5225         u64 start = ctl->start;
5226         u64 type = ctl->type;
5227         int ret;
5228         int i;
5229         int j;
5230
5231         map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
5232         if (!map)
5233                 return -ENOMEM;
5234         map->num_stripes = ctl->num_stripes;
5235
5236         for (i = 0; i < ctl->ndevs; ++i) {
5237                 for (j = 0; j < ctl->dev_stripes; ++j) {
5238                         int s = i * ctl->dev_stripes + j;
5239                         map->stripes[s].dev = devices_info[i].dev;
5240                         map->stripes[s].physical = devices_info[i].dev_offset +
5241                                                    j * ctl->stripe_size;
5242                 }
5243         }
5244         map->stripe_len = BTRFS_STRIPE_LEN;
5245         map->io_align = BTRFS_STRIPE_LEN;
5246         map->io_width = BTRFS_STRIPE_LEN;
5247         map->type = type;
5248         map->sub_stripes = ctl->sub_stripes;
5249
5250         trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);
5251
5252         em = alloc_extent_map();
5253         if (!em) {
5254                 kfree(map);
5255                 return -ENOMEM;
5256         }
5257         set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
5258         em->map_lookup = map;
5259         em->start = start;
5260         em->len = ctl->chunk_size;
5261         em->block_start = 0;
5262         em->block_len = em->len;
5263         em->orig_block_len = ctl->stripe_size;
5264
5265         em_tree = &info->mapping_tree;
5266         write_lock(&em_tree->lock);
5267         ret = add_extent_mapping(em_tree, em, 0);
5268         if (ret) {
5269                 write_unlock(&em_tree->lock);
5270                 free_extent_map(em);
5271                 return ret;
5272         }
5273         write_unlock(&em_tree->lock);
5274
5275         ret = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
5276         if (ret)
5277                 goto error_del_extent;
5278
5279         for (i = 0; i < map->num_stripes; i++) {
5280                 struct btrfs_device *dev = map->stripes[i].dev;
5281
5282                 btrfs_device_set_bytes_used(dev,
5283                                             dev->bytes_used + ctl->stripe_size);
5284                 if (list_empty(&dev->post_commit_list))
5285                         list_add_tail(&dev->post_commit_list,
5286                                       &trans->transaction->dev_update_list);
5287         }
5288
5289         atomic64_sub(ctl->stripe_size * map->num_stripes,
5290                      &info->free_chunk_space);
5291
5292         free_extent_map(em);
5293         check_raid56_incompat_flag(info, type);
5294         check_raid1c34_incompat_flag(info, type);
5295
5296         return 0;
5297
5298 error_del_extent:
5299         write_lock(&em_tree->lock);
5300         remove_extent_mapping(em_tree, em);
5301         write_unlock(&em_tree->lock);
5302
5303         /* One for our allocation */
5304         free_extent_map(em);
5305         /* One for the tree reference */
5306         free_extent_map(em);
5307
5308         return ret;
5309 }
5310
5311 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
5312 {
5313         struct btrfs_fs_info *info = trans->fs_info;
5314         struct btrfs_fs_devices *fs_devices = info->fs_devices;
5315         struct btrfs_device_info *devices_info = NULL;
5316         struct alloc_chunk_ctl ctl;
5317         int ret;
5318
5319         lockdep_assert_held(&info->chunk_mutex);
5320
5321         if (!alloc_profile_is_valid(type, 0)) {
5322                 ASSERT(0);
5323                 return -EINVAL;
5324         }
5325
5326         if (list_empty(&fs_devices->alloc_list)) {
5327                 if (btrfs_test_opt(info, ENOSPC_DEBUG))
5328                         btrfs_debug(info, "%s: no writable device", __func__);
5329                 return -ENOSPC;
5330         }
5331
5332         if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
5333                 btrfs_err(info, "invalid chunk type 0x%llx requested", type);
5334                 ASSERT(0);
5335                 return -EINVAL;
5336         }
5337
5338         ctl.start = find_next_chunk(info);
5339         ctl.type = type;
5340         init_alloc_chunk_ctl(fs_devices, &ctl);
5341
5342         devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
5343                                GFP_NOFS);
5344         if (!devices_info)
5345                 return -ENOMEM;
5346
5347         ret = gather_device_info(fs_devices, &ctl, devices_info);
5348         if (ret < 0)
5349                 goto out;
5350
5351         ret = decide_stripe_size(fs_devices, &ctl, devices_info);
5352         if (ret < 0)
5353                 goto out;
5354
5355         ret = create_chunk(trans, &ctl, devices_info);
5356
5357 out:
5358         kfree(devices_info);
5359         return ret;
5360 }
5361
5362 /*
5363  * Chunk allocation falls into two parts. The first part does the work that
5364  * makes the newly allocated chunk usable, but does not do any operation that
5365  * modifies the chunk tree. The second part does the work that requires
5366  * modifying the chunk tree. This division is important for the bootstrap
5367  * process of adding storage to a seed btrfs; see the sketch after this function.
5368  */
5369 int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
5370                              u64 chunk_offset, u64 chunk_size)
5371 {
5372         struct btrfs_fs_info *fs_info = trans->fs_info;
5373         struct btrfs_root *extent_root = fs_info->extent_root;
5374         struct btrfs_root *chunk_root = fs_info->chunk_root;
5375         struct btrfs_key key;
5376         struct btrfs_device *device;
5377         struct btrfs_chunk *chunk;
5378         struct btrfs_stripe *stripe;
5379         struct extent_map *em;
5380         struct map_lookup *map;
5381         size_t item_size;
5382         u64 dev_offset;
5383         u64 stripe_size;
5384         int i = 0;
5385         int ret = 0;
5386
5387         em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
5388         if (IS_ERR(em))
5389                 return PTR_ERR(em);
5390
5391         map = em->map_lookup;
5392         item_size = btrfs_chunk_item_size(map->num_stripes);
5393         stripe_size = em->orig_block_len;
5394
5395         chunk = kzalloc(item_size, GFP_NOFS);
5396         if (!chunk) {
5397                 ret = -ENOMEM;
5398                 goto out;
5399         }
5400
5401         /*
5402          * Take the device list mutex to prevent races with the final phase of
5403          * a device replace operation that replaces the device object associated
5404          * with the map's stripes, because the device object's id can change
5405          * at any time during that final phase of the device replace operation
5406          * (dev-replace.c:btrfs_dev_replace_finishing()).
5407          */
5408         mutex_lock(&fs_info->fs_devices->device_list_mutex);
5409         for (i = 0; i < map->num_stripes; i++) {
5410                 device = map->stripes[i].dev;
5411                 dev_offset = map->stripes[i].physical;
5412
5413                 ret = btrfs_update_device(trans, device);
5414                 if (ret)
5415                         break;
5416                 ret = btrfs_alloc_dev_extent(trans, device, chunk_offset,
5417                                              dev_offset, stripe_size);
5418                 if (ret)
5419                         break;
5420         }
5421         if (ret) {
5422                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
5423                 goto out;
5424         }
5425
5426         stripe = &chunk->stripe;
5427         for (i = 0; i < map->num_stripes; i++) {
5428                 device = map->stripes[i].dev;
5429                 dev_offset = map->stripes[i].physical;
5430
5431                 btrfs_set_stack_stripe_devid(stripe, device->devid);
5432                 btrfs_set_stack_stripe_offset(stripe, dev_offset);
5433                 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
5434                 stripe++;
5435         }
5436         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
5437
5438         btrfs_set_stack_chunk_length(chunk, chunk_size);
5439         btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
5440         btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
5441         btrfs_set_stack_chunk_type(chunk, map->type);
5442         btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
5443         btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
5444         btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
5445         btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
5446         btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
5447
5448         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
5449         key.type = BTRFS_CHUNK_ITEM_KEY;
5450         key.offset = chunk_offset;
5451
5452         ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
5453         if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
5454                 /*
5455                  * TODO: Cleanup of inserted chunk root in case of
5456                  * failure.
5457                  */
5458                 ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
5459         }
5460
5461 out:
5462         kfree(chunk);
5463         free_extent_map(em);
5464         return ret;
5465 }
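
/*
 * Rough sketch of the two-phase flow described above, as seen from a caller
 * (illustrative only, not compiled; error handling omitted):
 */
#if 0
	/* Phase 1: allocate the chunk in memory, no chunk tree changes. */
	ret = btrfs_alloc_chunk(trans, type);

	/*
	 * Phase 2, possibly much later (e.g. when pending block groups are
	 * created): insert the chunk item and dev extents, and mirror SYSTEM
	 * chunks into the superblock's sys_chunk_array.
	 */
	ret = btrfs_finish_chunk_alloc(trans, chunk_offset, chunk_size);
#endif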
5466
5467 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
5468 {
5469         struct btrfs_fs_info *fs_info = trans->fs_info;
5470         u64 alloc_profile;
5471         int ret;
5472
5473         alloc_profile = btrfs_metadata_alloc_profile(fs_info);
5474         ret = btrfs_alloc_chunk(trans, alloc_profile);
5475         if (ret)
5476                 return ret;
5477
5478         alloc_profile = btrfs_system_alloc_profile(fs_info);
5479         ret = btrfs_alloc_chunk(trans, alloc_profile);
5480         return ret;
5481 }
5482
5483 static inline int btrfs_chunk_max_errors(struct map_lookup *map)
5484 {
5485         const int index = btrfs_bg_flags_to_raid_index(map->type);
5486
5487         return btrfs_raid_array[index].tolerated_failures;
5488 }
5489
5490 int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
5491 {
5492         struct extent_map *em;
5493         struct map_lookup *map;
5494         int readonly = 0;
5495         int miss_ndevs = 0;
5496         int i;
5497
5498         em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
5499         if (IS_ERR(em))
5500                 return 1;
5501
5502         map = em->map_lookup;
5503         for (i = 0; i < map->num_stripes; i++) {
5504                 if (test_bit(BTRFS_DEV_STATE_MISSING,
5505                                         &map->stripes[i].dev->dev_state)) {
5506                         miss_ndevs++;
5507                         continue;
5508                 }
5509                 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
5510                                         &map->stripes[i].dev->dev_state)) {
5511                         readonly = 1;
5512                         goto end;
5513                 }
5514         }
5515
5516         /*
5517          * If the number of missing devices is larger than max errors,
5518          * we cannot write the data into that chunk successfully, so
5519          * set it read-only.
5520          */
5521         if (miss_ndevs > btrfs_chunk_max_errors(map))
5522                 readonly = 1;
5523 end:
5524         free_extent_map(em);
5525         return readonly;
5526 }
5527
5528 void btrfs_mapping_tree_free(struct extent_map_tree *tree)
5529 {
5530         struct extent_map *em;
5531
5532         while (1) {
5533                 write_lock(&tree->lock);
5534                 em = lookup_extent_mapping(tree, 0, (u64)-1);
5535                 if (em)
5536                         remove_extent_mapping(tree, em);
5537                 write_unlock(&tree->lock);
5538                 if (!em)
5539                         break;
5540                 /* once for us */
5541                 free_extent_map(em);
5542                 /* once for the tree */
5543                 free_extent_map(em);
5544         }
5545 }
5546
5547 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5548 {
5549         struct extent_map *em;
5550         struct map_lookup *map;
5551         int ret;
5552
5553         em = btrfs_get_chunk_map(fs_info, logical, len);
5554         if (IS_ERR(em))
5555                 /*
5556                  * We could return errors for these cases, but that could get
5557                  * ugly, and we'd probably do the same thing anyway: just do
5558                  * nothing else and exit. So return 1 so the callers don't try
5559                  * to use other copies.
5560                  */
5561                 return 1;
5562
5563         map = em->map_lookup;
5564         if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK))
5565                 ret = map->num_stripes;
5566         else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5567                 ret = map->sub_stripes;
5568         else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
5569                 ret = 2;
5570         else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
5571                 /*
5572                  * There could be two corrupted data stripes, so we need
5573                  * to retry in a loop in order to rebuild the correct data.
5574                  *
5575                  * Fail a stripe at a time on every retry except the
5576                  * stripe under reconstruction.
5577                  */
5578                 ret = map->num_stripes;
5579         else
5580                 ret = 1;
5581         free_extent_map(em);
5582
5583         down_read(&fs_info->dev_replace.rwsem);
5584         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
5585             fs_info->dev_replace.tgtdev)
5586                 ret++;
5587         up_read(&fs_info->dev_replace.rwsem);
5588
5589         return ret;
5590 }
5591
5592 unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
5593                                     u64 logical)
5594 {
5595         struct extent_map *em;
5596         struct map_lookup *map;
5597         unsigned long len = fs_info->sectorsize;
5598
5599         em = btrfs_get_chunk_map(fs_info, logical, len);
5600
5601         if (!WARN_ON(IS_ERR(em))) {
5602                 map = em->map_lookup;
5603                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5604                         len = map->stripe_len * nr_data_stripes(map);
5605                 free_extent_map(em);
5606         }
5607         return len;
5608 }
5609
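/* Return 1 if the chunk at @logical uses a parity profile (RAID5/6). */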
5610 int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5611 {
5612         struct extent_map *em;
5613         struct map_lookup *map;
5614         int ret = 0;
5615
5616         em = btrfs_get_chunk_map(fs_info, logical, len);
5617
5618         if (!WARN_ON(IS_ERR(em))) {
5619                 map = em->map_lookup;
5620                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5621                         ret = 1;
5622                 free_extent_map(em);
5623         }
5624         return ret;
5625 }
5626
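/*
 * Pick a live mirror to read from.  The configured read policy selects a
 * preferred stripe; stripes whose device is missing (no bdev) are skipped,
 * and the source device of a running replace may be avoided unless it is
 * the only readable copy left.
 */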
5627 static int find_live_mirror(struct btrfs_fs_info *fs_info,
5628                             struct map_lookup *map, int first,
5629                             int dev_replace_is_ongoing)
5630 {
5631         int i;
5632         int num_stripes;
5633         int preferred_mirror;
5634         int tolerance;
5635         struct btrfs_device *srcdev;
5636
5637         ASSERT((map->type &
5638                  (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));
5639
5640         if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5641                 num_stripes = map->sub_stripes;
5642         else
5643                 num_stripes = map->num_stripes;
5644
5645         switch (fs_info->fs_devices->read_policy) {
5646         default:
5647                 /* Shouldn't happen, just warn and use pid instead of failing */
5648                 btrfs_warn_rl(fs_info,
5649                               "unknown read_policy type %u, reset to pid",
5650                               fs_info->fs_devices->read_policy);
5651                 fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID;
5652                 fallthrough;
5653         case BTRFS_READ_POLICY_PID:
5654                 preferred_mirror = first + (current->pid % num_stripes);
5655                 break;
5656         }
5657
5658         if (dev_replace_is_ongoing &&
5659             fs_info->dev_replace.cont_reading_from_srcdev_mode ==
5660              BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
5661                 srcdev = fs_info->dev_replace.srcdev;
5662         else
5663                 srcdev = NULL;
5664
5665         /*
5666          * try to avoid the drive that is the source drive for a
5667          * dev-replace procedure, only choose it if no other non-missing
5668          * mirror is available
5669          */
5670         for (tolerance = 0; tolerance < 2; tolerance++) {
5671                 if (map->stripes[preferred_mirror].dev->bdev &&
5672                     (tolerance || map->stripes[preferred_mirror].dev != srcdev))
5673                         return preferred_mirror;
5674                 for (i = first; i < first + num_stripes; i++) {
5675                         if (map->stripes[i].dev->bdev &&
5676                             (tolerance || map->stripes[i].dev != srcdev))
5677                                 return i;
5678                 }
5679         }
5680
5681         /* We couldn't find one that doesn't fail.  Just return something
5682          * and the IO error handling code will clean up eventually.
5683          */
5684         return preferred_mirror;
5685 }
5686
5687 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */
5688 static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
5689 {
5690         int i;
5691         int again = 1;
5692
5693         while (again) {
5694                 again = 0;
5695                 for (i = 0; i < num_stripes - 1; i++) {
5696                         /* Swap if parity is on a smaller index */
5697                         if (bbio->raid_map[i] > bbio->raid_map[i + 1]) {
5698                                 swap(bbio->stripes[i], bbio->stripes[i + 1]);
5699                                 swap(bbio->raid_map[i], bbio->raid_map[i + 1]);
5700                                 again = 1;
5701                         }
5702                 }
5703         }
5704 }
5705
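/*
 * Allocate a btrfs_bio and its trailing arrays as a single allocation:
 * @total_stripes btrfs_bio_stripe entries, @real_stripes tgtdev map slots
 * and @total_stripes raid_map entries, laid out in that order.
 */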
5706 static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
5707 {
5708         struct btrfs_bio *bbio = kzalloc(
5709                  /* the size of the btrfs_bio */
5710                 sizeof(struct btrfs_bio) +
5711                 /* plus the variable array for the stripes */
5712                 sizeof(struct btrfs_bio_stripe) * (total_stripes) +
5713                 /* plus the variable array for the tgt dev */
5714                 sizeof(int) * (real_stripes) +
5715                 /*
5716                  * plus the raid_map, which includes both the tgt dev
5717                  * and the stripes
5718                  */
5719                 sizeof(u64) * (total_stripes),
5720                 GFP_NOFS|__GFP_NOFAIL);
5721
5722         atomic_set(&bbio->error, 0);
5723         refcount_set(&bbio->refs, 1);
5724
5725         bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes);
5726         bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes);
5727
5728         return bbio;
5729 }
5730
5731 void btrfs_get_bbio(struct btrfs_bio *bbio)
5732 {
5733         WARN_ON(!refcount_read(&bbio->refs));
5734         refcount_inc(&bbio->refs);
5735 }
5736
5737 void btrfs_put_bbio(struct btrfs_bio *bbio)
5738 {
5739         if (!bbio)
5740                 return;
5741         if (refcount_dec_and_test(&bbio->refs))
5742                 kfree(bbio);
5743 }
5744
5745 /* Can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
5746 /*
5747  * Note that a discard won't be sent to the target device of a device
5748  * replace.
5749  */
5750 static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
5751                                          u64 logical, u64 *length_ret,
5752                                          struct btrfs_bio **bbio_ret)
5753 {
5754         struct extent_map *em;
5755         struct map_lookup *map;
5756         struct btrfs_bio *bbio;
5757         u64 length = *length_ret;
5758         u64 offset;
5759         u64 stripe_nr;
5760         u64 stripe_nr_end;
5761         u64 stripe_end_offset;
5762         u64 stripe_cnt;
5763         u64 stripe_len;
5764         u64 stripe_offset;
5765         u64 num_stripes;
5766         u32 stripe_index;
5767         u32 factor = 0;
5768         u32 sub_stripes = 0;
5769         u64 stripes_per_dev = 0;
5770         u32 remaining_stripes = 0;
5771         u32 last_stripe = 0;
5772         int ret = 0;
5773         int i;
5774
5775         /* Discard always returns a bbio */
5776         ASSERT(bbio_ret);
5777
5778         em = btrfs_get_chunk_map(fs_info, logical, length);
5779         if (IS_ERR(em))
5780                 return PTR_ERR(em);
5781
5782         map = em->map_lookup;
5783         /* we don't discard raid56 yet */
5784         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5785                 ret = -EOPNOTSUPP;
5786                 goto out;
5787         }
5788
5789         offset = logical - em->start;
5790         length = min_t(u64, em->start + em->len - logical, length);
5791         *length_ret = length;
5792
5793         stripe_len = map->stripe_len;
5794         /*
5795          * stripe_nr counts the total number of stripes we have to stride
5796          * to get to this block
5797          */
5798         stripe_nr = div64_u64(offset, stripe_len);
5799
5800         /* stripe_offset is the offset of this block in its stripe */
5801         stripe_offset = offset - stripe_nr * stripe_len;
5802
5803         stripe_nr_end = round_up(offset + length, map->stripe_len);
5804         stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
5805         stripe_cnt = stripe_nr_end - stripe_nr;
5806         stripe_end_offset = stripe_nr_end * map->stripe_len -
5807                             (offset + length);
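        /*
         * Example: with a 64K stripe_len, offset 96K and length 160K give
         * stripe_nr = 1, stripe_offset = 32K, stripe_nr_end = 4,
         * stripe_cnt = 3 and stripe_end_offset = 0.
         */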
5808         /*
5809          * after this, stripe_nr is the number of stripes on this
5810          * device we have to walk to find the data, and stripe_index is
5811          * the number of our device in the stripe array
5812          */
5813         num_stripes = 1;
5814         stripe_index = 0;
5815         if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
5816                          BTRFS_BLOCK_GROUP_RAID10)) {
5817                 if (map->type & BTRFS_BLOCK_GROUP_RAID0)
5818                         sub_stripes = 1;
5819                 else
5820                         sub_stripes = map->sub_stripes;
5821
5822                 factor = map->num_stripes / sub_stripes;
5823                 num_stripes = min_t(u64, map->num_stripes,
5824                                     sub_stripes * stripe_cnt);
5825                 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
5826                 stripe_index *= sub_stripes;
5827                 stripes_per_dev = div_u64_rem(stripe_cnt, factor,
5828                                               &remaining_stripes);
5829                 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
5830                 last_stripe *= sub_stripes;
5831         } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
5832                                 BTRFS_BLOCK_GROUP_DUP)) {
5833                 num_stripes = map->num_stripes;
5834         } else {
5835                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
5836                                         &stripe_index);
5837         }
5838
5839         bbio = alloc_btrfs_bio(num_stripes, 0);
5840         if (!bbio) {
5841                 ret = -ENOMEM;
5842                 goto out;
5843         }
5844
5845         for (i = 0; i < num_stripes; i++) {
5846                 bbio->stripes[i].physical =
5847                         map->stripes[stripe_index].physical +
5848                         stripe_offset + stripe_nr * map->stripe_len;
5849                 bbio->stripes[i].dev = map->stripes[stripe_index].dev;
5850
5851                 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
5852                                  BTRFS_BLOCK_GROUP_RAID10)) {
5853                         bbio->stripes[i].length = stripes_per_dev *
5854                                 map->stripe_len;
5855
5856                         if (i / sub_stripes < remaining_stripes)
5857                                 bbio->stripes[i].length +=
5858                                         map->stripe_len;
5859
5860                         /*
5861                          * Special for the first stripe and
5862                          * the last stripe:
5863                          *
5864                          * |-------|...|-------|
5865                          *     |----------|
5866                          *    off     end_off
5867                          */
5868                         if (i < sub_stripes)
5869                                 bbio->stripes[i].length -=
5870                                         stripe_offset;
5871
5872                         if (stripe_index >= last_stripe &&
5873                             stripe_index <= (last_stripe +
5874                                              sub_stripes - 1))
5875                                 bbio->stripes[i].length -=
5876                                         stripe_end_offset;
5877
5878                         if (i == sub_stripes - 1)
5879                                 stripe_offset = 0;
5880                 } else {
5881                         bbio->stripes[i].length = length;
5882                 }
5883
5884                 stripe_index++;
5885                 if (stripe_index == map->num_stripes) {
5886                         stripe_index = 0;
5887                         stripe_nr++;
5888                 }
5889         }
5890
5891         *bbio_ret = bbio;
5892         bbio->map_type = map->type;
5893         bbio->num_stripes = num_stripes;
5894 out:
5895         free_extent_map(em);
5896         return ret;
5897 }
5898
5899 /*
5900  * In dev-replace case, for repair case (that's the only case where the mirror
5901  * is selected explicitly when calling btrfs_map_block), blocks left of the
5902  * left cursor can also be read from the target drive.
5903  *
5904  * For BTRFS_MAP_GET_READ_MIRRORS, the target drive is added as the last one to the
5905  * array of stripes.
5906  * For READ, it also needs to be supported using the same mirror number.
5907  *
5908  * If the requested block is not left of the left cursor, EIO is returned. This
5909  * can happen because btrfs_num_copies() returns one more in the dev-replace
5910  * case.
5911  */
5912 static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
5913                                          u64 logical, u64 length,
5914                                          u64 srcdev_devid, int *mirror_num,
5915                                          u64 *physical)
5916 {
5917         struct btrfs_bio *bbio = NULL;
5918         int num_stripes;
5919         int index_srcdev = 0;
5920         int found = 0;
5921         u64 physical_of_found = 0;
5922         int i;
5923         int ret = 0;
5924
5925         ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
5926                                 logical, &length, &bbio, 0, 0);
5927         if (ret) {
5928                 ASSERT(bbio == NULL);
5929                 return ret;
5930         }
5931
5932         num_stripes = bbio->num_stripes;
5933         if (*mirror_num > num_stripes) {
5934                 /*
5935                  * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
5936                  * that means that the requested area is not left of the left
5937                  * cursor
5938                  */
5939                 btrfs_put_bbio(bbio);
5940                 return -EIO;
5941         }
5942
5943         /*
5944          * process the rest of the function using the mirror_num of the source
5945          * drive. Therefore look it up first.  At the end, patch the device
5946          * pointer to the one of the target drive.
5947          */
5948         for (i = 0; i < num_stripes; i++) {
5949                 if (bbio->stripes[i].dev->devid != srcdev_devid)
5950                         continue;
5951
5952                 /*
5953                  * In case of DUP, in order to keep it simple, only add the
5954                  * mirror with the lowest physical address
5955                  */
5956                 if (found &&
5957                     physical_of_found <= bbio->stripes[i].physical)
5958                         continue;
5959
5960                 index_srcdev = i;
5961                 found = 1;
5962                 physical_of_found = bbio->stripes[i].physical;
5963         }
5964
5965         btrfs_put_bbio(bbio);
5966
5967         ASSERT(found);
5968         if (!found)
5969                 return -EIO;
5970
5971         *mirror_num = index_srcdev + 1;
5972         *physical = physical_of_found;
5973         return ret;
5974 }
5975
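/*
 * Return true if the block group containing @logical still waits to be
 * copied by a running zoned dev-replace, i.e. its "to_copy" flag is set.
 */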
5976 static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
5977 {
5978         struct btrfs_block_group *cache;
5979         bool ret;
5980
5981         /* A non-zoned filesystem does not use the "to_copy" flag */
5982         if (!btrfs_is_zoned(fs_info))
5983                 return false;
5984
5985         cache = btrfs_lookup_block_group(fs_info, logical);
5986
5987         spin_lock(&cache->lock);
5988         ret = cache->to_copy;
5989         spin_unlock(&cache->lock);
5990
5991         btrfs_put_block_group(cache);
5992         return ret;
5993 }
5994
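/*
 * Adjust an already mapped bbio for a running dev-replace: writes aimed at
 * the source device get duplicated onto the target device, and for
 * BTRFS_MAP_GET_READ_MIRRORS the target is appended as one extra mirror.
 */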
5995 static void handle_ops_on_dev_replace(enum btrfs_map_op op,
5996                                       struct btrfs_bio **bbio_ret,
5997                                       struct btrfs_dev_replace *dev_replace,
5998                                       u64 logical,
5999                                       int *num_stripes_ret, int *max_errors_ret)
6000 {
6001         struct btrfs_bio *bbio = *bbio_ret;
6002         u64 srcdev_devid = dev_replace->srcdev->devid;
6003         int tgtdev_indexes = 0;
6004         int num_stripes = *num_stripes_ret;
6005         int max_errors = *max_errors_ret;
6006         int i;
6007
6008         if (op == BTRFS_MAP_WRITE) {
6009                 int index_where_to_add;
6010
6011                 /*
6012                  * A block group which has "to_copy" set will eventually be
6013                  * copied by the dev-replace process, so we can avoid cloning IO here.
6014                  */
6015                 if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical))
6016                         return;
6017
6018                 /*
6019                  * duplicate the write operations while the dev replace
6020                  * procedure is running. Since the copying of the old disk to
6021                  * the new disk takes place at run time while the filesystem is
6022                  * mounted writable, the regular write operations to the old
6023                  * disk have to be duplicated to go to the new disk as well.
6024                  *
6025                  * Note that device->missing is handled by the caller, and that
6026                  * the write to the old disk is already set up in the stripes
6027                  * array.
6028                  */
6029                 index_where_to_add = num_stripes;
6030                 for (i = 0; i < num_stripes; i++) {
6031                         if (bbio->stripes[i].dev->devid == srcdev_devid) {
6032                                 /* write to new disk, too */
6033                                 struct btrfs_bio_stripe *new =
6034                                         bbio->stripes + index_where_to_add;
6035                                 struct btrfs_bio_stripe *old =
6036                                         bbio->stripes + i;
6037
6038                                 new->physical = old->physical;
6039                                 new->length = old->length;
6040                                 new->dev = dev_replace->tgtdev;
6041                                 bbio->tgtdev_map[i] = index_where_to_add;
6042                                 index_where_to_add++;
6043                                 max_errors++;
6044                                 tgtdev_indexes++;
6045                         }
6046                 }
6047                 num_stripes = index_where_to_add;
6048         } else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
6049                 int index_srcdev = 0;
6050                 int found = 0;
6051                 u64 physical_of_found = 0;
6052
6053                 /*
6054                  * During the dev-replace procedure, the target drive can also
6055                  * be used to read data in case it is needed to repair a corrupt
6056                  * block elsewhere. This is possible if the requested area is
6057                  * left of the left cursor. In this area, the target drive is a
6058                  * full copy of the source drive.
6059                  */
6060                 for (i = 0; i < num_stripes; i++) {
6061                         if (bbio->stripes[i].dev->devid == srcdev_devid) {
6062                                 /*
6063                                  * In case of DUP, in order to keep it simple,
6064                                  * only add the mirror with the lowest physical
6065                                  * address
6066                                  */
6067                                 if (found &&
6068                                     physical_of_found <=
6069                                      bbio->stripes[i].physical)
6070                                         continue;
6071                                 index_srcdev = i;
6072                                 found = 1;
6073                                 physical_of_found = bbio->stripes[i].physical;
6074                         }
6075                 }
6076                 if (found) {
6077                         struct btrfs_bio_stripe *tgtdev_stripe =
6078                                 bbio->stripes + num_stripes;
6079
6080                         tgtdev_stripe->physical = physical_of_found;
6081                         tgtdev_stripe->length =
6082                                 bbio->stripes[index_srcdev].length;
6083                         tgtdev_stripe->dev = dev_replace->tgtdev;
6084                         bbio->tgtdev_map[index_srcdev] = num_stripes;
6085
6086                         tgtdev_indexes++;
6087                         num_stripes++;
6088                 }
6089         }
6090
6091         *num_stripes_ret = num_stripes;
6092         *max_errors_ret = max_errors;
6093         bbio->num_tgtdevs = tgtdev_indexes;
6094         *bbio_ret = bbio;
6095 }
6096
6097 static bool need_full_stripe(enum btrfs_map_op op)
6098 {
6099         return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
6100 }
6101
6102 /*
6103  * Calculate the geometry of a particular (address, len) tuple. This
6104  * information is used to calculate how big a particular bio can get before it
6105  * straddles a stripe.
6106  *
6107  * @fs_info: the filesystem
6108  * @em:      mapping containing the logical extent
6109  * @op:      type of operation - write or read
6110  * @logical: address that we want to figure out the geometry of
6111  * @len:     the length of IO we are going to perform, starting at @logical
6112  * @io_geom: pointer used to return values
6113  *
6114  * Returns < 0 if a chunk for the given logical address cannot be found
6115  * (usually shouldn't happen unless @logical is corrupted), 0 otherwise.
6116  */
6117 int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
6118                           enum btrfs_map_op op, u64 logical, u64 len,
6119                           struct btrfs_io_geometry *io_geom)
6120 {
6121         struct map_lookup *map;
6122         u64 offset;
6123         u64 stripe_offset;
6124         u64 stripe_nr;
6125         u64 stripe_len;
6126         u64 raid56_full_stripe_start = (u64)-1;
6127         int data_stripes;
6128
6129         ASSERT(op != BTRFS_MAP_DISCARD);
6130
6131         map = em->map_lookup;
6132         /* Offset of this logical address in the chunk */
6133         offset = logical - em->start;
6134         /* Len of a stripe in a chunk */
6135         stripe_len = map->stripe_len;
6136         /* Stripe where this block falls in */
6137         stripe_nr = div64_u64(offset, stripe_len);
6138         /* Offset of stripe in the chunk */
6139         stripe_offset = stripe_nr * stripe_len;
6140         if (offset < stripe_offset) {
6141                 btrfs_crit(fs_info,
6142 "stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
6143                         stripe_offset, offset, em->start, logical, stripe_len);
6144                 return -EINVAL;
6145         }
6146
6147         /* stripe_offset is the offset of this block in its stripe */
6148         stripe_offset = offset - stripe_offset;
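        /*
         * Example: with a 64K stripe_len and offset 80K, stripe_nr is 1 and
         * stripe_offset ends up as 16K into that stripe.
         */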
6149         data_stripes = nr_data_stripes(map);
6150
6151         if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
6152                 u64 max_len = stripe_len - stripe_offset;
6153
6154                 /*
6155                  * In case of raid56, we need to know the stripe-aligned start
6156                  */
6157                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
6158                         unsigned long full_stripe_len = stripe_len * data_stripes;
6159                         raid56_full_stripe_start = offset;
6160
6161                         /*
6162                          * Allow a write of a full stripe, but make sure we
6163                          * don't allow straddling of stripes
6164                          */
6165                         raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
6166                                         full_stripe_len);
6167                         raid56_full_stripe_start *= full_stripe_len;
6168
6169                         /*
6170                          * For writes to RAID[56], allow a full stripeset across
6171                          * all disks. For other RAID types and for RAID[56]
6172                          * reads, just allow a single stripe (on a single disk).
6173                          */
6174                         if (op == BTRFS_MAP_WRITE) {
6175                                 max_len = stripe_len * data_stripes -
6176                                           (offset - raid56_full_stripe_start);
6177                         }
6178                 }
6179                 len = min_t(u64, em->len - offset, max_len);
6180         } else {
6181                 len = em->len - offset;
6182         }
6183
6184         io_geom->len = len;
6185         io_geom->offset = offset;
6186         io_geom->stripe_len = stripe_len;
6187         io_geom->stripe_nr = stripe_nr;
6188         io_geom->stripe_offset = stripe_offset;
6189         io_geom->raid56_stripe_offset = raid56_full_stripe_start;
6190
6191         return 0;
6192 }
6193
6194 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
6195                              enum btrfs_map_op op,
6196                              u64 logical, u64 *length,
6197                              struct btrfs_bio **bbio_ret,
6198                              int mirror_num, int need_raid_map)
6199 {
6200         struct extent_map *em;
6201         struct map_lookup *map;
6202         u64 stripe_offset;
6203         u64 stripe_nr;
6204         u64 stripe_len;
6205         u32 stripe_index;
6206         int data_stripes;
6207         int i;
6208         int ret = 0;
6209         int num_stripes;
6210         int max_errors = 0;
6211         int tgtdev_indexes = 0;
6212         struct btrfs_bio *bbio = NULL;
6213         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
6214         int dev_replace_is_ongoing = 0;
6215         int num_alloc_stripes;
6216         int patch_the_first_stripe_for_dev_replace = 0;
6217         u64 physical_to_patch_in_first_stripe = 0;
6218         u64 raid56_full_stripe_start = (u64)-1;
6219         struct btrfs_io_geometry geom;
6220
6221         ASSERT(bbio_ret);
6222         ASSERT(op != BTRFS_MAP_DISCARD);
6223
6224         em = btrfs_get_chunk_map(fs_info, logical, *length);
6225         ASSERT(!IS_ERR(em));
6226
6227         ret = btrfs_get_io_geometry(fs_info, em, op, logical, *length, &geom);
6228         if (ret < 0) {
                 /* Drop the chunk mapping reference taken above */
                 free_extent_map(em);
6229                 return ret;
         }
6230
6231         map = em->map_lookup;
6232
6233         *length = geom.len;
6234         stripe_len = geom.stripe_len;
6235         stripe_nr = geom.stripe_nr;
6236         stripe_offset = geom.stripe_offset;
6237         raid56_full_stripe_start = geom.raid56_stripe_offset;
6238         data_stripes = nr_data_stripes(map);
6239
6240         down_read(&dev_replace->rwsem);
6241         dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
6242         /*
6243          * Hold the semaphore for read during the whole operation, write is
6244          * requested at commit time but must wait.
6245          */
6246         if (!dev_replace_is_ongoing)
6247                 up_read(&dev_replace->rwsem);
6248
6249         if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
6250             !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
6251                 ret = get_extra_mirror_from_replace(fs_info, logical, *length,
6252                                                     dev_replace->srcdev->devid,
6253                                                     &mirror_num,
6254                                             &physical_to_patch_in_first_stripe);
6255                 if (ret)
6256                         goto out;
6257                 else
6258                         patch_the_first_stripe_for_dev_replace = 1;
6259         } else if (mirror_num > map->num_stripes) {
6260                 mirror_num = 0;
6261         }
6262
6263         num_stripes = 1;
6264         stripe_index = 0;
6265         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
6266                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
6267                                 &stripe_index);
6268                 if (!need_full_stripe(op))
6269                         mirror_num = 1;
6270         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
6271                 if (need_full_stripe(op))
6272                         num_stripes = map->num_stripes;
6273                 else if (mirror_num)
6274                         stripe_index = mirror_num - 1;
6275                 else {
6276                         stripe_index = find_live_mirror(fs_info, map, 0,
6277                                             dev_replace_is_ongoing);
6278                         mirror_num = stripe_index + 1;
6279                 }
6280
6281         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
6282                 if (need_full_stripe(op)) {
6283                         num_stripes = map->num_stripes;
6284                 } else if (mirror_num) {
6285                         stripe_index = mirror_num - 1;
6286                 } else {
6287                         mirror_num = 1;
6288                 }
6289
6290         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
6291                 u32 factor = map->num_stripes / map->sub_stripes;
6292
6293                 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
6294                 stripe_index *= map->sub_stripes;
6295
6296                 if (need_full_stripe(op))
6297                         num_stripes = map->sub_stripes;
6298                 else if (mirror_num)
6299                         stripe_index += mirror_num - 1;
6300                 else {
6301                         int old_stripe_index = stripe_index;
6302                         stripe_index = find_live_mirror(fs_info, map,
6303                                               stripe_index,
6304                                               dev_replace_is_ongoing);
6305                         mirror_num = stripe_index - old_stripe_index + 1;
6306                 }
6307
6308         } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
6309                 if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
6310                         /* push stripe_nr back to the start of the full stripe */
6311                         stripe_nr = div64_u64(raid56_full_stripe_start,
6312                                         stripe_len * data_stripes);
6313
6314                         /* RAID[56] write or recovery. Return all stripes */
6315                         num_stripes = map->num_stripes;
6316                         max_errors = nr_parity_stripes(map);
6317
6318                         *length = map->stripe_len * data_stripes;
6319                         stripe_index = 0;
6320                         stripe_offset = 0;
6321                 } else {
6322                         /*
6323                          * Mirror #0 or #1 means the original data block.
6324                          * Mirror #2 is RAID5 parity block.
6325                          * Mirror #3 is RAID6 Q block.
6326                          */
6327                         stripe_nr = div_u64_rem(stripe_nr,
6328                                         data_stripes, &stripe_index);
6329                         if (mirror_num > 1)
6330                                 stripe_index = data_stripes + mirror_num - 2;
6331
6332                         /* We distribute the parity blocks across stripes */
6333                         div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
6334                                         &stripe_index);
6335                         if (!need_full_stripe(op) && mirror_num <= 1)
6336                                 mirror_num = 1;
6337                 }
6338         } else {
6339                 /*
6340                  * after this, stripe_nr is the number of stripes on this
6341                  * device we have to walk to find the data, and stripe_index is
6342                  * the number of our device in the stripe array
6343                  */
6344                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
6345                                 &stripe_index);
6346                 mirror_num = stripe_index + 1;
6347         }
6348         if (stripe_index >= map->num_stripes) {
6349                 btrfs_crit(fs_info,
6350                            "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
6351                            stripe_index, map->num_stripes);
6352                 ret = -EINVAL;
6353                 goto out;
6354         }
6355
6356         num_alloc_stripes = num_stripes;
6357         if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
6358                 if (op == BTRFS_MAP_WRITE)
6359                         num_alloc_stripes <<= 1;
6360                 if (op == BTRFS_MAP_GET_READ_MIRRORS)
6361                         num_alloc_stripes++;
6362                 tgtdev_indexes = num_stripes;
6363         }
6364
6365         bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
6366         if (!bbio) {
6367                 ret = -ENOMEM;
6368                 goto out;
6369         }
6370
6371         for (i = 0; i < num_stripes; i++) {
6372                 bbio->stripes[i].physical = map->stripes[stripe_index].physical +
6373                         stripe_offset + stripe_nr * map->stripe_len;
6374                 bbio->stripes[i].dev = map->stripes[stripe_index].dev;
6375                 stripe_index++;
6376         }
6377
6378         /* build raid_map */
6379         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
6380             (need_full_stripe(op) || mirror_num > 1)) {
6381                 u64 tmp;
6382                 unsigned rot;
6383
6384                 /* Work out the disk rotation on this stripe-set */
6385                 div_u64_rem(stripe_nr, num_stripes, &rot);
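                /*
                 * Example: RAID5 over 3 devices has 2 data stripes; full
                 * stripe 0 (rot 0) puts parity on stripe 2, full stripe 1
                 * (rot 1) shifts the data and puts parity on stripe 0.
                 */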
6386
6387                 /* Fill in the logical address of each stripe */
6388                 tmp = stripe_nr * data_stripes;
6389                 for (i = 0; i < data_stripes; i++)
6390                         bbio->raid_map[(i+rot) % num_stripes] =
6391                                 em->start + (tmp + i) * map->stripe_len;
6392
6393                 bbio->raid_map[(i+rot) % num_stripes] = RAID5_P_STRIPE;
6394                 if (map->type & BTRFS_BLOCK_GROUP_RAID6)
6395                         bbio->raid_map[(i+rot+1) % num_stripes] =
6396                                 RAID6_Q_STRIPE;
6397
6398                 sort_parity_stripes(bbio, num_stripes);
6399         }
6400
6401         if (need_full_stripe(op))
6402                 max_errors = btrfs_chunk_max_errors(map);
6403
6404         if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
6405             need_full_stripe(op)) {
6406                 handle_ops_on_dev_replace(op, &bbio, dev_replace, logical,
6407                                           &num_stripes, &max_errors);
6408         }
6409
6410         *bbio_ret = bbio;
6411         bbio->map_type = map->type;
6412         bbio->num_stripes = num_stripes;
6413         bbio->max_errors = max_errors;
6414         bbio->mirror_num = mirror_num;
6415
6416         /*
6417          * this is the case that REQ_READ && dev_replace_is_ongoing &&
6418          * mirror_num == num_stripes + 1 && dev_replace target drive is
6419          * available as a mirror
6420          */
6421         if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
6422                 WARN_ON(num_stripes > 1);
6423                 bbio->stripes[0].dev = dev_replace->tgtdev;
6424                 bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
6425                 bbio->mirror_num = map->num_stripes + 1;
6426         }
6427 out:
6428         if (dev_replace_is_ongoing) {
6429                 lockdep_assert_held(&dev_replace->rwsem);
6430                 /* Unlock and let waiting writers proceed */
6431                 up_read(&dev_replace->rwsem);
6432         }
6433         free_extent_map(em);
6434         return ret;
6435 }
6436
6437 int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6438                       u64 logical, u64 *length,
6439                       struct btrfs_bio **bbio_ret, int mirror_num)
6440 {
6441         if (op == BTRFS_MAP_DISCARD)
6442                 return __btrfs_map_block_for_discard(fs_info, logical,
6443                                                      length, bbio_ret);
6444
6445         return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
6446                                  mirror_num, 0);
6447 }
6448
6449 /* For Scrub/replace */
6450 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6451                      u64 logical, u64 *length,
6452                      struct btrfs_bio **bbio_ret)
6453 {
6454         return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
6455 }
6456
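/*
 * Complete the original bio: restore the private data and end_io callback
 * saved in the bbio, end the bio and drop our bbio reference.
 */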
6457 static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
6458 {
6459         bio->bi_private = bbio->private;
6460         bio->bi_end_io = bbio->end_io;
6461         bio_endio(bio);
6462
6463         btrfs_put_bbio(bbio);
6464 }
6465
6466 static void btrfs_end_bio(struct bio *bio)
6467 {
6468         struct btrfs_bio *bbio = bio->bi_private;
6469         int is_orig_bio = 0;
6470
6471         if (bio->bi_status) {
6472                 atomic_inc(&bbio->error);
6473                 if (bio->bi_status == BLK_STS_IOERR ||
6474                     bio->bi_status == BLK_STS_TARGET) {
6475                         struct btrfs_device *dev = btrfs_io_bio(bio)->device;
6476
6477                         ASSERT(dev->bdev);
6478                         if (btrfs_op(bio) == BTRFS_MAP_WRITE)
6479                                 btrfs_dev_stat_inc_and_print(dev,
6480                                                 BTRFS_DEV_STAT_WRITE_ERRS);
6481                         else if (!(bio->bi_opf & REQ_RAHEAD))
6482                                 btrfs_dev_stat_inc_and_print(dev,
6483                                                 BTRFS_DEV_STAT_READ_ERRS);
6484                         if (bio->bi_opf & REQ_PREFLUSH)
6485                                 btrfs_dev_stat_inc_and_print(dev,
6486                                                 BTRFS_DEV_STAT_FLUSH_ERRS);
6487                 }
6488         }
6489
6490         if (bio == bbio->orig_bio)
6491                 is_orig_bio = 1;
6492
6493         btrfs_bio_counter_dec(bbio->fs_info);
6494
6495         if (atomic_dec_and_test(&bbio->stripes_pending)) {
6496                 if (!is_orig_bio) {
6497                         bio_put(bio);
6498                         bio = bbio->orig_bio;
6499                 }
6500
6501                 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
6502                 /*
6503                  * Only send an error to the higher layers if it is beyond
6504                  * the tolerance of the btrfs bio.
                  */
6505                 if (atomic_read(&bbio->error) > bbio->max_errors) {
6506                         bio->bi_status = BLK_STS_IOERR;
6507                 } else {
6508                         /*
6509                          * this bio is actually up to date, we didn't
6510                          * go over the max number of errors
6511                          */
6512                         bio->bi_status = BLK_STS_OK;
6513                 }
6514
6515                 btrfs_end_bbio(bbio, bio);
6516         } else if (!is_orig_bio) {
6517                 bio_put(bio);
6518         }
6519 }
6520
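/*
 * Point a cloned bio at one particular stripe (device plus physical sector,
 * rewound to the zone start for zone append writes, or downgraded to a
 * regular write on conventional zones) and submit it.
 */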
6521 static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
6522                               u64 physical, struct btrfs_device *dev)
6523 {
6524         struct btrfs_fs_info *fs_info = bbio->fs_info;
6525
6526         bio->bi_private = bbio;
6527         btrfs_io_bio(bio)->device = dev;
6528         bio->bi_end_io = btrfs_end_bio;
6529         bio->bi_iter.bi_sector = physical >> 9;
6530         /*
6531          * For zone append writing, bi_sector must point the beginning of the
6532          * zone
6533          */
6534         if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
6535                 if (btrfs_dev_is_sequential(dev, physical)) {
6536                         u64 zone_start = round_down(physical, fs_info->zone_size);
6537
6538                         bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
6539                 } else {
6540                         bio->bi_opf &= ~REQ_OP_ZONE_APPEND;
6541                         bio->bi_opf |= REQ_OP_WRITE;
6542                 }
6543         }
6544         btrfs_debug_in_rcu(fs_info,
6545         "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
6546                 bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
6547                 (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
6548                 dev->devid, bio->bi_iter.bi_size);
6549         bio_set_dev(bio, dev->bdev);
6550
6551         btrfs_bio_counter_inc_noblocked(fs_info);
6552
6553         btrfsic_submit_bio(bio);
6554 }
6555
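/*
 * Account a stripe that could not be submitted as an error and, when it was
 * the last pending stripe, complete the original bio with the overall
 * status.
 */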
6556 static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
6557 {
6558         atomic_inc(&bbio->error);
6559         if (atomic_dec_and_test(&bbio->stripes_pending)) {
6560                 /* Should be the original bio. */
6561                 WARN_ON(bio != bbio->orig_bio);
6562
6563                 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
6564                 bio->bi_iter.bi_sector = logical >> 9;
6565                 if (atomic_read(&bbio->error) > bbio->max_errors)
6566                         bio->bi_status = BLK_STS_IOERR;
6567                 else
6568                         bio->bi_status = BLK_STS_OK;
6569                 btrfs_end_bbio(bbio, bio);
6570         }
6571 }
6572
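/*
 * Map @bio to its stripes and submit one cloned bio per stripe.  RAID5/6
 * writes and recovery reads are handed off to the raid56 code instead of
 * being submitted directly.
 */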
6573 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
6574                            int mirror_num)
6575 {
6576         struct btrfs_device *dev;
6577         struct bio *first_bio = bio;
6578         u64 logical = bio->bi_iter.bi_sector << 9;
6579         u64 length = 0;
6580         u64 map_length;
6581         int ret;
6582         int dev_nr;
6583         int total_devs;
6584         struct btrfs_bio *bbio = NULL;
6585
6586         length = bio->bi_iter.bi_size;
6587         map_length = length;
6588
6589         btrfs_bio_counter_inc_blocked(fs_info);
6590         ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
6591                                 &map_length, &bbio, mirror_num, 1);
6592         if (ret) {
6593                 btrfs_bio_counter_dec(fs_info);
6594                 return errno_to_blk_status(ret);
6595         }
6596
6597         total_devs = bbio->num_stripes;
6598         bbio->orig_bio = first_bio;
6599         bbio->private = first_bio->bi_private;
6600         bbio->end_io = first_bio->bi_end_io;
6601         bbio->fs_info = fs_info;
6602         atomic_set(&bbio->stripes_pending, bbio->num_stripes);
6603
6604         if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
6605             ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) {
6606                 /*
6607                  * In this case, map_length has been set to the length of a
                  * single stripe, not the whole write.
                  */
6608                 if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
6609                         ret = raid56_parity_write(fs_info, bio, bbio,
6610                                                   map_length);
6611                 } else {
6612                         ret = raid56_parity_recover(fs_info, bio, bbio,
6613                                                     map_length, mirror_num, 1);
6614                 }
6615
6616                 btrfs_bio_counter_dec(fs_info);
6617                 return errno_to_blk_status(ret);
6618         }
6619
6620         if (map_length < length) {
6621                 btrfs_crit(fs_info,
6622                            "mapping failed logical %llu bio len %llu len %llu",
6623                            logical, length, map_length);
6624                 BUG();
6625         }
6626
6627         for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
6628                 dev = bbio->stripes[dev_nr].dev;
6629                 if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
6630                                                    &dev->dev_state) ||
6631                     (btrfs_op(first_bio) == BTRFS_MAP_WRITE &&
6632                     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
6633                         bbio_error(bbio, first_bio, logical);
6634                         continue;
6635                 }
6636
6637                 if (dev_nr < total_devs - 1)
6638                         bio = btrfs_bio_clone(first_bio);
6639                 else
6640                         bio = first_bio;
6641
6642                 submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev);
6643         }
6644         btrfs_bio_counter_dec(fs_info);
6645         return BLK_STS_OK;
6646 }
6647
6648 /*
6649  * Find a device specified by @devid or @uuid in the list of @fs_devices, or
6650  * return NULL.
6651  *
6652  * If devid and uuid are both specified, the match must be exact, otherwise
6653  * only devid is used.
6654  *
6655  * The seed devices are searched as well.
6656  */
6657 struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
6658                                        u64 devid, u8 *uuid, u8 *fsid)
6659 {
6660         struct btrfs_device *device;
6661         struct btrfs_fs_devices *seed_devs;
6662
6663         if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
6664                 list_for_each_entry(device, &fs_devices->devices, dev_list) {
6665                         if (device->devid == devid &&
6666                             (!uuid || memcmp(device->uuid, uuid,
6667                                              BTRFS_UUID_SIZE) == 0))
6668                                 return device;
6669                 }
6670         }
6671
6672         list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
6673                 if (!fsid ||
6674                     !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
6675                         list_for_each_entry(device, &seed_devs->devices,
6676                                             dev_list) {
6677                                 if (device->devid == devid &&
6678                                     (!uuid || memcmp(device->uuid, uuid,
6679                                                      BTRFS_UUID_SIZE) == 0))
6680                                         return device;
6681                         }
6682                 }
6683         }
6684
6685         return NULL;
6686 }
6687
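/*
 * Create a placeholder btrfs_device for a devid that is referenced by
 * metadata but was not present at mount time, and account it as missing.
 */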
6688 static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
6689                                             u64 devid, u8 *dev_uuid)
6690 {
6691         struct btrfs_device *device;
6692         unsigned int nofs_flag;
6693
6694         /*
6695          * We call this under the chunk_mutex, so we want to use NOFS for this
6696          * allocation, however we don't want to change btrfs_alloc_device() to
6697          * always do NOFS because we use it in a lot of other GFP_KERNEL safe
6698          * places.
6699          */
6700         nofs_flag = memalloc_nofs_save();
6701         device = btrfs_alloc_device(NULL, &devid, dev_uuid);
6702         memalloc_nofs_restore(nofs_flag);
6703         if (IS_ERR(device))
6704                 return device;
6705
6706         list_add(&device->dev_list, &fs_devices->devices);
6707         device->fs_devices = fs_devices;
6708         fs_devices->num_devices++;
6709
6710         set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
6711         fs_devices->missing_devices++;
6712
6713         return device;
6714 }
6715
6716 /**
6717  * btrfs_alloc_device - allocate struct btrfs_device
6718  * @fs_info:    used only for generating a new devid, can be NULL if
6719  *              devid is provided (i.e. @devid != NULL).
6720  * @devid:      a pointer to devid for this device.  If NULL a new devid
6721  *              is generated.
6722  * @uuid:       a pointer to UUID for this device.  If NULL a new UUID
6723  *              is generated.
6724  *
6725  * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
6726  * on error.  Returned struct is not linked onto any lists and must be
6727  * destroyed with btrfs_free_device.
6728  */
6729 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
6730                                         const u64 *devid,
6731                                         const u8 *uuid)
6732 {
6733         struct btrfs_device *dev;
6734         u64 tmp;
6735
6736         if (WARN_ON(!devid && !fs_info))
6737                 return ERR_PTR(-EINVAL);
6738
6739         dev = __alloc_device(fs_info);
6740         if (IS_ERR(dev))
6741                 return dev;
6742
6743         if (devid)
6744                 tmp = *devid;
6745         else {
6746                 int ret;
6747
6748                 ret = find_next_devid(fs_info, &tmp);
6749                 if (ret) {
6750                         btrfs_free_device(dev);
6751                         return ERR_PTR(ret);
6752                 }
6753         }
6754         dev->devid = tmp;
6755
6756         if (uuid)
6757                 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
6758         else
6759                 generate_random_uuid(dev->uuid);
6760
6761         return dev;
6762 }
6763
6764 static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
6765                                         u64 devid, u8 *uuid, bool error)
6766 {
6767         if (error)
6768                 btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
6769                               devid, uuid);
6770         else
6771                 btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
6772                               devid, uuid);
6773 }
6774
6775 static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
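/*
 * Per-device stripe length of a chunk.  For example a 3G RAID5 chunk over
 * 4 devices has 3 data stripes and thus a 1G stripe on each device, while
 * a 1G RAID1 chunk (2 stripes, ncopies 2) keeps a full 1G stripe per device.
 */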
6776 {
6777         int index = btrfs_bg_flags_to_raid_index(type);
6778         int ncopies = btrfs_raid_array[index].ncopies;
6779         const int nparity = btrfs_raid_array[index].nparity;
6780         int data_stripes;
6781
6782         if (nparity)
6783                 data_stripes = num_stripes - nparity;
6784         else
6785                 data_stripes = num_stripes / ncopies;
6786
6787         return div_u64(chunk_len, data_stripes);
6788 }
6789
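/*
 * Read one chunk item and insert the corresponding mapping into the chunk
 * mapping tree.  A stripe on a device we cannot find gets a placeholder
 * missing device when mounted with -o degraded, otherwise the read fails.
 */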
6790 static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
6791                           struct btrfs_chunk *chunk)
6792 {
6793         struct btrfs_fs_info *fs_info = leaf->fs_info;
6794         struct extent_map_tree *map_tree = &fs_info->mapping_tree;
6795         struct map_lookup *map;
6796         struct extent_map *em;
6797         u64 logical;
6798         u64 length;
6799         u64 devid;
6800         u8 uuid[BTRFS_UUID_SIZE];
6801         int num_stripes;
6802         int ret;
6803         int i;
6804
6805         logical = key->offset;
6806         length = btrfs_chunk_length(leaf, chunk);
6807         num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
6808
6809         /*
6810          * Only need to verify the chunk item if we're reading from the sys chunk
6811          * array, as chunk items in tree blocks are already verified by the tree-checker.
6812          */
6813         if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
6814                 ret = btrfs_check_chunk_valid(leaf, chunk, logical);
6815                 if (ret)
6816                         return ret;
6817         }
6818
6819         read_lock(&map_tree->lock);
6820         em = lookup_extent_mapping(map_tree, logical, 1);
6821         read_unlock(&map_tree->lock);
6822
6823         /* already mapped? */
6824         if (em && em->start <= logical && em->start + em->len > logical) {
6825                 free_extent_map(em);
6826                 return 0;
6827         } else if (em) {
6828                 free_extent_map(em);
6829         }
6830
6831         em = alloc_extent_map();
6832         if (!em)
6833                 return -ENOMEM;
6834         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
6835         if (!map) {
6836                 free_extent_map(em);
6837                 return -ENOMEM;
6838         }
6839
6840         set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
6841         em->map_lookup = map;
6842         em->start = logical;
6843         em->len = length;
6844         em->orig_start = 0;
6845         em->block_start = 0;
6846         em->block_len = em->len;
6847
6848         map->num_stripes = num_stripes;
6849         map->io_width = btrfs_chunk_io_width(leaf, chunk);
6850         map->io_align = btrfs_chunk_io_align(leaf, chunk);
6851         map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
6852         map->type = btrfs_chunk_type(leaf, chunk);
6853         map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
6854         map->verified_stripes = 0;
6855         em->orig_block_len = calc_stripe_length(map->type, em->len,
6856                                                 map->num_stripes);
6857         for (i = 0; i < num_stripes; i++) {
6858                 map->stripes[i].physical =
6859                         btrfs_stripe_offset_nr(leaf, chunk, i);
6860                 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
6861                 read_extent_buffer(leaf, uuid, (unsigned long)
6862                                    btrfs_stripe_dev_uuid_nr(chunk, i),
6863                                    BTRFS_UUID_SIZE);
6864                 map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
6865                                                         devid, uuid, NULL);
6866                 if (!map->stripes[i].dev &&
6867                     !btrfs_test_opt(fs_info, DEGRADED)) {
6868                         free_extent_map(em);
6869                         btrfs_report_missing_device(fs_info, devid, uuid, true);
6870                         return -ENOENT;
6871                 }
6872                 if (!map->stripes[i].dev) {
6873                         map->stripes[i].dev =
6874                                 add_missing_dev(fs_info->fs_devices, devid,
6875                                                 uuid);
6876                         if (IS_ERR(map->stripes[i].dev)) {
6877                                 free_extent_map(em);
6878                                 btrfs_err(fs_info,
6879                                         "failed to init missing dev %llu: %ld",
6880                                         devid, PTR_ERR(map->stripes[i].dev));
6881                                 return PTR_ERR(map->stripes[i].dev);
6882                         }
6883                         btrfs_report_missing_device(fs_info, devid, uuid, false);
6884                 }
6885                 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
6886                                 &(map->stripes[i].dev->dev_state));
6888         }
6889
6890         write_lock(&map_tree->lock);
6891         ret = add_extent_mapping(map_tree, em, 0);
6892         write_unlock(&map_tree->lock);
6893         if (ret < 0) {
6894                 btrfs_err(fs_info,
6895                           "failed to add chunk map, start=%llu len=%llu: %d",
6896                           em->start, em->len, ret);
6897         }
6898         free_extent_map(em);
6899
6900         return ret;
6901 }
6902
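/* Copy the persistent fields of a dev item into the in-memory device. */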
6903 static void fill_device_from_item(struct extent_buffer *leaf,
6904                                  struct btrfs_dev_item *dev_item,
6905                                  struct btrfs_device *device)
6906 {
6907         unsigned long ptr;
6908
6909         device->devid = btrfs_device_id(leaf, dev_item);
6910         device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
6911         device->total_bytes = device->disk_total_bytes;
6912         device->commit_total_bytes = device->disk_total_bytes;
6913         device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
6914         device->commit_bytes_used = device->bytes_used;
6915         device->type = btrfs_device_type(leaf, dev_item);
6916         device->io_align = btrfs_device_io_align(leaf, dev_item);
6917         device->io_width = btrfs_device_io_width(leaf, dev_item);
6918         device->sector_size = btrfs_device_sector_size(leaf, dev_item);
6919         WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
6920         clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
6921
6922         ptr = btrfs_device_uuid(dev_item);
6923         read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
6924 }
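/*
 * (Illustrative note: the three byte counters initialized above start out
 * equal; total_bytes may later diverge during a resize, while
 * commit_total_bytes is only synced at transaction commit, see
 * btrfs_commit_device_sizes() below.)
 */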
6925
6926 static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
6927                                                   u8 *fsid)
6928 {
6929         struct btrfs_fs_devices *fs_devices;
6930         int ret;
6931
6932         lockdep_assert_held(&uuid_mutex);
6933         ASSERT(fsid);
6934
6935         /* This will match only for a multi-device seed fs */
6936         list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list)
6937                 if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
6938                         return fs_devices;
6939
6941         fs_devices = find_fsid(fsid, NULL);
6942         if (!fs_devices) {
6943                 if (!btrfs_test_opt(fs_info, DEGRADED))
6944                         return ERR_PTR(-ENOENT);
6945
6946                 fs_devices = alloc_fs_devices(fsid, NULL);
6947                 if (IS_ERR(fs_devices))
6948                         return fs_devices;
6949
6950                 fs_devices->seeding = true;
6951                 fs_devices->opened = 1;
6952                 return fs_devices;
6953         }
6954
6955         /*
6956          * Upon first call for a seed fs fsid, just create a private copy of the
6957          * respective fs_devices and anchor it at fs_info->fs_devices->seed_list
6958          */
6959         fs_devices = clone_fs_devices(fs_devices);
6960         if (IS_ERR(fs_devices))
6961                 return fs_devices;
6962
6963         ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
6964         if (ret) {
6965                 free_fs_devices(fs_devices);
6966                 return ERR_PTR(ret);
6967         }
6968
6969         if (!fs_devices->seeding) {
6970                 close_fs_devices(fs_devices);
6971                 free_fs_devices(fs_devices);
6972                 return ERR_PTR(-EINVAL);
6973         }
6974
6975         list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);
6976
6977         return fs_devices;
6978 }
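/*
 * A minimal usage sketch (hypothetical helper, not part of this file):
 * walking the seed chain that open_seed_devices() anchors at
 * fs_info->fs_devices->seed_list.
 */
static void __maybe_unused print_seed_chain(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *seed_devs;

	list_for_each_entry(seed_devs, &fs_info->fs_devices->seed_list, seed_list)
		btrfs_info(fs_info, "seed fs with %llu devices",
			   seed_devs->num_devices);
}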
6979
6980 static int read_one_dev(struct extent_buffer *leaf,
6981                         struct btrfs_dev_item *dev_item)
6982 {
6983         struct btrfs_fs_info *fs_info = leaf->fs_info;
6984         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6985         struct btrfs_device *device;
6986         u64 devid;
6988         u8 fs_uuid[BTRFS_FSID_SIZE];
6989         u8 dev_uuid[BTRFS_UUID_SIZE];
6990
6991         devid = btrfs_device_id(leaf, dev_item);
6992         read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
6993                            BTRFS_UUID_SIZE);
6994         read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
6995                            BTRFS_FSID_SIZE);
6996
6997         if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
6998                 fs_devices = open_seed_devices(fs_info, fs_uuid);
6999                 if (IS_ERR(fs_devices))
7000                         return PTR_ERR(fs_devices);
7001         }
7002
7003         device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
7004                                    fs_uuid);
7005         if (!device) {
7006                 if (!btrfs_test_opt(fs_info, DEGRADED)) {
7007                         btrfs_report_missing_device(fs_info, devid,
7008                                                         dev_uuid, true);
7009                         return -ENOENT;
7010                 }
7011
7012                 device = add_missing_dev(fs_devices, devid, dev_uuid);
7013                 if (IS_ERR(device)) {
7014                         btrfs_err(fs_info,
7015                                 "failed to add missing dev %llu: %ld",
7016                                 devid, PTR_ERR(device));
7017                         return PTR_ERR(device);
7018                 }
7019                 btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
7020         } else {
7021                 if (!device->bdev) {
7022                         if (!btrfs_test_opt(fs_info, DEGRADED)) {
7023                                 btrfs_report_missing_device(fs_info,
7024                                                 devid, dev_uuid, true);
7025                                 return -ENOENT;
7026                         }
7027                         btrfs_report_missing_device(fs_info, devid,
7028                                                         dev_uuid, false);
7029                 }
7030
7031                 if (!device->bdev &&
7032                     !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
7033                         /*
7034                          * This happens when a device that was properly set
7035                          * up in the device info lists suddenly goes bad.
7036                          * device->bdev is NULL, and so we have to set the
7037                          * BTRFS_DEV_STATE_MISSING bit here.
7038                          */
7039                         device->fs_devices->missing_devices++;
7040                         set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
7041                 }
7042
7043                 /* Move the device to its own fs_devices */
7044                 if (device->fs_devices != fs_devices) {
7045                         ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
7046                                                         &device->dev_state));
7047
7048                         list_move(&device->dev_list, &fs_devices->devices);
7049                         device->fs_devices->num_devices--;
7050                         fs_devices->num_devices++;
7051
7052                         device->fs_devices->missing_devices--;
7053                         fs_devices->missing_devices++;
7054
7055                         device->fs_devices = fs_devices;
7056                 }
7057         }
7058
7059         if (device->fs_devices != fs_info->fs_devices) {
7060                 BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
7061                 if (device->generation !=
7062                     btrfs_device_generation(leaf, dev_item))
7063                         return -EINVAL;
7064         }
7065
7066         fill_device_from_item(leaf, dev_item, device);
7067         if (device->bdev) {
7068                 u64 max_total_bytes = i_size_read(device->bdev->bd_inode);
7069
7070                 if (device->total_bytes > max_total_bytes) {
7071                         btrfs_err(fs_info,
7072                         "device total_bytes should be at most %llu but found %llu",
7073                                   max_total_bytes, device->total_bytes);
7074                         return -EINVAL;
7075                 }
7076         }
7077         set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
7078         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
7079            !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
7080                 device->fs_devices->total_rw_bytes += device->total_bytes;
7081                 atomic64_add(device->total_bytes - device->bytes_used,
7082                                 &fs_info->free_chunk_space);
7083         }
7084         return 0;
7086 }
7087
7088 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
7089 {
7090         struct btrfs_root *root = fs_info->tree_root;
7091         struct btrfs_super_block *super_copy = fs_info->super_copy;
7092         struct extent_buffer *sb;
7093         struct btrfs_disk_key *disk_key;
7094         struct btrfs_chunk *chunk;
7095         u8 *array_ptr;
7096         unsigned long sb_array_offset;
7097         int ret = 0;
7098         u32 num_stripes;
7099         u32 array_size;
7100         u32 len = 0;
7101         u32 cur_offset;
7102         u64 type;
7103         struct btrfs_key key;
7104
7105         ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
7106         /*
7107          * This will create an extent buffer of nodesize; the superblock size
7108          * is fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
7109          * overallocate, but we can keep it as-is since only the first page is used.
7110          */
7111         sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET,
7112                                           root->root_key.objectid, 0);
7113         if (IS_ERR(sb))
7114                 return PTR_ERR(sb);
7115         set_extent_buffer_uptodate(sb);
7116         /*
7117          * The sb extent buffer is artificial and just used to read the system array.
7118          * The set_extent_buffer_uptodate() call does not properly mark all of
7119          * its pages up-to-date when the page is larger: the extent does not
7120          * cover the whole page and consequently check_page_uptodate does not
7121          * find all the page's extents up-to-date (the hole beyond sb), so
7122          * write_extent_buffer then triggers a WARN_ON.
7123          *
7124          * Regular short extents go through the mark_extent_buffer_dirty/writeback
7125          * cycle, but sb spans only this function. Add an explicit SetPageUptodate
7126          * call to silence the warning, e.g. on PowerPC 64.
7127          */
7128         if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
7129                 SetPageUptodate(sb->pages[0]);
7130
7131         write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
7132         array_size = btrfs_super_sys_array_size(super_copy);
7133
7134         array_ptr = super_copy->sys_chunk_array;
7135         sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
7136         cur_offset = 0;
7137
7138         while (cur_offset < array_size) {
7139                 disk_key = (struct btrfs_disk_key *)array_ptr;
7140                 len = sizeof(*disk_key);
7141                 if (cur_offset + len > array_size)
7142                         goto out_short_read;
7143
7144                 btrfs_disk_key_to_cpu(&key, disk_key);
7145
7146                 array_ptr += len;
7147                 sb_array_offset += len;
7148                 cur_offset += len;
7149
7150                 if (key.type != BTRFS_CHUNK_ITEM_KEY) {
7151                         btrfs_err(fs_info,
7152                             "unexpected item type %u in sys_array at offset %u",
7153                                   (u32)key.type, cur_offset);
7154                         ret = -EIO;
7155                         break;
7156                 }
7157
7158                 chunk = (struct btrfs_chunk *)sb_array_offset;
7159                 /*
7160                  * At least one btrfs_chunk with one stripe must be present,
7161                  * exact stripe count check comes afterwards
7162                  */
7163                 len = btrfs_chunk_item_size(1);
7164                 if (cur_offset + len > array_size)
7165                         goto out_short_read;
7166
7167                 num_stripes = btrfs_chunk_num_stripes(sb, chunk);
7168                 if (!num_stripes) {
7169                         btrfs_err(fs_info,
7170                         "invalid number of stripes %u in sys_array at offset %u",
7171                                   num_stripes, cur_offset);
7172                         ret = -EIO;
7173                         break;
7174                 }
7175
7176                 type = btrfs_chunk_type(sb, chunk);
7177                 if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
7178                         btrfs_err(fs_info,
7179                         "invalid chunk type %llu in sys_array at offset %u",
7180                                   type, cur_offset);
7181                         ret = -EIO;
7182                         break;
7183                 }
7184
7185                 len = btrfs_chunk_item_size(num_stripes);
7186                 if (cur_offset + len > array_size)
7187                         goto out_short_read;
7188
7189                 ret = read_one_chunk(&key, sb, chunk);
7190                 if (ret)
7191                         break;
7192
7193                 array_ptr += len;
7194                 sb_array_offset += len;
7195                 cur_offset += len;
7196         }
7197         clear_extent_buffer_uptodate(sb);
7198         free_extent_buffer_stale(sb);
7199         return ret;
7200
7201 out_short_read:
7202         btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
7203                         len, cur_offset);
7204         clear_extent_buffer_uptodate(sb);
7205         free_extent_buffer_stale(sb);
7206         return -EIO;
7207 }
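/*
 * Layout sketch of the sys_chunk_array parsed above (illustration, derived
 * from btrfs_chunk_item_size()): each entry is a btrfs_disk_key immediately
 * followed by a btrfs_chunk whose inline stripe array grows with num_stripes:
 *
 *	btrfs_chunk_item_size(n) == sizeof(struct btrfs_chunk) +
 *				    (n - 1) * sizeof(struct btrfs_stripe)
 *
 * which is why the loop first validates a one-stripe item before trusting
 * num_stripes for the full length.
 */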
7208
7209 /*
7210  * Check if all chunks in the fs are OK for read-write degraded mount
7211  *
7212  * If the @failing_dev is specified, it's accounted as missing.
7213  *
7214  * Return true if all chunks meet the minimal RW mount requirements.
7215  * Return false if any chunk doesn't meet the minimal RW mount requirements.
7216  */
7217 bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
7218                                         struct btrfs_device *failing_dev)
7219 {
7220         struct extent_map_tree *map_tree = &fs_info->mapping_tree;
7221         struct extent_map *em;
7222         u64 next_start = 0;
7223         bool ret = true;
7224
7225         read_lock(&map_tree->lock);
7226         em = lookup_extent_mapping(map_tree, 0, (u64)-1);
7227         read_unlock(&map_tree->lock);
7228         /* No chunk at all? Return false anyway */
7229         if (!em) {
7230                 ret = false;
7231                 goto out;
7232         }
7233         while (em) {
7234                 struct map_lookup *map;
7235                 int missing = 0;
7236                 int max_tolerated;
7237                 int i;
7238
7239                 map = em->map_lookup;
7240                 max_tolerated =
7241                         btrfs_get_num_tolerated_disk_barrier_failures(
7242                                         map->type);
7243                 for (i = 0; i < map->num_stripes; i++) {
7244                         struct btrfs_device *dev = map->stripes[i].dev;
7245
7246                         if (!dev || !dev->bdev ||
7247                             test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
7248                             dev->last_flush_error)
7249                                 missing++;
7250                         else if (failing_dev && failing_dev == dev)
7251                                 missing++;
7252                 }
7253                 if (missing > max_tolerated) {
7254                         if (!failing_dev)
7255                                 btrfs_warn(fs_info,
7256         "chunk %llu missing %d devices, max tolerance is %d for writable mount",
7257                                    em->start, missing, max_tolerated);
7258                         free_extent_map(em);
7259                         ret = false;
7260                         goto out;
7261                 }
7262                 next_start = extent_map_end(em);
7263                 free_extent_map(em);
7264
7265                 read_lock(&map_tree->lock);
7266                 em = lookup_extent_mapping(map_tree, next_start,
7267                                            (u64)(-1) - next_start);
7268                 read_unlock(&map_tree->lock);
7269         }
7270 out:
7271         return ret;
7272 }
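/*
 * Usage sketch (hypothetical caller, for illustration only): deciding
 * whether a writable degraded mount may proceed when no particular device
 * is known to be failing.
 */
static bool __maybe_unused can_mount_rw_degraded(struct btrfs_fs_info *fs_info)
{
	/* NULL: do not account any extra device as failing */
	return btrfs_check_rw_degradable(fs_info, NULL);
}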
7273
7274 static void readahead_tree_node_children(struct extent_buffer *node)
7275 {
7276         int i;
7277         const int nr_items = btrfs_header_nritems(node);
7278
7279         for (i = 0; i < nr_items; i++)
7280                 btrfs_readahead_node_child(node, i);
7281 }
7282
7283 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
7284 {
7285         struct btrfs_root *root = fs_info->chunk_root;
7286         struct btrfs_path *path;
7287         struct extent_buffer *leaf;
7288         struct btrfs_key key;
7289         struct btrfs_key found_key;
7290         int ret;
7291         int slot;
7292         u64 total_dev = 0;
7293         u64 last_ra_node = 0;
7294
7295         path = btrfs_alloc_path();
7296         if (!path)
7297                 return -ENOMEM;
7298
7299         /*
7300          * uuid_mutex is needed only when we are mounting a sprout FS;
7301          * otherwise it can be skipped.
7302          */
7303         mutex_lock(&uuid_mutex);
7304
7305         /*
7306          * It is possible for mount and umount to race in such a way that
7307          * we execute this code path, but open_fs_devices failed to clear
7308          * total_rw_bytes. We certainly want it cleared before reading the
7309          * device items, so clear it here.
7310          */
7311         fs_info->fs_devices->total_rw_bytes = 0;
7312
7313         /*
7314          * Read all device items, and then all the chunk items. All
7315          * device items are found before any chunk item (their object id
7316          * is smaller than the lowest possible object id for a chunk
7317          * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
7318          */
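	/*
	 * (Illustrative note: keys sort by (objectid, type, offset), and
	 * BTRFS_DEV_ITEMS_OBJECTID is 1 while BTRFS_FIRST_CHUNK_TREE_OBJECTID
	 * is 256, so the single forward scan below sees every device item
	 * before the first chunk item.)
	 */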
7319         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
7320         key.offset = 0;
7321         key.type = 0;
7322         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
7323         if (ret < 0)
7324                 goto error;
7325         while (1) {
7326                 struct extent_buffer *node;
7327
7328                 leaf = path->nodes[0];
7329                 slot = path->slots[0];
7330                 if (slot >= btrfs_header_nritems(leaf)) {
7331                         ret = btrfs_next_leaf(root, path);
7332                         if (ret == 0)
7333                                 continue;
7334                         if (ret < 0)
7335                                 goto error;
7336                         break;
7337                 }
7338                 /*
7339                  * The nodes on level 1 are not locked, but we don't need to lock
7340                  * them during mount time as nothing else can access the tree.
7341                  */
7342                 node = path->nodes[1];
7343                 if (node) {
7344                         if (last_ra_node != node->start) {
7345                                 readahead_tree_node_children(node);
7346                                 last_ra_node = node->start;
7347                         }
7348                 }
7349                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
7350                 if (found_key.type == BTRFS_DEV_ITEM_KEY) {
7351                         struct btrfs_dev_item *dev_item;
7352                         dev_item = btrfs_item_ptr(leaf, slot,
7353                                                   struct btrfs_dev_item);
7354                         ret = read_one_dev(leaf, dev_item);
7355                         if (ret)
7356                                 goto error;
7357                         total_dev++;
7358                 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
7359                         struct btrfs_chunk *chunk;
7360                         chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
7361                         mutex_lock(&fs_info->chunk_mutex);
7362                         ret = read_one_chunk(&found_key, leaf, chunk);
7363                         mutex_unlock(&fs_info->chunk_mutex);
7364                         if (ret)
7365                                 goto error;
7366                 }
7367                 path->slots[0]++;
7368         }
7369
7370         /*
7371          * After loading chunk tree, we've got all device information,
7372          * do another round of validation checks.
7373          */
7374         if (total_dev != fs_info->fs_devices->total_devices) {
7375                 btrfs_err(fs_info,
7376            "super_num_devices %llu mismatch with num_devices %llu found here",
7377                           btrfs_super_num_devices(fs_info->super_copy),
7378                           total_dev);
7379                 ret = -EINVAL;
7380                 goto error;
7381         }
7382         if (btrfs_super_total_bytes(fs_info->super_copy) <
7383             fs_info->fs_devices->total_rw_bytes) {
7384                 btrfs_err(fs_info,
7385         "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
7386                           btrfs_super_total_bytes(fs_info->super_copy),
7387                           fs_info->fs_devices->total_rw_bytes);
7388                 ret = -EINVAL;
7389                 goto error;
7390         }
7391         ret = 0;
7392 error:
7393         mutex_unlock(&uuid_mutex);
7394
7395         btrfs_free_path(path);
7396         return ret;
7397 }
7398
7399 void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
7400 {
7401         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
7402         struct btrfs_device *device;
7403
7404         fs_devices->fs_info = fs_info;
7405
7406         mutex_lock(&fs_devices->device_list_mutex);
7407         list_for_each_entry(device, &fs_devices->devices, dev_list)
7408                 device->fs_info = fs_info;
7409
7410         list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
7411                 list_for_each_entry(device, &seed_devs->devices, dev_list)
7412                         device->fs_info = fs_info;
7413
7414                 seed_devs->fs_info = fs_info;
7415         }
7416         mutex_unlock(&fs_devices->device_list_mutex);
7417 }
7418
7419 static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
7420                                  const struct btrfs_dev_stats_item *ptr,
7421                                  int index)
7422 {
7423         u64 val;
7424
7425         read_extent_buffer(eb, &val,
7426                            offsetof(struct btrfs_dev_stats_item, values) +
7427                             ((unsigned long)ptr) + (index * sizeof(u64)),
7428                            sizeof(val));
7429         return val;
7430 }
7431
7432 static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
7433                                       struct btrfs_dev_stats_item *ptr,
7434                                       int index, u64 val)
7435 {
7436         write_extent_buffer(eb, &val,
7437                             offsetof(struct btrfs_dev_stats_item, values) +
7438                              ((unsigned long)ptr) + (index * sizeof(u64)),
7439                             sizeof(val));
7440 }
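/*
 * A minimal sketch of the offset math used by the two accessors above:
 * value i of an item that starts at extent-buffer offset ptr lives at
 *
 *	(unsigned long)ptr + offsetof(struct btrfs_dev_stats_item, values) +
 *	i * sizeof(u64)
 *
 * i.e. the values array is copied to/from the extent buffer as raw u64s.
 */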
7441
7442 static int btrfs_device_init_dev_stats(struct btrfs_device *device,
7443                                        struct btrfs_path *path)
7444 {
7445         struct btrfs_dev_stats_item *ptr;
7446         struct extent_buffer *eb;
7447         struct btrfs_key key;
7448         int item_size;
7449         int i, ret, slot;
7450
7451         key.objectid = BTRFS_DEV_STATS_OBJECTID;
7452         key.type = BTRFS_PERSISTENT_ITEM_KEY;
7453         key.offset = device->devid;
7454         ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
7455         if (ret) {
7456                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7457                         btrfs_dev_stat_set(device, i, 0);
7458                 device->dev_stats_valid = 1;
7459                 btrfs_release_path(path);
7460                 return ret < 0 ? ret : 0;
7461         }
7462         slot = path->slots[0];
7463         eb = path->nodes[0];
7464         item_size = btrfs_item_size_nr(eb, slot);
7465
7466         ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);
7467
7468         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7469                 if (item_size >= (1 + i) * sizeof(__le64))
7470                         btrfs_dev_stat_set(device, i,
7471                                            btrfs_dev_stats_value(eb, ptr, i));
7472                 else
7473                         btrfs_dev_stat_set(device, i, 0);
7474         }
7475
7476         device->dev_stats_valid = 1;
7477         btrfs_dev_stat_print_on_load(device);
7478         btrfs_release_path(path);
7479
7480         return 0;
7481 }
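/*
 * Compatibility sketch: a dev_stats item written by an older kernel may
 * carry fewer than BTRFS_DEV_STAT_VALUES_MAX counters. The item_size check
 * above keeps any missing trailing values at 0 instead of reading past the
 * end of the on-disk item.
 */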
7482
7483 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
7484 {
7485         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
7486         struct btrfs_device *device;
7487         struct btrfs_path *path = NULL;
7488         int ret = 0;
7489
7490         path = btrfs_alloc_path();
7491         if (!path)
7492                 return -ENOMEM;
7493
7494         mutex_lock(&fs_devices->device_list_mutex);
7495         list_for_each_entry(device, &fs_devices->devices, dev_list) {
7496                 ret = btrfs_device_init_dev_stats(device, path);
7497                 if (ret)
7498                         goto out;
7499         }
7500         list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
7501                 list_for_each_entry(device, &seed_devs->devices, dev_list) {
7502                         ret = btrfs_device_init_dev_stats(device, path);
7503                         if (ret)
7504                                 goto out;
7505                 }
7506         }
7507 out:
7508         mutex_unlock(&fs_devices->device_list_mutex);
7509
7510         btrfs_free_path(path);
7511         return ret;
7512 }
7513
7514 static int update_dev_stat_item(struct btrfs_trans_handle *trans,
7515                                 struct btrfs_device *device)
7516 {
7517         struct btrfs_fs_info *fs_info = trans->fs_info;
7518         struct btrfs_root *dev_root = fs_info->dev_root;
7519         struct btrfs_path *path;
7520         struct btrfs_key key;
7521         struct extent_buffer *eb;
7522         struct btrfs_dev_stats_item *ptr;
7523         int ret;
7524         int i;
7525
7526         key.objectid = BTRFS_DEV_STATS_OBJECTID;
7527         key.type = BTRFS_PERSISTENT_ITEM_KEY;
7528         key.offset = device->devid;
7529
7530         path = btrfs_alloc_path();
7531         if (!path)
7532                 return -ENOMEM;
7533         ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
7534         if (ret < 0) {
7535                 btrfs_warn_in_rcu(fs_info,
7536                         "error %d while searching for dev_stats item for device %s",
7537                               ret, rcu_str_deref(device->name));
7538                 goto out;
7539         }
7540
7541         if (ret == 0 &&
7542             btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
7543                 /* need to delete old one and insert a new one */
7544                 ret = btrfs_del_item(trans, dev_root, path);
7545                 if (ret != 0) {
7546                         btrfs_warn_in_rcu(fs_info,
7547                                 "delete too small dev_stats item for device %s failed %d",
7548                                       rcu_str_deref(device->name), ret);
7549                         goto out;
7550                 }
7551                 ret = 1;
7552         }
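	/*
	 * (Note: btrfs_search_slot() returns 0 when the key was found and 1
	 * when it was not, with the path pointing at the insert position;
	 * forcing ret to 1 above funnels the "deleted a too-small item" case
	 * into the insert branch below.)
	 */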
7553
7554         if (ret == 1) {
7555                 /* need to insert a new item */
7556                 btrfs_release_path(path);
7557                 ret = btrfs_insert_empty_item(trans, dev_root, path,
7558                                               &key, sizeof(*ptr));
7559                 if (ret < 0) {
7560                         btrfs_warn_in_rcu(fs_info,
7561                                 "insert dev_stats item for device %s failed %d",
7562                                 rcu_str_deref(device->name), ret);
7563                         goto out;
7564                 }
7565         }
7566
7567         eb = path->nodes[0];
7568         ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
7569         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7570                 btrfs_set_dev_stats_value(eb, ptr, i,
7571                                           btrfs_dev_stat_read(device, i));
7572         btrfs_mark_buffer_dirty(eb);
7573
7574 out:
7575         btrfs_free_path(path);
7576         return ret;
7577 }
7578
7579 /*
7580  * Called from commit_transaction. Writes all changed device stats to disk.
7581  */
7582 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
7583 {
7584         struct btrfs_fs_info *fs_info = trans->fs_info;
7585         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7586         struct btrfs_device *device;
7587         int stats_cnt;
7588         int ret = 0;
7589
7590         mutex_lock(&fs_devices->device_list_mutex);
7591         list_for_each_entry(device, &fs_devices->devices, dev_list) {
7592                 stats_cnt = atomic_read(&device->dev_stats_ccnt);
7593                 if (!device->dev_stats_valid || stats_cnt == 0)
7594                         continue;
7595
7597                 /*
7598                  * There is a LOAD-LOAD control dependency between the value of
7599                  * dev_stats_ccnt and updating the on-disk values which requires
7600                  * reading the in-memory counters. Such control dependencies
7601                  * require explicit read memory barriers.
7602                  *
7603                  * This memory barriers pairs with smp_mb__before_atomic in
7604                  * This memory barrier pairs with smp_mb__before_atomic in
7605                  * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
7606                  * barrier implied by atomic_xchg in
7607                  * btrfs_dev_stats_read_and_reset.
7608                 smp_rmb();
7609
7610                 ret = update_dev_stat_item(trans, device);
7611                 if (!ret)
7612                         atomic_sub(stats_cnt, &device->dev_stats_ccnt);
7613         }
7614         mutex_unlock(&fs_devices->device_list_mutex);
7615
7616         return ret;
7617 }
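/*
 * Barrier pairing sketch for the smp_rmb() above (illustrative):
 *
 *	writer (btrfs_dev_stat_inc)        reader (btrfs_run_dev_stats)
 *	  update dev_stat_values[i]          stats_cnt = dev_stats_ccnt
 *	  smp_mb__before_atomic()            smp_rmb()
 *	  atomic_inc(&dev_stats_ccnt)        read dev_stat_values[]
 */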
7618
7619 void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
7620 {
7621         btrfs_dev_stat_inc(dev, index);
7622         btrfs_dev_stat_print_on_error(dev);
7623 }
7624
7625 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
7626 {
7627         if (!dev->dev_stats_valid)
7628                 return;
7629         btrfs_err_rl_in_rcu(dev->fs_info,
7630                 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7631                            rcu_str_deref(dev->name),
7632                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
7633                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
7634                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7635                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
7636                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7637 }
7638
7639 static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
7640 {
7641         int i;
7642
7643         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7644                 if (btrfs_dev_stat_read(dev, i) != 0)
7645                         break;
7646         if (i == BTRFS_DEV_STAT_VALUES_MAX)
7647                 return; /* all values == 0, suppress message */
7648
7649         btrfs_info_in_rcu(dev->fs_info,
7650                 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7651                rcu_str_deref(dev->name),
7652                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
7653                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
7654                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7655                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
7656                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7657 }
7658
7659 int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
7660                         struct btrfs_ioctl_get_dev_stats *stats)
7661 {
7662         struct btrfs_device *dev;
7663         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7664         int i;
7665
7666         mutex_lock(&fs_devices->device_list_mutex);
7667         dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL);
7668         mutex_unlock(&fs_devices->device_list_mutex);
7669
7670         if (!dev) {
7671                 btrfs_warn(fs_info, "get dev_stats failed, device not found");
7672                 return -ENODEV;
7673         } else if (!dev->dev_stats_valid) {
7674                 btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
7675                 return -ENODEV;
7676         } else if (stats->flags & BTRFS_DEV_STATS_RESET) {
7677                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7678                         if (stats->nr_items > i)
7679                                 stats->values[i] =
7680                                         btrfs_dev_stat_read_and_reset(dev, i);
7681                         else
7682                                 btrfs_dev_stat_set(dev, i, 0);
7683                 }
7684                 btrfs_info(fs_info, "device stats zeroed by %s (%d)",
7685                            current->comm, task_pid_nr(current));
7686         } else {
7687                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7688                         if (stats->nr_items > i)
7689                                 stats->values[i] = btrfs_dev_stat_read(dev, i);
7690         }
7691         if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
7692                 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
7693         return 0;
7694 }
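/*
 * Semantics sketch for the ioctl handler above: with BTRFS_DEV_STATS_RESET
 * each counter is returned and zeroed via an atomic read-and-reset,
 * otherwise the counters are only read; nr_items is clamped to
 * BTRFS_DEV_STAT_VALUES_MAX in both cases.
 */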
7695
7696 /*
7697  * Update the size and bytes used for each device where it changed.  This is
7698  * delayed since we would otherwise get errors while writing out the
7699  * superblocks.
7700  *
7701  * Must be invoked during transaction commit.
7702  */
7703 void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
7704 {
7705         struct btrfs_device *curr, *next;
7706
7707         ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);
7708
7709         if (list_empty(&trans->dev_update_list))
7710                 return;
7711
7712         /*
7713          * We don't need the device_list_mutex here.  This list is owned by the
7714          * transaction and the transaction must complete before the device is
7715          * released.
7716          */
7717         mutex_lock(&trans->fs_info->chunk_mutex);
7718         list_for_each_entry_safe(curr, next, &trans->dev_update_list,
7719                                  post_commit_list) {
7720                 list_del_init(&curr->post_commit_list);
7721                 curr->commit_total_bytes = curr->disk_total_bytes;
7722                 curr->commit_bytes_used = curr->bytes_used;
7723         }
7724         mutex_unlock(&trans->fs_info->chunk_mutex);
7725 }
7726
7727 /*
7728  * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
7729  */
7730 int btrfs_bg_type_to_factor(u64 flags)
7731 {
7732         const int index = btrfs_bg_flags_to_raid_index(flags);
7733
7734         return btrfs_raid_array[index].ncopies;
7735 }
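/*
 * Example values (taken from btrfs_raid_array): RAID1 and DUP store two
 * copies, so btrfs_bg_type_to_factor() returns 2 for them, while SINGLE
 * and RAID0 yield 1.
 */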
7736
7739 static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
7740                                  u64 chunk_offset, u64 devid,
7741                                  u64 physical_offset, u64 physical_len)
7742 {
7743         struct extent_map_tree *em_tree = &fs_info->mapping_tree;
7744         struct extent_map *em;
7745         struct map_lookup *map;
7746         struct btrfs_device *dev;
7747         u64 stripe_len;
7748         bool found = false;
7749         int ret = 0;
7750         int i;
7751
7752         read_lock(&em_tree->lock);
7753         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
7754         read_unlock(&em_tree->lock);
7755
7756         if (!em) {
7757                 btrfs_err(fs_info,
7758 "dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
7759                           physical_offset, devid);
7760                 ret = -EUCLEAN;
7761                 goto out;
7762         }
7763
7764         map = em->map_lookup;
7765         stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
7766         if (physical_len != stripe_len) {
7767                 btrfs_err(fs_info,
7768 "dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
7769                           physical_offset, devid, em->start, physical_len,
7770                           stripe_len);
7771                 ret = -EUCLEAN;
7772                 goto out;
7773         }
7774
7775         for (i = 0; i < map->num_stripes; i++) {
7776                 if (map->stripes[i].dev->devid == devid &&
7777                     map->stripes[i].physical == physical_offset) {
7778                         found = true;
7779                         if (map->verified_stripes >= map->num_stripes) {
7780                                 btrfs_err(fs_info,
7781                                 "too many dev extents for chunk %llu found",
7782                                           em->start);
7783                                 ret = -EUCLEAN;
7784                                 goto out;
7785                         }
7786                         map->verified_stripes++;
7787                         break;
7788                 }
7789         }
7790         if (!found) {
7791                 btrfs_err(fs_info,
7792         "dev extent physical offset %llu devid %llu has no corresponding chunk",
7793                         physical_offset, devid);
7794                 ret = -EUCLEAN;
7795         }
7796
7797         /* Make sure no dev extent is beyond the device boundary */
7798         dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
7799         if (!dev) {
7800                 btrfs_err(fs_info, "failed to find devid %llu", devid);
7801                 ret = -EUCLEAN;
7802                 goto out;
7803         }
7804
7805         if (physical_offset + physical_len > dev->disk_total_bytes) {
7806                 btrfs_err(fs_info,
7807 "dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
7808                           devid, physical_offset, physical_len,
7809                           dev->disk_total_bytes);
7810                 ret = -EUCLEAN;
7811                 goto out;
7812         }
7813
7814         if (dev->zone_info) {
7815                 u64 zone_size = dev->zone_info->zone_size;
7816
7817                 if (!IS_ALIGNED(physical_offset, zone_size) ||
7818                     !IS_ALIGNED(physical_len, zone_size)) {
7819                         btrfs_err(fs_info,
7820 "zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
7821                                   devid, physical_offset, physical_len);
7822                         ret = -EUCLEAN;
7823                         goto out;
7824                 }
7825         }
7826
7827 out:
7828         free_extent_map(em);
7829         return ret;
7830 }
7831
7832 static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
7833 {
7834         struct extent_map_tree *em_tree = &fs_info->mapping_tree;
7835         struct extent_map *em;
7836         struct rb_node *node;
7837         int ret = 0;
7838
7839         read_lock(&em_tree->lock);
7840         for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
7841                 em = rb_entry(node, struct extent_map, rb_node);
7842                 if (em->map_lookup->num_stripes !=
7843                     em->map_lookup->verified_stripes) {
7844                         btrfs_err(fs_info,
7845                         "chunk %llu has missing dev extent, have %d expect %d",
7846                                   em->start, em->map_lookup->verified_stripes,
7847                                   em->map_lookup->num_stripes);
7848                         ret = -EUCLEAN;
7849                         goto out;
7850                 }
7851         }
7852 out:
7853         read_unlock(&em_tree->lock);
7854         return ret;
7855 }
7856
7857 /*
7858  * Ensure that all dev extents are mapped to the correct chunk, otherwise
7859  * later chunk allocation/free would cause unexpected behavior.
7860  *
7861  * NOTE: This will iterate through the whole device tree, which should be
7862  * about the same size as the chunk tree.  This slightly increases mount time.
7863  */
7864 int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
7865 {
7866         struct btrfs_path *path;
7867         struct btrfs_root *root = fs_info->dev_root;
7868         struct btrfs_key key;
7869         u64 prev_devid = 0;
7870         u64 prev_dev_ext_end = 0;
7871         int ret = 0;
7872
7873         /*
7874          * We don't have a dev_root because we mounted with ignorebadroots and
7875          * failed to load the root, so we want to skip the verification in this
7876          * case for sure.
7877          *
7878          * However if the dev root is fine, but the tree itself is corrupted
7879          * we'd still fail to mount.  This verification is only to make sure
7880          * writes can happen safely, so instead just bypass this check
7881          * completely in the case of IGNOREBADROOTS.
7882          */
7883         if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
7884                 return 0;
7885
7886         key.objectid = 1;
7887         key.type = BTRFS_DEV_EXTENT_KEY;
7888         key.offset = 0;
7889
7890         path = btrfs_alloc_path();
7891         if (!path)
7892                 return -ENOMEM;
7893
7894         path->reada = READA_FORWARD;
7895         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
7896         if (ret < 0)
7897                 goto out;
7898
7899         if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
7900                 ret = btrfs_next_item(root, path);
7901                 if (ret < 0)
7902                         goto out;
7903                 /* No dev extents at all? Not good */
7904                 if (ret > 0) {
7905                         ret = -EUCLEAN;
7906                         goto out;
7907                 }
7908         }
7909         while (1) {
7910                 struct extent_buffer *leaf = path->nodes[0];
7911                 struct btrfs_dev_extent *dext;
7912                 int slot = path->slots[0];
7913                 u64 chunk_offset;
7914                 u64 physical_offset;
7915                 u64 physical_len;
7916                 u64 devid;
7917
7918                 btrfs_item_key_to_cpu(leaf, &key, slot);
7919                 if (key.type != BTRFS_DEV_EXTENT_KEY)
7920                         break;
7921                 devid = key.objectid;
7922                 physical_offset = key.offset;
7923
7924                 dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
7925                 chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
7926                 physical_len = btrfs_dev_extent_length(leaf, dext);
7927
7928                 /* Check if this dev extent overlaps with the previous one */
7929                 if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
7930                         btrfs_err(fs_info,
7931 "dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
7932                                   devid, physical_offset, prev_dev_ext_end);
7933                         ret = -EUCLEAN;
7934                         goto out;
7935                 }
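		/*
		 * (Dev extent keys are (devid, DEV_EXTENT, physical_offset),
		 * so items arrive sorted by devid and then offset; comparing
		 * against the previous extent's end is therefore enough to
		 * detect any overlap.)
		 */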
7936
7937                 ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
7938                                             physical_offset, physical_len);
7939                 if (ret < 0)
7940                         goto out;
7941                 prev_devid = devid;
7942                 prev_dev_ext_end = physical_offset + physical_len;
7943
7944                 ret = btrfs_next_item(root, path);
7945                 if (ret < 0)
7946                         goto out;
7947                 if (ret > 0) {
7948                         ret = 0;
7949                         break;
7950                 }
7951         }
7952
7953         /* Ensure all chunks have corresponding dev extents */
7954         ret = verify_chunk_dev_extent_mapping(fs_info);
7955 out:
7956         btrfs_free_path(path);
7957         return ret;
7958 }
7959
7960 /*
7961  * Check whether the given block group or device is pinned by any inode being
7962  * used as a swapfile.
7963  */
7964 bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
7965 {
7966         struct btrfs_swapfile_pin *sp;
7967         struct rb_node *node;
7968
7969         spin_lock(&fs_info->swapfile_pins_lock);
7970         node = fs_info->swapfile_pins.rb_node;
7971         while (node) {
7972                 sp = rb_entry(node, struct btrfs_swapfile_pin, node);
7973                 if (ptr < sp->ptr)
7974                         node = node->rb_left;
7975                 else if (ptr > sp->ptr)
7976                         node = node->rb_right;
7977                 else
7978                         break;
7979         }
7980         spin_unlock(&fs_info->swapfile_pins_lock);
7981         return node != NULL;
7982 }
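/*
 * Usage sketch (hypothetical helper, for illustration only): refusing an
 * operation on a device that is pinned by an active swapfile, as a caller
 * like device removal would.
 */
static int __maybe_unused check_device_not_swap_pinned(struct btrfs_device *device)
{
	if (btrfs_pinned_by_swapfile(device->fs_info, device))
		return -ETXTBSY;
	return 0;
}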
7983
7984 static int relocating_repair_kthread(void *data)
7985 {
7986         struct btrfs_block_group *cache = data;
7987         struct btrfs_fs_info *fs_info = cache->fs_info;
7988         u64 target;
7989         int ret = 0;
7990
7991         target = cache->start;
7992         btrfs_put_block_group(cache);
7993
7994         if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
7995                 btrfs_info(fs_info,
7996                            "zoned: skip relocating block group %llu to repair: EBUSY",
7997                            target);
7998                 return -EBUSY;
7999         }
8000
8001         mutex_lock(&fs_info->delete_unused_bgs_mutex);
8002
8003         /* Ensure block group still exists */
8004         cache = btrfs_lookup_block_group(fs_info, target);
8005         if (!cache)
8006                 goto out;
8007
8008         if (!cache->relocating_repair)
8009                 goto out;
8010
8011         ret = btrfs_may_alloc_data_chunk(fs_info, target);
8012         if (ret < 0)
8013                 goto out;
8014
8015         btrfs_info(fs_info,
8016                    "zoned: relocating block group %llu to repair IO failure",
8017                    target);
8018         ret = btrfs_relocate_chunk(fs_info, target);
8019
8020 out:
8021         if (cache)
8022                 btrfs_put_block_group(cache);
8023         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
8024         btrfs_exclop_finish(fs_info);
8025
8026         return ret;
8027 }
8028
8029 int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
8030 {
8031         struct btrfs_block_group *cache;
8032
8033         /* Do not attempt to repair in degraded state */
8034         if (btrfs_test_opt(fs_info, DEGRADED))
8035                 return 0;
8036
8037         cache = btrfs_lookup_block_group(fs_info, logical);
8038         if (!cache)
8039                 return 0;
8040
8041         spin_lock(&cache->lock);
8042         if (cache->relocating_repair) {
8043                 spin_unlock(&cache->lock);
8044                 btrfs_put_block_group(cache);
8045                 return 0;
8046         }
8047         cache->relocating_repair = 1;
8048         spin_unlock(&cache->lock);
8049
8050         kthread_run(relocating_repair_kthread, cache,
8051                     "btrfs-relocating-repair");
8052
8053         return 0;
8054 }