fs/btrfs/zoned.c

   1 // SPDX-License-Identifier: GPL-2.0
   2
   3 #include <linux/bitops.h>
   4 #include <linux/slab.h>
   5 #include <linux/blkdev.h>
   6 #include <linux/sched/mm.h>
   7 #include "ctree.h"
   8 #include "volumes.h"
   9 #include "zoned.h"
  10 #include "rcu-string.h"
  11 #include "disk-io.h"
  12 #include "block-group.h"
  13 #include "transaction.h"
  14 #include "dev-replace.h"
  15 #include "space-info.h"
  16
  17 /* Maximum number of zones requested per zone report (size of the report buffer) */
  18 #define BTRFS_REPORT_NR_ZONES   4096
  19 /* Invalid allocation pointer value for missing devices */
  20 #define WP_MISSING_DEV ((u64)-1)
  21 /* Pseudo write pointer value for conventional zone */
  22 #define WP_CONVENTIONAL ((u64)-2)
  23
  24 /*
  25  * Location of the first zone of superblock logging zone pairs.
  26  *
  27  * - primary superblock:    0B (zone 0)
  28  * - first copy:          512G (zone starting at that offset)
  29  * - second copy:           4T (zone starting at that offset)
  30  */
  31 #define BTRFS_SB_LOG_PRIMARY_OFFSET     (0ULL)
  32 #define BTRFS_SB_LOG_FIRST_OFFSET       (512ULL * SZ_1G)
  33 #define BTRFS_SB_LOG_SECOND_OFFSET      (4096ULL * SZ_1G)
  34 /* log2 of the copy offsets; sb_zone_number() subtracts the zone size shift from them */
  35 #define BTRFS_SB_LOG_FIRST_SHIFT        const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET)
  36 #define BTRFS_SB_LOG_SECOND_SHIFT       const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET)
  37
  38 /* Number of consecutive zones that make up one superblock log */
  39 #define BTRFS_NR_SB_LOG_ZONES 2
  40
  41 /*
  42  * Maximum supported zone size. Currently, SMR disks have a zone size of
  43  * 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do not
  44  * expect the zone size to become larger than 8GiB in the near future.
  45  */
  46 #define BTRFS_MAX_ZONE_SIZE             SZ_8G
  47
  48 static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
  49 {
  50         struct blk_zone *zones = data;
  51
  52         memcpy(&zones[idx], zone, sizeof(*zone));
  53
  54         return 0;
  55 }
  56
  57 static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
  58                             u64 *wp_ret)
  59 {
  60         bool empty[BTRFS_NR_SB_LOG_ZONES];
  61         bool full[BTRFS_NR_SB_LOG_ZONES];
  62         sector_t sector;
  63
  64         ASSERT(zones[0].type != BLK_ZONE_TYPE_CONVENTIONAL &&
  65                zones[1].type != BLK_ZONE_TYPE_CONVENTIONAL);
  66
  67         empty[0] = (zones[0].cond == BLK_ZONE_COND_EMPTY);
  68         empty[1] = (zones[1].cond == BLK_ZONE_COND_EMPTY);
  69         full[0] = (zones[0].cond == BLK_ZONE_COND_FULL);
  70         full[1] = (zones[1].cond == BLK_ZONE_COND_FULL);
  71
  72         /*
  73          * Possible states of log buffer zones
  74          *
  75          *           Empty[0]  In use[0]  Full[0]
  76          * Empty[1]         *          x        0
  77          * In use[1]        0          x        0
  78          * Full[1]          1          1        C
  79          *
  80          * Log position:
  81          *   *: Special case, no superblock is written
  82          *   0: Use write pointer of zones[0]
  83          *   1: Use write pointer of zones[1]
  84          *   C: Compare super blcoks from zones[0] and zones[1], use the latest
  85          *      one determined by generation
  86          *   x: Invalid state
  87          */
  88
  89         if (empty[0] && empty[1]) {
  90                 /* Special case to distinguish no superblock to read */
  91                 *wp_ret = zones[0].start << SECTOR_SHIFT;
  92                 return -ENOENT;
  93         } else if (full[0] && full[1]) {
  94                 /* Compare two super blocks */
  95                 struct address_space *mapping = bdev->bd_inode->i_mapping;
  96                 struct page *page[BTRFS_NR_SB_LOG_ZONES];
  97                 struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];
  98                 int i;
  99
 100                 for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
 101                         u64 bytenr;
 102
 103                         bytenr = ((zones[i].start + zones[i].len)
 104                                    << SECTOR_SHIFT) - BTRFS_SUPER_INFO_SIZE;
 105
 106                         page[i] = read_cache_page_gfp(mapping,
 107                                         bytenr >> PAGE_SHIFT, GFP_NOFS);
 108                         if (IS_ERR(page[i])) {
 109                                 if (i == 1)
 110                                         btrfs_release_disk_super(super[0]);
 111                                 return PTR_ERR(page[i]);
 112                         }
 113                         super[i] = page_address(page[i]);
 114                 }
 115
 116                 if (super[0]->generation > super[1]->generation)
 117                         sector = zones[1].start;
 118                 else
 119                         sector = zones[0].start;
 120
 121                 for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
 122                         btrfs_release_disk_super(super[i]);
 123         } else if (!full[0] && (empty[1] || full[1])) {
 124                 sector = zones[0].wp;
 125         } else if (full[0]) {
 126                 sector = zones[1].wp;
 127         } else {
 128                 return -EUCLEAN;
 129         }
 130         *wp_ret = sector << SECTOR_SHIFT;
 131         return 0;
 132 }
 133
 134 /*
 135  * Get the first zone number of the superblock mirror
 136  */
 137 static inline u32 sb_zone_number(int shift, int mirror)
 138 {
 139         u64 zone;
 140
 141         ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
 142         switch (mirror) {
 143         case 0: zone = 0; break;
 144         case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break;
 145         case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break;
 146         }
 147
 148         ASSERT(zone <= U32_MAX);
 149
 150         return (u32)zone;
 151 }
 152
 153 /*
 154  * Emulate blkdev_report_zones() for a non-zoned device. It slices up the block
 155  * device into static sized chunks and fake a conventional zone on each of
 156  * them.
 157  */
 158 static int emulate_report_zones(struct btrfs_device *device, u64 pos,
 159                                 struct blk_zone *zones, unsigned int nr_zones)
 160 {
 161         const sector_t zone_sectors = device->fs_info->zone_size >> SECTOR_SHIFT;
 162         sector_t bdev_size = bdev_nr_sectors(device->bdev);
 163         unsigned int i;
 164
 165         pos >>= SECTOR_SHIFT;
 166         for (i = 0; i < nr_zones; i++) {
 167                 zones[i].start = i * zone_sectors + pos;
 168                 zones[i].len = zone_sectors;
 169                 zones[i].capacity = zone_sectors;
 170                 zones[i].wp = zones[i].start + zone_sectors;
 171                 zones[i].type = BLK_ZONE_TYPE_CONVENTIONAL;
 172                 zones[i].cond = BLK_ZONE_COND_NOT_WP;
 173
 174                 if (zones[i].wp >= bdev_size) {
 175                         i++;
 176                         break;
 177                 }
 178         }
 179
 180         return i;
 181 }
 182
 183 static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
 184                                struct blk_zone *zones, unsigned int *nr_zones)
 185 {
 186         int ret;
 187
 188         if (!*nr_zones)
 189                 return 0;
 190
 191         if (!bdev_is_zoned(device->bdev)) {
 192                 ret = emulate_report_zones(device, pos, zones, *nr_zones);
 193                 *nr_zones = ret;
 194                 return 0;
 195         }
 196
 197         ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
 198                                   copy_zone_info_cb, zones);
 199         if (ret < 0) {
 200                 btrfs_err_in_rcu(device->fs_info,
 201                                  "zoned: failed to read zone %llu on %s (devid %llu)",
 202                                  pos, rcu_str_deref(device->name),
 203                                  device->devid);
 204                 return ret;
 205         }
 206         *nr_zones = ret;
 207         if (!ret)
 208                 return -EIO;
 209
 210         return 0;
 211 }
 212
 213 /* The emulated zone size is determined from the size of device extent */
 214 static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
 215 {
 216         struct btrfs_path *path;
 217         struct btrfs_root *root = fs_info->dev_root;
 218         struct btrfs_key key;
 219         struct extent_buffer *leaf;
 220         struct btrfs_dev_extent *dext;
 221         int ret = 0;
 222
 223         key.objectid = 1;
 224         key.type = BTRFS_DEV_EXTENT_KEY;
 225         key.offset = 0;
 226
 227         path = btrfs_alloc_path();
 228         if (!path)
 229                 return -ENOMEM;
 230
 231         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 232         if (ret < 0)
 233                 goto out;
 234
 235         if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
 236                 ret = btrfs_next_item(root, path);
 237                 if (ret < 0)
 238                         goto out;
 239                 /* No dev extents at all? Not good */
 240                 if (ret > 0) {
 241                         ret = -EUCLEAN;
 242                         goto out;
 243                 }
 244         }
 245
 246         leaf = path->nodes[0];
 247         dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
 248         fs_info->zone_size = btrfs_dev_extent_length(leaf, dext);
 249         ret = 0;
 250
 251 out:
 252         btrfs_free_path(path);
 253
 254         return ret;
 255 }
 256
 257 int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
 258 {
 259         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 260         struct btrfs_device *device;
 261         int ret = 0;
 262
 263         /* fs_info->zone_size might not set yet. Use the incomapt flag here. */
 264         if (!btrfs_fs_incompat(fs_info, ZONED))
 265                 return 0;
 266
 267         mutex_lock(&fs_devices->device_list_mutex);
 268         list_for_each_entry(device, &fs_devices->devices, dev_list) {
 269                 /* We can skip reading of zone info for missing devices */
 270                 if (!device->bdev)
 271                         continue;
 272
 273                 ret = btrfs_get_dev_zone_info(device);
 274                 if (ret)
 275                         break;
 276         }
 277         mutex_unlock(&fs_devices->device_list_mutex);
 278
 279         return ret;
 280 }
 281
 282 int btrfs_get_dev_zone_info(struct btrfs_device *device)
 283 {
 284         struct btrfs_fs_info *fs_info = device->fs_info;
 285         struct btrfs_zoned_device_info *zone_info = NULL;
 286         struct block_device *bdev = device->bdev;
 287         struct request_queue *queue = bdev_get_queue(bdev);
 288         sector_t nr_sectors;
 289         sector_t sector = 0;
 290         struct blk_zone *zones = NULL;
 291         unsigned int i, nreported = 0, nr_zones;
 292         sector_t zone_sectors;
 293         char *model, *emulated;
 294         int ret;
 295
 296         /*
 297          * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not
 298          * yet be set.
 299          */
 300         if (!btrfs_fs_incompat(fs_info, ZONED))
 301                 return 0;
 302
 303         if (device->zone_info)
 304                 return 0;
 305
 306         zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL);
 307         if (!zone_info)
 308                 return -ENOMEM;
 309
 310         if (!bdev_is_zoned(bdev)) {
 311                 if (!fs_info->zone_size) {
 312                         ret = calculate_emulated_zone_size(fs_info);
 313                         if (ret)
 314                                 goto out;
 315                 }
 316
 317                 ASSERT(fs_info->zone_size);
 318                 zone_sectors = fs_info->zone_size >> SECTOR_SHIFT;
 319         } else {
 320                 zone_sectors = bdev_zone_sectors(bdev);
 321         }
 322
 323         /* Check if it's power of 2 (see is_power_of_2) */
 324         ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0);
 325         zone_info->zone_size = zone_sectors << SECTOR_SHIFT;
 326
 327         /* We reject devices with a zone size larger than 8GB */
 328         if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) {
 329                 btrfs_err_in_rcu(fs_info,
 330                 "zoned: %s: zone size %llu larger than supported maximum %llu",
 331                                  rcu_str_deref(device->name),
 332                                  zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
 333                 ret = -EINVAL;
 334                 goto out;
 335         }
 336
 337         nr_sectors = bdev_nr_sectors(bdev);
 338         zone_info->zone_size_shift = ilog2(zone_info->zone_size);
 339         zone_info->max_zone_append_size =
 340                 (u64)queue_max_zone_append_sectors(queue) << SECTOR_SHIFT;
 341         zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
 342         if (!IS_ALIGNED(nr_sectors, zone_sectors))
 343                 zone_info->nr_zones++;
 344
 345         if (bdev_is_zoned(bdev) && zone_info->max_zone_append_size == 0) {
 346                 btrfs_err(fs_info, "zoned: device %pg does not support zone append",
 347                           bdev);
 348                 ret = -EINVAL;
 349                 goto out;
 350         }
 351
 352         zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
 353         if (!zone_info->seq_zones) {
 354                 ret = -ENOMEM;
 355                 goto out;
 356         }
 357
 358         zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
 359         if (!zone_info->empty_zones) {
 360                 ret = -ENOMEM;
 361                 goto out;
 362         }
 363
 364         zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
 365         if (!zones) {
 366                 ret = -ENOMEM;
 367                 goto out;
 368         }
 369
 370         /* Get zones type */
 371         while (sector < nr_sectors) {
 372                 nr_zones = BTRFS_REPORT_NR_ZONES;
 373                 ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,
 374                                           &nr_zones);
 375                 if (ret)
 376                         goto out;
 377
 378                 for (i = 0; i < nr_zones; i++) {
 379                         if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
 380                                 __set_bit(nreported, zone_info->seq_zones);
 381                         if (zones[i].cond == BLK_ZONE_COND_EMPTY)
 382                                 __set_bit(nreported, zone_info->empty_zones);
 383                         nreported++;
 384                 }
 385                 sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
 386         }
 387
 388         if (nreported != zone_info->nr_zones) {
 389                 btrfs_err_in_rcu(device->fs_info,
 390                                  "inconsistent number of zones on %s (%u/%u)",
 391                                  rcu_str_deref(device->name), nreported,
 392                                  zone_info->nr_zones);
 393                 ret = -EIO;
 394                 goto out;
 395         }
 396
 397         /* Validate superblock log */
 398         nr_zones = BTRFS_NR_SB_LOG_ZONES;
 399         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
 400                 u32 sb_zone;
 401                 u64 sb_wp;
 402                 int sb_pos = BTRFS_NR_SB_LOG_ZONES * i;
 403
 404                 sb_zone = sb_zone_number(zone_info->zone_size_shift, i);
 405                 if (sb_zone + 1 >= zone_info->nr_zones)
 406                         continue;
 407
 408                 sector = sb_zone << (zone_info->zone_size_shift - SECTOR_SHIFT);
 409                 ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT,
 410                                           &zone_info->sb_zones[sb_pos],
 411                                           &nr_zones);
 412                 if (ret)
 413                         goto out;
 414
 415                 if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
 416                         btrfs_err_in_rcu(device->fs_info,
 417         "zoned: failed to read super block log zone info at devid %llu zone %u",
 418                                          device->devid, sb_zone);
 419                         ret = -EUCLEAN;
 420                         goto out;
 421                 }
 422
 423                 /*
 424                  * If zones[0] is conventional, always use the beggining of the
 425                  * zone to record superblock. No need to validate in that case.
 426                  */
 427                 if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type ==
 428                     BLK_ZONE_TYPE_CONVENTIONAL)
 429                         continue;
 430
 431                 ret = sb_write_pointer(device->bdev,
 432                                        &zone_info->sb_zones[sb_pos], &sb_wp);
 433                 if (ret != -ENOENT && ret) {
 434                         btrfs_err_in_rcu(device->fs_info,
 435                         "zoned: super block log zone corrupted devid %llu zone %u",
 436                                          device->devid, sb_zone);
 437                         ret = -EUCLEAN;
 438                         goto out;
 439                 }
 440         }
 441
 442
 443         kfree(zones);
 444
 445         device->zone_info = zone_info;
 446
 447         switch (bdev_zoned_model(bdev)) {
 448         case BLK_ZONED_HM:
 449                 model = "host-managed zoned";
 450                 emulated = "";
 451                 break;
 452         case BLK_ZONED_HA:
 453                 model = "host-aware zoned";
 454                 emulated = "";
 455                 break;
 456         case BLK_ZONED_NONE:
 457                 model = "regular";
 458                 emulated = "emulated ";
 459                 break;
 460         default:
 461                 /* Just in case */
 462                 btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s",
 463                                  bdev_zoned_model(bdev),
 464                                  rcu_str_deref(device->name));
 465                 ret = -EOPNOTSUPP;
 466                 goto out_free_zone_info;
 467         }
 468
 469         btrfs_info_in_rcu(fs_info,
 470                 "%s block device %s, %u %szones of %llu bytes",
 471                 model, rcu_str_deref(device->name), zone_info->nr_zones,
 472                 emulated, zone_info->zone_size);
 473
 474         return 0;
 475
 476 out:
 477         kfree(zones);
 478 out_free_zone_info:
 479         bitmap_free(zone_info->empty_zones);
 480         bitmap_free(zone_info->seq_zones);
 481         kfree(zone_info);
 482         device->zone_info = NULL;
 483
 484         return ret;
 485 }
 486
 487 void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
 488 {
 489         struct btrfs_zoned_device_info *zone_info = device->zone_info;
 490
 491         if (!zone_info)
 492                 return;
 493
 494         bitmap_free(zone_info->seq_zones);
 495         bitmap_free(zone_info->empty_zones);
 496         kfree(zone_info);
 497         device->zone_info = NULL;
 498 }
 499
 500 int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
 501                        struct blk_zone *zone)
 502 {
 503         unsigned int nr_zones = 1;
 504         int ret;
 505
 506         ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones);
 507         if (ret != 0 || !nr_zones)
 508                 return ret ? ret : -EIO;
 509
 510         return 0;
 511 }
 512
 513 int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
 514 {
 515         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 516         struct btrfs_device *device;
 517         u64 zoned_devices = 0;
 518         u64 nr_devices = 0;
 519         u64 zone_size = 0;
 520         u64 max_zone_append_size = 0;
 521         const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED);
 522         int ret = 0;
 523
 524         /* Count zoned devices */
 525         list_for_each_entry(device, &fs_devices->devices, dev_list) {
 526                 enum blk_zoned_model model;
 527
 528                 if (!device->bdev)
 529                         continue;
 530
 531                 model = bdev_zoned_model(device->bdev);
 532                 /*
 533                  * A Host-Managed zoned device must be used as a zoned device.
 534                  * A Host-Aware zoned device and a non-zoned devices can be
 535                  * treated as a zoned device, if ZONED flag is enabled in the
 536                  * superblock.
 537                  */
 538                 if (model == BLK_ZONED_HM ||
 539                     (model == BLK_ZONED_HA && incompat_zoned) ||
 540                     (model == BLK_ZONED_NONE && incompat_zoned)) {
 541                         struct btrfs_zoned_device_info *zone_info =
 542                                 device->zone_info;
 543
 544                         zone_info = device->zone_info;
 545                         zoned_devices++;
 546                         if (!zone_size) {
 547                                 zone_size = zone_info->zone_size;
 548                         } else if (zone_info->zone_size != zone_size) {
 549                                 btrfs_err(fs_info,
 550                 "zoned: unequal block device zone sizes: have %llu found %llu",
 551                                           device->zone_info->zone_size,
 552                                           zone_size);
 553                                 ret = -EINVAL;
 554                                 goto out;
 555                         }
 556                         if (!max_zone_append_size ||
 557                             (zone_info->max_zone_append_size &&
 558                              zone_info->max_zone_append_size < max_zone_append_size))
 559                                 max_zone_append_size =
 560                                         zone_info->max_zone_append_size;
 561                 }
 562                 nr_devices++;
 563         }
 564
 565         if (!zoned_devices && !incompat_zoned)
 566                 goto out;
 567
 568         if (!zoned_devices && incompat_zoned) {
 569                 /* No zoned block device found on ZONED filesystem */
 570                 btrfs_err(fs_info,
 571                           "zoned: no zoned devices found on a zoned filesystem");
 572                 ret = -EINVAL;
 573                 goto out;
 574         }
 575
 576         if (zoned_devices && !incompat_zoned) {
 577                 btrfs_err(fs_info,
 578                           "zoned: mode not enabled but zoned device found");
 579                 ret = -EINVAL;
 580                 goto out;
 581         }
 582
 583         if (zoned_devices != nr_devices) {
 584                 btrfs_err(fs_info,
 585                           "zoned: cannot mix zoned and regular devices");
 586                 ret = -EINVAL;
 587                 goto out;
 588         }
 589
 590         /*
 591          * stripe_size is always aligned to BTRFS_STRIPE_LEN in
 592          * __btrfs_alloc_chunk(). Since we want stripe_len == zone_size,
 593          * check the alignment here.
 594          */
 595         if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
 596                 btrfs_err(fs_info,
 597                           "zoned: zone size %llu not aligned to stripe %u",
 598                           zone_size, BTRFS_STRIPE_LEN);
 599                 ret = -EINVAL;
 600                 goto out;
 601         }
 602
 603         if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
 604                 btrfs_err(fs_info, "zoned: mixed block groups not supported");
 605                 ret = -EINVAL;
 606                 goto out;
 607         }
 608
 609         fs_info->zone_size = zone_size;
 610         fs_info->max_zone_append_size = max_zone_append_size;
 611         fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
 612
 613         /*
 614          * Check mount options here, because we might change fs_info->zoned
 615          * from fs_info->zone_size.
 616          */
 617         ret = btrfs_check_mountopts_zoned(fs_info);
 618         if (ret)
 619                 goto out;
 620
 621         btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);
 622 out:
 623         return ret;
 624 }
 625
 626 int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
 627 {
 628         if (!btrfs_is_zoned(info))
 629                 return 0;
 630
 631         /*
 632          * Space cache writing is not COWed. Disable that to avoid write errors
 633          * in sequential zones.
 634          */
 635         if (btrfs_test_opt(info, SPACE_CACHE)) {
 636                 btrfs_err(info, "zoned: space cache v1 is not supported");
 637                 return -EINVAL;
 638         }
 639
 640         if (btrfs_test_opt(info, NODATACOW)) {
 641                 btrfs_err(info, "zoned: NODATACOW not supported");
 642                 return -EINVAL;
 643         }
 644
 645         return 0;
 646 }
 647
 648 static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
 649                            int rw, u64 *bytenr_ret)
 650 {
 651         u64 wp;
 652         int ret;
 653
 654         if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) {
 655                 *bytenr_ret = zones[0].start << SECTOR_SHIFT;
 656                 return 0;
 657         }
 658
 659         ret = sb_write_pointer(bdev, zones, &wp);
 660         if (ret != -ENOENT && ret < 0)
 661                 return ret;
 662
 663         if (rw == WRITE) {
 664                 struct blk_zone *reset = NULL;
 665
 666                 if (wp == zones[0].start << SECTOR_SHIFT)
 667                         reset = &zones[0];
 668                 else if (wp == zones[1].start << SECTOR_SHIFT)
 669                         reset = &zones[1];
 670
 671                 if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
 672                         ASSERT(reset->cond == BLK_ZONE_COND_FULL);
 673
 674                         ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
 675                                                reset->start, reset->len,
 676                                                GFP_NOFS);
 677                         if (ret)
 678                                 return ret;
 679
 680                         reset->cond = BLK_ZONE_COND_EMPTY;
 681                         reset->wp = reset->start;
 682                 }
 683         } else if (ret != -ENOENT) {
 684                 /* For READ, we want the precious one */
 685                 if (wp == zones[0].start << SECTOR_SHIFT)
 686                         wp = (zones[1].start + zones[1].len) << SECTOR_SHIFT;
 687                 wp -= BTRFS_SUPER_INFO_SIZE;
 688         }
 689
 690         *bytenr_ret = wp;
 691         return 0;
 692
 693 }
 694
 695 int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
 696                                u64 *bytenr_ret)
 697 {
 698         struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES];
 699         sector_t zone_sectors;
 700         u32 sb_zone;
 701         int ret;
 702         u8 zone_sectors_shift;
 703         sector_t nr_sectors;
 704         u32 nr_zones;
 705
 706         if (!bdev_is_zoned(bdev)) {
 707                 *bytenr_ret = btrfs_sb_offset(mirror);
 708                 return 0;
 709         }
 710
 711         ASSERT(rw == READ || rw == WRITE);
 712
 713         zone_sectors = bdev_zone_sectors(bdev);
 714         if (!is_power_of_2(zone_sectors))
 715                 return -EINVAL;
 716         zone_sectors_shift = ilog2(zone_sectors);
 717         nr_sectors = bdev_nr_sectors(bdev);
 718         nr_zones = nr_sectors >> zone_sectors_shift;
 719
 720         sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
 721         if (sb_zone + 1 >= nr_zones)
 722                 return -ENOENT;
 723
 724         ret = blkdev_report_zones(bdev, sb_zone << zone_sectors_shift,
 725                                   BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
 726                                   zones);
 727         if (ret < 0)
 728                 return ret;
 729         if (ret != BTRFS_NR_SB_LOG_ZONES)
 730                 return -EIO;
 731
 732         return sb_log_location(bdev, zones, rw, bytenr_ret);
 733 }
 734
 735 int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
 736                           u64 *bytenr_ret)
 737 {
 738         struct btrfs_zoned_device_info *zinfo = device->zone_info;
 739         u32 zone_num;
 740
 741         /*
 742          * For a zoned filesystem on a non-zoned block device, use the same
 743          * super block locations as regular filesystem. Doing so, the super
 744          * block can always be retrieved and the zoned flag of the volume
 745          * detected from the super block information.
 746          */
 747         if (!bdev_is_zoned(device->bdev)) {
 748                 *bytenr_ret = btrfs_sb_offset(mirror);
 749                 return 0;
 750         }
 751
 752         zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
 753         if (zone_num + 1 >= zinfo->nr_zones)
 754                 return -ENOENT;
 755
 756         return sb_log_location(device->bdev,
 757                                &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror],
 758                                rw, bytenr_ret);
 759 }
 760
 761 static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo,
 762                                   int mirror)
 763 {
 764         u32 zone_num;
 765
 766         if (!zinfo)
 767                 return false;
 768
 769         zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
 770         if (zone_num + 1 >= zinfo->nr_zones)
 771                 return false;
 772
 773         if (!test_bit(zone_num, zinfo->seq_zones))
 774                 return false;
 775
 776         return true;
 777 }
 778
 779 void btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
 780 {
 781         struct btrfs_zoned_device_info *zinfo = device->zone_info;
 782         struct blk_zone *zone;
 783
 784         if (!is_sb_log_zone(zinfo, mirror))
 785                 return;
 786
 787         zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];
 788         if (zone->cond != BLK_ZONE_COND_FULL) {
 789                 if (zone->cond == BLK_ZONE_COND_EMPTY)
 790                         zone->cond = BLK_ZONE_COND_IMP_OPEN;
 791
 792                 zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);
 793
 794                 if (zone->wp == zone->start + zone->len)
 795                         zone->cond = BLK_ZONE_COND_FULL;
 796
 797                 return;
 798         }
 799
 800         zone++;
 801         ASSERT(zone->cond != BLK_ZONE_COND_FULL);
 802         if (zone->cond == BLK_ZONE_COND_EMPTY)
 803                 zone->cond = BLK_ZONE_COND_IMP_OPEN;
 804
 805         zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);
 806
 807         if (zone->wp == zone->start + zone->len)
 808                 zone->cond = BLK_ZONE_COND_FULL;
 809 }
 810
 811 int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
 812 {
 813         sector_t zone_sectors;
 814         sector_t nr_sectors;
 815         u8 zone_sectors_shift;
 816         u32 sb_zone;
 817         u32 nr_zones;
 818
 819         zone_sectors = bdev_zone_sectors(bdev);
 820         zone_sectors_shift = ilog2(zone_sectors);
 821         nr_sectors = bdev_nr_sectors(bdev);
 822         nr_zones = nr_sectors >> zone_sectors_shift;
 823
 824         sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
 825         if (sb_zone + 1 >= nr_zones)
 826                 return -ENOENT;
 827
 828         return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
 829                                 sb_zone << zone_sectors_shift,
 830                                 zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
 831 }
 832
 833 /**
 834  * btrfs_find_allocatable_zones - find allocatable zones within a given region
 835  *
 836  * @device:     the device to allocate a region on
 837  * @hole_start: the position of the hole to allocate the region
 838  * @num_bytes:  size of wanted region
 839  * @hole_end:   the end of the hole
 840  * @return:     position of allocatable zones
 841  *
 842  * Allocatable region should not contain any superblock locations.
 843  */
 844 u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
 845                                  u64 hole_end, u64 num_bytes)
 846 {
 847         struct btrfs_zoned_device_info *zinfo = device->zone_info;
 848         const u8 shift = zinfo->zone_size_shift;
 849         u64 nzones = num_bytes >> shift;
 850         u64 pos = hole_start;
 851         u64 begin, end;
 852         bool have_sb;
 853         int i;
 854
 855         ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
 856         ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));
 857
 858         while (pos < hole_end) {
 859                 begin = pos >> shift;
 860                 end = begin + nzones;
 861
 862                 if (end > zinfo->nr_zones)
 863                         return hole_end;
 864
 865                 /* Check if zones in the region are all empty */
 866                 if (btrfs_dev_is_sequential(device, pos) &&
 867                     find_next_zero_bit(zinfo->empty_zones, end, begin) != end) {
 868                         pos += zinfo->zone_size;
 869                         continue;
 870                 }
 871
 872                 have_sb = false;
 873                 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
 874                         u32 sb_zone;
 875                         u64 sb_pos;
 876
 877                         sb_zone = sb_zone_number(shift, i);
 878                         if (!(end <= sb_zone ||
 879                               sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) {
 880                                 have_sb = true;
 881                                 pos = ((u64)sb_zone + BTRFS_NR_SB_LOG_ZONES) << shift;
 882                                 break;
 883                         }
 884
 885                         /* We also need to exclude regular superblock positions */
 886                         sb_pos = btrfs_sb_offset(i);
 887                         if (!(pos + num_bytes <= sb_pos ||
 888                               sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) {
 889                                 have_sb = true;
 890                                 pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE,
 891                                             zinfo->zone_size);
 892                                 break;
 893                         }
 894                 }
 895                 if (!have_sb)
 896                         break;
 897         }
 898
 899         return pos;
 900 }
 901
 902 int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
 903                             u64 length, u64 *bytes)
 904 {
 905         int ret;
 906
 907         *bytes = 0;
 908         ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
 909                                physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT,
 910                                GFP_NOFS);
 911         if (ret)
 912                 return ret;
 913
 914         *bytes = length;
 915         while (length) {
 916                 btrfs_dev_set_zone_empty(device, physical);
 917                 physical += device->zone_info->zone_size;
 918                 length -= device->zone_info->zone_size;
 919         }
 920
 921         return 0;
 922 }
 923
 924 int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
 925 {
 926         struct btrfs_zoned_device_info *zinfo = device->zone_info;
 927         const u8 shift = zinfo->zone_size_shift;
 928         unsigned long begin = start >> shift;
 929         unsigned long end = (start + size) >> shift;
 930         u64 pos;
 931         int ret;
 932
 933         ASSERT(IS_ALIGNED(start, zinfo->zone_size));
 934         ASSERT(IS_ALIGNED(size, zinfo->zone_size));
 935
 936         if (end > zinfo->nr_zones)
 937                 return -ERANGE;
 938
 939         /* All the zones are conventional */
 940         if (find_next_bit(zinfo->seq_zones, begin, end) == end)
 941                 return 0;
 942
 943         /* All the zones are sequential and empty */
 944         if (find_next_zero_bit(zinfo->seq_zones, begin, end) == end &&
 945             find_next_zero_bit(zinfo->empty_zones, begin, end) == end)
 946                 return 0;
 947
 948         for (pos = start; pos < start + size; pos += zinfo->zone_size) {
 949                 u64 reset_bytes;
 950
 951                 if (!btrfs_dev_is_sequential(device, pos) ||
 952                     btrfs_dev_is_empty_zone(device, pos))
 953                         continue;
 954
 955                 /* Free regions should be empty */
 956                 btrfs_warn_in_rcu(
 957                         device->fs_info,
 958                 "zoned: resetting device %s (devid %llu) zone %llu for allocation",
 959                         rcu_str_deref(device->name), device->devid, pos >> shift);
 960                 WARN_ON_ONCE(1);
 961
 962                 ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,
 963                                               &reset_bytes);
 964                 if (ret)
 965                         return ret;
 966         }
 967
 968         return 0;
 969 }
 970
 971 /*
 972  * Calculate an allocation pointer from the extent allocation information
 973  * for a block group consist of conventional zones. It is pointed to the
 974  * end of the highest addressed extent in the block group as an allocation
 975  * offset.
 976  */
 977 static int calculate_alloc_pointer(struct btrfs_block_group *cache,
 978                                    u64 *offset_ret)
 979 {
 980         struct btrfs_fs_info *fs_info = cache->fs_info;
 981         struct btrfs_root *root = fs_info->extent_root;
 982         struct btrfs_path *path;
 983         struct btrfs_key key;
 984         struct btrfs_key found_key;
 985         int ret;
 986         u64 length;
 987
 988         path = btrfs_alloc_path();
 989         if (!path)
 990                 return -ENOMEM;
 991
 992         key.objectid = cache->start + cache->length;
 993         key.type = 0;
 994         key.offset = 0;
 995
 996         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 997         /* We should not find the exact match */
 998         if (!ret)
 999                 ret = -EUCLEAN;
1000         if (ret < 0)
1001                 goto out;
1002
1003         ret = btrfs_previous_extent_item(root, path, cache->start);
1004         if (ret) {
1005                 if (ret == 1) {
1006                         ret = 0;
1007                         *offset_ret = 0;
1008                 }
1009                 goto out;
1010         }
1011
1012         btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
1013
1014         if (found_key.type == BTRFS_EXTENT_ITEM_KEY)
1015                 length = found_key.offset;
1016         else
1017                 length = fs_info->nodesize;
1018
1019         if (!(found_key.objectid >= cache->start &&
1020                found_key.objectid + length <= cache->start + cache->length)) {
1021                 ret = -EUCLEAN;
1022                 goto out;
1023         }
1024         *offset_ret = found_key.objectid + length - cache->start;
1025         ret = 0;
1026
1027 out:
1028         btrfs_free_path(path);
1029         return ret;
1030 }
1031
1032 int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
1033 {
1034         struct btrfs_fs_info *fs_info = cache->fs_info;
1035         struct extent_map_tree *em_tree = &fs_info->mapping_tree;
1036         struct extent_map *em;
1037         struct map_lookup *map;
1038         struct btrfs_device *device;
1039         u64 logical = cache->start;
1040         u64 length = cache->length;
1041         u64 physical = 0;
1042         int ret;
1043         int i;
1044         unsigned int nofs_flag;
1045         u64 *alloc_offsets = NULL;
1046         u64 last_alloc = 0;
1047         u32 num_sequential = 0, num_conventional = 0;
1048
1049         if (!btrfs_is_zoned(fs_info))
1050                 return 0;
1051
1052         /* Sanity check */
1053         if (!IS_ALIGNED(length, fs_info->zone_size)) {
1054                 btrfs_err(fs_info,
1055                 "zoned: block group %llu len %llu unaligned to zone size %llu",
1056                           logical, length, fs_info->zone_size);
1057                 return -EIO;
1058         }
1059
1060         /* Get the chunk mapping */
1061         read_lock(&em_tree->lock);
1062         em = lookup_extent_mapping(em_tree, logical, length);
1063         read_unlock(&em_tree->lock);
1064
1065         if (!em)
1066                 return -EINVAL;
1067
1068         map = em->map_lookup;
1069
1070         alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS);
1071         if (!alloc_offsets) {
1072                 free_extent_map(em);
1073                 return -ENOMEM;
1074         }
1075
1076         for (i = 0; i < map->num_stripes; i++) {
1077                 bool is_sequential;
1078                 struct blk_zone zone;
1079                 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
1080                 int dev_replace_is_ongoing = 0;
1081
1082                 device = map->stripes[i].dev;
1083                 physical = map->stripes[i].physical;
1084
1085                 if (device->bdev == NULL) {
1086                         alloc_offsets[i] = WP_MISSING_DEV;
1087                         continue;
1088                 }
1089
1090                 is_sequential = btrfs_dev_is_sequential(device, physical);
1091                 if (is_sequential)
1092                         num_sequential++;
1093                 else
1094                         num_conventional++;
1095
1096                 if (!is_sequential) {
1097                         alloc_offsets[i] = WP_CONVENTIONAL;
1098                         continue;
1099                 }
1100
1101                 /*
1102                  * This zone will be used for allocation, so mark this zone
1103                  * non-empty.
1104                  */
1105                 btrfs_dev_clear_zone_empty(device, physical);
1106
1107                 down_read(&dev_replace->rwsem);
1108                 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
1109                 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
1110                         btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical);
1111                 up_read(&dev_replace->rwsem);
1112
1113                 /*
1114                  * The group is mapped to a sequential zone. Get the zone write
1115                  * pointer to determine the allocation offset within the zone.
1116                  */
1117                 WARN_ON(!IS_ALIGNED(physical, fs_info->zone_size));
1118                 nofs_flag = memalloc_nofs_save();
1119                 ret = btrfs_get_dev_zone(device, physical, &zone);
1120                 memalloc_nofs_restore(nofs_flag);
1121                 if (ret == -EIO || ret == -EOPNOTSUPP) {
1122                         ret = 0;
1123                         alloc_offsets[i] = WP_MISSING_DEV;
1124                         continue;
1125                 } else if (ret) {
1126                         goto out;
1127                 }
1128
1129                 switch (zone.cond) {
1130                 case BLK_ZONE_COND_OFFLINE:
1131                 case BLK_ZONE_COND_READONLY:
1132                         btrfs_err(fs_info,
1133                 "zoned: offline/readonly zone %llu on device %s (devid %llu)",
1134                                   physical >> device->zone_info->zone_size_shift,
1135                                   rcu_str_deref(device->name), device->devid);
1136                         alloc_offsets[i] = WP_MISSING_DEV;
1137                         break;
1138                 case BLK_ZONE_COND_EMPTY:
1139                         alloc_offsets[i] = 0;
1140                         break;
1141                 case BLK_ZONE_COND_FULL:
1142                         alloc_offsets[i] = fs_info->zone_size;
1143                         break;
1144                 default:
1145                         /* Partially used zone */
1146                         alloc_offsets[i] =
1147                                         ((zone.wp - zone.start) << SECTOR_SHIFT);
1148                         break;
1149                 }
1150         }
1151
1152         if (num_sequential > 0)
1153                 cache->seq_zone = true;
1154
1155         if (num_conventional > 0) {
1156                 /*
1157                  * Avoid calling calculate_alloc_pointer() for new BG. It
1158                  * is no use for new BG. It must be always 0.
1159                  *
1160                  * Also, we have a lock chain of extent buffer lock ->
1161                  * chunk mutex.  For new BG, this function is called from
1162                  * btrfs_make_block_group() which is already taking the
1163                  * chunk mutex. Thus, we cannot call
1164                  * calculate_alloc_pointer() which takes extent buffer
1165                  * locks to avoid deadlock.
1166                  */
1167                 if (new) {
1168                         cache->alloc_offset = 0;
1169                         goto out;
1170                 }
1171                 ret = calculate_alloc_pointer(cache, &last_alloc);
1172                 if (ret || map->num_stripes == num_conventional) {
1173                         if (!ret)
1174                                 cache->alloc_offset = last_alloc;
1175                         else
1176                                 btrfs_err(fs_info,
1177                         "zoned: failed to determine allocation offset of bg %llu",
1178                                           cache->start);
1179                         goto out;
1180                 }
1181         }
1182
1183         switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
1184         case 0: /* single */
1185                 cache->alloc_offset = alloc_offsets[0];
1186                 break;
1187         case BTRFS_BLOCK_GROUP_DUP:
1188         case BTRFS_BLOCK_GROUP_RAID1:
1189         case BTRFS_BLOCK_GROUP_RAID0:
1190         case BTRFS_BLOCK_GROUP_RAID10:
1191         case BTRFS_BLOCK_GROUP_RAID5:
1192         case BTRFS_BLOCK_GROUP_RAID6:
1193                 /* non-single profiles are not supported yet */
1194         default:
1195                 btrfs_err(fs_info, "zoned: profile %s not yet supported",
1196                           btrfs_bg_type_to_raid_name(map->type));
1197                 ret = -EINVAL;
1198                 goto out;
1199         }
1200
1201 out:
1202         /* An extent is allocated after the write pointer */
1203         if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
1204                 btrfs_err(fs_info,
1205                           "zoned: got wrong write pointer in BG %llu: %llu > %llu",
1206                           logical, last_alloc, cache->alloc_offset);
1207                 ret = -EIO;
1208         }
1209
1210         if (!ret)
1211                 cache->meta_write_pointer = cache->alloc_offset + cache->start;
1212
1213         kfree(alloc_offsets);
1214         free_extent_map(em);
1215
1216         return ret;
1217 }
1218
1219 void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
1220 {
1221         u64 unusable, free;
1222
1223         if (!btrfs_is_zoned(cache->fs_info))
1224                 return;
1225
1226         WARN_ON(cache->bytes_super != 0);
1227         unusable = cache->alloc_offset - cache->used;
1228         free = cache->length - cache->alloc_offset;
1229
1230         /* We only need ->free_space in ALLOC_SEQ block groups */
1231         cache->last_byte_to_unpin = (u64)-1;
1232         cache->cached = BTRFS_CACHE_FINISHED;
1233         cache->free_space_ctl->free_space = free;
1234         cache->zone_unusable = unusable;
1235
1236         /* Should not have any excluded extents. Just in case, though */
1237         btrfs_free_excluded_extents(cache);
1238 }
1239
1240 void btrfs_redirty_list_add(struct btrfs_transaction *trans,
1241                             struct extent_buffer *eb)
1242 {
1243         struct btrfs_fs_info *fs_info = eb->fs_info;
1244
1245         if (!btrfs_is_zoned(fs_info) ||
1246             btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN) ||
1247             !list_empty(&eb->release_list))
1248                 return;
1249
1250         set_extent_buffer_dirty(eb);
1251         set_extent_bits_nowait(&trans->dirty_pages, eb->start,
1252                                eb->start + eb->len - 1, EXTENT_DIRTY);
1253         memzero_extent_buffer(eb, 0, eb->len);
1254         set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags);
1255
1256         spin_lock(&trans->releasing_ebs_lock);
1257         list_add_tail(&eb->release_list, &trans->releasing_ebs);
1258         spin_unlock(&trans->releasing_ebs_lock);
1259         atomic_inc(&eb->refs);
1260 }
1261
1262 void btrfs_free_redirty_list(struct btrfs_transaction *trans)
1263 {
1264         spin_lock(&trans->releasing_ebs_lock);
1265         while (!list_empty(&trans->releasing_ebs)) {
1266                 struct extent_buffer *eb;
1267
1268                 eb = list_first_entry(&trans->releasing_ebs,
1269                                       struct extent_buffer, release_list);
1270                 list_del_init(&eb->release_list);
1271                 free_extent_buffer(eb);
1272         }
1273         spin_unlock(&trans->releasing_ebs_lock);
1274 }
1275
1276 bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em)
1277 {
1278         struct btrfs_fs_info *fs_info = inode->root->fs_info;
1279         struct btrfs_block_group *cache;
1280         bool ret = false;
1281
1282         if (!btrfs_is_zoned(fs_info))
1283                 return false;
1284
1285         if (!fs_info->max_zone_append_size)
1286                 return false;
1287
1288         if (!is_data_inode(&inode->vfs_inode))
1289                 return false;
1290
1291         cache = btrfs_lookup_block_group(fs_info, em->block_start);
1292         ASSERT(cache);
1293         if (!cache)
1294                 return false;
1295
1296         ret = cache->seq_zone;
1297         btrfs_put_block_group(cache);
1298
1299         return ret;
1300 }
1301
1302 void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset,
1303                                  struct bio *bio)
1304 {
1305         struct btrfs_ordered_extent *ordered;
1306         const u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
1307
1308         if (bio_op(bio) != REQ_OP_ZONE_APPEND)
1309                 return;
1310
1311         ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), file_offset);
1312         if (WARN_ON(!ordered))
1313                 return;
1314
1315         ordered->physical = physical;
1316         ordered->disk = bio->bi_bdev->bd_disk;
1317         ordered->partno = bio->bi_bdev->bd_partno;
1318
1319         btrfs_put_ordered_extent(ordered);
1320 }
1321
1322 void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered)
1323 {
1324         struct btrfs_inode *inode = BTRFS_I(ordered->inode);
1325         struct btrfs_fs_info *fs_info = inode->root->fs_info;
1326         struct extent_map_tree *em_tree;
1327         struct extent_map *em;
1328         struct btrfs_ordered_sum *sum;
1329         struct block_device *bdev;
1330         u64 orig_logical = ordered->disk_bytenr;
1331         u64 *logical = NULL;
1332         int nr, stripe_len;
1333
1334         /* Zoned devices should not have partitions. So, we can assume it is 0 */
1335         ASSERT(ordered->partno == 0);
1336         bdev = bdgrab(ordered->disk->part0);
1337         if (WARN_ON(!bdev))
1338                 return;
1339
1340         if (WARN_ON(btrfs_rmap_block(fs_info, orig_logical, bdev,
1341                                      ordered->physical, &logical, &nr,
1342                                      &stripe_len)))
1343                 goto out;
1344
1345         WARN_ON(nr != 1);
1346
1347         if (orig_logical == *logical)
1348                 goto out;
1349
1350         ordered->disk_bytenr = *logical;
1351
1352         em_tree = &inode->extent_tree;
1353         write_lock(&em_tree->lock);
1354         em = search_extent_mapping(em_tree, ordered->file_offset,
1355                                    ordered->num_bytes);
1356         em->block_start = *logical;
1357         free_extent_map(em);
1358         write_unlock(&em_tree->lock);
1359
1360         list_for_each_entry(sum, &ordered->list, list) {
1361                 if (*logical < orig_logical)
1362                         sum->bytenr -= orig_logical - *logical;
1363                 else
1364                         sum->bytenr += *logical - orig_logical;
1365         }
1366
1367 out:
1368         kfree(logical);
1369         bdput(bdev);
1370 }
1371
1372 bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
1373                                     struct extent_buffer *eb,
1374                                     struct btrfs_block_group **cache_ret)
1375 {
1376         struct btrfs_block_group *cache;
1377         bool ret = true;
1378
1379         if (!btrfs_is_zoned(fs_info))
1380                 return true;
1381
1382         cache = *cache_ret;
1383
1384         if (cache && (eb->start < cache->start ||
1385                       cache->start + cache->length <= eb->start)) {
1386                 btrfs_put_block_group(cache);
1387                 cache = NULL;
1388                 *cache_ret = NULL;
1389         }
1390
1391         if (!cache)
1392                 cache = btrfs_lookup_block_group(fs_info, eb->start);
1393
1394         if (cache) {
1395                 if (cache->meta_write_pointer != eb->start) {
1396                         btrfs_put_block_group(cache);
1397                         cache = NULL;
1398                         ret = false;
1399                 } else {
1400                         cache->meta_write_pointer = eb->start + eb->len;
1401                 }
1402
1403                 *cache_ret = cache;
1404         }
1405
1406         return ret;
1407 }
1408
1409 void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache,
1410                                      struct extent_buffer *eb)
1411 {
1412         if (!btrfs_is_zoned(eb->fs_info) || !cache)
1413                 return;
1414
1415         ASSERT(cache->meta_write_pointer == eb->start + eb->len);
1416         cache->meta_write_pointer = eb->start;
1417 }
1418
1419 int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length)
1420 {
1421         if (!btrfs_dev_is_sequential(device, physical))
1422                 return -EOPNOTSUPP;
1423
1424         return blkdev_issue_zeroout(device->bdev, physical >> SECTOR_SHIFT,
1425                                     length >> SECTOR_SHIFT, GFP_NOFS, 0);
1426 }
1427
1428 static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
1429                           struct blk_zone *zone)
1430 {
1431         struct btrfs_bio *bbio = NULL;
1432         u64 mapped_length = PAGE_SIZE;
1433         unsigned int nofs_flag;
1434         int nmirrors;
1435         int i, ret;
1436
1437         ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
1438                                &mapped_length, &bbio);
1439         if (ret || !bbio || mapped_length < PAGE_SIZE) {
1440                 btrfs_put_bbio(bbio);
1441                 return -EIO;
1442         }
1443
1444         if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK)
1445                 return -EINVAL;
1446
1447         nofs_flag = memalloc_nofs_save();
1448         nmirrors = (int)bbio->num_stripes;
1449         for (i = 0; i < nmirrors; i++) {
1450                 u64 physical = bbio->stripes[i].physical;
1451                 struct btrfs_device *dev = bbio->stripes[i].dev;
1452
1453                 /* Missing device */
1454                 if (!dev->bdev)
1455                         continue;
1456
1457                 ret = btrfs_get_dev_zone(dev, physical, zone);
1458                 /* Failing device */
1459                 if (ret == -EIO || ret == -EOPNOTSUPP)
1460                         continue;
1461                 break;
1462         }
1463         memalloc_nofs_restore(nofs_flag);
1464
1465         return ret;
1466 }
1467
1468 /*
1469  * Synchronize write pointer in a zone at @physical_start on @tgt_dev, by
1470  * filling zeros between @physical_pos to a write pointer of dev-replace
1471  * source device.
1472  */
1473 int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
1474                                     u64 physical_start, u64 physical_pos)
1475 {
1476         struct btrfs_fs_info *fs_info = tgt_dev->fs_info;
1477         struct blk_zone zone;
1478         u64 length;
1479         u64 wp;
1480         int ret;
1481
1482         if (!btrfs_dev_is_sequential(tgt_dev, physical_pos))
1483                 return 0;
1484
1485         ret = read_zone_info(fs_info, logical, &zone);
1486         if (ret)
1487                 return ret;
1488
1489         wp = physical_start + ((zone.wp - zone.start) << SECTOR_SHIFT);
1490
1491         if (physical_pos == wp)
1492                 return 0;
1493
1494         if (physical_pos > wp)
1495                 return -EUCLEAN;
1496
1497         length = wp - physical_pos;
1498         return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length);
1499 }