Merge tag 'for-5.12/block-2021-02-17' of git://git.kernel.dk/linux-block

[linux-2.6-microblaze.git] / fs / btrfs / zoned.c
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c

index c388466..d0eb0c8 100644 (file)
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1,14 +1,25 @@
  // SPDX-License-Identifier: GPL-2.0
  
+#include <linux/bitops.h>
  #include <linux/slab.h>
  #include <linux/blkdev.h>
+#include <linux/sched/mm.h>
  #include "ctree.h"
  #include "volumes.h"
  #include "zoned.h"
  #include "rcu-string.h"
+#include "disk-io.h"
+#include "block-group.h"
+#include "transaction.h"
+#include "dev-replace.h"
+#include "space-info.h"
  
  /* Maximum number of zones to report per blkdev_report_zones() call */
  #define BTRFS_REPORT_NR_ZONES   4096
+/* Invalid allocation pointer value for missing devices */
+#define WP_MISSING_DEV ((u64)-1)
+/* Pseudo write pointer value for conventional zone */
+#define WP_CONVENTIONAL ((u64)-2)
  
  /* Number of superblock log zones */
  #define BTRFS_NR_SB_LOG_ZONES 2
@@ -119,6 +130,36 @@ static inline u32 sb_zone_number(int shift, int mirror)
         return 0;
  }
  
+/*
+ * Emulate blkdev_report_zones() for a non-zoned device.
+ *
+ * Starting at byte offset @pos, fill up to @nr_zones entries of @zones with
+ * fake conventional zones, each fs_info->zone_size bytes long and laid out
+ * back to back. Filling stops early once the end of a zone reaches or passes
+ * the end of the block device.
+ *
+ * Return: the number of entries of @zones that were filled in.
+ */
+static int emulate_report_zones(struct btrfs_device *device, u64 pos,
+                               struct blk_zone *zones, unsigned int nr_zones)
+{
+       const sector_t zone_len = device->fs_info->zone_size >> SECTOR_SHIFT;
+       const sector_t dev_sectors = bdev_nr_sectors(device->bdev);
+       sector_t start = pos >> SECTOR_SHIFT;
+       unsigned int filled = 0;
+
+       while (filled < nr_zones) {
+               struct blk_zone *zone = &zones[filled++];
+
+               zone->start = start;
+               zone->len = zone_len;
+               zone->capacity = zone_len;
+               zone->wp = start + zone_len;
+               zone->type = BLK_ZONE_TYPE_CONVENTIONAL;
+               zone->cond = BLK_ZONE_COND_NOT_WP;
+
+               /* This zone already reaches the end of the device */
+               if (zone->wp >= dev_sectors)
+                       break;
+
+               start += zone_len;
+       }
+
+       return filled;
+}
+
  static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
                                struct blk_zone *zones, unsigned int *nr_zones)
  {
@@ -127,6 +168,12 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
         if (!*nr_zones)
                 return 0;
  
+       if (!bdev_is_zoned(device->bdev)) {
+               ret = emulate_report_zones(device, pos, zones, *nr_zones);
+               *nr_zones = ret;
+               return 0;
+       }
+
         ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
                                   copy_zone_info_cb, zones);
         if (ret < 0) {
@@ -143,8 +190,78 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
         return 0;
  }
  
+/*
+ * Derive the emulated zone size from the length of a device extent.
+ *
+ * Searches the device tree for the first BTRFS_DEV_EXTENT_KEY item at or
+ * after (objectid 1, offset 0) and stores the length of that dev extent in
+ * fs_info->zone_size.
+ *
+ * Return: 0 on success, -ENOMEM if no path could be allocated, -EUCLEAN if
+ * the tree has no item at or after the search key, or a negative error from
+ * the tree search.
+ */
+static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_root *dev_root = fs_info->dev_root;
+       struct btrfs_dev_extent *dev_extent;
+       struct extent_buffer *eb;
+       struct btrfs_path *path;
+       struct btrfs_key key;
+       int ret;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       key.objectid = 1;
+       key.type = BTRFS_DEV_EXTENT_KEY;
+       key.offset = 0;
+
+       ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
+       if (ret < 0)
+               goto out;
+
+       /* The search ended past the last item of the leaf: step to the next */
+       if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+               ret = btrfs_next_item(dev_root, path);
+               /* No dev extents at all? Not good */
+               if (ret > 0)
+                       ret = -EUCLEAN;
+               if (ret)
+                       goto out;
+       }
+
+       eb = path->nodes[0];
+       dev_extent = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_extent);
+       fs_info->zone_size = btrfs_dev_extent_length(eb, dev_extent);
+       ret = 0;
+
+out:
+       btrfs_free_path(path);
+
+       return ret;
+}
+
+/*
+ * Read the zone information of every present device of the filesystem.
+ *
+ * Does nothing when the ZONED incompat flag is not set. The device list is
+ * walked under fs_devices->device_list_mutex, and the walk stops at the first
+ * device for which btrfs_get_dev_zone_info() fails.
+ *
+ * Return: 0 on success, otherwise the error reported for the failing device.
+ */
+int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+       struct btrfs_device *dev;
+       int err = 0;
+
+       /* fs_info->zone_size might not be set yet. Use the incompat flag here. */
+       if (!btrfs_fs_incompat(fs_info, ZONED))
+               return 0;
+
+       mutex_lock(&fs_devices->device_list_mutex);
+       list_for_each_entry(dev, &fs_devices->devices, dev_list) {
+               /* Missing devices have no zone info to read */
+               if (dev->bdev) {
+                       err = btrfs_get_dev_zone_info(dev);
+                       if (err)
+                               break;
+               }
+       }
+       mutex_unlock(&fs_devices->device_list_mutex);
+
+       return err;
+}
+
  int btrfs_get_dev_zone_info(struct btrfs_device *device)
  {
+       struct btrfs_fs_info *fs_info = device->fs_info;
         struct btrfs_zoned_device_info *zone_info = NULL;
         struct block_device *bdev = device->bdev;
         struct request_queue *queue = bdev_get_queue(bdev);
@@ -153,9 +270,14 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
         struct blk_zone *zones = NULL;
         unsigned int i, nreported = 0, nr_zones;
         unsigned int zone_sectors;
+       char *model, *emulated;
         int ret;
  
-       if (!bdev_is_zoned(bdev))
+       /*
+        * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not
+        * yet be set.
+        */
+       if (!btrfs_fs_incompat(fs_info, ZONED))
                 return 0;
  
         if (device->zone_info)
@@ -165,8 +287,20 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
         if (!zone_info)
                 return -ENOMEM;
  
+       if (!bdev_is_zoned(bdev)) {
+               if (!fs_info->zone_size) {
+                       ret = calculate_emulated_zone_size(fs_info);
+                       if (ret)
+                               goto out;
+               }
+
+               ASSERT(fs_info->zone_size);
+               zone_sectors = fs_info->zone_size >> SECTOR_SHIFT;
+       } else {
+               zone_sectors = bdev_zone_sectors(bdev);
+       }
+
         nr_sectors = bdev_nr_sectors(bdev);
-       zone_sectors = bdev_zone_sectors(bdev);
         /* Check if it's power of 2 (see is_power_of_2) */
         ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0);
         zone_info->zone_size = zone_sectors << SECTOR_SHIFT;
@@ -272,20 +406,42 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
  
         device->zone_info = zone_info;
  
-       /* device->fs_info is not safe to use for printing messages */
-       btrfs_info_in_rcu(NULL,
-                       "host-%s zoned block device %s, %u zones of %llu bytes",
-                       bdev_zoned_model(bdev) == BLK_ZONED_HM ? "managed" : "aware",
-                       rcu_str_deref(device->name), zone_info->nr_zones,
-                       zone_info->zone_size);
+       switch (bdev_zoned_model(bdev)) {
+       case BLK_ZONED_HM:
+               model = "host-managed zoned";
+               emulated = "";
+               break;
+       case BLK_ZONED_HA:
+               model = "host-aware zoned";
+               emulated = "";
+               break;
+       case BLK_ZONED_NONE:
+               model = "regular";
+               emulated = "emulated ";
+               break;
+       default:
+               /* Just in case */
+               btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s",
+                                bdev_zoned_model(bdev),
+                                rcu_str_deref(device->name));
+               ret = -EOPNOTSUPP;
+               goto out_free_zone_info;
+       }
+
+       btrfs_info_in_rcu(fs_info,
+               "%s block device %s, %u %szones of %llu bytes",
+               model, rcu_str_deref(device->name), zone_info->nr_zones,
+               emulated, zone_info->zone_size);
  
         return 0;
  
  out:
         kfree(zones);
+out_free_zone_info:
         bitmap_free(zone_info->empty_zones);
         bitmap_free(zone_info->seq_zones);
         kfree(zone_info);
+       device->zone_info = NULL;
  
         return ret;
  }
@@ -324,7 +480,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
         u64 nr_devices = 0;
         u64 zone_size = 0;
         u64 max_zone_append_size = 0;
-       const bool incompat_zoned = btrfs_is_zoned(fs_info);
+       const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED);
         int ret = 0;
  
         /* Count zoned devices */
@@ -335,9 +491,17 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
                         continue;
  
                 model = bdev_zoned_model(device->bdev);
+               /*
+                * A host-managed zoned device must be used as a zoned device.
+                * A host-aware zoned device or a non-zoned device can be
+                * treated as a zoned device if the ZONED flag is enabled in
+                * the superblock.
+                */
                 if (model == BLK_ZONED_HM ||
-                   (model == BLK_ZONED_HA && incompat_zoned)) {
-                       struct btrfs_zoned_device_info *zone_info;
+                   (model == BLK_ZONED_HA && incompat_zoned) ||
+                   (model == BLK_ZONED_NONE && incompat_zoned)) {
+                       struct btrfs_zoned_device_info *zone_info =
+                               device->zone_info;
  
                         zone_info = device->zone_info;
                         zoned_devices++;
@@ -406,6 +570,15 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
  
         fs_info->zone_size = zone_size;
         fs_info->max_zone_append_size = max_zone_append_size;
+       fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
+
+       /*
+        * Check mount options here, because we might change fs_info->zoned
+        * from fs_info->zone_size.
+        */
+       ret = btrfs_check_mountopts_zoned(fs_info);
+       if (ret)
+               goto out;
  
         btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);
  out:
@@ -488,7 +661,6 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
         unsigned int zone_sectors;
         u32 sb_zone;
         int ret;
-       u64 zone_size;
         u8 zone_sectors_shift;
         sector_t nr_sectors;
         u32 nr_zones;
@@ -503,7 +675,6 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
         zone_sectors = bdev_zone_sectors(bdev);
         if (!is_power_of_2(zone_sectors))
                 return -EINVAL;
-       zone_size = zone_sectors << SECTOR_SHIFT;
         zone_sectors_shift = ilog2(zone_sectors);
         nr_sectors = bdev_nr_sectors(bdev);
         nr_zones = nr_sectors >> zone_sectors_shift;
@@ -529,7 +700,13 @@ int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
         struct btrfs_zoned_device_info *zinfo = device->zone_info;
         u32 zone_num;
  
-       if (!zinfo) {
+       /*
+        * For a zoned filesystem on a non-zoned block device, use the same
+        * super block locations as regular filesystem. Doing so, the super
+        * block can always be retrieved and the zoned flag of the volume
+        * detected from the super block information.
+        */
+       if (!bdev_is_zoned(device->bdev)) {
                 *bytenr_ret = btrfs_sb_offset(mirror);
                 return 0;
         }
@@ -614,3 +791,671 @@ int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
                                 sb_zone << zone_sectors_shift,
                                 zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
  }
+
+/**
+ * btrfs_find_allocatable_zones - find allocatable zones within a given region
+ *
+ * @device:    the device to allocate a region on
+ * @hole_start: the position of the hole to allocate the region
+ * @hole_end:  the end of the hole
+ * @num_bytes: size of wanted region
+ * @return:    position of allocatable zones
+ *
+ * Allocatable region should not contain any superblock locations, neither the
+ * superblock log zones nor the regular superblock offsets. @hole_start and
+ * @num_bytes are expected to be zone aligned. @hole_end is returned when the
+ * wanted region would extend past the last zone of the device.
+ */
+u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
+                                u64 hole_end, u64 num_bytes)
+{
+       struct btrfs_zoned_device_info *zinfo = device->zone_info;
+       const u8 shift = zinfo->zone_size_shift;
+       const u64 nzones = num_bytes >> shift;
+       u64 pos;
+
+       ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
+       ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));
+
+       for (pos = hole_start; pos < hole_end; ) {
+               /* Candidate region covers zones [first, last) */
+               const u64 first = pos >> shift;
+               const u64 last = first + nzones;
+               bool overlaps_sb = false;
+               int mirror;
+
+               if (last > zinfo->nr_zones)
+                       return hole_end;
+
+               /* Check if zones in the region are all empty */
+               if (btrfs_dev_is_sequential(device, pos) &&
+                   find_next_zero_bit(zinfo->empty_zones, last, first) != last) {
+                       pos += zinfo->zone_size;
+                       continue;
+               }
+
+               for (mirror = 0; mirror < BTRFS_SUPER_MIRROR_MAX; mirror++) {
+                       const u32 sb_zone = sb_zone_number(shift, mirror);
+                       u64 sb_pos;
+
+                       /* Region intersects the superblock log zones */
+                       if (last > sb_zone &&
+                           sb_zone + BTRFS_NR_SB_LOG_ZONES > first) {
+                               overlaps_sb = true;
+                               pos = ((u64)sb_zone + BTRFS_NR_SB_LOG_ZONES) << shift;
+                               break;
+                       }
+
+                       /* We also need to exclude regular superblock positions */
+                       sb_pos = btrfs_sb_offset(mirror);
+                       if (pos + num_bytes > sb_pos &&
+                           sb_pos + BTRFS_SUPER_INFO_SIZE > pos) {
+                               overlaps_sb = true;
+                               pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE,
+                                           zinfo->zone_size);
+                               break;
+                       }
+               }
+
+               /* No superblock in the way: the region at @pos is usable */
+               if (!overlaps_sb)
+                       break;
+       }
+
+       return pos;
+}
+
+/*
+ * Reset the zones covering [@physical, @physical + @length) on @device.
+ *
+ * Issues a REQ_OP_ZONE_RESET for the range and, on success, marks every zone
+ * of the range as empty in the device's zone info.
+ *
+ * @bytes is set to 0 if the reset fails and to @length if it succeeds.
+ *
+ * Return: 0 on success, otherwise the error from blkdev_zone_mgmt().
+ */
+int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
+                           u64 length, u64 *bytes)
+{
+       u64 remaining;
+       int ret;
+
+       *bytes = 0;
+       ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
+                              physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT,
+                              GFP_NOFS);
+       if (ret)
+               return ret;
+
+       *bytes = length;
+
+       /* Walk the range one zone at a time and flag each zone as empty */
+       for (remaining = length; remaining;
+            remaining -= device->zone_info->zone_size) {
+               btrfs_dev_set_zone_empty(device, physical);
+               physical += device->zone_info->zone_size;
+       }
+
+       return 0;
+}
+
+/*
+ * Make sure all sequential zones in [@start, @start + @size) are empty.
+ *
+ * @start and @size are expected to be zone aligned. Conventional zones and
+ * zones already marked empty are left alone; any other sequential zone is
+ * reset (with a warning, since a free region is expected to be empty).
+ *
+ * Return: 0 on success, -ERANGE if the range extends past the last zone of
+ * the device, or the error from btrfs_reset_device_zone().
+ */
+int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
+{
+       struct btrfs_zoned_device_info *zinfo = device->zone_info;
+       const u8 shift = zinfo->zone_size_shift;
+       unsigned long begin = start >> shift;
+       unsigned long end = (start + size) >> shift;
+       u64 pos;
+       int ret;
+
+       ASSERT(IS_ALIGNED(start, zinfo->zone_size));
+       ASSERT(IS_ALIGNED(size, zinfo->zone_size));
+
+       if (end > zinfo->nr_zones)
+               return -ERANGE;
+
+       /*
+        * find_next_bit() and find_next_zero_bit() take (bitmap, size, offset),
+        * so the zone range [begin, end) is searched with size = end and
+        * offset = begin. A result equal to end means nothing was found.
+        */
+
+       /* All the zones are conventional */
+       if (find_next_bit(zinfo->seq_zones, end, begin) == end)
+               return 0;
+
+       /* All the zones are sequential and empty */
+       if (find_next_zero_bit(zinfo->seq_zones, end, begin) == end &&
+           find_next_zero_bit(zinfo->empty_zones, end, begin) == end)
+               return 0;
+
+       for (pos = start; pos < start + size; pos += zinfo->zone_size) {
+               u64 reset_bytes;
+
+               if (!btrfs_dev_is_sequential(device, pos) ||
+                   btrfs_dev_is_empty_zone(device, pos))
+                       continue;
+
+               /* Free regions should be empty */
+               btrfs_warn_in_rcu(
+                       device->fs_info,
+               "zoned: resetting device %s (devid %llu) zone %llu for allocation",
+                       rcu_str_deref(device->name), device->devid, pos >> shift);
+               WARN_ON_ONCE(1);
+
+               ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,
+                                             &reset_bytes);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
+/*
+ * Calculate an allocation pointer for a block group made of conventional
+ * zones, using the extent allocation information.
+ *
+ * The extent tree is searched backwards from the end of the block group. The
+ * end of the highest addressed extent found inside the block group, relative
+ * to the block group start, is stored in @offset_ret. If no extent is found,
+ * @offset_ret is set to 0.
+ *
+ * Return: 0 on success, -ENOMEM if no path could be allocated, -EUCLEAN if
+ * the search key matches exactly or the extent found lies outside of the
+ * block group, or a negative error from the tree search.
+ */
+static int calculate_alloc_pointer(struct btrfs_block_group *cache,
+                                  u64 *offset_ret)
+{
+       struct btrfs_fs_info *fs_info = cache->fs_info;
+       struct btrfs_root *extent_root = fs_info->extent_root;
+       const u64 bg_end = cache->start + cache->length;
+       struct btrfs_key search_key;
+       struct btrfs_key extent_key;
+       struct btrfs_path *path;
+       u64 extent_len;
+       int ret;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       search_key.objectid = bg_end;
+       search_key.type = 0;
+       search_key.offset = 0;
+
+       ret = btrfs_search_slot(NULL, extent_root, &search_key, path, 0, 0);
+       /* We should not find the exact match */
+       if (ret == 0)
+               ret = -EUCLEAN;
+       if (ret < 0)
+               goto out;
+
+       ret = btrfs_previous_extent_item(extent_root, path, cache->start);
+       if (ret == 1) {
+               /* No extent item in this block group: nothing allocated yet */
+               *offset_ret = 0;
+               ret = 0;
+               goto out;
+       }
+       if (ret)
+               goto out;
+
+       btrfs_item_key_to_cpu(path->nodes[0], &extent_key, path->slots[0]);
+
+       /* Other extent-like keys are treated as one tree node long */
+       extent_len = (extent_key.type == BTRFS_EXTENT_ITEM_KEY) ?
+                    extent_key.offset : fs_info->nodesize;
+
+       /* The extent must lie completely inside the block group */
+       if (extent_key.objectid < cache->start ||
+           extent_key.objectid + extent_len > bg_end) {
+               ret = -EUCLEAN;
+               goto out;
+       }
+
+       *offset_ret = extent_key.objectid + extent_len - cache->start;
+       ret = 0;
+
+out:
+       btrfs_free_path(path);
+       return ret;
+}
+
+/*
+ * Load the zone state backing a block group and derive its allocation offset.
+ *
+ * @cache: block group to set up
+ * @new:   true for a newly created block group; if it has conventional zones
+ *         its allocation offset is set to 0 without reading the extent tree
+ *
+ * A per-stripe allocation offset is collected for each stripe of the chunk
+ * mapping: WP_MISSING_DEV for a missing or failing device and for an
+ * offline/read-only zone, WP_CONVENTIONAL for a conventional zone, and the
+ * offset of the zone write pointer for a sequential zone. Only the single
+ * profile is supported when combining them into cache->alloc_offset.
+ *
+ * On success cache->alloc_offset and cache->meta_write_pointer are set, and
+ * cache->seq_zone is set if at least one stripe sits on a sequential zone.
+ *
+ * Return: 0 on success or when the filesystem is not zoned, negative errno
+ * otherwise.
+ */
+int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
+{
+       struct btrfs_fs_info *fs_info = cache->fs_info;
+       struct extent_map_tree *em_tree = &fs_info->mapping_tree;
+       struct extent_map *em;
+       struct map_lookup *map;
+       struct btrfs_device *device;
+       u64 logical = cache->start;
+       u64 length = cache->length;
+       u64 physical = 0;
+       /*
+        * Must start at 0: if no stripe is on a sequential zone, nothing
+        * assigns ret before it is tested at the out label.
+        */
+       int ret = 0;
+       int i;
+       unsigned int nofs_flag;
+       u64 *alloc_offsets = NULL;
+       u64 last_alloc = 0;
+       u32 num_sequential = 0, num_conventional = 0;
+
+       if (!btrfs_is_zoned(fs_info))
+               return 0;
+
+       /* Sanity check */
+       if (!IS_ALIGNED(length, fs_info->zone_size)) {
+               btrfs_err(fs_info,
+               "zoned: block group %llu len %llu unaligned to zone size %llu",
+                         logical, length, fs_info->zone_size);
+               return -EIO;
+       }
+
+       /* Get the chunk mapping */
+       read_lock(&em_tree->lock);
+       em = lookup_extent_mapping(em_tree, logical, length);
+       read_unlock(&em_tree->lock);
+
+       if (!em)
+               return -EINVAL;
+
+       map = em->map_lookup;
+
+       /* One allocation offset per stripe, combined after the loop */
+       alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS);
+       if (!alloc_offsets) {
+               free_extent_map(em);
+               return -ENOMEM;
+       }
+
+       for (i = 0; i < map->num_stripes; i++) {
+               bool is_sequential;
+               struct blk_zone zone;
+               struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+               int dev_replace_is_ongoing = 0;
+
+               device = map->stripes[i].dev;
+               physical = map->stripes[i].physical;
+
+               if (device->bdev == NULL) {
+                       alloc_offsets[i] = WP_MISSING_DEV;
+                       continue;
+               }
+
+               is_sequential = btrfs_dev_is_sequential(device, physical);
+               if (is_sequential)
+                       num_sequential++;
+               else
+                       num_conventional++;
+
+               if (!is_sequential) {
+                       alloc_offsets[i] = WP_CONVENTIONAL;
+                       continue;
+               }
+
+               /*
+                * This zone will be used for allocation, so mark this zone
+                * non-empty.
+                */
+               btrfs_dev_clear_zone_empty(device, physical);
+
+               /* Mirror the non-empty state on an ongoing replace target */
+               down_read(&dev_replace->rwsem);
+               dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
+               if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
+                       btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical);
+               up_read(&dev_replace->rwsem);
+
+               /*
+                * The group is mapped to a sequential zone. Get the zone write
+                * pointer to determine the allocation offset within the zone.
+                */
+               WARN_ON(!IS_ALIGNED(physical, fs_info->zone_size));
+               nofs_flag = memalloc_nofs_save();
+               ret = btrfs_get_dev_zone(device, physical, &zone);
+               memalloc_nofs_restore(nofs_flag);
+               if (ret == -EIO || ret == -EOPNOTSUPP) {
+                       /* Treat a failing device like a missing one */
+                       ret = 0;
+                       alloc_offsets[i] = WP_MISSING_DEV;
+                       continue;
+               } else if (ret) {
+                       goto out;
+               }
+
+               switch (zone.cond) {
+               case BLK_ZONE_COND_OFFLINE:
+               case BLK_ZONE_COND_READONLY:
+                       /* rcu_str_deref() is used, hence the _in_rcu variant */
+                       btrfs_err_in_rcu(fs_info,
+               "zoned: offline/readonly zone %llu on device %s (devid %llu)",
+                                 physical >> device->zone_info->zone_size_shift,
+                                 rcu_str_deref(device->name), device->devid);
+                       alloc_offsets[i] = WP_MISSING_DEV;
+                       break;
+               case BLK_ZONE_COND_EMPTY:
+                       alloc_offsets[i] = 0;
+                       break;
+               case BLK_ZONE_COND_FULL:
+                       alloc_offsets[i] = fs_info->zone_size;
+                       break;
+               default:
+                       /* Partially used zone */
+                       alloc_offsets[i] =
+                                       ((zone.wp - zone.start) << SECTOR_SHIFT);
+                       break;
+               }
+       }
+
+       if (num_sequential > 0)
+               cache->seq_zone = true;
+
+       if (num_conventional > 0) {
+               /*
+                * Avoid calling calculate_alloc_pointer() for new BG. It
+                * is no use for new BG. It must be always 0.
+                *
+                * Also, we have a lock chain of extent buffer lock ->
+                * chunk mutex.  For new BG, this function is called from
+                * btrfs_make_block_group() which is already taking the
+                * chunk mutex. Thus, we cannot call
+                * calculate_alloc_pointer() which takes extent buffer
+                * locks to avoid deadlock.
+                */
+               if (new) {
+                       cache->alloc_offset = 0;
+                       goto out;
+               }
+               ret = calculate_alloc_pointer(cache, &last_alloc);
+               /*
+                * Stop here on failure, or when every stripe is conventional
+                * and the extent tree result is the allocation offset.
+                */
+               if (ret || map->num_stripes == num_conventional) {
+                       if (!ret)
+                               cache->alloc_offset = last_alloc;
+                       else
+                               btrfs_err(fs_info,
+                       "zoned: failed to determine allocation offset of bg %llu",
+                                         cache->start);
+                       goto out;
+               }
+       }
+
+       switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+       case 0: /* single */
+               cache->alloc_offset = alloc_offsets[0];
+               break;
+       case BTRFS_BLOCK_GROUP_DUP:
+       case BTRFS_BLOCK_GROUP_RAID1:
+       case BTRFS_BLOCK_GROUP_RAID0:
+       case BTRFS_BLOCK_GROUP_RAID10:
+       case BTRFS_BLOCK_GROUP_RAID5:
+       case BTRFS_BLOCK_GROUP_RAID6:
+               /* non-single profiles are not supported yet */
+       default:
+               btrfs_err(fs_info, "zoned: profile %s not yet supported",
+                         btrfs_bg_type_to_raid_name(map->type));
+               ret = -EINVAL;
+               goto out;
+       }
+
+out:
+       /* An extent is allocated after the write pointer */
+       if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
+               btrfs_err(fs_info,
+                         "zoned: got wrong write pointer in BG %llu: %llu > %llu",
+                         logical, last_alloc, cache->alloc_offset);
+               ret = -EIO;
+       }
+
+       if (!ret)
+               cache->meta_write_pointer = cache->alloc_offset + cache->start;
+
+       kfree(alloc_offsets);
+       free_extent_map(em);
+
+       return ret;
+}
+
+/*
+ * Initialize the free space accounting of a block group on a zoned
+ * filesystem. Does nothing when the filesystem is not zoned.
+ *
+ * The space between the allocation offset and the end of the block group is
+ * recorded as free. The space below the allocation offset that is not
+ * counted in cache->used is recorded as zone_unusable.
+ */
+void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
+{
+       if (!btrfs_is_zoned(cache->fs_info))
+               return;
+
+       WARN_ON(cache->bytes_super != 0);
+
+       /* We only need ->free_space in ALLOC_SEQ block groups */
+       cache->last_byte_to_unpin = (u64)-1;
+       cache->cached = BTRFS_CACHE_FINISHED;
+       cache->free_space_ctl->free_space = cache->length - cache->alloc_offset;
+       cache->zone_unusable = cache->alloc_offset - cache->used;
+
+       /* Should not have any excluded extents. Just in case, though */
+       btrfs_free_excluded_extents(cache);
+}
+
+/*
+ * Re-dirty an extent buffer and queue it on the transaction's releasing_ebs
+ * list.
+ *
+ * Nothing is done when the filesystem is not zoned, when the buffer header
+ * carries BTRFS_HEADER_FLAG_WRITTEN, or when the buffer is already on a
+ * release list. Otherwise the buffer is marked dirty, its content is zeroed,
+ * EXTENT_BUFFER_NO_CHECK is set on it, and an extra reference is taken for
+ * the list.
+ */
+void btrfs_redirty_list_add(struct btrfs_transaction *trans,
+                           struct extent_buffer *eb)
+{
+       struct btrfs_fs_info *fs_info = eb->fs_info;
+
+       if (!btrfs_is_zoned(fs_info))
+               return;
+       if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN))
+               return;
+       /* Already queued for release */
+       if (!list_empty(&eb->release_list))
+               return;
+
+       set_extent_buffer_dirty(eb);
+       set_extent_bits_nowait(&trans->dirty_pages, eb->start,
+                              eb->start + eb->len - 1, EXTENT_DIRTY);
+       memzero_extent_buffer(eb, 0, eb->len);
+       set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags);
+
+       spin_lock(&trans->releasing_ebs_lock);
+       list_add_tail(&eb->release_list, &trans->releasing_ebs);
+       spin_unlock(&trans->releasing_ebs_lock);
+
+       /* Reference held by the releasing_ebs list */
+       atomic_inc(&eb->refs);
+}
+
+/*
+ * Empty the transaction's releasing_ebs list: every extent buffer is unlinked
+ * from the list and released with free_extent_buffer(), all under
+ * trans->releasing_ebs_lock.
+ */
+void btrfs_free_redirty_list(struct btrfs_transaction *trans)
+{
+       struct list_head *head = &trans->releasing_ebs;
+       struct extent_buffer *eb;
+
+       spin_lock(&trans->releasing_ebs_lock);
+       while (!list_empty(head)) {
+               eb = list_first_entry(head, struct extent_buffer, release_list);
+               list_del_init(&eb->release_list);
+               free_extent_buffer(eb);
+       }
+       spin_unlock(&trans->releasing_ebs_lock);
+}
+
+/*
+ * Tell whether writes to the extent described by @em should use zone append.
+ *
+ * Return: true only if the filesystem is zoned, max_zone_append_size is
+ * non-zero, @inode is a data inode and the block group containing
+ * em->block_start has seq_zone set; false otherwise.
+ */
+bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em)
+{
+       struct btrfs_fs_info *fs_info = inode->root->fs_info;
+       struct btrfs_block_group *bg;
+       bool on_seq_zone;
+
+       if (!btrfs_is_zoned(fs_info) || !fs_info->max_zone_append_size ||
+           !is_data_inode(&inode->vfs_inode))
+               return false;
+
+       bg = btrfs_lookup_block_group(fs_info, em->block_start);
+       ASSERT(bg);
+       if (!bg)
+               return false;
+
+       /* Read the flag before dropping the block group reference */
+       on_seq_zone = bg->seq_zone;
+       btrfs_put_block_group(bg);
+
+       return on_seq_zone;
+}
+
+/*
+ * Record on the ordered extent where a zone append bio was submitted.
+ *
+ * For a REQ_OP_ZONE_APPEND @bio, look up the ordered extent of @inode at
+ * @file_offset and store the bio's sector (converted to bytes), disk and
+ * partition number in it. Bios of any other operation are ignored.
+ */
+void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset,
+                                struct bio *bio)
+{
+       struct btrfs_ordered_extent *oe;
+
+       if (bio_op(bio) != REQ_OP_ZONE_APPEND)
+               return;
+
+       oe = btrfs_lookup_ordered_extent(BTRFS_I(inode), file_offset);
+       if (WARN_ON(!oe))
+               return;
+
+       oe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+       oe->disk = bio->bi_bdev->bd_disk;
+       oe->partno = bio->bi_bdev->bd_partno;
+
+       /* Drop the reference taken by the lookup */
+       btrfs_put_ordered_extent(oe);
+}
+
+/*
+ * Rewrite the logical address of an ordered extent from the physical location
+ * recorded in it.
+ *
+ * The recorded physical address is mapped back to a logical address with
+ * btrfs_rmap_block(). If it differs from ordered->disk_bytenr, the ordered
+ * extent, the matching extent map and every checksum entry attached to the
+ * ordered extent are shifted to the new logical address.
+ */
+void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered)
+{
+       struct btrfs_inode *inode = BTRFS_I(ordered->inode);
+       struct btrfs_fs_info *fs_info = inode->root->fs_info;
+       struct extent_map_tree *em_tree = &inode->extent_tree;
+       const u64 old_logical = ordered->disk_bytenr;
+       struct btrfs_ordered_sum *sum;
+       struct block_device *bdev;
+       struct extent_map *em;
+       u64 *logical = NULL;
+       u64 new_logical;
+       int nr, stripe_len;
+
+       /* Zoned devices should not have partitions. So, we can assume it is 0 */
+       ASSERT(ordered->partno == 0);
+       bdev = bdgrab(ordered->disk->part0);
+       if (WARN_ON(!bdev))
+               return;
+
+       if (WARN_ON(btrfs_rmap_block(fs_info, old_logical, bdev,
+                                    ordered->physical, &logical, &nr,
+                                    &stripe_len)))
+               goto out;
+
+       WARN_ON(nr != 1);
+
+       new_logical = *logical;
+       /* Written exactly where it was expected: nothing to rewrite */
+       if (new_logical == old_logical)
+               goto out;
+
+       ordered->disk_bytenr = new_logical;
+
+       write_lock(&em_tree->lock);
+       em = search_extent_mapping(em_tree, ordered->file_offset,
+                                  ordered->num_bytes);
+       em->block_start = new_logical;
+       free_extent_map(em);
+       write_unlock(&em_tree->lock);
+
+       /* Move the checksum entries by the same distance */
+       list_for_each_entry(sum, &ordered->list, list) {
+               if (new_logical < old_logical)
+                       sum->bytenr -= old_logical - new_logical;
+               else
+                       sum->bytenr += new_logical - old_logical;
+       }
+
+out:
+       kfree(logical);
+       bdput(bdev);
+}
+
+/*
+ * Check that @eb starts at the metadata write pointer of its block group and
+ * advance that pointer past @eb if it does.
+ *
+ * @cache_ret caches the block group between calls. A cached block group that
+ * does not contain eb->start is released and replaced by a fresh lookup.
+ *
+ * Return: true if the filesystem is not zoned, if no block group is found, or
+ * if @eb starts at the write pointer (in which case *@cache_ret holds the
+ * block group). Returns false if @eb does not start at the write pointer; the
+ * block group is then released and *@cache_ret is set to NULL.
+ */
+bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
+                                   struct extent_buffer *eb,
+                                   struct btrfs_block_group **cache_ret)
+{
+       struct btrfs_block_group *bg;
+
+       if (!btrfs_is_zoned(fs_info))
+               return true;
+
+       bg = *cache_ret;
+
+       /* The cached block group does not cover this buffer: drop it */
+       if (bg && (eb->start < bg->start ||
+                  bg->start + bg->length <= eb->start)) {
+               btrfs_put_block_group(bg);
+               bg = NULL;
+               *cache_ret = NULL;
+       }
+
+       if (!bg) {
+               bg = btrfs_lookup_block_group(fs_info, eb->start);
+               if (!bg)
+                       return true;
+       }
+
+       if (bg->meta_write_pointer != eb->start) {
+               btrfs_put_block_group(bg);
+               *cache_ret = NULL;
+               return false;
+       }
+
+       bg->meta_write_pointer = eb->start + eb->len;
+       *cache_ret = bg;
+
+       return true;
+}
+
+/*
+ * Move the metadata write pointer of @cache back to the start of @eb. The
+ * pointer is expected to sit right after @eb. Does nothing when the
+ * filesystem is not zoned or @cache is NULL.
+ */
+void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache,
+                                    struct extent_buffer *eb)
+{
+       if (btrfs_is_zoned(eb->fs_info) && cache) {
+               ASSERT(cache->meta_write_pointer == eb->start + eb->len);
+               cache->meta_write_pointer = eb->start;
+       }
+}
+
+/*
+ * Zero out @length bytes at @physical on @device.
+ *
+ * Return: -EOPNOTSUPP if @physical is not in a sequential zone, otherwise the
+ * result of blkdev_issue_zeroout().
+ */
+int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length)
+{
+       const bool sequential = btrfs_dev_is_sequential(device, physical);
+
+       if (!sequential)
+               return -EOPNOTSUPP;
+
+       return blkdev_issue_zeroout(device->bdev, physical >> SECTOR_SHIFT,
+                                   length >> SECTOR_SHIFT, GFP_NOFS, 0);
+}
+
+/*
+ * Read the zone containing @logical from one of the mirrors of its mapping.
+ *
+ * The stripes are tried in order. Missing devices and devices for which
+ * btrfs_get_dev_zone() returns -EIO or -EOPNOTSUPP are skipped; the first
+ * other result ends the search.
+ *
+ * The btrfs_bio obtained from btrfs_map_sblock() is released on every path.
+ *
+ * NOTE(review): if every stripe is on a missing device, 0 is returned and
+ * @zone is left untouched - confirm callers cannot hit this.
+ *
+ * Return: -EIO if the mapping fails or is shorter than PAGE_SIZE, -EINVAL for
+ * RAID5/6 mappings, otherwise the result of the last btrfs_get_dev_zone().
+ */
+static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
+                         struct blk_zone *zone)
+{
+       struct btrfs_bio *bbio = NULL;
+       u64 mapped_length = PAGE_SIZE;
+       unsigned int nofs_flag;
+       int nmirrors;
+       int i, ret;
+
+       ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
+                              &mapped_length, &bbio);
+       if (ret || !bbio || mapped_length < PAGE_SIZE) {
+               ret = -EIO;
+               goto out_put_bbio;
+       }
+
+       if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+               ret = -EINVAL;
+               goto out_put_bbio;
+       }
+
+       nofs_flag = memalloc_nofs_save();
+       nmirrors = (int)bbio->num_stripes;
+       for (i = 0; i < nmirrors; i++) {
+               u64 physical = bbio->stripes[i].physical;
+               struct btrfs_device *dev = bbio->stripes[i].dev;
+
+               /* Missing device */
+               if (!dev->bdev)
+                       continue;
+
+               ret = btrfs_get_dev_zone(dev, physical, zone);
+               /* Failing device */
+               if (ret == -EIO || ret == -EOPNOTSUPP)
+                       continue;
+               break;
+       }
+       memalloc_nofs_restore(nofs_flag);
+
+out_put_bbio:
+       /* Drop the mapping on every path, as the -EIO path already did */
+       btrfs_put_bbio(bbio);
+       return ret;
+}
+
+/*
+ * Synchronize the write pointer of the zone at @physical_start on @tgt_dev
+ * with the write pointer of the dev-replace source device, by zero filling
+ * from @physical_pos up to that write pointer.
+ *
+ * Return: 0 if @physical_pos is not in a sequential zone of @tgt_dev or is
+ * already at the source write pointer, -EUCLEAN if @physical_pos is beyond
+ * it, otherwise the result of reading the zone or of the zeroout.
+ */
+int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
+                                   u64 physical_start, u64 physical_pos)
+{
+       struct btrfs_fs_info *fs_info = tgt_dev->fs_info;
+       struct blk_zone src_zone;
+       u64 target_wp;
+       int ret;
+
+       if (!btrfs_dev_is_sequential(tgt_dev, physical_pos))
+               return 0;
+
+       ret = read_zone_info(fs_info, logical, &src_zone);
+       if (ret)
+               return ret;
+
+       /* Source write pointer offset, applied to the target zone */
+       target_wp = physical_start +
+                   ((src_zone.wp - src_zone.start) << SECTOR_SHIFT);
+
+       if (physical_pos >= target_wp)
+               return physical_pos == target_wp ? 0 : -EUCLEAN;
+
+       return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos,
+                                        target_wp - physical_pos);
+}