LCOV - code coverage report
Current view: top level - fs/btrfs - zoned.c (source / functions)
Test: fstests of 6.5.0-rc3-acha @ Mon Jul 31 20:08:06 PDT 2023
Date: 2023-07-31 20:08:07

              Hit   Total   Coverage
  Lines:       11    1203      0.9 %
  Functions:    1      54      1.9 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0
       2             : 
       3             : #include <linux/bitops.h>
       4             : #include <linux/slab.h>
       5             : #include <linux/blkdev.h>
       6             : #include <linux/sched/mm.h>
       7             : #include <linux/atomic.h>
       8             : #include <linux/vmalloc.h>
       9             : #include "ctree.h"
      10             : #include "volumes.h"
      11             : #include "zoned.h"
      12             : #include "rcu-string.h"
      13             : #include "disk-io.h"
      14             : #include "block-group.h"
      15             : #include "transaction.h"
      16             : #include "dev-replace.h"
      17             : #include "space-info.h"
      18             : #include "super.h"
      19             : #include "fs.h"
      20             : #include "accessors.h"
      21             : #include "bio.h"
      22             : 
      23             : /* Maximum number of zones to report per blkdev_report_zones() call */
      24             : #define BTRFS_REPORT_NR_ZONES   4096
      25             : /* Invalid allocation pointer value for missing devices */
      26             : #define WP_MISSING_DEV ((u64)-1)
      27             : /* Pseudo write pointer value for conventional zone */
      28             : #define WP_CONVENTIONAL ((u64)-2)
      29             : 
      30             : /*
      31             :  * Location of the first zone of superblock logging zone pairs.
      32             :  *
      33             :  * - primary superblock:    0B (zone 0)
      34             :  * - first copy:          512G (zone starting at that offset)
      35             :  * - second copy:           4T (zone starting at that offset)
      36             :  */
      37             : #define BTRFS_SB_LOG_PRIMARY_OFFSET     (0ULL)
      38             : #define BTRFS_SB_LOG_FIRST_OFFSET       (512ULL * SZ_1G)
      39             : #define BTRFS_SB_LOG_SECOND_OFFSET      (4096ULL * SZ_1G)
      40             : 
      41             : #define BTRFS_SB_LOG_FIRST_SHIFT        const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET)
      42             : #define BTRFS_SB_LOG_SECOND_SHIFT       const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET)
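                      : 
                      : /*
                      :  * Both offsets above are powers of two (2^39 and 2^42 bytes), so the
                      :  * shifts evaluate to 39 and 42 respectively.
                      :  */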
      43             : 
      44             : /* Number of superblock log zones */
      45             : #define BTRFS_NR_SB_LOG_ZONES 2
      46             : 
      47             : /*
       48             :  * Minimum number of active zones we need:
      49             :  *
      50             :  * - BTRFS_SUPER_MIRROR_MAX zones for superblock mirrors
      51             :  * - 3 zones to ensure at least one zone per SYSTEM, META and DATA block group
      52             :  * - 1 zone for tree-log dedicated block group
      53             :  * - 1 zone for relocation
      54             :  */
      55             : #define BTRFS_MIN_ACTIVE_ZONES          (BTRFS_SUPER_MIRROR_MAX + 5)
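                      : /* With BTRFS_SUPER_MIRROR_MAX == 3, this evaluates to 8 zones. */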
      56             : 
      57             : /*
      58             :  * Minimum / maximum supported zone size. Currently, SMR disks have a zone
      59             :  * size of 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range.
      60             :  * We do not expect the zone size to become larger than 8GiB or smaller than
      61             :  * 4MiB in the near future.
      62             :  */
      63             : #define BTRFS_MAX_ZONE_SIZE             SZ_8G
      64             : #define BTRFS_MIN_ZONE_SIZE             SZ_4M
      65             : 
      66             : #define SUPER_INFO_SECTORS      ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT)
      67             : 
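                      : /*
                      :  * A superblock log zone is full when it cannot hold one more superblock
                      :  * copy (SUPER_INFO_SECTORS, i.e. 8 sectors for the 4KiB superblock)
                      :  * within its usable capacity.
                      :  */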
      68             : static inline bool sb_zone_is_full(const struct blk_zone *zone)
      69             : {
      70           0 :         return (zone->cond == BLK_ZONE_COND_FULL) ||
      71           0 :                 (zone->wp + SUPER_INFO_SECTORS > zone->start + zone->capacity);
      72             : }
      73             : 
      74           0 : static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
      75             : {
      76           0 :         struct blk_zone *zones = data;
      77             : 
      78           0 :         memcpy(&zones[idx], zone, sizeof(*zone));
      79             : 
      80           0 :         return 0;
      81             : }
      82             : 
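                      : /*
                      :  * Determine the write location of the superblock log from the pair of
                      :  * log zones. Returns 0 with *wp_ret set to the byte offset to use,
                      :  * -ENOENT when both zones are empty (no superblock written yet),
                      :  * -EUCLEAN for an invalid combination of zone states, or another
                      :  * negative errno on a read failure.
                      :  */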
      83           0 : static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
      84             :                             u64 *wp_ret)
      85             : {
      86           0 :         bool empty[BTRFS_NR_SB_LOG_ZONES];
      87           0 :         bool full[BTRFS_NR_SB_LOG_ZONES];
      88           0 :         sector_t sector;
      89           0 :         int i;
      90             : 
      91           0 :         for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
      92           0 :                 ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL);
      93           0 :                 empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY);
      94           0 :                 full[i] = sb_zone_is_full(&zones[i]);
      95             :         }
      96             : 
      97             :         /*
      98             :          * Possible states of log buffer zones
      99             :          *
     100             :          *           Empty[0]  In use[0]  Full[0]
     101             :          * Empty[1]         *          0        1
     102             :          * In use[1]        x          x        1
     103             :          * Full[1]          0          0        C
     104             :          *
     105             :          * Log position:
     106             :          *   *: Special case, no superblock is written
     107             :          *   0: Use write pointer of zones[0]
     108             :          *   1: Use write pointer of zones[1]
     109             :          *   C: Compare super blocks from zones[0] and zones[1], use the latest
     110             :          *      one determined by generation
     111             :          *   x: Invalid state
     112             :          */
     113             : 
     114           0 :         if (empty[0] && empty[1]) {
      115             :                 /* Special case to signal that there is no superblock to read */
     116           0 :                 *wp_ret = zones[0].start << SECTOR_SHIFT;
     117           0 :                 return -ENOENT;
     118           0 :         } else if (full[0] && full[1]) {
     119             :                 /* Compare two super blocks */
     120           0 :                 struct address_space *mapping = bdev->bd_inode->i_mapping;
     121           0 :                 struct page *page[BTRFS_NR_SB_LOG_ZONES];
     122           0 :                 struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];
     123           0 :                 int i;
     124             : 
     125           0 :                 for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
     126           0 :                         u64 zone_end = (zones[i].start + zones[i].capacity) << SECTOR_SHIFT;
     127           0 :                         u64 bytenr = ALIGN_DOWN(zone_end, BTRFS_SUPER_INFO_SIZE) -
     128             :                                                 BTRFS_SUPER_INFO_SIZE;
     129             : 
     130           0 :                         page[i] = read_cache_page_gfp(mapping,
     131           0 :                                         bytenr >> PAGE_SHIFT, GFP_NOFS);
     132           0 :                         if (IS_ERR(page[i])) {
     133           0 :                                 if (i == 1)
     134           0 :                                         btrfs_release_disk_super(super[0]);
     135           0 :                                 return PTR_ERR(page[i]);
     136             :                         }
     137           0 :                         super[i] = page_address(page[i]);
     138             :                 }
     139             : 
     140           0 :                 if (btrfs_super_generation(super[0]) >
     141           0 :                     btrfs_super_generation(super[1]))
     142           0 :                         sector = zones[1].start;
     143             :                 else
     144           0 :                         sector = zones[0].start;
     145             : 
     146           0 :                 for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
     147           0 :                         btrfs_release_disk_super(super[i]);
     148           0 :         } else if (!full[0] && (empty[1] || full[1])) {
     149           0 :                 sector = zones[0].wp;
     150           0 :         } else if (full[0]) {
     151           0 :                 sector = zones[1].wp;
     152             :         } else {
     153             :                 return -EUCLEAN;
     154             :         }
     155           0 :         *wp_ret = sector << SECTOR_SHIFT;
     156           0 :         return 0;
     157             : }
     158             : 
     159             : /*
     160             :  * Get the first zone number of the superblock mirror
     161             :  */
     162           0 : static inline u32 sb_zone_number(int shift, int mirror)
     163             : {
     164           0 :         u64 zone = U64_MAX;
     165             : 
     166           0 :         ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
     167           0 :         switch (mirror) {
     168           0 :         case 0: zone = 0; break;
     169           0 :         case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break;
     170           0 :         case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break;
     171             :         }
     172             : 
     173           0 :         ASSERT(zone <= U32_MAX);
     174             : 
     175           0 :         return (u32)zone;
     176             : }
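                      : 
                      : /*
                      :  * Example: with a 256MiB zone size (shift == 28), mirror 1 maps to
                      :  * zone 1 << (39 - 28) == 2048 (512G / 256M) and mirror 2 to zone
                      :  * 1 << (42 - 28) == 16384 (4T / 256M).
                      :  */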
     177             : 
     178           0 : static inline sector_t zone_start_sector(u32 zone_number,
     179             :                                          struct block_device *bdev)
     180             : {
     181           0 :         return (sector_t)zone_number << ilog2(bdev_zone_sectors(bdev));
     182             : }
     183             : 
     184             : static inline u64 zone_start_physical(u32 zone_number,
     185             :                                       struct btrfs_zoned_device_info *zone_info)
     186             : {
     187           0 :         return (u64)zone_number << zone_info->zone_size_shift;
     188             : }
     189             : 
     190             : /*
      191             :  * Emulate blkdev_report_zones() for a non-zoned device. It slices up the
      192             :  * block device into fixed-size chunks and fakes a conventional zone on
      193             :  * each of them.
     194             :  */
     195           0 : static int emulate_report_zones(struct btrfs_device *device, u64 pos,
     196             :                                 struct blk_zone *zones, unsigned int nr_zones)
     197             : {
     198           0 :         const sector_t zone_sectors = device->fs_info->zone_size >> SECTOR_SHIFT;
     199           0 :         sector_t bdev_size = bdev_nr_sectors(device->bdev);
     200           0 :         unsigned int i;
     201             : 
     202           0 :         pos >>= SECTOR_SHIFT;
     203           0 :         for (i = 0; i < nr_zones; i++) {
     204           0 :                 zones[i].start = i * zone_sectors + pos;
     205           0 :                 zones[i].len = zone_sectors;
     206           0 :                 zones[i].capacity = zone_sectors;
     207           0 :                 zones[i].wp = zones[i].start + zone_sectors;
     208           0 :                 zones[i].type = BLK_ZONE_TYPE_CONVENTIONAL;
     209           0 :                 zones[i].cond = BLK_ZONE_COND_NOT_WP;
     210             : 
     211           0 :                 if (zones[i].wp >= bdev_size) {
     212           0 :                         i++;
     213           0 :                         break;
     214             :                 }
     215             :         }
     216             : 
     217           0 :         return i;
     218             : }
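                      : 
                      : /*
                      :  * For example, with an emulated zone size of 256MiB, a 1GiB device is
                      :  * reported as four conventional zones with cond == BLK_ZONE_COND_NOT_WP
                      :  * and wp == start + len.
                      :  */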
     219             : 
     220           0 : static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
     221             :                                struct blk_zone *zones, unsigned int *nr_zones)
     222             : {
     223           0 :         struct btrfs_zoned_device_info *zinfo = device->zone_info;
     224           0 :         int ret;
     225             : 
     226           0 :         if (!*nr_zones)
     227             :                 return 0;
     228             : 
     229           0 :         if (!bdev_is_zoned(device->bdev)) {
     230           0 :                 ret = emulate_report_zones(device, pos, zones, *nr_zones);
     231           0 :                 *nr_zones = ret;
     232           0 :                 return 0;
     233             :         }
     234             : 
     235             :         /* Check cache */
     236           0 :         if (zinfo->zone_cache) {
     237           0 :                 unsigned int i;
     238           0 :                 u32 zno;
     239             : 
     240           0 :                 ASSERT(IS_ALIGNED(pos, zinfo->zone_size));
     241           0 :                 zno = pos >> zinfo->zone_size_shift;
     242             :                 /*
     243             :                  * We cannot report zones beyond the zone end. So, it is OK to
     244             :                  * cap *nr_zones to at the end.
     245             :                  */
     246           0 :                 *nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno);
     247             : 
     248           0 :                 for (i = 0; i < *nr_zones; i++) {
     249           0 :                         struct blk_zone *zone_info;
     250             : 
     251           0 :                         zone_info = &zinfo->zone_cache[zno + i];
     252           0 :                         if (!zone_info->len)
     253             :                                 break;
     254             :                 }
     255             : 
     256           0 :                 if (i == *nr_zones) {
     257             :                         /* Cache hit on all the zones */
     258           0 :                         memcpy(zones, zinfo->zone_cache + zno,
     259             :                                sizeof(*zinfo->zone_cache) * *nr_zones);
     260           0 :                         return 0;
     261             :                 }
     262             :         }
     263             : 
     264           0 :         ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
     265             :                                   copy_zone_info_cb, zones);
     266           0 :         if (ret < 0) {
     267           0 :                 btrfs_err_in_rcu(device->fs_info,
     268             :                                  "zoned: failed to read zone %llu on %s (devid %llu)",
     269             :                                  pos, rcu_str_deref(device->name),
     270             :                                  device->devid);
     271           0 :                 return ret;
     272             :         }
     273           0 :         *nr_zones = ret;
     274           0 :         if (!ret)
     275             :                 return -EIO;
     276             : 
     277             :         /* Populate cache */
     278           0 :         if (zinfo->zone_cache) {
     279           0 :                 u32 zno = pos >> zinfo->zone_size_shift;
     280             : 
     281           0 :                 memcpy(zinfo->zone_cache + zno, zones,
     282             :                        sizeof(*zinfo->zone_cache) * *nr_zones);
     283             :         }
     284             : 
     285             :         return 0;
     286             : }
     287             : 
      288             : /* The emulated zone size is determined from the size of the first device extent */
     289           0 : static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
     290             : {
     291           0 :         struct btrfs_path *path;
     292           0 :         struct btrfs_root *root = fs_info->dev_root;
     293           0 :         struct btrfs_key key;
     294           0 :         struct extent_buffer *leaf;
     295           0 :         struct btrfs_dev_extent *dext;
     296           0 :         int ret = 0;
     297             : 
     298           0 :         key.objectid = 1;
     299           0 :         key.type = BTRFS_DEV_EXTENT_KEY;
     300           0 :         key.offset = 0;
     301             : 
     302           0 :         path = btrfs_alloc_path();
     303           0 :         if (!path)
     304             :                 return -ENOMEM;
     305             : 
     306           0 :         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
     307           0 :         if (ret < 0)
     308           0 :                 goto out;
     309             : 
     310           0 :         if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
     311           0 :                 ret = btrfs_next_leaf(root, path);
     312           0 :                 if (ret < 0)
     313           0 :                         goto out;
     314             :                 /* No dev extents at all? Not good */
     315           0 :                 if (ret > 0) {
     316           0 :                         ret = -EUCLEAN;
     317           0 :                         goto out;
     318             :                 }
     319             :         }
     320             : 
     321           0 :         leaf = path->nodes[0];
     322           0 :         dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
     323           0 :         fs_info->zone_size = btrfs_dev_extent_length(leaf, dext);
     324           0 :         ret = 0;
     325             : 
     326           0 : out:
     327           0 :         btrfs_free_path(path);
     328             : 
     329           0 :         return ret;
     330             : }
     331             : 
     332           0 : int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
     333             : {
     334           0 :         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
     335           0 :         struct btrfs_device *device;
     336           0 :         int ret = 0;
     337             : 
      338             :         /* fs_info->zone_size might not be set yet. Use the incompat flag here. */
     339           0 :         if (!btrfs_fs_incompat(fs_info, ZONED))
     340             :                 return 0;
     341             : 
     342           0 :         mutex_lock(&fs_devices->device_list_mutex);
     343           0 :         list_for_each_entry(device, &fs_devices->devices, dev_list) {
      344             :                 /* We can skip reading the zone info for missing devices */
     345           0 :                 if (!device->bdev)
     346           0 :                         continue;
     347             : 
     348           0 :                 ret = btrfs_get_dev_zone_info(device, true);
     349           0 :                 if (ret)
     350             :                         break;
     351             :         }
     352           0 :         mutex_unlock(&fs_devices->device_list_mutex);
     353             : 
     354           0 :         return ret;
     355             : }
     356             : 
     357           0 : int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
     358             : {
     359           0 :         struct btrfs_fs_info *fs_info = device->fs_info;
     360           0 :         struct btrfs_zoned_device_info *zone_info = NULL;
     361           0 :         struct block_device *bdev = device->bdev;
     362           0 :         unsigned int max_active_zones;
     363           0 :         unsigned int nactive;
     364           0 :         sector_t nr_sectors;
     365           0 :         sector_t sector = 0;
     366           0 :         struct blk_zone *zones = NULL;
     367           0 :         unsigned int i, nreported = 0, nr_zones;
     368           0 :         sector_t zone_sectors;
     369           0 :         char *model, *emulated;
     370           0 :         int ret;
     371             : 
     372             :         /*
     373             :          * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not
     374             :          * yet be set.
     375             :          */
     376           0 :         if (!btrfs_fs_incompat(fs_info, ZONED))
     377             :                 return 0;
     378             : 
     379           0 :         if (device->zone_info)
     380             :                 return 0;
     381             : 
     382           0 :         zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL);
     383           0 :         if (!zone_info)
     384             :                 return -ENOMEM;
     385             : 
     386           0 :         device->zone_info = zone_info;
     387             : 
     388           0 :         if (!bdev_is_zoned(bdev)) {
     389           0 :                 if (!fs_info->zone_size) {
     390           0 :                         ret = calculate_emulated_zone_size(fs_info);
     391           0 :                         if (ret)
     392           0 :                                 goto out;
     393             :                 }
     394             : 
     395           0 :                 ASSERT(fs_info->zone_size);
     396           0 :                 zone_sectors = fs_info->zone_size >> SECTOR_SHIFT;
     397             :         } else {
     398           0 :                 zone_sectors = bdev_zone_sectors(bdev);
     399             :         }
     400             : 
     401           0 :         ASSERT(is_power_of_two_u64(zone_sectors));
     402           0 :         zone_info->zone_size = zone_sectors << SECTOR_SHIFT;
     403             : 
      404             :         /* We reject devices with a zone size larger than 8GiB */
     405           0 :         if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) {
     406           0 :                 btrfs_err_in_rcu(fs_info,
     407             :                 "zoned: %s: zone size %llu larger than supported maximum %llu",
     408             :                                  rcu_str_deref(device->name),
     409             :                                  zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
     410           0 :                 ret = -EINVAL;
     411           0 :                 goto out;
     412           0 :         } else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) {
     413           0 :                 btrfs_err_in_rcu(fs_info,
     414             :                 "zoned: %s: zone size %llu smaller than supported minimum %u",
     415             :                                  rcu_str_deref(device->name),
     416             :                                  zone_info->zone_size, BTRFS_MIN_ZONE_SIZE);
     417           0 :                 ret = -EINVAL;
     418           0 :                 goto out;
     419             :         }
     420             : 
     421           0 :         nr_sectors = bdev_nr_sectors(bdev);
     422           0 :         zone_info->zone_size_shift = ilog2(zone_info->zone_size);
     423           0 :         zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
     424           0 :         if (!IS_ALIGNED(nr_sectors, zone_sectors))
     425           0 :                 zone_info->nr_zones++;
     426             : 
     427           0 :         max_active_zones = bdev_max_active_zones(bdev);
     428           0 :         if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
     429           0 :                 btrfs_err_in_rcu(fs_info,
     430             : "zoned: %s: max active zones %u is too small, need at least %u active zones",
     431             :                                  rcu_str_deref(device->name), max_active_zones,
     432             :                                  BTRFS_MIN_ACTIVE_ZONES);
     433           0 :                 ret = -EINVAL;
     434           0 :                 goto out;
     435             :         }
     436           0 :         zone_info->max_active_zones = max_active_zones;
     437             : 
     438           0 :         zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
     439           0 :         if (!zone_info->seq_zones) {
     440           0 :                 ret = -ENOMEM;
     441           0 :                 goto out;
     442             :         }
     443             : 
     444           0 :         zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
     445           0 :         if (!zone_info->empty_zones) {
     446           0 :                 ret = -ENOMEM;
     447           0 :                 goto out;
     448             :         }
     449             : 
     450           0 :         zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
     451           0 :         if (!zone_info->active_zones) {
     452           0 :                 ret = -ENOMEM;
     453           0 :                 goto out;
     454             :         }
     455             : 
     456           0 :         zones = kvcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
     457           0 :         if (!zones) {
     458           0 :                 ret = -ENOMEM;
     459           0 :                 goto out;
     460             :         }
     461             : 
     462             :         /*
     463             :          * Enable zone cache only for a zoned device. On a non-zoned device, we
     464             :          * fill the zone info with emulated CONVENTIONAL zones, so no need to
     465             :          * use the cache.
     466             :          */
     467           0 :         if (populate_cache && bdev_is_zoned(device->bdev)) {
     468           0 :                 zone_info->zone_cache = vzalloc(sizeof(struct blk_zone) *
     469           0 :                                                 zone_info->nr_zones);
     470           0 :                 if (!zone_info->zone_cache) {
     471           0 :                         btrfs_err_in_rcu(device->fs_info,
     472             :                                 "zoned: failed to allocate zone cache for %s",
     473             :                                 rcu_str_deref(device->name));
     474           0 :                         ret = -ENOMEM;
     475           0 :                         goto out;
     476             :                 }
     477             :         }
     478             : 
      479             :         /* Get zone types */
     480             :         nactive = 0;
     481           0 :         while (sector < nr_sectors) {
     482           0 :                 nr_zones = BTRFS_REPORT_NR_ZONES;
     483           0 :                 ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,
     484             :                                           &nr_zones);
     485           0 :                 if (ret)
     486           0 :                         goto out;
     487             : 
     488           0 :                 for (i = 0; i < nr_zones; i++) {
     489           0 :                         if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
     490           0 :                                 __set_bit(nreported, zone_info->seq_zones);
     491           0 :                         switch (zones[i].cond) {
     492           0 :                         case BLK_ZONE_COND_EMPTY:
     493           0 :                                 __set_bit(nreported, zone_info->empty_zones);
     494             :                                 break;
     495           0 :                         case BLK_ZONE_COND_IMP_OPEN:
     496             :                         case BLK_ZONE_COND_EXP_OPEN:
     497             :                         case BLK_ZONE_COND_CLOSED:
     498           0 :                                 __set_bit(nreported, zone_info->active_zones);
     499           0 :                                 nactive++;
     500           0 :                                 break;
     501             :                         }
     502           0 :                         nreported++;
     503             :                 }
     504           0 :                 sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
     505             :         }
     506             : 
     507           0 :         if (nreported != zone_info->nr_zones) {
     508           0 :                 btrfs_err_in_rcu(device->fs_info,
     509             :                                  "inconsistent number of zones on %s (%u/%u)",
     510             :                                  rcu_str_deref(device->name), nreported,
     511             :                                  zone_info->nr_zones);
     512           0 :                 ret = -EIO;
     513           0 :                 goto out;
     514             :         }
     515             : 
     516           0 :         if (max_active_zones) {
     517           0 :                 if (nactive > max_active_zones) {
     518           0 :                         btrfs_err_in_rcu(device->fs_info,
     519             :                         "zoned: %u active zones on %s exceeds max_active_zones %u",
     520             :                                          nactive, rcu_str_deref(device->name),
     521             :                                          max_active_zones);
     522           0 :                         ret = -EIO;
     523           0 :                         goto out;
     524             :                 }
     525           0 :                 atomic_set(&zone_info->active_zones_left,
     526           0 :                            max_active_zones - nactive);
     527           0 :                 set_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags);
     528             :         }
     529             : 
     530             :         /* Validate superblock log */
     531           0 :         nr_zones = BTRFS_NR_SB_LOG_ZONES;
     532           0 :         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
     533           0 :                 u32 sb_zone;
     534           0 :                 u64 sb_wp;
     535           0 :                 int sb_pos = BTRFS_NR_SB_LOG_ZONES * i;
     536             : 
     537           0 :                 sb_zone = sb_zone_number(zone_info->zone_size_shift, i);
     538           0 :                 if (sb_zone + 1 >= zone_info->nr_zones)
     539           0 :                         continue;
     540             : 
     541           0 :                 ret = btrfs_get_dev_zones(device,
     542             :                                           zone_start_physical(sb_zone, zone_info),
     543             :                                           &zone_info->sb_zones[sb_pos],
     544             :                                           &nr_zones);
     545           0 :                 if (ret)
     546           0 :                         goto out;
     547             : 
     548           0 :                 if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
     549           0 :                         btrfs_err_in_rcu(device->fs_info,
     550             :         "zoned: failed to read super block log zone info at devid %llu zone %u",
     551             :                                          device->devid, sb_zone);
     552           0 :                         ret = -EUCLEAN;
     553           0 :                         goto out;
     554             :                 }
     555             : 
     556             :                 /*
     557             :                  * If zones[0] is conventional, always use the beginning of the
      558             :                  * zone to record the superblock. No need to validate in that case.
     559             :                  */
     560           0 :                 if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type ==
     561             :                     BLK_ZONE_TYPE_CONVENTIONAL)
     562           0 :                         continue;
     563             : 
     564           0 :                 ret = sb_write_pointer(device->bdev,
     565             :                                        &zone_info->sb_zones[sb_pos], &sb_wp);
     566           0 :                 if (ret != -ENOENT && ret) {
     567           0 :                         btrfs_err_in_rcu(device->fs_info,
     568             :                         "zoned: super block log zone corrupted devid %llu zone %u",
     569             :                                          device->devid, sb_zone);
     570           0 :                         ret = -EUCLEAN;
     571           0 :                         goto out;
     572             :                 }
     573             :         }
     574             : 
     575             : 
     576           0 :         kvfree(zones);
     577             : 
     578           0 :         switch (bdev_zoned_model(bdev)) {
     579             :         case BLK_ZONED_HM:
     580             :                 model = "host-managed zoned";
     581             :                 emulated = "";
     582             :                 break;
     583             :         case BLK_ZONED_HA:
     584             :                 model = "host-aware zoned";
     585             :                 emulated = "";
     586             :                 break;
     587             :         case BLK_ZONED_NONE:
     588             :                 model = "regular";
     589             :                 emulated = "emulated ";
     590             :                 break;
     591             :         default:
     592             :                 /* Just in case */
     593           0 :                 btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s",
     594             :                                  bdev_zoned_model(bdev),
     595             :                                  rcu_str_deref(device->name));
     596           0 :                 ret = -EOPNOTSUPP;
     597           0 :                 goto out_free_zone_info;
     598             :         }
     599             : 
     600           0 :         btrfs_info_in_rcu(fs_info,
     601             :                 "%s block device %s, %u %szones of %llu bytes",
     602             :                 model, rcu_str_deref(device->name), zone_info->nr_zones,
     603             :                 emulated, zone_info->zone_size);
     604             : 
     605           0 :         return 0;
     606             : 
     607           0 : out:
     608           0 :         kvfree(zones);
     609           0 : out_free_zone_info:
     610           0 :         btrfs_destroy_dev_zone_info(device);
     611             : 
     612           0 :         return ret;
     613             : }
     614             : 
     615           0 : void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
     616             : {
     617           0 :         struct btrfs_zoned_device_info *zone_info = device->zone_info;
     618             : 
     619           0 :         if (!zone_info)
     620             :                 return;
     621             : 
     622           0 :         bitmap_free(zone_info->active_zones);
     623           0 :         bitmap_free(zone_info->seq_zones);
     624           0 :         bitmap_free(zone_info->empty_zones);
     625           0 :         vfree(zone_info->zone_cache);
     626           0 :         kfree(zone_info);
     627           0 :         device->zone_info = NULL;
     628             : }
     629             : 
     630           0 : struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev)
     631             : {
     632           0 :         struct btrfs_zoned_device_info *zone_info;
     633             : 
     634           0 :         zone_info = kmemdup(orig_dev->zone_info, sizeof(*zone_info), GFP_KERNEL);
     635           0 :         if (!zone_info)
     636             :                 return NULL;
     637             : 
     638           0 :         zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
     639           0 :         if (!zone_info->seq_zones)
     640           0 :                 goto out;
     641             : 
     642           0 :         bitmap_copy(zone_info->seq_zones, orig_dev->zone_info->seq_zones,
     643             :                     zone_info->nr_zones);
     644             : 
     645           0 :         zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
     646           0 :         if (!zone_info->empty_zones)
     647           0 :                 goto out;
     648             : 
     649           0 :         bitmap_copy(zone_info->empty_zones, orig_dev->zone_info->empty_zones,
     650             :                     zone_info->nr_zones);
     651             : 
     652           0 :         zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
     653           0 :         if (!zone_info->active_zones)
     654           0 :                 goto out;
     655             : 
     656           0 :         bitmap_copy(zone_info->active_zones, orig_dev->zone_info->active_zones,
     657             :                     zone_info->nr_zones);
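                      :         /* kmemdup() copied the pointer; don't share the zone cache with orig_dev. */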
     658           0 :         zone_info->zone_cache = NULL;
     659             : 
     660           0 :         return zone_info;
     661             : 
     662           0 : out:
     663           0 :         bitmap_free(zone_info->seq_zones);
     664           0 :         bitmap_free(zone_info->empty_zones);
     665           0 :         bitmap_free(zone_info->active_zones);
     666           0 :         kfree(zone_info);
     667           0 :         return NULL;
     668             : }
     669             : 
     670           0 : int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
     671             :                        struct blk_zone *zone)
     672             : {
     673           0 :         unsigned int nr_zones = 1;
     674           0 :         int ret;
     675             : 
     676           0 :         ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones);
     677           0 :         if (ret != 0 || !nr_zones)
     678           0 :                 return ret ? ret : -EIO;
     679             : 
     680             :         return 0;
     681             : }
     682             : 
     683           0 : static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info)
     684             : {
     685           0 :         struct btrfs_device *device;
     686             : 
     687           0 :         list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
     688           0 :                 if (device->bdev &&
     689             :                     bdev_zoned_model(device->bdev) == BLK_ZONED_HM) {
     690           0 :                         btrfs_err(fs_info,
     691             :                                 "zoned: mode not enabled but zoned device found: %pg",
     692             :                                 device->bdev);
     693           0 :                         return -EINVAL;
     694             :                 }
     695             :         }
     696             : 
     697             :         return 0;
     698             : }
     699             : 
     700           0 : int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
     701             : {
     702           0 :         struct queue_limits *lim = &fs_info->limits;
     703           0 :         struct btrfs_device *device;
     704           0 :         u64 zone_size = 0;
     705           0 :         int ret;
     706             : 
     707             :         /*
      708             :          * Host-managed devices can't be used without the ZONED flag. With the
      709             :          * ZONED flag, all devices can be used, using zone emulation if required.
     710             :          */
     711           0 :         if (!btrfs_fs_incompat(fs_info, ZONED))
     712           0 :                 return btrfs_check_for_zoned_device(fs_info);
     713             : 
     714           0 :         blk_set_stacking_limits(lim);
     715             : 
     716           0 :         list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
     717           0 :                 struct btrfs_zoned_device_info *zone_info = device->zone_info;
     718             : 
     719           0 :                 if (!device->bdev)
     720           0 :                         continue;
     721             : 
     722           0 :                 if (!zone_size) {
     723           0 :                         zone_size = zone_info->zone_size;
     724           0 :                 } else if (zone_info->zone_size != zone_size) {
     725           0 :                         btrfs_err(fs_info,
     726             :                 "zoned: unequal block device zone sizes: have %llu found %llu",
     727             :                                   zone_info->zone_size, zone_size);
     728           0 :                         return -EINVAL;
     729             :                 }
     730             : 
     731             :                 /*
      732             :                  * With zoned emulation, we can have a non-zoned device in
      733             :                  * zoned mode. In that case, we don't have a valid max zone
      734             :                  * append size.
     735             :                  */
     736           0 :                 if (bdev_is_zoned(device->bdev)) {
     737           0 :                         blk_stack_limits(lim,
     738             :                                          &bdev_get_queue(device->bdev)->limits,
     739             :                                          0);
     740             :                 }
     741             :         }
     742             : 
     743             :         /*
     744             :          * stripe_size is always aligned to BTRFS_STRIPE_LEN in
     745             :          * btrfs_create_chunk(). Since we want stripe_len == zone_size,
     746             :          * check the alignment here.
     747             :          */
     748           0 :         if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
     749           0 :                 btrfs_err(fs_info,
     750             :                           "zoned: zone size %llu not aligned to stripe %u",
     751             :                           zone_size, BTRFS_STRIPE_LEN);
     752           0 :                 return -EINVAL;
     753             :         }
     754             : 
     755           0 :         if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
     756           0 :                 btrfs_err(fs_info, "zoned: mixed block groups not supported");
     757           0 :                 return -EINVAL;
     758             :         }
     759             : 
     760           0 :         fs_info->zone_size = zone_size;
     761             :         /*
     762             :          * Also limit max_zone_append_size by max_segments * PAGE_SIZE.
      763             :          * Technically, we can have multiple pages per segment, but since we
      764             :          * add pages one by one to a bio and cannot increase the metadata
      765             :          * reservation even if that increases the number of extents, it is
      766             :          * safe to stick with the limit.
     767             :          */
     768           0 :         fs_info->max_zone_append_size = ALIGN_DOWN(
     769             :                 min3((u64)lim->max_zone_append_sectors << SECTOR_SHIFT,
     770             :                      (u64)lim->max_sectors << SECTOR_SHIFT,
     771             :                      (u64)lim->max_segments << PAGE_SHIFT),
     772             :                 fs_info->sectorsize);
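                      :         /*
                      :          * E.g. with max_zone_append_sectors == 2048 (1MiB), max_sectors ==
                      :          * 2560 (1.25MiB) and max_segments == 128 with 4KiB pages (512KiB),
                      :          * this picks 512KiB, which is already sectorsize-aligned.
                      :          */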
     773           0 :         fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
     774           0 :         if (fs_info->max_zone_append_size < fs_info->max_extent_size)
     775           0 :                 fs_info->max_extent_size = fs_info->max_zone_append_size;
     776             : 
     777             :         /*
      778             :          * Check mount options here, because we might have just enabled zoned
      779             :          * mode by setting fs_info->zone_size above.
     780             :          */
     781           0 :         ret = btrfs_check_mountopts_zoned(fs_info);
     782           0 :         if (ret)
     783             :                 return ret;
     784             : 
     785           0 :         btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);
     786           0 :         return 0;
     787             : }
     788             : 
     789           0 : int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
     790             : {
     791           0 :         if (!btrfs_is_zoned(info))
     792             :                 return 0;
     793             : 
     794             :         /*
     795             :          * Space cache writing is not COWed. Disable that to avoid write errors
     796             :          * in sequential zones.
     797             :          */
     798           0 :         if (btrfs_test_opt(info, SPACE_CACHE)) {
     799           0 :                 btrfs_err(info, "zoned: space cache v1 is not supported");
     800           0 :                 return -EINVAL;
     801             :         }
     802             : 
     803           0 :         if (btrfs_test_opt(info, NODATACOW)) {
     804           0 :                 btrfs_err(info, "zoned: NODATACOW not supported");
     805           0 :                 return -EINVAL;
     806             :         }
     807             : 
     808             :         return 0;
     809             : }
     810             : 
     811           0 : static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
     812             :                            int rw, u64 *bytenr_ret)
     813             : {
     814           0 :         u64 wp;
     815           0 :         int ret;
     816             : 
     817           0 :         if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) {
     818           0 :                 *bytenr_ret = zones[0].start << SECTOR_SHIFT;
     819           0 :                 return 0;
     820             :         }
     821             : 
     822           0 :         ret = sb_write_pointer(bdev, zones, &wp);
     823           0 :         if (ret != -ENOENT && ret < 0)
     824             :                 return ret;
     825             : 
     826           0 :         if (rw == WRITE) {
     827           0 :                 struct blk_zone *reset = NULL;
     828             : 
     829           0 :                 if (wp == zones[0].start << SECTOR_SHIFT)
     830             :                         reset = &zones[0];
     831           0 :                 else if (wp == zones[1].start << SECTOR_SHIFT)
     832           0 :                         reset = &zones[1];
     833             : 
     834           0 :                 if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
     835           0 :                         ASSERT(sb_zone_is_full(reset));
     836             : 
     837           0 :                         ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
     838             :                                                reset->start, reset->len,
     839             :                                                GFP_NOFS);
     840           0 :                         if (ret)
     841             :                                 return ret;
     842             : 
     843           0 :                         reset->cond = BLK_ZONE_COND_EMPTY;
     844           0 :                         reset->wp = reset->start;
     845             :                 }
     846           0 :         } else if (ret != -ENOENT) {
     847             :                 /*
      848             :                  * For READ, we want the previous superblock. If the write pointer
      849             :                  * is at the head of a zone, move it to the end of the other zone.
     850             :                  */
     851           0 :                 u64 zone_end = 0;
     852             : 
     853           0 :                 if (wp == zones[0].start << SECTOR_SHIFT)
     854           0 :                         zone_end = zones[1].start + zones[1].capacity;
     855           0 :                 else if (wp == zones[1].start << SECTOR_SHIFT)
     856           0 :                         zone_end = zones[0].start + zones[0].capacity;
     857           0 :                 if (zone_end)
     858           0 :                         wp = ALIGN_DOWN(zone_end << SECTOR_SHIFT,
     859             :                                         BTRFS_SUPER_INFO_SIZE);
     860             : 
     861           0 :                 wp -= BTRFS_SUPER_INFO_SIZE;
     862             :         }
     863             : 
     864           0 :         *bytenr_ret = wp;
     865           0 :         return 0;
     866             : 
     867             : }
     868             : 
     869           4 : int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
     870             :                                u64 *bytenr_ret)
     871             : {
     872           4 :         struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES];
     873           4 :         sector_t zone_sectors;
     874           4 :         u32 sb_zone;
     875           4 :         int ret;
     876           4 :         u8 zone_sectors_shift;
     877           4 :         sector_t nr_sectors;
     878           4 :         u32 nr_zones;
     879             : 
     880           4 :         if (!bdev_is_zoned(bdev)) {
     881           4 :                 *bytenr_ret = btrfs_sb_offset(mirror);
     882           4 :                 return 0;
     883             :         }
     884             : 
     885           0 :         ASSERT(rw == READ || rw == WRITE);
     886             : 
     887           0 :         zone_sectors = bdev_zone_sectors(bdev);
     888           0 :         if (!is_power_of_2(zone_sectors))
     889             :                 return -EINVAL;
     890           0 :         zone_sectors_shift = ilog2(zone_sectors);
     891           0 :         nr_sectors = bdev_nr_sectors(bdev);
     892           0 :         nr_zones = nr_sectors >> zone_sectors_shift;
     893             : 
     894           0 :         sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
     895           0 :         if (sb_zone + 1 >= nr_zones)
     896             :                 return -ENOENT;
     897             : 
     898           0 :         ret = blkdev_report_zones(bdev, zone_start_sector(sb_zone, bdev),
     899             :                                   BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
     900             :                                   zones);
     901           0 :         if (ret < 0)
     902             :                 return ret;
     903           0 :         if (ret != BTRFS_NR_SB_LOG_ZONES)
     904             :                 return -EIO;
     905             : 
     906           0 :         return sb_log_location(bdev, zones, rw, bytenr_ret);
     907             : }
     908             : 
     909           0 : int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
     910             :                           u64 *bytenr_ret)
     911             : {
     912           0 :         struct btrfs_zoned_device_info *zinfo = device->zone_info;
     913           0 :         u32 zone_num;
     914             : 
     915             :         /*
     916             :          * For a zoned filesystem on a non-zoned block device, use the same
      917             :          * super block locations as a regular filesystem. That way, the super
      918             :          * block can always be retrieved and the zoned flag of the volume
      919             :          * detected from the super block information.
     920             :          */
     921           0 :         if (!bdev_is_zoned(device->bdev)) {
     922           0 :                 *bytenr_ret = btrfs_sb_offset(mirror);
     923           0 :                 return 0;
     924             :         }
     925             : 
     926           0 :         zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
     927           0 :         if (zone_num + 1 >= zinfo->nr_zones)
     928             :                 return -ENOENT;
     929             : 
     930           0 :         return sb_log_location(device->bdev,
     931           0 :                                &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror],
     932             :                                rw, bytenr_ret);
     933             : }
     934             : 
     935           0 : static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo,
     936             :                                   int mirror)
     937             : {
     938           0 :         u32 zone_num;
     939             : 
     940           0 :         if (!zinfo)
     941             :                 return false;
     942             : 
     943           0 :         zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
     944           0 :         if (zone_num + 1 >= zinfo->nr_zones)
     945             :                 return false;
     946             : 
     947           0 :         if (!test_bit(zone_num, zinfo->seq_zones))
     948           0 :                 return false;
     949             : 
     950             :         return true;
     951             : }
     952             : 
     953           0 : int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
     954             : {
     955           0 :         struct btrfs_zoned_device_info *zinfo = device->zone_info;
     956           0 :         struct blk_zone *zone;
     957           0 :         int i;
     958             : 
     959           0 :         if (!is_sb_log_zone(zinfo, mirror))
     960             :                 return 0;
     961             : 
     962           0 :         zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];
     963           0 :         for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
      964             :                 /* Advance to the next zone */
     965           0 :                 if (zone->cond == BLK_ZONE_COND_FULL) {
     966           0 :                         zone++;
     967           0 :                         continue;
     968             :                 }
     969             : 
     970           0 :                 if (zone->cond == BLK_ZONE_COND_EMPTY)
     971           0 :                         zone->cond = BLK_ZONE_COND_IMP_OPEN;
     972             : 
     973           0 :                 zone->wp += SUPER_INFO_SECTORS;
     974             : 
     975           0 :                 if (sb_zone_is_full(zone)) {
     976             :                         /*
      977             :                          * No room left to write a new superblock. Since the
      978             :                          * superblock is written with REQ_SYNC, it is safe to
     979             :                          * finish the zone now.
     980             :                          *
     981             :                          * If the write pointer is exactly at the capacity,
     982             :                          * explicit ZONE_FINISH is not necessary.
     983             :                          */
     984           0 :                         if (zone->wp != zone->start + zone->capacity) {
     985           0 :                                 int ret;
     986             : 
     987           0 :                                 ret = blkdev_zone_mgmt(device->bdev,
     988             :                                                 REQ_OP_ZONE_FINISH, zone->start,
     989             :                                                 zone->len, GFP_NOFS);
     990           0 :                                 if (ret)
     991             :                                         return ret;
     992             :                         }
     993             : 
     994           0 :                         zone->wp = zone->start + zone->len;
     995           0 :                         zone->cond = BLK_ZONE_COND_FULL;
     996             :                 }
     997             :                 return 0;
     998             :         }
     999             : 
    1000             :         /* All the zones are FULL. Should not reach here. */
    1001             :         ASSERT(0);
    1002             :         return -EIO;
    1003             : }
    1004             : 
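                      : /*
                      :  * Reset both zones of the superblock log pair of @mirror on @bdev.
                      :  * Returns -ENOENT when the device is too small to hold that mirror.
                      :  */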
    1005           0 : int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
    1006             : {
    1007           0 :         sector_t zone_sectors;
    1008           0 :         sector_t nr_sectors;
    1009           0 :         u8 zone_sectors_shift;
    1010           0 :         u32 sb_zone;
    1011           0 :         u32 nr_zones;
    1012             : 
    1013           0 :         zone_sectors = bdev_zone_sectors(bdev);
    1014           0 :         zone_sectors_shift = ilog2(zone_sectors);
    1015           0 :         nr_sectors = bdev_nr_sectors(bdev);
    1016           0 :         nr_zones = nr_sectors >> zone_sectors_shift;
    1017             : 
    1018           0 :         sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
    1019           0 :         if (sb_zone + 1 >= nr_zones)
    1020             :                 return -ENOENT;
    1021             : 
    1022           0 :         return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
    1023             :                                 zone_start_sector(sb_zone, bdev),
    1024             :                                 zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
    1025             : }
    1026             : 
    1027             : /*
    1028             :  * Find allocatable zones within a given region.
    1029             :  *
    1030             :  * @device:     the device to allocate a region on
     1031             :  * @hole_start: the start of the hole to allocate the region in
     1032             :  * @hole_end:   the end of the hole
     1033             :  * @num_bytes:  size of the wanted region
     1034             :  * @return:     position of an allocatable region
     1035             :  *
     1036             :  * The allocatable region must not contain any superblock locations.
    1037             :  */
    1038           0 : u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
    1039             :                                  u64 hole_end, u64 num_bytes)
    1040             : {
    1041           0 :         struct btrfs_zoned_device_info *zinfo = device->zone_info;
    1042           0 :         const u8 shift = zinfo->zone_size_shift;
    1043           0 :         u64 nzones = num_bytes >> shift;
    1044           0 :         u64 pos = hole_start;
    1045           0 :         u64 begin, end;
    1046           0 :         bool have_sb;
    1047           0 :         int i;
    1048             : 
    1049           0 :         ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
    1050           0 :         ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));
    1051             : 
    1052           0 :         while (pos < hole_end) {
    1053           0 :                 begin = pos >> shift;
    1054           0 :                 end = begin + nzones;
    1055             : 
    1056           0 :                 if (end > zinfo->nr_zones)
    1057             :                         return hole_end;
    1058             : 
    1059             :                 /* Check if zones in the region are all empty */
    1060           0 :                 if (btrfs_dev_is_sequential(device, pos) &&
    1061           0 :                     !bitmap_test_range_all_set(zinfo->empty_zones, begin, nzones)) {
    1062           0 :                         pos += zinfo->zone_size;
    1063           0 :                         continue;
    1064             :                 }
    1065             : 
    1066             :                 have_sb = false;
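                      :                 /*
                      :                  * The region must not overlap any superblock location,
                      :                  * neither the zoned SB log zone pairs nor the fixed byte
                      :                  * offsets used on conventional zones.
                      :                  */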
    1067           0 :                 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
    1068           0 :                         u32 sb_zone;
    1069           0 :                         u64 sb_pos;
    1070             : 
    1071           0 :                         sb_zone = sb_zone_number(shift, i);
    1072           0 :                         if (!(end <= sb_zone ||
    1073           0 :                               sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) {
    1074           0 :                                 have_sb = true;
    1075           0 :                                 pos = zone_start_physical(
    1076             :                                         sb_zone + BTRFS_NR_SB_LOG_ZONES, zinfo);
    1077           0 :                                 break;
    1078             :                         }
    1079             : 
    1080             :                         /* We also need to exclude regular superblock positions */
    1081           0 :                         sb_pos = btrfs_sb_offset(i);
    1082           0 :                         if (!(pos + num_bytes <= sb_pos ||
    1083           0 :                               sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) {
    1084           0 :                                 have_sb = true;
    1085           0 :                                 pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE,
    1086             :                                             zinfo->zone_size);
    1087           0 :                                 break;
    1088             :                         }
    1089             :                 }
    1090           0 :                 if (!have_sb)
    1091             :                         break;
    1092             :         }
    1093             : 
    1094             :         return pos;
    1095             : }
    1096             : 
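                      : /*
                      :  * Charge one of the device's limited active zones to the zone at @pos.
                      :  * A bitmap tracks which zones are charged and an atomic counter holds
                      :  * the remaining budget; return false when no active zone is left.
                      :  */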
    1097           0 : static bool btrfs_dev_set_active_zone(struct btrfs_device *device, u64 pos)
    1098             : {
    1099           0 :         struct btrfs_zoned_device_info *zone_info = device->zone_info;
    1100           0 :         unsigned int zno = (pos >> zone_info->zone_size_shift);
    1101             : 
    1102             :         /* We can use any number of zones */
    1103           0 :         if (zone_info->max_active_zones == 0)
    1104             :                 return true;
    1105             : 
    1106           0 :         if (!test_bit(zno, zone_info->active_zones)) {
    1107             :                 /* Active zone left? */
    1108           0 :                 if (atomic_dec_if_positive(&zone_info->active_zones_left) < 0)
    1109             :                         return false;
    1110           0 :                 if (test_and_set_bit(zno, zone_info->active_zones)) {
    1111             :                         /* Someone already set the bit */
    1112           0 :                         atomic_inc(&zone_info->active_zones_left);
    1113             :                 }
    1114             :         }
    1115             : 
    1116             :         return true;
    1117             : }
    1118             : 
    1119           0 : static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos)
    1120             : {
    1121           0 :         struct btrfs_zoned_device_info *zone_info = device->zone_info;
    1122           0 :         unsigned int zno = (pos >> zone_info->zone_size_shift);
    1123             : 
    1124             :         /* We can use any number of zones */
    1125           0 :         if (zone_info->max_active_zones == 0)
    1126             :                 return;
    1127             : 
    1128           0 :         if (test_and_clear_bit(zno, zone_info->active_zones))
    1129           0 :                 atomic_inc(&zone_info->active_zones_left);
    1130             : }
    1131             : 
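                      : /*
                      :  * Reset the device zones in [physical, physical + length) and mark them
                      :  * empty and inactive in the cached zone state. On success, @bytes
                      :  * returns the number of bytes reset.
                      :  */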
    1132           0 : int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
    1133             :                             u64 length, u64 *bytes)
    1134             : {
    1135           0 :         int ret;
    1136             : 
    1137           0 :         *bytes = 0;
    1138           0 :         ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
    1139             :                                physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT,
    1140             :                                GFP_NOFS);
    1141           0 :         if (ret)
    1142             :                 return ret;
    1143             : 
    1144           0 :         *bytes = length;
    1145           0 :         while (length) {
    1146           0 :                 btrfs_dev_set_zone_empty(device, physical);
    1147           0 :                 btrfs_dev_clear_active_zone(device, physical);
    1148           0 :                 physical += device->zone_info->zone_size;
    1149           0 :                 length -= device->zone_info->zone_size;
    1150             :         }
    1151             : 
    1152             :         return 0;
    1153             : }
    1154             : 
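                      : /*
                      :  * Ensure all zones backing [start, start + size) are empty, resetting
                      :  * any sequential zone that unexpectedly contains data. Free regions
                      :  * are expected to be empty, hence the warning on reset.
                      :  */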
    1155           0 : int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
    1156             : {
    1157           0 :         struct btrfs_zoned_device_info *zinfo = device->zone_info;
    1158           0 :         const u8 shift = zinfo->zone_size_shift;
    1159           0 :         unsigned long begin = start >> shift;
    1160           0 :         unsigned long nbits = size >> shift;
    1161           0 :         u64 pos;
    1162           0 :         int ret;
    1163             : 
    1164           0 :         ASSERT(IS_ALIGNED(start, zinfo->zone_size));
    1165           0 :         ASSERT(IS_ALIGNED(size, zinfo->zone_size));
    1166             : 
    1167           0 :         if (begin + nbits > zinfo->nr_zones)
    1168             :                 return -ERANGE;
    1169             : 
    1170             :         /* All the zones are conventional */
    1171           0 :         if (bitmap_test_range_all_zero(zinfo->seq_zones, begin, nbits))
    1172             :                 return 0;
    1173             : 
    1174             :         /* All the zones are sequential and empty */
    1175           0 :         if (bitmap_test_range_all_set(zinfo->seq_zones, begin, nbits) &&
    1176           0 :             bitmap_test_range_all_set(zinfo->empty_zones, begin, nbits))
    1177             :                 return 0;
    1178             : 
    1179           0 :         for (pos = start; pos < start + size; pos += zinfo->zone_size) {
    1180           0 :                 u64 reset_bytes;
    1181             : 
    1182           0 :                 if (!btrfs_dev_is_sequential(device, pos) ||
    1183           0 :                     btrfs_dev_is_empty_zone(device, pos))
    1184           0 :                         continue;
    1185             : 
    1186             :                 /* Free regions should be empty */
    1187           0 :                 btrfs_warn_in_rcu(
    1188             :                         device->fs_info,
    1189             :                 "zoned: resetting device %s (devid %llu) zone %llu for allocation",
    1190             :                         rcu_str_deref(device->name), device->devid, pos >> shift);
    1191           0 :                 WARN_ON_ONCE(1);
    1192             : 
    1193           0 :                 ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,
    1194             :                                               &reset_bytes);
    1195           0 :                 if (ret)
    1196           0 :                         return ret;
    1197             :         }
    1198             : 
    1199             :         return 0;
    1200             : }
    1201             : 
    1202             : /*
    1203             :  * Calculate an allocation pointer from the extent allocation information
     1204             :  * for a block group consisting of conventional zones. The pointer is
     1205             :  * set to the end of the highest addressed extent in the block group
     1206             :  * and used as the allocation offset.
    1207             :  */
    1208           0 : static int calculate_alloc_pointer(struct btrfs_block_group *cache,
    1209             :                                    u64 *offset_ret, bool new)
    1210             : {
    1211           0 :         struct btrfs_fs_info *fs_info = cache->fs_info;
    1212           0 :         struct btrfs_root *root;
    1213           0 :         struct btrfs_path *path;
    1214           0 :         struct btrfs_key key;
    1215           0 :         struct btrfs_key found_key;
    1216           0 :         int ret;
    1217           0 :         u64 length;
    1218             : 
    1219             :         /*
     1220             :          * Avoid tree lookups for a new block group, there's no use for
     1221             :          * them. The allocation pointer must always be 0.
     1222             :          *
     1223             :          * Also, we have a lock chain of extent buffer lock -> chunk mutex.
     1224             :          * For a new block group, this function is called from
     1225             :          * btrfs_make_block_group() which is already taking the chunk mutex.
     1226             :          * Thus, doing the tree search below, which takes extent buffer
     1227             :          * locks, could deadlock here.
    1228             :          */
    1229           0 :         if (new) {
    1230           0 :                 *offset_ret = 0;
    1231           0 :                 return 0;
    1232             :         }
    1233             : 
    1234           0 :         path = btrfs_alloc_path();
    1235           0 :         if (!path)
    1236             :                 return -ENOMEM;
    1237             : 
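                      :         /*
                      :          * Position the key just past the end of the block group, then
                      :          * step back to the last extent item inside it.
                      :          */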
    1238           0 :         key.objectid = cache->start + cache->length;
    1239           0 :         key.type = 0;
    1240           0 :         key.offset = 0;
    1241             : 
    1242           0 :         root = btrfs_extent_root(fs_info, key.objectid);
    1243           0 :         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
    1244             :         /* We should not find the exact match */
    1245           0 :         if (!ret)
    1246             :                 ret = -EUCLEAN;
    1247           0 :         if (ret < 0)
    1248           0 :                 goto out;
    1249             : 
    1250           0 :         ret = btrfs_previous_extent_item(root, path, cache->start);
    1251           0 :         if (ret) {
    1252           0 :                 if (ret == 1) {
    1253           0 :                         ret = 0;
    1254           0 :                         *offset_ret = 0;
    1255             :                 }
    1256           0 :                 goto out;
    1257             :         }
    1258             : 
    1259           0 :         btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
    1260             : 
    1261           0 :         if (found_key.type == BTRFS_EXTENT_ITEM_KEY)
    1262           0 :                 length = found_key.offset;
    1263             :         else
    1264           0 :                 length = fs_info->nodesize;
    1265             : 
    1266           0 :         if (!(found_key.objectid >= cache->start &&
    1267           0 :                found_key.objectid + length <= cache->start + cache->length)) {
    1268           0 :                 ret = -EUCLEAN;
    1269           0 :                 goto out;
    1270             :         }
    1271           0 :         *offset_ret = found_key.objectid + length - cache->start;
    1272           0 :         ret = 0;
    1273             : 
    1274           0 : out:
    1275           0 :         btrfs_free_path(path);
    1276           0 :         return ret;
    1277             : }
    1278             : 
    1279           0 : int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
    1280             : {
    1281           0 :         struct btrfs_fs_info *fs_info = cache->fs_info;
    1282           0 :         struct extent_map_tree *em_tree = &fs_info->mapping_tree;
    1283           0 :         struct extent_map *em;
    1284           0 :         struct map_lookup *map;
    1285           0 :         struct btrfs_device *device;
    1286           0 :         u64 logical = cache->start;
    1287           0 :         u64 length = cache->length;
    1288           0 :         int ret;
    1289           0 :         int i;
    1290           0 :         unsigned int nofs_flag;
    1291           0 :         u64 *alloc_offsets = NULL;
    1292           0 :         u64 *caps = NULL;
    1293           0 :         u64 *physical = NULL;
    1294           0 :         unsigned long *active = NULL;
    1295           0 :         u64 last_alloc = 0;
    1296           0 :         u32 num_sequential = 0, num_conventional = 0;
    1297             : 
    1298           0 :         if (!btrfs_is_zoned(fs_info))
    1299             :                 return 0;
    1300             : 
    1301             :         /* Sanity check */
    1302           0 :         if (!IS_ALIGNED(length, fs_info->zone_size)) {
    1303           0 :                 btrfs_err(fs_info,
    1304             :                 "zoned: block group %llu len %llu unaligned to zone size %llu",
    1305             :                           logical, length, fs_info->zone_size);
    1306           0 :                 return -EIO;
    1307             :         }
    1308             : 
    1309             :         /* Get the chunk mapping */
    1310           0 :         read_lock(&em_tree->lock);
    1311           0 :         em = lookup_extent_mapping(em_tree, logical, length);
    1312           0 :         read_unlock(&em_tree->lock);
    1313             : 
    1314           0 :         if (!em)
    1315             :                 return -EINVAL;
    1316             : 
    1317           0 :         map = em->map_lookup;
    1318             : 
    1319           0 :         cache->physical_map = kmemdup(map, map_lookup_size(map->num_stripes), GFP_NOFS);
    1320           0 :         if (!cache->physical_map) {
    1321           0 :                 ret = -ENOMEM;
    1322           0 :                 goto out;
    1323             :         }
    1324             : 
    1325           0 :         alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS);
    1326           0 :         if (!alloc_offsets) {
    1327           0 :                 ret = -ENOMEM;
    1328           0 :                 goto out;
    1329             :         }
    1330             : 
    1331           0 :         caps = kcalloc(map->num_stripes, sizeof(*caps), GFP_NOFS);
    1332           0 :         if (!caps) {
    1333           0 :                 ret = -ENOMEM;
    1334           0 :                 goto out;
    1335             :         }
    1336             : 
    1337           0 :         physical = kcalloc(map->num_stripes, sizeof(*physical), GFP_NOFS);
    1338           0 :         if (!physical) {
    1339           0 :                 ret = -ENOMEM;
    1340           0 :                 goto out;
    1341             :         }
    1342             : 
    1343           0 :         active = bitmap_zalloc(map->num_stripes, GFP_NOFS);
    1344           0 :         if (!active) {
    1345           0 :                 ret = -ENOMEM;
    1346           0 :                 goto out;
    1347             :         }
    1348             : 
    1349           0 :         for (i = 0; i < map->num_stripes; i++) {
    1350           0 :                 bool is_sequential;
    1351           0 :                 struct blk_zone zone;
    1352           0 :                 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
    1353           0 :                 int dev_replace_is_ongoing = 0;
    1354             : 
    1355           0 :                 device = map->stripes[i].dev;
    1356           0 :                 physical[i] = map->stripes[i].physical;
    1357             : 
    1358           0 :                 if (device->bdev == NULL) {
    1359           0 :                         alloc_offsets[i] = WP_MISSING_DEV;
    1360           0 :                         continue;
    1361             :                 }
    1362             : 
    1363           0 :                 is_sequential = btrfs_dev_is_sequential(device, physical[i]);
    1364           0 :                 if (is_sequential)
    1365           0 :                         num_sequential++;
    1366             :                 else
    1367           0 :                         num_conventional++;
    1368             : 
    1369             :                 /*
     1370             :                  * Consider a zone as active if the device does not limit
     1371             :                  * the number of active zones.
    1372             :                  */
    1373           0 :                 if (!device->zone_info->max_active_zones)
    1374           0 :                         __set_bit(i, active);
    1375             : 
    1376           0 :                 if (!is_sequential) {
    1377           0 :                         alloc_offsets[i] = WP_CONVENTIONAL;
    1378           0 :                         continue;
    1379             :                 }
    1380             : 
    1381             :                 /*
    1382             :                  * This zone will be used for allocation, so mark this zone
    1383             :                  * non-empty.
    1384             :                  */
    1385           0 :                 btrfs_dev_clear_zone_empty(device, physical[i]);
    1386             : 
    1387           0 :                 down_read(&dev_replace->rwsem);
    1388           0 :                 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
    1389           0 :                 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
    1390           0 :                         btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical[i]);
    1391           0 :                 up_read(&dev_replace->rwsem);
    1392             : 
    1393             :                 /*
    1394             :                  * The group is mapped to a sequential zone. Get the zone write
    1395             :                  * pointer to determine the allocation offset within the zone.
    1396             :                  */
    1397           0 :                 WARN_ON(!IS_ALIGNED(physical[i], fs_info->zone_size));
    1398           0 :                 nofs_flag = memalloc_nofs_save();
    1399           0 :                 ret = btrfs_get_dev_zone(device, physical[i], &zone);
    1400           0 :                 memalloc_nofs_restore(nofs_flag);
    1401           0 :                 if (ret == -EIO || ret == -EOPNOTSUPP) {
    1402           0 :                         ret = 0;
    1403           0 :                         alloc_offsets[i] = WP_MISSING_DEV;
    1404           0 :                         continue;
    1405           0 :                 } else if (ret) {
    1406           0 :                         goto out;
    1407             :                 }
    1408             : 
    1409           0 :                 if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
    1410           0 :                         btrfs_err_in_rcu(fs_info,
    1411             :         "zoned: unexpected conventional zone %llu on device %s (devid %llu)",
    1412             :                                 zone.start << SECTOR_SHIFT,
    1413             :                                 rcu_str_deref(device->name), device->devid);
    1414           0 :                         ret = -EIO;
    1415           0 :                         goto out;
    1416             :                 }
    1417             : 
    1418           0 :                 caps[i] = (zone.capacity << SECTOR_SHIFT);
    1419             : 
    1420           0 :                 switch (zone.cond) {
    1421           0 :                 case BLK_ZONE_COND_OFFLINE:
    1422             :                 case BLK_ZONE_COND_READONLY:
    1423           0 :                         btrfs_err(fs_info,
    1424             :                 "zoned: offline/readonly zone %llu on device %s (devid %llu)",
    1425             :                                   physical[i] >> device->zone_info->zone_size_shift,
    1426             :                                   rcu_str_deref(device->name), device->devid);
    1427           0 :                         alloc_offsets[i] = WP_MISSING_DEV;
    1428           0 :                         break;
    1429           0 :                 case BLK_ZONE_COND_EMPTY:
    1430           0 :                         alloc_offsets[i] = 0;
    1431           0 :                         break;
    1432           0 :                 case BLK_ZONE_COND_FULL:
    1433           0 :                         alloc_offsets[i] = caps[i];
    1434           0 :                         break;
    1435           0 :                 default:
    1436             :                         /* Partially used zone */
    1437           0 :                         alloc_offsets[i] =
    1438           0 :                                         ((zone.wp - zone.start) << SECTOR_SHIFT);
    1439           0 :                         __set_bit(i, active);
    1440             :                         break;
    1441             :                 }
    1442             :         }
    1443             : 
    1444           0 :         if (num_sequential > 0)
    1445           0 :                 set_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags);
    1446             : 
    1447           0 :         if (num_conventional > 0) {
    1448             :                 /* Zone capacity is always zone size in emulation */
    1449           0 :                 cache->zone_capacity = cache->length;
    1450           0 :                 ret = calculate_alloc_pointer(cache, &last_alloc, new);
    1451           0 :                 if (ret) {
    1452           0 :                         btrfs_err(fs_info,
    1453             :                         "zoned: failed to determine allocation offset of bg %llu",
    1454             :                                   cache->start);
    1455           0 :                         goto out;
    1456           0 :                 } else if (map->num_stripes == num_conventional) {
    1457           0 :                         cache->alloc_offset = last_alloc;
    1458           0 :                         set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
    1459           0 :                         goto out;
    1460             :                 }
    1461             :         }
    1462             : 
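                      :         /*
                      :          * Derive the block group's allocation offset from the per-stripe
                      :          * zone state. Only single and non-data DUP profiles are supported,
                      :          * and for DUP the write pointers of both stripes must match.
                      :          */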
    1463           0 :         switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
    1464           0 :         case 0: /* single */
    1465           0 :                 if (alloc_offsets[0] == WP_MISSING_DEV) {
    1466           0 :                         btrfs_err(fs_info,
    1467             :                         "zoned: cannot recover write pointer for zone %llu",
    1468             :                                 physical[0]);
    1469           0 :                         ret = -EIO;
    1470           0 :                         goto out;
    1471             :                 }
    1472           0 :                 cache->alloc_offset = alloc_offsets[0];
    1473           0 :                 cache->zone_capacity = caps[0];
    1474           0 :                 if (test_bit(0, active))
    1475           0 :                         set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
    1476             :                 break;
    1477           0 :         case BTRFS_BLOCK_GROUP_DUP:
    1478           0 :                 if (map->type & BTRFS_BLOCK_GROUP_DATA) {
    1479           0 :                         btrfs_err(fs_info, "zoned: profile DUP not yet supported on data bg");
    1480           0 :                         ret = -EINVAL;
    1481           0 :                         goto out;
    1482             :                 }
    1483           0 :                 if (alloc_offsets[0] == WP_MISSING_DEV) {
    1484           0 :                         btrfs_err(fs_info,
    1485             :                         "zoned: cannot recover write pointer for zone %llu",
    1486             :                                 physical[0]);
    1487           0 :                         ret = -EIO;
    1488           0 :                         goto out;
    1489             :                 }
    1490           0 :                 if (alloc_offsets[1] == WP_MISSING_DEV) {
    1491           0 :                         btrfs_err(fs_info,
    1492             :                         "zoned: cannot recover write pointer for zone %llu",
    1493             :                                 physical[1]);
    1494           0 :                         ret = -EIO;
    1495           0 :                         goto out;
    1496             :                 }
    1497           0 :                 if (alloc_offsets[0] != alloc_offsets[1]) {
    1498           0 :                         btrfs_err(fs_info,
    1499             :                         "zoned: write pointer offset mismatch of zones in DUP profile");
    1500           0 :                         ret = -EIO;
    1501           0 :                         goto out;
    1502             :                 }
    1503           0 :                 if (test_bit(0, active) != test_bit(1, active)) {
    1504           0 :                         if (!btrfs_zone_activate(cache)) {
    1505           0 :                                 ret = -EIO;
    1506           0 :                                 goto out;
    1507             :                         }
    1508             :                 } else {
    1509           0 :                         if (test_bit(0, active))
    1510           0 :                                 set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
    1511           0 :                                         &cache->runtime_flags);
    1512             :                 }
    1513           0 :                 cache->alloc_offset = alloc_offsets[0];
    1514           0 :                 cache->zone_capacity = min(caps[0], caps[1]);
    1515           0 :                 break;
    1516           0 :         case BTRFS_BLOCK_GROUP_RAID1:
    1517             :         case BTRFS_BLOCK_GROUP_RAID0:
    1518             :         case BTRFS_BLOCK_GROUP_RAID10:
    1519             :         case BTRFS_BLOCK_GROUP_RAID5:
    1520             :         case BTRFS_BLOCK_GROUP_RAID6:
    1521             :                 /* non-single profiles are not supported yet */
    1522             :         default:
    1523           0 :                 btrfs_err(fs_info, "zoned: profile %s not yet supported",
    1524             :                           btrfs_bg_type_to_raid_name(map->type));
    1525           0 :                 ret = -EINVAL;
    1526           0 :                 goto out;
    1527             :         }
    1528             : 
    1529           0 : out:
    1530           0 :         if (cache->alloc_offset > fs_info->zone_size) {
    1531           0 :                 btrfs_err(fs_info,
    1532             :                         "zoned: invalid write pointer %llu in block group %llu",
    1533             :                         cache->alloc_offset, cache->start);
    1534           0 :                 ret = -EIO;
    1535             :         }
    1536             : 
    1537           0 :         if (cache->alloc_offset > cache->zone_capacity) {
    1538           0 :                 btrfs_err(fs_info,
    1539             : "zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
    1540             :                           cache->alloc_offset, cache->zone_capacity,
    1541             :                           cache->start);
    1542           0 :                 ret = -EIO;
    1543             :         }
    1544             : 
    1545             :         /* An extent is allocated after the write pointer */
    1546           0 :         if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
    1547           0 :                 btrfs_err(fs_info,
    1548             :                           "zoned: got wrong write pointer in BG %llu: %llu > %llu",
    1549             :                           logical, last_alloc, cache->alloc_offset);
    1550           0 :                 ret = -EIO;
    1551             :         }
    1552             : 
    1553           0 :         if (!ret) {
    1554           0 :                 cache->meta_write_pointer = cache->alloc_offset + cache->start;
    1555           0 :                 if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags)) {
    1556           0 :                         btrfs_get_block_group(cache);
    1557           0 :                         spin_lock(&fs_info->zone_active_bgs_lock);
    1558           0 :                         list_add_tail(&cache->active_bg_list,
    1559             :                                       &fs_info->zone_active_bgs);
    1560           0 :                         spin_unlock(&fs_info->zone_active_bgs_lock);
    1561             :                 }
    1562             :         } else {
    1563           0 :                 kfree(cache->physical_map);
    1564           0 :                 cache->physical_map = NULL;
    1565             :         }
    1566           0 :         bitmap_free(active);
    1567           0 :         kfree(physical);
    1568           0 :         kfree(caps);
    1569           0 :         kfree(alloc_offsets);
    1570           0 :         free_extent_map(em);
    1571             : 
    1572           0 :         return ret;
    1573             : }
    1574             : 
    1575           0 : void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
    1576             : {
    1577           0 :         u64 unusable, free;
    1578             : 
    1579           0 :         if (!btrfs_is_zoned(cache->fs_info))
    1580             :                 return;
    1581             : 
    1582           0 :         WARN_ON(cache->bytes_super != 0);
    1583             : 
     1584             :         /* Check for block groups that were never activated */
    1585           0 :         if (test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &cache->fs_info->flags) &&
    1586           0 :             cache->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM) &&
    1587           0 :             !test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags) &&
    1588           0 :             cache->alloc_offset == 0) {
    1589           0 :                 unusable = cache->length;
    1590           0 :                 free = 0;
    1591             :         } else {
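                      :                 /*
                      :                  * Freed bytes below the write pointer cannot be reused
                      :                  * until the zone is reset, and the slack between zone
                      :                  * capacity and block group length is never usable.
                      :                  */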
    1592           0 :                 unusable = (cache->alloc_offset - cache->used) +
    1593           0 :                            (cache->length - cache->zone_capacity);
    1594           0 :                 free = cache->zone_capacity - cache->alloc_offset;
    1595             :         }
    1596             : 
    1597             :         /* We only need ->free_space in ALLOC_SEQ block groups */
    1598           0 :         cache->cached = BTRFS_CACHE_FINISHED;
    1599           0 :         cache->free_space_ctl->free_space = free;
    1600           0 :         cache->zone_unusable = unusable;
    1601             : }
    1602             : 
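                      : /*
                      :  * A freed metadata buffer cannot simply be skipped when writing on a
                      :  * zoned filesystem, as that would leave a gap in the sequential write
                      :  * stream. Zero the buffer and redirty it so it is still written out,
                      :  * with checksumming disabled.
                      :  */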
    1603           0 : void btrfs_redirty_list_add(struct btrfs_transaction *trans,
    1604             :                             struct extent_buffer *eb)
    1605             : {
    1606           0 :         if (!btrfs_is_zoned(eb->fs_info) ||
    1607             :             btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN))
    1608             :                 return;
    1609             : 
    1610           0 :         ASSERT(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
    1611             : 
    1612           0 :         memzero_extent_buffer(eb, 0, eb->len);
    1613           0 :         set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags);
    1614           0 :         set_extent_buffer_dirty(eb);
    1615           0 :         set_extent_bit(&trans->dirty_pages, eb->start, eb->start + eb->len - 1,
    1616             :                         EXTENT_DIRTY | EXTENT_NOWAIT, NULL);
    1617             : }
    1618             : 
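                      : /*
                      :  * Decide whether a bio should be submitted as REQ_OP_ZONE_APPEND: only
                      :  * data writes on a zoned filesystem targeting a block group backed by
                      :  * sequential zones qualify; relocation writes are excluded below.
                      :  */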
    1619           0 : bool btrfs_use_zone_append(struct btrfs_bio *bbio)
    1620             : {
    1621           0 :         u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT);
    1622           0 :         struct btrfs_inode *inode = bbio->inode;
    1623           0 :         struct btrfs_fs_info *fs_info = bbio->fs_info;
    1624           0 :         struct btrfs_block_group *cache;
    1625           0 :         bool ret = false;
    1626             : 
    1627           0 :         if (!btrfs_is_zoned(fs_info))
    1628             :                 return false;
    1629             : 
    1630           0 :         if (!inode || !is_data_inode(&inode->vfs_inode))
    1631             :                 return false;
    1632             : 
    1633           0 :         if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE)
    1634             :                 return false;
    1635             : 
    1636             :         /*
     1637             :          * Using REQ_OP_ZONE_APPEND for relocation can break the assumptions
     1638             :          * the relocation code has on the extent layout.
     1639             :          * Furthermore, we have set aside our own block group from which only
     1640             :          * the relocation "process" can allocate, and we make sure only one
     1641             :          * process at a time can add pages to an extent that gets relocated,
     1642             :          * so it's safe to use a regular REQ_OP_WRITE for this special case.
    1643             :          */
    1644           0 :         if (btrfs_is_data_reloc_root(inode->root))
    1645             :                 return false;
    1646             : 
    1647           0 :         cache = btrfs_lookup_block_group(fs_info, start);
    1648           0 :         ASSERT(cache);
    1649           0 :         if (!cache)
    1650             :                 return false;
    1651             : 
    1652           0 :         ret = !!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags);
    1653           0 :         btrfs_put_block_group(cache);
    1654             : 
    1655           0 :         return ret;
    1656             : }
    1657             : 
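                      : /*
                      :  * A zone append write may complete at a different physical address
                      :  * than originally assigned. Shift the logical address recorded in the
                      :  * checksum entry by the same delta.
                      :  */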
    1658           0 : void btrfs_record_physical_zoned(struct btrfs_bio *bbio)
    1659             : {
    1660           0 :         const u64 physical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
    1661           0 :         struct btrfs_ordered_sum *sum = bbio->sums;
    1662             : 
    1663           0 :         if (physical < bbio->orig_physical)
    1664           0 :                 sum->logical -= bbio->orig_physical - physical;
    1665             :         else
    1666           0 :                 sum->logical += physical - bbio->orig_physical;
    1667           0 : }
    1668             : 
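                      : /*
                      :  * Point the ordered extent and its cached extent map at the logical
                      :  * address the zone append write actually landed on.
                      :  */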
    1669           0 : static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered,
    1670             :                                         u64 logical)
    1671             : {
    1672           0 :         struct extent_map_tree *em_tree = &BTRFS_I(ordered->inode)->extent_tree;
    1673           0 :         struct extent_map *em;
    1674             : 
    1675           0 :         ordered->disk_bytenr = logical;
    1676             : 
    1677           0 :         write_lock(&em_tree->lock);
    1678           0 :         em = search_extent_mapping(em_tree, ordered->file_offset,
    1679             :                                    ordered->num_bytes);
    1680           0 :         em->block_start = logical;
    1681           0 :         free_extent_map(em);
    1682           0 :         write_unlock(&em_tree->lock);
    1683           0 : }
    1684             : 
    1685           0 : static bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent *ordered,
    1686             :                                       u64 logical, u64 len)
    1687             : {
    1688           0 :         struct btrfs_ordered_extent *new;
    1689             : 
    1690           0 :         if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
    1691           0 :             split_extent_map(BTRFS_I(ordered->inode), ordered->file_offset,
    1692             :                              ordered->num_bytes, len, logical))
    1693             :                 return false;
    1694             : 
    1695           0 :         new = btrfs_split_ordered_extent(ordered, len);
    1696           0 :         if (IS_ERR(new))
    1697             :                 return false;
    1698           0 :         new->disk_bytenr = logical;
    1699           0 :         btrfs_finish_one_ordered(new);
    1700           0 :         return true;
    1701             : }
    1702             : 
    1703           0 : void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered)
    1704             : {
    1705           0 :         struct btrfs_inode *inode = BTRFS_I(ordered->inode);
    1706           0 :         struct btrfs_fs_info *fs_info = inode->root->fs_info;
    1707           0 :         struct btrfs_ordered_sum *sum =
    1708           0 :                 list_first_entry(&ordered->list, typeof(*sum), list);
    1709           0 :         u64 logical = sum->logical;
    1710           0 :         u64 len = sum->len;
    1711             : 
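                      :         /*
                      :          * Walk the checksum entries and merge runs whose recorded
                      :          * logical addresses are contiguous. Where they are not, split
                      :          * the ordered extent at that boundary.
                      :          */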
    1712           0 :         while (len < ordered->disk_num_bytes) {
    1713           0 :                 sum = list_next_entry(sum, list);
    1714           0 :                 if (sum->logical == logical + len) {
    1715           0 :                         len += sum->len;
    1716           0 :                         continue;
    1717             :                 }
    1718           0 :                 if (!btrfs_zoned_split_ordered(ordered, logical, len)) {
    1719           0 :                         set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
    1720           0 :                         btrfs_err(fs_info, "failed to split ordered extent");
    1721           0 :                         goto out;
    1722             :                 }
    1723           0 :                 logical = sum->logical;
    1724           0 :                 len = sum->len;
    1725             :         }
    1726             : 
    1727           0 :         if (ordered->disk_bytenr != logical)
    1728           0 :                 btrfs_rewrite_logical_zoned(ordered, logical);
    1729             : 
    1730           0 : out:
    1731             :         /*
    1732             :          * If we end up here for nodatasum I/O, the btrfs_ordered_sum structures
    1733             :          * were allocated by btrfs_alloc_dummy_sum only to record the logical
    1734             :          * addresses and don't contain actual checksums.  We thus must free them
    1735             :          * here so that we don't attempt to log the csums later.
    1736             :          */
    1737           0 :         if ((inode->flags & BTRFS_INODE_NODATASUM) ||
    1738           0 :             test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) {
    1739           0 :                 while ((sum = list_first_entry_or_null(&ordered->list,
    1740             :                                                        typeof(*sum), list))) {
    1741           0 :                         list_del(&sum->list);
    1742           0 :                         kfree(sum);
    1743             :                 }
    1744             :         }
    1745           0 : }
    1746             : 
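                      : /*
                      :  * Metadata on a zoned filesystem is written strictly in order. Allow
                      :  * writing @eb only when it sits exactly at the block group's meta
                      :  * write pointer, and advance the pointer past it on success.
                      :  */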
    1747           0 : bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
    1748             :                                     struct extent_buffer *eb,
    1749             :                                     struct btrfs_block_group **cache_ret)
    1750             : {
    1751           0 :         struct btrfs_block_group *cache;
    1752           0 :         bool ret = true;
    1753             : 
    1754           0 :         if (!btrfs_is_zoned(fs_info))
    1755             :                 return true;
    1756             : 
    1757           0 :         cache = btrfs_lookup_block_group(fs_info, eb->start);
    1758           0 :         if (!cache)
    1759             :                 return true;
    1760             : 
    1761           0 :         if (cache->meta_write_pointer != eb->start) {
    1762           0 :                 btrfs_put_block_group(cache);
    1763           0 :                 cache = NULL;
    1764           0 :                 ret = false;
    1765             :         } else {
    1766           0 :                 cache->meta_write_pointer = eb->start + eb->len;
    1767             :         }
    1768             : 
    1769           0 :         *cache_ret = cache;
    1770             : 
    1771           0 :         return ret;
    1772             : }
    1773             : 
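                      : /* Undo the meta write pointer advance when writing @eb was aborted. */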
    1774           0 : void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache,
    1775             :                                      struct extent_buffer *eb)
    1776             : {
    1777           0 :         if (!btrfs_is_zoned(eb->fs_info) || !cache)
    1778             :                 return;
    1779             : 
    1780           0 :         ASSERT(cache->meta_write_pointer == eb->start + eb->len);
    1781           0 :         cache->meta_write_pointer = eb->start;
    1782             : }
    1783             : 
    1784           0 : int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length)
    1785             : {
    1786           0 :         if (!btrfs_dev_is_sequential(device, physical))
    1787             :                 return -EOPNOTSUPP;
    1788             : 
    1789           0 :         return blkdev_issue_zeroout(device->bdev, physical >> SECTOR_SHIFT,
    1790             :                                     length >> SECTOR_SHIFT, GFP_NOFS, 0);
    1791             : }
    1792             : 
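                      : /*
                      :  * Report the zone backing @logical using the first available read
                      :  * mirror. RAID56 profiles are rejected here.
                      :  */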
    1793           0 : static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
    1794             :                           struct blk_zone *zone)
    1795             : {
    1796           0 :         struct btrfs_io_context *bioc = NULL;
    1797           0 :         u64 mapped_length = PAGE_SIZE;
    1798           0 :         unsigned int nofs_flag;
    1799           0 :         int nmirrors;
    1800           0 :         int i, ret;
    1801             : 
    1802           0 :         ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
    1803             :                               &mapped_length, &bioc, NULL, NULL, 1);
    1804           0 :         if (ret || !bioc || mapped_length < PAGE_SIZE) {
    1805           0 :                 ret = -EIO;
    1806           0 :                 goto out_put_bioc;
    1807             :         }
    1808             : 
    1809           0 :         if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
    1810           0 :                 ret = -EINVAL;
    1811           0 :                 goto out_put_bioc;
    1812             :         }
    1813             : 
    1814           0 :         nofs_flag = memalloc_nofs_save();
    1815           0 :         nmirrors = (int)bioc->num_stripes;
    1816           0 :         for (i = 0; i < nmirrors; i++) {
    1817           0 :                 u64 physical = bioc->stripes[i].physical;
    1818           0 :                 struct btrfs_device *dev = bioc->stripes[i].dev;
    1819             : 
    1820             :                 /* Missing device */
    1821           0 :                 if (!dev->bdev)
    1822           0 :                         continue;
    1823             : 
    1824           0 :                 ret = btrfs_get_dev_zone(dev, physical, zone);
    1825             :                 /* Failing device */
    1826           0 :                 if (ret == -EIO || ret == -EOPNOTSUPP)
    1827           0 :                         continue;
    1828             :                 break;
    1829             :         }
    1830           0 :         memalloc_nofs_restore(nofs_flag);
    1831           0 : out_put_bioc:
    1832           0 :         btrfs_put_bioc(bioc);
    1833           0 :         return ret;
    1834             : }
    1835             : 
    1836             : /*
     1837             :  * Synchronize the write pointer of the zone at @physical_start on
     1838             :  * @tgt_dev by filling zeros from @physical_pos up to the write pointer
     1839             :  * of the dev-replace source device.
    1840             :  */
    1841           0 : int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
    1842             :                                     u64 physical_start, u64 physical_pos)
    1843             : {
    1844           0 :         struct btrfs_fs_info *fs_info = tgt_dev->fs_info;
    1845           0 :         struct blk_zone zone;
    1846           0 :         u64 length;
    1847           0 :         u64 wp;
    1848           0 :         int ret;
    1849             : 
    1850           0 :         if (!btrfs_dev_is_sequential(tgt_dev, physical_pos))
    1851             :                 return 0;
    1852             : 
    1853           0 :         ret = read_zone_info(fs_info, logical, &zone);
    1854           0 :         if (ret)
    1855             :                 return ret;
    1856             : 
    1857           0 :         wp = physical_start + ((zone.wp - zone.start) << SECTOR_SHIFT);
    1858             : 
    1859           0 :         if (physical_pos == wp)
    1860             :                 return 0;
    1861             : 
    1862           0 :         if (physical_pos > wp)
    1863             :                 return -EUCLEAN;
    1864             : 
    1865           0 :         length = wp - physical_pos;
    1866           0 :         return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length);
    1867             : }
    1868             : 
    1869             : /*
    1870             :  * Activate block group and underlying device zones
    1871             :  *
    1872             :  * @block_group: the block group to activate
    1873             :  *
    1874             :  * Return: true on success, false otherwise
    1875             :  */
    1876           0 : bool btrfs_zone_activate(struct btrfs_block_group *block_group)
    1877             : {
    1878           0 :         struct btrfs_fs_info *fs_info = block_group->fs_info;
    1879           0 :         struct btrfs_space_info *space_info = block_group->space_info;
    1880           0 :         struct map_lookup *map;
    1881           0 :         struct btrfs_device *device;
    1882           0 :         u64 physical;
    1883           0 :         bool ret;
    1884           0 :         int i;
    1885             : 
    1886           0 :         if (!btrfs_is_zoned(block_group->fs_info))
    1887             :                 return true;
    1888             : 
    1889           0 :         map = block_group->physical_map;
    1890             : 
    1891           0 :         spin_lock(&space_info->lock);
    1892           0 :         spin_lock(&block_group->lock);
    1893           0 :         if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
    1894           0 :                 ret = true;
    1895           0 :                 goto out_unlock;
    1896             :         }
    1897             : 
    1898             :         /* No space left */
    1899           0 :         if (btrfs_zoned_bg_is_full(block_group)) {
    1900           0 :                 ret = false;
    1901           0 :                 goto out_unlock;
    1902             :         }
    1903             : 
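                      :         /*
                      :          * Charge an active zone on each stripe's device that limits
                      :          * the number of active zones.
                      :          */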
    1904           0 :         for (i = 0; i < map->num_stripes; i++) {
    1905           0 :                 device = map->stripes[i].dev;
    1906           0 :                 physical = map->stripes[i].physical;
    1907             : 
    1908           0 :                 if (device->zone_info->max_active_zones == 0)
    1909           0 :                         continue;
    1910             : 
    1911           0 :                 if (!btrfs_dev_set_active_zone(device, physical)) {
    1912             :                         /* Cannot activate the zone */
    1913           0 :                         ret = false;
    1914           0 :                         goto out_unlock;
    1915             :                 }
    1916             :         }
    1917             : 
    1918             :         /* Successfully activated all the zones */
    1919           0 :         set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
    1920           0 :         WARN_ON(block_group->alloc_offset != 0);
    1921           0 :         if (block_group->zone_unusable == block_group->length) {
    1922           0 :                 block_group->zone_unusable = block_group->length - block_group->zone_capacity;
    1923           0 :                 space_info->bytes_zone_unusable -= block_group->zone_capacity;
    1924             :         }
    1925           0 :         spin_unlock(&block_group->lock);
    1926           0 :         btrfs_try_granting_tickets(fs_info, space_info);
    1927           0 :         spin_unlock(&space_info->lock);
    1928             : 
    1929             :         /* For the active block group list */
    1930           0 :         btrfs_get_block_group(block_group);
    1931             : 
    1932           0 :         spin_lock(&fs_info->zone_active_bgs_lock);
    1933           0 :         list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs);
    1934           0 :         spin_unlock(&fs_info->zone_active_bgs_lock);
    1935             : 
    1936           0 :         return true;
    1937             : 
    1938           0 : out_unlock:
    1939           0 :         spin_unlock(&block_group->lock);
    1940           0 :         spin_unlock(&space_info->lock);
    1941           0 :         return ret;
    1942             : }
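
Worked example of the zone_unusable adjustment performed on activation above. This is a standalone userspace sketch with made-up numbers, not kernel code: activating an empty block group converts its provisional "fully unusable" accounting into just the gap between zone length and zone capacity.

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t length = 256ULL << 20;        /* block group length: one 256 MiB zone */
            uint64_t zone_capacity = 252ULL << 20; /* usable bytes reported by the device */
            uint64_t zone_unusable = length;       /* inactive empty zone: all unusable */

            /* Mirrors the branch guarded by "zone_unusable == length" above. */
            if (zone_unusable == length)
                    zone_unusable = length - zone_capacity;

            /* Prints 4: only the capacity/length gap stays unusable. */
            printf("zone_unusable after activation: %llu MiB\n",
                   (unsigned long long)(zone_unusable >> 20));
            return 0;
    }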
    1943             : 
    1944           0 : static void wait_eb_writebacks(struct btrfs_block_group *block_group)
    1945             : {
    1946           0 :         struct btrfs_fs_info *fs_info = block_group->fs_info;
    1947           0 :         const u64 end = block_group->start + block_group->length;
    1948           0 :         struct radix_tree_iter iter;
    1949           0 :         struct extent_buffer *eb;
    1950           0 :         void __rcu **slot;
    1951             : 
    1952           0 :         rcu_read_lock();
    1953           0 :         radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter,
    1954             :                                  block_group->start >> fs_info->sectorsize_bits) {
    1955           0 :                 eb = radix_tree_deref_slot(slot);
    1956           0 :                 if (!eb)
    1957           0 :                         continue;
    1958           0 :                 if (radix_tree_deref_retry(eb)) {
    1959           0 :                         slot = radix_tree_iter_retry(&iter);
    1960           0 :                         continue;
    1961             :                 }
    1962             : 
    1963           0 :                 if (eb->start < block_group->start)
    1964           0 :                         continue;
    1965           0 :                 if (eb->start >= end)
    1966             :                         break;
    1967             : 
    1968           0 :                 slot = radix_tree_iter_resume(slot, &iter);
    1969           0 :                 rcu_read_unlock();
    1970           0 :                 wait_on_extent_buffer_writeback(eb);
    1971           0 :                 rcu_read_lock();
    1972             :         }
    1973           0 :         rcu_read_unlock();
    1974           0 : }
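
wait_eb_writebacks() above follows the standard radix tree idiom for sleeping inside a walk: mark the iterator with radix_tree_iter_resume() so the RCU read lock can be dropped around the sleep, then reacquire it and continue. A minimal sketch of that shape follows; it is a fragment, not a standalone program, and do_sleeping_wait(), root, item and first_index are hypothetical placeholders.

    rcu_read_lock();
    radix_tree_for_each_slot(slot, root, &iter, first_index) {
            item = radix_tree_deref_slot(slot);
            if (!item)
                    continue;
            if (radix_tree_deref_retry(item)) {
                    /* The slot moved under us; restart this chunk of the walk. */
                    slot = radix_tree_iter_retry(&iter);
                    continue;
            }

            /* Make the iterator safe to resume after dropping RCU. */
            slot = radix_tree_iter_resume(slot, &iter);
            rcu_read_unlock();
            do_sleeping_wait(item);         /* may sleep; not allowed under RCU */
            rcu_read_lock();
    }
    rcu_read_unlock();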
    1975             : 
    1976           0 : static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
    1977             : {
    1978           0 :         struct btrfs_fs_info *fs_info = block_group->fs_info;
    1979           0 :         struct map_lookup *map;
    1980           0 :         const bool is_metadata = (block_group->flags &
    1981             :                         (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM));
    1982           0 :         int ret = 0;
    1983           0 :         int i;
    1984             : 
    1985           0 :         spin_lock(&block_group->lock);
    1986           0 :         if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
    1987           0 :                 spin_unlock(&block_group->lock);
    1988           0 :                 return 0;
    1989             :         }
    1990             : 
    1991             :         /* Check if we have unwritten allocated space */
    1992           0 :         if (is_metadata &&
    1993           0 :             block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) {
    1994           0 :                 spin_unlock(&block_group->lock);
    1995           0 :                 return -EAGAIN;
    1996             :         }
    1997             : 
    1998             :         /*
    1999             :          * If we are sure that the block group is full (= no more room left for
    2000             :          * new allocation) and the IO for the last usable block has completed,
    2001             :          * we don't need to wait for the other IOs. This holds because we
    2002             :          * ensure IOs are submitted sequentially: via the ZONE_APPEND command
    2003             :          * for data, and via block_group->meta_write_pointer for metadata.
    2004             :          */
    2005           0 :         if (!fully_written) {
    2006           0 :                 spin_unlock(&block_group->lock);
    2007             : 
    2008           0 :                 ret = btrfs_inc_block_group_ro(block_group, false);
    2009           0 :                 if (ret)
    2010             :                         return ret;
    2011             : 
    2012             :                 /* Ensure all writes in this block group finish */
    2013           0 :                 btrfs_wait_block_group_reservations(block_group);
    2014             :                 /* No need to wait for NOCOW writers. Zoned mode does not allow that */
    2015           0 :                 btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
    2016             :                                          block_group->length);
    2017             :                 /* Wait for extent buffers to be written. */
    2018           0 :                 if (is_metadata)
    2019           0 :                         wait_eb_writebacks(block_group);
    2020             : 
    2021           0 :                 spin_lock(&block_group->lock);
    2022             : 
    2023             :                 /*
    2024             :                  * Bail out if someone already deactivated the block group, or
    2025             :                  * if reserved (allocated but unwritten) space remains in it.
    2026             :                  */
    2027           0 :                 if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
    2028             :                               &block_group->runtime_flags)) {
    2029           0 :                         spin_unlock(&block_group->lock);
    2030           0 :                         btrfs_dec_block_group_ro(block_group);
    2031           0 :                         return 0;
    2032             :                 }
    2033             : 
    2034           0 :                 if (block_group->reserved) {
    2035           0 :                         spin_unlock(&block_group->lock);
    2036           0 :                         btrfs_dec_block_group_ro(block_group);
    2037           0 :                         return -EAGAIN;
    2038             :                 }
    2039             :         }
    2040             : 
    2041           0 :         clear_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
    2042           0 :         block_group->alloc_offset = block_group->zone_capacity;
    2043           0 :         block_group->free_space_ctl->free_space = 0;
    2044           0 :         btrfs_clear_treelog_bg(block_group);
    2045           0 :         btrfs_clear_data_reloc_bg(block_group);
    2046           0 :         spin_unlock(&block_group->lock);
    2047             : 
    2048           0 :         map = block_group->physical_map;
    2049           0 :         for (i = 0; i < map->num_stripes; i++) {
    2050           0 :                 struct btrfs_device *device = map->stripes[i].dev;
    2051           0 :                 const u64 physical = map->stripes[i].physical;
    2052             : 
    2053           0 :                 if (device->zone_info->max_active_zones == 0)
    2054           0 :                         continue;
    2055             : 
    2056           0 :                 ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
    2057             :                                        physical >> SECTOR_SHIFT,
    2058           0 :                                        device->zone_info->zone_size >> SECTOR_SHIFT,
    2059             :                                        GFP_NOFS);
    2060             : 
    2061           0 :                 if (ret)
    2062           0 :                         return ret;
    2063             : 
    2064           0 :                 btrfs_dev_clear_active_zone(device, physical);
    2065             :         }
    2066             : 
    2067           0 :         if (!fully_written)
    2068           0 :                 btrfs_dec_block_group_ro(block_group);
    2069             : 
    2070           0 :         spin_lock(&fs_info->zone_active_bgs_lock);
    2071           0 :         ASSERT(!list_empty(&block_group->active_bg_list));
    2072           0 :         list_del_init(&block_group->active_bg_list);
    2073           0 :         spin_unlock(&fs_info->zone_active_bgs_lock);
    2074             : 
    2075             :         /* Drop the reference held for the active_bg_list. */
    2076           0 :         btrfs_put_block_group(block_group);
    2077             : 
    2078           0 :         clear_and_wake_up_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);
    2079             : 
    2080           0 :         return 0;
    2081             : }
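
The blkdev_zone_mgmt() calls above take sector_t LBAs, so the byte-granular stripe offset and zone size are shifted down by SECTOR_SHIFT (9, i.e. 512-byte sectors). A standalone sketch of that conversion with made-up numbers:

    #include <stdint.h>
    #include <stdio.h>

    #define SECTOR_SHIFT 9  /* kernel convention: 512-byte sectors */

    int main(void)
    {
            uint64_t physical = 1024ULL << 20;  /* stripe offset: 1 GiB into the device */
            uint64_t zone_size = 256ULL << 20;  /* 256 MiB zones */

            /* Byte offsets become LBAs for REQ_OP_ZONE_FINISH. */
            printf("sector=%llu nr_sectors=%llu\n",
                   (unsigned long long)(physical >> SECTOR_SHIFT),
                   (unsigned long long)(zone_size >> SECTOR_SHIFT));
            return 0;
    }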
    2082             : 
    2083           0 : int btrfs_zone_finish(struct btrfs_block_group *block_group)
    2084             : {
    2085           0 :         if (!btrfs_is_zoned(block_group->fs_info))
    2086             :                 return 0;
    2087             : 
    2088           0 :         return do_zone_finish(block_group, false);
    2089             : }
    2090             : 
    2091           0 : bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
    2092             : {
    2093           0 :         struct btrfs_fs_info *fs_info = fs_devices->fs_info;
    2094           0 :         struct btrfs_device *device;
    2095           0 :         bool ret = false;
    2096             : 
    2097           0 :         if (!btrfs_is_zoned(fs_info))
    2098             :                 return true;
    2099             : 
    2100             :         /* Check if there is a device with active zones left */
    2101           0 :         mutex_lock(&fs_info->chunk_mutex);
    2102           0 :         list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
    2103           0 :                 struct btrfs_zoned_device_info *zinfo = device->zone_info;
    2104             : 
    2105           0 :                 if (!device->bdev)
    2106           0 :                         continue;
    2107             : 
    2108           0 :                 if (!zinfo->max_active_zones) {
    2109             :                         ret = true;
    2110             :                         break;
    2111             :                 }
    2112             : 
    2113           0 :                 switch (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
    2114           0 :                 case 0: /* single */
    2115           0 :                         ret = (atomic_read(&zinfo->active_zones_left) >= 1);
    2116           0 :                         break;
    2117           0 :                 case BTRFS_BLOCK_GROUP_DUP:
    2118           0 :                         ret = (atomic_read(&zinfo->active_zones_left) >= 2);
    2119           0 :                         break;
    2120             :                 }
    2121           0 :                 if (ret)
    2122             :                         break;
    2123             :         }
    2124           0 :         mutex_unlock(&fs_info->chunk_mutex);
    2125             : 
    2126           0 :         if (!ret)
    2127           0 :                 set_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);
    2128             : 
    2129             :         return ret;
    2130             : }
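
A minimal standalone sketch of the per-profile test above, with hypothetical stand-ins (enum profile, device_can_host_new_bg) for the real profile flags and zinfo counter: a single-profile block group needs one free active zone on some device, while DUP places both stripes on the same device and therefore needs two.

    #include <stdbool.h>
    #include <stdio.h>

    enum profile { PROFILE_SINGLE, PROFILE_DUP };

    static bool device_can_host_new_bg(int active_zones_left, enum profile p)
    {
            switch (p) {
            case PROFILE_SINGLE:
                    return active_zones_left >= 1;  /* one stripe on this device */
            case PROFILE_DUP:
                    return active_zones_left >= 2;  /* both copies on one device */
            }
            return false;
    }

    int main(void)
    {
            /* A device with one active zone left can host single but not DUP. */
            printf("single: %d, dup: %d\n",
                   device_can_host_new_bg(1, PROFILE_SINGLE),
                   device_can_host_new_bg(1, PROFILE_DUP));
            return 0;
    }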
    2131             : 
    2132           0 : void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
    2133             : {
    2134           0 :         struct btrfs_block_group *block_group;
    2135           0 :         u64 min_alloc_bytes;
    2136             : 
    2137           0 :         if (!btrfs_is_zoned(fs_info))
    2138             :                 return;
    2139             : 
    2140           0 :         block_group = btrfs_lookup_block_group(fs_info, logical);
    2141           0 :         ASSERT(block_group);
    2142             : 
    2143             :         /* No MIXED_BG on zoned btrfs. */
    2144           0 :         if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
    2145           0 :                 min_alloc_bytes = fs_info->sectorsize;
    2146             :         else
    2147           0 :                 min_alloc_bytes = fs_info->nodesize;
    2148             : 
    2149             :         /* Bail out if we can allocate more data from this block group. */
    2150           0 :         if (logical + length + min_alloc_bytes <=
    2151           0 :             block_group->start + block_group->zone_capacity)
    2152           0 :                 goto out;
    2153             : 
    2154           0 :         do_zone_finish(block_group, true);
    2155             : 
    2156           0 : out:
    2157           0 :         btrfs_put_block_group(block_group);
    2158             : }
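
Worked example of the early-finish test above, as a standalone sketch with made-up numbers: once the just-completed write leaves less than one minimal allocation unit (sectorsize for data, nodesize for metadata) below the zone capacity, nothing more can be allocated from the block group and its zone is finished.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t bg_start = 0;
            uint64_t zone_capacity = 252ULL << 20;  /* usable bytes in the zone */
            uint64_t sectorsize = 4096;             /* minimal data allocation unit */
            uint64_t logical = bg_start + zone_capacity - 4096; /* last data block */
            uint64_t length = 4096;

            /* Mirrors the test above: does one more minimal allocation still fit? */
            bool can_allocate_more =
                    logical + length + sectorsize <= bg_start + zone_capacity;

            printf("finish the zone now: %s\n", can_allocate_more ? "no" : "yes");
            return 0;
    }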
    2159             : 
    2160           0 : static void btrfs_zone_finish_endio_workfn(struct work_struct *work)
    2161             : {
    2162           0 :         struct btrfs_block_group *bg =
    2163           0 :                 container_of(work, struct btrfs_block_group, zone_finish_work);
    2164             : 
    2165           0 :         wait_on_extent_buffer_writeback(bg->last_eb);
    2166           0 :         free_extent_buffer(bg->last_eb);
    2167           0 :         btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length);
    2168           0 :         btrfs_put_block_group(bg);
    2169           0 : }
    2170             : 
    2171           0 : void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
    2172             :                                    struct extent_buffer *eb)
    2173             : {
    2174           0 :         if (!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &bg->runtime_flags) ||
    2175           0 :             eb->start + eb->len * 2 <= bg->start + bg->zone_capacity)
    2176             :                 return;
    2177             : 
    2178           0 :         if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) {
    2179           0 :                 btrfs_err(bg->fs_info, "double scheduling of bg %llu zone finishing",
    2180             :                           bg->start);
    2181           0 :                 return;
    2182             :         }
    2183             : 
    2184             :         /* Take bg and eb references for the deferred work. */
    2185           0 :         btrfs_get_block_group(bg);
    2186           0 :         atomic_inc(&eb->refs);
    2187           0 :         bg->last_eb = eb;
    2188           0 :         INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn);
    2189           0 :         queue_work(system_unbound_wq, &bg->zone_finish_work);
    2190             : }
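
Worked example of the scheduling condition above, as a standalone sketch with made-up numbers: the finish work is only queued once the extent buffer being written leaves no room for another buffer of the same size below the zone capacity.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t bg_start = 0;
            uint64_t zone_capacity = 252ULL << 20;
            uint64_t eb_len = 16384;  /* 16 KiB nodesize */
            uint64_t eb_start = bg_start + zone_capacity - eb_len; /* last possible eb */

            /* Inverse of the early-return test in btrfs_schedule_zone_finish_bg(). */
            bool schedule_finish =
                    !(eb_start + eb_len * 2 <= bg_start + zone_capacity);

            printf("schedule zone finish: %s\n", schedule_finish ? "yes" : "no");
            return 0;
    }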
    2191             : 
    2192           0 : void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
    2193             : {
    2194           0 :         struct btrfs_fs_info *fs_info = bg->fs_info;
    2195             : 
    2196           0 :         spin_lock(&fs_info->relocation_bg_lock);
    2197           0 :         if (fs_info->data_reloc_bg == bg->start)
    2198           0 :                 fs_info->data_reloc_bg = 0;
    2199           0 :         spin_unlock(&fs_info->relocation_bg_lock);
    2200           0 : }
    2201             : 
    2202           0 : void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
    2203             : {
    2204           0 :         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
    2205           0 :         struct btrfs_device *device;
    2206             : 
    2207           0 :         if (!btrfs_is_zoned(fs_info))
    2208             :                 return;
    2209             : 
    2210           0 :         mutex_lock(&fs_devices->device_list_mutex);
    2211           0 :         list_for_each_entry(device, &fs_devices->devices, dev_list) {
    2212           0 :                 if (device->zone_info) {
    2213           0 :                         vfree(device->zone_info->zone_cache);
    2214           0 :                         device->zone_info->zone_cache = NULL;
    2215             :                 }
    2216             :         }
    2217           0 :         mutex_unlock(&fs_devices->device_list_mutex);
    2218             : }
    2219             : 
    2220           0 : bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
    2221             : {
    2222           0 :         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
    2223           0 :         struct btrfs_device *device;
    2224           0 :         u64 used = 0;
    2225           0 :         u64 total = 0;
    2226           0 :         u64 factor;
    2227             : 
    2228           0 :         ASSERT(btrfs_is_zoned(fs_info));
    2229             : 
    2230           0 :         if (fs_info->bg_reclaim_threshold == 0)
    2231             :                 return false;
    2232             : 
    2233           0 :         mutex_lock(&fs_devices->device_list_mutex);
    2234           0 :         list_for_each_entry(device, &fs_devices->devices, dev_list) {
    2235           0 :                 if (!device->bdev)
    2236           0 :                         continue;
    2237             : 
    2238           0 :                 total += device->disk_total_bytes;
    2239           0 :                 used += device->bytes_used;
    2240             :         }
    2241           0 :         mutex_unlock(&fs_devices->device_list_mutex);
    2242             : 
    2243           0 :         factor = div64_u64(used * 100, total);
    2244           0 :         return factor >= fs_info->bg_reclaim_threshold;
    2245             : }
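
Worked example of the threshold computation above, as a standalone userspace sketch with made-up numbers (the kernel uses div64_u64() so the same division also works on 32-bit targets; 75 is the default bg_reclaim_threshold):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t used = 750ULL << 30;    /* bytes_used summed over all devices */
            uint64_t total = 1024ULL << 30;  /* disk_total_bytes summed likewise */
            uint64_t factor = used * 100 / total;  /* 73 */

            /* 73 < 75, so this filesystem would not reclaim yet. */
            printf("factor=%llu -> reclaim=%d\n",
                   (unsigned long long)factor, factor >= 75);
            return 0;
    }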
    2246             : 
    2247           0 : void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
    2248             :                                        u64 length)
    2249             : {
    2250           0 :         struct btrfs_block_group *block_group;
    2251             : 
    2252           0 :         if (!btrfs_is_zoned(fs_info))
    2253             :                 return;
    2254             : 
    2255           0 :         block_group = btrfs_lookup_block_group(fs_info, logical);
    2256             :         /* This should only be called on a former data relocation block group. */
    2257           0 :         ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA));
    2258             : 
    2259           0 :         spin_lock(&block_group->lock);
    2260           0 :         if (!test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))
    2261           0 :                 goto out;
    2262             : 
    2263             :         /* All relocation extents are written. */
    2264           0 :         if (block_group->start + block_group->alloc_offset == logical + length) {
    2265             :                 /* Now, release this block group for further allocations. */
    2266           0 :                 clear_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
    2267             :                           &block_group->runtime_flags);
    2268             :         }
    2269             : 
    2270           0 : out:
    2271           0 :         spin_unlock(&block_group->lock);
    2272           0 :         btrfs_put_block_group(block_group);
    2273             : }
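
Worked example of the release test above, as a standalone sketch with made-up numbers: the ZONED_DATA_RELOC flag is cleared once the end of the completed write reaches the block group's allocation frontier (start + alloc_offset).

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t bg_start = 1024ULL << 20;           /* block group at 1 GiB */
            uint64_t alloc_offset = 64ULL << 20;         /* 64 MiB allocated so far */
            uint64_t logical = bg_start + (60ULL << 20); /* last reloc extent */
            uint64_t length = 4ULL << 20;

            /* True once the written range reaches the allocation frontier. */
            bool all_written = bg_start + alloc_offset == logical + length;

            printf("release the reloc block group: %s\n", all_written ? "yes" : "no");
            return 0;
    }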
    2274             : 
    2275           0 : int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
    2276             : {
    2277           0 :         struct btrfs_block_group *block_group;
    2278           0 :         struct btrfs_block_group *min_bg = NULL;
    2279           0 :         u64 min_avail = U64_MAX;
    2280           0 :         int ret;
    2281             : 
    2282           0 :         spin_lock(&fs_info->zone_active_bgs_lock);
    2283           0 :         list_for_each_entry(block_group, &fs_info->zone_active_bgs,
    2284             :                             active_bg_list) {
    2285           0 :                 u64 avail;
    2286             : 
    2287           0 :                 spin_lock(&block_group->lock);
    2288           0 :                 if (block_group->reserved || block_group->alloc_offset == 0 ||
    2289           0 :                     (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)) {
    2290           0 :                         spin_unlock(&block_group->lock);
    2291           0 :                         continue;
    2292             :                 }
    2293             : 
    2294           0 :                 avail = block_group->zone_capacity - block_group->alloc_offset;
    2295           0 :                 if (min_avail > avail) {
    2296           0 :                         if (min_bg)
    2297           0 :                                 btrfs_put_block_group(min_bg);
    2298           0 :                         min_bg = block_group;
    2299           0 :                         min_avail = avail;
    2300           0 :                         btrfs_get_block_group(min_bg);
    2301             :                 }
    2302           0 :                 spin_unlock(&block_group->lock);
    2303             :         }
    2304           0 :         spin_unlock(&fs_info->zone_active_bgs_lock);
    2305             : 
    2306           0 :         if (!min_bg)
    2307             :                 return 0;
    2308             : 
    2309           0 :         ret = btrfs_zone_finish(min_bg);
    2310           0 :         btrfs_put_block_group(min_bg);
    2311             : 
    2312           0 :         return ret < 0 ? ret : 1;
    2313             : }
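
The loop above picks the active block group with the least remaining room (zone_capacity - alloc_offset), because force-finishing that one sacrifices the least unallocated capacity. A trivial standalone illustration with made-up numbers:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* Remaining room (zone_capacity - alloc_offset) of two active bgs. */
            uint64_t avail_a = 10ULL << 20;   /* 10 MiB left */
            uint64_t avail_b = 200ULL << 20;  /* 200 MiB left */

            /* Finishing the fuller block group throws away less space. */
            printf("finish block group %c\n", avail_a < avail_b ? 'A' : 'B');
            return 0;
    }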
    2314             : 
    2315           0 : int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
    2316             :                                 struct btrfs_space_info *space_info,
    2317             :                                 bool do_finish)
    2318             : {
    2319           0 :         struct btrfs_block_group *bg;
    2320           0 :         int index;
    2321             : 
    2322           0 :         if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA))
    2323             :                 return 0;
    2324             : 
    2325           0 :         for (;;) {
    2326           0 :                 int ret;
    2327           0 :                 bool need_finish = false;
    2328             : 
    2329           0 :                 down_read(&space_info->groups_sem);
    2330           0 :                 for (index = 0; index < BTRFS_NR_RAID_TYPES; index++) {
    2331           0 :                         list_for_each_entry(bg, &space_info->block_groups[index],
    2332             :                                             list) {
    2333           0 :                                 if (!spin_trylock(&bg->lock))
    2334           0 :                                         continue;
    2335           0 :                                 if (btrfs_zoned_bg_is_full(bg) ||
    2336           0 :                                     test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
    2337             :                                              &bg->runtime_flags)) {
    2338           0 :                                         spin_unlock(&bg->lock);
    2339           0 :                                         continue;
    2340             :                                 }
    2341           0 :                                 spin_unlock(&bg->lock);
    2342             : 
    2343           0 :                                 if (btrfs_zone_activate(bg)) {
    2344           0 :                                         up_read(&space_info->groups_sem);
    2345           0 :                                         return 1;
    2346             :                                 }
    2347             : 
    2348             :                                 need_finish = true;
    2349             :                         }
    2350             :                 }
    2351           0 :                 up_read(&space_info->groups_sem);
    2352             : 
    2353           0 :                 if (!do_finish || !need_finish)
    2354             :                         break;
    2355             : 
    2356           0 :                 ret = btrfs_zone_finish_one_bg(fs_info);
    2357           0 :                 if (ret == 0)
    2358             :                         break;
    2359           0 :                 if (ret < 0)
    2360           0 :                         return ret;
    2361             :         }
    2362             : 
    2363             :         return 0;
    2364             : }
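
A hypothetical caller sketch (the control flow and the retry_alloc label are made up for illustration) of how the return convention above is typically consumed: 1 means a block group was activated and the allocation can be retried, 0 means nothing could be activated, and a negative value is an error propagated from zone finishing.

    ret = btrfs_zoned_activate_one_bg(fs_info, space_info, true);
    if (ret < 0)
            return ret;        /* error while finishing a zone */
    if (ret == 1)
            goto retry_alloc;  /* a block group is now active; try again */
    /* ret == 0: no activation possible; the allocation must wait or fail */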

Generated by: LCOV version 1.14