Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 :
3 : #include <linux/bitops.h>
4 : #include <linux/slab.h>
5 : #include <linux/blkdev.h>
6 : #include <linux/sched/mm.h>
7 : #include <linux/atomic.h>
8 : #include <linux/vmalloc.h>
9 : #include "ctree.h"
10 : #include "volumes.h"
11 : #include "zoned.h"
12 : #include "rcu-string.h"
13 : #include "disk-io.h"
14 : #include "block-group.h"
15 : #include "transaction.h"
16 : #include "dev-replace.h"
17 : #include "space-info.h"
18 : #include "super.h"
19 : #include "fs.h"
20 : #include "accessors.h"
21 : #include "bio.h"
22 :
23 : /* Maximum number of zones to report per blkdev_report_zones() call */
24 : #define BTRFS_REPORT_NR_ZONES 4096
25 : /* Invalid allocation pointer value for missing devices */
26 : #define WP_MISSING_DEV ((u64)-1)
27 : /* Pseudo write pointer value for conventional zone */
28 : #define WP_CONVENTIONAL ((u64)-2)
29 :
30 : /*
31 : * Location of the first zone of superblock logging zone pairs.
32 : *
33 : * - primary superblock: 0B (zone 0)
34 : * - first copy: 512G (zone starting at that offset)
35 : * - second copy: 4T (zone starting at that offset)
36 : */
37 : #define BTRFS_SB_LOG_PRIMARY_OFFSET (0ULL)
38 : #define BTRFS_SB_LOG_FIRST_OFFSET (512ULL * SZ_1G)
39 : #define BTRFS_SB_LOG_SECOND_OFFSET (4096ULL * SZ_1G)
40 :
41 : #define BTRFS_SB_LOG_FIRST_SHIFT const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET)
42 : #define BTRFS_SB_LOG_SECOND_SHIFT const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET)
43 :
44 : /* Number of superblock log zones */
45 : #define BTRFS_NR_SB_LOG_ZONES 2
46 :
47 : /*
48 : * Minimum number of active zones we need:
49 : *
50 : * - BTRFS_SUPER_MIRROR_MAX zones for superblock mirrors
51 : * - 3 zones to ensure at least one zone per SYSTEM, META and DATA block group
52 : * - 1 zone for tree-log dedicated block group
53 : * - 1 zone for relocation
54 : */
55 : #define BTRFS_MIN_ACTIVE_ZONES (BTRFS_SUPER_MIRROR_MAX + 5)
56 :
57 : /*
58 : * Minimum / maximum supported zone size. Currently, SMR disks have a zone
59 : * size of 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range.
60 : * We do not expect the zone size to become larger than 8GiB or smaller than
61 : * 4MiB in the near future.
62 : */
63 : #define BTRFS_MAX_ZONE_SIZE SZ_8G
64 : #define BTRFS_MIN_ZONE_SIZE SZ_4M
65 :
66 : #define SUPER_INFO_SECTORS ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT)
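/*
 * Note (illustrative, assuming the usual 4KiB BTRFS_SUPER_INFO_SIZE and
 * 512-byte sectors): SUPER_INFO_SECTORS evaluates to 8, i.e. one superblock
 * copy occupies 8 sectors of a log zone.
 */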
67 :
68 : static inline bool sb_zone_is_full(const struct blk_zone *zone)
69 : {
70 0 : return (zone->cond == BLK_ZONE_COND_FULL) ||
71 0 : (zone->wp + SUPER_INFO_SECTORS > zone->start + zone->capacity);
72 : }
73 :
74 0 : static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
75 : {
76 0 : struct blk_zone *zones = data;
77 :
78 0 : memcpy(&zones[idx], zone, sizeof(*zone));
79 :
80 0 : return 0;
81 : }
82 :
83 0 : static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
84 : u64 *wp_ret)
85 : {
86 0 : bool empty[BTRFS_NR_SB_LOG_ZONES];
87 0 : bool full[BTRFS_NR_SB_LOG_ZONES];
88 0 : sector_t sector;
89 0 : int i;
90 :
91 0 : for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
92 0 : ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL);
93 0 : empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY);
94 0 : full[i] = sb_zone_is_full(&zones[i]);
95 : }
96 :
97 : /*
98 : * Possible states of log buffer zones
99 : *
100 : * Empty[0] In use[0] Full[0]
101 : * Empty[1] * 0 1
102 : * In use[1] x x 1
103 : * Full[1] 0 0 C
104 : *
105 : * Log position:
106 : * *: Special case, no superblock is written
107 : * 0: Use write pointer of zones[0]
108 : * 1: Use write pointer of zones[1]
109 : * C: Compare super blocks from zones[0] and zones[1], use the latest
110 : * one determined by generation
111 : * x: Invalid state
112 : */
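	/*
	 * Example reading of the table above (illustrative): if zones[0] is
	 * FULL and zones[1] is partially written ("In use"), the next
	 * superblock goes at zones[1].wp (case 1); if both zones are FULL,
	 * the generation fields of the superblocks at the end of each zone
	 * decide which copy is the latest (case C).
	 */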
113 :
114 0 : if (empty[0] && empty[1]) {
115 : /* Special case to distinguish no superblock to read */
116 0 : *wp_ret = zones[0].start << SECTOR_SHIFT;
117 0 : return -ENOENT;
118 0 : } else if (full[0] && full[1]) {
119 : /* Compare two super blocks */
120 0 : struct address_space *mapping = bdev->bd_inode->i_mapping;
121 0 : struct page *page[BTRFS_NR_SB_LOG_ZONES];
122 0 : struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];
123 0 : int i;
124 :
125 0 : for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
126 0 : u64 zone_end = (zones[i].start + zones[i].capacity) << SECTOR_SHIFT;
127 0 : u64 bytenr = ALIGN_DOWN(zone_end, BTRFS_SUPER_INFO_SIZE) -
128 : BTRFS_SUPER_INFO_SIZE;
129 :
130 0 : page[i] = read_cache_page_gfp(mapping,
131 0 : bytenr >> PAGE_SHIFT, GFP_NOFS);
132 0 : if (IS_ERR(page[i])) {
133 0 : if (i == 1)
134 0 : btrfs_release_disk_super(super[0]);
135 0 : return PTR_ERR(page[i]);
136 : }
137 0 : super[i] = page_address(page[i]);
138 : }
139 :
140 0 : if (btrfs_super_generation(super[0]) >
141 0 : btrfs_super_generation(super[1]))
142 0 : sector = zones[1].start;
143 : else
144 0 : sector = zones[0].start;
145 :
146 0 : for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
147 0 : btrfs_release_disk_super(super[i]);
148 0 : } else if (!full[0] && (empty[1] || full[1])) {
149 0 : sector = zones[0].wp;
150 0 : } else if (full[0]) {
151 0 : sector = zones[1].wp;
152 : } else {
153 : return -EUCLEAN;
154 : }
155 0 : *wp_ret = sector << SECTOR_SHIFT;
156 0 : return 0;
157 : }
158 :
159 : /*
160 : * Get the first zone number of the superblock mirror
161 : */
162 0 : static inline u32 sb_zone_number(int shift, int mirror)
163 : {
164 0 : u64 zone = U64_MAX;
165 :
166 0 : ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
167 0 : switch (mirror) {
168 0 : case 0: zone = 0; break;
169 0 : case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break;
170 0 : case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break;
171 : }
172 :
173 0 : ASSERT(zone <= U32_MAX);
174 :
175 0 : return (u32)zone;
176 : }
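/*
 * Worked example (illustrative numbers): with a 256MiB zone size the shift is
 * 28, BTRFS_SB_LOG_FIRST_SHIFT is const_ilog2(512G) = 39 and
 * BTRFS_SB_LOG_SECOND_SHIFT is const_ilog2(4T) = 42, so mirror 1 starts at
 * zone 1 << (39 - 28) = 2048 (512GiB) and mirror 2 at zone
 * 1 << (42 - 28) = 16384 (4TiB).
 */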
177 :
178 0 : static inline sector_t zone_start_sector(u32 zone_number,
179 : struct block_device *bdev)
180 : {
181 0 : return (sector_t)zone_number << ilog2(bdev_zone_sectors(bdev));
182 : }
183 :
184 : static inline u64 zone_start_physical(u32 zone_number,
185 : struct btrfs_zoned_device_info *zone_info)
186 : {
187 0 : return (u64)zone_number << zone_info->zone_size_shift;
188 : }
189 :
190 : /*
191 : * Emulate blkdev_report_zones() for a non-zoned device. It slices up the block
192 : * device into fixed-size chunks and fakes a conventional zone on each of
193 : * them.
194 : */
195 0 : static int emulate_report_zones(struct btrfs_device *device, u64 pos,
196 : struct blk_zone *zones, unsigned int nr_zones)
197 : {
198 0 : const sector_t zone_sectors = device->fs_info->zone_size >> SECTOR_SHIFT;
199 0 : sector_t bdev_size = bdev_nr_sectors(device->bdev);
200 0 : unsigned int i;
201 :
202 0 : pos >>= SECTOR_SHIFT;
203 0 : for (i = 0; i < nr_zones; i++) {
204 0 : zones[i].start = i * zone_sectors + pos;
205 0 : zones[i].len = zone_sectors;
206 0 : zones[i].capacity = zone_sectors;
207 0 : zones[i].wp = zones[i].start + zone_sectors;
208 0 : zones[i].type = BLK_ZONE_TYPE_CONVENTIONAL;
209 0 : zones[i].cond = BLK_ZONE_COND_NOT_WP;
210 :
211 0 : if (zones[i].wp >= bdev_size) {
212 0 : i++;
213 0 : break;
214 : }
215 : }
216 :
217 0 : return i;
218 : }
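/*
 * Illustrative example: with a 256MiB emulated zone size, pos == 0 and
 * nr_zones >= 4 on a 1GiB non-zoned device, the loop above fills four
 * conventional zones starting at sectors 0, 524288, 1048576 and 1572864,
 * each with len == capacity == 524288 sectors and the write pointer parked
 * at the zone end, and returns 4.
 */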
219 :
220 0 : static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
221 : struct blk_zone *zones, unsigned int *nr_zones)
222 : {
223 0 : struct btrfs_zoned_device_info *zinfo = device->zone_info;
224 0 : int ret;
225 :
226 0 : if (!*nr_zones)
227 : return 0;
228 :
229 0 : if (!bdev_is_zoned(device->bdev)) {
230 0 : ret = emulate_report_zones(device, pos, zones, *nr_zones);
231 0 : *nr_zones = ret;
232 0 : return 0;
233 : }
234 :
235 : /* Check cache */
236 0 : if (zinfo->zone_cache) {
237 0 : unsigned int i;
238 0 : u32 zno;
239 :
240 0 : ASSERT(IS_ALIGNED(pos, zinfo->zone_size));
241 0 : zno = pos >> zinfo->zone_size_shift;
242 : /*
243 : * We cannot report zones beyond the end of the device. So, it is OK
244 : * to cap *nr_zones to the number of zones remaining.
245 : */
246 0 : *nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno);
247 :
248 0 : for (i = 0; i < *nr_zones; i++) {
249 0 : struct blk_zone *zone_info;
250 :
251 0 : zone_info = &zinfo->zone_cache[zno + i];
252 0 : if (!zone_info->len)
253 : break;
254 : }
255 :
256 0 : if (i == *nr_zones) {
257 : /* Cache hit on all the zones */
258 0 : memcpy(zones, zinfo->zone_cache + zno,
259 : sizeof(*zinfo->zone_cache) * *nr_zones);
260 0 : return 0;
261 : }
262 : }
263 :
264 0 : ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
265 : copy_zone_info_cb, zones);
266 0 : if (ret < 0) {
267 0 : btrfs_err_in_rcu(device->fs_info,
268 : "zoned: failed to read zone %llu on %s (devid %llu)",
269 : pos, rcu_str_deref(device->name),
270 : device->devid);
271 0 : return ret;
272 : }
273 0 : *nr_zones = ret;
274 0 : if (!ret)
275 : return -EIO;
276 :
277 : /* Populate cache */
278 0 : if (zinfo->zone_cache) {
279 0 : u32 zno = pos >> zinfo->zone_size_shift;
280 :
281 0 : memcpy(zinfo->zone_cache + zno, zones,
282 : sizeof(*zinfo->zone_cache) * *nr_zones);
283 : }
284 :
285 : return 0;
286 : }
287 :
288 : /* The emulated zone size is determined from the size of the first device extent */
289 0 : static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
290 : {
291 0 : struct btrfs_path *path;
292 0 : struct btrfs_root *root = fs_info->dev_root;
293 0 : struct btrfs_key key;
294 0 : struct extent_buffer *leaf;
295 0 : struct btrfs_dev_extent *dext;
296 0 : int ret = 0;
297 :
298 0 : key.objectid = 1;
299 0 : key.type = BTRFS_DEV_EXTENT_KEY;
300 0 : key.offset = 0;
301 :
302 0 : path = btrfs_alloc_path();
303 0 : if (!path)
304 : return -ENOMEM;
305 :
306 0 : ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
307 0 : if (ret < 0)
308 0 : goto out;
309 :
310 0 : if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
311 0 : ret = btrfs_next_leaf(root, path);
312 0 : if (ret < 0)
313 0 : goto out;
314 : /* No dev extents at all? Not good */
315 0 : if (ret > 0) {
316 0 : ret = -EUCLEAN;
317 0 : goto out;
318 : }
319 : }
320 :
321 0 : leaf = path->nodes[0];
322 0 : dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
323 0 : fs_info->zone_size = btrfs_dev_extent_length(leaf, dext);
324 0 : ret = 0;
325 :
326 0 : out:
327 0 : btrfs_free_path(path);
328 :
329 0 : return ret;
330 : }
331 :
332 0 : int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
333 : {
334 0 : struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
335 0 : struct btrfs_device *device;
336 0 : int ret = 0;
337 :
338 : /* fs_info->zone_size might not be set yet. Use the incompat flag here. */
339 0 : if (!btrfs_fs_incompat(fs_info, ZONED))
340 : return 0;
341 :
342 0 : mutex_lock(&fs_devices->device_list_mutex);
343 0 : list_for_each_entry(device, &fs_devices->devices, dev_list) {
344 : /* We can skip reading of zone info for missing devices */
345 0 : if (!device->bdev)
346 0 : continue;
347 :
348 0 : ret = btrfs_get_dev_zone_info(device, true);
349 0 : if (ret)
350 : break;
351 : }
352 0 : mutex_unlock(&fs_devices->device_list_mutex);
353 :
354 0 : return ret;
355 : }
356 :
357 0 : int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
358 : {
359 0 : struct btrfs_fs_info *fs_info = device->fs_info;
360 0 : struct btrfs_zoned_device_info *zone_info = NULL;
361 0 : struct block_device *bdev = device->bdev;
362 0 : unsigned int max_active_zones;
363 0 : unsigned int nactive;
364 0 : sector_t nr_sectors;
365 0 : sector_t sector = 0;
366 0 : struct blk_zone *zones = NULL;
367 0 : unsigned int i, nreported = 0, nr_zones;
368 0 : sector_t zone_sectors;
369 0 : char *model, *emulated;
370 0 : int ret;
371 :
372 : /*
373 : * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not
374 : * yet be set.
375 : */
376 0 : if (!btrfs_fs_incompat(fs_info, ZONED))
377 : return 0;
378 :
379 0 : if (device->zone_info)
380 : return 0;
381 :
382 0 : zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL);
383 0 : if (!zone_info)
384 : return -ENOMEM;
385 :
386 0 : device->zone_info = zone_info;
387 :
388 0 : if (!bdev_is_zoned(bdev)) {
389 0 : if (!fs_info->zone_size) {
390 0 : ret = calculate_emulated_zone_size(fs_info);
391 0 : if (ret)
392 0 : goto out;
393 : }
394 :
395 0 : ASSERT(fs_info->zone_size);
396 0 : zone_sectors = fs_info->zone_size >> SECTOR_SHIFT;
397 : } else {
398 0 : zone_sectors = bdev_zone_sectors(bdev);
399 : }
400 :
401 0 : ASSERT(is_power_of_two_u64(zone_sectors));
402 0 : zone_info->zone_size = zone_sectors << SECTOR_SHIFT;
403 :
404 : /* We reject devices with a zone size larger than 8GiB */
405 0 : if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) {
406 0 : btrfs_err_in_rcu(fs_info,
407 : "zoned: %s: zone size %llu larger than supported maximum %llu",
408 : rcu_str_deref(device->name),
409 : zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
410 0 : ret = -EINVAL;
411 0 : goto out;
412 0 : } else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) {
413 0 : btrfs_err_in_rcu(fs_info,
414 : "zoned: %s: zone size %llu smaller than supported minimum %u",
415 : rcu_str_deref(device->name),
416 : zone_info->zone_size, BTRFS_MIN_ZONE_SIZE);
417 0 : ret = -EINVAL;
418 0 : goto out;
419 : }
420 :
421 0 : nr_sectors = bdev_nr_sectors(bdev);
422 0 : zone_info->zone_size_shift = ilog2(zone_info->zone_size);
423 0 : zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
424 0 : if (!IS_ALIGNED(nr_sectors, zone_sectors))
425 0 : zone_info->nr_zones++;
426 :
427 0 : max_active_zones = bdev_max_active_zones(bdev);
428 0 : if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
429 0 : btrfs_err_in_rcu(fs_info,
430 : "zoned: %s: max active zones %u is too small, need at least %u active zones",
431 : rcu_str_deref(device->name), max_active_zones,
432 : BTRFS_MIN_ACTIVE_ZONES);
433 0 : ret = -EINVAL;
434 0 : goto out;
435 : }
436 0 : zone_info->max_active_zones = max_active_zones;
437 :
438 0 : zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
439 0 : if (!zone_info->seq_zones) {
440 0 : ret = -ENOMEM;
441 0 : goto out;
442 : }
443 :
444 0 : zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
445 0 : if (!zone_info->empty_zones) {
446 0 : ret = -ENOMEM;
447 0 : goto out;
448 : }
449 :
450 0 : zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
451 0 : if (!zone_info->active_zones) {
452 0 : ret = -ENOMEM;
453 0 : goto out;
454 : }
455 :
456 0 : zones = kvcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
457 0 : if (!zones) {
458 0 : ret = -ENOMEM;
459 0 : goto out;
460 : }
461 :
462 : /*
463 : * Enable zone cache only for a zoned device. On a non-zoned device, we
464 : * fill the zone info with emulated CONVENTIONAL zones, so no need to
465 : * use the cache.
466 : */
467 0 : if (populate_cache && bdev_is_zoned(device->bdev)) {
468 0 : zone_info->zone_cache = vzalloc(sizeof(struct blk_zone) *
469 0 : zone_info->nr_zones);
470 0 : if (!zone_info->zone_cache) {
471 0 : btrfs_err_in_rcu(device->fs_info,
472 : "zoned: failed to allocate zone cache for %s",
473 : rcu_str_deref(device->name));
474 0 : ret = -ENOMEM;
475 0 : goto out;
476 : }
477 : }
478 :
479 : /* Get the zone types */
480 : nactive = 0;
481 0 : while (sector < nr_sectors) {
482 0 : nr_zones = BTRFS_REPORT_NR_ZONES;
483 0 : ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,
484 : &nr_zones);
485 0 : if (ret)
486 0 : goto out;
487 :
488 0 : for (i = 0; i < nr_zones; i++) {
489 0 : if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
490 0 : __set_bit(nreported, zone_info->seq_zones);
491 0 : switch (zones[i].cond) {
492 0 : case BLK_ZONE_COND_EMPTY:
493 0 : __set_bit(nreported, zone_info->empty_zones);
494 : break;
495 0 : case BLK_ZONE_COND_IMP_OPEN:
496 : case BLK_ZONE_COND_EXP_OPEN:
497 : case BLK_ZONE_COND_CLOSED:
498 0 : __set_bit(nreported, zone_info->active_zones);
499 0 : nactive++;
500 0 : break;
501 : }
502 0 : nreported++;
503 : }
504 0 : sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
505 : }
506 :
507 0 : if (nreported != zone_info->nr_zones) {
508 0 : btrfs_err_in_rcu(device->fs_info,
509 : "inconsistent number of zones on %s (%u/%u)",
510 : rcu_str_deref(device->name), nreported,
511 : zone_info->nr_zones);
512 0 : ret = -EIO;
513 0 : goto out;
514 : }
515 :
516 0 : if (max_active_zones) {
517 0 : if (nactive > max_active_zones) {
518 0 : btrfs_err_in_rcu(device->fs_info,
519 : "zoned: %u active zones on %s exceeds max_active_zones %u",
520 : nactive, rcu_str_deref(device->name),
521 : max_active_zones);
522 0 : ret = -EIO;
523 0 : goto out;
524 : }
525 0 : atomic_set(&zone_info->active_zones_left,
526 0 : max_active_zones - nactive);
527 0 : set_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags);
528 : }
529 :
530 : /* Validate superblock log */
531 0 : nr_zones = BTRFS_NR_SB_LOG_ZONES;
532 0 : for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
533 0 : u32 sb_zone;
534 0 : u64 sb_wp;
535 0 : int sb_pos = BTRFS_NR_SB_LOG_ZONES * i;
536 :
537 0 : sb_zone = sb_zone_number(zone_info->zone_size_shift, i);
538 0 : if (sb_zone + 1 >= zone_info->nr_zones)
539 0 : continue;
540 :
541 0 : ret = btrfs_get_dev_zones(device,
542 : zone_start_physical(sb_zone, zone_info),
543 : &zone_info->sb_zones[sb_pos],
544 : &nr_zones);
545 0 : if (ret)
546 0 : goto out;
547 :
548 0 : if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
549 0 : btrfs_err_in_rcu(device->fs_info,
550 : "zoned: failed to read super block log zone info at devid %llu zone %u",
551 : device->devid, sb_zone);
552 0 : ret = -EUCLEAN;
553 0 : goto out;
554 : }
555 :
556 : /*
557 : * If zones[0] is conventional, always use the beginning of the
558 : * zone to record the superblock. No need to validate in that case.
559 : */
560 0 : if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type ==
561 : BLK_ZONE_TYPE_CONVENTIONAL)
562 0 : continue;
563 :
564 0 : ret = sb_write_pointer(device->bdev,
565 : &zone_info->sb_zones[sb_pos], &sb_wp);
566 0 : if (ret != -ENOENT && ret) {
567 0 : btrfs_err_in_rcu(device->fs_info,
568 : "zoned: super block log zone corrupted devid %llu zone %u",
569 : device->devid, sb_zone);
570 0 : ret = -EUCLEAN;
571 0 : goto out;
572 : }
573 : }
574 :
575 :
576 0 : kvfree(zones);
577 :
578 0 : switch (bdev_zoned_model(bdev)) {
579 : case BLK_ZONED_HM:
580 : model = "host-managed zoned";
581 : emulated = "";
582 : break;
583 : case BLK_ZONED_HA:
584 : model = "host-aware zoned";
585 : emulated = "";
586 : break;
587 : case BLK_ZONED_NONE:
588 : model = "regular";
589 : emulated = "emulated ";
590 : break;
591 : default:
592 : /* Just in case */
593 0 : btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s",
594 : bdev_zoned_model(bdev),
595 : rcu_str_deref(device->name));
596 0 : ret = -EOPNOTSUPP;
597 0 : goto out_free_zone_info;
598 : }
599 :
600 0 : btrfs_info_in_rcu(fs_info,
601 : "%s block device %s, %u %szones of %llu bytes",
602 : model, rcu_str_deref(device->name), zone_info->nr_zones,
603 : emulated, zone_info->zone_size);
604 :
605 0 : return 0;
606 :
607 0 : out:
608 0 : kvfree(zones);
609 0 : out_free_zone_info:
610 0 : btrfs_destroy_dev_zone_info(device);
611 :
612 0 : return ret;
613 : }
614 :
615 0 : void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
616 : {
617 0 : struct btrfs_zoned_device_info *zone_info = device->zone_info;
618 :
619 0 : if (!zone_info)
620 : return;
621 :
622 0 : bitmap_free(zone_info->active_zones);
623 0 : bitmap_free(zone_info->seq_zones);
624 0 : bitmap_free(zone_info->empty_zones);
625 0 : vfree(zone_info->zone_cache);
626 0 : kfree(zone_info);
627 0 : device->zone_info = NULL;
628 : }
629 :
630 0 : struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev)
631 : {
632 0 : struct btrfs_zoned_device_info *zone_info;
633 :
634 0 : zone_info = kmemdup(orig_dev->zone_info, sizeof(*zone_info), GFP_KERNEL);
635 0 : if (!zone_info)
636 : return NULL;
637 :
638 0 : zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
639 0 : if (!zone_info->seq_zones)
640 0 : goto out;
641 :
642 0 : bitmap_copy(zone_info->seq_zones, orig_dev->zone_info->seq_zones,
643 : zone_info->nr_zones);
644 :
645 0 : zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
646 0 : if (!zone_info->empty_zones)
647 0 : goto out;
648 :
649 0 : bitmap_copy(zone_info->empty_zones, orig_dev->zone_info->empty_zones,
650 : zone_info->nr_zones);
651 :
652 0 : zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
653 0 : if (!zone_info->active_zones)
654 0 : goto out;
655 :
656 0 : bitmap_copy(zone_info->active_zones, orig_dev->zone_info->active_zones,
657 : zone_info->nr_zones);
658 0 : zone_info->zone_cache = NULL;
659 :
660 0 : return zone_info;
661 :
662 0 : out:
663 0 : bitmap_free(zone_info->seq_zones);
664 0 : bitmap_free(zone_info->empty_zones);
665 0 : bitmap_free(zone_info->active_zones);
666 0 : kfree(zone_info);
667 0 : return NULL;
668 : }
669 :
670 0 : int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
671 : struct blk_zone *zone)
672 : {
673 0 : unsigned int nr_zones = 1;
674 0 : int ret;
675 :
676 0 : ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones);
677 0 : if (ret != 0 || !nr_zones)
678 0 : return ret ? ret : -EIO;
679 :
680 : return 0;
681 : }
682 :
683 0 : static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info)
684 : {
685 0 : struct btrfs_device *device;
686 :
687 0 : list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
688 0 : if (device->bdev &&
689 : bdev_zoned_model(device->bdev) == BLK_ZONED_HM) {
690 0 : btrfs_err(fs_info,
691 : "zoned: mode not enabled but zoned device found: %pg",
692 : device->bdev);
693 0 : return -EINVAL;
694 : }
695 : }
696 :
697 : return 0;
698 : }
699 :
700 0 : int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
701 : {
702 0 : struct queue_limits *lim = &fs_info->limits;
703 0 : struct btrfs_device *device;
704 0 : u64 zone_size = 0;
705 0 : int ret;
706 :
707 : /*
708 : * Host-Managed devices can't be used without the ZONED flag. With the
709 : * ZONED flag, all devices can be used, using zone emulation if required.
710 : */
711 0 : if (!btrfs_fs_incompat(fs_info, ZONED))
712 0 : return btrfs_check_for_zoned_device(fs_info);
713 :
714 0 : blk_set_stacking_limits(lim);
715 :
716 0 : list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
717 0 : struct btrfs_zoned_device_info *zone_info = device->zone_info;
718 :
719 0 : if (!device->bdev)
720 0 : continue;
721 :
722 0 : if (!zone_size) {
723 0 : zone_size = zone_info->zone_size;
724 0 : } else if (zone_info->zone_size != zone_size) {
725 0 : btrfs_err(fs_info,
726 : "zoned: unequal block device zone sizes: have %llu found %llu",
727 : zone_info->zone_size, zone_size);
728 0 : return -EINVAL;
729 : }
730 :
731 : /*
732 : * With zoned emulation, we can have a non-zoned device in zoned
733 : * mode. In this case, we don't have a valid max zone
734 : * append size.
735 : */
736 0 : if (bdev_is_zoned(device->bdev)) {
737 0 : blk_stack_limits(lim,
738 : &bdev_get_queue(device->bdev)->limits,
739 : 0);
740 : }
741 : }
742 :
743 : /*
744 : * stripe_size is always aligned to BTRFS_STRIPE_LEN in
745 : * btrfs_create_chunk(). Since we want stripe_len == zone_size,
746 : * check the alignment here.
747 : */
748 0 : if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
749 0 : btrfs_err(fs_info,
750 : "zoned: zone size %llu not aligned to stripe %u",
751 : zone_size, BTRFS_STRIPE_LEN);
752 0 : return -EINVAL;
753 : }
754 :
755 0 : if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
756 0 : btrfs_err(fs_info, "zoned: mixed block groups not supported");
757 0 : return -EINVAL;
758 : }
759 :
760 0 : fs_info->zone_size = zone_size;
761 : /*
762 : * Also limit max_zone_append_size by max_segments * PAGE_SIZE.
763 : * Technically, we can have multiple pages per segment. But, since
764 : * we add the pages one by one to a bio, and cannot increase the
765 : * metadata reservation even if it increases the number of extents, it
766 : * is safe to stick with the limit.
767 : */
768 0 : fs_info->max_zone_append_size = ALIGN_DOWN(
769 : min3((u64)lim->max_zone_append_sectors << SECTOR_SHIFT,
770 : (u64)lim->max_sectors << SECTOR_SHIFT,
771 : (u64)lim->max_segments << PAGE_SHIFT),
772 : fs_info->sectorsize);
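	/*
	 * Illustrative numbers for the computation above: with
	 * max_zone_append_sectors = 1024 (512KiB), max_sectors = 2560
	 * (1280KiB) and max_segments = 128 (512KiB with 4KiB pages),
	 * max_zone_append_size becomes 512KiB, already aligned to a 4KiB
	 * sectorsize.
	 */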
773 0 : fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
774 0 : if (fs_info->max_zone_append_size < fs_info->max_extent_size)
775 0 : fs_info->max_extent_size = fs_info->max_zone_append_size;
776 :
777 : /*
778 : * Check mount options here, because we might change fs_info->zoned
779 : * from fs_info->zone_size.
780 : */
781 0 : ret = btrfs_check_mountopts_zoned(fs_info);
782 0 : if (ret)
783 : return ret;
784 :
785 0 : btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);
786 0 : return 0;
787 : }
788 :
789 0 : int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
790 : {
791 0 : if (!btrfs_is_zoned(info))
792 : return 0;
793 :
794 : /*
795 : * Space cache writing is not COWed. Disable that to avoid write errors
796 : * in sequential zones.
797 : */
798 0 : if (btrfs_test_opt(info, SPACE_CACHE)) {
799 0 : btrfs_err(info, "zoned: space cache v1 is not supported");
800 0 : return -EINVAL;
801 : }
802 :
803 0 : if (btrfs_test_opt(info, NODATACOW)) {
804 0 : btrfs_err(info, "zoned: NODATACOW not supported");
805 0 : return -EINVAL;
806 : }
807 :
808 0 : btrfs_clear_and_info(info, DISCARD_ASYNC,
809 : "zoned: async discard ignored and disabled for zoned mode");
810 :
811 0 : return 0;
812 : }
813 :
814 0 : static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
815 : int rw, u64 *bytenr_ret)
816 : {
817 0 : u64 wp;
818 0 : int ret;
819 :
820 0 : if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) {
821 0 : *bytenr_ret = zones[0].start << SECTOR_SHIFT;
822 0 : return 0;
823 : }
824 :
825 0 : ret = sb_write_pointer(bdev, zones, &wp);
826 0 : if (ret != -ENOENT && ret < 0)
827 : return ret;
828 :
829 0 : if (rw == WRITE) {
830 0 : struct blk_zone *reset = NULL;
831 :
832 0 : if (wp == zones[0].start << SECTOR_SHIFT)
833 : reset = &zones[0];
834 0 : else if (wp == zones[1].start << SECTOR_SHIFT)
835 0 : reset = &zones[1];
836 :
837 0 : if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
838 0 : ASSERT(sb_zone_is_full(reset));
839 :
840 0 : ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
841 : reset->start, reset->len,
842 : GFP_NOFS);
843 0 : if (ret)
844 : return ret;
845 :
846 0 : reset->cond = BLK_ZONE_COND_EMPTY;
847 0 : reset->wp = reset->start;
848 : }
849 0 : } else if (ret != -ENOENT) {
850 : /*
851 : * For READ, we want the previously written superblock. Move the write
852 : * pointer to the end of the other zone if it is at the start of a zone.
853 : */
854 0 : u64 zone_end = 0;
855 :
856 0 : if (wp == zones[0].start << SECTOR_SHIFT)
857 0 : zone_end = zones[1].start + zones[1].capacity;
858 0 : else if (wp == zones[1].start << SECTOR_SHIFT)
859 0 : zone_end = zones[0].start + zones[0].capacity;
860 0 : if (zone_end)
861 0 : wp = ALIGN_DOWN(zone_end << SECTOR_SHIFT,
862 : BTRFS_SUPER_INFO_SIZE);
863 :
864 0 : wp -= BTRFS_SUPER_INFO_SIZE;
865 : }
866 :
867 0 : *bytenr_ret = wp;
868 0 : return 0;
869 :
870 : }
871 :
872 4 : int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
873 : u64 *bytenr_ret)
874 : {
875 4 : struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES];
876 4 : sector_t zone_sectors;
877 4 : u32 sb_zone;
878 4 : int ret;
879 4 : u8 zone_sectors_shift;
880 4 : sector_t nr_sectors;
881 4 : u32 nr_zones;
882 :
883 4 : if (!bdev_is_zoned(bdev)) {
884 4 : *bytenr_ret = btrfs_sb_offset(mirror);
885 4 : return 0;
886 : }
887 :
888 0 : ASSERT(rw == READ || rw == WRITE);
889 :
890 0 : zone_sectors = bdev_zone_sectors(bdev);
891 0 : if (!is_power_of_2(zone_sectors))
892 : return -EINVAL;
893 0 : zone_sectors_shift = ilog2(zone_sectors);
894 0 : nr_sectors = bdev_nr_sectors(bdev);
895 0 : nr_zones = nr_sectors >> zone_sectors_shift;
896 :
897 0 : sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
898 0 : if (sb_zone + 1 >= nr_zones)
899 : return -ENOENT;
900 :
901 0 : ret = blkdev_report_zones(bdev, zone_start_sector(sb_zone, bdev),
902 : BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
903 : zones);
904 0 : if (ret < 0)
905 : return ret;
906 0 : if (ret != BTRFS_NR_SB_LOG_ZONES)
907 : return -EIO;
908 :
909 0 : return sb_log_location(bdev, zones, rw, bytenr_ret);
910 : }
911 :
912 0 : int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
913 : u64 *bytenr_ret)
914 : {
915 0 : struct btrfs_zoned_device_info *zinfo = device->zone_info;
916 0 : u32 zone_num;
917 :
918 : /*
919 : * For a zoned filesystem on a non-zoned block device, use the same
920 : * super block locations as a regular filesystem. This way, the super
921 : * block can always be retrieved and the zoned flag of the volume
922 : * detected from the super block information.
923 : */
924 0 : if (!bdev_is_zoned(device->bdev)) {
925 0 : *bytenr_ret = btrfs_sb_offset(mirror);
926 0 : return 0;
927 : }
928 :
929 0 : zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
930 0 : if (zone_num + 1 >= zinfo->nr_zones)
931 : return -ENOENT;
932 :
933 0 : return sb_log_location(device->bdev,
934 0 : &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror],
935 : rw, bytenr_ret);
936 : }
937 :
938 0 : static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo,
939 : int mirror)
940 : {
941 0 : u32 zone_num;
942 :
943 0 : if (!zinfo)
944 : return false;
945 :
946 0 : zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
947 0 : if (zone_num + 1 >= zinfo->nr_zones)
948 : return false;
949 :
950 0 : if (!test_bit(zone_num, zinfo->seq_zones))
951 0 : return false;
952 :
953 : return true;
954 : }
955 :
956 0 : int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
957 : {
958 0 : struct btrfs_zoned_device_info *zinfo = device->zone_info;
959 0 : struct blk_zone *zone;
960 0 : int i;
961 :
962 0 : if (!is_sb_log_zone(zinfo, mirror))
963 : return 0;
964 :
965 0 : zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];
966 0 : for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
967 : /* Advance to the next zone */
968 0 : if (zone->cond == BLK_ZONE_COND_FULL) {
969 0 : zone++;
970 0 : continue;
971 : }
972 :
973 0 : if (zone->cond == BLK_ZONE_COND_EMPTY)
974 0 : zone->cond = BLK_ZONE_COND_IMP_OPEN;
975 :
976 0 : zone->wp += SUPER_INFO_SECTORS;
977 :
978 0 : if (sb_zone_is_full(zone)) {
979 : /*
980 : * No room left to write a new superblock. Since the
981 : * superblock is written with REQ_SYNC, it is safe to
982 : * finish the zone now.
983 : *
984 : * If the write pointer is exactly at the capacity,
985 : * explicit ZONE_FINISH is not necessary.
986 : */
987 0 : if (zone->wp != zone->start + zone->capacity) {
988 0 : int ret;
989 :
990 0 : ret = blkdev_zone_mgmt(device->bdev,
991 : REQ_OP_ZONE_FINISH, zone->start,
992 : zone->len, GFP_NOFS);
993 0 : if (ret)
994 : return ret;
995 : }
996 :
997 0 : zone->wp = zone->start + zone->len;
998 0 : zone->cond = BLK_ZONE_COND_FULL;
999 : }
1000 : return 0;
1001 : }
1002 :
1003 : /* All the zones are FULL. Should not reach here. */
1004 : ASSERT(0);
1005 : return -EIO;
1006 : }
1007 :
1008 0 : int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
1009 : {
1010 0 : sector_t zone_sectors;
1011 0 : sector_t nr_sectors;
1012 0 : u8 zone_sectors_shift;
1013 0 : u32 sb_zone;
1014 0 : u32 nr_zones;
1015 :
1016 0 : zone_sectors = bdev_zone_sectors(bdev);
1017 0 : zone_sectors_shift = ilog2(zone_sectors);
1018 0 : nr_sectors = bdev_nr_sectors(bdev);
1019 0 : nr_zones = nr_sectors >> zone_sectors_shift;
1020 :
1021 0 : sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
1022 0 : if (sb_zone + 1 >= nr_zones)
1023 : return -ENOENT;
1024 :
1025 0 : return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
1026 : zone_start_sector(sb_zone, bdev),
1027 : zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
1028 : }
1029 :
1030 : /*
1031 : * Find allocatable zones within a given region.
1032 : *
1033 : * @device: the device to allocate a region on
1034 : * @hole_start: the start of the hole in which to allocate the region
1035 : * @num_bytes: size of the wanted region
1036 : * @hole_end: the end of the hole
1037 : * @return: the start position of an allocatable region
1038 : *
1039 : * The allocatable region must not contain any superblock locations.
1040 : */
1041 0 : u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
1042 : u64 hole_end, u64 num_bytes)
1043 : {
1044 0 : struct btrfs_zoned_device_info *zinfo = device->zone_info;
1045 0 : const u8 shift = zinfo->zone_size_shift;
1046 0 : u64 nzones = num_bytes >> shift;
1047 0 : u64 pos = hole_start;
1048 0 : u64 begin, end;
1049 0 : bool have_sb;
1050 0 : int i;
1051 :
1052 0 : ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
1053 0 : ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));
1054 :
1055 0 : while (pos < hole_end) {
1056 0 : begin = pos >> shift;
1057 0 : end = begin + nzones;
1058 :
1059 0 : if (end > zinfo->nr_zones)
1060 : return hole_end;
1061 :
1062 : /* Check if zones in the region are all empty */
1063 0 : if (btrfs_dev_is_sequential(device, pos) &&
1064 0 : !bitmap_test_range_all_set(zinfo->empty_zones, begin, nzones)) {
1065 0 : pos += zinfo->zone_size;
1066 0 : continue;
1067 : }
1068 :
1069 : have_sb = false;
1070 0 : for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
1071 0 : u32 sb_zone;
1072 0 : u64 sb_pos;
1073 :
1074 0 : sb_zone = sb_zone_number(shift, i);
1075 0 : if (!(end <= sb_zone ||
1076 0 : sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) {
1077 0 : have_sb = true;
1078 0 : pos = zone_start_physical(
1079 : sb_zone + BTRFS_NR_SB_LOG_ZONES, zinfo);
1080 0 : break;
1081 : }
1082 :
1083 : /* We also need to exclude regular superblock positions */
1084 0 : sb_pos = btrfs_sb_offset(i);
1085 0 : if (!(pos + num_bytes <= sb_pos ||
1086 0 : sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) {
1087 0 : have_sb = true;
1088 0 : pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE,
1089 : zinfo->zone_size);
1090 0 : break;
1091 : }
1092 : }
1093 0 : if (!have_sb)
1094 : break;
1095 : }
1096 :
1097 : return pos;
1098 : }
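/*
 * Illustrative example: with a 256MiB zone size, a candidate region whose
 * zones overlap superblock mirror 0 (log zones 0 and 1) is pushed past them,
 * i.e. pos becomes zone_start_physical(2, zinfo) = 512MiB, and the search
 * continues from there.
 */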
1099 :
1100 0 : static bool btrfs_dev_set_active_zone(struct btrfs_device *device, u64 pos)
1101 : {
1102 0 : struct btrfs_zoned_device_info *zone_info = device->zone_info;
1103 0 : unsigned int zno = (pos >> zone_info->zone_size_shift);
1104 :
1105 : /* We can use any number of zones */
1106 0 : if (zone_info->max_active_zones == 0)
1107 : return true;
1108 :
1109 0 : if (!test_bit(zno, zone_info->active_zones)) {
1110 : /* Active zone left? */
1111 0 : if (atomic_dec_if_positive(&zone_info->active_zones_left) < 0)
1112 : return false;
1113 0 : if (test_and_set_bit(zno, zone_info->active_zones)) {
1114 : /* Someone already set the bit */
1115 0 : atomic_inc(&zone_info->active_zones_left);
1116 : }
1117 : }
1118 :
1119 : return true;
1120 : }
1121 :
1122 0 : static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos)
1123 : {
1124 0 : struct btrfs_zoned_device_info *zone_info = device->zone_info;
1125 0 : unsigned int zno = (pos >> zone_info->zone_size_shift);
1126 :
1127 : /* We can use any number of zones */
1128 0 : if (zone_info->max_active_zones == 0)
1129 : return;
1130 :
1131 0 : if (test_and_clear_bit(zno, zone_info->active_zones))
1132 0 : atomic_inc(&zone_info->active_zones_left);
1133 : }
1134 :
1135 0 : int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
1136 : u64 length, u64 *bytes)
1137 : {
1138 0 : int ret;
1139 :
1140 0 : *bytes = 0;
1141 0 : ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
1142 : physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT,
1143 : GFP_NOFS);
1144 0 : if (ret)
1145 : return ret;
1146 :
1147 0 : *bytes = length;
1148 0 : while (length) {
1149 0 : btrfs_dev_set_zone_empty(device, physical);
1150 0 : btrfs_dev_clear_active_zone(device, physical);
1151 0 : physical += device->zone_info->zone_size;
1152 0 : length -= device->zone_info->zone_size;
1153 : }
1154 :
1155 : return 0;
1156 : }
1157 :
1158 0 : int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
1159 : {
1160 0 : struct btrfs_zoned_device_info *zinfo = device->zone_info;
1161 0 : const u8 shift = zinfo->zone_size_shift;
1162 0 : unsigned long begin = start >> shift;
1163 0 : unsigned long nbits = size >> shift;
1164 0 : u64 pos;
1165 0 : int ret;
1166 :
1167 0 : ASSERT(IS_ALIGNED(start, zinfo->zone_size));
1168 0 : ASSERT(IS_ALIGNED(size, zinfo->zone_size));
1169 :
1170 0 : if (begin + nbits > zinfo->nr_zones)
1171 : return -ERANGE;
1172 :
1173 : /* All the zones are conventional */
1174 0 : if (bitmap_test_range_all_zero(zinfo->seq_zones, begin, nbits))
1175 : return 0;
1176 :
1177 : /* All the zones are sequential and empty */
1178 0 : if (bitmap_test_range_all_set(zinfo->seq_zones, begin, nbits) &&
1179 0 : bitmap_test_range_all_set(zinfo->empty_zones, begin, nbits))
1180 : return 0;
1181 :
1182 0 : for (pos = start; pos < start + size; pos += zinfo->zone_size) {
1183 0 : u64 reset_bytes;
1184 :
1185 0 : if (!btrfs_dev_is_sequential(device, pos) ||
1186 0 : btrfs_dev_is_empty_zone(device, pos))
1187 0 : continue;
1188 :
1189 : /* Free regions should be empty */
1190 0 : btrfs_warn_in_rcu(
1191 : device->fs_info,
1192 : "zoned: resetting device %s (devid %llu) zone %llu for allocation",
1193 : rcu_str_deref(device->name), device->devid, pos >> shift);
1194 0 : WARN_ON_ONCE(1);
1195 :
1196 0 : ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,
1197 : &reset_bytes);
1198 0 : if (ret)
1199 0 : return ret;
1200 : }
1201 :
1202 : return 0;
1203 : }
1204 :
1205 : /*
1206 : * Calculate an allocation pointer from the extent allocation information
1207 : * for a block group consisting of conventional zones. It points to the
1208 : * end of the highest addressed extent in the block group as the
1209 : * allocation offset.
1210 : */
1211 0 : static int calculate_alloc_pointer(struct btrfs_block_group *cache,
1212 : u64 *offset_ret, bool new)
1213 : {
1214 0 : struct btrfs_fs_info *fs_info = cache->fs_info;
1215 0 : struct btrfs_root *root;
1216 0 : struct btrfs_path *path;
1217 0 : struct btrfs_key key;
1218 0 : struct btrfs_key found_key;
1219 0 : int ret;
1220 0 : u64 length;
1221 :
1222 : /*
1223 : * Avoid tree lookups for a new block group, there's no use for it.
1224 : * The allocation offset must always be 0.
1225 : *
1226 : * Also, we have a lock chain of extent buffer lock -> chunk mutex.
1227 : * For a new block group, this function is called from
1228 : * btrfs_make_block_group() which is already taking the chunk mutex.
1229 : * Thus, we cannot call calculate_alloc_pointer() which takes extent
1230 : * buffer locks to avoid deadlock.
1231 : */
1232 0 : if (new) {
1233 0 : *offset_ret = 0;
1234 0 : return 0;
1235 : }
1236 :
1237 0 : path = btrfs_alloc_path();
1238 0 : if (!path)
1239 : return -ENOMEM;
1240 :
1241 0 : key.objectid = cache->start + cache->length;
1242 0 : key.type = 0;
1243 0 : key.offset = 0;
1244 :
1245 0 : root = btrfs_extent_root(fs_info, key.objectid);
1246 0 : ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1247 : /* We should not find the exact match */
1248 0 : if (!ret)
1249 : ret = -EUCLEAN;
1250 0 : if (ret < 0)
1251 0 : goto out;
1252 :
1253 0 : ret = btrfs_previous_extent_item(root, path, cache->start);
1254 0 : if (ret) {
1255 0 : if (ret == 1) {
1256 0 : ret = 0;
1257 0 : *offset_ret = 0;
1258 : }
1259 0 : goto out;
1260 : }
1261 :
1262 0 : btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
1263 :
1264 0 : if (found_key.type == BTRFS_EXTENT_ITEM_KEY)
1265 0 : length = found_key.offset;
1266 : else
1267 0 : length = fs_info->nodesize;
1268 :
1269 0 : if (!(found_key.objectid >= cache->start &&
1270 0 : found_key.objectid + length <= cache->start + cache->length)) {
1271 0 : ret = -EUCLEAN;
1272 0 : goto out;
1273 : }
1274 0 : *offset_ret = found_key.objectid + length - cache->start;
1275 0 : ret = 0;
1276 :
1277 0 : out:
1278 0 : btrfs_free_path(path);
1279 0 : return ret;
1280 : }
1281 :
1282 0 : int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
1283 : {
1284 0 : struct btrfs_fs_info *fs_info = cache->fs_info;
1285 0 : struct extent_map_tree *em_tree = &fs_info->mapping_tree;
1286 0 : struct extent_map *em;
1287 0 : struct map_lookup *map;
1288 0 : struct btrfs_device *device;
1289 0 : u64 logical = cache->start;
1290 0 : u64 length = cache->length;
1291 0 : int ret;
1292 0 : int i;
1293 0 : unsigned int nofs_flag;
1294 0 : u64 *alloc_offsets = NULL;
1295 0 : u64 *caps = NULL;
1296 0 : u64 *physical = NULL;
1297 0 : unsigned long *active = NULL;
1298 0 : u64 last_alloc = 0;
1299 0 : u32 num_sequential = 0, num_conventional = 0;
1300 :
1301 0 : if (!btrfs_is_zoned(fs_info))
1302 : return 0;
1303 :
1304 : /* Sanity check */
1305 0 : if (!IS_ALIGNED(length, fs_info->zone_size)) {
1306 0 : btrfs_err(fs_info,
1307 : "zoned: block group %llu len %llu unaligned to zone size %llu",
1308 : logical, length, fs_info->zone_size);
1309 0 : return -EIO;
1310 : }
1311 :
1312 : /* Get the chunk mapping */
1313 0 : read_lock(&em_tree->lock);
1314 0 : em = lookup_extent_mapping(em_tree, logical, length);
1315 0 : read_unlock(&em_tree->lock);
1316 :
1317 0 : if (!em)
1318 : return -EINVAL;
1319 :
1320 0 : map = em->map_lookup;
1321 :
1322 0 : cache->physical_map = kmemdup(map, map_lookup_size(map->num_stripes), GFP_NOFS);
1323 0 : if (!cache->physical_map) {
1324 0 : ret = -ENOMEM;
1325 0 : goto out;
1326 : }
1327 :
1328 0 : alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS);
1329 0 : if (!alloc_offsets) {
1330 0 : ret = -ENOMEM;
1331 0 : goto out;
1332 : }
1333 :
1334 0 : caps = kcalloc(map->num_stripes, sizeof(*caps), GFP_NOFS);
1335 0 : if (!caps) {
1336 0 : ret = -ENOMEM;
1337 0 : goto out;
1338 : }
1339 :
1340 0 : physical = kcalloc(map->num_stripes, sizeof(*physical), GFP_NOFS);
1341 0 : if (!physical) {
1342 0 : ret = -ENOMEM;
1343 0 : goto out;
1344 : }
1345 :
1346 0 : active = bitmap_zalloc(map->num_stripes, GFP_NOFS);
1347 0 : if (!active) {
1348 0 : ret = -ENOMEM;
1349 0 : goto out;
1350 : }
1351 :
1352 0 : for (i = 0; i < map->num_stripes; i++) {
1353 0 : bool is_sequential;
1354 0 : struct blk_zone zone;
1355 0 : struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
1356 0 : int dev_replace_is_ongoing = 0;
1357 :
1358 0 : device = map->stripes[i].dev;
1359 0 : physical[i] = map->stripes[i].physical;
1360 :
1361 0 : if (device->bdev == NULL) {
1362 0 : alloc_offsets[i] = WP_MISSING_DEV;
1363 0 : continue;
1364 : }
1365 :
1366 0 : is_sequential = btrfs_dev_is_sequential(device, physical[i]);
1367 0 : if (is_sequential)
1368 0 : num_sequential++;
1369 : else
1370 0 : num_conventional++;
1371 :
1372 : /*
1373 : * Consider a zone as active if we can allow any number of
1374 : * active zones.
1375 : */
1376 0 : if (!device->zone_info->max_active_zones)
1377 0 : __set_bit(i, active);
1378 :
1379 0 : if (!is_sequential) {
1380 0 : alloc_offsets[i] = WP_CONVENTIONAL;
1381 0 : continue;
1382 : }
1383 :
1384 : /*
1385 : * This zone will be used for allocation, so mark this zone
1386 : * non-empty.
1387 : */
1388 0 : btrfs_dev_clear_zone_empty(device, physical[i]);
1389 :
1390 0 : down_read(&dev_replace->rwsem);
1391 0 : dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
1392 0 : if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
1393 0 : btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical[i]);
1394 0 : up_read(&dev_replace->rwsem);
1395 :
1396 : /*
1397 : * The group is mapped to a sequential zone. Get the zone write
1398 : * pointer to determine the allocation offset within the zone.
1399 : */
1400 0 : WARN_ON(!IS_ALIGNED(physical[i], fs_info->zone_size));
1401 0 : nofs_flag = memalloc_nofs_save();
1402 0 : ret = btrfs_get_dev_zone(device, physical[i], &zone);
1403 0 : memalloc_nofs_restore(nofs_flag);
1404 0 : if (ret == -EIO || ret == -EOPNOTSUPP) {
1405 0 : ret = 0;
1406 0 : alloc_offsets[i] = WP_MISSING_DEV;
1407 0 : continue;
1408 0 : } else if (ret) {
1409 0 : goto out;
1410 : }
1411 :
1412 0 : if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
1413 0 : btrfs_err_in_rcu(fs_info,
1414 : "zoned: unexpected conventional zone %llu on device %s (devid %llu)",
1415 : zone.start << SECTOR_SHIFT,
1416 : rcu_str_deref(device->name), device->devid);
1417 0 : ret = -EIO;
1418 0 : goto out;
1419 : }
1420 :
1421 0 : caps[i] = (zone.capacity << SECTOR_SHIFT);
1422 :
1423 0 : switch (zone.cond) {
1424 0 : case BLK_ZONE_COND_OFFLINE:
1425 : case BLK_ZONE_COND_READONLY:
1426 0 : btrfs_err(fs_info,
1427 : "zoned: offline/readonly zone %llu on device %s (devid %llu)",
1428 : physical[i] >> device->zone_info->zone_size_shift,
1429 : rcu_str_deref(device->name), device->devid);
1430 0 : alloc_offsets[i] = WP_MISSING_DEV;
1431 0 : break;
1432 0 : case BLK_ZONE_COND_EMPTY:
1433 0 : alloc_offsets[i] = 0;
1434 0 : break;
1435 0 : case BLK_ZONE_COND_FULL:
1436 0 : alloc_offsets[i] = caps[i];
1437 0 : break;
1438 0 : default:
1439 : /* Partially used zone */
1440 0 : alloc_offsets[i] =
1441 0 : ((zone.wp - zone.start) << SECTOR_SHIFT);
1442 0 : __set_bit(i, active);
1443 : break;
1444 : }
1445 : }
1446 :
1447 0 : if (num_sequential > 0)
1448 0 : set_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags);
1449 :
1450 0 : if (num_conventional > 0) {
1451 : /* Zone capacity is always the zone size in emulation */
1452 0 : cache->zone_capacity = cache->length;
1453 0 : ret = calculate_alloc_pointer(cache, &last_alloc, new);
1454 0 : if (ret) {
1455 0 : btrfs_err(fs_info,
1456 : "zoned: failed to determine allocation offset of bg %llu",
1457 : cache->start);
1458 0 : goto out;
1459 0 : } else if (map->num_stripes == num_conventional) {
1460 0 : cache->alloc_offset = last_alloc;
1461 0 : set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
1462 0 : goto out;
1463 : }
1464 : }
1465 :
1466 0 : switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
1467 0 : case 0: /* single */
1468 0 : if (alloc_offsets[0] == WP_MISSING_DEV) {
1469 0 : btrfs_err(fs_info,
1470 : "zoned: cannot recover write pointer for zone %llu",
1471 : physical[0]);
1472 0 : ret = -EIO;
1473 0 : goto out;
1474 : }
1475 0 : cache->alloc_offset = alloc_offsets[0];
1476 0 : cache->zone_capacity = caps[0];
1477 0 : if (test_bit(0, active))
1478 0 : set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
1479 : break;
1480 0 : case BTRFS_BLOCK_GROUP_DUP:
1481 0 : if (map->type & BTRFS_BLOCK_GROUP_DATA) {
1482 0 : btrfs_err(fs_info, "zoned: profile DUP not yet supported on data bg");
1483 0 : ret = -EINVAL;
1484 0 : goto out;
1485 : }
1486 0 : if (alloc_offsets[0] == WP_MISSING_DEV) {
1487 0 : btrfs_err(fs_info,
1488 : "zoned: cannot recover write pointer for zone %llu",
1489 : physical[0]);
1490 0 : ret = -EIO;
1491 0 : goto out;
1492 : }
1493 0 : if (alloc_offsets[1] == WP_MISSING_DEV) {
1494 0 : btrfs_err(fs_info,
1495 : "zoned: cannot recover write pointer for zone %llu",
1496 : physical[1]);
1497 0 : ret = -EIO;
1498 0 : goto out;
1499 : }
1500 0 : if (alloc_offsets[0] != alloc_offsets[1]) {
1501 0 : btrfs_err(fs_info,
1502 : "zoned: write pointer offset mismatch of zones in DUP profile");
1503 0 : ret = -EIO;
1504 0 : goto out;
1505 : }
1506 0 : if (test_bit(0, active) != test_bit(1, active)) {
1507 0 : if (!btrfs_zone_activate(cache)) {
1508 0 : ret = -EIO;
1509 0 : goto out;
1510 : }
1511 : } else {
1512 0 : if (test_bit(0, active))
1513 0 : set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
1514 0 : &cache->runtime_flags);
1515 : }
1516 0 : cache->alloc_offset = alloc_offsets[0];
1517 0 : cache->zone_capacity = min(caps[0], caps[1]);
1518 0 : break;
1519 0 : case BTRFS_BLOCK_GROUP_RAID1:
1520 : case BTRFS_BLOCK_GROUP_RAID0:
1521 : case BTRFS_BLOCK_GROUP_RAID10:
1522 : case BTRFS_BLOCK_GROUP_RAID5:
1523 : case BTRFS_BLOCK_GROUP_RAID6:
1524 : /* non-single profiles are not supported yet */
1525 : default:
1526 0 : btrfs_err(fs_info, "zoned: profile %s not yet supported",
1527 : btrfs_bg_type_to_raid_name(map->type));
1528 0 : ret = -EINVAL;
1529 0 : goto out;
1530 : }
1531 :
1532 0 : out:
1533 0 : if (cache->alloc_offset > fs_info->zone_size) {
1534 0 : btrfs_err(fs_info,
1535 : "zoned: invalid write pointer %llu in block group %llu",
1536 : cache->alloc_offset, cache->start);
1537 0 : ret = -EIO;
1538 : }
1539 :
1540 0 : if (cache->alloc_offset > cache->zone_capacity) {
1541 0 : btrfs_err(fs_info,
1542 : "zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
1543 : cache->alloc_offset, cache->zone_capacity,
1544 : cache->start);
1545 0 : ret = -EIO;
1546 : }
1547 :
1548 : /* An extent is allocated after the write pointer */
1549 0 : if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
1550 0 : btrfs_err(fs_info,
1551 : "zoned: got wrong write pointer in BG %llu: %llu > %llu",
1552 : logical, last_alloc, cache->alloc_offset);
1553 0 : ret = -EIO;
1554 : }
1555 :
1556 0 : if (!ret) {
1557 0 : cache->meta_write_pointer = cache->alloc_offset + cache->start;
1558 0 : if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags)) {
1559 0 : btrfs_get_block_group(cache);
1560 0 : spin_lock(&fs_info->zone_active_bgs_lock);
1561 0 : list_add_tail(&cache->active_bg_list,
1562 : &fs_info->zone_active_bgs);
1563 0 : spin_unlock(&fs_info->zone_active_bgs_lock);
1564 : }
1565 : } else {
1566 0 : kfree(cache->physical_map);
1567 0 : cache->physical_map = NULL;
1568 : }
1569 0 : bitmap_free(active);
1570 0 : kfree(physical);
1571 0 : kfree(caps);
1572 0 : kfree(alloc_offsets);
1573 0 : free_extent_map(em);
1574 :
1575 0 : return ret;
1576 : }
1577 :
1578 0 : void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
1579 : {
1580 0 : u64 unusable, free;
1581 :
1582 0 : if (!btrfs_is_zoned(cache->fs_info))
1583 : return;
1584 :
1585 0 : WARN_ON(cache->bytes_super != 0);
1586 :
1587 : /* Check for block groups that never got activated */
1588 0 : if (test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &cache->fs_info->flags) &&
1589 0 : cache->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM) &&
1590 0 : !test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags) &&
1591 0 : cache->alloc_offset == 0) {
1592 0 : unusable = cache->length;
1593 0 : free = 0;
1594 : } else {
1595 0 : unusable = (cache->alloc_offset - cache->used) +
1596 0 : (cache->length - cache->zone_capacity);
1597 0 : free = cache->zone_capacity - cache->alloc_offset;
1598 : }
1599 :
1600 : /* We only need ->free_space in ALLOC_SEQ block groups */
1601 0 : cache->cached = BTRFS_CACHE_FINISHED;
1602 0 : cache->free_space_ctl->free_space = free;
1603 0 : cache->zone_unusable = unusable;
1604 : }
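/*
 * Illustrative numbers for the accounting above: for an activated block group
 * with length 256MiB, zone_capacity 192MiB, alloc_offset 128MiB and used
 * 96MiB, zone_unusable = (128 - 96) + (256 - 192) = 96MiB and
 * free_space = 192 - 128 = 64MiB.
 */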
1605 :
1606 0 : void btrfs_redirty_list_add(struct btrfs_transaction *trans,
1607 : struct extent_buffer *eb)
1608 : {
1609 0 : if (!btrfs_is_zoned(eb->fs_info) ||
1610 : btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN))
1611 : return;
1612 :
1613 0 : ASSERT(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
1614 :
1615 0 : memzero_extent_buffer(eb, 0, eb->len);
1616 0 : set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags);
1617 0 : set_extent_buffer_dirty(eb);
1618 0 : set_extent_bit(&trans->dirty_pages, eb->start, eb->start + eb->len - 1,
1619 : EXTENT_DIRTY | EXTENT_NOWAIT, NULL);
1620 : }
1621 :
1622 0 : bool btrfs_use_zone_append(struct btrfs_bio *bbio)
1623 : {
1624 0 : u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT);
1625 0 : struct btrfs_inode *inode = bbio->inode;
1626 0 : struct btrfs_fs_info *fs_info = bbio->fs_info;
1627 0 : struct btrfs_block_group *cache;
1628 0 : bool ret = false;
1629 :
1630 0 : if (!btrfs_is_zoned(fs_info))
1631 : return false;
1632 :
1633 0 : if (!inode || !is_data_inode(&inode->vfs_inode))
1634 : return false;
1635 :
1636 0 : if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE)
1637 : return false;
1638 :
1639 : /*
1640 : * Using REQ_OP_ZONE_APPEND for relocation can break assumptions on the
1641 : * extent layout the relocation code has.
1642 : * Furthermore we have set aside our own block group from which only the
1643 : * relocation "process" can allocate and make sure only one process at a
1644 : * time can add pages to an extent that gets relocated, so it's safe to
1645 : * use regular REQ_OP_WRITE for this special case.
1646 : */
1647 0 : if (btrfs_is_data_reloc_root(inode->root))
1648 : return false;
1649 :
1650 0 : cache = btrfs_lookup_block_group(fs_info, start);
1651 0 : ASSERT(cache);
1652 0 : if (!cache)
1653 : return false;
1654 :
1655 0 : ret = !!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags);
1656 0 : btrfs_put_block_group(cache);
1657 :
1658 0 : return ret;
1659 : }
1660 :
1661 0 : void btrfs_record_physical_zoned(struct btrfs_bio *bbio)
1662 : {
1663 0 : const u64 physical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
1664 0 : struct btrfs_ordered_sum *sum = bbio->sums;
1665 :
1666 0 : if (physical < bbio->orig_physical)
1667 0 : sum->logical -= bbio->orig_physical - physical;
1668 : else
1669 0 : sum->logical += physical - bbio->orig_physical;
1670 0 : }
1671 :
1672 0 : static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered,
1673 : u64 logical)
1674 : {
1675 0 : struct extent_map_tree *em_tree = &BTRFS_I(ordered->inode)->extent_tree;
1676 0 : struct extent_map *em;
1677 :
1678 0 : ordered->disk_bytenr = logical;
1679 :
1680 0 : write_lock(&em_tree->lock);
1681 0 : em = search_extent_mapping(em_tree, ordered->file_offset,
1682 : ordered->num_bytes);
1683 0 : em->block_start = logical;
1684 0 : free_extent_map(em);
1685 0 : write_unlock(&em_tree->lock);
1686 0 : }
1687 :
1688 0 : static bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent *ordered,
1689 : u64 logical, u64 len)
1690 : {
1691 0 : struct btrfs_ordered_extent *new;
1692 :
1693 0 : if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
1694 0 : split_extent_map(BTRFS_I(ordered->inode), ordered->file_offset,
1695 : ordered->num_bytes, len, logical))
1696 : return false;
1697 :
1698 0 : new = btrfs_split_ordered_extent(ordered, len);
1699 0 : if (IS_ERR(new))
1700 : return false;
1701 0 : new->disk_bytenr = logical;
1702 0 : btrfs_finish_one_ordered(new);
1703 0 : return true;
1704 : }
1705 :
1706 0 : void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered)
1707 : {
1708 0 : struct btrfs_inode *inode = BTRFS_I(ordered->inode);
1709 0 : struct btrfs_fs_info *fs_info = inode->root->fs_info;
1710 0 : struct btrfs_ordered_sum *sum =
1711 0 : list_first_entry(&ordered->list, typeof(*sum), list);
1712 0 : u64 logical = sum->logical;
1713 0 : u64 len = sum->len;
1714 :
1715 0 : while (len < ordered->disk_num_bytes) {
1716 0 : sum = list_next_entry(sum, list);
1717 0 : if (sum->logical == logical + len) {
1718 0 : len += sum->len;
1719 0 : continue;
1720 : }
1721 0 : if (!btrfs_zoned_split_ordered(ordered, logical, len)) {
1722 0 : set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
1723 0 : btrfs_err(fs_info, "failed to split ordered extent");
1724 0 : goto out;
1725 : }
1726 0 : logical = sum->logical;
1727 0 : len = sum->len;
1728 : }
1729 :
1730 0 : if (ordered->disk_bytenr != logical)
1731 0 : btrfs_rewrite_logical_zoned(ordered, logical);
1732 :
1733 0 : out:
1734 : /*
1735 : * If we end up here for nodatasum I/O, the btrfs_ordered_sum structures
1736 : * were allocated by btrfs_alloc_dummy_sum only to record the logical
1737 : * addresses and don't contain actual checksums. We thus must free them
1738 : * here so that we don't attempt to log the csums later.
1739 : */
1740 0 : if ((inode->flags & BTRFS_INODE_NODATASUM) ||
1741 0 : test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) {
1742 0 : while ((sum = list_first_entry_or_null(&ordered->list,
1743 : typeof(*sum), list))) {
1744 0 : list_del(&sum->list);
1745 0 : kfree(sum);
1746 : }
1747 : }
1748 0 : }
1749 :
1750 0 : bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
1751 : struct extent_buffer *eb,
1752 : struct btrfs_block_group **cache_ret)
1753 : {
1754 0 : struct btrfs_block_group *cache;
1755 0 : bool ret = true;
1756 :
1757 0 : if (!btrfs_is_zoned(fs_info))
1758 : return true;
1759 :
1760 0 : cache = btrfs_lookup_block_group(fs_info, eb->start);
1761 0 : if (!cache)
1762 : return true;
1763 :
1764 0 : if (cache->meta_write_pointer != eb->start) {
1765 0 : btrfs_put_block_group(cache);
1766 0 : cache = NULL;
1767 0 : ret = false;
1768 : } else {
1769 0 : cache->meta_write_pointer = eb->start + eb->len;
1770 : }
1771 :
1772 0 : *cache_ret = cache;
1773 :
1774 0 : return ret;
1775 : }
1776 :
1777 0 : void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache,
1778 : struct extent_buffer *eb)
1779 : {
1780 0 : if (!btrfs_is_zoned(eb->fs_info) || !cache)
1781 : return;
1782 :
1783 0 : ASSERT(cache->meta_write_pointer == eb->start + eb->len);
1784 0 : cache->meta_write_pointer = eb->start;
1785 : }
1786 :
1787 0 : int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length)
1788 : {
1789 0 : if (!btrfs_dev_is_sequential(device, physical))
1790 : return -EOPNOTSUPP;
1791 :
1792 0 : return blkdev_issue_zeroout(device->bdev, physical >> SECTOR_SHIFT,
1793 : length >> SECTOR_SHIFT, GFP_NOFS, 0);
1794 : }
1795 :
1796 0 : static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
1797 : struct blk_zone *zone)
1798 : {
1799 0 : struct btrfs_io_context *bioc = NULL;
1800 0 : u64 mapped_length = PAGE_SIZE;
1801 0 : unsigned int nofs_flag;
1802 0 : int nmirrors;
1803 0 : int i, ret;
1804 :
1805 0 : ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
1806 : &mapped_length, &bioc, NULL, NULL, 1);
1807 0 : if (ret || !bioc || mapped_length < PAGE_SIZE) {
1808 0 : ret = -EIO;
1809 0 : goto out_put_bioc;
1810 : }
1811 :
1812 0 : if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1813 0 : ret = -EINVAL;
1814 0 : goto out_put_bioc;
1815 : }
1816 :
1817 0 : nofs_flag = memalloc_nofs_save();
1818 0 : nmirrors = (int)bioc->num_stripes;
1819 0 : for (i = 0; i < nmirrors; i++) {
1820 0 : u64 physical = bioc->stripes[i].physical;
1821 0 : struct btrfs_device *dev = bioc->stripes[i].dev;
1822 :
1823 : /* Missing device */
1824 0 : if (!dev->bdev)
1825 0 : continue;
1826 :
1827 0 : ret = btrfs_get_dev_zone(dev, physical, zone);
1828 : /* Failing device */
1829 0 : if (ret == -EIO || ret == -EOPNOTSUPP)
1830 0 : continue;
1831 : break;
1832 : }
1833 0 : memalloc_nofs_restore(nofs_flag);
1834 0 : out_put_bioc:
1835 0 : btrfs_put_bioc(bioc);
1836 0 : return ret;
1837 : }
1838 :
1839 : /*
1840 : * Synchronize the write pointer in the zone at @physical_start on @tgt_dev by
1841 : * zero-filling the range from @physical_pos up to the write pointer of the
1842 : * dev-replace source device.
1843 : */
1844 0 : int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
1845 : u64 physical_start, u64 physical_pos)
1846 : {
1847 0 : struct btrfs_fs_info *fs_info = tgt_dev->fs_info;
1848 0 : struct blk_zone zone;
1849 0 : u64 length;
1850 0 : u64 wp;
1851 0 : int ret;
1852 :
1853 0 : if (!btrfs_dev_is_sequential(tgt_dev, physical_pos))
1854 : return 0;
1855 :
1856 0 : ret = read_zone_info(fs_info, logical, &zone);
1857 0 : if (ret)
1858 : return ret;
1859 :
1860 0 : wp = physical_start + ((zone.wp - zone.start) << SECTOR_SHIFT);
1861 :
1862 0 : if (physical_pos == wp)
1863 : return 0;
1864 :
1865 0 : if (physical_pos > wp)
1866 : return -EUCLEAN;
1867 :
1868 0 : length = wp - physical_pos;
1869 0 : return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length);
1870 : }
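/*
 * Worked example (made-up numbers) for the write pointer math in
 * btrfs_sync_zone_write_pointer(): zone.wp and zone.start are reported in
 * 512-byte sectors while the btrfs physical offsets are in bytes, hence the
 * << SECTOR_SHIFT conversion.
 *
 *   physical_start = 256 MiB           (byte offset of the target zone)
 *   zone.start     = 524288 sectors    (source zone, 256 MiB / 512)
 *   zone.wp        = 526336 sectors    (source wrote 2048 sectors = 1 MiB)
 *
 *   wp = 256 MiB + ((526336 - 524288) << 9) = 257 MiB
 *
 * If the target device has only reached physical_pos = 256 MiB + 512 KiB,
 * the remaining 512 KiB up to wp is zero-filled; a physical_pos beyond wp
 * would indicate corruption and the function returns -EUCLEAN.
 */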
1871 :
1872 : /*
1873 : * Activate block group and underlying device zones
1874 : *
1875 : * @block_group: the block group to activate
1876 : *
1877 : * Return: true on success, false otherwise
1878 : */
1879 0 : bool btrfs_zone_activate(struct btrfs_block_group *block_group)
1880 : {
1881 0 : struct btrfs_fs_info *fs_info = block_group->fs_info;
1882 0 : struct btrfs_space_info *space_info = block_group->space_info;
1883 0 : struct map_lookup *map;
1884 0 : struct btrfs_device *device;
1885 0 : u64 physical;
1886 0 : bool ret;
1887 0 : int i;
1888 :
1889 0 : if (!btrfs_is_zoned(block_group->fs_info))
1890 : return true;
1891 :
1892 0 : map = block_group->physical_map;
1893 :
1894 0 : spin_lock(&space_info->lock);
1895 0 : spin_lock(&block_group->lock);
1896 0 : if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
1897 0 : ret = true;
1898 0 : goto out_unlock;
1899 : }
1900 :
1901 : /* No space left */
1902 0 : if (btrfs_zoned_bg_is_full(block_group)) {
1903 0 : ret = false;
1904 0 : goto out_unlock;
1905 : }
1906 :
1907 0 : for (i = 0; i < map->num_stripes; i++) {
1908 0 : device = map->stripes[i].dev;
1909 0 : physical = map->stripes[i].physical;
1910 :
1911 0 : if (device->zone_info->max_active_zones == 0)
1912 0 : continue;
1913 :
1914 0 : if (!btrfs_dev_set_active_zone(device, physical)) {
1915 : /* Cannot activate the zone */
1916 0 : ret = false;
1917 0 : goto out_unlock;
1918 : }
1919 : }
1920 :
1921 : /* Successfully activated all the zones */
1922 0 : set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
1923 0 : WARN_ON(block_group->alloc_offset != 0);
1924 0 : if (block_group->zone_unusable == block_group->length) {
1925 0 : block_group->zone_unusable = block_group->length - block_group->zone_capacity;
1926 0 : space_info->bytes_zone_unusable -= block_group->zone_capacity;
1927 : }
1928 0 : spin_unlock(&block_group->lock);
1929 0 : btrfs_try_granting_tickets(fs_info, space_info);
1930 0 : spin_unlock(&space_info->lock);
1931 :
1932 : /* For the active block group list */
1933 0 : btrfs_get_block_group(block_group);
1934 :
1935 0 : spin_lock(&fs_info->zone_active_bgs_lock);
1936 0 : list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs);
1937 0 : spin_unlock(&fs_info->zone_active_bgs_lock);
1938 :
1939 0 : return true;
1940 :
1941 0 : out_unlock:
1942 0 : spin_unlock(&block_group->lock);
1943 0 : spin_unlock(&space_info->lock);
1944 0 : return ret;
1945 : }
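/*
 * Worked example (illustrative numbers) for the zone_unusable adjustment in
 * btrfs_zone_activate(): when the whole block group is still accounted as
 * zone_unusable (zone_unusable == length, typically a block group that has
 * not been activated yet), activation hands the usable capacity back to the
 * space_info. For a 256 MiB zone with a usable zone_capacity of 192 MiB:
 *
 *   before: zone_unusable = length              = 256 MiB
 *   after:  zone_unusable = 256 MiB - 192 MiB   =  64 MiB
 *           space_info->bytes_zone_unusable    -= 192 MiB
 *
 * so the allocator can now grant tickets against the 192 MiB of capacity.
 */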
1946 :
1947 0 : static void wait_eb_writebacks(struct btrfs_block_group *block_group)
1948 : {
1949 0 : struct btrfs_fs_info *fs_info = block_group->fs_info;
1950 0 : const u64 end = block_group->start + block_group->length;
1951 0 : struct radix_tree_iter iter;
1952 0 : struct extent_buffer *eb;
1953 0 : void __rcu **slot;
1954 :
1955 0 : rcu_read_lock();
1956 0 : radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter,
1957 : block_group->start >> fs_info->sectorsize_bits) {
1958 0 : eb = radix_tree_deref_slot(slot);
1959 0 : if (!eb)
1960 0 : continue;
1961 0 : if (radix_tree_deref_retry(eb)) {
1962 0 : slot = radix_tree_iter_retry(&iter);
1963 0 : continue;
1964 : }
1965 :
1966 0 : if (eb->start < block_group->start)
1967 0 : continue;
1968 0 : if (eb->start >= end)
1969 : break;
1970 :
1971 0 : slot = radix_tree_iter_resume(slot, &iter);
1972 0 : rcu_read_unlock();
1973 0 : wait_on_extent_buffer_writeback(eb);
1974 0 : rcu_read_lock();
1975 : }
1976 0 : rcu_read_unlock();
1977 0 : }
1978 :
1979 0 : static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
1980 : {
1981 0 : struct btrfs_fs_info *fs_info = block_group->fs_info;
1982 0 : struct map_lookup *map;
1983 0 : const bool is_metadata = (block_group->flags &
1984 : (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM));
1985 0 : int ret = 0;
1986 0 : int i;
1987 :
1988 0 : spin_lock(&block_group->lock);
1989 0 : if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
1990 0 : spin_unlock(&block_group->lock);
1991 0 : return 0;
1992 : }
1993 :
1994 : /* Check if we have unwritten allocated space */
1995 0 : if (is_metadata &&
1996 0 : block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) {
1997 0 : spin_unlock(&block_group->lock);
1998 0 : return -EAGAIN;
1999 : }
2000 :
2001 : /*
2002 : * If we are sure that the block group is full (no more room left for new
2003 : * allocations) and the I/O for the last usable block has completed, we
2004 : * don't need to wait for the other I/Os. This holds because we ensure
2005 : * sequential I/O submission by using the ZONE_APPEND command for data and
2006 : * block_group->meta_write_pointer for metadata.
2007 : */
2008 0 : if (!fully_written) {
2009 0 : spin_unlock(&block_group->lock);
2010 :
2011 0 : ret = btrfs_inc_block_group_ro(block_group, false);
2012 0 : if (ret)
2013 : return ret;
2014 :
2015 : /* Ensure all writes in this block group finish */
2016 0 : btrfs_wait_block_group_reservations(block_group);
2017 : /* No need to wait for NOCOW writers. Zoned mode does not allow that */
2018 0 : btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
2019 : block_group->length);
2020 : /* Wait for extent buffers to be written. */
2021 0 : if (is_metadata)
2022 0 : wait_eb_writebacks(block_group);
2023 :
2024 0 : spin_lock(&block_group->lock);
2025 :
2026 : /*
2027 : * Bail out if someone already deactivated the block group, or if
2028 : * space is still reserved in the block group.
2029 : */
2030 0 : if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
2031 : &block_group->runtime_flags)) {
2032 0 : spin_unlock(&block_group->lock);
2033 0 : btrfs_dec_block_group_ro(block_group);
2034 0 : return 0;
2035 : }
2036 :
2037 0 : if (block_group->reserved) {
2038 0 : spin_unlock(&block_group->lock);
2039 0 : btrfs_dec_block_group_ro(block_group);
2040 0 : return -EAGAIN;
2041 : }
2042 : }
2043 :
2044 0 : clear_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
2045 0 : block_group->alloc_offset = block_group->zone_capacity;
2046 0 : block_group->free_space_ctl->free_space = 0;
2047 0 : btrfs_clear_treelog_bg(block_group);
2048 0 : btrfs_clear_data_reloc_bg(block_group);
2049 0 : spin_unlock(&block_group->lock);
2050 :
2051 0 : map = block_group->physical_map;
2052 0 : for (i = 0; i < map->num_stripes; i++) {
2053 0 : struct btrfs_device *device = map->stripes[i].dev;
2054 0 : const u64 physical = map->stripes[i].physical;
2055 :
2056 0 : if (device->zone_info->max_active_zones == 0)
2057 0 : continue;
2058 :
2059 0 : ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
2060 : physical >> SECTOR_SHIFT,
2061 0 : device->zone_info->zone_size >> SECTOR_SHIFT,
2062 : GFP_NOFS);
2063 :
2064 0 : if (ret)
2065 0 : return ret;
2066 :
2067 0 : btrfs_dev_clear_active_zone(device, physical);
2068 : }
2069 :
2070 0 : if (!fully_written)
2071 0 : btrfs_dec_block_group_ro(block_group);
2072 :
2073 0 : spin_lock(&fs_info->zone_active_bgs_lock);
2074 0 : ASSERT(!list_empty(&block_group->active_bg_list));
2075 0 : list_del_init(&block_group->active_bg_list);
2076 0 : spin_unlock(&fs_info->zone_active_bgs_lock);
2077 :
2078 : /* For active_bg_list */
2079 0 : btrfs_put_block_group(block_group);
2080 :
2081 0 : clear_and_wake_up_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);
2082 :
2083 0 : return 0;
2084 : }
2085 :
2086 0 : int btrfs_zone_finish(struct btrfs_block_group *block_group)
2087 : {
2088 0 : if (!btrfs_is_zoned(block_group->fs_info))
2089 : return 0;
2090 :
2091 0 : return do_zone_finish(block_group, false);
2092 : }
2093 :
2094 0 : bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
2095 : {
2096 0 : struct btrfs_fs_info *fs_info = fs_devices->fs_info;
2097 0 : struct btrfs_device *device;
2098 0 : bool ret = false;
2099 :
2100 0 : if (!btrfs_is_zoned(fs_info))
2101 : return true;
2102 :
2103 : /* Check if there is a device with active zones left */
2104 0 : mutex_lock(&fs_info->chunk_mutex);
2105 0 : list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
2106 0 : struct btrfs_zoned_device_info *zinfo = device->zone_info;
2107 :
2108 0 : if (!device->bdev)
2109 0 : continue;
2110 :
2111 0 : if (!zinfo->max_active_zones) {
2112 : ret = true;
2113 : break;
2114 : }
2115 :
2116 0 : switch (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
2117 0 : case 0: /* single */
2118 0 : ret = (atomic_read(&zinfo->active_zones_left) >= 1);
2119 0 : break;
2120 0 : case BTRFS_BLOCK_GROUP_DUP:
2121 0 : ret = (atomic_read(&zinfo->active_zones_left) >= 2);
2122 0 : break;
2123 : }
2124 0 : if (ret)
2125 : break;
2126 : }
2127 0 : mutex_unlock(&fs_info->chunk_mutex);
2128 :
2129 0 : if (!ret)
2130 0 : set_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);
2131 :
2132 : return ret;
2133 : }
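/*
 * Sketch (hypothetical helper name) of the per-profile requirement checked
 * in btrfs_can_activate_zone() above: on a device that enforces
 * max_active_zones, a "single" chunk needs one spare active zone and a DUP
 * chunk needs two on the same device, since both DUP stripes live there.
 * Other profiles are not considered by this check.
 */
static inline int sketch_active_zones_needed(u64 profile_flags)
{
	if (profile_flags & BTRFS_BLOCK_GROUP_DUP)
		return 2;	/* both copies on one device */
	return 1;		/* single */
}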
2134 :
2135 0 : void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
2136 : {
2137 0 : struct btrfs_block_group *block_group;
2138 0 : u64 min_alloc_bytes;
2139 :
2140 0 : if (!btrfs_is_zoned(fs_info))
2141 : return;
2142 :
2143 0 : block_group = btrfs_lookup_block_group(fs_info, logical);
2144 0 : ASSERT(block_group);
2145 :
2146 : /* No MIXED_BG on zoned btrfs. */
2147 0 : if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
2148 0 : min_alloc_bytes = fs_info->sectorsize;
2149 : else
2150 0 : min_alloc_bytes = fs_info->nodesize;
2151 :
2152 : /* Bail out if we can allocate more data from this block group. */
2153 0 : if (logical + length + min_alloc_bytes <=
2154 0 : block_group->start + block_group->zone_capacity)
2155 0 : goto out;
2156 :
2157 0 : do_zone_finish(block_group, true);
2158 :
2159 0 : out:
2160 0 : btrfs_put_block_group(block_group);
2161 : }
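/*
 * Worked example (illustrative numbers) for the bail-out condition in
 * btrfs_zone_finish_endio(): take a data block group with start = 1 GiB,
 * zone_capacity = 256 MiB and sectorsize = 4 KiB.
 *
 *   - A write ending exactly at start + 256 MiB gives
 *     logical + length + 4 KiB > start + zone_capacity, so not even one more
 *     sector fits and the zone is finished right away.
 *   - A write ending at start + 256 MiB - 4 KiB still leaves room for one
 *     sector (logical + length + 4 KiB == start + zone_capacity satisfies
 *     the "<=" check), so the function bails out and keeps the zone active.
 */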
2162 :
2163 0 : static void btrfs_zone_finish_endio_workfn(struct work_struct *work)
2164 : {
2165 0 : struct btrfs_block_group *bg =
2166 0 : container_of(work, struct btrfs_block_group, zone_finish_work);
2167 :
2168 0 : wait_on_extent_buffer_writeback(bg->last_eb);
2169 0 : free_extent_buffer(bg->last_eb);
2170 0 : btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length);
2171 0 : btrfs_put_block_group(bg);
2172 0 : }
2173 :
2174 0 : void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
2175 : struct extent_buffer *eb)
2176 : {
2177 0 : if (!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &bg->runtime_flags) ||
2178 0 : eb->start + eb->len * 2 <= bg->start + bg->zone_capacity)
2179 : return;
2180 :
2181 0 : if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) {
2182 0 : btrfs_err(bg->fs_info, "double scheduling of bg %llu zone finishing",
2183 : bg->start);
2184 0 : return;
2185 : }
2186 :
2187 : /* For the work */
2188 0 : btrfs_get_block_group(bg);
2189 0 : atomic_inc(&eb->refs);
2190 0 : bg->last_eb = eb;
2191 0 : INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn);
2192 0 : queue_work(system_unbound_wq, &bg->zone_finish_work);
2193 : }
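/*
 * Worked example (illustrative numbers) for the early return in
 * btrfs_schedule_zone_finish_bg(): with nodesize = 16 KiB and a
 * zone_capacity of 256 MiB, an extent buffer written 32 KiB before the end
 * of the capacity still leaves room for one more 16 KiB buffer
 * (eb->start + 2 * eb->len == bg->start + zone_capacity), so nothing is
 * scheduled. The buffer written 16 KiB before the end is the last one that
 * fits, and only for it is zone_finish_work queued, finishing the zone once
 * that buffer's writeback completes.
 */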
2194 :
2195 0 : void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
2196 : {
2197 0 : struct btrfs_fs_info *fs_info = bg->fs_info;
2198 :
2199 0 : spin_lock(&fs_info->relocation_bg_lock);
2200 0 : if (fs_info->data_reloc_bg == bg->start)
2201 0 : fs_info->data_reloc_bg = 0;
2202 0 : spin_unlock(&fs_info->relocation_bg_lock);
2203 0 : }
2204 :
2205 0 : void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
2206 : {
2207 0 : struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2208 0 : struct btrfs_device *device;
2209 :
2210 0 : if (!btrfs_is_zoned(fs_info))
2211 : return;
2212 :
2213 0 : mutex_lock(&fs_devices->device_list_mutex);
2214 0 : list_for_each_entry(device, &fs_devices->devices, dev_list) {
2215 0 : if (device->zone_info) {
2216 0 : vfree(device->zone_info->zone_cache);
2217 0 : device->zone_info->zone_cache = NULL;
2218 : }
2219 : }
2220 0 : mutex_unlock(&fs_devices->device_list_mutex);
2221 : }
2222 :
2223 0 : bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
2224 : {
2225 0 : struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2226 0 : struct btrfs_device *device;
2227 0 : u64 used = 0;
2228 0 : u64 total = 0;
2229 0 : u64 factor;
2230 :
2231 0 : ASSERT(btrfs_is_zoned(fs_info));
2232 :
2233 0 : if (fs_info->bg_reclaim_threshold == 0)
2234 : return false;
2235 :
2236 0 : mutex_lock(&fs_devices->device_list_mutex);
2237 0 : list_for_each_entry(device, &fs_devices->devices, dev_list) {
2238 0 : if (!device->bdev)
2239 0 : continue;
2240 :
2241 0 : total += device->disk_total_bytes;
2242 0 : used += device->bytes_used;
2243 : }
2244 0 : mutex_unlock(&fs_devices->device_list_mutex);
2245 :
2246 0 : factor = div64_u64(used * 100, total);
2247 0 : return factor >= fs_info->bg_reclaim_threshold;
2248 : }
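/*
 * Worked example (illustrative numbers) for the reclaim decision in
 * btrfs_zoned_should_reclaim(): two zoned devices of 1 TiB each, with
 * 600 GiB and 900 GiB of their space allocated to chunks (bytes_used), give
 *
 *   factor = (600 + 900) * 100 / (1024 + 1024) = 73   (integer division)
 *
 * With the default zoned bg_reclaim_threshold of 75 nothing is reclaimed
 * yet, while a threshold of 70 would start reclaiming block groups.
 */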
2249 :
2250 0 : void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
2251 : u64 length)
2252 : {
2253 0 : struct btrfs_block_group *block_group;
2254 :
2255 0 : if (!btrfs_is_zoned(fs_info))
2256 : return;
2257 :
2258 0 : block_group = btrfs_lookup_block_group(fs_info, logical);
2259 : /* It should be called on a previous data relocation block group. */
2260 0 : ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA));
2261 :
2262 0 : spin_lock(&block_group->lock);
2263 0 : if (!test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))
2264 0 : goto out;
2265 :
2266 : /* All relocation extents are written. */
2267 0 : if (block_group->start + block_group->alloc_offset == logical + length) {
2268 : /* Now, release this block group for further allocations. */
2269 0 : clear_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
2270 : &block_group->runtime_flags);
2271 : }
2272 :
2273 0 : out:
2274 0 : spin_unlock(&block_group->lock);
2275 0 : btrfs_put_block_group(block_group);
2276 : }
2277 :
2278 0 : int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
2279 : {
2280 0 : struct btrfs_block_group *block_group;
2281 0 : struct btrfs_block_group *min_bg = NULL;
2282 0 : u64 min_avail = U64_MAX;
2283 0 : int ret;
2284 :
2285 0 : spin_lock(&fs_info->zone_active_bgs_lock);
2286 0 : list_for_each_entry(block_group, &fs_info->zone_active_bgs,
2287 : active_bg_list) {
2288 0 : u64 avail;
2289 :
2290 0 : spin_lock(&block_group->lock);
2291 0 : if (block_group->reserved || block_group->alloc_offset == 0 ||
2292 0 : (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)) {
2293 0 : spin_unlock(&block_group->lock);
2294 0 : continue;
2295 : }
2296 :
2297 0 : avail = block_group->zone_capacity - block_group->alloc_offset;
2298 0 : if (min_avail > avail) {
2299 0 : if (min_bg)
2300 0 : btrfs_put_block_group(min_bg);
2301 0 : min_bg = block_group;
2302 0 : min_avail = avail;
2303 0 : btrfs_get_block_group(min_bg);
2304 : }
2305 0 : spin_unlock(&block_group->lock);
2306 : }
2307 0 : spin_unlock(&fs_info->zone_active_bgs_lock);
2308 :
2309 0 : if (!min_bg)
2310 : return 0;
2311 :
2312 0 : ret = btrfs_zone_finish(min_bg);
2313 0 : btrfs_put_block_group(min_bg);
2314 :
2315 0 : return ret < 0 ? ret : 1;
2316 : }
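/*
 * Illustrative sketch (hypothetical names, a plain array instead of the
 * zone_active_bgs list) of the selection policy in btrfs_zone_finish_one_bg()
 * above: among the finishable block groups, pick the one with the least
 * unallocated capacity left, so finishing it wastes the least space while
 * still freeing one active zone per stripe device.
 */
struct sketch_bg {
	u64 zone_capacity;
	u64 alloc_offset;
	bool finishable;	/* !reserved, non-empty, not SYSTEM in the real code */
};

static int sketch_pick_bg_to_finish(const struct sketch_bg *bgs, int nr)
{
	u64 min_avail = U64_MAX;
	int pick = -1;
	int i;

	for (i = 0; i < nr; i++) {
		u64 avail;

		if (!bgs[i].finishable)
			continue;
		avail = bgs[i].zone_capacity - bgs[i].alloc_offset;
		if (avail < min_avail) {
			min_avail = avail;
			pick = i;
		}
	}
	return pick;
}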
2317 :
2318 0 : int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
2319 : struct btrfs_space_info *space_info,
2320 : bool do_finish)
2321 : {
2322 0 : struct btrfs_block_group *bg;
2323 0 : int index;
2324 :
2325 0 : if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA))
2326 : return 0;
2327 :
2328 0 : for (;;) {
2329 0 : int ret;
2330 0 : bool need_finish = false;
2331 :
2332 0 : down_read(&space_info->groups_sem);
2333 0 : for (index = 0; index < BTRFS_NR_RAID_TYPES; index++) {
2334 0 : list_for_each_entry(bg, &space_info->block_groups[index],
2335 : list) {
2336 0 : if (!spin_trylock(&bg->lock))
2337 0 : continue;
2338 0 : if (btrfs_zoned_bg_is_full(bg) ||
2339 0 : test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
2340 : &bg->runtime_flags)) {
2341 0 : spin_unlock(&bg->lock);
2342 0 : continue;
2343 : }
2344 0 : spin_unlock(&bg->lock);
2345 :
2346 0 : if (btrfs_zone_activate(bg)) {
2347 0 : up_read(&space_info->groups_sem);
2348 0 : return 1;
2349 : }
2350 :
2351 : need_finish = true;
2352 : }
2353 : }
2354 0 : up_read(&space_info->groups_sem);
2355 :
2356 0 : if (!do_finish || !need_finish)
2357 : break;
2358 :
2359 0 : ret = btrfs_zone_finish_one_bg(fs_info);
2360 0 : if (ret == 0)
2361 : break;
2362 0 : if (ret < 0)
2363 0 : return ret;
2364 : }
2365 :
2366 : return 0;
2367 : }
|