Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 :
3 : #include <linux/bitops.h>
4 : #include <linux/slab.h>
5 : #include <linux/blkdev.h>
6 : #include <linux/sched/mm.h>
7 : #include <linux/atomic.h>
8 : #include <linux/vmalloc.h>
9 : #include "ctree.h"
10 : #include "volumes.h"
11 : #include "zoned.h"
12 : #include "rcu-string.h"
13 : #include "disk-io.h"
14 : #include "block-group.h"
15 : #include "transaction.h"
16 : #include "dev-replace.h"
17 : #include "space-info.h"
18 : #include "super.h"
19 : #include "fs.h"
20 : #include "accessors.h"
21 : #include "bio.h"
22 :
23 : /* Maximum number of zones to report per blkdev_report_zones() call */
24 : #define BTRFS_REPORT_NR_ZONES 4096
25 : /* Invalid allocation pointer value for missing devices */
26 : #define WP_MISSING_DEV ((u64)-1)
27 : /* Pseudo write pointer value for conventional zone */
28 : #define WP_CONVENTIONAL ((u64)-2)
29 :
30 : /*
31 : * Location of the first zone of superblock logging zone pairs.
32 : *
33 : * - primary superblock: 0B (zone 0)
34 : * - first copy: 512G (zone starting at that offset)
35 : * - second copy: 4T (zone starting at that offset)
36 : */
37 : #define BTRFS_SB_LOG_PRIMARY_OFFSET (0ULL)
38 : #define BTRFS_SB_LOG_FIRST_OFFSET (512ULL * SZ_1G)
39 : #define BTRFS_SB_LOG_SECOND_OFFSET (4096ULL * SZ_1G)
40 :
41 : #define BTRFS_SB_LOG_FIRST_SHIFT const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET)
42 : #define BTRFS_SB_LOG_SECOND_SHIFT const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET)
43 :
44 : /* Number of superblock log zones */
45 : #define BTRFS_NR_SB_LOG_ZONES 2
46 :
47 : /*
48 : * Minimum number of active zones we need:
49 : *
50 : * - BTRFS_SUPER_MIRROR_MAX zones for superblock mirrors
51 : * - 3 zones to ensure at least one zone per SYSTEM, META and DATA block group
52 : * - 1 zone for tree-log dedicated block group
53 : * - 1 zone for relocation
54 : */
55 : #define BTRFS_MIN_ACTIVE_ZONES (BTRFS_SUPER_MIRROR_MAX + 5)
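/*
 * Illustrative arithmetic: assuming BTRFS_SUPER_MIRROR_MAX == 3 (its
 * current value), this works out to a minimum of 3 + 5 = 8 active zones.
 */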
56 :
57 : /*
58 : * Minimum / maximum supported zone size. Currently, SMR disks have a zone
59 : * size of 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range.
60 : * We do not expect the zone size to become larger than 8GiB or smaller than
61 : * 4MiB in the near future.
62 : */
63 : #define BTRFS_MAX_ZONE_SIZE SZ_8G
64 : #define BTRFS_MIN_ZONE_SIZE SZ_4M
65 :
66 : #define SUPER_INFO_SECTORS ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT)
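/* With a 4096 byte BTRFS_SUPER_INFO_SIZE and 512 byte sectors, this is 8 sectors. */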
67 :
68 : static inline bool sb_zone_is_full(const struct blk_zone *zone)
69 : {
70 0 : return (zone->cond == BLK_ZONE_COND_FULL) ||
71 0 : (zone->wp + SUPER_INFO_SECTORS > zone->start + zone->capacity);
72 : }
73 :
74 0 : static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
75 : {
76 0 : struct blk_zone *zones = data;
77 :
78 0 : memcpy(&zones[idx], zone, sizeof(*zone));
79 :
80 0 : return 0;
81 : }
82 :
83 0 : static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
84 : u64 *wp_ret)
85 : {
86 0 : bool empty[BTRFS_NR_SB_LOG_ZONES];
87 0 : bool full[BTRFS_NR_SB_LOG_ZONES];
88 0 : sector_t sector;
89 0 : int i;
90 :
91 0 : for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
92 0 : ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL);
93 0 : empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY);
94 0 : full[i] = sb_zone_is_full(&zones[i]);
95 : }
96 :
97 : /*
98 : * Possible states of log buffer zones
99 : *
100 : * Empty[0] In use[0] Full[0]
101 : * Empty[1] * 0 1
102 : * In use[1] x x 1
103 : * Full[1] 0 0 C
104 : *
105 : * Log position:
106 : * *: Special case, no superblock is written
107 : * 0: Use write pointer of zones[0]
108 : * 1: Use write pointer of zones[1]
109 : * C: Compare super blocks from zones[0] and zones[1], use the latest
110 : * one determined by generation
111 : * x: Invalid state
112 : */
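	/*
	 * Illustrative reading of the table: with zones[0] FULL and zones[1]
	 * partially written ("In use"), the next superblock goes to
	 * zones[1].wp (case "1"); only when both zones are FULL do we have
	 * to compare generations to find the newest copy (case "C").
	 */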
113 :
114 0 : if (empty[0] && empty[1]) {
115 : /* Special case to distinguish no superblock to read */
116 0 : *wp_ret = zones[0].start << SECTOR_SHIFT;
117 0 : return -ENOENT;
118 0 : } else if (full[0] && full[1]) {
119 : /* Compare two super blocks */
120 0 : struct address_space *mapping = bdev->bd_inode->i_mapping;
121 0 : struct page *page[BTRFS_NR_SB_LOG_ZONES];
122 0 : struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];
123 0 : int i;
124 :
125 0 : for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
126 0 : u64 zone_end = (zones[i].start + zones[i].capacity) << SECTOR_SHIFT;
127 0 : u64 bytenr = ALIGN_DOWN(zone_end, BTRFS_SUPER_INFO_SIZE) -
128 : BTRFS_SUPER_INFO_SIZE;
129 :
130 0 : page[i] = read_cache_page_gfp(mapping,
131 0 : bytenr >> PAGE_SHIFT, GFP_NOFS);
132 0 : if (IS_ERR(page[i])) {
133 0 : if (i == 1)
134 0 : btrfs_release_disk_super(super[0]);
135 0 : return PTR_ERR(page[i]);
136 : }
137 0 : super[i] = page_address(page[i]);
138 : }
139 :
140 0 : if (btrfs_super_generation(super[0]) >
141 0 : btrfs_super_generation(super[1]))
142 0 : sector = zones[1].start;
143 : else
144 0 : sector = zones[0].start;
145 :
146 0 : for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
147 0 : btrfs_release_disk_super(super[i]);
148 0 : } else if (!full[0] && (empty[1] || full[1])) {
149 0 : sector = zones[0].wp;
150 0 : } else if (full[0]) {
151 0 : sector = zones[1].wp;
152 : } else {
153 : return -EUCLEAN;
154 : }
155 0 : *wp_ret = sector << SECTOR_SHIFT;
156 0 : return 0;
157 : }
158 :
159 : /*
160 : * Get the first zone number of the superblock mirror
161 : */
162 0 : static inline u32 sb_zone_number(int shift, int mirror)
163 : {
164 0 : u64 zone = U64_MAX;
165 :
166 0 : ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
167 0 : switch (mirror) {
168 0 : case 0: zone = 0; break;
169 0 : case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break;
170 0 : case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break;
171 : }
172 :
173 0 : ASSERT(zone <= U32_MAX);
174 :
175 0 : return (u32)zone;
176 : }
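/*
 * Worked example, assuming a hypothetical 256MiB zone size (shift == 28):
 * mirror 0 maps to zone 0, mirror 1 to zone 1 << (39 - 28) == 2048
 * (512GiB / 256MiB), and mirror 2 to zone 1 << (42 - 28) == 16384
 * (4TiB / 256MiB).
 */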
177 :
178 0 : static inline sector_t zone_start_sector(u32 zone_number,
179 : struct block_device *bdev)
180 : {
181 0 : return (sector_t)zone_number << ilog2(bdev_zone_sectors(bdev));
182 : }
183 :
184 : static inline u64 zone_start_physical(u32 zone_number,
185 : struct btrfs_zoned_device_info *zone_info)
186 : {
187 0 : return (u64)zone_number << zone_info->zone_size_shift;
188 : }
189 :
190 : /*
191 : * Emulate blkdev_report_zones() for a non-zoned device. It slices up the block
192 : * device into static-sized chunks and fakes a conventional zone on each of
193 : * them.
194 : */
195 0 : static int emulate_report_zones(struct btrfs_device *device, u64 pos,
196 : struct blk_zone *zones, unsigned int nr_zones)
197 : {
198 0 : const sector_t zone_sectors = device->fs_info->zone_size >> SECTOR_SHIFT;
199 0 : sector_t bdev_size = bdev_nr_sectors(device->bdev);
200 0 : unsigned int i;
201 :
202 0 : pos >>= SECTOR_SHIFT;
203 0 : for (i = 0; i < nr_zones; i++) {
204 0 : zones[i].start = i * zone_sectors + pos;
205 0 : zones[i].len = zone_sectors;
206 0 : zones[i].capacity = zone_sectors;
207 0 : zones[i].wp = zones[i].start + zone_sectors;
208 0 : zones[i].type = BLK_ZONE_TYPE_CONVENTIONAL;
209 0 : zones[i].cond = BLK_ZONE_COND_NOT_WP;
210 :
211 0 : if (zones[i].wp >= bdev_size) {
212 0 : i++;
213 0 : break;
214 : }
215 : }
216 :
217 0 : return i;
218 : }
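/*
 * Illustrative result, assuming a hypothetical 256MiB emulated zone size
 * (524288 sectors) and pos == 0: the first reported zone is
 * { .start = 0, .len = 524288, .capacity = 524288, .wp = 524288,
 *   .type = BLK_ZONE_TYPE_CONVENTIONAL, .cond = BLK_ZONE_COND_NOT_WP },
 * and the loop stops early once a zone's write pointer reaches the
 * device end.
 */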
219 :
220 0 : static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
221 : struct blk_zone *zones, unsigned int *nr_zones)
222 : {
223 0 : struct btrfs_zoned_device_info *zinfo = device->zone_info;
224 0 : int ret;
225 :
226 0 : if (!*nr_zones)
227 : return 0;
228 :
229 0 : if (!bdev_is_zoned(device->bdev)) {
230 0 : ret = emulate_report_zones(device, pos, zones, *nr_zones);
231 0 : *nr_zones = ret;
232 0 : return 0;
233 : }
234 :
235 : /* Check cache */
236 0 : if (zinfo->zone_cache) {
237 0 : unsigned int i;
238 0 : u32 zno;
239 :
240 0 : ASSERT(IS_ALIGNED(pos, zinfo->zone_size));
241 0 : zno = pos >> zinfo->zone_size_shift;
242 : /*
243 : * We cannot report zones beyond the last zone. So, it is OK to
244 : * cap *nr_zones at the end.
245 : */
246 0 : *nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno);
247 :
248 0 : for (i = 0; i < *nr_zones; i++) {
249 0 : struct blk_zone *zone_info;
250 :
251 0 : zone_info = &zinfo->zone_cache[zno + i];
252 0 : if (!zone_info->len)
253 : break;
254 : }
255 :
256 0 : if (i == *nr_zones) {
257 : /* Cache hit on all the zones */
258 0 : memcpy(zones, zinfo->zone_cache + zno,
259 : sizeof(*zinfo->zone_cache) * *nr_zones);
260 0 : return 0;
261 : }
262 : }
263 :
264 0 : ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
265 : copy_zone_info_cb, zones);
266 0 : if (ret < 0) {
267 0 : btrfs_err_in_rcu(device->fs_info,
268 : "zoned: failed to read zone %llu on %s (devid %llu)",
269 : pos, rcu_str_deref(device->name),
270 : device->devid);
271 0 : return ret;
272 : }
273 0 : *nr_zones = ret;
274 0 : if (!ret)
275 : return -EIO;
276 :
277 : /* Populate cache */
278 0 : if (zinfo->zone_cache) {
279 0 : u32 zno = pos >> zinfo->zone_size_shift;
280 :
281 0 : memcpy(zinfo->zone_cache + zno, zones,
282 : sizeof(*zinfo->zone_cache) * *nr_zones);
283 : }
284 :
285 : return 0;
286 : }
287 :
288 : /* The emulated zone size is determined from the size of the first device extent */
289 0 : static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
290 : {
291 0 : struct btrfs_path *path;
292 0 : struct btrfs_root *root = fs_info->dev_root;
293 0 : struct btrfs_key key;
294 0 : struct extent_buffer *leaf;
295 0 : struct btrfs_dev_extent *dext;
296 0 : int ret = 0;
297 :
298 0 : key.objectid = 1;
299 0 : key.type = BTRFS_DEV_EXTENT_KEY;
300 0 : key.offset = 0;
301 :
302 0 : path = btrfs_alloc_path();
303 0 : if (!path)
304 : return -ENOMEM;
305 :
306 0 : ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
307 0 : if (ret < 0)
308 0 : goto out;
309 :
310 0 : if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
311 0 : ret = btrfs_next_leaf(root, path);
312 0 : if (ret < 0)
313 0 : goto out;
314 : /* No dev extents at all? Not good */
315 0 : if (ret > 0) {
316 0 : ret = -EUCLEAN;
317 0 : goto out;
318 : }
319 : }
320 :
321 0 : leaf = path->nodes[0];
322 0 : dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
323 0 : fs_info->zone_size = btrfs_dev_extent_length(leaf, dext);
324 0 : ret = 0;
325 :
326 0 : out:
327 0 : btrfs_free_path(path);
328 :
329 0 : return ret;
330 : }
331 :
332 0 : int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
333 : {
334 0 : struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
335 0 : struct btrfs_device *device;
336 0 : int ret = 0;
337 :
338 : /* fs_info->zone_size might not be set yet. Use the incompat flag here. */
339 0 : if (!btrfs_fs_incompat(fs_info, ZONED))
340 : return 0;
341 :
342 0 : mutex_lock(&fs_devices->device_list_mutex);
343 0 : list_for_each_entry(device, &fs_devices->devices, dev_list) {
344 : /* We can skip reading of zone info for missing devices */
345 0 : if (!device->bdev)
346 0 : continue;
347 :
348 0 : ret = btrfs_get_dev_zone_info(device, true);
349 0 : if (ret)
350 : break;
351 : }
352 0 : mutex_unlock(&fs_devices->device_list_mutex);
353 :
354 0 : return ret;
355 : }
356 :
357 0 : int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
358 : {
359 0 : struct btrfs_fs_info *fs_info = device->fs_info;
360 0 : struct btrfs_zoned_device_info *zone_info = NULL;
361 0 : struct block_device *bdev = device->bdev;
362 0 : unsigned int max_active_zones;
363 0 : unsigned int nactive;
364 0 : sector_t nr_sectors;
365 0 : sector_t sector = 0;
366 0 : struct blk_zone *zones = NULL;
367 0 : unsigned int i, nreported = 0, nr_zones;
368 0 : sector_t zone_sectors;
369 0 : char *model, *emulated;
370 0 : int ret;
371 :
372 : /*
373 : * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not
374 : * yet be set.
375 : */
376 0 : if (!btrfs_fs_incompat(fs_info, ZONED))
377 : return 0;
378 :
379 0 : if (device->zone_info)
380 : return 0;
381 :
382 0 : zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL);
383 0 : if (!zone_info)
384 : return -ENOMEM;
385 :
386 0 : device->zone_info = zone_info;
387 :
388 0 : if (!bdev_is_zoned(bdev)) {
389 0 : if (!fs_info->zone_size) {
390 0 : ret = calculate_emulated_zone_size(fs_info);
391 0 : if (ret)
392 0 : goto out;
393 : }
394 :
395 0 : ASSERT(fs_info->zone_size);
396 0 : zone_sectors = fs_info->zone_size >> SECTOR_SHIFT;
397 : } else {
398 0 : zone_sectors = bdev_zone_sectors(bdev);
399 : }
400 :
401 0 : ASSERT(is_power_of_two_u64(zone_sectors));
402 0 : zone_info->zone_size = zone_sectors << SECTOR_SHIFT;
403 :
404 : /* We reject devices with a zone size larger than 8GiB */
405 0 : if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) {
406 0 : btrfs_err_in_rcu(fs_info,
407 : "zoned: %s: zone size %llu larger than supported maximum %llu",
408 : rcu_str_deref(device->name),
409 : zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
410 0 : ret = -EINVAL;
411 0 : goto out;
412 0 : } else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) {
413 0 : btrfs_err_in_rcu(fs_info,
414 : "zoned: %s: zone size %llu smaller than supported minimum %u",
415 : rcu_str_deref(device->name),
416 : zone_info->zone_size, BTRFS_MIN_ZONE_SIZE);
417 0 : ret = -EINVAL;
418 0 : goto out;
419 : }
420 :
421 0 : nr_sectors = bdev_nr_sectors(bdev);
422 0 : zone_info->zone_size_shift = ilog2(zone_info->zone_size);
423 0 : zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
424 0 : if (!IS_ALIGNED(nr_sectors, zone_sectors))
425 0 : zone_info->nr_zones++;
426 :
427 0 : max_active_zones = bdev_max_active_zones(bdev);
428 0 : if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
429 0 : btrfs_err_in_rcu(fs_info,
430 : "zoned: %s: max active zones %u is too small, need at least %u active zones",
431 : rcu_str_deref(device->name), max_active_zones,
432 : BTRFS_MIN_ACTIVE_ZONES);
433 0 : ret = -EINVAL;
434 0 : goto out;
435 : }
436 0 : zone_info->max_active_zones = max_active_zones;
437 :
438 0 : zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
439 0 : if (!zone_info->seq_zones) {
440 0 : ret = -ENOMEM;
441 0 : goto out;
442 : }
443 :
444 0 : zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
445 0 : if (!zone_info->empty_zones) {
446 0 : ret = -ENOMEM;
447 0 : goto out;
448 : }
449 :
450 0 : zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
451 0 : if (!zone_info->active_zones) {
452 0 : ret = -ENOMEM;
453 0 : goto out;
454 : }
455 :
456 0 : zones = kvcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
457 0 : if (!zones) {
458 0 : ret = -ENOMEM;
459 0 : goto out;
460 : }
461 :
462 : /*
463 : * Enable zone cache only for a zoned device. On a non-zoned device, we
464 : * fill the zone info with emulated CONVENTIONAL zones, so no need to
465 : * use the cache.
466 : */
467 0 : if (populate_cache && bdev_is_zoned(device->bdev)) {
468 0 : zone_info->zone_cache = vzalloc(sizeof(struct blk_zone) *
469 0 : zone_info->nr_zones);
470 0 : if (!zone_info->zone_cache) {
471 0 : btrfs_err_in_rcu(device->fs_info,
472 : "zoned: failed to allocate zone cache for %s",
473 : rcu_str_deref(device->name));
474 0 : ret = -ENOMEM;
475 0 : goto out;
476 : }
477 : }
478 :
479 : /* Get zone types */
480 : nactive = 0;
481 0 : while (sector < nr_sectors) {
482 0 : nr_zones = BTRFS_REPORT_NR_ZONES;
483 0 : ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,
484 : &nr_zones);
485 0 : if (ret)
486 0 : goto out;
487 :
488 0 : for (i = 0; i < nr_zones; i++) {
489 0 : if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
490 0 : __set_bit(nreported, zone_info->seq_zones);
491 0 : switch (zones[i].cond) {
492 0 : case BLK_ZONE_COND_EMPTY:
493 0 : __set_bit(nreported, zone_info->empty_zones);
494 : break;
495 0 : case BLK_ZONE_COND_IMP_OPEN:
496 : case BLK_ZONE_COND_EXP_OPEN:
497 : case BLK_ZONE_COND_CLOSED:
498 0 : __set_bit(nreported, zone_info->active_zones);
499 0 : nactive++;
500 0 : break;
501 : }
502 0 : nreported++;
503 : }
504 0 : sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
505 : }
506 :
507 0 : if (nreported != zone_info->nr_zones) {
508 0 : btrfs_err_in_rcu(device->fs_info,
509 : "inconsistent number of zones on %s (%u/%u)",
510 : rcu_str_deref(device->name), nreported,
511 : zone_info->nr_zones);
512 0 : ret = -EIO;
513 0 : goto out;
514 : }
515 :
516 0 : if (max_active_zones) {
517 0 : if (nactive > max_active_zones) {
518 0 : btrfs_err_in_rcu(device->fs_info,
519 : "zoned: %u active zones on %s exceeds max_active_zones %u",
520 : nactive, rcu_str_deref(device->name),
521 : max_active_zones);
522 0 : ret = -EIO;
523 0 : goto out;
524 : }
525 0 : atomic_set(&zone_info->active_zones_left,
526 0 : max_active_zones - nactive);
527 0 : set_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags);
528 : }
529 :
530 : /* Validate superblock log */
531 0 : nr_zones = BTRFS_NR_SB_LOG_ZONES;
532 0 : for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
533 0 : u32 sb_zone;
534 0 : u64 sb_wp;
535 0 : int sb_pos = BTRFS_NR_SB_LOG_ZONES * i;
536 :
537 0 : sb_zone = sb_zone_number(zone_info->zone_size_shift, i);
538 0 : if (sb_zone + 1 >= zone_info->nr_zones)
539 0 : continue;
540 :
541 0 : ret = btrfs_get_dev_zones(device,
542 : zone_start_physical(sb_zone, zone_info),
543 : &zone_info->sb_zones[sb_pos],
544 : &nr_zones);
545 0 : if (ret)
546 0 : goto out;
547 :
548 0 : if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
549 0 : btrfs_err_in_rcu(device->fs_info,
550 : "zoned: failed to read super block log zone info at devid %llu zone %u",
551 : device->devid, sb_zone);
552 0 : ret = -EUCLEAN;
553 0 : goto out;
554 : }
555 :
556 : /*
557 : * If zones[0] is conventional, always use the beginning of the
558 : * zone to record the superblock. No need to validate in that case.
559 : */
560 0 : if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type ==
561 : BLK_ZONE_TYPE_CONVENTIONAL)
562 0 : continue;
563 :
564 0 : ret = sb_write_pointer(device->bdev,
565 : &zone_info->sb_zones[sb_pos], &sb_wp);
566 0 : if (ret != -ENOENT && ret) {
567 0 : btrfs_err_in_rcu(device->fs_info,
568 : "zoned: super block log zone corrupted devid %llu zone %u",
569 : device->devid, sb_zone);
570 0 : ret = -EUCLEAN;
571 0 : goto out;
572 : }
573 : }
574 :
575 :
576 0 : kvfree(zones);
577 :
578 0 : switch (bdev_zoned_model(bdev)) {
579 : case BLK_ZONED_HM:
580 : model = "host-managed zoned";
581 : emulated = "";
582 : break;
583 : case BLK_ZONED_HA:
584 : model = "host-aware zoned";
585 : emulated = "";
586 : break;
587 : case BLK_ZONED_NONE:
588 : model = "regular";
589 : emulated = "emulated ";
590 : break;
591 : default:
592 : /* Just in case */
593 0 : btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s",
594 : bdev_zoned_model(bdev),
595 : rcu_str_deref(device->name));
596 0 : ret = -EOPNOTSUPP;
597 0 : goto out_free_zone_info;
598 : }
599 :
600 0 : btrfs_info_in_rcu(fs_info,
601 : "%s block device %s, %u %szones of %llu bytes",
602 : model, rcu_str_deref(device->name), zone_info->nr_zones,
603 : emulated, zone_info->zone_size);
604 :
605 0 : return 0;
606 :
607 0 : out:
608 0 : kvfree(zones);
609 0 : out_free_zone_info:
610 0 : btrfs_destroy_dev_zone_info(device);
611 :
612 0 : return ret;
613 : }
614 :
615 0 : void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
616 : {
617 0 : struct btrfs_zoned_device_info *zone_info = device->zone_info;
618 :
619 0 : if (!zone_info)
620 : return;
621 :
622 0 : bitmap_free(zone_info->active_zones);
623 0 : bitmap_free(zone_info->seq_zones);
624 0 : bitmap_free(zone_info->empty_zones);
625 0 : vfree(zone_info->zone_cache);
626 0 : kfree(zone_info);
627 0 : device->zone_info = NULL;
628 : }
629 :
630 0 : struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev)
631 : {
632 0 : struct btrfs_zoned_device_info *zone_info;
633 :
634 0 : zone_info = kmemdup(orig_dev->zone_info, sizeof(*zone_info), GFP_KERNEL);
635 0 : if (!zone_info)
636 : return NULL;
637 :
638 0 : zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
639 0 : if (!zone_info->seq_zones)
640 0 : goto out;
641 :
642 0 : bitmap_copy(zone_info->seq_zones, orig_dev->zone_info->seq_zones,
643 : zone_info->nr_zones);
644 :
645 0 : zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
646 0 : if (!zone_info->empty_zones)
647 0 : goto out;
648 :
649 0 : bitmap_copy(zone_info->empty_zones, orig_dev->zone_info->empty_zones,
650 : zone_info->nr_zones);
651 :
652 0 : zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
653 0 : if (!zone_info->active_zones)
654 0 : goto out;
655 :
656 0 : bitmap_copy(zone_info->active_zones, orig_dev->zone_info->active_zones,
657 : zone_info->nr_zones);
658 0 : zone_info->zone_cache = NULL;
659 :
660 0 : return zone_info;
661 :
662 0 : out:
663 0 : bitmap_free(zone_info->seq_zones);
664 0 : bitmap_free(zone_info->empty_zones);
665 0 : bitmap_free(zone_info->active_zones);
666 0 : kfree(zone_info);
667 0 : return NULL;
668 : }
669 :
670 0 : int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
671 : struct blk_zone *zone)
672 : {
673 0 : unsigned int nr_zones = 1;
674 0 : int ret;
675 :
676 0 : ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones);
677 0 : if (ret != 0 || !nr_zones)
678 0 : return ret ? ret : -EIO;
679 :
680 : return 0;
681 : }
682 :
683 0 : static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info)
684 : {
685 0 : struct btrfs_device *device;
686 :
687 0 : list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
688 0 : if (device->bdev &&
689 : bdev_zoned_model(device->bdev) == BLK_ZONED_HM) {
690 0 : btrfs_err(fs_info,
691 : "zoned: mode not enabled but zoned device found: %pg",
692 : device->bdev);
693 0 : return -EINVAL;
694 : }
695 : }
696 :
697 : return 0;
698 : }
699 :
700 0 : int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
701 : {
702 0 : struct queue_limits *lim = &fs_info->limits;
703 0 : struct btrfs_device *device;
704 0 : u64 zone_size = 0;
705 0 : int ret;
706 :
707 : /*
708 : * Host-Managed devices can't be used without the ZONED flag. With the
709 : * ZONED flag all devices can be used, using zone emulation if required.
710 : */
711 0 : if (!btrfs_fs_incompat(fs_info, ZONED))
712 0 : return btrfs_check_for_zoned_device(fs_info);
713 :
714 0 : blk_set_stacking_limits(lim);
715 :
716 0 : list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
717 0 : struct btrfs_zoned_device_info *zone_info = device->zone_info;
718 :
719 0 : if (!device->bdev)
720 0 : continue;
721 :
722 0 : if (!zone_size) {
723 0 : zone_size = zone_info->zone_size;
724 0 : } else if (zone_info->zone_size != zone_size) {
725 0 : btrfs_err(fs_info,
726 : "zoned: unequal block device zone sizes: have %llu found %llu",
727 : zone_info->zone_size, zone_size);
728 0 : return -EINVAL;
729 : }
730 :
731 : /*
732 : * With zoned emulation, we can have a non-zoned device in
733 : * zoned mode. In this case, we don't have a valid max zone
734 : * append size.
735 : */
736 0 : if (bdev_is_zoned(device->bdev)) {
737 0 : blk_stack_limits(lim,
738 : &bdev_get_queue(device->bdev)->limits,
739 : 0);
740 : }
741 : }
742 :
743 : /*
744 : * stripe_size is always aligned to BTRFS_STRIPE_LEN in
745 : * btrfs_create_chunk(). Since we want stripe_len == zone_size,
746 : * check the alignment here.
747 : */
748 0 : if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
749 0 : btrfs_err(fs_info,
750 : "zoned: zone size %llu not aligned to stripe %u",
751 : zone_size, BTRFS_STRIPE_LEN);
752 0 : return -EINVAL;
753 : }
754 :
755 0 : if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
756 0 : btrfs_err(fs_info, "zoned: mixed block groups not supported");
757 0 : return -EINVAL;
758 : }
759 :
760 0 : fs_info->zone_size = zone_size;
761 : /*
762 : * Also limit max_zone_append_size by max_segments * PAGE_SIZE.
763 : * Technically, we can have multiple pages per segment. But, since
764 : * we add the pages one by one to a bio, and cannot increase the
765 : * metadata reservation even if it increases the number of extents, it
766 : * is safe to stick with the limit.
767 : */
768 0 : fs_info->max_zone_append_size = ALIGN_DOWN(
769 : min3((u64)lim->max_zone_append_sectors << SECTOR_SHIFT,
770 : (u64)lim->max_sectors << SECTOR_SHIFT,
771 : (u64)lim->max_segments << PAGE_SHIFT),
772 : fs_info->sectorsize);
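	/*
	 * Illustrative numbers, assuming hypothetical device limits of
	 * max_zone_append_sectors == 2048 (1MiB), max_sectors == 2560
	 * (1.25MiB), max_segments == 128 with 4KiB pages, and a 4KiB
	 * sectorsize: min3() picks 128 * 4KiB == 512KiB, which is already
	 * sectorsize aligned, so max_zone_append_size becomes 512KiB.
	 */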
773 0 : fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
774 0 : if (fs_info->max_zone_append_size < fs_info->max_extent_size)
775 0 : fs_info->max_extent_size = fs_info->max_zone_append_size;
776 :
777 : /*
778 : * Check mount options here, because we might change fs_info->zoned
779 : * from fs_info->zone_size.
780 : */
781 0 : ret = btrfs_check_mountopts_zoned(fs_info);
782 0 : if (ret)
783 : return ret;
784 :
785 0 : btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);
786 0 : return 0;
787 : }
788 :
789 0 : int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
790 : {
791 0 : if (!btrfs_is_zoned(info))
792 : return 0;
793 :
794 : /*
795 : * Space cache writing is not COWed. Disable that to avoid write errors
796 : * in sequential zones.
797 : */
798 0 : if (btrfs_test_opt(info, SPACE_CACHE)) {
799 0 : btrfs_err(info, "zoned: space cache v1 is not supported");
800 0 : return -EINVAL;
801 : }
802 :
803 0 : if (btrfs_test_opt(info, NODATACOW)) {
804 0 : btrfs_err(info, "zoned: NODATACOW not supported");
805 0 : return -EINVAL;
806 : }
807 :
808 : return 0;
809 : }
810 :
811 0 : static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
812 : int rw, u64 *bytenr_ret)
813 : {
814 0 : u64 wp;
815 0 : int ret;
816 :
817 0 : if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) {
818 0 : *bytenr_ret = zones[0].start << SECTOR_SHIFT;
819 0 : return 0;
820 : }
821 :
822 0 : ret = sb_write_pointer(bdev, zones, &wp);
823 0 : if (ret != -ENOENT && ret < 0)
824 : return ret;
825 :
826 0 : if (rw == WRITE) {
827 0 : struct blk_zone *reset = NULL;
828 :
829 0 : if (wp == zones[0].start << SECTOR_SHIFT)
830 : reset = &zones[0];
831 0 : else if (wp == zones[1].start << SECTOR_SHIFT)
832 0 : reset = &zones[1];
833 :
834 0 : if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
835 0 : ASSERT(sb_zone_is_full(reset));
836 :
837 0 : ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
838 : reset->start, reset->len,
839 : GFP_NOFS);
840 0 : if (ret)
841 : return ret;
842 :
843 0 : reset->cond = BLK_ZONE_COND_EMPTY;
844 0 : reset->wp = reset->start;
845 : }
846 0 : } else if (ret != -ENOENT) {
847 : /*
848 : * For READ, we want the previous superblock. Move the write pointer
849 : * to the end of a zone if it is at the head of a zone.
850 : */
851 0 : u64 zone_end = 0;
852 :
853 0 : if (wp == zones[0].start << SECTOR_SHIFT)
854 0 : zone_end = zones[1].start + zones[1].capacity;
855 0 : else if (wp == zones[1].start << SECTOR_SHIFT)
856 0 : zone_end = zones[0].start + zones[0].capacity;
857 0 : if (zone_end)
858 0 : wp = ALIGN_DOWN(zone_end << SECTOR_SHIFT,
859 : BTRFS_SUPER_INFO_SIZE);
860 :
861 0 : wp -= BTRFS_SUPER_INFO_SIZE;
862 : }
863 :
864 0 : *bytenr_ret = wp;
865 0 : return 0;
866 :
867 : }
868 :
869 4 : int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
870 : u64 *bytenr_ret)
871 : {
872 4 : struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES];
873 4 : sector_t zone_sectors;
874 4 : u32 sb_zone;
875 4 : int ret;
876 4 : u8 zone_sectors_shift;
877 4 : sector_t nr_sectors;
878 4 : u32 nr_zones;
879 :
880 4 : if (!bdev_is_zoned(bdev)) {
881 4 : *bytenr_ret = btrfs_sb_offset(mirror);
882 4 : return 0;
883 : }
884 :
885 0 : ASSERT(rw == READ || rw == WRITE);
886 :
887 0 : zone_sectors = bdev_zone_sectors(bdev);
888 0 : if (!is_power_of_2(zone_sectors))
889 : return -EINVAL;
890 0 : zone_sectors_shift = ilog2(zone_sectors);
891 0 : nr_sectors = bdev_nr_sectors(bdev);
892 0 : nr_zones = nr_sectors >> zone_sectors_shift;
893 :
894 0 : sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
895 0 : if (sb_zone + 1 >= nr_zones)
896 : return -ENOENT;
897 :
898 0 : ret = blkdev_report_zones(bdev, zone_start_sector(sb_zone, bdev),
899 : BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
900 : zones);
901 0 : if (ret < 0)
902 : return ret;
903 0 : if (ret != BTRFS_NR_SB_LOG_ZONES)
904 : return -EIO;
905 :
906 0 : return sb_log_location(bdev, zones, rw, bytenr_ret);
907 : }
908 :
909 0 : int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
910 : u64 *bytenr_ret)
911 : {
912 0 : struct btrfs_zoned_device_info *zinfo = device->zone_info;
913 0 : u32 zone_num;
914 :
915 : /*
916 : * For a zoned filesystem on a non-zoned block device, use the same
917 : * super block locations as a regular filesystem. That way, the super
918 : * block can always be retrieved and the zoned flag of the volume
919 : * detected from the super block information.
920 : */
921 0 : if (!bdev_is_zoned(device->bdev)) {
922 0 : *bytenr_ret = btrfs_sb_offset(mirror);
923 0 : return 0;
924 : }
925 :
926 0 : zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
927 0 : if (zone_num + 1 >= zinfo->nr_zones)
928 : return -ENOENT;
929 :
930 0 : return sb_log_location(device->bdev,
931 0 : &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror],
932 : rw, bytenr_ret);
933 : }
934 :
935 0 : static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo,
936 : int mirror)
937 : {
938 0 : u32 zone_num;
939 :
940 0 : if (!zinfo)
941 : return false;
942 :
943 0 : zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
944 0 : if (zone_num + 1 >= zinfo->nr_zones)
945 : return false;
946 :
947 0 : if (!test_bit(zone_num, zinfo->seq_zones))
948 0 : return false;
949 :
950 : return true;
951 : }
952 :
953 0 : int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
954 : {
955 0 : struct btrfs_zoned_device_info *zinfo = device->zone_info;
956 0 : struct blk_zone *zone;
957 0 : int i;
958 :
959 0 : if (!is_sb_log_zone(zinfo, mirror))
960 : return 0;
961 :
962 0 : zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];
963 0 : for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
964 : /* Advance the next zone */
965 0 : if (zone->cond == BLK_ZONE_COND_FULL) {
966 0 : zone++;
967 0 : continue;
968 : }
969 :
970 0 : if (zone->cond == BLK_ZONE_COND_EMPTY)
971 0 : zone->cond = BLK_ZONE_COND_IMP_OPEN;
972 :
973 0 : zone->wp += SUPER_INFO_SECTORS;
974 :
975 0 : if (sb_zone_is_full(zone)) {
976 : /*
977 : * No room left to write a new superblock. Since
978 : * superblock is written with REQ_SYNC, it is safe to
979 : * finish the zone now.
980 : *
981 : * If the write pointer is exactly at the capacity,
982 : * explicit ZONE_FINISH is not necessary.
983 : */
984 0 : if (zone->wp != zone->start + zone->capacity) {
985 0 : int ret;
986 :
987 0 : ret = blkdev_zone_mgmt(device->bdev,
988 : REQ_OP_ZONE_FINISH, zone->start,
989 : zone->len, GFP_NOFS);
990 0 : if (ret)
991 : return ret;
992 : }
993 :
994 0 : zone->wp = zone->start + zone->len;
995 0 : zone->cond = BLK_ZONE_COND_FULL;
996 : }
997 : return 0;
998 : }
999 :
1000 : /* All the zones are FULL. Should not reach here. */
1001 : ASSERT(0);
1002 : return -EIO;
1003 : }
1004 :
1005 0 : int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
1006 : {
1007 0 : sector_t zone_sectors;
1008 0 : sector_t nr_sectors;
1009 0 : u8 zone_sectors_shift;
1010 0 : u32 sb_zone;
1011 0 : u32 nr_zones;
1012 :
1013 0 : zone_sectors = bdev_zone_sectors(bdev);
1014 0 : zone_sectors_shift = ilog2(zone_sectors);
1015 0 : nr_sectors = bdev_nr_sectors(bdev);
1016 0 : nr_zones = nr_sectors >> zone_sectors_shift;
1017 :
1018 0 : sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
1019 0 : if (sb_zone + 1 >= nr_zones)
1020 : return -ENOENT;
1021 :
1022 0 : return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
1023 : zone_start_sector(sb_zone, bdev),
1024 : zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
1025 : }
1026 :
1027 : /*
1028 : * Find allocatable zones within a given region.
1029 : *
1030 : * @device: the device to allocate a region on
1031 : * @hole_start: the position of the hole to allocate the region
1032 : * @num_bytes: size of wanted region
1033 : * @hole_end: the end of the hole
1034 : * @return: position of allocatable zones
1035 : *
1036 : * Allocatable region should not contain any superblock locations.
1037 : */
1038 0 : u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
1039 : u64 hole_end, u64 num_bytes)
1040 : {
1041 0 : struct btrfs_zoned_device_info *zinfo = device->zone_info;
1042 0 : const u8 shift = zinfo->zone_size_shift;
1043 0 : u64 nzones = num_bytes >> shift;
1044 0 : u64 pos = hole_start;
1045 0 : u64 begin, end;
1046 0 : bool have_sb;
1047 0 : int i;
1048 :
1049 0 : ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
1050 0 : ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));
1051 :
1052 0 : while (pos < hole_end) {
1053 0 : begin = pos >> shift;
1054 0 : end = begin + nzones;
1055 :
1056 0 : if (end > zinfo->nr_zones)
1057 : return hole_end;
1058 :
1059 : /* Check if zones in the region are all empty */
1060 0 : if (btrfs_dev_is_sequential(device, pos) &&
1061 0 : !bitmap_test_range_all_set(zinfo->empty_zones, begin, nzones)) {
1062 0 : pos += zinfo->zone_size;
1063 0 : continue;
1064 : }
1065 :
1066 : have_sb = false;
1067 0 : for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
1068 0 : u32 sb_zone;
1069 0 : u64 sb_pos;
1070 :
1071 0 : sb_zone = sb_zone_number(shift, i);
1072 0 : if (!(end <= sb_zone ||
1073 0 : sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) {
1074 0 : have_sb = true;
1075 0 : pos = zone_start_physical(
1076 : sb_zone + BTRFS_NR_SB_LOG_ZONES, zinfo);
1077 0 : break;
1078 : }
1079 :
1080 : /* We also need to exclude regular superblock positions */
1081 0 : sb_pos = btrfs_sb_offset(i);
1082 0 : if (!(pos + num_bytes <= sb_pos ||
1083 0 : sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) {
1084 0 : have_sb = true;
1085 0 : pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE,
1086 : zinfo->zone_size);
1087 0 : break;
1088 : }
1089 : }
1090 0 : if (!have_sb)
1091 : break;
1092 : }
1093 :
1094 : return pos;
1095 : }
1096 :
1097 0 : static bool btrfs_dev_set_active_zone(struct btrfs_device *device, u64 pos)
1098 : {
1099 0 : struct btrfs_zoned_device_info *zone_info = device->zone_info;
1100 0 : unsigned int zno = (pos >> zone_info->zone_size_shift);
1101 :
1102 : /* We can use any number of zones */
1103 0 : if (zone_info->max_active_zones == 0)
1104 : return true;
1105 :
1106 0 : if (!test_bit(zno, zone_info->active_zones)) {
1107 : /* Active zone left? */
1108 0 : if (atomic_dec_if_positive(&zone_info->active_zones_left) < 0)
1109 : return false;
1110 0 : if (test_and_set_bit(zno, zone_info->active_zones)) {
1111 : /* Someone already set the bit */
1112 0 : atomic_inc(&zone_info->active_zones_left);
1113 : }
1114 : }
1115 :
1116 : return true;
1117 : }
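/*
 * Illustrative behaviour, assuming hypothetical counts: with
 * max_active_zones == 12 and active_zones_left == 0, activating a zone
 * whose bit is not yet set fails (returns false); if the bit is already
 * set, the zone already counts as active and the call succeeds without
 * touching the counter.
 */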
1118 :
1119 0 : static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos)
1120 : {
1121 0 : struct btrfs_zoned_device_info *zone_info = device->zone_info;
1122 0 : unsigned int zno = (pos >> zone_info->zone_size_shift);
1123 :
1124 : /* We can use any number of zones */
1125 0 : if (zone_info->max_active_zones == 0)
1126 : return;
1127 :
1128 0 : if (test_and_clear_bit(zno, zone_info->active_zones))
1129 0 : atomic_inc(&zone_info->active_zones_left);
1130 : }
1131 :
1132 0 : int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
1133 : u64 length, u64 *bytes)
1134 : {
1135 0 : int ret;
1136 :
1137 0 : *bytes = 0;
1138 0 : ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
1139 : physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT,
1140 : GFP_NOFS);
1141 0 : if (ret)
1142 : return ret;
1143 :
1144 0 : *bytes = length;
1145 0 : while (length) {
1146 0 : btrfs_dev_set_zone_empty(device, physical);
1147 0 : btrfs_dev_clear_active_zone(device, physical);
1148 0 : physical += device->zone_info->zone_size;
1149 0 : length -= device->zone_info->zone_size;
1150 : }
1151 :
1152 : return 0;
1153 : }
1154 :
1155 0 : int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
1156 : {
1157 0 : struct btrfs_zoned_device_info *zinfo = device->zone_info;
1158 0 : const u8 shift = zinfo->zone_size_shift;
1159 0 : unsigned long begin = start >> shift;
1160 0 : unsigned long nbits = size >> shift;
1161 0 : u64 pos;
1162 0 : int ret;
1163 :
1164 0 : ASSERT(IS_ALIGNED(start, zinfo->zone_size));
1165 0 : ASSERT(IS_ALIGNED(size, zinfo->zone_size));
1166 :
1167 0 : if (begin + nbits > zinfo->nr_zones)
1168 : return -ERANGE;
1169 :
1170 : /* All the zones are conventional */
1171 0 : if (bitmap_test_range_all_zero(zinfo->seq_zones, begin, nbits))
1172 : return 0;
1173 :
1174 : /* All the zones are sequential and empty */
1175 0 : if (bitmap_test_range_all_set(zinfo->seq_zones, begin, nbits) &&
1176 0 : bitmap_test_range_all_set(zinfo->empty_zones, begin, nbits))
1177 : return 0;
1178 :
1179 0 : for (pos = start; pos < start + size; pos += zinfo->zone_size) {
1180 0 : u64 reset_bytes;
1181 :
1182 0 : if (!btrfs_dev_is_sequential(device, pos) ||
1183 0 : btrfs_dev_is_empty_zone(device, pos))
1184 0 : continue;
1185 :
1186 : /* Free regions should be empty */
1187 0 : btrfs_warn_in_rcu(
1188 : device->fs_info,
1189 : "zoned: resetting device %s (devid %llu) zone %llu for allocation",
1190 : rcu_str_deref(device->name), device->devid, pos >> shift);
1191 0 : WARN_ON_ONCE(1);
1192 :
1193 0 : ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,
1194 : &reset_bytes);
1195 0 : if (ret)
1196 0 : return ret;
1197 : }
1198 :
1199 : return 0;
1200 : }
1201 :
1202 : /*
1203 : * Calculate an allocation pointer from the extent allocation information
1204 : * for a block group consisting of conventional zones. The pointer is set
1205 : * to the end of the highest addressed extent in the block group, used as
1206 : * the allocation offset.
1207 : */
1208 0 : static int calculate_alloc_pointer(struct btrfs_block_group *cache,
1209 : u64 *offset_ret, bool new)
1210 : {
1211 0 : struct btrfs_fs_info *fs_info = cache->fs_info;
1212 0 : struct btrfs_root *root;
1213 0 : struct btrfs_path *path;
1214 0 : struct btrfs_key key;
1215 0 : struct btrfs_key found_key;
1216 0 : int ret;
1217 0 : u64 length;
1218 :
1219 : /*
1220 : * Avoid tree lookups for a new block group, there's no use for it.
1221 : * It must always be 0.
1222 : *
1223 : * Also, we have a lock chain of extent buffer lock -> chunk mutex.
1224 : * For a new block group, this function is called from
1225 : * btrfs_make_block_group() which is already taking the chunk mutex.
1226 : * Thus, we cannot call calculate_alloc_pointer() which takes extent
1227 : * buffer locks to avoid deadlock.
1228 : */
1229 0 : if (new) {
1230 0 : *offset_ret = 0;
1231 0 : return 0;
1232 : }
1233 :
1234 0 : path = btrfs_alloc_path();
1235 0 : if (!path)
1236 : return -ENOMEM;
1237 :
1238 0 : key.objectid = cache->start + cache->length;
1239 0 : key.type = 0;
1240 0 : key.offset = 0;
1241 :
1242 0 : root = btrfs_extent_root(fs_info, key.objectid);
1243 0 : ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1244 : /* We should not find the exact match */
1245 0 : if (!ret)
1246 : ret = -EUCLEAN;
1247 0 : if (ret < 0)
1248 0 : goto out;
1249 :
1250 0 : ret = btrfs_previous_extent_item(root, path, cache->start);
1251 0 : if (ret) {
1252 0 : if (ret == 1) {
1253 0 : ret = 0;
1254 0 : *offset_ret = 0;
1255 : }
1256 0 : goto out;
1257 : }
1258 :
1259 0 : btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
1260 :
1261 0 : if (found_key.type == BTRFS_EXTENT_ITEM_KEY)
1262 0 : length = found_key.offset;
1263 : else
1264 0 : length = fs_info->nodesize;
1265 :
1266 0 : if (!(found_key.objectid >= cache->start &&
1267 0 : found_key.objectid + length <= cache->start + cache->length)) {
1268 0 : ret = -EUCLEAN;
1269 0 : goto out;
1270 : }
1271 0 : *offset_ret = found_key.objectid + length - cache->start;
1272 0 : ret = 0;
1273 :
1274 0 : out:
1275 0 : btrfs_free_path(path);
1276 0 : return ret;
1277 : }
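/*
 * Illustrative example, assuming a hypothetical layout: for a block group
 * at start == 1GiB with length == 256MiB whose highest addressed item is a
 * 16KiB EXTENT_ITEM at 1GiB + 200MiB, the backwards search finds that item
 * and *offset_ret becomes (1GiB + 200MiB + 16KiB) - 1GiB == 200MiB + 16KiB.
 */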
1278 :
1279 0 : int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
1280 : {
1281 0 : struct btrfs_fs_info *fs_info = cache->fs_info;
1282 0 : struct extent_map_tree *em_tree = &fs_info->mapping_tree;
1283 0 : struct extent_map *em;
1284 0 : struct map_lookup *map;
1285 0 : struct btrfs_device *device;
1286 0 : u64 logical = cache->start;
1287 0 : u64 length = cache->length;
1288 0 : int ret;
1289 0 : int i;
1290 0 : unsigned int nofs_flag;
1291 0 : u64 *alloc_offsets = NULL;
1292 0 : u64 *caps = NULL;
1293 0 : u64 *physical = NULL;
1294 0 : unsigned long *active = NULL;
1295 0 : u64 last_alloc = 0;
1296 0 : u32 num_sequential = 0, num_conventional = 0;
1297 :
1298 0 : if (!btrfs_is_zoned(fs_info))
1299 : return 0;
1300 :
1301 : /* Sanity check */
1302 0 : if (!IS_ALIGNED(length, fs_info->zone_size)) {
1303 0 : btrfs_err(fs_info,
1304 : "zoned: block group %llu len %llu unaligned to zone size %llu",
1305 : logical, length, fs_info->zone_size);
1306 0 : return -EIO;
1307 : }
1308 :
1309 : /* Get the chunk mapping */
1310 0 : read_lock(&em_tree->lock);
1311 0 : em = lookup_extent_mapping(em_tree, logical, length);
1312 0 : read_unlock(&em_tree->lock);
1313 :
1314 0 : if (!em)
1315 : return -EINVAL;
1316 :
1317 0 : map = em->map_lookup;
1318 :
1319 0 : cache->physical_map = kmemdup(map, map_lookup_size(map->num_stripes), GFP_NOFS);
1320 0 : if (!cache->physical_map) {
1321 0 : ret = -ENOMEM;
1322 0 : goto out;
1323 : }
1324 :
1325 0 : alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS);
1326 0 : if (!alloc_offsets) {
1327 0 : ret = -ENOMEM;
1328 0 : goto out;
1329 : }
1330 :
1331 0 : caps = kcalloc(map->num_stripes, sizeof(*caps), GFP_NOFS);
1332 0 : if (!caps) {
1333 0 : ret = -ENOMEM;
1334 0 : goto out;
1335 : }
1336 :
1337 0 : physical = kcalloc(map->num_stripes, sizeof(*physical), GFP_NOFS);
1338 0 : if (!physical) {
1339 0 : ret = -ENOMEM;
1340 0 : goto out;
1341 : }
1342 :
1343 0 : active = bitmap_zalloc(map->num_stripes, GFP_NOFS);
1344 0 : if (!active) {
1345 0 : ret = -ENOMEM;
1346 0 : goto out;
1347 : }
1348 :
1349 0 : for (i = 0; i < map->num_stripes; i++) {
1350 0 : bool is_sequential;
1351 0 : struct blk_zone zone;
1352 0 : struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
1353 0 : int dev_replace_is_ongoing = 0;
1354 :
1355 0 : device = map->stripes[i].dev;
1356 0 : physical[i] = map->stripes[i].physical;
1357 :
1358 0 : if (device->bdev == NULL) {
1359 0 : alloc_offsets[i] = WP_MISSING_DEV;
1360 0 : continue;
1361 : }
1362 :
1363 0 : is_sequential = btrfs_dev_is_sequential(device, physical[i]);
1364 0 : if (is_sequential)
1365 0 : num_sequential++;
1366 : else
1367 0 : num_conventional++;
1368 :
1369 : /*
1370 : * Consider a zone as active if we can allow any number of
1371 : * active zones.
1372 : */
1373 0 : if (!device->zone_info->max_active_zones)
1374 0 : __set_bit(i, active);
1375 :
1376 0 : if (!is_sequential) {
1377 0 : alloc_offsets[i] = WP_CONVENTIONAL;
1378 0 : continue;
1379 : }
1380 :
1381 : /*
1382 : * This zone will be used for allocation, so mark this zone
1383 : * non-empty.
1384 : */
1385 0 : btrfs_dev_clear_zone_empty(device, physical[i]);
1386 :
1387 0 : down_read(&dev_replace->rwsem);
1388 0 : dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
1389 0 : if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
1390 0 : btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical[i]);
1391 0 : up_read(&dev_replace->rwsem);
1392 :
1393 : /*
1394 : * The group is mapped to a sequential zone. Get the zone write
1395 : * pointer to determine the allocation offset within the zone.
1396 : */
1397 0 : WARN_ON(!IS_ALIGNED(physical[i], fs_info->zone_size));
1398 0 : nofs_flag = memalloc_nofs_save();
1399 0 : ret = btrfs_get_dev_zone(device, physical[i], &zone);
1400 0 : memalloc_nofs_restore(nofs_flag);
1401 0 : if (ret == -EIO || ret == -EOPNOTSUPP) {
1402 0 : ret = 0;
1403 0 : alloc_offsets[i] = WP_MISSING_DEV;
1404 0 : continue;
1405 0 : } else if (ret) {
1406 0 : goto out;
1407 : }
1408 :
1409 0 : if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
1410 0 : btrfs_err_in_rcu(fs_info,
1411 : "zoned: unexpected conventional zone %llu on device %s (devid %llu)",
1412 : zone.start << SECTOR_SHIFT,
1413 : rcu_str_deref(device->name), device->devid);
1414 0 : ret = -EIO;
1415 0 : goto out;
1416 : }
1417 :
1418 0 : caps[i] = (zone.capacity << SECTOR_SHIFT);
1419 :
1420 0 : switch (zone.cond) {
1421 0 : case BLK_ZONE_COND_OFFLINE:
1422 : case BLK_ZONE_COND_READONLY:
1423 0 : btrfs_err(fs_info,
1424 : "zoned: offline/readonly zone %llu on device %s (devid %llu)",
1425 : physical[i] >> device->zone_info->zone_size_shift,
1426 : rcu_str_deref(device->name), device->devid);
1427 0 : alloc_offsets[i] = WP_MISSING_DEV;
1428 0 : break;
1429 0 : case BLK_ZONE_COND_EMPTY:
1430 0 : alloc_offsets[i] = 0;
1431 0 : break;
1432 0 : case BLK_ZONE_COND_FULL:
1433 0 : alloc_offsets[i] = caps[i];
1434 0 : break;
1435 0 : default:
1436 : /* Partially used zone */
1437 0 : alloc_offsets[i] =
1438 0 : ((zone.wp - zone.start) << SECTOR_SHIFT);
1439 0 : __set_bit(i, active);
1440 : break;
1441 : }
1442 : }
1443 :
1444 0 : if (num_sequential > 0)
1445 0 : set_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags);
1446 :
1447 0 : if (num_conventional > 0) {
1448 : /* Zone capacity is always zone size in emulation */
1449 0 : cache->zone_capacity = cache->length;
1450 0 : ret = calculate_alloc_pointer(cache, &last_alloc, new);
1451 0 : if (ret) {
1452 0 : btrfs_err(fs_info,
1453 : "zoned: failed to determine allocation offset of bg %llu",
1454 : cache->start);
1455 0 : goto out;
1456 0 : } else if (map->num_stripes == num_conventional) {
1457 0 : cache->alloc_offset = last_alloc;
1458 0 : set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
1459 0 : goto out;
1460 : }
1461 : }
1462 :
1463 0 : switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
1464 0 : case 0: /* single */
1465 0 : if (alloc_offsets[0] == WP_MISSING_DEV) {
1466 0 : btrfs_err(fs_info,
1467 : "zoned: cannot recover write pointer for zone %llu",
1468 : physical[0]);
1469 0 : ret = -EIO;
1470 0 : goto out;
1471 : }
1472 0 : cache->alloc_offset = alloc_offsets[0];
1473 0 : cache->zone_capacity = caps[0];
1474 0 : if (test_bit(0, active))
1475 0 : set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
1476 : break;
1477 0 : case BTRFS_BLOCK_GROUP_DUP:
1478 0 : if (map->type & BTRFS_BLOCK_GROUP_DATA) {
1479 0 : btrfs_err(fs_info, "zoned: profile DUP not yet supported on data bg");
1480 0 : ret = -EINVAL;
1481 0 : goto out;
1482 : }
1483 0 : if (alloc_offsets[0] == WP_MISSING_DEV) {
1484 0 : btrfs_err(fs_info,
1485 : "zoned: cannot recover write pointer for zone %llu",
1486 : physical[0]);
1487 0 : ret = -EIO;
1488 0 : goto out;
1489 : }
1490 0 : if (alloc_offsets[1] == WP_MISSING_DEV) {
1491 0 : btrfs_err(fs_info,
1492 : "zoned: cannot recover write pointer for zone %llu",
1493 : physical[1]);
1494 0 : ret = -EIO;
1495 0 : goto out;
1496 : }
1497 0 : if (alloc_offsets[0] != alloc_offsets[1]) {
1498 0 : btrfs_err(fs_info,
1499 : "zoned: write pointer offset mismatch of zones in DUP profile");
1500 0 : ret = -EIO;
1501 0 : goto out;
1502 : }
1503 0 : if (test_bit(0, active) != test_bit(1, active)) {
1504 0 : if (!btrfs_zone_activate(cache)) {
1505 0 : ret = -EIO;
1506 0 : goto out;
1507 : }
1508 : } else {
1509 0 : if (test_bit(0, active))
1510 0 : set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
1511 0 : &cache->runtime_flags);
1512 : }
1513 0 : cache->alloc_offset = alloc_offsets[0];
1514 0 : cache->zone_capacity = min(caps[0], caps[1]);
1515 0 : break;
1516 0 : case BTRFS_BLOCK_GROUP_RAID1:
1517 : case BTRFS_BLOCK_GROUP_RAID0:
1518 : case BTRFS_BLOCK_GROUP_RAID10:
1519 : case BTRFS_BLOCK_GROUP_RAID5:
1520 : case BTRFS_BLOCK_GROUP_RAID6:
1521 : /* non-single profiles are not supported yet */
1522 : default:
1523 0 : btrfs_err(fs_info, "zoned: profile %s not yet supported",
1524 : btrfs_bg_type_to_raid_name(map->type));
1525 0 : ret = -EINVAL;
1526 0 : goto out;
1527 : }
1528 :
1529 0 : out:
1530 0 : if (cache->alloc_offset > fs_info->zone_size) {
1531 0 : btrfs_err(fs_info,
1532 : "zoned: invalid write pointer %llu in block group %llu",
1533 : cache->alloc_offset, cache->start);
1534 0 : ret = -EIO;
1535 : }
1536 :
1537 0 : if (cache->alloc_offset > cache->zone_capacity) {
1538 0 : btrfs_err(fs_info,
1539 : "zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
1540 : cache->alloc_offset, cache->zone_capacity,
1541 : cache->start);
1542 0 : ret = -EIO;
1543 : }
1544 :
1545 : /* An extent is allocated after the write pointer */
1546 0 : if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
1547 0 : btrfs_err(fs_info,
1548 : "zoned: got wrong write pointer in BG %llu: %llu > %llu",
1549 : logical, last_alloc, cache->alloc_offset);
1550 0 : ret = -EIO;
1551 : }
1552 :
1553 0 : if (!ret) {
1554 0 : cache->meta_write_pointer = cache->alloc_offset + cache->start;
1555 0 : if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags)) {
1556 0 : btrfs_get_block_group(cache);
1557 0 : spin_lock(&fs_info->zone_active_bgs_lock);
1558 0 : list_add_tail(&cache->active_bg_list,
1559 : &fs_info->zone_active_bgs);
1560 0 : spin_unlock(&fs_info->zone_active_bgs_lock);
1561 : }
1562 : } else {
1563 0 : kfree(cache->physical_map);
1564 0 : cache->physical_map = NULL;
1565 : }
1566 0 : bitmap_free(active);
1567 0 : kfree(physical);
1568 0 : kfree(caps);
1569 0 : kfree(alloc_offsets);
1570 0 : free_extent_map(em);
1571 :
1572 0 : return ret;
1573 : }
1574 :
1575 0 : void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
1576 : {
1577 0 : u64 unusable, free;
1578 :
1579 0 : if (!btrfs_is_zoned(cache->fs_info))
1580 : return;
1581 :
1582 0 : WARN_ON(cache->bytes_super != 0);
1583 :
1584 : /* Check for block groups that were never activated */
1585 0 : if (test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &cache->fs_info->flags) &&
1586 0 : cache->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM) &&
1587 0 : !test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags) &&
1588 0 : cache->alloc_offset == 0) {
1589 0 : unusable = cache->length;
1590 0 : free = 0;
1591 : } else {
1592 0 : unusable = (cache->alloc_offset - cache->used) +
1593 0 : (cache->length - cache->zone_capacity);
1594 0 : free = cache->zone_capacity - cache->alloc_offset;
1595 : }
1596 :
1597 : /* We only need ->free_space in ALLOC_SEQ block groups */
1598 0 : cache->cached = BTRFS_CACHE_FINISHED;
1599 0 : cache->free_space_ctl->free_space = free;
1600 0 : cache->zone_unusable = unusable;
1601 : }
1602 :
1603 0 : void btrfs_redirty_list_add(struct btrfs_transaction *trans,
1604 : struct extent_buffer *eb)
1605 : {
1606 0 : if (!btrfs_is_zoned(eb->fs_info) ||
1607 : btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN))
1608 : return;
1609 :
1610 0 : ASSERT(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
1611 :
1612 0 : memzero_extent_buffer(eb, 0, eb->len);
1613 0 : set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags);
1614 0 : set_extent_buffer_dirty(eb);
1615 0 : set_extent_bit(&trans->dirty_pages, eb->start, eb->start + eb->len - 1,
1616 : EXTENT_DIRTY | EXTENT_NOWAIT, NULL);
1617 : }
1618 :
1619 0 : bool btrfs_use_zone_append(struct btrfs_bio *bbio)
1620 : {
1621 0 : u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT);
1622 0 : struct btrfs_inode *inode = bbio->inode;
1623 0 : struct btrfs_fs_info *fs_info = bbio->fs_info;
1624 0 : struct btrfs_block_group *cache;
1625 0 : bool ret = false;
1626 :
1627 0 : if (!btrfs_is_zoned(fs_info))
1628 : return false;
1629 :
1630 0 : if (!inode || !is_data_inode(&inode->vfs_inode))
1631 : return false;
1632 :
1633 0 : if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE)
1634 : return false;
1635 :
1636 : /*
1637 : * Using REQ_OP_ZONE_APPEND for relocation can break assumptions on the
1638 : * extent layout the relocation code has.
1639 : * Furthermore we have set aside our own block group from which only the
1640 : * relocation "process" can allocate and make sure only one process at a
1641 : * time can add pages to an extent that gets relocated, so it's safe to
1642 : * use regular REQ_OP_WRITE for this special case.
1643 : */
1644 0 : if (btrfs_is_data_reloc_root(inode->root))
1645 : return false;
1646 :
1647 0 : cache = btrfs_lookup_block_group(fs_info, start);
1648 0 : ASSERT(cache);
1649 0 : if (!cache)
1650 : return false;
1651 :
1652 0 : ret = !!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags);
1653 0 : btrfs_put_block_group(cache);
1654 :
1655 0 : return ret;
1656 : }
1657 :
1658 0 : void btrfs_record_physical_zoned(struct btrfs_bio *bbio)
1659 : {
1660 0 : const u64 physical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
1661 0 : struct btrfs_ordered_sum *sum = bbio->sums;
1662 :
1663 0 : if (physical < bbio->orig_physical)
1664 0 : sum->logical -= bbio->orig_physical - physical;
1665 : else
1666 0 : sum->logical += physical - bbio->orig_physical;
1667 0 : }
1668 :
1669 0 : static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered,
1670 : u64 logical)
1671 : {
1672 0 : struct extent_map_tree *em_tree = &BTRFS_I(ordered->inode)->extent_tree;
1673 0 : struct extent_map *em;
1674 :
1675 0 : ordered->disk_bytenr = logical;
1676 :
1677 0 : write_lock(&em_tree->lock);
1678 0 : em = search_extent_mapping(em_tree, ordered->file_offset,
1679 : ordered->num_bytes);
1680 0 : em->block_start = logical;
1681 0 : free_extent_map(em);
1682 0 : write_unlock(&em_tree->lock);
1683 0 : }
1684 :
1685 0 : static bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent *ordered,
1686 : u64 logical, u64 len)
1687 : {
1688 0 : struct btrfs_ordered_extent *new;
1689 :
1690 0 : if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
1691 0 : split_extent_map(BTRFS_I(ordered->inode), ordered->file_offset,
1692 : ordered->num_bytes, len, logical))
1693 : return false;
1694 :
1695 0 : new = btrfs_split_ordered_extent(ordered, len);
1696 0 : if (IS_ERR(new))
1697 : return false;
1698 0 : new->disk_bytenr = logical;
1699 0 : btrfs_finish_one_ordered(new);
1700 0 : return true;
1701 : }
1702 :
1703 0 : void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered)
1704 : {
1705 0 : struct btrfs_inode *inode = BTRFS_I(ordered->inode);
1706 0 : struct btrfs_fs_info *fs_info = inode->root->fs_info;
1707 0 : struct btrfs_ordered_sum *sum =
1708 0 : list_first_entry(&ordered->list, typeof(*sum), list);
1709 0 : u64 logical = sum->logical;
1710 0 : u64 len = sum->len;
1711 :
1712 0 : while (len < ordered->disk_num_bytes) {
1713 0 : sum = list_next_entry(sum, list);
1714 0 : if (sum->logical == logical + len) {
1715 0 : len += sum->len;
1716 0 : continue;
1717 : }
1718 0 : if (!btrfs_zoned_split_ordered(ordered, logical, len)) {
1719 0 : set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
1720 0 : btrfs_err(fs_info, "failed to split ordered extent");
1721 0 : goto out;
1722 : }
1723 0 : logical = sum->logical;
1724 0 : len = sum->len;
1725 : }
1726 :
1727 0 : if (ordered->disk_bytenr != logical)
1728 0 : btrfs_rewrite_logical_zoned(ordered, logical);
1729 :
1730 0 : out:
1731 : /*
1732 : * If we end up here for nodatasum I/O, the btrfs_ordered_sum structures
1733 : * were allocated by btrfs_alloc_dummy_sum only to record the logical
1734 : * addresses and don't contain actual checksums. We thus must free them
1735 : * here so that we don't attempt to log the csums later.
1736 : */
1737 0 : if ((inode->flags & BTRFS_INODE_NODATASUM) ||
1738 0 : test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) {
1739 0 : while ((sum = list_first_entry_or_null(&ordered->list,
1740 : typeof(*sum), list))) {
1741 0 : list_del(&sum->list);
1742 0 : kfree(sum);
1743 : }
1744 : }
1745 0 : }
1746 :
1747 0 : bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
1748 : struct extent_buffer *eb,
1749 : struct btrfs_block_group **cache_ret)
1750 : {
1751 0 : struct btrfs_block_group *cache;
1752 0 : bool ret = true;
1753 :
1754 0 : if (!btrfs_is_zoned(fs_info))
1755 : return true;
1756 :
1757 0 : cache = btrfs_lookup_block_group(fs_info, eb->start);
1758 0 : if (!cache)
1759 : return true;
1760 :
1761 0 : if (cache->meta_write_pointer != eb->start) {
1762 0 : btrfs_put_block_group(cache);
1763 0 : cache = NULL;
1764 0 : ret = false;
1765 : } else {
1766 0 : cache->meta_write_pointer = eb->start + eb->len;
1767 : }
1768 :
1769 0 : *cache_ret = cache;
1770 :
1771 0 : return ret;
1772 : }
1773 :
1774 0 : void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache,
1775 : struct extent_buffer *eb)
1776 : {
1777 0 : if (!btrfs_is_zoned(eb->fs_info) || !cache)
1778 : return;
1779 :
1780 0 : ASSERT(cache->meta_write_pointer == eb->start + eb->len);
1781 0 : cache->meta_write_pointer = eb->start;
1782 : }
1783 :
1784 0 : int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length)
1785 : {
1786 0 : if (!btrfs_dev_is_sequential(device, physical))
1787 : return -EOPNOTSUPP;
1788 :
1789 0 : return blkdev_issue_zeroout(device->bdev, physical >> SECTOR_SHIFT,
1790 : length >> SECTOR_SHIFT, GFP_NOFS, 0);
1791 : }
1792 :
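/*
 * Report the zone containing @logical, using the first present and working
 * mirror device.  RAID5/6 is not supported on zoned filesystems.
 */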
1793 0 : static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
1794 : struct blk_zone *zone)
1795 : {
1796 0 : struct btrfs_io_context *bioc = NULL;
1797 0 : u64 mapped_length = PAGE_SIZE;
1798 0 : unsigned int nofs_flag;
1799 0 : int nmirrors;
1800 0 : int i, ret;
1801 :
1802 0 : ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
1803 : &mapped_length, &bioc, NULL, NULL, 1);
1804 0 : if (ret || !bioc || mapped_length < PAGE_SIZE) {
1805 0 : ret = -EIO;
1806 0 : goto out_put_bioc;
1807 : }
1808 :
1809 0 : if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1810 0 : ret = -EINVAL;
1811 0 : goto out_put_bioc;
1812 : }
1813 :
1814 0 : nofs_flag = memalloc_nofs_save();
1815 0 : nmirrors = (int)bioc->num_stripes;
1816 0 : for (i = 0; i < nmirrors; i++) {
1817 0 : u64 physical = bioc->stripes[i].physical;
1818 0 : struct btrfs_device *dev = bioc->stripes[i].dev;
1819 :
1820 : /* Missing device */
1821 0 : if (!dev->bdev)
1822 0 : continue;
1823 :
1824 0 : ret = btrfs_get_dev_zone(dev, physical, zone);
1825 : /* Failing device */
1826 0 : if (ret == -EIO || ret == -EOPNOTSUPP)
1827 0 : continue;
1828 : break;
1829 : }
1830 0 : memalloc_nofs_restore(nofs_flag);
1831 0 : out_put_bioc:
1832 0 : btrfs_put_bioc(bioc);
1833 0 : return ret;
1834 : }
1835 :
1836 : /*
1837 : * Synchronize the write pointer of the zone at @physical_start on @tgt_dev
1838 : * by writing zeroes from @physical_pos up to the write pointer of the
1839 : * dev-replace source device.
1840 : */
1841 0 : int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
1842 : u64 physical_start, u64 physical_pos)
1843 : {
1844 0 : struct btrfs_fs_info *fs_info = tgt_dev->fs_info;
1845 0 : struct blk_zone zone;
1846 0 : u64 length;
1847 0 : u64 wp;
1848 0 : int ret;
1849 :
1850 0 : if (!btrfs_dev_is_sequential(tgt_dev, physical_pos))
1851 : return 0;
1852 :
1853 0 : ret = read_zone_info(fs_info, logical, &zone);
1854 0 : if (ret)
1855 : return ret;
1856 :
1857 0 : wp = physical_start + ((zone.wp - zone.start) << SECTOR_SHIFT);
1858 :
1859 0 : if (physical_pos == wp)
1860 : return 0;
1861 :
1862 0 : if (physical_pos > wp)
1863 : return -EUCLEAN;
1864 :
1865 0 : length = wp - physical_pos;
1866 0 : return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length);
1867 : }
1868 :
1869 : /*
1870 : * Activate block group and underlying device zones
1871 : *
1872 : * @block_group: the block group to activate
1873 : *
1874 : * Return: true on success, false otherwise
1875 : */
1876 0 : bool btrfs_zone_activate(struct btrfs_block_group *block_group)
1877 : {
1878 0 : struct btrfs_fs_info *fs_info = block_group->fs_info;
1879 0 : struct btrfs_space_info *space_info = block_group->space_info;
1880 0 : struct map_lookup *map;
1881 0 : struct btrfs_device *device;
1882 0 : u64 physical;
1883 0 : bool ret;
1884 0 : int i;
1885 :
1886 0 : if (!btrfs_is_zoned(block_group->fs_info))
1887 : return true;
1888 :
1889 0 : map = block_group->physical_map;
1890 :
1891 0 : spin_lock(&space_info->lock);
1892 0 : spin_lock(&block_group->lock);
1893 0 : if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
1894 0 : ret = true;
1895 0 : goto out_unlock;
1896 : }
1897 :
1898 : /* No space left */
1899 0 : if (btrfs_zoned_bg_is_full(block_group)) {
1900 0 : ret = false;
1901 0 : goto out_unlock;
1902 : }
1903 :
1904 0 : for (i = 0; i < map->num_stripes; i++) {
1905 0 : device = map->stripes[i].dev;
1906 0 : physical = map->stripes[i].physical;
1907 :
1908 0 : if (device->zone_info->max_active_zones == 0)
1909 0 : continue;
1910 :
1911 0 : if (!btrfs_dev_set_active_zone(device, physical)) {
1912 : /* Cannot activate the zone */
1913 0 : ret = false;
1914 0 : goto out_unlock;
1915 : }
1916 : }
1917 :
1918 : /* Successfully activated all the zones */
1919 0 : set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
1920 0 : WARN_ON(block_group->alloc_offset != 0);
1921 0 : if (block_group->zone_unusable == block_group->length) {
1922 0 : block_group->zone_unusable = block_group->length - block_group->zone_capacity;
1923 0 : space_info->bytes_zone_unusable -= block_group->zone_capacity;
1924 : }
1925 0 : spin_unlock(&block_group->lock);
1926 0 : btrfs_try_granting_tickets(fs_info, space_info);
1927 0 : spin_unlock(&space_info->lock);
1928 :
1929 : /* For the active block group list */
1930 0 : btrfs_get_block_group(block_group);
1931 :
1932 0 : spin_lock(&fs_info->zone_active_bgs_lock);
1933 0 : list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs);
1934 0 : spin_unlock(&fs_info->zone_active_bgs_lock);
1935 :
1936 0 : return true;
1937 :
1938 0 : out_unlock:
1939 0 : spin_unlock(&block_group->lock);
1940 0 : spin_unlock(&space_info->lock);
1941 0 : return ret;
1942 : }
1943 :
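/*
 * Wait for writeback of every extent buffer that lives inside @block_group,
 * so that finishing the underlying zones cannot race with metadata writes.
 */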
1944 0 : static void wait_eb_writebacks(struct btrfs_block_group *block_group)
1945 : {
1946 0 : struct btrfs_fs_info *fs_info = block_group->fs_info;
1947 0 : const u64 end = block_group->start + block_group->length;
1948 0 : struct radix_tree_iter iter;
1949 0 : struct extent_buffer *eb;
1950 0 : void __rcu **slot;
1951 :
1952 0 : rcu_read_lock();
1953 0 : radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter,
1954 : block_group->start >> fs_info->sectorsize_bits) {
1955 0 : eb = radix_tree_deref_slot(slot);
1956 0 : if (!eb)
1957 0 : continue;
1958 0 : if (radix_tree_deref_retry(eb)) {
1959 0 : slot = radix_tree_iter_retry(&iter);
1960 0 : continue;
1961 : }
1962 :
1963 0 : if (eb->start < block_group->start)
1964 0 : continue;
1965 0 : if (eb->start >= end)
1966 : break;
1967 :
1968 0 : slot = radix_tree_iter_resume(slot, &iter);
1969 0 : rcu_read_unlock();
1970 0 : wait_on_extent_buffer_writeback(eb);
1971 0 : rcu_read_lock();
1972 : }
1973 0 : rcu_read_unlock();
1974 0 : }
1975 :
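/*
 * Deactivate @block_group and finish its underlying device zones.
 *
 * Unless @fully_written is set, first mark the block group read-only and wait
 * for outstanding reservations, ordered extents and metadata writeback.  Then
 * clear the active flag, account the unwritten tail of the zone as used,
 * issue REQ_OP_ZONE_FINISH on the underlying device zones and drop the block
 * group from the active list.
 */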
1976 0 : static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
1977 : {
1978 0 : struct btrfs_fs_info *fs_info = block_group->fs_info;
1979 0 : struct map_lookup *map;
1980 0 : const bool is_metadata = (block_group->flags &
1981 : (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM));
1982 0 : int ret = 0;
1983 0 : int i;
1984 :
1985 0 : spin_lock(&block_group->lock);
1986 0 : if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
1987 0 : spin_unlock(&block_group->lock);
1988 0 : return 0;
1989 : }
1990 :
1991 : /* Check if we have unwritten allocated space */
1992 0 : if (is_metadata &&
1993 0 : block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) {
1994 0 : spin_unlock(&block_group->lock);
1995 0 : return -EAGAIN;
1996 : }
1997 :
1998 : /*
1999 : * If we are sure that the block group is full (= no more room is left
2000 : * for new allocation) and the IO for the last usable block has completed,
2001 : * we don't need to wait for the other IOs. This holds because we ensure
2002 : * sequential IO submission using the ZONE_APPEND command for data and
2003 : * block_group->meta_write_pointer for metadata.
2004 : */
2005 0 : if (!fully_written) {
2006 0 : spin_unlock(&block_group->lock);
2007 :
2008 0 : ret = btrfs_inc_block_group_ro(block_group, false);
2009 0 : if (ret)
2010 : return ret;
2011 :
2012 : /* Ensure all writes in this block group finish */
2013 0 : btrfs_wait_block_group_reservations(block_group);
2014 : /* No need to wait for NOCOW writers; zoned mode does not allow them. */
2015 0 : btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
2016 : block_group->length);
2017 : /* Wait for extent buffers to be written. */
2018 0 : if (is_metadata)
2019 0 : wait_eb_writebacks(block_group);
2020 :
2021 0 : spin_lock(&block_group->lock);
2022 :
2023 : /*
2024 : * Bail out if someone already deactivated the block group, or
2025 : * allocated space is left in the block group.
2026 : */
2027 0 : if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
2028 : &block_group->runtime_flags)) {
2029 0 : spin_unlock(&block_group->lock);
2030 0 : btrfs_dec_block_group_ro(block_group);
2031 0 : return 0;
2032 : }
2033 :
2034 0 : if (block_group->reserved) {
2035 0 : spin_unlock(&block_group->lock);
2036 0 : btrfs_dec_block_group_ro(block_group);
2037 0 : return -EAGAIN;
2038 : }
2039 : }
2040 :
2041 0 : clear_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
2042 0 : block_group->alloc_offset = block_group->zone_capacity;
2043 0 : block_group->free_space_ctl->free_space = 0;
2044 0 : btrfs_clear_treelog_bg(block_group);
2045 0 : btrfs_clear_data_reloc_bg(block_group);
2046 0 : spin_unlock(&block_group->lock);
2047 :
2048 0 : map = block_group->physical_map;
2049 0 : for (i = 0; i < map->num_stripes; i++) {
2050 0 : struct btrfs_device *device = map->stripes[i].dev;
2051 0 : const u64 physical = map->stripes[i].physical;
2052 :
2053 0 : if (device->zone_info->max_active_zones == 0)
2054 0 : continue;
2055 :
2056 0 : ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
2057 : physical >> SECTOR_SHIFT,
2058 0 : device->zone_info->zone_size >> SECTOR_SHIFT,
2059 : GFP_NOFS);
2060 :
2061 0 : if (ret)
2062 0 : return ret;
2063 :
2064 0 : btrfs_dev_clear_active_zone(device, physical);
2065 : }
2066 :
2067 0 : if (!fully_written)
2068 0 : btrfs_dec_block_group_ro(block_group);
2069 :
2070 0 : spin_lock(&fs_info->zone_active_bgs_lock);
2071 0 : ASSERT(!list_empty(&block_group->active_bg_list));
2072 0 : list_del_init(&block_group->active_bg_list);
2073 0 : spin_unlock(&fs_info->zone_active_bgs_lock);
2074 :
2075 : /* For active_bg_list */
2076 0 : btrfs_put_block_group(block_group);
2077 :
2078 0 : clear_and_wake_up_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);
2079 :
2080 0 : return 0;
2081 : }
2082 :
2083 0 : int btrfs_zone_finish(struct btrfs_block_group *block_group)
2084 : {
2085 0 : if (!btrfs_is_zoned(block_group->fs_info))
2086 : return 0;
2087 :
2088 0 : return do_zone_finish(block_group, false);
2089 : }
2090 :
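/*
 * Return true if at least one device still has an unused active zone for a
 * new block group with profile @flags.  If not, set BTRFS_FS_NEED_ZONE_FINISH
 * so that an active block group gets finished before retrying.
 */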
2091 0 : bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
2092 : {
2093 0 : struct btrfs_fs_info *fs_info = fs_devices->fs_info;
2094 0 : struct btrfs_device *device;
2095 0 : bool ret = false;
2096 :
2097 0 : if (!btrfs_is_zoned(fs_info))
2098 : return true;
2099 :
2100 : /* Check if there is a device with active zones left */
2101 0 : mutex_lock(&fs_info->chunk_mutex);
2102 0 : list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
2103 0 : struct btrfs_zoned_device_info *zinfo = device->zone_info;
2104 :
2105 0 : if (!device->bdev)
2106 0 : continue;
2107 :
2108 0 : if (!zinfo->max_active_zones) {
2109 : ret = true;
2110 : break;
2111 : }
2112 :
2113 0 : switch (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
2114 0 : case 0: /* single */
2115 0 : ret = (atomic_read(&zinfo->active_zones_left) >= 1);
2116 0 : break;
2117 0 : case BTRFS_BLOCK_GROUP_DUP:
2118 0 : ret = (atomic_read(&zinfo->active_zones_left) >= 2);
2119 0 : break;
2120 : }
2121 0 : if (ret)
2122 : break;
2123 : }
2124 0 : mutex_unlock(&fs_info->chunk_mutex);
2125 :
2126 0 : if (!ret)
2127 0 : set_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);
2128 :
2129 : return ret;
2130 : }
2131 :
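/*
 * Called on write completion.  If the space remaining after the write at
 * @logical + @length is too small for even a minimal allocation, finish the
 * block group right away instead of leaving it active.
 */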
2132 0 : void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
2133 : {
2134 0 : struct btrfs_block_group *block_group;
2135 0 : u64 min_alloc_bytes;
2136 :
2137 0 : if (!btrfs_is_zoned(fs_info))
2138 : return;
2139 :
2140 0 : block_group = btrfs_lookup_block_group(fs_info, logical);
2141 0 : ASSERT(block_group);
2142 :
2143 : /* No MIXED_BG on zoned btrfs. */
2144 0 : if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
2145 0 : min_alloc_bytes = fs_info->sectorsize;
2146 : else
2147 0 : min_alloc_bytes = fs_info->nodesize;
2148 :
2149 : /* Bail out if we can allocate more data from this block group. */
2150 0 : if (logical + length + min_alloc_bytes <=
2151 0 : block_group->start + block_group->zone_capacity)
2152 0 : goto out;
2153 :
2154 0 : do_zone_finish(block_group, true);
2155 :
2156 0 : out:
2157 0 : btrfs_put_block_group(block_group);
2158 : }
2159 :
2160 0 : static void btrfs_zone_finish_endio_workfn(struct work_struct *work)
2161 : {
2162 0 : struct btrfs_block_group *bg =
2163 0 : container_of(work, struct btrfs_block_group, zone_finish_work);
2164 :
2165 0 : wait_on_extent_buffer_writeback(bg->last_eb);
2166 0 : free_extent_buffer(bg->last_eb);
2167 0 : btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length);
2168 0 : btrfs_put_block_group(bg);
2169 0 : }
2170 :
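/*
 * Schedule zone finishing of @bg from a context that must not block: if @eb
 * is one of the last tree blocks fitting into the zone, take references on
 * the block group and on @eb and defer the actual work to a workqueue.
 */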
2171 0 : void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
2172 : struct extent_buffer *eb)
2173 : {
2174 0 : if (!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &bg->runtime_flags) ||
2175 0 : eb->start + eb->len * 2 <= bg->start + bg->zone_capacity)
2176 : return;
2177 :
2178 0 : if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) {
2179 0 : btrfs_err(bg->fs_info, "double scheduling of bg %llu zone finishing",
2180 : bg->start);
2181 0 : return;
2182 : }
2183 :
2184 : /* For the work */
2185 0 : btrfs_get_block_group(bg);
2186 0 : atomic_inc(&eb->refs);
2187 0 : bg->last_eb = eb;
2188 0 : INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn);
2189 0 : queue_work(system_unbound_wq, &bg->zone_finish_work);
2190 : }
2191 :
2192 0 : void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
2193 : {
2194 0 : struct btrfs_fs_info *fs_info = bg->fs_info;
2195 :
2196 0 : spin_lock(&fs_info->relocation_bg_lock);
2197 0 : if (fs_info->data_reloc_bg == bg->start)
2198 0 : fs_info->data_reloc_bg = 0;
2199 0 : spin_unlock(&fs_info->relocation_bg_lock);
2200 0 : }
2201 :
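/* Free the cached zone reports (zone_cache) of all devices. */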
2202 0 : void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
2203 : {
2204 0 : struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2205 0 : struct btrfs_device *device;
2206 :
2207 0 : if (!btrfs_is_zoned(fs_info))
2208 : return;
2209 :
2210 0 : mutex_lock(&fs_devices->device_list_mutex);
2211 0 : list_for_each_entry(device, &fs_devices->devices, dev_list) {
2212 0 : if (device->zone_info) {
2213 0 : vfree(device->zone_info->zone_cache);
2214 0 : device->zone_info->zone_cache = NULL;
2215 : }
2216 : }
2217 0 : mutex_unlock(&fs_devices->device_list_mutex);
2218 : }
2219 :
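/*
 * Return true if the ratio of bytes used to total bytes across all devices
 * has reached bg_reclaim_threshold, i.e. block group reclaim should start.
 * A threshold of 0 disables reclaim.
 */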
2220 0 : bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
2221 : {
2222 0 : struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2223 0 : struct btrfs_device *device;
2224 0 : u64 used = 0;
2225 0 : u64 total = 0;
2226 0 : u64 factor;
2227 :
2228 0 : ASSERT(btrfs_is_zoned(fs_info));
2229 :
2230 0 : if (fs_info->bg_reclaim_threshold == 0)
2231 : return false;
2232 :
2233 0 : mutex_lock(&fs_devices->device_list_mutex);
2234 0 : list_for_each_entry(device, &fs_devices->devices, dev_list) {
2235 0 : if (!device->bdev)
2236 0 : continue;
2237 :
2238 0 : total += device->disk_total_bytes;
2239 0 : used += device->bytes_used;
2240 : }
2241 0 : mutex_unlock(&fs_devices->device_list_mutex);
2242 :
2243 0 : factor = div64_u64(used * 100, total);
2244 0 : return factor >= fs_info->bg_reclaim_threshold;
2245 : }
2246 :
2247 0 : void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
2248 : u64 length)
2249 : {
2250 0 : struct btrfs_block_group *block_group;
2251 :
2252 0 : if (!btrfs_is_zoned(fs_info))
2253 : return;
2254 :
2255 0 : block_group = btrfs_lookup_block_group(fs_info, logical);
2256 : /* This should only be called on a block group previously used for data relocation. */
2257 0 : ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA));
2258 :
2259 0 : spin_lock(&block_group->lock);
2260 0 : if (!test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))
2261 0 : goto out;
2262 :
2263 : /* All relocation extents are written. */
2264 0 : if (block_group->start + block_group->alloc_offset == logical + length) {
2265 : /* Now, release this block group for further allocations. */
2266 0 : clear_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
2267 : &block_group->runtime_flags);
2268 : }
2269 :
2270 0 : out:
2271 0 : spin_unlock(&block_group->lock);
2272 0 : btrfs_put_block_group(block_group);
2273 : }
2274 :
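/*
 * Finish the active block group with the least remaining free space, skipping
 * SYSTEM block groups and those with reserved or no allocated space.  Returns
 * 1 if a block group was finished, 0 if none was eligible, or a negative
 * error code.
 */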
2275 0 : int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
2276 : {
2277 0 : struct btrfs_block_group *block_group;
2278 0 : struct btrfs_block_group *min_bg = NULL;
2279 0 : u64 min_avail = U64_MAX;
2280 0 : int ret;
2281 :
2282 0 : spin_lock(&fs_info->zone_active_bgs_lock);
2283 0 : list_for_each_entry(block_group, &fs_info->zone_active_bgs,
2284 : active_bg_list) {
2285 0 : u64 avail;
2286 :
2287 0 : spin_lock(&block_group->lock);
2288 0 : if (block_group->reserved || block_group->alloc_offset == 0 ||
2289 0 : (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)) {
2290 0 : spin_unlock(&block_group->lock);
2291 0 : continue;
2292 : }
2293 :
2294 0 : avail = block_group->zone_capacity - block_group->alloc_offset;
2295 0 : if (min_avail > avail) {
2296 0 : if (min_bg)
2297 0 : btrfs_put_block_group(min_bg);
2298 0 : min_bg = block_group;
2299 0 : min_avail = avail;
2300 0 : btrfs_get_block_group(min_bg);
2301 : }
2302 0 : spin_unlock(&block_group->lock);
2303 : }
2304 0 : spin_unlock(&fs_info->zone_active_bgs_lock);
2305 :
2306 0 : if (!min_bg)
2307 : return 0;
2308 :
2309 0 : ret = btrfs_zone_finish(min_bg);
2310 0 : btrfs_put_block_group(min_bg);
2311 :
2312 0 : return ret < 0 ? ret : 1;
2313 : }
2314 :
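/*
 * Try to activate one existing metadata or system block group in @space_info.
 * If none can be activated and @do_finish is set, finish an active block
 * group to free up an active zone and retry.  Returns 1 if a block group was
 * activated, 0 if nothing could be done, or a negative error code.
 */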
2315 0 : int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
2316 : struct btrfs_space_info *space_info,
2317 : bool do_finish)
2318 : {
2319 0 : struct btrfs_block_group *bg;
2320 0 : int index;
2321 :
2322 0 : if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA))
2323 : return 0;
2324 :
2325 0 : for (;;) {
2326 0 : int ret;
2327 0 : bool need_finish = false;
2328 :
2329 0 : down_read(&space_info->groups_sem);
2330 0 : for (index = 0; index < BTRFS_NR_RAID_TYPES; index++) {
2331 0 : list_for_each_entry(bg, &space_info->block_groups[index],
2332 : list) {
2333 0 : if (!spin_trylock(&bg->lock))
2334 0 : continue;
2335 0 : if (btrfs_zoned_bg_is_full(bg) ||
2336 0 : test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
2337 : &bg->runtime_flags)) {
2338 0 : spin_unlock(&bg->lock);
2339 0 : continue;
2340 : }
2341 0 : spin_unlock(&bg->lock);
2342 :
2343 0 : if (btrfs_zone_activate(bg)) {
2344 0 : up_read(&space_info->groups_sem);
2345 0 : return 1;
2346 : }
2347 :
2348 : need_finish = true;
2349 : }
2350 : }
2351 0 : up_read(&space_info->groups_sem);
2352 :
2353 0 : if (!do_finish || !need_finish)
2354 : break;
2355 :
2356 0 : ret = btrfs_zone_finish_one_bg(fs_info);
2357 0 : if (ret == 0)
2358 : break;
2359 0 : if (ret < 0)
2360 0 : return ret;
2361 : }
2362 :
2363 : return 0;
2364 : }