Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 :
3 : #include <linux/bitops.h>
4 : #include <linux/slab.h>
5 : #include <linux/blkdev.h>
6 : #include <linux/sched/mm.h>
7 : #include <linux/atomic.h>
8 : #include <linux/vmalloc.h>
9 : #include "ctree.h"
10 : #include "volumes.h"
11 : #include "zoned.h"
12 : #include "rcu-string.h"
13 : #include "disk-io.h"
14 : #include "block-group.h"
15 : #include "transaction.h"
16 : #include "dev-replace.h"
17 : #include "space-info.h"
18 : #include "super.h"
19 : #include "fs.h"
20 : #include "accessors.h"
21 : #include "bio.h"
22 :
23 : /* Maximum number of zones to report per blkdev_report_zones() call */
24 : #define BTRFS_REPORT_NR_ZONES 4096
25 : /* Invalid allocation pointer value for missing devices */
26 : #define WP_MISSING_DEV ((u64)-1)
27 : /* Pseudo write pointer value for conventional zone */
28 : #define WP_CONVENTIONAL ((u64)-2)
29 :
30 : /*
31 : * Location of the first zone of superblock logging zone pairs.
32 : *
33 : * - primary superblock: 0B (zone 0)
34 : * - first copy: 512G (zone starting at that offset)
35 : * - second copy: 4T (zone starting at that offset)
36 : */
37 : #define BTRFS_SB_LOG_PRIMARY_OFFSET (0ULL)
38 : #define BTRFS_SB_LOG_FIRST_OFFSET (512ULL * SZ_1G)
39 : #define BTRFS_SB_LOG_SECOND_OFFSET (4096ULL * SZ_1G)
40 :
41 : #define BTRFS_SB_LOG_FIRST_SHIFT const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET)
42 : #define BTRFS_SB_LOG_SECOND_SHIFT const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET)
43 :
44 : /* Number of superblock log zones */
45 : #define BTRFS_NR_SB_LOG_ZONES 2
46 :
47 : /*
48 : * Minimum number of active zones we need:
49 : *
50 : * - BTRFS_SUPER_MIRROR_MAX zones for superblock mirrors
51 : * - 3 zones to ensure at least one zone per SYSTEM, META and DATA block group
52 : * - 1 zone for tree-log dedicated block group
53 : * - 1 zone for relocation
54 : */
55 : #define BTRFS_MIN_ACTIVE_ZONES (BTRFS_SUPER_MIRROR_MAX + 5)
56 :
57 : /*
58 : * Minimum / maximum supported zone size. Currently, SMR disks have a zone
59 : * size of 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range.
60 : * We do not expect the zone size to become larger than 8GiB or smaller than
61 : * 4MiB in the near future.
62 : */
63 : #define BTRFS_MAX_ZONE_SIZE SZ_8G
64 : #define BTRFS_MIN_ZONE_SIZE SZ_4M
65 :
66 : #define SUPER_INFO_SECTORS ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT)
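/*
 * Note (illustrative, assuming the usual 4KiB BTRFS_SUPER_INFO_SIZE and
 * 512-byte sectors): SUPER_INFO_SECTORS evaluates to 8, i.e. one superblock
 * copy occupies 8 sectors of a log zone.
 */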
67 :
68 : static inline bool sb_zone_is_full(const struct blk_zone *zone)
69 : {
70 0 : return (zone->cond == BLK_ZONE_COND_FULL) ||
71 0 : (zone->wp + SUPER_INFO_SECTORS > zone->start + zone->capacity);
72 : }
73 :
74 0 : static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
75 : {
76 0 : struct blk_zone *zones = data;
77 :
78 0 : memcpy(&zones[idx], zone, sizeof(*zone));
79 :
80 0 : return 0;
81 : }
82 :
83 0 : static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
84 : u64 *wp_ret)
85 : {
86 0 : bool empty[BTRFS_NR_SB_LOG_ZONES];
87 0 : bool full[BTRFS_NR_SB_LOG_ZONES];
88 0 : sector_t sector;
89 0 : int i;
90 :
91 0 : for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
92 0 : ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL);
93 0 : empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY);
94 0 : full[i] = sb_zone_is_full(&zones[i]);
95 : }
96 :
97 : /*
98 : * Possible states of log buffer zones
99 : *
100 : * Empty[0] In use[0] Full[0]
101 : * Empty[1] * 0 1
102 : * In use[1] x x 1
103 : * Full[1] 0 0 C
104 : *
105 : * Log position:
106 : * *: Special case, no superblock is written
107 : * 0: Use write pointer of zones[0]
108 : * 1: Use write pointer of zones[1]
109 : * C: Compare super blocks from zones[0] and zones[1], use the latest
110 : * one determined by generation
111 : * x: Invalid state
112 : */
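	/*
	 * Example reading of the table above (illustrative): if zones[0] is
	 * FULL and zones[1] is partially written ("In use"), the next
	 * superblock goes at zones[1].wp (case 1); if both zones are FULL,
	 * the generation fields of the superblocks at the end of each zone
	 * decide which copy is the latest (case C).
	 */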
113 :
114 0 : if (empty[0] && empty[1]) {
115 : /* Special case to distinguish no superblock to read */
116 0 : *wp_ret = zones[0].start << SECTOR_SHIFT;
117 0 : return -ENOENT;
118 0 : } else if (full[0] && full[1]) {
119 : /* Compare two super blocks */
120 0 : struct address_space *mapping = bdev->bd_inode->i_mapping;
121 0 : struct page *page[BTRFS_NR_SB_LOG_ZONES];
122 0 : struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];
123 0 : int i;
124 :
125 0 : for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
126 0 : u64 zone_end = (zones[i].start + zones[i].capacity) << SECTOR_SHIFT;
127 0 : u64 bytenr = ALIGN_DOWN(zone_end, BTRFS_SUPER_INFO_SIZE) -
128 : BTRFS_SUPER_INFO_SIZE;
129 :
130 0 : page[i] = read_cache_page_gfp(mapping,
131 0 : bytenr >> PAGE_SHIFT, GFP_NOFS);
132 0 : if (IS_ERR(page[i])) {
133 0 : if (i == 1)
134 0 : btrfs_release_disk_super(super[0]);
135 0 : return PTR_ERR(page[i]);
136 : }
137 0 : super[i] = page_address(page[i]);
138 : }
139 :
140 0 : if (btrfs_super_generation(super[0]) >
141 0 : btrfs_super_generation(super[1]))
142 0 : sector = zones[1].start;
143 : else
144 0 : sector = zones[0].start;
145 :
146 0 : for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
147 0 : btrfs_release_disk_super(super[i]);
148 0 : } else if (!full[0] && (empty[1] || full[1])) {
149 0 : sector = zones[0].wp;
150 0 : } else if (full[0]) {
151 0 : sector = zones[1].wp;
152 : } else {
153 : return -EUCLEAN;
154 : }
155 0 : *wp_ret = sector << SECTOR_SHIFT;
156 0 : return 0;
157 : }
158 :
159 : /*
160 : * Get the first zone number of the superblock mirror
161 : */
162 0 : static inline u32 sb_zone_number(int shift, int mirror)
163 : {
164 0 : u64 zone = U64_MAX;
165 :
166 0 : ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
167 0 : switch (mirror) {
168 0 : case 0: zone = 0; break;
169 0 : case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break;
170 0 : case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break;
171 : }
172 :
173 0 : ASSERT(zone <= U32_MAX);
174 :
175 0 : return (u32)zone;
176 : }
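/*
 * Worked example (illustrative numbers): with a 256MiB zone size the shift is
 * 28, BTRFS_SB_LOG_FIRST_SHIFT is const_ilog2(512G) = 39 and
 * BTRFS_SB_LOG_SECOND_SHIFT is const_ilog2(4T) = 42, so mirror 1 starts at
 * zone 1 << (39 - 28) = 2048 (512GiB) and mirror 2 at zone
 * 1 << (42 - 28) = 16384 (4TiB).
 */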
177 :
178 0 : static inline sector_t zone_start_sector(u32 zone_number,
179 : struct block_device *bdev)
180 : {
181 0 : return (sector_t)zone_number << ilog2(bdev_zone_sectors(bdev));
182 : }
183 :
184 : static inline u64 zone_start_physical(u32 zone_number,
185 : struct btrfs_zoned_device_info *zone_info)
186 : {
187 0 : return (u64)zone_number << zone_info->zone_size_shift;
188 : }
189 :
190 : /*
191 : * Emulate blkdev_report_zones() for a non-zoned device. It slices up the block
192 : * device into fixed-size chunks and fakes a conventional zone on each of
193 : * them.
194 : */
195 0 : static int emulate_report_zones(struct btrfs_device *device, u64 pos,
196 : struct blk_zone *zones, unsigned int nr_zones)
197 : {
198 0 : const sector_t zone_sectors = device->fs_info->zone_size >> SECTOR_SHIFT;
199 0 : sector_t bdev_size = bdev_nr_sectors(device->bdev);
200 0 : unsigned int i;
201 :
202 0 : pos >>= SECTOR_SHIFT;
203 0 : for (i = 0; i < nr_zones; i++) {
204 0 : zones[i].start = i * zone_sectors + pos;
205 0 : zones[i].len = zone_sectors;
206 0 : zones[i].capacity = zone_sectors;
207 0 : zones[i].wp = zones[i].start + zone_sectors;
208 0 : zones[i].type = BLK_ZONE_TYPE_CONVENTIONAL;
209 0 : zones[i].cond = BLK_ZONE_COND_NOT_WP;
210 :
211 0 : if (zones[i].wp >= bdev_size) {
212 0 : i++;
213 0 : break;
214 : }
215 : }
216 :
217 0 : return i;
218 : }
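/*
 * Illustrative example: with a 256MiB emulated zone size, pos == 0 and
 * nr_zones >= 4 on a 1GiB non-zoned device, the loop above fills four
 * conventional zones starting at sectors 0, 524288, 1048576 and 1572864,
 * each with len == capacity == 524288 sectors and the write pointer parked
 * at the zone end, and returns 4.
 */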
219 :
220 0 : static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
221 : struct blk_zone *zones, unsigned int *nr_zones)
222 : {
223 0 : struct btrfs_zoned_device_info *zinfo = device->zone_info;
224 0 : int ret;
225 :
226 0 : if (!*nr_zones)
227 : return 0;
228 :
229 0 : if (!bdev_is_zoned(device->bdev)) {
230 0 : ret = emulate_report_zones(device, pos, zones, *nr_zones);
231 0 : *nr_zones = ret;
232 0 : return 0;
233 : }
234 :
235 : /* Check cache */
236 0 : if (zinfo->zone_cache) {
237 0 : unsigned int i;
238 0 : u32 zno;
239 :
240 0 : ASSERT(IS_ALIGNED(pos, zinfo->zone_size));
241 0 : zno = pos >> zinfo->zone_size_shift;
242 : /*
243 : * We cannot report zones beyond the end of the device. So, it is OK
244 : * to cap *nr_zones to the number of zones remaining.
245 : */
246 0 : *nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno);
247 :
248 0 : for (i = 0; i < *nr_zones; i++) {
249 0 : struct blk_zone *zone_info;
250 :
251 0 : zone_info = &zinfo->zone_cache[zno + i];
252 0 : if (!zone_info->len)
253 : break;
254 : }
255 :
256 0 : if (i == *nr_zones) {
257 : /* Cache hit on all the zones */
258 0 : memcpy(zones, zinfo->zone_cache + zno,
259 : sizeof(*zinfo->zone_cache) * *nr_zones);
260 0 : return 0;
261 : }
262 : }
263 :
264 0 : ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
265 : copy_zone_info_cb, zones);
266 0 : if (ret < 0) {
267 0 : btrfs_err_in_rcu(device->fs_info,
268 : "zoned: failed to read zone %llu on %s (devid %llu)",
269 : pos, rcu_str_deref(device->name),
270 : device->devid);
271 0 : return ret;
272 : }
273 0 : *nr_zones = ret;
274 0 : if (!ret)
275 : return -EIO;
276 :
277 : /* Populate cache */
278 0 : if (zinfo->zone_cache) {
279 0 : u32 zno = pos >> zinfo->zone_size_shift;
280 :
281 0 : memcpy(zinfo->zone_cache + zno, zones,
282 : sizeof(*zinfo->zone_cache) * *nr_zones);
283 : }
284 :
285 : return 0;
286 : }
287 :
288 : /* The emulated zone size is determined from the size of the first device extent */
289 0 : static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
290 : {
291 0 : struct btrfs_path *path;
292 0 : struct btrfs_root *root = fs_info->dev_root;
293 0 : struct btrfs_key key;
294 0 : struct extent_buffer *leaf;
295 0 : struct btrfs_dev_extent *dext;
296 0 : int ret = 0;
297 :
298 0 : key.objectid = 1;
299 0 : key.type = BTRFS_DEV_EXTENT_KEY;
300 0 : key.offset = 0;
301 :
302 0 : path = btrfs_alloc_path();
303 0 : if (!path)
304 : return -ENOMEM;
305 :
306 0 : ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
307 0 : if (ret < 0)
308 0 : goto out;
309 :
310 0 : if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
311 0 : ret = btrfs_next_leaf(root, path);
312 0 : if (ret < 0)
313 0 : goto out;
314 : /* No dev extents at all? Not good */
315 0 : if (ret > 0) {
316 0 : ret = -EUCLEAN;
317 0 : goto out;
318 : }
319 : }
320 :
321 0 : leaf = path->nodes[0];
322 0 : dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
323 0 : fs_info->zone_size = btrfs_dev_extent_length(leaf, dext);
324 0 : ret = 0;
325 :
326 0 : out:
327 0 : btrfs_free_path(path);
328 :
329 0 : return ret;
330 : }
331 :
332 0 : int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
333 : {
334 0 : struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
335 0 : struct btrfs_device *device;
336 0 : int ret = 0;
337 :
338 : /* fs_info->zone_size might not be set yet. Use the incompat flag here. */
339 0 : if (!btrfs_fs_incompat(fs_info, ZONED))
340 : return 0;
341 :
342 0 : mutex_lock(&fs_devices->device_list_mutex);
343 0 : list_for_each_entry(device, &fs_devices->devices, dev_list) {
344 : /* We can skip reading of zone info for missing devices */
345 0 : if (!device->bdev)
346 0 : continue;
347 :
348 0 : ret = btrfs_get_dev_zone_info(device, true);
349 0 : if (ret)
350 : break;
351 : }
352 0 : mutex_unlock(&fs_devices->device_list_mutex);
353 :
354 0 : return ret;
355 : }
356 :
357 0 : int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
358 : {
359 0 : struct btrfs_fs_info *fs_info = device->fs_info;
360 0 : struct btrfs_zoned_device_info *zone_info = NULL;
361 0 : struct block_device *bdev = device->bdev;
362 0 : unsigned int max_active_zones;
363 0 : unsigned int nactive;
364 0 : sector_t nr_sectors;
365 0 : sector_t sector = 0;
366 0 : struct blk_zone *zones = NULL;
367 0 : unsigned int i, nreported = 0, nr_zones;
368 0 : sector_t zone_sectors;
369 0 : char *model, *emulated;
370 0 : int ret;
371 :
372 : /*
373 : * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not
374 : * yet be set.
375 : */
376 0 : if (!btrfs_fs_incompat(fs_info, ZONED))
377 : return 0;
378 :
379 0 : if (device->zone_info)
380 : return 0;
381 :
382 0 : zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL);
383 0 : if (!zone_info)
384 : return -ENOMEM;
385 :
386 0 : device->zone_info = zone_info;
387 :
388 0 : if (!bdev_is_zoned(bdev)) {
389 0 : if (!fs_info->zone_size) {
390 0 : ret = calculate_emulated_zone_size(fs_info);
391 0 : if (ret)
392 0 : goto out;
393 : }
394 :
395 0 : ASSERT(fs_info->zone_size);
396 0 : zone_sectors = fs_info->zone_size >> SECTOR_SHIFT;
397 : } else {
398 0 : zone_sectors = bdev_zone_sectors(bdev);
399 : }
400 :
401 0 : ASSERT(is_power_of_two_u64(zone_sectors));
402 0 : zone_info->zone_size = zone_sectors << SECTOR_SHIFT;
403 :
404 : /* We reject devices with a zone size larger than 8GiB */
405 0 : if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) {
406 0 : btrfs_err_in_rcu(fs_info,
407 : "zoned: %s: zone size %llu larger than supported maximum %llu",
408 : rcu_str_deref(device->name),
409 : zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
410 0 : ret = -EINVAL;
411 0 : goto out;
412 0 : } else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) {
413 0 : btrfs_err_in_rcu(fs_info,
414 : "zoned: %s: zone size %llu smaller than supported minimum %u",
415 : rcu_str_deref(device->name),
416 : zone_info->zone_size, BTRFS_MIN_ZONE_SIZE);
417 0 : ret = -EINVAL;
418 0 : goto out;
419 : }
420 :
421 0 : nr_sectors = bdev_nr_sectors(bdev);
422 0 : zone_info->zone_size_shift = ilog2(zone_info->zone_size);
423 0 : zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
424 0 : if (!IS_ALIGNED(nr_sectors, zone_sectors))
425 0 : zone_info->nr_zones++;
426 :
427 0 : max_active_zones = bdev_max_active_zones(bdev);
428 0 : if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
429 0 : btrfs_err_in_rcu(fs_info,
430 : "zoned: %s: max active zones %u is too small, need at least %u active zones",
431 : rcu_str_deref(device->name), max_active_zones,
432 : BTRFS_MIN_ACTIVE_ZONES);
433 0 : ret = -EINVAL;
434 0 : goto out;
435 : }
436 0 : zone_info->max_active_zones = max_active_zones;
437 :
438 0 : zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
439 0 : if (!zone_info->seq_zones) {
440 0 : ret = -ENOMEM;
441 0 : goto out;
442 : }
443 :
444 0 : zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
445 0 : if (!zone_info->empty_zones) {
446 0 : ret = -ENOMEM;
447 0 : goto out;
448 : }
449 :
450 0 : zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
451 0 : if (!zone_info->active_zones) {
452 0 : ret = -ENOMEM;
453 0 : goto out;
454 : }
455 :
456 0 : zones = kvcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
457 0 : if (!zones) {
458 0 : ret = -ENOMEM;
459 0 : goto out;
460 : }
461 :
462 : /*
463 : * Enable zone cache only for a zoned device. On a non-zoned device, we
464 : * fill the zone info with emulated CONVENTIONAL zones, so no need to
465 : * use the cache.
466 : */
467 0 : if (populate_cache && bdev_is_zoned(device->bdev)) {
468 0 : zone_info->zone_cache = vzalloc(sizeof(struct blk_zone) *
469 0 : zone_info->nr_zones);
470 0 : if (!zone_info->zone_cache) {
471 0 : btrfs_err_in_rcu(device->fs_info,
472 : "zoned: failed to allocate zone cache for %s",
473 : rcu_str_deref(device->name));
474 0 : ret = -ENOMEM;
475 0 : goto out;
476 : }
477 : }
478 :
479 : /* Get the zone types */
480 : nactive = 0;
481 0 : while (sector < nr_sectors) {
482 0 : nr_zones = BTRFS_REPORT_NR_ZONES;
483 0 : ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,
484 : &nr_zones);
485 0 : if (ret)
486 0 : goto out;
487 :
488 0 : for (i = 0; i < nr_zones; i++) {
489 0 : if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
490 0 : __set_bit(nreported, zone_info->seq_zones);
491 0 : switch (zones[i].cond) {
492 0 : case BLK_ZONE_COND_EMPTY:
493 0 : __set_bit(nreported, zone_info->empty_zones);
494 : break;
495 0 : case BLK_ZONE_COND_IMP_OPEN:
496 : case BLK_ZONE_COND_EXP_OPEN:
497 : case BLK_ZONE_COND_CLOSED:
498 0 : __set_bit(nreported, zone_info->active_zones);
499 0 : nactive++;
500 0 : break;
501 : }
502 0 : nreported++;
503 : }
504 0 : sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
505 : }
506 :
507 0 : if (nreported != zone_info->nr_zones) {
508 0 : btrfs_err_in_rcu(device->fs_info,
509 : "inconsistent number of zones on %s (%u/%u)",
510 : rcu_str_deref(device->name), nreported,
511 : zone_info->nr_zones);
512 0 : ret = -EIO;
513 0 : goto out;
514 : }
515 :
516 0 : if (max_active_zones) {
517 0 : if (nactive > max_active_zones) {
518 0 : btrfs_err_in_rcu(device->fs_info,
519 : "zoned: %u active zones on %s exceeds max_active_zones %u",
520 : nactive, rcu_str_deref(device->name),
521 : max_active_zones);
522 0 : ret = -EIO;
523 0 : goto out;
524 : }
525 0 : atomic_set(&zone_info->active_zones_left,
526 0 : max_active_zones - nactive);
527 0 : set_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags);
528 : }
529 :
530 : /* Validate superblock log */
531 0 : nr_zones = BTRFS_NR_SB_LOG_ZONES;
532 0 : for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
533 0 : u32 sb_zone;
534 0 : u64 sb_wp;
535 0 : int sb_pos = BTRFS_NR_SB_LOG_ZONES * i;
536 :
537 0 : sb_zone = sb_zone_number(zone_info->zone_size_shift, i);
538 0 : if (sb_zone + 1 >= zone_info->nr_zones)
539 0 : continue;
540 :
541 0 : ret = btrfs_get_dev_zones(device,
542 : zone_start_physical(sb_zone, zone_info),
543 : &zone_info->sb_zones[sb_pos],
544 : &nr_zones);
545 0 : if (ret)
546 0 : goto out;
547 :
548 0 : if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
549 0 : btrfs_err_in_rcu(device->fs_info,
550 : "zoned: failed to read super block log zone info at devid %llu zone %u",
551 : device->devid, sb_zone);
552 0 : ret = -EUCLEAN;
553 0 : goto out;
554 : }
555 :
556 : /*
557 : * If zones[0] is conventional, always use the beginning of the
558 : * zone to record the superblock. No need to validate in that case.
559 : */
560 0 : if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type ==
561 : BLK_ZONE_TYPE_CONVENTIONAL)
562 0 : continue;
563 :
564 0 : ret = sb_write_pointer(device->bdev,
565 : &zone_info->sb_zones[sb_pos], &sb_wp);
566 0 : if (ret != -ENOENT && ret) {
567 0 : btrfs_err_in_rcu(device->fs_info,
568 : "zoned: super block log zone corrupted devid %llu zone %u",
569 : device->devid, sb_zone);
570 0 : ret = -EUCLEAN;
571 0 : goto out;
572 : }
573 : }
574 :
575 :
576 0 : kvfree(zones);
577 :
578 0 : switch (bdev_zoned_model(bdev)) {
579 : case BLK_ZONED_HM:
580 : model = "host-managed zoned";
581 : emulated = "";
582 : break;
583 : case BLK_ZONED_HA:
584 : model = "host-aware zoned";
585 : emulated = "";
586 : break;
587 : case BLK_ZONED_NONE:
588 : model = "regular";
589 : emulated = "emulated ";
590 : break;
591 : default:
592 : /* Just in case */
593 0 : btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s",
594 : bdev_zoned_model(bdev),
595 : rcu_str_deref(device->name));
596 0 : ret = -EOPNOTSUPP;
597 0 : goto out_free_zone_info;
598 : }
599 :
600 0 : btrfs_info_in_rcu(fs_info,
601 : "%s block device %s, %u %szones of %llu bytes",
602 : model, rcu_str_deref(device->name), zone_info->nr_zones,
603 : emulated, zone_info->zone_size);
604 :
605 0 : return 0;
606 :
607 0 : out:
608 0 : kvfree(zones);
609 0 : out_free_zone_info:
610 0 : btrfs_destroy_dev_zone_info(device);
611 :
612 0 : return ret;
613 : }
614 :
615 0 : void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
616 : {
617 0 : struct btrfs_zoned_device_info *zone_info = device->zone_info;
618 :
619 0 : if (!zone_info)
620 : return;
621 :
622 0 : bitmap_free(zone_info->active_zones);
623 0 : bitmap_free(zone_info->seq_zones);
624 0 : bitmap_free(zone_info->empty_zones);
625 0 : vfree(zone_info->zone_cache);
626 0 : kfree(zone_info);
627 0 : device->zone_info = NULL;
628 : }
629 :
630 0 : struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev)
631 : {
632 0 : struct btrfs_zoned_device_info *zone_info;
633 :
634 0 : zone_info = kmemdup(orig_dev->zone_info, sizeof(*zone_info), GFP_KERNEL);
635 0 : if (!zone_info)
636 : return NULL;
637 :
638 0 : zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
639 0 : if (!zone_info->seq_zones)
640 0 : goto out;
641 :
642 0 : bitmap_copy(zone_info->seq_zones, orig_dev->zone_info->seq_zones,
643 : zone_info->nr_zones);
644 :
645 0 : zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
646 0 : if (!zone_info->empty_zones)
647 0 : goto out;
648 :
649 0 : bitmap_copy(zone_info->empty_zones, orig_dev->zone_info->empty_zones,
650 : zone_info->nr_zones);
651 :
652 0 : zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
653 0 : if (!zone_info->active_zones)
654 0 : goto out;
655 :
656 0 : bitmap_copy(zone_info->active_zones, orig_dev->zone_info->active_zones,
657 : zone_info->nr_zones);
658 0 : zone_info->zone_cache = NULL;
659 :
660 0 : return zone_info;
661 :
662 0 : out:
663 0 : bitmap_free(zone_info->seq_zones);
664 0 : bitmap_free(zone_info->empty_zones);
665 0 : bitmap_free(zone_info->active_zones);
666 0 : kfree(zone_info);
667 0 : return NULL;
668 : }
669 :
670 0 : int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
671 : struct blk_zone *zone)
672 : {
673 0 : unsigned int nr_zones = 1;
674 0 : int ret;
675 :
676 0 : ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones);
677 0 : if (ret != 0 || !nr_zones)
678 0 : return ret ? ret : -EIO;
679 :
680 : return 0;
681 : }
682 :
683 0 : static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info)
684 : {
685 0 : struct btrfs_device *device;
686 :
687 0 : list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
688 0 : if (device->bdev &&
689 : bdev_zoned_model(device->bdev) == BLK_ZONED_HM) {
690 0 : btrfs_err(fs_info,
691 : "zoned: mode not enabled but zoned device found: %pg",
692 : device->bdev);
693 0 : return -EINVAL;
694 : }
695 : }
696 :
697 : return 0;
698 : }
699 :
700 0 : int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
701 : {
702 0 : struct queue_limits *lim = &fs_info->limits;
703 0 : struct btrfs_device *device;
704 0 : u64 zone_size = 0;
705 0 : int ret;
706 :
707 : /*
708 : * Host-Managed devices can't be used without the ZONED flag. With the
709 : * ZONED flag, all devices can be used, using zone emulation if required.
710 : */
711 0 : if (!btrfs_fs_incompat(fs_info, ZONED))
712 0 : return btrfs_check_for_zoned_device(fs_info);
713 :
714 0 : blk_set_stacking_limits(lim);
715 :
716 0 : list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
717 0 : struct btrfs_zoned_device_info *zone_info = device->zone_info;
718 :
719 0 : if (!device->bdev)
720 0 : continue;
721 :
722 0 : if (!zone_size) {
723 0 : zone_size = zone_info->zone_size;
724 0 : } else if (zone_info->zone_size != zone_size) {
725 0 : btrfs_err(fs_info,
726 : "zoned: unequal block device zone sizes: have %llu found %llu",
727 : zone_info->zone_size, zone_size);
728 0 : return -EINVAL;
729 : }
730 :
731 : /*
732 : * With zoned emulation, we can have a non-zoned device in zoned
733 : * mode. In this case, we don't have a valid max zone
734 : * append size.
735 : */
736 0 : if (bdev_is_zoned(device->bdev)) {
737 0 : blk_stack_limits(lim,
738 : &bdev_get_queue(device->bdev)->limits,
739 : 0);
740 : }
741 : }
742 :
743 : /*
744 : * stripe_size is always aligned to BTRFS_STRIPE_LEN in
745 : * btrfs_create_chunk(). Since we want stripe_len == zone_size,
746 : * check the alignment here.
747 : */
748 0 : if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
749 0 : btrfs_err(fs_info,
750 : "zoned: zone size %llu not aligned to stripe %u",
751 : zone_size, BTRFS_STRIPE_LEN);
752 0 : return -EINVAL;
753 : }
754 :
755 0 : if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
756 0 : btrfs_err(fs_info, "zoned: mixed block groups not supported");
757 0 : return -EINVAL;
758 : }
759 :
760 0 : fs_info->zone_size = zone_size;
761 : /*
762 : * Also limit max_zone_append_size by max_segments * PAGE_SIZE.
763 : * Technically, we can have multiple pages per segment. But, since
764 : * we add the pages one by one to a bio, and cannot increase the
765 : * metadata reservation even if it increases the number of extents, it
766 : * is safe to stick with the limit.
767 : */
768 0 : fs_info->max_zone_append_size = ALIGN_DOWN(
769 : min3((u64)lim->max_zone_append_sectors << SECTOR_SHIFT,
770 : (u64)lim->max_sectors << SECTOR_SHIFT,
771 : (u64)lim->max_segments << PAGE_SHIFT),
772 : fs_info->sectorsize);
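	/*
	 * Illustrative numbers for the computation above: with
	 * max_zone_append_sectors = 1024 (512KiB), max_sectors = 2560
	 * (1280KiB) and max_segments = 128 (512KiB with 4KiB pages),
	 * max_zone_append_size becomes 512KiB, already aligned to a 4KiB
	 * sectorsize.
	 */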
773 0 : fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
774 0 : if (fs_info->max_zone_append_size < fs_info->max_extent_size)
775 0 : fs_info->max_extent_size = fs_info->max_zone_append_size;
776 :
777 : /*
778 : * Check mount options here, because we might change fs_info->zoned
779 : * from fs_info->zone_size.
780 : */
781 0 : ret = btrfs_check_mountopts_zoned(fs_info);
782 0 : if (ret)
783 : return ret;
784 :
785 0 : btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);
786 0 : return 0;
787 : }
788 :
789 0 : int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
790 : {
791 0 : if (!btrfs_is_zoned(info))
792 : return 0;
793 :
794 : /*
795 : * Space cache writing is not COWed. Disable that to avoid write errors
796 : * in sequential zones.
797 : */
798 0 : if (btrfs_test_opt(info, SPACE_CACHE)) {
799 0 : btrfs_err(info, "zoned: space cache v1 is not supported");
800 0 : return -EINVAL;
801 : }
802 :
803 0 : if (btrfs_test_opt(info, NODATACOW)) {
804 0 : btrfs_err(info, "zoned: NODATACOW not supported");
805 0 : return -EINVAL;
806 : }
807 :
808 0 : btrfs_clear_and_info(info, DISCARD_ASYNC,
809 : "zoned: async discard ignored and disabled for zoned mode");
810 :
811 0 : return 0;
812 : }
813 :
814 0 : static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
815 : int rw, u64 *bytenr_ret)
816 : {
817 0 : u64 wp;
818 0 : int ret;
819 :
820 0 : if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) {
821 0 : *bytenr_ret = zones[0].start << SECTOR_SHIFT;
822 0 : return 0;
823 : }
824 :
825 0 : ret = sb_write_pointer(bdev, zones, &wp);
826 0 : if (ret != -ENOENT && ret < 0)
827 : return ret;
828 :
829 0 : if (rw == WRITE) {
830 0 : struct blk_zone *reset = NULL;
831 :
832 0 : if (wp == zones[0].start << SECTOR_SHIFT)
833 : reset = &zones[0];
834 0 : else if (wp == zones[1].start << SECTOR_SHIFT)
835 0 : reset = &zones[1];
836 :
837 0 : if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
838 0 : ASSERT(sb_zone_is_full(reset));
839 :
840 0 : ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
841 : reset->start, reset->len,
842 : GFP_NOFS);
843 0 : if (ret)
844 : return ret;
845 :
846 0 : reset->cond = BLK_ZONE_COND_EMPTY;
847 0 : reset->wp = reset->start;
848 : }
849 0 : } else if (ret != -ENOENT) {
850 : /*
851 : * For READ, we want the previously written superblock. Move the write
852 : * pointer to the end of the other zone if it is at the start of a zone.
853 : */
854 0 : u64 zone_end = 0;
855 :
856 0 : if (wp == zones[0].start << SECTOR_SHIFT)
857 0 : zone_end = zones[1].start + zones[1].capacity;
858 0 : else if (wp == zones[1].start << SECTOR_SHIFT)
859 0 : zone_end = zones[0].start + zones[0].capacity;
860 0 : if (zone_end)
861 0 : wp = ALIGN_DOWN(zone_end << SECTOR_SHIFT,
862 : BTRFS_SUPER_INFO_SIZE);
863 :
864 0 : wp -= BTRFS_SUPER_INFO_SIZE;
865 : }
866 :
867 0 : *bytenr_ret = wp;
868 0 : return 0;
869 :
870 : }
871 :
872 4 : int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
873 : u64 *bytenr_ret)
874 : {
875 4 : struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES];
876 4 : sector_t zone_sectors;
877 4 : u32 sb_zone;
878 4 : int ret;
879 4 : u8 zone_sectors_shift;
880 4 : sector_t nr_sectors;
881 4 : u32 nr_zones;
882 :
883 4 : if (!bdev_is_zoned(bdev)) {
884 4 : *bytenr_ret = btrfs_sb_offset(mirror);
885 4 : return 0;
886 : }
887 :
888 0 : ASSERT(rw == READ || rw == WRITE);
889 :
890 0 : zone_sectors = bdev_zone_sectors(bdev);
891 0 : if (!is_power_of_2(zone_sectors))
892 : return -EINVAL;
893 0 : zone_sectors_shift = ilog2(zone_sectors);
894 0 : nr_sectors = bdev_nr_sectors(bdev);
895 0 : nr_zones = nr_sectors >> zone_sectors_shift;
896 :
897 0 : sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
898 0 : if (sb_zone + 1 >= nr_zones)
899 : return -ENOENT;
900 :
901 0 : ret = blkdev_report_zones(bdev, zone_start_sector(sb_zone, bdev),
902 : BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
903 : zones);
904 0 : if (ret < 0)
905 : return ret;
906 0 : if (ret != BTRFS_NR_SB_LOG_ZONES)
907 : return -EIO;
908 :
909 0 : return sb_log_location(bdev, zones, rw, bytenr_ret);
910 : }
911 :
912 0 : int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
913 : u64 *bytenr_ret)
914 : {
915 0 : struct btrfs_zoned_device_info *zinfo = device->zone_info;
916 0 : u32 zone_num;
917 :
918 : /*
919 : * For a zoned filesystem on a non-zoned block device, use the same
920 : * super block locations as a regular filesystem. This way, the super
921 : * block can always be retrieved and the zoned flag of the volume
922 : * detected from the super block information.
923 : */
924 0 : if (!bdev_is_zoned(device->bdev)) {
925 0 : *bytenr_ret = btrfs_sb_offset(mirror);
926 0 : return 0;
927 : }
928 :
929 0 : zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
930 0 : if (zone_num + 1 >= zinfo->nr_zones)
931 : return -ENOENT;
932 :
933 0 : return sb_log_location(device->bdev,
934 0 : &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror],
935 : rw, bytenr_ret);
936 : }
937 :
938 0 : static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo,
939 : int mirror)
940 : {
941 0 : u32 zone_num;
942 :
943 0 : if (!zinfo)
944 : return false;
945 :
946 0 : zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
947 0 : if (zone_num + 1 >= zinfo->nr_zones)
948 : return false;
949 :
950 0 : if (!test_bit(zone_num, zinfo->seq_zones))
951 0 : return false;
952 :
953 : return true;
954 : }
955 :
956 0 : int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
957 : {
958 0 : struct btrfs_zoned_device_info *zinfo = device->zone_info;
959 0 : struct blk_zone *zone;
960 0 : int i;
961 :
962 0 : if (!is_sb_log_zone(zinfo, mirror))
963 : return 0;
964 :
965 0 : zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];
966 0 : for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
967 : /* Advance to the next zone */
968 0 : if (zone->cond == BLK_ZONE_COND_FULL) {
969 0 : zone++;
970 0 : continue;
971 : }
972 :
973 0 : if (zone->cond == BLK_ZONE_COND_EMPTY)
974 0 : zone->cond = BLK_ZONE_COND_IMP_OPEN;
975 :
976 0 : zone->wp += SUPER_INFO_SECTORS;
977 :
978 0 : if (sb_zone_is_full(zone)) {
979 : /*
980 : * No room left to write a new superblock. Since the
981 : * superblock is written with REQ_SYNC, it is safe to
982 : * finish the zone now.
983 : *
984 : * If the write pointer is exactly at the capacity,
985 : * explicit ZONE_FINISH is not necessary.
986 : */
987 0 : if (zone->wp != zone->start + zone->capacity) {
988 0 : int ret;
989 :
990 0 : ret = blkdev_zone_mgmt(device->bdev,
991 : REQ_OP_ZONE_FINISH, zone->start,
992 : zone->len, GFP_NOFS);
993 0 : if (ret)
994 : return ret;
995 : }
996 :
997 0 : zone->wp = zone->start + zone->len;
998 0 : zone->cond = BLK_ZONE_COND_FULL;
999 : }
1000 : return 0;
1001 : }
1002 :
1003 : /* All the zones are FULL. Should not reach here. */
1004 : ASSERT(0);
1005 : return -EIO;
1006 : }
1007 :
1008 0 : int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
1009 : {
1010 0 : sector_t zone_sectors;
1011 0 : sector_t nr_sectors;
1012 0 : u8 zone_sectors_shift;
1013 0 : u32 sb_zone;
1014 0 : u32 nr_zones;
1015 :
1016 0 : zone_sectors = bdev_zone_sectors(bdev);
1017 0 : zone_sectors_shift = ilog2(zone_sectors);
1018 0 : nr_sectors = bdev_nr_sectors(bdev);
1019 0 : nr_zones = nr_sectors >> zone_sectors_shift;
1020 :
1021 0 : sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
1022 0 : if (sb_zone + 1 >= nr_zones)
1023 : return -ENOENT;
1024 :
1025 0 : return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
1026 : zone_start_sector(sb_zone, bdev),
1027 : zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
1028 : }
1029 :
1030 : /*
1031 : * Find allocatable zones within a given region.
1032 : *
1033 : * @device: the device to allocate a region on
1034 : * @hole_start: the start of the hole in which to allocate the region
1035 : * @num_bytes: size of the wanted region
1036 : * @hole_end: the end of the hole
1037 : * @return: the start position of an allocatable region
1038 : *
1039 : * The allocatable region must not contain any superblock locations.
1040 : */
1041 0 : u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
1042 : u64 hole_end, u64 num_bytes)
1043 : {
1044 0 : struct btrfs_zoned_device_info *zinfo = device->zone_info;
1045 0 : const u8 shift = zinfo->zone_size_shift;
1046 0 : u64 nzones = num_bytes >> shift;
1047 0 : u64 pos = hole_start;
1048 0 : u64 begin, end;
1049 0 : bool have_sb;
1050 0 : int i;
1051 :
1052 0 : ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
1053 0 : ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));
1054 :
1055 0 : while (pos < hole_end) {
1056 0 : begin = pos >> shift;
1057 0 : end = begin + nzones;
1058 :
1059 0 : if (end > zinfo->nr_zones)
1060 : return hole_end;
1061 :
1062 : /* Check if zones in the region are all empty */
1063 0 : if (btrfs_dev_is_sequential(device, pos) &&
1064 0 : !bitmap_test_range_all_set(zinfo->empty_zones, begin, nzones)) {
1065 0 : pos += zinfo->zone_size;
1066 0 : continue;
1067 : }
1068 :
1069 : have_sb = false;
1070 0 : for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
1071 0 : u32 sb_zone;
1072 0 : u64 sb_pos;
1073 :
1074 0 : sb_zone = sb_zone_number(shift, i);
1075 0 : if (!(end <= sb_zone ||
1076 0 : sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) {
1077 0 : have_sb = true;
1078 0 : pos = zone_start_physical(
1079 : sb_zone + BTRFS_NR_SB_LOG_ZONES, zinfo);
1080 0 : break;
1081 : }
1082 :
1083 : /* We also need to exclude regular superblock positions */
1084 0 : sb_pos = btrfs_sb_offset(i);
1085 0 : if (!(pos + num_bytes <= sb_pos ||
1086 0 : sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) {
1087 0 : have_sb = true;
1088 0 : pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE,
1089 : zinfo->zone_size);
1090 0 : break;
1091 : }
1092 : }
1093 0 : if (!have_sb)
1094 : break;
1095 : }
1096 :
1097 : return pos;
1098 : }
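/*
 * Illustrative example: with a 256MiB zone size, a candidate region whose
 * zones overlap superblock mirror 0 (log zones 0 and 1) is pushed past them,
 * i.e. pos becomes zone_start_physical(2, zinfo) = 512MiB, and the search
 * continues from there.
 */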
1099 :
1100 0 : static bool btrfs_dev_set_active_zone(struct btrfs_device *device, u64 pos)
1101 : {
1102 0 : struct btrfs_zoned_device_info *zone_info = device->zone_info;
1103 0 : unsigned int zno = (pos >> zone_info->zone_size_shift);
1104 :
1105 : /* We can use any number of zones */
1106 0 : if (zone_info->max_active_zones == 0)
1107 : return true;
1108 :
1109 0 : if (!test_bit(zno, zone_info->active_zones)) {
1110 : /* Active zone left? */
1111 0 : if (atomic_dec_if_positive(&zone_info->active_zones_left) < 0)
1112 : return false;
1113 0 : if (test_and_set_bit(zno, zone_info->active_zones)) {
1114 : /* Someone already set the bit */
1115 0 : atomic_inc(&zone_info->active_zones_left);
1116 : }
1117 : }
1118 :
1119 : return true;
1120 : }
1121 :
1122 0 : static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos)
1123 : {
1124 0 : struct btrfs_zoned_device_info *zone_info = device->zone_info;
1125 0 : unsigned int zno = (pos >> zone_info->zone_size_shift);
1126 :
1127 : /* We can use any number of zones */
1128 0 : if (zone_info->max_active_zones == 0)
1129 : return;
1130 :
1131 0 : if (test_and_clear_bit(zno, zone_info->active_zones))
1132 0 : atomic_inc(&zone_info->active_zones_left);
1133 : }
1134 :
1135 0 : int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
1136 : u64 length, u64 *bytes)
1137 : {
1138 0 : int ret;
1139 :
1140 0 : *bytes = 0;
1141 0 : ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
1142 : physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT,
1143 : GFP_NOFS);
1144 0 : if (ret)
1145 : return ret;
1146 :
1147 0 : *bytes = length;
1148 0 : while (length) {
1149 0 : btrfs_dev_set_zone_empty(device, physical);
1150 0 : btrfs_dev_clear_active_zone(device, physical);
1151 0 : physical += device->zone_info->zone_size;
1152 0 : length -= device->zone_info->zone_size;
1153 : }
1154 :
1155 : return 0;
1156 : }
1157 :
1158 0 : int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
1159 : {
1160 0 : struct btrfs_zoned_device_info *zinfo = device->zone_info;
1161 0 : const u8 shift = zinfo->zone_size_shift;
1162 0 : unsigned long begin = start >> shift;
1163 0 : unsigned long nbits = size >> shift;
1164 0 : u64 pos;
1165 0 : int ret;
1166 :
1167 0 : ASSERT(IS_ALIGNED(start, zinfo->zone_size));
1168 0 : ASSERT(IS_ALIGNED(size, zinfo->zone_size));
1169 :
1170 0 : if (begin + nbits > zinfo->nr_zones)
1171 : return -ERANGE;
1172 :
1173 : /* All the zones are conventional */
1174 0 : if (bitmap_test_range_all_zero(zinfo->seq_zones, begin, nbits))
1175 : return 0;
1176 :
1177 : /* All the zones are sequential and empty */
1178 0 : if (bitmap_test_range_all_set(zinfo->seq_zones, begin, nbits) &&
1179 0 : bitmap_test_range_all_set(zinfo->empty_zones, begin, nbits))
1180 : return 0;
1181 :
1182 0 : for (pos = start; pos < start + size; pos += zinfo->zone_size) {
1183 0 : u64 reset_bytes;
1184 :
1185 0 : if (!btrfs_dev_is_sequential(device, pos) ||
1186 0 : btrfs_dev_is_empty_zone(device, pos))
1187 0 : continue;
1188 :
1189 : /* Free regions should be empty */
1190 0 : btrfs_warn_in_rcu(
1191 : device->fs_info,
1192 : "zoned: resetting device %s (devid %llu) zone %llu for allocation",
1193 : rcu_str_deref(device->name), device->devid, pos >> shift);
1194 0 : WARN_ON_ONCE(1);
1195 :
1196 0 : ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,
1197 : &reset_bytes);
1198 0 : if (ret)
1199 0 : return ret;
1200 : }
1201 :
1202 : return 0;
1203 : }
1204 :
1205 : /*
1206 : * Calculate an allocation pointer from the extent allocation information
1207 : * for a block group consisting of conventional zones. It points to the
1208 : * end of the highest addressed extent in the block group as the
1209 : * allocation offset.
1210 : */
1211 0 : static int calculate_alloc_pointer(struct btrfs_block_group *cache,
1212 : u64 *offset_ret, bool new)
1213 : {
1214 0 : struct btrfs_fs_info *fs_info = cache->fs_info;
1215 0 : struct btrfs_root *root;
1216 0 : struct btrfs_path *path;
1217 0 : struct btrfs_key key;
1218 0 : struct btrfs_key found_key;
1219 0 : int ret;
1220 0 : u64 length;
1221 :
1222 : /*
1223 : * Avoid tree lookups for a new block group, there's no use for it.
1224 : * The allocation offset must always be 0.
1225 : *
1226 : * Also, we have a lock chain of extent buffer lock -> chunk mutex.
1227 : * For a new block group, this function is called from
1228 : * btrfs_make_block_group() which is already taking the chunk mutex.
1229 : * Thus, we cannot call calculate_alloc_pointer() which takes extent
1230 : * buffer locks to avoid deadlock.
1231 : */
1232 0 : if (new) {
1233 0 : *offset_ret = 0;
1234 0 : return 0;
1235 : }
1236 :
1237 0 : path = btrfs_alloc_path();
1238 0 : if (!path)
1239 : return -ENOMEM;
1240 :
1241 0 : key.objectid = cache->start + cache->length;
1242 0 : key.type = 0;
1243 0 : key.offset = 0;
1244 :
1245 0 : root = btrfs_extent_root(fs_info, key.objectid);
1246 0 : ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1247 : /* We should not find the exact match */
1248 0 : if (!ret)
1249 : ret = -EUCLEAN;
1250 0 : if (ret < 0)
1251 0 : goto out;
1252 :
1253 0 : ret = btrfs_previous_extent_item(root, path, cache->start);
1254 0 : if (ret) {
1255 0 : if (ret == 1) {
1256 0 : ret = 0;
1257 0 : *offset_ret = 0;
1258 : }
1259 0 : goto out;
1260 : }
1261 :
1262 0 : btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
1263 :
1264 0 : if (found_key.type == BTRFS_EXTENT_ITEM_KEY)
1265 0 : length = found_key.offset;
1266 : else
1267 0 : length = fs_info->nodesize;
1268 :
1269 0 : if (!(found_key.objectid >= cache->start &&
1270 0 : found_key.objectid + length <= cache->start + cache->length)) {
1271 0 : ret = -EUCLEAN;
1272 0 : goto out;
1273 : }
1274 0 : *offset_ret = found_key.objectid + length - cache->start;
1275 0 : ret = 0;
1276 :
1277 0 : out:
1278 0 : btrfs_free_path(path);
1279 0 : return ret;
1280 : }
1281 :
1282 0 : int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
1283 : {
1284 0 : struct btrfs_fs_info *fs_info = cache->fs_info;
1285 0 : struct extent_map_tree *em_tree = &fs_info->mapping_tree;
1286 0 : struct extent_map *em;
1287 0 : struct map_lookup *map;
1288 0 : struct btrfs_device *device;
1289 0 : u64 logical = cache->start;
1290 0 : u64 length = cache->length;
1291 0 : int ret;
1292 0 : int i;
1293 0 : unsigned int nofs_flag;
1294 0 : u64 *alloc_offsets = NULL;
1295 0 : u64 *caps = NULL;
1296 0 : u64 *physical = NULL;
1297 0 : unsigned long *active = NULL;
1298 0 : u64 last_alloc = 0;
1299 0 : u32 num_sequential = 0, num_conventional = 0;
1300 :
1301 0 : if (!btrfs_is_zoned(fs_info))
1302 : return 0;
1303 :
1304 : /* Sanity check */
1305 0 : if (!IS_ALIGNED(length, fs_info->zone_size)) {
1306 0 : btrfs_err(fs_info,
1307 : "zoned: block group %llu len %llu unaligned to zone size %llu",
1308 : logical, length, fs_info->zone_size);
1309 0 : return -EIO;
1310 : }
1311 :
1312 : /* Get the chunk mapping */
1313 0 : read_lock(&em_tree->lock);
1314 0 : em = lookup_extent_mapping(em_tree, logical, length);
1315 0 : read_unlock(&em_tree->lock);
1316 :
1317 0 : if (!em)
1318 : return -EINVAL;
1319 :
1320 0 : map = em->map_lookup;
1321 :
1322 0 : cache->physical_map = kmemdup(map, map_lookup_size(map->num_stripes), GFP_NOFS);
1323 0 : if (!cache->physical_map) {
1324 0 : ret = -ENOMEM;
1325 0 : goto out;
1326 : }
1327 :
1328 0 : alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS);
1329 0 : if (!alloc_offsets) {
1330 0 : ret = -ENOMEM;
1331 0 : goto out;
1332 : }
1333 :
1334 0 : caps = kcalloc(map->num_stripes, sizeof(*caps), GFP_NOFS);
1335 0 : if (!caps) {
1336 0 : ret = -ENOMEM;
1337 0 : goto out;
1338 : }
1339 :
1340 0 : physical = kcalloc(map->num_stripes, sizeof(*physical), GFP_NOFS);
1341 0 : if (!physical) {
1342 0 : ret = -ENOMEM;
1343 0 : goto out;
1344 : }
1345 :
1346 0 : active = bitmap_zalloc(map->num_stripes, GFP_NOFS);
1347 0 : if (!active) {
1348 0 : ret = -ENOMEM;
1349 0 : goto out;
1350 : }
1351 :
1352 0 : for (i = 0; i < map->num_stripes; i++) {
1353 0 : bool is_sequential;
1354 0 : struct blk_zone zone;
1355 0 : struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
1356 0 : int dev_replace_is_ongoing = 0;
1357 :
1358 0 : device = map->stripes[i].dev;
1359 0 : physical[i] = map->stripes[i].physical;
1360 :
1361 0 : if (device->bdev == NULL) {
1362 0 : alloc_offsets[i] = WP_MISSING_DEV;
1363 0 : continue;
1364 : }
1365 :
1366 0 : is_sequential = btrfs_dev_is_sequential(device, physical[i]);
1367 0 : if (is_sequential)
1368 0 : num_sequential++;
1369 : else
1370 0 : num_conventional++;
1371 :
1372 : /*
1373 : * Consider a zone as active if we can allow any number of
1374 : * active zones.
1375 : */
1376 0 : if (!device->zone_info->max_active_zones)
1377 0 : __set_bit(i, active);
1378 :
1379 0 : if (!is_sequential) {
1380 0 : alloc_offsets[i] = WP_CONVENTIONAL;
1381 0 : continue;
1382 : }
1383 :
1384 : /*
1385 : * This zone will be used for allocation, so mark this zone
1386 : * non-empty.
1387 : */
1388 0 : btrfs_dev_clear_zone_empty(device, physical[i]);
1389 :
1390 0 : down_read(&dev_replace->rwsem);
1391 0 : dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
1392 0 : if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
1393 0 : btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical[i]);
1394 0 : up_read(&dev_replace->rwsem);
1395 :
1396 : /*
1397 : * The group is mapped to a sequential zone. Get the zone write
1398 : * pointer to determine the allocation offset within the zone.
1399 : */
1400 0 : WARN_ON(!IS_ALIGNED(physical[i], fs_info->zone_size));
1401 0 : nofs_flag = memalloc_nofs_save();
1402 0 : ret = btrfs_get_dev_zone(device, physical[i], &zone);
1403 0 : memalloc_nofs_restore(nofs_flag);
1404 0 : if (ret == -EIO || ret == -EOPNOTSUPP) {
1405 0 : ret = 0;
1406 0 : alloc_offsets[i] = WP_MISSING_DEV;
1407 0 : continue;
1408 0 : } else if (ret) {
1409 0 : goto out;
1410 : }
1411 :
1412 0 : if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
1413 0 : btrfs_err_in_rcu(fs_info,
1414 : "zoned: unexpected conventional zone %llu on device %s (devid %llu)",
1415 : zone.start << SECTOR_SHIFT,
1416 : rcu_str_deref(device->name), device->devid);
1417 0 : ret = -EIO;
1418 0 : goto out;
1419 : }
1420 :
1421 0 : caps[i] = (zone.capacity << SECTOR_SHIFT);
1422 :
1423 0 : switch (zone.cond) {
1424 0 : case BLK_ZONE_COND_OFFLINE:
1425 : case BLK_ZONE_COND_READONLY:
1426 0 : btrfs_err(fs_info,
1427 : "zoned: offline/readonly zone %llu on device %s (devid %llu)",
1428 : physical[i] >> device->zone_info->zone_size_shift,
1429 : rcu_str_deref(device->name), device->devid);
1430 0 : alloc_offsets[i] = WP_MISSING_DEV;
1431 0 : break;
1432 0 : case BLK_ZONE_COND_EMPTY:
1433 0 : alloc_offsets[i] = 0;
1434 0 : break;
1435 0 : case BLK_ZONE_COND_FULL:
1436 0 : alloc_offsets[i] = caps[i];
1437 0 : break;
1438 0 : default:
1439 : /* Partially used zone */
1440 0 : alloc_offsets[i] =
1441 0 : ((zone.wp - zone.start) << SECTOR_SHIFT);
1442 0 : __set_bit(i, active);
1443 : break;
1444 : }
1445 : }
1446 :
1447 0 : if (num_sequential > 0)
1448 0 : set_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags);
1449 :
1450 0 : if (num_conventional > 0) {
1451 : /* Zone capacity is always the zone size in emulation */
1452 0 : cache->zone_capacity = cache->length;
1453 0 : ret = calculate_alloc_pointer(cache, &last_alloc, new);
1454 0 : if (ret) {
1455 0 : btrfs_err(fs_info,
1456 : "zoned: failed to determine allocation offset of bg %llu",
1457 : cache->start);
1458 0 : goto out;
1459 0 : } else if (map->num_stripes == num_conventional) {
1460 0 : cache->alloc_offset = last_alloc;
1461 0 : set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
1462 0 : goto out;
1463 : }
1464 : }
1465 :
1466 0 : switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
1467 0 : case 0: /* single */
1468 0 : if (alloc_offsets[0] == WP_MISSING_DEV) {
1469 0 : btrfs_err(fs_info,
1470 : "zoned: cannot recover write pointer for zone %llu",
1471 : physical[0]);
1472 0 : ret = -EIO;
1473 0 : goto out;
1474 : }
1475 0 : cache->alloc_offset = alloc_offsets[0];
1476 0 : cache->zone_capacity = caps[0];
1477 0 : if (test_bit(0, active))
1478 0 : set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
1479 : break;
1480 0 : case BTRFS_BLOCK_GROUP_DUP:
1481 0 : if (map->type & BTRFS_BLOCK_GROUP_DATA) {
1482 0 : btrfs_err(fs_info, "zoned: profile DUP not yet supported on data bg");
1483 0 : ret = -EINVAL;
1484 0 : goto out;
1485 : }
1486 0 : if (alloc_offsets[0] == WP_MISSING_DEV) {
1487 0 : btrfs_err(fs_info,
1488 : "zoned: cannot recover write pointer for zone %llu",
1489 : physical[0]);
1490 0 : ret = -EIO;
1491 0 : goto out;
1492 : }
1493 0 : if (alloc_offsets[1] == WP_MISSING_DEV) {
1494 0 : btrfs_err(fs_info,
1495 : "zoned: cannot recover write pointer for zone %llu",
1496 : physical[1]);
1497 0 : ret = -EIO;
1498 0 : goto out;
1499 : }
1500 0 : if (alloc_offsets[0] != alloc_offsets[1]) {
1501 0 : btrfs_err(fs_info,
1502 : "zoned: write pointer offset mismatch of zones in DUP profile");
1503 0 : ret = -EIO;
1504 0 : goto out;
1505 : }
1506 0 : if (test_bit(0, active) != test_bit(1, active)) {
1507 0 : if (!btrfs_zone_activate(cache)) {
1508 0 : ret = -EIO;
1509 0 : goto out;
1510 : }
1511 : } else {
1512 0 : if (test_bit(0, active))
1513 0 : set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
1514 0 : &cache->runtime_flags);
1515 : }
1516 0 : cache->alloc_offset = alloc_offsets[0];
1517 0 : cache->zone_capacity = min(caps[0], caps[1]);
1518 0 : break;
1519 0 : case BTRFS_BLOCK_GROUP_RAID1:
1520 : case BTRFS_BLOCK_GROUP_RAID0:
1521 : case BTRFS_BLOCK_GROUP_RAID10:
1522 : case BTRFS_BLOCK_GROUP_RAID5:
1523 : case BTRFS_BLOCK_GROUP_RAID6:
1524 : /* non-single profiles are not supported yet */
1525 : default:
1526 0 : btrfs_err(fs_info, "zoned: profile %s not yet supported",
1527 : btrfs_bg_type_to_raid_name(map->type));
1528 0 : ret = -EINVAL;
1529 0 : goto out;
1530 : }
1531 :
1532 0 : out:
1533 0 : if (cache->alloc_offset > fs_info->zone_size) {
1534 0 : btrfs_err(fs_info,
1535 : "zoned: invalid write pointer %llu in block group %llu",
1536 : cache->alloc_offset, cache->start);
1537 0 : ret = -EIO;
1538 : }
1539 :
1540 0 : if (cache->alloc_offset > cache->zone_capacity) {
1541 0 : btrfs_err(fs_info,
1542 : "zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
1543 : cache->alloc_offset, cache->zone_capacity,
1544 : cache->start);
1545 0 : ret = -EIO;
1546 : }
1547 :
1548 : /* An extent is allocated after the write pointer */
1549 0 : if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
1550 0 : btrfs_err(fs_info,
1551 : "zoned: got wrong write pointer in BG %llu: %llu > %llu",
1552 : logical, last_alloc, cache->alloc_offset);
1553 0 : ret = -EIO;
1554 : }
1555 :
1556 0 : if (!ret) {
1557 0 : cache->meta_write_pointer = cache->alloc_offset + cache->start;
1558 0 : if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags)) {
1559 0 : btrfs_get_block_group(cache);
1560 0 : spin_lock(&fs_info->zone_active_bgs_lock);
1561 0 : list_add_tail(&cache->active_bg_list,
1562 : &fs_info->zone_active_bgs);
1563 0 : spin_unlock(&fs_info->zone_active_bgs_lock);
1564 : }
1565 : } else {
1566 0 : kfree(cache->physical_map);
1567 0 : cache->physical_map = NULL;
1568 : }
1569 0 : bitmap_free(active);
1570 0 : kfree(physical);
1571 0 : kfree(caps);
1572 0 : kfree(alloc_offsets);
1573 0 : free_extent_map(em);
1574 :
1575 0 : return ret;
1576 : }
1577 :
1578 0 : void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
1579 : {
1580 0 : u64 unusable, free;
1581 :
1582 0 : if (!btrfs_is_zoned(cache->fs_info))
1583 : return;
1584 :
1585 0 : WARN_ON(cache->bytes_super != 0);
1586 :
1587 : /* Check for block groups that never got activated */
1588 0 : if (test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &cache->fs_info->flags) &&
1589 0 : cache->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM) &&
1590 0 : !test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags) &&
1591 0 : cache->alloc_offset == 0) {
1592 0 : unusable = cache->length;
1593 0 : free = 0;
1594 : } else {
1595 0 : unusable = (cache->alloc_offset - cache->used) +
1596 0 : (cache->length - cache->zone_capacity);
1597 0 : free = cache->zone_capacity - cache->alloc_offset;
1598 : }
1599 :
1600 : /* We only need ->free_space in ALLOC_SEQ block groups */
1601 0 : cache->cached = BTRFS_CACHE_FINISHED;
1602 0 : cache->free_space_ctl->free_space = free;
1603 0 : cache->zone_unusable = unusable;
1604 : }
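/*
 * Illustrative numbers for the accounting above: for an activated block group
 * with length 256MiB, zone_capacity 192MiB, alloc_offset 128MiB and used
 * 96MiB, zone_unusable = (128 - 96) + (256 - 192) = 96MiB and
 * free_space = 192 - 128 = 64MiB.
 */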
1605 :
1606 0 : void btrfs_redirty_list_add(struct btrfs_transaction *trans,
1607 : struct extent_buffer *eb)
1608 : {
1609 0 : if (!btrfs_is_zoned(eb->fs_info) ||
1610 : btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN))
1611 : return;
1612 :
1613 0 : ASSERT(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
1614 :
1615 0 : memzero_extent_buffer(eb, 0, eb->len);
1616 0 : set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags);
1617 0 : set_extent_buffer_dirty(eb);
1618 0 : set_extent_bit(&trans->dirty_pages, eb->start, eb->start + eb->len - 1,
1619 : EXTENT_DIRTY | EXTENT_NOWAIT, NULL);
1620 : }
1621 :
1622 0 : bool btrfs_use_zone_append(struct btrfs_bio *bbio)
1623 : {
1624 0 : u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT);
1625 0 : struct btrfs_inode *inode = bbio->inode;
1626 0 : struct btrfs_fs_info *fs_info = bbio->fs_info;
1627 0 : struct btrfs_block_group *cache;
1628 0 : bool ret = false;
1629 :
1630 0 : if (!btrfs_is_zoned(fs_info))
1631 : return false;
1632 :
1633 0 : if (!inode || !is_data_inode(&inode->vfs_inode))
1634 : return false;
1635 :
1636 0 : if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE)
1637 : return false;
1638 :
1639 : /*
1640 : * Using REQ_OP_ZONE_APPEND for relocation can break assumptions on the
1641 : * extent layout the relocation code has.
1642 : * Furthermore we have set aside our own block group from which only the
1643 : * relocation "process" can allocate and make sure only one process at a
1644 : * time can add pages to an extent that gets relocated, so it's safe to
1645 : * use regular REQ_OP_WRITE for this special case.
1646 : */
1647 0 : if (btrfs_is_data_reloc_root(inode->root))
1648 : return false;
1649 :
1650 0 : cache = btrfs_lookup_block_group(fs_info, start);
1651 0 : ASSERT(cache);
1652 0 : if (!cache)
1653 : return false;
1654 :
1655 0 : ret = !!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags);
1656 0 : btrfs_put_block_group(cache);
1657 :
1658 0 : return ret;
1659 : }
1660 :
1661 0 : void btrfs_record_physical_zoned(struct btrfs_bio *bbio)
1662 : {
1663 0 : const u64 physical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
1664 0 : struct btrfs_ordered_sum *sum = bbio->sums;
1665 :
1666 0 : if (physical < bbio->orig_physical)
1667 0 : sum->logical -= bbio->orig_physical - physical;
1668 : else
1669 0 : sum->logical += physical - bbio->orig_physical;
1670 0 : }
1671 :
1672 0 : static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered,
1673 : u64 logical)
1674 : {
1675 0 : struct extent_map_tree *em_tree = &BTRFS_I(ordered->inode)->extent_tree;
1676 0 : struct extent_map *em;
1677 :
1678 0 : ordered->disk_bytenr = logical;
1679 :
1680 0 : write_lock(&em_tree->lock);
1681 0 : em = search_extent_mapping(em_tree, ordered->file_offset,
1682 : ordered->num_bytes);
1683 0 : em->block_start = logical;
1684 0 : free_extent_map(em);
1685 0 : write_unlock(&em_tree->lock);
1686 0 : }
1687 :
1688 0 : static bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent *ordered,
1689 : u64 logical, u64 len)
1690 : {
1691 0 : struct btrfs_ordered_extent *new;
1692 :
1693 0 : if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
1694 0 : split_extent_map(BTRFS_I(ordered->inode), ordered->file_offset,
1695 : ordered->num_bytes, len, logical))
1696 : return false;
1697 :
1698 0 : new = btrfs_split_ordered_extent(ordered, len);
1699 0 : if (IS_ERR(new))
1700 : return false;
1701 0 : new->disk_bytenr = logical;
1702 0 : btrfs_finish_one_ordered(new);
1703 0 : return true;
1704 : }
1705 :
1706 0 : void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered)
1707 : {
1708 0 : struct btrfs_inode *inode = BTRFS_I(ordered->inode);
1709 0 : struct btrfs_fs_info *fs_info = inode->root->fs_info;
1710 0 : struct btrfs_ordered_sum *sum =
1711 0 : list_first_entry(&ordered->list, typeof(*sum), list);
1712 0 : u64 logical = sum->logical;
1713 0 : u64 len = sum->len;
1714 :
1715 0 : while (len < ordered->disk_num_bytes) {
1716 0 : sum = list_next_entry(sum, list);
1717 0 : if (sum->logical == logical + len) {
1718 0 : len += sum->len;
1719 0 : continue;
1720 : }
1721 0 : if (!btrfs_zoned_split_ordered(ordered, logical, len)) {
1722 0 : set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
1723 0 : btrfs_err(fs_info, "failed to split ordered extent");
1724 0 : goto out;
1725 : }
1726 0 : logical = sum->logical;
1727 0 : len = sum->len;
1728 : }
1729 :
1730 0 : if (ordered->disk_bytenr != logical)
1731 0 : btrfs_rewrite_logical_zoned(ordered, logical);
1732 :
1733 0 : out:
1734 : /*
1735 : * If we end up here for nodatasum I/O, the btrfs_ordered_sum structures
1736 : * were allocated by btrfs_alloc_dummy_sum only to record the logical
1737 : * addresses and don't contain actual checksums. We thus must free them
1738 : * here so that we don't attempt to log the csums later.
1739 : */
1740 0 : if ((inode->flags & BTRFS_INODE_NODATASUM) ||
1741 0 : test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) {
1742 0 : while ((sum = list_first_entry_or_null(&ordered->list,
1743 : typeof(*sum), list))) {
1744 0 : list_del(&sum->list);
1745 0 : kfree(sum);
1746 : }
1747 : }
1748 0 : }
1749 :
1750 0 : bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
1751 : struct extent_buffer *eb,
1752 : struct btrfs_block_group **cache_ret)
1753 : {
1754 0 : struct btrfs_block_group *cache;
1755 0 : bool ret = true;
1756 :
1757 0 : if (!btrfs_is_zoned(fs_info))
1758 : return true;
1759 :
1760 0 : cache = btrfs_lookup_block_group(fs_info, eb->start);
1761 0 : if (!cache)
1762 : return true;
1763 :
1764 0 : if (cache->meta_write_pointer != eb->start) {
1765 0 : btrfs_put_block_group(cache);
1766 0 : cache = NULL;
1767 0 : ret = false;
1768 : } else {
1769 0 : cache->meta_write_pointer = eb->start + eb->len;
1770 : }
1771 :
1772 0 : *cache_ret = cache;
1773 :
1774 0 : return ret;
1775 : }
1776 :
1777 0 : void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache,
1778 : struct extent_buffer *eb)
1779 : {
1780 0 : if (!btrfs_is_zoned(eb->fs_info) || !cache)
1781 : return;
1782 :
1783 0 : ASSERT(cache->meta_write_pointer == eb->start + eb->len);
1784 0 : cache->meta_write_pointer = eb->start;
1785 : }
1786 :
1787 0 : int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length)
1788 : {
1789 0 : if (!btrfs_dev_is_sequential(device, physical))
1790 : return -EOPNOTSUPP;
1791 :
1792 0 : return blkdev_issue_zeroout(device->bdev, physical >> SECTOR_SHIFT,
1793 : length >> SECTOR_SHIFT, GFP_NOFS, 0);
1794 : }
1795 :
1796 0 : static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
1797 : struct blk_zone *zone)
1798 : {
1799 0 : struct btrfs_io_context *bioc = NULL;
1800 0 : u64 mapped_length = PAGE_SIZE;
1801 0 : unsigned int nofs_flag;
1802 0 : int nmirrors;
1803 0 : int i, ret;
1804 :
1805 0 : ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
1806 : &mapped_length, &bioc, NULL, NULL, 1);
1807 0 : if (ret || !bioc || mapped_length < PAGE_SIZE) {
1808 0 : ret = -EIO;
1809 0 : goto out_put_bioc;
1810 : }
1811 :
1812 0 : if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1813 0 : ret = -EINVAL;
1814 0 : goto out_put_bioc;
1815 : }
1816 :
1817 0 : nofs_flag = memalloc_nofs_save();
1818 0 : nmirrors = (int)bioc->num_stripes;
1819 0 : for (i = 0; i < nmirrors; i++) {
1820 0 : u64 physical = bioc->stripes[i].physical;
1821 0 : struct btrfs_device *dev = bioc->stripes[i].dev;
1822 :
1823 : /* Missing device */
1824 0 : if (!dev->bdev)
1825 0 : continue;
1826 :
1827 0 : ret = btrfs_get_dev_zone(dev, physical, zone);
1828 : /* Failing device */
1829 0 : if (ret == -EIO || ret == -EOPNOTSUPP)
1830 0 : continue;
1831 : break;
1832 : }
1833 0 : memalloc_nofs_restore(nofs_flag);
1834 0 : out_put_bioc:
1835 0 : btrfs_put_bioc(bioc);
1836 0 : return ret;
1837 : }
1838 :
1839 : /*
1840 : * Synchronize the write pointer in the zone at @physical_start on @tgt_dev by
1841 : * zero-filling the range from @physical_pos up to the write pointer of the
1842 : * dev-replace source device.
1843 : */
1844 0 : int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
1845 : u64 physical_start, u64 physical_pos)
1846 : {
1847 0 : struct btrfs_fs_info *fs_info = tgt_dev->fs_info;
1848 0 : struct blk_zone zone;
1849 0 : u64 length;
1850 0 : u64 wp;
1851 0 : int ret;
1852 :
1853 0 : if (!btrfs_dev_is_sequential(tgt_dev, physical_pos))
1854 : return 0;
1855 :
1856 0 : ret = read_zone_info(fs_info, logical, &zone);
1857 0 : if (ret)
1858 : return ret;
1859 :
1860 0 : wp = physical_start + ((zone.wp - zone.start) << SECTOR_SHIFT);
1861 :
1862 0 : if (physical_pos == wp)
1863 : return 0;
1864 :
1865 0 : if (physical_pos > wp)
1866 : return -EUCLEAN;
1867 :
1868 0 : length = wp - physical_pos;
1869 0 : return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length);
1870 : }
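/*
 * Worked example (made-up numbers) for the write pointer math in
 * btrfs_sync_zone_write_pointer(): zone.wp and zone.start are reported in
 * 512-byte sectors while the btrfs physical offsets are in bytes, hence the
 * << SECTOR_SHIFT conversion.
 *
 *   physical_start = 256 MiB           (byte offset of the target zone)
 *   zone.start     = 524288 sectors    (source zone, 256 MiB / 512)
 *   zone.wp        = 526336 sectors    (source wrote 2048 sectors = 1 MiB)
 *
 *   wp = 256 MiB + ((526336 - 524288) << 9) = 257 MiB
 *
 * If the target device has only reached physical_pos = 256 MiB + 512 KiB,
 * the remaining 512 KiB up to wp is zero-filled; a physical_pos beyond wp
 * would indicate corruption and the function returns -EUCLEAN.
 */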
1871 :
1872 : /*
1873 : * Activate block group and underlying device zones
1874 : *
1875 : * @block_group: the block group to activate
1876 : *
1877 : * Return: true on success, false otherwise
1878 : */
1879 0 : bool btrfs_zone_activate(struct btrfs_block_group *block_group)
1880 : {
1881 0 : struct btrfs_fs_info *fs_info = block_group->fs_info;
1882 0 : struct btrfs_space_info *space_info = block_group->space_info;
1883 0 : struct map_lookup *map;
1884 0 : struct btrfs_device *device;
1885 0 : u64 physical;
1886 0 : bool ret;
1887 0 : int i;
1888 :
1889 0 : if (!btrfs_is_zoned(block_group->fs_info))
1890 : return true;
1891 :
1892 0 : map = block_group->physical_map;
1893 :
1894 0 : spin_lock(&space_info->lock);
1895 0 : spin_lock(&block_group->lock);
1896 0 : if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
1897 0 : ret = true;
1898 0 : goto out_unlock;
1899 : }
1900 :
1901 : /* No space left */
1902 0 : if (btrfs_zoned_bg_is_full(block_group)) {
1903 0 : ret = false;
1904 0 : goto out_unlock;
1905 : }
1906 :
1907 0 : for (i = 0; i < map->num_stripes; i++) {
1908 0 : device = map->stripes[i].dev;
1909 0 : physical = map->stripes[i].physical;
1910 :
1911 0 : if (device->zone_info->max_active_zones == 0)
1912 0 : continue;
1913 :
1914 0 : if (!btrfs_dev_set_active_zone(device, physical)) {
1915 : /* Cannot activate the zone */
1916 0 : ret = false;
1917 0 : goto out_unlock;
1918 : }
1919 : }
1920 :
1921 : /* Successfully activated all the zones */
1922 0 : set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
1923 0 : WARN_ON(block_group->alloc_offset != 0);
1924 0 : if (block_group->zone_unusable == block_group->length) {
1925 0 : block_group->zone_unusable = block_group->length - block_group->zone_capacity;
1926 0 : space_info->bytes_zone_unusable -= block_group->zone_capacity;
1927 : }
1928 0 : spin_unlock(&block_group->lock);
1929 0 : btrfs_try_granting_tickets(fs_info, space_info);
1930 0 : spin_unlock(&space_info->lock);
1931 :
1932 : /* For the active block group list */
1933 0 : btrfs_get_block_group(block_group);
1934 :
1935 0 : spin_lock(&fs_info->zone_active_bgs_lock);
1936 0 : list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs);
1937 0 : spin_unlock(&fs_info->zone_active_bgs_lock);
1938 :
1939 0 : return true;
1940 :
1941 0 : out_unlock:
1942 0 : spin_unlock(&block_group->lock);
1943 0 : spin_unlock(&space_info->lock);
1944 0 : return ret;
1945 : }
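/*
 * Worked example (illustrative numbers) for the zone_unusable adjustment in
 * btrfs_zone_activate(): when the whole block group is still accounted as
 * zone_unusable (zone_unusable == length, typically a block group that has
 * not been activated yet), activation hands the usable capacity back to the
 * space_info. For a 256 MiB zone with a usable zone_capacity of 192 MiB:
 *
 *   before: zone_unusable = length              = 256 MiB
 *   after:  zone_unusable = 256 MiB - 192 MiB   =  64 MiB
 *           space_info->bytes_zone_unusable    -= 192 MiB
 *
 * so the allocator can now grant tickets against the 192 MiB of capacity.
 */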
1946 :
1947 0 : static void wait_eb_writebacks(struct btrfs_block_group *block_group)
1948 : {
1949 0 : struct btrfs_fs_info *fs_info = block_group->fs_info;
1950 0 : const u64 end = block_group->start + block_group->length;
1951 0 : struct radix_tree_iter iter;
1952 0 : struct extent_buffer *eb;
1953 0 : void __rcu **slot;
1954 :
1955 0 : rcu_read_lock();
1956 0 : radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter,
1957 : block_group->start >> fs_info->sectorsize_bits) {
1958 0 : eb = radix_tree_deref_slot(slot);
1959 0 : if (!eb)
1960 0 : continue;
1961 0 : if (radix_tree_deref_retry(eb)) {
1962 0 : slot = radix_tree_iter_retry(&iter);
1963 0 : continue;
1964 : }
1965 :
1966 0 : if (eb->start < block_group->start)
1967 0 : continue;
1968 0 : if (eb->start >= end)
1969 : break;
1970 :
1971 0 : slot = radix_tree_iter_resume(slot, &iter);
1972 0 : rcu_read_unlock();
1973 0 : wait_on_extent_buffer_writeback(eb);
1974 0 : rcu_read_lock();
1975 : }
1976 0 : rcu_read_unlock();
1977 0 : }
1978 :
1979 0 : static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
1980 : {
1981 0 : struct btrfs_fs_info *fs_info = block_group->fs_info;
1982 0 : struct map_lookup *map;
1983 0 : const bool is_metadata = (block_group->flags &
1984 : (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM));
1985 0 : int ret = 0;
1986 0 : int i;
1987 :
1988 0 : spin_lock(&block_group->lock);
1989 0 : if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
1990 0 : spin_unlock(&block_group->lock);
1991 0 : return 0;
1992 : }
1993 :
1994 : /* Check if we have unwritten allocated space */
1995 0 : if (is_metadata &&
1996 0 : block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) {
1997 0 : spin_unlock(&block_group->lock);
1998 0 : return -EAGAIN;
1999 : }
2000 :
2001 : /*
2002 : * If we are sure that the block group is full (no more room left for new
2003 : * allocations) and the I/O for the last usable block has completed, we
2004 : * don't need to wait for the other I/Os. This holds because we ensure
2005 : * sequential I/O submission by using the ZONE_APPEND command for data and
2006 : * block_group->meta_write_pointer for metadata.
2007 : */
2008 0 : if (!fully_written) {
2009 0 : spin_unlock(&block_group->lock);
2010 :
2011 0 : ret = btrfs_inc_block_group_ro(block_group, false);
2012 0 : if (ret)
2013 : return ret;
2014 :
2015 : /* Ensure all writes in this block group finish */
2016 0 : btrfs_wait_block_group_reservations(block_group);
2017 : /* No need to wait for NOCOW writers. Zoned mode does not allow that */
2018 0 : btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
2019 : block_group->length);
2020 : /* Wait for extent buffers to be written. */
2021 0 : if (is_metadata)
2022 0 : wait_eb_writebacks(block_group);
2023 :
2024 0 : spin_lock(&block_group->lock);
2025 :
2026 : /*
2027 : * Bail out if someone already deactivated the block group, or if
2028 : * space is still reserved in the block group.
2029 : */
2030 0 : if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
2031 : &block_group->runtime_flags)) {
2032 0 : spin_unlock(&block_group->lock);
2033 0 : btrfs_dec_block_group_ro(block_group);
2034 0 : return 0;
2035 : }
2036 :
2037 0 : if (block_group->reserved) {
2038 0 : spin_unlock(&block_group->lock);
2039 0 : btrfs_dec_block_group_ro(block_group);
2040 0 : return -EAGAIN;
2041 : }
2042 : }
2043 :
2044 0 : clear_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
2045 0 : block_group->alloc_offset = block_group->zone_capacity;
2046 0 : block_group->free_space_ctl->free_space = 0;
2047 0 : btrfs_clear_treelog_bg(block_group);
2048 0 : btrfs_clear_data_reloc_bg(block_group);
2049 0 : spin_unlock(&block_group->lock);
2050 :
2051 0 : map = block_group->physical_map;
2052 0 : for (i = 0; i < map->num_stripes; i++) {
2053 0 : struct btrfs_device *device = map->stripes[i].dev;
2054 0 : const u64 physical = map->stripes[i].physical;
2055 :
2056 0 : if (device->zone_info->max_active_zones == 0)
2057 0 : continue;
2058 :
2059 0 : ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
2060 : physical >> SECTOR_SHIFT,
2061 0 : device->zone_info->zone_size >> SECTOR_SHIFT,
2062 : GFP_NOFS);
2063 :
2064 0 : if (ret)
2065 0 : return ret;
2066 :
2067 0 : btrfs_dev_clear_active_zone(device, physical);
2068 : }
2069 :
2070 0 : if (!fully_written)
2071 0 : btrfs_dec_block_group_ro(block_group);
2072 :
2073 0 : spin_lock(&fs_info->zone_active_bgs_lock);
2074 0 : ASSERT(!list_empty(&block_group->active_bg_list));
2075 0 : list_del_init(&block_group->active_bg_list);
2076 0 : spin_unlock(&fs_info->zone_active_bgs_lock);
2077 :
2078 : /* For active_bg_list */
2079 0 : btrfs_put_block_group(block_group);
2080 :
2081 0 : clear_and_wake_up_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);
2082 :
2083 0 : return 0;
2084 : }
2085 :
2086 0 : int btrfs_zone_finish(struct btrfs_block_group *block_group)
2087 : {
2088 0 : if (!btrfs_is_zoned(block_group->fs_info))
2089 : return 0;
2090 :
2091 0 : return do_zone_finish(block_group, false);
2092 : }
2093 :
2094 0 : bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
2095 : {
2096 0 : struct btrfs_fs_info *fs_info = fs_devices->fs_info;
2097 0 : struct btrfs_device *device;
2098 0 : bool ret = false;
2099 :
2100 0 : if (!btrfs_is_zoned(fs_info))
2101 : return true;
2102 :
2103 : /* Check if there is a device with active zones left */
2104 0 : mutex_lock(&fs_info->chunk_mutex);
2105 0 : list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
2106 0 : struct btrfs_zoned_device_info *zinfo = device->zone_info;
2107 :
2108 0 : if (!device->bdev)
2109 0 : continue;
2110 :
2111 0 : if (!zinfo->max_active_zones) {
2112 : ret = true;
2113 : break;
2114 : }
2115 :
2116 0 : switch (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
2117 0 : case 0: /* single */
2118 0 : ret = (atomic_read(&zinfo->active_zones_left) >= 1);
2119 0 : break;
2120 0 : case BTRFS_BLOCK_GROUP_DUP:
2121 0 : ret = (atomic_read(&zinfo->active_zones_left) >= 2);
2122 0 : break;
2123 : }
2124 0 : if (ret)
2125 : break;
2126 : }
2127 0 : mutex_unlock(&fs_info->chunk_mutex);
2128 :
2129 0 : if (!ret)
2130 0 : set_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);
2131 :
2132 : return ret;
2133 : }
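/*
 * Sketch (hypothetical helper name) of the per-profile requirement checked
 * in btrfs_can_activate_zone() above: on a device that enforces
 * max_active_zones, a "single" chunk needs one spare active zone and a DUP
 * chunk needs two on the same device, since both DUP stripes live there.
 * Other profiles are not considered by this check.
 */
static inline int sketch_active_zones_needed(u64 profile_flags)
{
	if (profile_flags & BTRFS_BLOCK_GROUP_DUP)
		return 2;	/* both copies on one device */
	return 1;		/* single */
}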
2134 :
2135 0 : void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
2136 : {
2137 0 : struct btrfs_block_group *block_group;
2138 0 : u64 min_alloc_bytes;
2139 :
2140 0 : if (!btrfs_is_zoned(fs_info))
2141 : return;
2142 :
2143 0 : block_group = btrfs_lookup_block_group(fs_info, logical);
2144 0 : ASSERT(block_group);
2145 :
2146 : /* No MIXED_BG on zoned btrfs. */
2147 0 : if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
2148 0 : min_alloc_bytes = fs_info->sectorsize;
2149 : else
2150 0 : min_alloc_bytes = fs_info->nodesize;
2151 :
2152 : /* Bail out if we can allocate more data from this block group. */
2153 0 : if (logical + length + min_alloc_bytes <=
2154 0 : block_group->start + block_group->zone_capacity)
2155 0 : goto out;
2156 :
2157 0 : do_zone_finish(block_group, true);
2158 :
2159 0 : out:
2160 0 : btrfs_put_block_group(block_group);
2161 : }
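/*
 * Worked example (illustrative numbers) for the bail-out condition in
 * btrfs_zone_finish_endio(): take a data block group with start = 1 GiB,
 * zone_capacity = 256 MiB and sectorsize = 4 KiB.
 *
 *   - A write ending exactly at start + 256 MiB gives
 *     logical + length + 4 KiB > start + zone_capacity, so not even one more
 *     sector fits and the zone is finished right away.
 *   - A write ending at start + 256 MiB - 4 KiB still leaves room for one
 *     sector (logical + length + 4 KiB == start + zone_capacity satisfies
 *     the "<=" check), so the function bails out and keeps the zone active.
 */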
2162 :
2163 0 : static void btrfs_zone_finish_endio_workfn(struct work_struct *work)
2164 : {
2165 0 : struct btrfs_block_group *bg =
2166 0 : container_of(work, struct btrfs_block_group, zone_finish_work);
2167 :
2168 0 : wait_on_extent_buffer_writeback(bg->last_eb);
2169 0 : free_extent_buffer(bg->last_eb);
2170 0 : btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length);
2171 0 : btrfs_put_block_group(bg);
2172 0 : }
2173 :
2174 0 : void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
2175 : struct extent_buffer *eb)
2176 : {
2177 0 : if (!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &bg->runtime_flags) ||
2178 0 : eb->start + eb->len * 2 <= bg->start + bg->zone_capacity)
2179 : return;
2180 :
2181 0 : if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) {
2182 0 : btrfs_err(bg->fs_info, "double scheduling of bg %llu zone finishing",
2183 : bg->start);
2184 0 : return;
2185 : }
2186 :
2187 : /* For the work */
2188 0 : btrfs_get_block_group(bg);
2189 0 : atomic_inc(&eb->refs);
2190 0 : bg->last_eb = eb;
2191 0 : INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn);
2192 0 : queue_work(system_unbound_wq, &bg->zone_finish_work);
2193 : }
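/*
 * Worked example (illustrative numbers) for the early return in
 * btrfs_schedule_zone_finish_bg(): with nodesize = 16 KiB and a
 * zone_capacity of 256 MiB, an extent buffer written 32 KiB before the end
 * of the capacity still leaves room for one more 16 KiB buffer
 * (eb->start + 2 * eb->len == bg->start + zone_capacity), so nothing is
 * scheduled. The buffer written 16 KiB before the end is the last one that
 * fits, and only for it is zone_finish_work queued, finishing the zone once
 * that buffer's writeback completes.
 */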
2194 :
2195 0 : void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
2196 : {
2197 0 : struct btrfs_fs_info *fs_info = bg->fs_info;
2198 :
2199 0 : spin_lock(&fs_info->relocation_bg_lock);
2200 0 : if (fs_info->data_reloc_bg == bg->start)
2201 0 : fs_info->data_reloc_bg = 0;
2202 0 : spin_unlock(&fs_info->relocation_bg_lock);
2203 0 : }
2204 :
2205 0 : void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
2206 : {
2207 0 : struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2208 0 : struct btrfs_device *device;
2209 :
2210 0 : if (!btrfs_is_zoned(fs_info))
2211 : return;
2212 :
2213 0 : mutex_lock(&fs_devices->device_list_mutex);
2214 0 : list_for_each_entry(device, &fs_devices->devices, dev_list) {
2215 0 : if (device->zone_info) {
2216 0 : vfree(device->zone_info->zone_cache);
2217 0 : device->zone_info->zone_cache = NULL;
2218 : }
2219 : }
2220 0 : mutex_unlock(&fs_devices->device_list_mutex);
2221 : }
2222 :
2223 0 : bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
2224 : {
2225 0 : struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2226 0 : struct btrfs_device *device;
2227 0 : u64 used = 0;
2228 0 : u64 total = 0;
2229 0 : u64 factor;
2230 :
2231 0 : ASSERT(btrfs_is_zoned(fs_info));
2232 :
2233 0 : if (fs_info->bg_reclaim_threshold == 0)
2234 : return false;
2235 :
2236 0 : mutex_lock(&fs_devices->device_list_mutex);
2237 0 : list_for_each_entry(device, &fs_devices->devices, dev_list) {
2238 0 : if (!device->bdev)
2239 0 : continue;
2240 :
2241 0 : total += device->disk_total_bytes;
2242 0 : used += device->bytes_used;
2243 : }
2244 0 : mutex_unlock(&fs_devices->device_list_mutex);
2245 :
2246 0 : factor = div64_u64(used * 100, total);
2247 0 : return factor >= fs_info->bg_reclaim_threshold;
2248 : }
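/*
 * Worked example (illustrative numbers) for the reclaim decision in
 * btrfs_zoned_should_reclaim(): two zoned devices of 1 TiB each, with
 * 600 GiB and 900 GiB of their space allocated to chunks (bytes_used), give
 *
 *   factor = (600 + 900) * 100 / (1024 + 1024) = 73   (integer division)
 *
 * With the default zoned bg_reclaim_threshold of 75 nothing is reclaimed
 * yet, while a threshold of 70 would start reclaiming block groups.
 */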
2249 :
2250 0 : void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
2251 : u64 length)
2252 : {
2253 0 : struct btrfs_block_group *block_group;
2254 :
2255 0 : if (!btrfs_is_zoned(fs_info))
2256 : return;
2257 :
2258 0 : block_group = btrfs_lookup_block_group(fs_info, logical);
2259 : /* It should be called on a previous data relocation block group. */
2260 0 : ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA));
2261 :
2262 0 : spin_lock(&block_group->lock);
2263 0 : if (!test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))
2264 0 : goto out;
2265 :
2266 : /* All relocation extents are written. */
2267 0 : if (block_group->start + block_group->alloc_offset == logical + length) {
2268 : /* Now, release this block group for further allocations. */
2269 0 : clear_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
2270 : &block_group->runtime_flags);
2271 : }
2272 :
2273 0 : out:
2274 0 : spin_unlock(&block_group->lock);
2275 0 : btrfs_put_block_group(block_group);
2276 : }
2277 :
2278 0 : int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
2279 : {
2280 0 : struct btrfs_block_group *block_group;
2281 0 : struct btrfs_block_group *min_bg = NULL;
2282 0 : u64 min_avail = U64_MAX;
2283 0 : int ret;
2284 :
2285 0 : spin_lock(&fs_info->zone_active_bgs_lock);
2286 0 : list_for_each_entry(block_group, &fs_info->zone_active_bgs,
2287 : active_bg_list) {
2288 0 : u64 avail;
2289 :
2290 0 : spin_lock(&block_group->lock);
2291 0 : if (block_group->reserved || block_group->alloc_offset == 0 ||
2292 0 : (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)) {
2293 0 : spin_unlock(&block_group->lock);
2294 0 : continue;
2295 : }
2296 :
2297 0 : avail = block_group->zone_capacity - block_group->alloc_offset;
2298 0 : if (min_avail > avail) {
2299 0 : if (min_bg)
2300 0 : btrfs_put_block_group(min_bg);
2301 0 : min_bg = block_group;
2302 0 : min_avail = avail;
2303 0 : btrfs_get_block_group(min_bg);
2304 : }
2305 0 : spin_unlock(&block_group->lock);
2306 : }
2307 0 : spin_unlock(&fs_info->zone_active_bgs_lock);
2308 :
2309 0 : if (!min_bg)
2310 : return 0;
2311 :
2312 0 : ret = btrfs_zone_finish(min_bg);
2313 0 : btrfs_put_block_group(min_bg);
2314 :
2315 0 : return ret < 0 ? ret : 1;
2316 : }
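/*
 * Illustrative sketch (hypothetical names, a plain array instead of the
 * zone_active_bgs list) of the selection policy in btrfs_zone_finish_one_bg()
 * above: among the finishable block groups, pick the one with the least
 * unallocated capacity left, so finishing it wastes the least space while
 * still freeing one active zone per stripe device.
 */
struct sketch_bg {
	u64 zone_capacity;
	u64 alloc_offset;
	bool finishable;	/* !reserved, non-empty, not SYSTEM in the real code */
};

static int sketch_pick_bg_to_finish(const struct sketch_bg *bgs, int nr)
{
	u64 min_avail = U64_MAX;
	int pick = -1;
	int i;

	for (i = 0; i < nr; i++) {
		u64 avail;

		if (!bgs[i].finishable)
			continue;
		avail = bgs[i].zone_capacity - bgs[i].alloc_offset;
		if (avail < min_avail) {
			min_avail = avail;
			pick = i;
		}
	}
	return pick;
}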
2317 :
2318 0 : int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
2319 : struct btrfs_space_info *space_info,
2320 : bool do_finish)
2321 : {
2322 0 : struct btrfs_block_group *bg;
2323 0 : int index;
2324 :
2325 0 : if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA))
2326 : return 0;
2327 :
2328 0 : for (;;) {
2329 0 : int ret;
2330 0 : bool need_finish = false;
2331 :
2332 0 : down_read(&space_info->groups_sem);
2333 0 : for (index = 0; index < BTRFS_NR_RAID_TYPES; index++) {
2334 0 : list_for_each_entry(bg, &space_info->block_groups[index],
2335 : list) {
2336 0 : if (!spin_trylock(&bg->lock))
2337 0 : continue;
2338 0 : if (btrfs_zoned_bg_is_full(bg) ||
2339 0 : test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
2340 : &bg->runtime_flags)) {
2341 0 : spin_unlock(&bg->lock);
2342 0 : continue;
2343 : }
2344 0 : spin_unlock(&bg->lock);
2345 :
2346 0 : if (btrfs_zone_activate(bg)) {
2347 0 : up_read(&space_info->groups_sem);
2348 0 : return 1;
2349 : }
2350 :
2351 : need_finish = true;
2352 : }
2353 : }
2354 0 : up_read(&space_info->groups_sem);
2355 :
2356 0 : if (!do_finish || !need_finish)
2357 : break;
2358 :
2359 0 : ret = btrfs_zone_finish_one_bg(fs_info);
2360 0 : if (ret == 0)
2361 : break;
2362 0 : if (ret < 0)
2363 0 : return ret;
2364 : }
2365 :
2366 : return 0;
2367 : }
|