LCOV - code coverage report
Current view: top level - fs/btrfs - block-group.c (source / functions)
Test: fstests of 6.5.0-rc4-xfsx @ Mon Jul 31 20:08:34 PDT 2023
Date: 2023-07-31 20:08:34
                          Hit    Total    Coverage
Lines:                   1653     2099      78.8 %
Functions:                 76       83      91.6 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0
       2             : 
       3             : #include <linux/sizes.h>
       4             : #include <linux/list_sort.h>
       5             : #include "misc.h"
       6             : #include "ctree.h"
       7             : #include "block-group.h"
       8             : #include "space-info.h"
       9             : #include "disk-io.h"
      10             : #include "free-space-cache.h"
      11             : #include "free-space-tree.h"
      12             : #include "volumes.h"
      13             : #include "transaction.h"
      14             : #include "ref-verify.h"
      15             : #include "sysfs.h"
      16             : #include "tree-log.h"
      17             : #include "delalloc-space.h"
      18             : #include "discard.h"
      19             : #include "raid56.h"
      20             : #include "zoned.h"
      21             : #include "fs.h"
      22             : #include "accessors.h"
      23             : #include "extent-tree.h"
      24             : 
      25             : #ifdef CONFIG_BTRFS_DEBUG
      26             : int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group)
      27             : {
      28             :         struct btrfs_fs_info *fs_info = block_group->fs_info;
      29             : 
      30             :         return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) &&
      31             :                 block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
      32             :                (btrfs_test_opt(fs_info, FRAGMENT_DATA) &&
      33             :                 block_group->flags &  BTRFS_BLOCK_GROUP_DATA);
      34             : }
      35             : #endif
      36             : 
      37             : /*
      38             :  * Return target flags in extended format or 0 if restripe for this chunk_type
      39             :  * is not in progress
      40             :  *
      41             :  * Should be called with balance_lock held
      42             :  */
      43    87147490 : static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
      44             : {
      45    87147490 :         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
      46    87147490 :         u64 target = 0;
      47             : 
      48    87147490 :         if (!bctl)
      49             :                 return 0;
      50             : 
      51     8798914 :         if (flags & BTRFS_BLOCK_GROUP_DATA &&
      52      112188 :             bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
      53           0 :                 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
      54     8798914 :         } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
      55        1343 :                    bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
      56           0 :                 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
      57     8798914 :         } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
      58     8685383 :                    bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
      59           0 :                 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
      60             :         }
      61             : 
      62             :         return target;
      63             : }
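
As an illustration of the return value, here is a minimal, hedged sketch of a caller (the same pattern btrfs_reduce_alloc_profile() below follows); it assumes an fs_info in scope and a balance started with a data convert filter such as "btrfs balance start -dconvert=raid1":

        u64 target;

        spin_lock(&fs_info->balance_lock);
        target = get_restripe_target(fs_info, BTRFS_BLOCK_GROUP_DATA);
        spin_unlock(&fs_info->balance_lock);
        /*
         * With -dconvert=raid1 in progress, target would be
         * BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_RAID1 (extended format);
         * with no convert filter it would be 0.
         */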
      64             : 
      65             : /*
      66             :  * @flags: available profiles in extended format (see ctree.h)
      67             :  *
      68             :  * Return reduced profile in chunk format.  If profile changing is in progress
      69             :  * (either running or paused) picks the target profile (if it's already
      70             :  * available), otherwise falls back to plain reducing.
      71             :  */
      72    87139754 : static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
      73             : {
      74    87139754 :         u64 num_devices = fs_info->fs_devices->rw_devices;
      75    87139754 :         u64 target;
      76    87139754 :         u64 raid_type;
      77    87139754 :         u64 allowed = 0;
      78             : 
      79             :         /*
      80             :          * See if restripe for this chunk_type is in progress, if so try to
      81             :          * reduce to the target profile
      82             :          */
      83    87139754 :         spin_lock(&fs_info->balance_lock);
      84    87147435 :         target = get_restripe_target(fs_info, flags);
      85    87147435 :         if (target) {
      86           0 :                 spin_unlock(&fs_info->balance_lock);
      87           0 :                 return extended_to_chunk(target);
      88             :         }
      89    87147435 :         spin_unlock(&fs_info->balance_lock);
      90             : 
      91             :         /* First, mask out the RAID levels which aren't possible */
      92   958585166 :         for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
      93   784290948 :                 if (num_devices >= btrfs_raid_array[raid_type].devs_min)
      94   261438318 :                         allowed |= btrfs_raid_array[raid_type].bg_flag;
      95             :         }
      96    87146783 :         allowed &= flags;
      97             : 
      98             :         /* Select the highest-redundancy RAID level. */
      99    87146783 :         if (allowed & BTRFS_BLOCK_GROUP_RAID1C4)
     100             :                 allowed = BTRFS_BLOCK_GROUP_RAID1C4;
     101    87146408 :         else if (allowed & BTRFS_BLOCK_GROUP_RAID6)
     102             :                 allowed = BTRFS_BLOCK_GROUP_RAID6;
     103    87145849 :         else if (allowed & BTRFS_BLOCK_GROUP_RAID1C3)
     104             :                 allowed = BTRFS_BLOCK_GROUP_RAID1C3;
     105    87145849 :         else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
     106             :                 allowed = BTRFS_BLOCK_GROUP_RAID5;
     107    87145840 :         else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
     108             :                 allowed = BTRFS_BLOCK_GROUP_RAID10;
     109    87145834 :         else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
     110             :                 allowed = BTRFS_BLOCK_GROUP_RAID1;
     111    87145802 :         else if (allowed & BTRFS_BLOCK_GROUP_DUP)
     112             :                 allowed = BTRFS_BLOCK_GROUP_DUP;
     113     7355779 :         else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
     114           0 :                 allowed = BTRFS_BLOCK_GROUP_RAID0;
     115             : 
     116    87146783 :         flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
     117             : 
     118    87146783 :         return extended_to_chunk(flags | allowed);
     119             : }
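
A worked example may help (a sketch; the flag values come from the block group profile definitions): on a two-device filesystem whose metadata flags carry both RAID1 and DUP, every profile with devs_min <= 2 is allowed, the mask keeps RAID1 | DUP, and the ladder above selects the more redundant RAID1:

        u64 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_RAID1 |
                    BTRFS_BLOCK_GROUP_DUP;
        u64 reduced = btrfs_reduce_alloc_profile(fs_info, flags);

        /* reduced == BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_RAID1 */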
     120             : 
     121    87142812 : u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
     122             : {
     123    87142812 :         unsigned seq;
     124    87142812 :         u64 flags;
     125             : 
     126    87142812 :         do {
     127    87142812 :                 flags = orig_flags;
     128    87142812 :                 seq = read_seqbegin(&fs_info->profiles_lock);
     129             : 
     130    87140634 :                 if (flags & BTRFS_BLOCK_GROUP_DATA)
     131     6508494 :                         flags |= fs_info->avail_data_alloc_bits;
     132    80632140 :                 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
     133        5059 :                         flags |= fs_info->avail_system_alloc_bits;
     134    80627081 :                 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
     135    80627070 :                         flags |= fs_info->avail_metadata_alloc_bits;
     136    87140634 :         } while (read_seqretry(&fs_info->profiles_lock, seq));
     137             : 
     138    87139626 :         return btrfs_reduce_alloc_profile(fs_info, flags);
     139             : }
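
As a usage sketch (hedged, assuming an fs_info in scope): a caller that wants to know which profile a new chunk of a given type would use passes only the type bit, and the seqlock loop above folds in the currently available profile bits before reduction:

        u64 alloc_flags;

        alloc_flags = btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA);
        /*
         * alloc_flags now holds BTRFS_BLOCK_GROUP_METADATA plus at most one
         * profile bit (none means the "single" profile).
         */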
     140             : 
     141    11537552 : void btrfs_get_block_group(struct btrfs_block_group *cache)
     142             : {
     143    11537552 :         refcount_inc(&cache->refs);
     144    73473619 : }
     145             : 
     146    85069784 : void btrfs_put_block_group(struct btrfs_block_group *cache)
     147             : {
     148    85069784 :         if (refcount_dec_and_test(&cache->refs)) {
     149       30743 :                 WARN_ON(cache->pinned > 0);
     150             :                 /*
     151             :                  * If there was a failure to cleanup a log tree, very likely due
     152             :                  * to an IO failure on a writeback attempt of one or more of its
     153             :                  * extent buffers, we could not do proper (and cheap) unaccounting
     154             :                  * of their reserved space, so don't warn on reserved > 0 in that
     155             :                  * case.
     156             :                  */
     157       30743 :                 if (!(cache->flags & BTRFS_BLOCK_GROUP_METADATA) ||
     158        3754 :                     !BTRFS_FS_LOG_CLEANUP_ERROR(cache->fs_info))
     159       30742 :                         WARN_ON(cache->reserved > 0);
     160             : 
     161             :                 /*
     162             :                  * A block_group shouldn't be on the discard_list anymore.
     163             :                  * Remove the block_group from the discard_list to prevent us
     164             :                  * from causing a panic due to NULL pointer dereference.
     165             :                  */
     166       30743 :                 if (WARN_ON(!list_empty(&cache->discard_list)))
     167           0 :                         btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
     168             :                                                   cache);
     169             : 
     170       30743 :                 kfree(cache->free_space_ctl);
     171       30743 :                 kfree(cache->physical_map);
     172       30743 :                 kfree(cache);
     173             :         }
     174    85069935 : }
     175             : 
     176             : /*
     177             :  * This adds the block group to the fs_info rb tree for the block group cache
     178             :  */
     179       30776 : static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
     180             :                                        struct btrfs_block_group *block_group)
     181             : {
     182       30776 :         struct rb_node **p;
     183       30776 :         struct rb_node *parent = NULL;
     184       30776 :         struct btrfs_block_group *cache;
     185       30776 :         bool leftmost = true;
     186             : 
     187       30776 :         ASSERT(block_group->length != 0);
     188             : 
     189       30776 :         write_lock(&info->block_group_cache_lock);
     190       30776 :         p = &info->block_group_cache_tree.rb_root.rb_node;
     191             : 
     192      156942 :         while (*p) {
     193      126166 :                 parent = *p;
     194      126166 :                 cache = rb_entry(parent, struct btrfs_block_group, cache_node);
     195      126166 :                 if (block_group->start < cache->start) {
     196           0 :                         p = &(*p)->rb_left;
     197      126166 :                 } else if (block_group->start > cache->start) {
     198      126166 :                         p = &(*p)->rb_right;
     199      126166 :                         leftmost = false;
     200             :                 } else {
     201           0 :                         write_unlock(&info->block_group_cache_lock);
     202           0 :                         return -EEXIST;
     203             :                 }
     204             :         }
     205             : 
     206       30776 :         rb_link_node(&block_group->cache_node, parent, p);
     207       30776 :         rb_insert_color_cached(&block_group->cache_node,
     208             :                                &info->block_group_cache_tree, leftmost);
     209             : 
     210       30776 :         write_unlock(&info->block_group_cache_lock);
     211             : 
     212       30776 :         return 0;
     213             : }
     214             : 
     215             : /*
     216             :  * This will return the block group at or after bytenr if contains is 0, else
     217             :  * it will return the block group that contains the bytenr
     218             :  */
     219    72899660 : static struct btrfs_block_group *block_group_cache_tree_search(
     220             :                 struct btrfs_fs_info *info, u64 bytenr, int contains)
     221             : {
     222    72899660 :         struct btrfs_block_group *cache, *ret = NULL;
     223    72899660 :         struct rb_node *n;
     224    72899660 :         u64 end, start;
     225             : 
     226    72899660 :         read_lock(&info->block_group_cache_lock);
     227    72899762 :         n = info->block_group_cache_tree.rb_root.rb_node;
     228             : 
     229   228088736 :         while (n) {
     230   228084857 :                 cache = rb_entry(n, struct btrfs_block_group, cache_node);
     231   228084857 :                 end = cache->start + cache->length - 1;
     232   228084857 :                 start = cache->start;
     233             : 
     234   228084857 :                 if (bytenr < start) {
     235    42137360 :                         if (!contains && (!ret || start < ret->start))
     236        8978 :                                 ret = cache;
     237    42137360 :                         n = n->rb_left;
     238   185947497 :                 } else if (bytenr > start) {
     239   181105749 :                         if (contains && bytenr <= end) {
     240             :                                 ret = cache;
     241             :                                 break;
     242             :                         }
     243   113051614 :                         n = n->rb_right;
     244             :                 } else {
     245             :                         ret = cache;
     246             :                         break;
     247             :                 }
     248             :         }
     249    72899762 :         if (ret)
     250    72899161 :                 btrfs_get_block_group(ret);
     251    72899607 :         read_unlock(&info->block_group_cache_lock);
     252             : 
     253    72898823 :         return ret;
     254             : }
     255             : 
     256             : /*
     257             :  * Return the block group that starts at or after bytenr
     258             :  */
     259         843 : struct btrfs_block_group *btrfs_lookup_first_block_group(
     260             :                 struct btrfs_fs_info *info, u64 bytenr)
     261             : {
     262         843 :         return block_group_cache_tree_search(info, bytenr, 0);
     263             : }
     264             : 
     265             : /*
     266             :  * Return the block group that contains the given bytenr
     267             :  */
     268    40775792 : struct btrfs_block_group *btrfs_lookup_block_group(
     269             :                 struct btrfs_fs_info *info, u64 bytenr)
     270             : {
     271    40775792 :         return block_group_cache_tree_search(info, bytenr, 1);
     272             : }
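
A short usage sketch of the lookup API (the convention the rest of this file relies on, shown as a fragment with fs_info and bytenr assumed in scope): a successful lookup returns a referenced block group, and the caller must drop that reference when done:

        struct btrfs_block_group *bg;

        bg = btrfs_lookup_block_group(fs_info, bytenr);
        if (!bg)
                return;         /* no block group contains bytenr */

        /* ... use bg->start, bg->length, bg->flags ... */

        btrfs_put_block_group(bg);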
     273             : 
     274       34472 : struct btrfs_block_group *btrfs_next_block_group(
     275             :                 struct btrfs_block_group *cache)
     276             : {
     277       34472 :         struct btrfs_fs_info *fs_info = cache->fs_info;
     278       34472 :         struct rb_node *node;
     279             : 
     280       34472 :         read_lock(&fs_info->block_group_cache_lock);
     281             : 
     282             :         /* If our block group was removed, we need a full search. */
     283       34472 :         if (RB_EMPTY_NODE(&cache->cache_node)) {
     284           0 :                 const u64 next_bytenr = cache->start + cache->length;
     285             : 
     286           0 :                 read_unlock(&fs_info->block_group_cache_lock);
     287           0 :                 btrfs_put_block_group(cache);
     288           0 :                 return btrfs_lookup_first_block_group(fs_info, next_bytenr);
     289             :         }
     290       34472 :         node = rb_next(&cache->cache_node);
     291       34472 :         btrfs_put_block_group(cache);
     292       34472 :         if (node) {
     293       30614 :                 cache = rb_entry(node, struct btrfs_block_group, cache_node);
     294       30614 :                 btrfs_get_block_group(cache);
     295             :         } else
     296             :                 cache = NULL;
     297       34472 :         read_unlock(&fs_info->block_group_cache_lock);
     298       34472 :         return cache;
     299             : }
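
Combined with btrfs_lookup_first_block_group(), this gives the usual iteration pattern; a hedged sketch follows (process_one_block_group() is a hypothetical placeholder):

        struct btrfs_block_group *bg;

        for (bg = btrfs_lookup_first_block_group(fs_info, 0); bg;
             bg = btrfs_next_block_group(bg)) {
                /*
                 * Each iteration holds one reference on bg;
                 * btrfs_next_block_group() drops it before returning the next
                 * group (or NULL at the end), so no explicit put is needed
                 * unless the loop breaks early.
                 */
                process_one_block_group(bg);
        }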
     300             : 
     301             : /*
     302             :  * Check if we can do a NOCOW write for a given extent.
     303             :  *
     304             :  * @fs_info:       The filesystem information object.
     305             :  * @bytenr:        Logical start address of the extent.
     306             :  *
      307             :  * Check if we can do a NOCOW write for the given extent, and increment the
     308             :  * number of NOCOW writers in the block group that contains the extent, as long
     309             :  * as the block group exists and it's currently not in read-only mode.
     310             :  *
      311             :  * Returns: A non-NULL block group pointer if we can do a NOCOW write; the caller
     312             :  *          is responsible for calling btrfs_dec_nocow_writers() later.
     313             :  *
     314             :  *          Or NULL if we can not do a NOCOW write
     315             :  */
     316      280868 : struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
     317             :                                                   u64 bytenr)
     318             : {
     319      280868 :         struct btrfs_block_group *bg;
     320      280868 :         bool can_nocow = true;
     321             : 
     322      280868 :         bg = btrfs_lookup_block_group(fs_info, bytenr);
     323      280867 :         if (!bg)
     324             :                 return NULL;
     325             : 
     326      280867 :         spin_lock(&bg->lock);
     327      280870 :         if (bg->ro)
     328             :                 can_nocow = false;
     329             :         else
     330      280870 :                 atomic_inc(&bg->nocow_writers);
     331      280869 :         spin_unlock(&bg->lock);
     332             : 
     333      280870 :         if (!can_nocow) {
     334           0 :                 btrfs_put_block_group(bg);
     335           0 :                 return NULL;
     336             :         }
     337             : 
     338             :         /* No put on block group, done by btrfs_dec_nocow_writers(). */
     339             :         return bg;
     340             : }
     341             : 
     342             : /*
     343             :  * Decrement the number of NOCOW writers in a block group.
     344             :  *
     345             :  * This is meant to be called after a previous call to btrfs_inc_nocow_writers(),
     346             :  * and on the block group returned by that call. Typically this is called after
     347             :  * creating an ordered extent for a NOCOW write, to prevent races with scrub and
     348             :  * relocation.
     349             :  *
      350             :  * After this call, the caller should not use the block group anymore. If it wants
     351             :  * to use it, then it should get a reference on it before calling this function.
     352             :  */
     353      280869 : void btrfs_dec_nocow_writers(struct btrfs_block_group *bg)
     354             : {
     355      280869 :         if (atomic_dec_and_test(&bg->nocow_writers))
     356      280801 :                 wake_up_var(&bg->nocow_writers);
     357             : 
     358             :         /* For the lookup done by a previous call to btrfs_inc_nocow_writers(). */
     359      280868 :         btrfs_put_block_group(bg);
     360      280869 : }
     361             : 
     362         523 : void btrfs_wait_nocow_writers(struct btrfs_block_group *bg)
     363             : {
     364         523 :         wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
     365         523 : }
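
Putting the three NOCOW helpers together, a hedged sketch of the flow the comments above describe (do_cow_write() is a hypothetical fallback, and the ordered extent creation is only named, not shown):

        struct btrfs_block_group *bg;

        bg = btrfs_inc_nocow_writers(fs_info, extent_start);
        if (!bg)
                return do_cow_write();  /* block group gone or read-only */

        /* Safe to NOCOW: create the ordered extent for this write here. */

        btrfs_dec_nocow_writers(bg);    /* also drops the lookup reference */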
     366             : 
     367    13262117 : void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
     368             :                                         const u64 start)
     369             : {
     370    13262117 :         struct btrfs_block_group *bg;
     371             : 
     372    13262117 :         bg = btrfs_lookup_block_group(fs_info, start);
     373    13261747 :         ASSERT(bg);
     374    13261747 :         if (atomic_dec_and_test(&bg->reservations))
     375    13233930 :                 wake_up_var(&bg->reservations);
     376    13262253 :         btrfs_put_block_group(bg);
     377    13262363 : }
     378             : 
     379         523 : void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg)
     380             : {
     381         523 :         struct btrfs_space_info *space_info = bg->space_info;
     382             : 
     383         523 :         ASSERT(bg->ro);
     384             : 
     385         523 :         if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
     386             :                 return;
     387             : 
     388             :         /*
     389             :          * Our block group is read only but before we set it to read only,
     390             :          * some task might have had allocated an extent from it already, but it
     391             :          * has not yet created a respective ordered extent (and added it to a
     392             :          * root's list of ordered extents).
     393             :          * Therefore wait for any task currently allocating extents, since the
     394             :          * block group's reservations counter is incremented while a read lock
     395             :          * on the groups' semaphore is held and decremented after releasing
     396             :          * the read access on that semaphore and creating the ordered extent.
     397             :          */
     398         227 :         down_write(&space_info->groups_sem);
     399         227 :         up_write(&space_info->groups_sem);
     400             : 
     401         227 :         wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
     402             : }
     403             : 
     404       33554 : struct btrfs_caching_control *btrfs_get_caching_control(
     405             :                 struct btrfs_block_group *cache)
     406             : {
     407       33554 :         struct btrfs_caching_control *ctl;
     408             : 
     409       33554 :         spin_lock(&cache->lock);
     410       33554 :         if (!cache->caching_ctl) {
     411       30743 :                 spin_unlock(&cache->lock);
     412       30743 :                 return NULL;
     413             :         }
     414             : 
     415        2811 :         ctl = cache->caching_ctl;
     416        2811 :         refcount_inc(&ctl->count);
     417        2811 :         spin_unlock(&cache->lock);
     418        2811 :         return ctl;
     419             : }
     420             : 
     421      570078 : void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
     422             : {
     423      570078 :         if (refcount_dec_and_test(&ctl->count))
     424        5258 :                 kfree(ctl);
     425      570077 : }
     426             : 
     427             : /*
      428             :  * When we wait for progress in the block group caching, it's because our
     429             :  * allocation attempt failed at least once.  So, we must sleep and let some
     430             :  * progress happen before we try again.
     431             :  *
     432             :  * This function will sleep at least once waiting for new free space to show
     433             :  * up, and then it will check the block group free space numbers for our min
     434             :  * num_bytes.  Another option is to have it go ahead and look in the rbtree for
     435             :  * a free extent of a given size, but this is a good start.
     436             :  *
     437             :  * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
     438             :  * any of the information in this block group.
     439             :  */
     440        2811 : void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
     441             :                                            u64 num_bytes)
     442             : {
     443        2811 :         struct btrfs_caching_control *caching_ctl;
     444             : 
     445        2811 :         caching_ctl = btrfs_get_caching_control(cache);
     446        2811 :         if (!caching_ctl)
     447             :                 return;
     448             : 
     449        5547 :         wait_event(caching_ctl->wait, btrfs_block_group_done(cache) ||
     450             :                    (cache->free_space_ctl->free_space >= num_bytes));
     451             : 
     452        2811 :         btrfs_put_caching_control(caching_ctl);
     453             : }
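
A sketch of the caller pattern this implies (hedged; the real allocator in extent-tree.c is more involved, and next_block_group is a hypothetical label):

        if (!btrfs_block_group_done(cache)) {
                /* An allocation attempt failed: wait for more cached free space. */
                btrfs_wait_block_group_cache_progress(cache, num_bytes);
                if (cache->cached == BTRFS_CACHE_ERROR)
                        goto next_block_group;
                /* ... retry the allocation from this block group ... */
        }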
     454             : 
     455         650 : static int btrfs_caching_ctl_wait_done(struct btrfs_block_group *cache,
     456             :                                        struct btrfs_caching_control *caching_ctl)
     457             : {
     458        1458 :         wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
     459         650 :         return cache->cached == BTRFS_CACHE_ERROR ? -EIO : 0;
     460             : }
     461             : 
     462       30210 : static int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
     463             : {
     464       30210 :         struct btrfs_caching_control *caching_ctl;
     465       30210 :         int ret;
     466             : 
     467       30210 :         caching_ctl = btrfs_get_caching_control(cache);
     468       30210 :         if (!caching_ctl)
     469       30210 :                 return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
     470           0 :         ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
     471           0 :         btrfs_put_caching_control(caching_ctl);
     472           0 :         return ret;
     473             : }
     474             : 
     475             : #ifdef CONFIG_BTRFS_DEBUG
     476             : static void fragment_free_space(struct btrfs_block_group *block_group)
     477             : {
     478             :         struct btrfs_fs_info *fs_info = block_group->fs_info;
     479             :         u64 start = block_group->start;
     480             :         u64 len = block_group->length;
     481             :         u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
     482             :                 fs_info->nodesize : fs_info->sectorsize;
     483             :         u64 step = chunk << 1;
     484             : 
     485             :         while (len > chunk) {
     486             :                 btrfs_remove_free_space(block_group, start, chunk);
     487             :                 start += step;
     488             :                 if (len < step)
     489             :                         len = 0;
     490             :                 else
     491             :                         len -= step;
     492             :         }
     493             : }
     494             : #endif
     495             : 
     496             : /*
     497             :  * This is only called by btrfs_cache_block_group, since we could have freed
      498             :  * extents. We need to check the pinned_extents for any extents that can't be
     499             :  * used yet since their free space will be released as soon as the transaction
     500             :  * commits.
     501             :  */
     502      115767 : int add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end,
     503             :                        u64 *total_added_ret)
     504             : {
     505      115767 :         struct btrfs_fs_info *info = block_group->fs_info;
     506      115767 :         u64 extent_start, extent_end, size;
     507      115767 :         int ret;
     508             : 
     509      115767 :         if (total_added_ret)
     510       93771 :                 *total_added_ret = 0;
     511             : 
     512      118974 :         while (start < end) {
     513      118620 :                 ret = find_first_extent_bit(&info->excluded_extents, start,
     514             :                                             &extent_start, &extent_end,
     515             :                                             EXTENT_DIRTY | EXTENT_UPTODATE,
     516             :                                             NULL);
     517      118620 :                 if (ret)
     518             :                         break;
     519             : 
     520       33029 :                 if (extent_start <= start) {
     521          12 :                         start = extent_end + 1;
     522       33017 :                 } else if (extent_start > start && extent_start < end) {
     523        3195 :                         size = extent_start - start;
     524        3195 :                         ret = btrfs_add_free_space_async_trimmed(block_group,
     525             :                                                                  start, size);
     526        3195 :                         if (ret)
     527           0 :                                 return ret;
     528        3195 :                         if (total_added_ret)
     529        1917 :                                 *total_added_ret += size;
     530        3195 :                         start = extent_end + 1;
     531             :                 } else {
     532             :                         break;
     533             :                 }
     534             :         }
     535             : 
     536      115767 :         if (start < end) {
     537      115413 :                 size = end - start;
     538      115413 :                 ret = btrfs_add_free_space_async_trimmed(block_group, start,
     539             :                                                          size);
     540      115413 :                 if (ret)
     541             :                         return ret;
     542      115413 :                 if (total_added_ret)
     543       93417 :                         *total_added_ret += size;
     544             :         }
     545             : 
     546             :         return 0;
     547             : }
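
A worked example (a sketch with made-up offsets): caching the first 100M of a block group while a 4M extent starting 40M in is still present in info->excluded_extents adds two free space extents around it:

        u64 added;
        int ret;

        ret = add_new_free_space(block_group, block_group->start,
                                 block_group->start + 100 * SZ_1M, &added);
        /*
         * With [start + 40M, start + 44M) excluded, this adds
         * [start, start + 40M) and [start + 44M, start + 100M),
         * so added == 96M; the excluded 4M is skipped.
         */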
     548             : 
     549             : /*
      550             :  * Get an arbitrary extent item, index/max_index of the way through the block group
      551             :  *
      552             :  * @block_group:  the block group to sample from
      553             :  * @index:        the integral step through the block group to grab from
      554             :  * @max_index:    the granularity of the sampling
      555             :  * @found_key:    return value parameter for the item we find
     556             :  *
     557             :  * Pre-conditions on indices:
     558             :  * 0 <= index <= max_index
     559             :  * 0 < max_index
     560             :  *
     561             :  * Returns: 0 on success, 1 if the search didn't yield a useful item, negative
     562             :  * error code on error.
     563             :  */
     564        8060 : static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl,
     565             :                                           struct btrfs_block_group *block_group,
     566             :                                           int index, int max_index,
     567             :                                           struct btrfs_key *found_key)
     568             : {
     569        8060 :         struct btrfs_fs_info *fs_info = block_group->fs_info;
     570        8060 :         struct btrfs_root *extent_root;
     571        8060 :         u64 search_offset;
     572        8060 :         u64 search_end = block_group->start + block_group->length;
     573        8060 :         struct btrfs_path *path;
     574        8060 :         struct btrfs_key search_key;
     575        8060 :         int ret = 0;
     576             : 
     577        8060 :         ASSERT(index >= 0);
     578        8060 :         ASSERT(index <= max_index);
     579        8060 :         ASSERT(max_index > 0);
     580        8060 :         lockdep_assert_held(&caching_ctl->mutex);
     581        8060 :         lockdep_assert_held_read(&fs_info->commit_root_sem);
     582             : 
     583        8060 :         path = btrfs_alloc_path();
     584        8060 :         if (!path)
     585             :                 return -ENOMEM;
     586             : 
     587        8060 :         extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start,
     588             :                                                        BTRFS_SUPER_INFO_OFFSET));
     589             : 
     590        8060 :         path->skip_locking = 1;
     591        8060 :         path->search_commit_root = 1;
     592        8060 :         path->reada = READA_FORWARD;
     593             : 
     594        8060 :         search_offset = index * div_u64(block_group->length, max_index);
     595        8060 :         search_key.objectid = block_group->start + search_offset;
     596        8060 :         search_key.type = BTRFS_EXTENT_ITEM_KEY;
     597        8060 :         search_key.offset = 0;
     598             : 
     599        8384 :         btrfs_for_each_slot(extent_root, &search_key, found_key, path, ret) {
     600             :                 /* Success; sampled an extent item in the block group */
     601        7557 :                 if (found_key->type == BTRFS_EXTENT_ITEM_KEY &&
     602        5571 :                     found_key->objectid >= block_group->start &&
     603        5571 :                     found_key->objectid + found_key->offset <= search_end)
     604             :                         break;
     605             : 
     606             :                 /* We can't possibly find a valid extent item anymore */
     607        2405 :                 if (found_key->objectid >= search_end) {
     608             :                         ret = 1;
     609             :                         break;
     610             :                 }
     611             :         }
     612             : 
     613        8060 :         lockdep_assert_held(&caching_ctl->mutex);
     614        8060 :         lockdep_assert_held_read(&fs_info->commit_root_sem);
     615        8060 :         btrfs_free_path(path);
     616        8060 :         return ret;
     617             : }
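
For a concrete feel of the sampling step (made-up numbers, reusing the locals above): with a 1G block group and max_index == 5, index == 2 starts the search two fifths of the way in, and the loop above returns the first EXTENT_ITEM at or after that offset that still lies inside the block group:

        /* Sketch: block_group->length == SZ_1G, max_index == 5, index == 2. */
        search_offset = 2 * div_u64(SZ_1G, 5);  /* ~409.6M into the group */
        search_key.objectid = block_group->start + search_offset;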
     618             : 
     619             : /*
     620             :  * Best effort attempt to compute a block group's size class while caching it.
     621             :  *
     622             :  * @block_group: the block group we are caching
     623             :  *
     624             :  * We cannot infer the size class while adding free space extents, because that
     625             :  * logic doesn't care about contiguous file extents (it doesn't differentiate
     626             :  * between a 100M extent and 100 contiguous 1M extents). So we need to read the
     627             :  * file extent items. Reading all of them is quite wasteful, because usually
     628             :  * only a handful are enough to give a good answer. Therefore, we just grab 5 of
     629             :  * them at even steps through the block group and pick the smallest size class
     630             :  * we see. Since size class is best effort, and not guaranteed in general,
     631             :  * inaccuracy is acceptable.
     632             :  *
     633             :  * To be more explicit about why this algorithm makes sense:
     634             :  *
     635             :  * If we are caching in a block group from disk, then there are three major cases
     636             :  * to consider:
     637             :  * 1. the block group is well behaved and all extents in it are the same size
     638             :  *    class.
     639             :  * 2. the block group is mostly one size class with rare exceptions for last
     640             :  *    ditch allocations
     641             :  * 3. the block group was populated before size classes and can have a totally
     642             :  *    arbitrary mix of size classes.
     643             :  *
     644             :  * In case 1, looking at any extent in the block group will yield the correct
     645             :  * result. For the mixed cases, taking the minimum size class seems like a good
      646             :  * approximation, since gaps from frees will still be usable by that size class. For
     647             :  * 2., a small handful of file extents is likely to yield the right answer. For
     648             :  * 3, we can either read every file extent, or admit that this is best effort
     649             :  * anyway and try to stay fast.
     650             :  *
     651             :  * Returns: 0 on success, negative error code on error.
     652             :  */
     653        5258 : static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl,
     654             :                                        struct btrfs_block_group *block_group)
     655             : {
     656        5258 :         struct btrfs_fs_info *fs_info = block_group->fs_info;
     657        5258 :         struct btrfs_key key;
     658        5258 :         int i;
     659        5258 :         u64 min_size = block_group->length;
     660        5258 :         enum btrfs_block_group_size_class size_class = BTRFS_BG_SZ_NONE;
     661        5258 :         int ret;
     662             : 
     663        5258 :         if (!btrfs_block_group_should_use_size_class(block_group))
     664             :                 return 0;
     665             : 
     666             :         lockdep_assert_held(&caching_ctl->mutex);
     667             :         lockdep_assert_held_read(&fs_info->commit_root_sem);
     668        9672 :         for (i = 0; i < 5; ++i) {
     669        8060 :                 ret = sample_block_group_extent_item(caching_ctl, block_group, i, 5, &key);
     670        8060 :                 if (ret < 0)
     671           0 :                         goto out;
     672        8060 :                 if (ret > 0)
     673        2908 :                         continue;
     674        5152 :                 min_size = min_t(u64, min_size, key.offset);
     675        5152 :                 size_class = btrfs_calc_block_group_size_class(min_size);
     676             :         }
     677        1612 :         if (size_class != BTRFS_BG_SZ_NONE) {
     678        1612 :                 spin_lock(&block_group->lock);
     679        1612 :                 block_group->size_class = size_class;
     680        1612 :                 spin_unlock(&block_group->lock);
     681             :         }
     682           0 : out:
     683             :         return ret;
     684             : }
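
A hedged worked example of the sampling result: suppose the five samples hit extents of 16M, 1M, 64K and 32M, with one sample returning nothing; min_size ends up at 64K and the smallest matching class is stored:

        enum btrfs_block_group_size_class sc;

        sc = btrfs_calc_block_group_size_class(SZ_64K);
        /*
         * sc is the smallest size class (the exact thresholds live in
         * btrfs_calc_block_group_size_class()); it is then written to
         * block_group->size_class under block_group->lock.
         */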
     685             : 
     686          29 : static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
     687             : {
     688          29 :         struct btrfs_block_group *block_group = caching_ctl->block_group;
     689          29 :         struct btrfs_fs_info *fs_info = block_group->fs_info;
     690          29 :         struct btrfs_root *extent_root;
     691          29 :         struct btrfs_path *path;
     692          29 :         struct extent_buffer *leaf;
     693          29 :         struct btrfs_key key;
     694          29 :         u64 total_found = 0;
     695          29 :         u64 last = 0;
     696          29 :         u32 nritems;
     697          29 :         int ret;
     698          29 :         bool wakeup = true;
     699             : 
     700          29 :         path = btrfs_alloc_path();
     701          29 :         if (!path)
     702             :                 return -ENOMEM;
     703             : 
     704          29 :         last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET);
     705          29 :         extent_root = btrfs_extent_root(fs_info, last);
     706             : 
     707             : #ifdef CONFIG_BTRFS_DEBUG
     708             :         /*
     709             :          * If we're fragmenting we don't want to make anybody think we can
     710             :          * allocate from this block group until we've had a chance to fragment
     711             :          * the free space.
     712             :          */
     713             :         if (btrfs_should_fragment_free_space(block_group))
     714             :                 wakeup = false;
     715             : #endif
     716             :         /*
     717             :          * We don't want to deadlock with somebody trying to allocate a new
     718             :          * extent for the extent root while also trying to search the extent
     719             :          * root to add free space.  So we skip locking and search the commit
      720             :  * root, since it's read-only.
     721             :          */
     722          29 :         path->skip_locking = 1;
     723          29 :         path->search_commit_root = 1;
     724          29 :         path->reada = READA_FORWARD;
     725             : 
     726          29 :         key.objectid = last;
     727          29 :         key.offset = 0;
     728          29 :         key.type = BTRFS_EXTENT_ITEM_KEY;
     729             : 
     730             : next:
     731          50 :         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
     732          50 :         if (ret < 0)
     733           0 :                 goto out;
     734             : 
     735          50 :         leaf = path->nodes[0];
     736          50 :         nritems = btrfs_header_nritems(leaf);
     737             : 
     738         453 :         while (1) {
     739         453 :                 if (btrfs_fs_closing(fs_info) > 1) {
     740             :                         last = (u64)-1;
     741             :                         break;
     742             :                 }
     743             : 
     744         453 :                 if (path->slots[0] < nritems) {
     745         442 :                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
     746             :                 } else {
     747          11 :                         ret = btrfs_find_next_key(extent_root, path, &key, 0, 0);
     748          11 :                         if (ret)
     749             :                                 break;
     750             : 
     751           0 :                         if (need_resched() ||
     752             :                             rwsem_is_contended(&fs_info->commit_root_sem)) {
     753           0 :                                 btrfs_release_path(path);
     754           0 :                                 up_read(&fs_info->commit_root_sem);
     755           0 :                                 mutex_unlock(&caching_ctl->mutex);
     756           0 :                                 cond_resched();
     757           0 :                                 mutex_lock(&caching_ctl->mutex);
     758           0 :                                 down_read(&fs_info->commit_root_sem);
     759           0 :                                 goto next;
     760             :                         }
     761             : 
     762           0 :                         ret = btrfs_next_leaf(extent_root, path);
     763           0 :                         if (ret < 0)
     764           0 :                                 goto out;
     765           0 :                         if (ret)
     766             :                                 break;
     767           0 :                         leaf = path->nodes[0];
     768           0 :                         nritems = btrfs_header_nritems(leaf);
     769           0 :                         continue;
     770             :                 }
     771             : 
     772         442 :                 if (key.objectid < last) {
     773          21 :                         key.objectid = last;
     774          21 :                         key.offset = 0;
     775          21 :                         key.type = BTRFS_EXTENT_ITEM_KEY;
     776          21 :                         btrfs_release_path(path);
     777          21 :                         goto next;
     778             :                 }
     779             : 
     780         421 :                 if (key.objectid < block_group->start) {
     781           0 :                         path->slots[0]++;
     782           0 :                         continue;
     783             :                 }
     784             : 
     785         421 :                 if (key.objectid >= block_group->start + block_group->length)
     786             :                         break;
     787             : 
     788         403 :                 if (key.type == BTRFS_EXTENT_ITEM_KEY ||
     789             :                     key.type == BTRFS_METADATA_ITEM_KEY) {
     790         394 :                         u64 space_added;
     791             : 
     792         394 :                         ret = add_new_free_space(block_group, last, key.objectid,
     793             :                                                  &space_added);
     794         394 :                         if (ret)
     795           0 :                                 goto out;
     796         394 :                         total_found += space_added;
     797         394 :                         if (key.type == BTRFS_METADATA_ITEM_KEY)
     798         376 :                                 last = key.objectid +
     799         376 :                                         fs_info->nodesize;
     800             :                         else
     801          18 :                                 last = key.objectid + key.offset;
     802             : 
     803         394 :                         if (total_found > CACHING_CTL_WAKE_UP) {
     804           2 :                                 total_found = 0;
     805           2 :                                 if (wakeup)
     806           2 :                                         wake_up(&caching_ctl->wait);
     807             :                         }
     808             :                 }
     809         403 :                 path->slots[0]++;
     810             :         }
     811             : 
     812          29 :         ret = add_new_free_space(block_group, last,
     813          29 :                                  block_group->start + block_group->length,
     814             :                                  NULL);
     815          29 : out:
     816          29 :         btrfs_free_path(path);
     817          29 :         return ret;
     818             : }
     819             : 
     820        5258 : static noinline void caching_thread(struct btrfs_work *work)
     821             : {
     822        5258 :         struct btrfs_block_group *block_group;
     823        5258 :         struct btrfs_fs_info *fs_info;
     824        5258 :         struct btrfs_caching_control *caching_ctl;
     825        5258 :         int ret;
     826             : 
     827        5258 :         caching_ctl = container_of(work, struct btrfs_caching_control, work);
     828        5258 :         block_group = caching_ctl->block_group;
     829        5258 :         fs_info = block_group->fs_info;
     830             : 
     831        5258 :         mutex_lock(&caching_ctl->mutex);
     832        5258 :         down_read(&fs_info->commit_root_sem);
     833             : 
     834        5258 :         load_block_group_size_class(caching_ctl, block_group);
     835        5258 :         if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
     836          23 :                 ret = load_free_space_cache(block_group);
     837          23 :                 if (ret == 1) {
     838           1 :                         ret = 0;
     839           1 :                         goto done;
     840             :                 }
     841             : 
     842             :                 /*
     843             :                  * We failed to load the space cache, set ourselves to
     844             :                  * CACHE_STARTED and carry on.
     845             :                  */
     846          22 :                 spin_lock(&block_group->lock);
     847          22 :                 block_group->cached = BTRFS_CACHE_STARTED;
     848          22 :                 spin_unlock(&block_group->lock);
     849          22 :                 wake_up(&caching_ctl->wait);
     850             :         }
     851             : 
     852             :         /*
     853             :          * If we are in the transaction that populated the free space tree we
     854             :          * can't actually cache from the free space tree as our commit root and
     855             :          * real root are the same, so we could change the contents of the blocks
     856             :          * while caching.  Instead do the slow caching in this case, and after
     857             :          * the transaction has committed we will be safe.
     858             :          */
     859       10494 :         if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
     860        5237 :             !(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags)))
     861        5228 :                 ret = load_free_space_tree(caching_ctl);
     862             :         else
     863          29 :                 ret = load_extent_tree_free(caching_ctl);
     864        5258 : done:
     865        5258 :         spin_lock(&block_group->lock);
     866        5258 :         block_group->caching_ctl = NULL;
     867        5258 :         block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
     868        5258 :         spin_unlock(&block_group->lock);
     869             : 
     870             : #ifdef CONFIG_BTRFS_DEBUG
     871             :         if (btrfs_should_fragment_free_space(block_group)) {
     872             :                 u64 bytes_used;
     873             : 
     874             :                 spin_lock(&block_group->space_info->lock);
     875             :                 spin_lock(&block_group->lock);
     876             :                 bytes_used = block_group->length - block_group->used;
     877             :                 block_group->space_info->bytes_used += bytes_used >> 1;
     878             :                 spin_unlock(&block_group->lock);
     879             :                 spin_unlock(&block_group->space_info->lock);
     880             :                 fragment_free_space(block_group);
     881             :         }
     882             : #endif
     883             : 
     884        5258 :         up_read(&fs_info->commit_root_sem);
     885        5258 :         btrfs_free_excluded_extents(block_group);
     886        5258 :         mutex_unlock(&caching_ctl->mutex);
     887             : 
     888        5258 :         wake_up(&caching_ctl->wait);
     889             : 
     890        5258 :         btrfs_put_caching_control(caching_ctl);
     891        5258 :         btrfs_put_block_group(block_group);
     892        5258 : }
     893             : 
     894      645556 : int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
     895             : {
     896      645556 :         struct btrfs_fs_info *fs_info = cache->fs_info;
     897      645556 :         struct btrfs_caching_control *caching_ctl = NULL;
     898      645556 :         int ret = 0;
     899             : 
     900             :         /* Allocator for zoned filesystems does not use the cache at all */
     901      645556 :         if (btrfs_is_zoned(fs_info))
     902             :                 return 0;
     903             : 
     904      645556 :         caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
     905      645556 :         if (!caching_ctl)
     906             :                 return -ENOMEM;
     907             : 
     908      645556 :         INIT_LIST_HEAD(&caching_ctl->list);
     909      645556 :         mutex_init(&caching_ctl->mutex);
     910      645556 :         init_waitqueue_head(&caching_ctl->wait);
     911      645556 :         caching_ctl->block_group = cache;
     912      645556 :         refcount_set(&caching_ctl->count, 2);
     913      645556 :         btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
     914             : 
     915      645556 :         spin_lock(&cache->lock);
     916      645556 :         if (cache->cached != BTRFS_CACHE_NO) {
     917      640298 :                 kfree(caching_ctl);
     918             : 
     919      640298 :                 caching_ctl = cache->caching_ctl;
     920      640298 :                 if (caching_ctl)
     921      551471 :                         refcount_inc(&caching_ctl->count);
     922      640298 :                 spin_unlock(&cache->lock);
     923      640298 :                 goto out;
     924             :         }
     925        5258 :         WARN_ON(cache->caching_ctl);
     926        5258 :         cache->caching_ctl = caching_ctl;
     927        5258 :         cache->cached = BTRFS_CACHE_STARTED;
     928        5258 :         spin_unlock(&cache->lock);
     929             : 
     930        5258 :         write_lock(&fs_info->block_group_cache_lock);
     931        5258 :         refcount_inc(&caching_ctl->count);
     932        5258 :         list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
     933        5258 :         write_unlock(&fs_info->block_group_cache_lock);
     934             : 
     935        5258 :         btrfs_get_block_group(cache);
     936             : 
     937        5258 :         btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
     938      645556 : out:
     939      645556 :         if (wait && caching_ctl)
     940         650 :                 ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
     941      645556 :         if (caching_ctl)
     942      556729 :                 btrfs_put_caching_control(caching_ctl);
     943             : 
     944             :         return ret;
     945             : }
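
A usage sketch (hedged): most callers start caching without waiting and only pass wait == true when they actually need the cached free space to be complete:

        int ret;

        /* Kick off caching; returns 0 immediately if already started or done. */
        ret = btrfs_cache_block_group(cache, false);
        if (ret)
                return ret;

        /* ... later, when complete free space information is required ... */
        ret = btrfs_cache_block_group(cache, true);
        if (ret == -EIO)
                return ret;     /* the caching thread hit an error */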
     946             : 
     947           0 : static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
     948             : {
     949           0 :         u64 extra_flags = chunk_to_extended(flags) &
     950             :                                 BTRFS_EXTENDED_PROFILE_MASK;
     951             : 
     952           0 :         write_seqlock(&fs_info->profiles_lock);
     953           0 :         if (flags & BTRFS_BLOCK_GROUP_DATA)
     954           0 :                 fs_info->avail_data_alloc_bits &= ~extra_flags;
     955           0 :         if (flags & BTRFS_BLOCK_GROUP_METADATA)
     956           0 :                 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
     957           0 :         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
     958           0 :                 fs_info->avail_system_alloc_bits &= ~extra_flags;
     959           0 :         write_sequnlock(&fs_info->profiles_lock);
     960           0 : }
     961             : 
     962             : /*
     963             :  * Clear incompat bits for the following feature(s):
     964             :  *
     965             :  * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
     966             :  *            in the whole filesystem
     967             :  *
     968             :  * - RAID1C34 - same as above for RAID1C3 and RAID1C4 block groups
     969             :  */
     970         533 : static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
     971             : {
     972         533 :         bool found_raid56 = false;
     973         533 :         bool found_raid1c34 = false;
     974             : 
     975         533 :         if ((flags & BTRFS_BLOCK_GROUP_RAID56_MASK) ||
     976         533 :             (flags & BTRFS_BLOCK_GROUP_RAID1C3) ||
     977             :             (flags & BTRFS_BLOCK_GROUP_RAID1C4)) {
     978           0 :                 struct list_head *head = &fs_info->space_info;
     979           0 :                 struct btrfs_space_info *sinfo;
     980             : 
     981           0 :                 list_for_each_entry_rcu(sinfo, head, list) {
     982           0 :                         down_read(&sinfo->groups_sem);
     983           0 :                         if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
     984           0 :                                 found_raid56 = true;
     985           0 :                         if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
     986           0 :                                 found_raid56 = true;
     987           0 :                         if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C3]))
     988           0 :                                 found_raid1c34 = true;
     989           0 :                         if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C4]))
     990           0 :                                 found_raid1c34 = true;
     991           0 :                         up_read(&sinfo->groups_sem);
     992             :                 }
     993           0 :                 if (!found_raid56)
     994           0 :                         btrfs_clear_fs_incompat(fs_info, RAID56);
     995           0 :                 if (!found_raid1c34)
     996           0 :                         btrfs_clear_fs_incompat(fs_info, RAID1C34);
     997             :         }
     998         533 : }
     999             : 
    1000         533 : static int remove_block_group_item(struct btrfs_trans_handle *trans,
    1001             :                                    struct btrfs_path *path,
    1002             :                                    struct btrfs_block_group *block_group)
    1003             : {
    1004         533 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    1005         533 :         struct btrfs_root *root;
    1006         533 :         struct btrfs_key key;
    1007         533 :         int ret;
    1008             : 
    1009         533 :         root = btrfs_block_group_root(fs_info);
    1010         533 :         key.objectid = block_group->start;
    1011         533 :         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
    1012         533 :         key.offset = block_group->length;
    1013             : 
    1014         533 :         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
    1015         533 :         if (ret > 0)
    1016             :                 ret = -ENOENT;
    1017         533 :         if (ret < 0)
    1018           0 :                 return ret;
    1019             : 
    1020         533 :         ret = btrfs_del_item(trans, root, path);
    1021         533 :         return ret;
    1022             : }
    1023             : 
    1024         533 : int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
    1025             :                              u64 group_start, struct extent_map *em)
    1026             : {
    1027         533 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    1028         533 :         struct btrfs_path *path;
    1029         533 :         struct btrfs_block_group *block_group;
    1030         533 :         struct btrfs_free_cluster *cluster;
    1031         533 :         struct inode *inode;
    1032         533 :         struct kobject *kobj = NULL;
    1033         533 :         int ret;
    1034         533 :         int index;
    1035         533 :         int factor;
    1036         533 :         struct btrfs_caching_control *caching_ctl = NULL;
    1037         533 :         bool remove_em;
    1038         533 :         bool remove_rsv = false;
    1039             : 
    1040         533 :         block_group = btrfs_lookup_block_group(fs_info, group_start);
    1041         533 :         BUG_ON(!block_group);
    1042         533 :         BUG_ON(!block_group->ro);
    1043             : 
    1044         533 :         trace_btrfs_remove_block_group(block_group);
    1045             :         /*
    1046             :          * Free the reserved super bytes from this block group before
    1047             :          * removing it.
    1048             :          */
    1049         533 :         btrfs_free_excluded_extents(block_group);
    1050         533 :         btrfs_free_ref_tree_range(fs_info, block_group->start,
    1051             :                                   block_group->length);
    1052             : 
    1053         533 :         index = btrfs_bg_flags_to_raid_index(block_group->flags);
    1054         533 :         factor = btrfs_bg_type_to_factor(block_group->flags);
    1055             : 
    1056             :         /* make sure this block group isn't part of an allocation cluster */
    1057         533 :         cluster = &fs_info->data_alloc_cluster;
    1058         533 :         spin_lock(&cluster->refill_lock);
    1059         533 :         btrfs_return_cluster_to_free_space(block_group, cluster);
    1060         533 :         spin_unlock(&cluster->refill_lock);
    1061             : 
    1062             :         /*
    1063             :          * make sure this block group isn't part of a metadata
    1064             :          * allocation cluster
    1065             :          */
    1066         533 :         cluster = &fs_info->meta_alloc_cluster;
    1067         533 :         spin_lock(&cluster->refill_lock);
    1068         533 :         btrfs_return_cluster_to_free_space(block_group, cluster);
    1069         533 :         spin_unlock(&cluster->refill_lock);
    1070             : 
    1071         533 :         btrfs_clear_treelog_bg(block_group);
    1072         533 :         btrfs_clear_data_reloc_bg(block_group);
    1073             : 
    1074         533 :         path = btrfs_alloc_path();
    1075         533 :         if (!path) {
    1076           0 :                 ret = -ENOMEM;
    1077           0 :                 goto out;
    1078             :         }
    1079             : 
    1080             :         /*
    1081             :          * get the inode first so any iput calls done for the io_list
    1082             :          * aren't the final iput (no unlinks allowed now)
    1083             :          */
    1084         533 :         inode = lookup_free_space_inode(block_group, path);
    1085             : 
    1086         533 :         mutex_lock(&trans->transaction->cache_write_mutex);
    1087             :         /*
    1088             :          * Make sure our free space cache IO is done before removing the
    1089             :          * free space inode
    1090             :          */
    1091         533 :         spin_lock(&trans->transaction->dirty_bgs_lock);
    1092         533 :         if (!list_empty(&block_group->io_list)) {
    1093           0 :                 list_del_init(&block_group->io_list);
    1094             : 
    1095           0 :                 WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
    1096             : 
    1097           0 :                 spin_unlock(&trans->transaction->dirty_bgs_lock);
    1098           0 :                 btrfs_wait_cache_io(trans, block_group, path);
    1099           0 :                 btrfs_put_block_group(block_group);
    1100           0 :                 spin_lock(&trans->transaction->dirty_bgs_lock);
    1101             :         }
    1102             : 
    1103         533 :         if (!list_empty(&block_group->dirty_list)) {
    1104           0 :                 list_del_init(&block_group->dirty_list);
    1105           0 :                 remove_rsv = true;
    1106           0 :                 btrfs_put_block_group(block_group);
    1107             :         }
    1108         533 :         spin_unlock(&trans->transaction->dirty_bgs_lock);
    1109         533 :         mutex_unlock(&trans->transaction->cache_write_mutex);
    1110             : 
    1111         533 :         ret = btrfs_remove_free_space_inode(trans, inode, block_group);
    1112         533 :         if (ret)
    1113           0 :                 goto out;
    1114             : 
    1115         533 :         write_lock(&fs_info->block_group_cache_lock);
    1116         533 :         rb_erase_cached(&block_group->cache_node,
    1117             :                         &fs_info->block_group_cache_tree);
    1118         533 :         RB_CLEAR_NODE(&block_group->cache_node);
    1119             : 
    1120             :         /* Once for the block groups rbtree */
    1121         533 :         btrfs_put_block_group(block_group);
    1122             : 
    1123         533 :         write_unlock(&fs_info->block_group_cache_lock);
    1124             : 
    1125         533 :         down_write(&block_group->space_info->groups_sem);
    1126             :         /*
    1127             :          * we must use list_del_init so people can check to see if they
    1128             :          * are still on the list after taking the semaphore
    1129             :          */
    1130         533 :         list_del_init(&block_group->list);
    1131         533 :         if (list_empty(&block_group->space_info->block_groups[index])) {
    1132           0 :                 kobj = block_group->space_info->block_group_kobjs[index];
    1133           0 :                 block_group->space_info->block_group_kobjs[index] = NULL;
    1134           0 :                 clear_avail_alloc_bits(fs_info, block_group->flags);
    1135             :         }
    1136         533 :         up_write(&block_group->space_info->groups_sem);
    1137         533 :         clear_incompat_bg_bits(fs_info, block_group->flags);
    1138         533 :         if (kobj) {
    1139           0 :                 kobject_del(kobj);
    1140           0 :                 kobject_put(kobj);
    1141             :         }
    1142             : 
    1143         533 :         if (block_group->cached == BTRFS_CACHE_STARTED)
    1144           0 :                 btrfs_wait_block_group_cache_done(block_group);
    1145             : 
    1146         533 :         write_lock(&fs_info->block_group_cache_lock);
    1147         533 :         caching_ctl = btrfs_get_caching_control(block_group);
    1148         533 :         if (!caching_ctl) {
    1149         533 :                 struct btrfs_caching_control *ctl;
    1150             : 
    1151         940 :                 list_for_each_entry(ctl, &fs_info->caching_block_groups, list) {
    1152         432 :                         if (ctl->block_group == block_group) {
    1153          25 :                                 caching_ctl = ctl;
    1154          25 :                                 refcount_inc(&caching_ctl->count);
    1155             :                                 break;
    1156             :                         }
    1157             :                 }
    1158             :         }
    1159         533 :         if (caching_ctl)
    1160          25 :                 list_del_init(&caching_ctl->list);
    1161         533 :         write_unlock(&fs_info->block_group_cache_lock);
    1162             : 
    1163         533 :         if (caching_ctl) {
    1164             :                 /* Once for the caching bgs list and once for us. */
    1165          25 :                 btrfs_put_caching_control(caching_ctl);
    1166          25 :                 btrfs_put_caching_control(caching_ctl);
    1167             :         }
    1168             : 
    1169         533 :         spin_lock(&trans->transaction->dirty_bgs_lock);
    1170         533 :         WARN_ON(!list_empty(&block_group->dirty_list));
    1171         533 :         WARN_ON(!list_empty(&block_group->io_list));
    1172         533 :         spin_unlock(&trans->transaction->dirty_bgs_lock);
    1173             : 
    1174         533 :         btrfs_remove_free_space_cache(block_group);
    1175             : 
    1176         533 :         spin_lock(&block_group->space_info->lock);
    1177         533 :         list_del_init(&block_group->ro_list);
    1178             : 
    1179         533 :         if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
    1180           0 :                 WARN_ON(block_group->space_info->total_bytes
    1181             :                         < block_group->length);
    1182           0 :                 WARN_ON(block_group->space_info->bytes_readonly
    1183             :                         < block_group->length - block_group->zone_unusable);
    1184           0 :                 WARN_ON(block_group->space_info->bytes_zone_unusable
    1185             :                         < block_group->zone_unusable);
    1186           0 :                 WARN_ON(block_group->space_info->disk_total
    1187             :                         < block_group->length * factor);
    1188             :         }
    1189         533 :         block_group->space_info->total_bytes -= block_group->length;
    1190         533 :         block_group->space_info->bytes_readonly -=
    1191         533 :                 (block_group->length - block_group->zone_unusable);
    1192         533 :         block_group->space_info->bytes_zone_unusable -=
    1193         533 :                 block_group->zone_unusable;
    1194         533 :         block_group->space_info->disk_total -= block_group->length * factor;
    1195             : 
    1196         533 :         spin_unlock(&block_group->space_info->lock);
    1197             : 
    1198             :         /*
    1199             :          * Remove the free space for the block group from the free space tree
    1200             :          * and the block group's item from the extent tree before marking the
    1201             :          * block group as removed. This is to prevent races between tasks that
    1202             :          * freeze and unfreeze a block group, this task, and another task
    1203             :          * allocating a new block group: the unfreeze task ends up removing
    1204             :          * the block group's extent map before the task calling this function
    1205             :          * deletes the block group item from the extent tree, allowing for
    1206             :          * another task to attempt to create another block group with the same
    1207             :          * item key (and failing with -EEXIST and a transaction abort).
    1208             :          */
    1209         533 :         ret = remove_block_group_free_space(trans, block_group);
    1210         533 :         if (ret)
    1211           0 :                 goto out;
    1212             : 
    1213         533 :         ret = remove_block_group_item(trans, path, block_group);
    1214         533 :         if (ret < 0)
    1215           0 :                 goto out;
    1216             : 
    1217         533 :         spin_lock(&block_group->lock);
    1218         533 :         set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags);
    1219             : 
    1220             :         /*
    1221             :          * At this point trimming or scrub can't start on this block group,
    1222             :          * because we removed the block group from the rbtree
    1223             :          * fs_info->block_group_cache_tree so no one can find it anymore and
    1224             :          * even if someone already got this block group before we removed it
    1225             :          * from the rbtree, they have already incremented block_group->frozen -
    1226             :          * if they didn't, for the trimming case they won't find any free space
    1227             :          * entries because we already removed them all when we called
    1228             :          * btrfs_remove_free_space_cache().
    1229             :          *
    1230             :          * And we must not remove the extent map from the fs_info->mapping_tree
    1231             :          * to prevent the same logical address range and physical device space
    1232             :          * ranges from being reused for a new block group. This is needed to
    1233             :          * avoid races with trimming and scrub.
    1234             :          *
    1235             :          * An fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
    1236             :          * completely transactionless, so while it is trimming a range the
    1237             :          * currently running transaction might finish and a new one start,
    1238             :          * allowing for new block groups to be created that can reuse the same
    1239             :          * physical device locations unless we take this special care.
    1240             :          *
    1241             :          * There may also be an implicit trim operation if the file system
    1242             :          * is mounted with -odiscard. The same protections must remain
    1243             :          * in place until the extents have been discarded completely when
    1244             :          * the transaction commit has completed.
    1245             :          */
    1246         533 :         remove_em = (atomic_read(&block_group->frozen) == 0);
    1247         533 :         spin_unlock(&block_group->lock);
    1248             : 
    1249         533 :         if (remove_em) {
    1250         533 :                 struct extent_map_tree *em_tree;
    1251             : 
    1252         533 :                 em_tree = &fs_info->mapping_tree;
    1253         533 :                 write_lock(&em_tree->lock);
    1254         533 :                 remove_extent_mapping(em_tree, em);
    1255         533 :                 write_unlock(&em_tree->lock);
    1256             :                 /* once for the tree */
    1257         533 :                 free_extent_map(em);
    1258             :         }
    1259             : 
    1260           0 : out:
    1261             :         /* Once for the lookup reference */
    1262         533 :         btrfs_put_block_group(block_group);
    1263         533 :         if (remove_rsv)
    1264           0 :                 btrfs_delayed_refs_rsv_release(fs_info, 1);
    1265         533 :         btrfs_free_path(path);
    1266         533 :         return ret;
    1267             : }
    1268             : 
    1269         533 : struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
    1270             :                 struct btrfs_fs_info *fs_info, const u64 chunk_offset)
    1271             : {
    1272         533 :         struct btrfs_root *root = btrfs_block_group_root(fs_info);
    1273         533 :         struct extent_map_tree *em_tree = &fs_info->mapping_tree;
    1274         533 :         struct extent_map *em;
    1275         533 :         struct map_lookup *map;
    1276         533 :         unsigned int num_items;
    1277             : 
    1278         533 :         read_lock(&em_tree->lock);
    1279         533 :         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
    1280         533 :         read_unlock(&em_tree->lock);
    1281         533 :         ASSERT(em && em->start == chunk_offset);
    1282             : 
    1283             :         /*
    1284             :          * We need to reserve 3 + N units from the metadata space info in order
    1285             :          * to remove a block group (done at btrfs_remove_chunk() and at
    1286             :          * btrfs_remove_block_group()), which are used for:
    1287             :          *
    1288             :          * 1 unit for adding the free space inode's orphan (located in the tree
    1289             :          * of tree roots).
    1290             :          * 1 unit for deleting the block group item (located in the extent
    1291             :          * tree).
    1292             :          * 1 unit for deleting the free space item (located in tree of tree
    1293             :          * roots).
    1294             :          * N units for deleting N device extent items corresponding to each
    1295             :          * stripe (located in the device tree).
    1296             :          *
    1297             :          * In order to remove a block group we also need to reserve units in the
    1298             :          * system space info in order to update the chunk tree (update one or
    1299             :          * more device items and remove one chunk item), but this is done at
    1300             :          * btrfs_remove_chunk() through a call to check_system_chunk().
    1301             :          */
    1302         533 :         map = em->map_lookup;
    1303         533 :         num_items = 3 + map->num_stripes;
    1304         533 :         free_extent_map(em);
    1305             : 
    1306         533 :         return btrfs_start_transaction_fallback_global_rsv(root, num_items);
    1307             : }
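/*
 * Editor-added worked example for the reservation described above
 * (illustrative numbers only): removing a block group whose chunk is striped
 * over two devices (map->num_stripes == 2) reserves
 *
 *     num_items = 3 + num_stripes = 3 + 2 = 5
 *
 * metadata units: the free space inode's orphan item, the block group item
 * and the free space item, plus one device extent item per stripe.
 */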
    1308             : 
    1309             : /*
    1310             :  * Mark block group @cache read-only, so later writes won't happen to block
    1311             :  * group @cache.
    1312             :  *
    1313             :  * If @force is not set, this function will only mark the block group readonly
    1314             :  * if we have enough free space (1M) in other metadata/system block groups.
    1315             :  * If @force is set, this function will mark the block group readonly
    1316             :  * without checking free space.
    1317             :  *
    1318             :  * NOTE: This function doesn't care if other block groups can contain all the
    1319             :  * data in this block group. That check should be done by relocation routine,
    1320             :  * not this function.
    1321             :  */
    1322         590 : static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
    1323             : {
    1324         590 :         struct btrfs_space_info *sinfo = cache->space_info;
    1325         590 :         u64 num_bytes;
    1326         590 :         int ret = -ENOSPC;
    1327             : 
    1328         590 :         spin_lock(&sinfo->lock);
    1329         590 :         spin_lock(&cache->lock);
    1330             : 
    1331         590 :         if (cache->swap_extents) {
    1332           0 :                 ret = -ETXTBSY;
    1333           0 :                 goto out;
    1334             :         }
    1335             : 
    1336         590 :         if (cache->ro) {
    1337           0 :                 cache->ro++;
    1338           0 :                 ret = 0;
    1339           0 :                 goto out;
    1340             :         }
    1341             : 
    1342         590 :         num_bytes = cache->length - cache->reserved - cache->pinned -
    1343         590 :                     cache->bytes_super - cache->zone_unusable - cache->used;
    1344             : 
    1345             :         /*
    1346             :          * Data never overcommits, even in mixed mode, so just do a straight
    1347             :          * check of the leftover space against how much we have allocated.
    1348             :          */
    1349         590 :         if (force) {
    1350             :                 ret = 0;
    1351         590 :         } else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) {
    1352         277 :                 u64 sinfo_used = btrfs_space_info_used(sinfo, true);
    1353             : 
    1354             :                 /*
    1355             :                  * Here we make sure that if we mark this bg RO, we still have
    1356             :                  * enough free space left over as a buffer.
    1357             :                  */
    1358         277 :                 if (sinfo_used + num_bytes <= sinfo->total_bytes)
    1359             :                         ret = 0;
    1360             :         } else {
    1361             :                 /*
    1362             :                  * We overcommit metadata, so we need to do the
    1363             :                  * btrfs_can_overcommit check here, and we need to pass in
    1364             :                  * BTRFS_RESERVE_NO_FLUSH to give ourselves the most leeway
    1365             :                  * possible to allow us to mark this block group as read only.
    1366             :                  */
    1367         313 :                 if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes,
    1368             :                                          BTRFS_RESERVE_NO_FLUSH))
    1369             :                         ret = 0;
    1370             :         }
    1371             : 
    1372             :         if (!ret) {
    1373         557 :                 sinfo->bytes_readonly += num_bytes;
    1374         557 :                 if (btrfs_is_zoned(cache->fs_info)) {
    1375             :                         /* Migrate zone_unusable bytes to readonly */
    1376             :                         sinfo->bytes_readonly += cache->zone_unusable;
    1377             :                         sinfo->bytes_zone_unusable -= cache->zone_unusable;
    1378             :                         cache->zone_unusable = 0;
    1379             :                 }
    1380         557 :                 cache->ro++;
    1381         557 :                 list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
    1382             :         }
    1383         590 : out:
    1384         590 :         spin_unlock(&cache->lock);
    1385         590 :         spin_unlock(&sinfo->lock);
    1386         590 :         if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
    1387           0 :                 btrfs_info(cache->fs_info,
    1388             :                         "unable to make block group %llu ro", cache->start);
    1389           0 :                 btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
    1390             :         }
    1391         590 :         return ret;
    1392             : }
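/*
 * Editor-added illustration of the data-profile check above (assumed numbers):
 * with sinfo->total_bytes = 10 GiB, btrfs_space_info_used() = 8 GiB and
 * num_bytes (the still-unallocated part of this block group) = 1 GiB, the
 * check 8 GiB + 1 GiB <= 10 GiB holds, so the group can be marked read-only
 * while the space info keeps enough free space as a buffer. If 9.5 GiB were
 * already used instead, the check would fail and -ENOSPC would be returned.
 */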
    1393             : 
    1394          12 : static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
    1395             :                                  struct btrfs_block_group *bg)
    1396             : {
    1397          12 :         struct btrfs_fs_info *fs_info = bg->fs_info;
    1398          12 :         struct btrfs_transaction *prev_trans = NULL;
    1399          12 :         const u64 start = bg->start;
    1400          12 :         const u64 end = start + bg->length - 1;
    1401          12 :         int ret;
    1402             : 
    1403          12 :         spin_lock(&fs_info->trans_lock);
    1404          12 :         if (trans->transaction->list.prev != &fs_info->trans_list) {
    1405           0 :                 prev_trans = list_last_entry(&trans->transaction->list,
    1406             :                                              struct btrfs_transaction, list);
    1407           0 :                 refcount_inc(&prev_trans->use_count);
    1408             :         }
    1409          12 :         spin_unlock(&fs_info->trans_lock);
    1410             : 
    1411             :         /*
    1412             :          * Hold the unused_bg_unpin_mutex lock to avoid racing with
    1413             :          * btrfs_finish_extent_commit(). If we are at transaction N, another
    1414             :          * task might be running finish_extent_commit() for the previous
    1415             :          * transaction N - 1, and have seen a range belonging to the block
    1416             :          * group in pinned_extents before we were able to clear the whole block
    1417             :          * group range from pinned_extents. This means that task can look up
    1418             :          * the block group after we unpinned it from pinned_extents and removed
    1419             :          * it, leading to a BUG_ON() at unpin_extent_range().
    1420             :          */
    1421          12 :         mutex_lock(&fs_info->unused_bg_unpin_mutex);
    1422          12 :         if (prev_trans) {
    1423           0 :                 ret = clear_extent_bits(&prev_trans->pinned_extents, start, end,
    1424             :                                         EXTENT_DIRTY);
    1425           0 :                 if (ret)
    1426           0 :                         goto out;
    1427             :         }
    1428             : 
    1429          12 :         ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end,
    1430             :                                 EXTENT_DIRTY);
    1431          12 : out:
    1432          12 :         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
    1433          12 :         if (prev_trans)
    1434           0 :                 btrfs_put_transaction(prev_trans);
    1435             : 
    1436          12 :         return ret == 0;
    1437             : }
    1438             : 
    1439             : /*
    1440             :  * Process the unused_bgs list and remove any that don't have any allocated
    1441             :  * space inside of them.
    1442             :  */
    1443       47176 : void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
    1444             : {
    1445       47176 :         struct btrfs_block_group *block_group;
    1446       47176 :         struct btrfs_space_info *space_info;
    1447       47176 :         struct btrfs_trans_handle *trans;
    1448       47176 :         const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC);
    1449       47176 :         int ret = 0;
    1450             : 
    1451       47176 :         if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
    1452             :                 return;
    1453             : 
    1454       47176 :         if (btrfs_fs_closing(fs_info))
    1455             :                 return;
    1456             : 
    1457             :         /*
    1458             :          * Long running balances can keep us blocked here for eternity, so
    1459             :          * simply skip deletion if we're unable to get the mutex.
    1460             :          */
    1461       44011 :         if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
    1462             :                 return;
    1463             : 
    1464       43999 :         spin_lock(&fs_info->unused_bgs_lock);
    1465       44080 :         while (!list_empty(&fs_info->unused_bgs)) {
    1466          82 :                 int trimming;
    1467             : 
    1468          82 :                 block_group = list_first_entry(&fs_info->unused_bgs,
    1469             :                                                struct btrfs_block_group,
    1470             :                                                bg_list);
    1471          82 :                 list_del_init(&block_group->bg_list);
    1472             : 
    1473          80 :                 space_info = block_group->space_info;
    1474             : 
    1475          80 :                 if (ret || btrfs_mixed_space_info(space_info)) {
    1476           0 :                         btrfs_put_block_group(block_group);
    1477           0 :                         continue;
    1478             :                 }
    1479          80 :                 spin_unlock(&fs_info->unused_bgs_lock);
    1480             : 
    1481          80 :                 btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
    1482             : 
    1483             :                 /* Don't want to race with allocators so take the groups_sem */
    1484          80 :                 down_write(&space_info->groups_sem);
    1485             : 
    1486             :                 /*
    1487             :                  * Async discard moves the final block group discard to be prior
    1488             :                  * to the unused_bgs code path.  Therefore, if it's not fully
    1489             :                  * trimmed, punt it back to the async discard lists.
    1490             :                  */
    1491         151 :                 if (btrfs_test_opt(fs_info, DISCARD_ASYNC) &&
    1492          71 :                     !btrfs_is_free_space_trimmed(block_group)) {
    1493          41 :                         trace_btrfs_skip_unused_block_group(block_group);
    1494          41 :                         up_write(&space_info->groups_sem);
    1495             :                         /* Requeue if we failed because of async discard */
    1496          41 :                         btrfs_discard_queue_work(&fs_info->discard_ctl,
    1497             :                                                  block_group);
    1498          41 :                         goto next;
    1499             :                 }
    1500             : 
    1501          39 :                 spin_lock(&block_group->lock);
    1502          39 :                 if (block_group->reserved || block_group->pinned ||
    1503          19 :                     block_group->used || block_group->ro ||
    1504          19 :                     list_is_singular(&block_group->list)) {
    1505             :                         /*
    1506             :                          * We want to bail if we made new allocations or have
    1507             :                          * outstanding allocations in this block group.  We do
    1508             :                          * the ro check in case balance is currently acting on
    1509             :                          * this block group.
    1510             :                          */
    1511          27 :                         trace_btrfs_skip_unused_block_group(block_group);
    1512          27 :                         spin_unlock(&block_group->lock);
    1513          27 :                         up_write(&space_info->groups_sem);
    1514          27 :                         goto next;
    1515             :                 }
    1516          12 :                 spin_unlock(&block_group->lock);
    1517             : 
    1518             :                 /* We don't want to force the issue, only flip if it's ok. */
    1519          12 :                 ret = inc_block_group_ro(block_group, 0);
    1520          12 :                 up_write(&space_info->groups_sem);
    1521          12 :                 if (ret < 0) {
    1522           0 :                         ret = 0;
    1523           0 :                         goto next;
    1524             :                 }
    1525             : 
    1526          12 :                 ret = btrfs_zone_finish(block_group);
    1527          12 :                 if (ret < 0) {
    1528             :                         btrfs_dec_block_group_ro(block_group);
    1529             :                         if (ret == -EAGAIN)
    1530             :                                 ret = 0;
    1531             :                         goto next;
    1532             :                 }
    1533             : 
    1534             :                 /*
    1535             :                  * Want to do this before we do anything else so we can recover
    1536             :                  * properly if we fail to join the transaction.
    1537             :                  */
    1538          12 :                 trans = btrfs_start_trans_remove_block_group(fs_info,
    1539             :                                                      block_group->start);
    1540          12 :                 if (IS_ERR(trans)) {
    1541           0 :                         btrfs_dec_block_group_ro(block_group);
    1542           0 :                         ret = PTR_ERR(trans);
    1543           0 :                         goto next;
    1544             :                 }
    1545             : 
    1546             :                 /*
    1547             :                  * We could have pending pinned extents for this block group,
    1548             :                  * just delete them, we don't care about them anymore.
    1549             :                  */
    1550          12 :                 if (!clean_pinned_extents(trans, block_group)) {
    1551           0 :                         btrfs_dec_block_group_ro(block_group);
    1552           0 :                         goto end_trans;
    1553             :                 }
    1554             : 
    1555             :                 /*
    1556             :                  * At this point, the block_group is read only and should fail
    1557             :                  * new allocations.  However, btrfs_finish_extent_commit() can
    1558             :                  * cause this block_group to be placed back on the discard
    1559             :                  * lists because now the block_group isn't fully discarded.
    1560             :                  * Bail here and try again later after discarding everything.
    1561             :                  */
    1562          12 :                 spin_lock(&fs_info->discard_ctl.lock);
    1563          12 :                 if (!list_empty(&block_group->discard_list)) {
    1564           0 :                         spin_unlock(&fs_info->discard_ctl.lock);
    1565           0 :                         btrfs_dec_block_group_ro(block_group);
    1566           0 :                         btrfs_discard_queue_work(&fs_info->discard_ctl,
    1567             :                                                  block_group);
    1568           0 :                         goto end_trans;
    1569             :                 }
    1570          12 :                 spin_unlock(&fs_info->discard_ctl.lock);
    1571             : 
    1572             :                 /* Reset pinned so btrfs_put_block_group doesn't complain */
    1573          12 :                 spin_lock(&space_info->lock);
    1574          12 :                 spin_lock(&block_group->lock);
    1575             : 
    1576          12 :                 btrfs_space_info_update_bytes_pinned(fs_info, space_info,
    1577          12 :                                                      -block_group->pinned);
    1578          12 :                 space_info->bytes_readonly += block_group->pinned;
    1579          12 :                 block_group->pinned = 0;
    1580             : 
    1581          12 :                 spin_unlock(&block_group->lock);
    1582          12 :                 spin_unlock(&space_info->lock);
    1583             : 
    1584             :                 /*
    1585             :                  * The normal path is that an unused block group is passed here and
    1586             :                  * trimming is then handled in the transaction commit path.
    1587             :                  * Async discard interposes before this to do the trimming
    1588             :                  * before coming down the unused block group path as trimming
    1589             :                  * will no longer be done later in the transaction commit path.
    1590             :                  */
    1591          12 :                 if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
    1592           0 :                         goto flip_async;
    1593             : 
    1594             :                 /*
    1595             :                  * DISCARD can flip during remount. On zoned filesystems, we
    1596             :                  * need to reset sequential-required zones.
    1597             :                  */
    1598          12 :                 trimming = btrfs_test_opt(fs_info, DISCARD_SYNC) ||
    1599             :                                 btrfs_is_zoned(fs_info);
    1600             : 
    1601             :                 /* Implicit trim during transaction commit. */
    1602           0 :                 if (trimming)
    1603           0 :                         btrfs_freeze_block_group(block_group);
    1604             : 
    1605             :                 /*
    1606             :                  * btrfs_remove_chunk() will abort the transaction if things go
    1607             :                  * horribly wrong.
    1608             :                  */
    1609          12 :                 ret = btrfs_remove_chunk(trans, block_group->start);
    1610             : 
    1611          12 :                 if (ret) {
    1612           0 :                         if (trimming)
    1613           0 :                                 btrfs_unfreeze_block_group(block_group);
    1614           0 :                         goto end_trans;
    1615             :                 }
    1616             : 
    1617             :                 /*
    1618             :                  * If we're not mounted with -odiscard, we can just forget
    1619             :                  * about this block group. Otherwise we'll need to wait
    1620             :                  * until transaction commit to do the actual discard.
    1621             :                  */
    1622          12 :                 if (trimming) {
    1623           0 :                         spin_lock(&fs_info->unused_bgs_lock);
    1624             :                         /*
    1625             :                          * A concurrent scrub might have added us to the list
    1626             :                          * fs_info->unused_bgs, so use a list_move operation
    1627             :                          * to add the block group to the deleted_bgs list.
    1628             :                          */
    1629           0 :                         list_move(&block_group->bg_list,
    1630           0 :                                   &trans->transaction->deleted_bgs);
    1631           0 :                         spin_unlock(&fs_info->unused_bgs_lock);
    1632           0 :                         btrfs_get_block_group(block_group);
    1633             :                 }
    1634          12 : end_trans:
    1635          12 :                 btrfs_end_transaction(trans);
    1636          80 : next:
    1637          80 :                 btrfs_put_block_group(block_group);
    1638          80 :                 spin_lock(&fs_info->unused_bgs_lock);
    1639             :         }
    1640       43998 :         spin_unlock(&fs_info->unused_bgs_lock);
    1641       44000 :         mutex_unlock(&fs_info->reclaim_bgs_lock);
    1642       44000 :         return;
    1643             : 
    1644             : flip_async:
    1645           0 :         btrfs_end_transaction(trans);
    1646           0 :         mutex_unlock(&fs_info->reclaim_bgs_lock);
    1647           0 :         btrfs_put_block_group(block_group);
    1648           0 :         btrfs_discard_punt_unused_bgs_list(fs_info);
    1649             : }
    1650             : 
    1651       19011 : void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
    1652             : {
    1653       19011 :         struct btrfs_fs_info *fs_info = bg->fs_info;
    1654             : 
    1655       19011 :         spin_lock(&fs_info->unused_bgs_lock);
    1656       19011 :         if (list_empty(&bg->bg_list)) {
    1657       19005 :                 btrfs_get_block_group(bg);
    1658       19005 :                 trace_btrfs_add_unused_block_group(bg);
    1659       19005 :                 list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
    1660           6 :         } else if (!test_bit(BLOCK_GROUP_FLAG_NEW, &bg->runtime_flags)) {
    1661             :                 /* Pull out the block group from the reclaim_bgs list. */
    1662           6 :                 trace_btrfs_add_unused_block_group(bg);
    1663           6 :                 list_move_tail(&bg->bg_list, &fs_info->unused_bgs);
    1664             :         }
    1665       19011 :         spin_unlock(&fs_info->unused_bgs_lock);
    1666       19011 : }
    1667             : 
    1668             : /*
    1669             :  * We want block groups with a low number of used bytes to be at the beginning
    1670             :  * of the list, so they will get reclaimed first.
    1671             :  */
    1672           0 : static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
    1673             :                            const struct list_head *b)
    1674             : {
    1675           0 :         const struct btrfs_block_group *bg1, *bg2;
    1676             : 
    1677           0 :         bg1 = list_entry(a, struct btrfs_block_group, bg_list);
    1678           0 :         bg2 = list_entry(b, struct btrfs_block_group, bg_list);
    1679             : 
    1680           0 :         return bg1->used > bg2->used;
    1681             : }
    1682             : 
    1683             : static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info)
    1684             : {
    1685             :         if (btrfs_is_zoned(fs_info))
    1686             :                 return btrfs_zoned_should_reclaim(fs_info);
    1687             :         return true;
    1688             : }
    1689             : 
    1690     8563323 : static bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 bytes_freed)
    1691             : {
    1692     8563323 :         const struct btrfs_space_info *space_info = bg->space_info;
    1693     8563323 :         const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold);
    1694     8563323 :         const u64 new_val = bg->used;
    1695     8563323 :         const u64 old_val = new_val + bytes_freed;
    1696     8563323 :         u64 thresh;
    1697             : 
    1698     8563323 :         if (reclaim_thresh == 0)
    1699             :                 return false;
    1700             : 
    1701           0 :         thresh = mult_perc(bg->length, reclaim_thresh);
    1702             : 
    1703             :         /*
    1704             :          * If we were below the threshold before, don't reclaim: we are likely a
    1705             :          * brand new block group and we don't want to relocate new block groups.
    1706             :          */
    1707           0 :         if (old_val < thresh)
    1708             :                 return false;
    1709           0 :         if (new_val >= thresh)
    1710           0 :                 return false;
    1711             :         return true;
    1712             : }
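/*
 * Editor-added worked example (assumed numbers): with bg->length = 1 GiB and
 * space_info->bg_reclaim_threshold = 75, thresh = 768 MiB. A block group that
 * held 800 MiB before the free (old_val) and 700 MiB after it (new_val) has
 * just crossed below the threshold and is reported as a reclaim candidate; a
 * group that was already below 768 MiB before the free is skipped, since it is
 * likely still a young block group filling up.
 */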
    1713             : 
    1714           0 : void btrfs_reclaim_bgs_work(struct work_struct *work)
    1715             : {
    1716           0 :         struct btrfs_fs_info *fs_info =
    1717           0 :                 container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
    1718           0 :         struct btrfs_block_group *bg;
    1719           0 :         struct btrfs_space_info *space_info;
    1720             : 
    1721           0 :         if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
    1722             :                 return;
    1723             : 
    1724           0 :         if (btrfs_fs_closing(fs_info))
    1725             :                 return;
    1726             : 
    1727           0 :         if (!btrfs_should_reclaim(fs_info))
    1728             :                 return;
    1729             : 
    1730           0 :         sb_start_write(fs_info->sb);
    1731             : 
    1732           0 :         if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
    1733           0 :                 sb_end_write(fs_info->sb);
    1734           0 :                 return;
    1735             :         }
    1736             : 
    1737             :         /*
    1738             :          * Long running balances can keep us blocked here for eternity, so
    1739             :          * simply skip reclaim if we're unable to get the mutex.
    1740             :          */
    1741           0 :         if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
    1742           0 :                 btrfs_exclop_finish(fs_info);
    1743           0 :                 sb_end_write(fs_info->sb);
    1744           0 :                 return;
    1745             :         }
    1746             : 
    1747           0 :         spin_lock(&fs_info->unused_bgs_lock);
    1748             :         /*
    1749             :          * Sort happens under lock because we can't simply splice it and sort.
    1750             :          * The block groups might still be in use and reachable via bg_list,
    1751             :          * and their presence in the reclaim_bgs list must be preserved.
    1752             :          */
    1753           0 :         list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
    1754           0 :         while (!list_empty(&fs_info->reclaim_bgs)) {
    1755           0 :                 u64 zone_unusable;
    1756           0 :                 int ret = 0;
    1757             : 
    1758           0 :                 bg = list_first_entry(&fs_info->reclaim_bgs,
    1759             :                                       struct btrfs_block_group,
    1760             :                                       bg_list);
    1761           0 :                 list_del_init(&bg->bg_list);
    1762             : 
    1763           0 :                 space_info = bg->space_info;
    1764           0 :                 spin_unlock(&fs_info->unused_bgs_lock);
    1765             : 
    1766             :                 /* Don't race with allocators so take the groups_sem */
    1767           0 :                 down_write(&space_info->groups_sem);
    1768             : 
    1769           0 :                 spin_lock(&bg->lock);
    1770           0 :                 if (bg->reserved || bg->pinned || bg->ro) {
    1771             :                         /*
    1772             :                          * We want to bail if we made new allocations or have
    1773             :                          * outstanding allocations in this block group.  We do
    1774             :                          * the ro check in case balance is currently acting on
    1775             :                          * this block group.
    1776             :                          */
    1777           0 :                         spin_unlock(&bg->lock);
    1778           0 :                         up_write(&space_info->groups_sem);
    1779           0 :                         goto next;
    1780             :                 }
    1781           0 :                 if (bg->used == 0) {
    1782             :                         /*
    1783             :                          * It is possible that we trigger relocation on a block
    1784             :                          * group as its extents are deleted and it first goes
    1785             :                          * below the threshold, then shortly after goes empty.
    1786             :                          *
    1787             :                          * In this case, relocating it does delete it, but has
    1788             :                          * some overhead in relocation specific metadata, looking
    1789             :                          * for the non-existent extents and running some extra
    1790             :                          * transactions, which we can avoid by using one of the
    1791             :                          * other mechanisms for dealing with empty block groups.
    1792             :                          */
    1793           0 :                         if (!btrfs_test_opt(fs_info, DISCARD_ASYNC))
    1794           0 :                                 btrfs_mark_bg_unused(bg);
    1795           0 :                         spin_unlock(&bg->lock);
    1796           0 :                         up_write(&space_info->groups_sem);
    1797           0 :                         goto next;
    1798             : 
    1799             :                 }
    1800             :                 /*
    1801             :                  * The block group might no longer meet the reclaim condition by
    1802             :                  * the time we get around to reclaiming it, so to avoid
    1803             :                  * reclaiming overly full block groups, skip them here.
    1804             :                  *
    1805             :                  * Since the decision making process also depends on the amount
    1806             :                  * being freed, pass in a fake giant value to skip that extra
    1807             :                  * check, which is more meaningful when adding to the list in
    1808             :                  * the first place.
    1809             :                  */
    1810           0 :                 if (!should_reclaim_block_group(bg, bg->length)) {
    1811           0 :                         spin_unlock(&bg->lock);
    1812           0 :                         up_write(&space_info->groups_sem);
    1813           0 :                         goto next;
    1814             :                 }
    1815           0 :                 spin_unlock(&bg->lock);
    1816             : 
    1817             :                 /*
    1818             :                  * Get out fast, in case we're read-only or unmounting the
    1819             :                  * filesystem. It is OK to drop block groups from the list even
    1820             :                  * for the read-only case. As we did sb_start_write(),
    1821             :                  * "mount -o remount,ro" won't happen, and a read-only filesystem
    1822             :                  * here means it was forced read-only due to a fatal error, so it
    1823             :                  * never gets back to read-write to let us reclaim again.
    1824             :                  */
    1825           0 :                 if (btrfs_need_cleaner_sleep(fs_info)) {
    1826           0 :                         up_write(&space_info->groups_sem);
    1827           0 :                         goto next;
    1828             :                 }
    1829             : 
    1830             :                 /*
    1831             :                  * Cache the zone_unusable value before turning the block group
    1832             :                  * to read only. As soon as the block group is read only, its
    1833             :                  * zone_unusable value gets moved to the block group's read-only
    1834             :                  * bytes and isn't available for calculations anymore.
    1835             :                  */
    1836           0 :                 zone_unusable = bg->zone_unusable;
    1837           0 :                 ret = inc_block_group_ro(bg, 0);
    1838           0 :                 up_write(&space_info->groups_sem);
    1839           0 :                 if (ret < 0)
    1840           0 :                         goto next;
    1841             : 
    1842           0 :                 btrfs_info(fs_info,
    1843             :                         "reclaiming chunk %llu with %llu%% used %llu%% unusable",
    1844             :                                 bg->start,
    1845             :                                 div64_u64(bg->used * 100, bg->length),
    1846             :                                 div64_u64(zone_unusable * 100, bg->length));
    1847           0 :                 trace_btrfs_reclaim_block_group(bg);
    1848           0 :                 ret = btrfs_relocate_chunk(fs_info, bg->start);
    1849           0 :                 if (ret) {
    1850           0 :                         btrfs_dec_block_group_ro(bg);
    1851           0 :                         btrfs_err(fs_info, "error relocating chunk %llu",
    1852             :                                   bg->start);
    1853             :                 }
    1854             : 
    1855           0 : next:
    1856           0 :                 if (ret)
    1857           0 :                         btrfs_mark_bg_to_reclaim(bg);
    1858           0 :                 btrfs_put_block_group(bg);
    1859             : 
    1860           0 :                 mutex_unlock(&fs_info->reclaim_bgs_lock);
    1861             :                 /*
    1862             :                  * Reclaiming all the block groups in the list can take really
    1863             :                  * long.  Prioritize cleaning up unused block groups.
    1864             :                  */
    1865           0 :                 btrfs_delete_unused_bgs(fs_info);
    1866             :                 /*
    1867             :                  * If we are interrupted by a balance, we can just bail out. The
    1868             :                  * cleaner thread will restart this work again if necessary.
    1869             :                  */
    1870           0 :                 if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
    1871           0 :                         goto end;
    1872           0 :                 spin_lock(&fs_info->unused_bgs_lock);
    1873             :         }
    1874           0 :         spin_unlock(&fs_info->unused_bgs_lock);
    1875           0 :         mutex_unlock(&fs_info->reclaim_bgs_lock);
    1876           0 : end:
    1877           0 :         btrfs_exclop_finish(fs_info);
    1878           0 :         sb_end_write(fs_info->sb);
    1879             : }
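
The mutex_trylock() at the bottom of the loop above lets the worker yield to a concurrent balance instead of blocking on reclaim_bgs_lock; any block groups still on the list are simply picked up on a later run. A minimal userspace sketch of that "trylock or bail and retry later" pattern, using plain pthreads rather than the kernel primitives:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    /* Process one item per acquisition; bail out if someone else holds the lock. */
    static int process_some(int *remaining)
    {
            while (*remaining > 0) {
                    if (pthread_mutex_trylock(&lock) != 0)
                            return 1;       /* contended: stop, a later run resumes */
                    (*remaining)--;
                    pthread_mutex_unlock(&lock);
            }
            return 0;                       /* drained the whole list */
    }

    int main(void)
    {
            int todo = 3;

            while (process_some(&todo))
                    ;                       /* in btrfs, a later queue_work() would resume this */
            printf("remaining: %d\n", todo);
            return 0;
    }
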
    1880             : 
    1881       44011 : void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
    1882             : {
    1883       44011 :         spin_lock(&fs_info->unused_bgs_lock);
    1884       44011 :         if (!list_empty(&fs_info->reclaim_bgs))
    1885           0 :                 queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work);
    1886       44011 :         spin_unlock(&fs_info->unused_bgs_lock);
    1887       44011 : }
    1888             : 
    1889           0 : void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
    1890             : {
    1891           0 :         struct btrfs_fs_info *fs_info = bg->fs_info;
    1892             : 
    1893           0 :         spin_lock(&fs_info->unused_bgs_lock);
    1894           0 :         if (list_empty(&bg->bg_list)) {
    1895           0 :                 btrfs_get_block_group(bg);
    1896           0 :                 trace_btrfs_add_reclaim_block_group(bg);
    1897           0 :                 list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs);
    1898             :         }
    1899           0 :         spin_unlock(&fs_info->unused_bgs_lock);
    1900           0 : }
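
The list_empty() check followed by btrfs_get_block_group() above is the idiom used so that membership in the reclaim_bgs list always accounts for exactly one extra reference, which the worker later drops with btrfs_put_block_group() once the entry has been processed. A small self-contained sketch of that "one reference per list membership" pattern (struct and helper names are hypothetical, not btrfs API):

    #include <stdio.h>

    /* Hypothetical, simplified stand-ins for the kernel structures. */
    struct bg {
            int refs;
            int on_list;            /* stands in for !list_empty(&bg->bg_list) */
    };

    /* Take a reference only on the first insertion, like btrfs_mark_bg_to_reclaim(). */
    static void mark_for_reclaim(struct bg *bg)
    {
            if (!bg->on_list) {
                    bg->refs++;     /* this reference is owned by the list */
                    bg->on_list = 1;
            }
    }

    /* The worker drops the list's reference after removing the entry. */
    static void reclaim_one(struct bg *bg)
    {
            if (bg->on_list) {
                    bg->on_list = 0;
                    bg->refs--;
            }
    }

    int main(void)
    {
            struct bg bg = { .refs = 1, .on_list = 0 };

            mark_for_reclaim(&bg);
            mark_for_reclaim(&bg);  /* second call is a no-op, refs stays at 2 */
            printf("refs after marking twice: %d\n", bg.refs);
            reclaim_one(&bg);
            printf("refs after reclaim: %d\n", bg.refs);
            return 0;
    }
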
    1901             : 
    1902       29306 : static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
    1903             :                            struct btrfs_path *path)
    1904             : {
    1905       29306 :         struct extent_map_tree *em_tree;
    1906       29306 :         struct extent_map *em;
    1907       29306 :         struct btrfs_block_group_item bg;
    1908       29306 :         struct extent_buffer *leaf;
    1909       29306 :         int slot;
    1910       29306 :         u64 flags;
    1911       29306 :         int ret = 0;
    1912             : 
    1913       29306 :         slot = path->slots[0];
    1914       29306 :         leaf = path->nodes[0];
    1915             : 
    1916       29306 :         em_tree = &fs_info->mapping_tree;
    1917       29306 :         read_lock(&em_tree->lock);
    1918       29306 :         em = lookup_extent_mapping(em_tree, key->objectid, key->offset);
    1919       29306 :         read_unlock(&em_tree->lock);
    1920       29306 :         if (!em) {
    1921           0 :                 btrfs_err(fs_info,
    1922             :                           "logical %llu len %llu found bg but no related chunk",
    1923             :                           key->objectid, key->offset);
    1924           0 :                 return -ENOENT;
    1925             :         }
    1926             : 
    1927       29306 :         if (em->start != key->objectid || em->len != key->offset) {
    1928           0 :                 btrfs_err(fs_info,
    1929             :                         "block group %llu len %llu mismatch with chunk %llu len %llu",
    1930             :                         key->objectid, key->offset, em->start, em->len);
    1931           0 :                 ret = -EUCLEAN;
    1932           0 :                 goto out_free_em;
    1933             :         }
    1934             : 
    1935       29306 :         read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot),
    1936             :                            sizeof(bg));
    1937       29306 :         flags = btrfs_stack_block_group_flags(&bg) &
    1938             :                 BTRFS_BLOCK_GROUP_TYPE_MASK;
    1939             : 
    1940       29306 :         if (flags != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
    1941           0 :                 btrfs_err(fs_info,
    1942             : "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
    1943             :                           key->objectid, key->offset, flags,
    1944             :                           (BTRFS_BLOCK_GROUP_TYPE_MASK & em->map_lookup->type));
    1945           0 :                 ret = -EUCLEAN;
    1946             :         }
    1947             : 
    1948       29306 : out_free_em:
    1949       29306 :         free_extent_map(em);
    1950       29306 :         return ret;
    1951             : }
    1952             : 
    1953       32523 : static int find_first_block_group(struct btrfs_fs_info *fs_info,
    1954             :                                   struct btrfs_path *path,
    1955             :                                   struct btrfs_key *key)
    1956             : {
    1957       32523 :         struct btrfs_root *root = btrfs_block_group_root(fs_info);
    1958       32523 :         int ret;
    1959       32523 :         struct btrfs_key found_key;
    1960             : 
    1961     7802489 :         btrfs_for_each_slot(root, key, &found_key, path, ret) {
    1962     7799272 :                 if (found_key.objectid >= key->objectid &&
    1963     7799272 :                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
    1964       29306 :                         return read_bg_from_eb(fs_info, &found_key, path);
    1965             :                 }
    1966             :         }
    1967             :         return ret;
    1968             : }
    1969             : 
    1970       30776 : static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
    1971             : {
    1972       30776 :         u64 extra_flags = chunk_to_extended(flags) &
    1973             :                                 BTRFS_EXTENDED_PROFILE_MASK;
    1974             : 
    1975       30776 :         write_seqlock(&fs_info->profiles_lock);
    1976       30776 :         if (flags & BTRFS_BLOCK_GROUP_DATA)
    1977       23899 :                 fs_info->avail_data_alloc_bits |= extra_flags;
    1978       30776 :         if (flags & BTRFS_BLOCK_GROUP_METADATA)
    1979        3754 :                 fs_info->avail_metadata_alloc_bits |= extra_flags;
    1980       30776 :         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
    1981        3314 :                 fs_info->avail_system_alloc_bits |= extra_flags;
    1982       30776 :         write_sequnlock(&fs_info->profiles_lock);
    1983       30776 : }
    1984             : 
    1985             : /*
    1986             :  * Map a physical disk address to a list of logical addresses.
    1987             :  *
    1988             :  * @fs_info:       the filesystem
    1989             :  * @chunk_start:   logical address of block group
    1990             :  * @physical:      physical address to map to logical addresses
    1991             :  * @logical:       return array of logical addresses which map to @physical
    1992             :  * @naddrs:        length of @logical
    1993             :  * @stripe_len:    size of IO stripe for the given block group
    1994             :  *
    1995             :  * Maps a particular @physical disk address to a list of @logical addresses.
    1996             :  * Used primarily to exclude those portions of a block group that contain super
    1997             :  * block copies.
    1998             :  */
    1999       92328 : int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
    2000             :                      u64 physical, u64 **logical, int *naddrs, int *stripe_len)
    2001             : {
    2002       92328 :         struct extent_map *em;
    2003       92328 :         struct map_lookup *map;
    2004       92328 :         u64 *buf;
    2005       92328 :         u64 bytenr;
    2006       92328 :         u64 data_stripe_length;
    2007       92328 :         u64 io_stripe_size;
    2008       92328 :         int i, nr = 0;
    2009       92328 :         int ret = 0;
    2010             : 
    2011       92328 :         em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
    2012       92328 :         if (IS_ERR(em))
    2013             :                 return -EIO;
    2014             : 
    2015       92328 :         map = em->map_lookup;
    2016       92328 :         data_stripe_length = em->orig_block_len;
    2017       92328 :         io_stripe_size = BTRFS_STRIPE_LEN;
    2018       92328 :         chunk_start = em->start;
    2019             : 
    2020             :         /* For RAID5/6 adjust to a full IO stripe length */
    2021       92328 :         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
    2022           0 :                 io_stripe_size = btrfs_stripe_nr_to_offset(nr_data_stripes(map));
    2023             : 
    2024       92328 :         buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
    2025       92328 :         if (!buf) {
    2026           0 :                 ret = -ENOMEM;
    2027           0 :                 goto out;
    2028             :         }
    2029             : 
    2030      205140 :         for (i = 0; i < map->num_stripes; i++) {
    2031      112812 :                 bool already_inserted = false;
    2032      112812 :                 u32 stripe_nr;
    2033      112812 :                 u32 offset;
    2034      112812 :                 int j;
    2035             : 
    2036      112812 :                 if (!in_range(physical, map->stripes[i].physical,
    2037             :                               data_stripe_length))
    2038      109564 :                         continue;
    2039             : 
    2040        3248 :                 stripe_nr = (physical - map->stripes[i].physical) >>
    2041             :                             BTRFS_STRIPE_LEN_SHIFT;
    2042        3248 :                 offset = (physical - map->stripes[i].physical) &
    2043             :                          BTRFS_STRIPE_LEN_MASK;
    2044             : 
    2045        3248 :                 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
    2046             :                                  BTRFS_BLOCK_GROUP_RAID10))
    2047           0 :                         stripe_nr = div_u64(stripe_nr * map->num_stripes + i,
    2048           0 :                                             map->sub_stripes);
    2049             :                 /*
    2050             :                  * The remaining case would be RAID56, where we would multiply
    2051             :                  * by nr_data_stripes().  Alternatively, just use rmap_len below
    2052             :                  * instead of map->stripe_len.
    2053             :                  */
    2054        3248 :                 bytenr = chunk_start + stripe_nr * io_stripe_size + offset;
    2055             : 
    2056             :                 /* Ensure we don't add duplicate addresses */
    2057        3249 :                 for (j = 0; j < nr; j++) {
    2058           1 :                         if (buf[j] == bytenr) {
    2059             :                                 already_inserted = true;
    2060             :                                 break;
    2061             :                         }
    2062             :                 }
    2063             : 
    2064        3248 :                 if (!already_inserted)
    2065        3248 :                         buf[nr++] = bytenr;
    2066             :         }
    2067             : 
    2068       92328 :         *logical = buf;
    2069       92328 :         *naddrs = nr;
    2070       92328 :         *stripe_len = io_stripe_size;
    2071       92328 : out:
    2072       92328 :         free_extent_map(em);
    2073       92328 :         return ret;
    2074             : }
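
A worked example of the stripe arithmetic above, assuming the 64KiB BTRFS_STRIPE_LEN and a single-stripe (SINGLE/DUP-style) chunk; the chunk and physical offsets are made up purely for illustration:

    #include <stdio.h>
    #include <stdint.h>

    #define STRIPE_LEN        (64 * 1024ULL)   /* BTRFS_STRIPE_LEN is 64KiB */
    #define STRIPE_LEN_SHIFT  16
    #define STRIPE_LEN_MASK   (STRIPE_LEN - 1)

    int main(void)
    {
            /* Hypothetical chunk: logical start 1GiB, one stripe at physical 3GiB. */
            uint64_t chunk_start     = 1ULL << 30;
            uint64_t stripe_physical = 3ULL << 30;
            uint64_t physical        = stripe_physical + 5 * STRIPE_LEN + 4096;

            /* Same math as btrfs_rmap_block() for a non-striped profile. */
            uint64_t stripe_nr = (physical - stripe_physical) >> STRIPE_LEN_SHIFT;
            uint64_t offset    = (physical - stripe_physical) & STRIPE_LEN_MASK;
            uint64_t bytenr    = chunk_start + stripe_nr * STRIPE_LEN + offset;

            /* Expect logical 1GiB + 5 * 64KiB + 4KiB. */
            printf("logical bytenr: %llu\n", (unsigned long long)bytenr);
            return 0;
    }
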
    2075             : 
    2076       30776 : static int exclude_super_stripes(struct btrfs_block_group *cache)
    2077             : {
    2078       30776 :         struct btrfs_fs_info *fs_info = cache->fs_info;
    2079       30776 :         const bool zoned = btrfs_is_zoned(fs_info);
    2080       30776 :         u64 bytenr;
    2081       30776 :         u64 *logical;
    2082       30776 :         int stripe_len;
    2083       30776 :         int i, nr, ret;
    2084             : 
    2085       30776 :         if (cache->start < BTRFS_SUPER_INFO_OFFSET) {
    2086           0 :                 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start;
    2087           0 :                 cache->bytes_super += stripe_len;
    2088           0 :                 ret = btrfs_add_excluded_extent(fs_info, cache->start,
    2089             :                                                 stripe_len);
    2090           0 :                 if (ret)
    2091             :                         return ret;
    2092             :         }
    2093             : 
    2094      123104 :         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
    2095       92328 :                 bytenr = btrfs_sb_offset(i);
    2096       92328 :                 ret = btrfs_rmap_block(fs_info, cache->start,
    2097             :                                        bytenr, &logical, &nr, &stripe_len);
    2098       92328 :                 if (ret)
    2099           0 :                         return ret;
    2100             : 
    2101             :                 /* Shouldn't have super stripes in sequential zones */
    2102             :                 if (zoned && nr) {
    2103             :                         kfree(logical);
    2104             :                         btrfs_err(fs_info,
    2105             :                         "zoned: block group %llu must not contain super block",
    2106             :                                   cache->start);
    2107             :                         return -EUCLEAN;
    2108             :                 }
    2109             : 
    2110       95576 :                 while (nr--) {
    2111        3248 :                         u64 len = min_t(u64, stripe_len,
    2112             :                                 cache->start + cache->length - logical[nr]);
    2113             : 
    2114        3248 :                         cache->bytes_super += len;
    2115        3248 :                         ret = btrfs_add_excluded_extent(fs_info, logical[nr],
    2116             :                                                         len);
    2117        3248 :                         if (ret) {
    2118           0 :                                 kfree(logical);
    2119           0 :                                 return ret;
    2120             :                         }
    2121             :                 }
    2122             : 
    2123       92328 :                 kfree(logical);
    2124             :         }
    2125             :         return 0;
    2126             : }
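
exclude_super_stripes() probes the BTRFS_SUPER_MIRROR_MAX (3) possible superblock locations via btrfs_sb_offset(); those sit at fixed offsets of 64KiB, 64MiB and 256GiB, which is why most block groups end up excluding nothing at all. A tiny sketch of that offset progression, with the constants restated locally rather than pulled from the kernel headers:

    #include <stdio.h>
    #include <stdint.h>

    #define SUPER_INFO_OFFSET   (64 * 1024ULL)   /* primary superblock at 64KiB */
    #define SUPER_MIRROR_MAX    3
    #define SUPER_MIRROR_SHIFT  12

    /* Mirrors the semantics of btrfs_sb_offset(): 64KiB, 64MiB, 256GiB. */
    static uint64_t sb_offset(int mirror)
    {
            if (mirror)
                    return (16 * 1024ULL) << (SUPER_MIRROR_SHIFT * mirror);
            return SUPER_INFO_OFFSET;
    }

    int main(void)
    {
            for (int i = 0; i < SUPER_MIRROR_MAX; i++)
                    printf("super block copy %d at byte offset %llu\n",
                           i, (unsigned long long)sb_offset(i));
            return 0;
    }
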
    2127             : 
    2128       30776 : static struct btrfs_block_group *btrfs_create_block_group_cache(
    2129             :                 struct btrfs_fs_info *fs_info, u64 start)
    2130             : {
    2131       30776 :         struct btrfs_block_group *cache;
    2132             : 
    2133       30776 :         cache = kzalloc(sizeof(*cache), GFP_NOFS);
    2134       30776 :         if (!cache)
    2135             :                 return NULL;
    2136             : 
    2137       30776 :         cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
    2138             :                                         GFP_NOFS);
    2139       30776 :         if (!cache->free_space_ctl) {
    2140           0 :                 kfree(cache);
    2141           0 :                 return NULL;
    2142             :         }
    2143             : 
    2144       30776 :         cache->start = start;
    2145             : 
    2146       30776 :         cache->fs_info = fs_info;
    2147       30776 :         cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
    2148             : 
    2149       30776 :         cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
    2150             : 
    2151       30776 :         refcount_set(&cache->refs, 1);
    2152       30776 :         spin_lock_init(&cache->lock);
    2153       30776 :         init_rwsem(&cache->data_rwsem);
    2154       30776 :         INIT_LIST_HEAD(&cache->list);
    2155       30776 :         INIT_LIST_HEAD(&cache->cluster_list);
    2156       30776 :         INIT_LIST_HEAD(&cache->bg_list);
    2157       30776 :         INIT_LIST_HEAD(&cache->ro_list);
    2158       30776 :         INIT_LIST_HEAD(&cache->discard_list);
    2159       30776 :         INIT_LIST_HEAD(&cache->dirty_list);
    2160       30776 :         INIT_LIST_HEAD(&cache->io_list);
    2161       30776 :         INIT_LIST_HEAD(&cache->active_bg_list);
    2162       30776 :         btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
    2163       30776 :         atomic_set(&cache->frozen, 0);
    2164       30776 :         mutex_init(&cache->free_space_lock);
    2165             : 
    2166       30776 :         return cache;
    2167             : }
    2168             : 
    2169             : /*
    2170             :  * Iterate all chunks and verify that each of them has the corresponding block
    2171             :  * group
    2172             :  */
    2173        3217 : static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
    2174             : {
    2175        3217 :         struct extent_map_tree *map_tree = &fs_info->mapping_tree;
    2176        3217 :         struct extent_map *em;
    2177        3217 :         struct btrfs_block_group *bg;
    2178        3217 :         u64 start = 0;
    2179        3217 :         int ret = 0;
    2180             : 
    2181        3217 :         while (1) {
    2182        3217 :                 read_lock(&map_tree->lock);
    2183             :                 /*
    2184             :                  * lookup_extent_mapping will return the first extent map
    2185             :                  * intersecting the range, so setting @len to 1 is enough to
    2186             :                  * get the first chunk.
    2187             :                  */
    2188        3217 :                 em = lookup_extent_mapping(map_tree, start, 1);
    2189        3217 :                 read_unlock(&map_tree->lock);
    2190        3217 :                 if (!em)
    2191             :                         break;
    2192             : 
    2193           0 :                 bg = btrfs_lookup_block_group(fs_info, em->start);
    2194           0 :                 if (!bg) {
    2195           0 :                         btrfs_err(fs_info,
    2196             :         "chunk start=%llu len=%llu doesn't have corresponding block group",
    2197             :                                      em->start, em->len);
    2198           0 :                         ret = -EUCLEAN;
    2199           0 :                         free_extent_map(em);
    2200           0 :                         break;
    2201             :                 }
    2202           0 :                 if (bg->start != em->start || bg->length != em->len ||
    2203           0 :                     (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
    2204           0 :                     (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
    2205           0 :                         btrfs_err(fs_info,
    2206             : "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
    2207             :                                 em->start, em->len,
    2208             :                                 em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
    2209             :                                 bg->start, bg->length,
    2210             :                                 bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
    2211           0 :                         ret = -EUCLEAN;
    2212           0 :                         free_extent_map(em);
    2213           0 :                         btrfs_put_block_group(bg);
    2214           0 :                         break;
    2215             :                 }
    2216           0 :                 start = em->start + em->len;
    2217           0 :                 free_extent_map(em);
    2218           0 :                 btrfs_put_block_group(bg);
    2219             :         }
    2220        3217 :         return ret;
    2221             : }
    2222             : 
    2223       29306 : static int read_one_block_group(struct btrfs_fs_info *info,
    2224             :                                 struct btrfs_block_group_item *bgi,
    2225             :                                 const struct btrfs_key *key,
    2226             :                                 int need_clear)
    2227             : {
    2228       29306 :         struct btrfs_block_group *cache;
    2229       29306 :         const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS);
    2230       29306 :         int ret;
    2231             : 
    2232       29306 :         ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY);
    2233             : 
    2234       29306 :         cache = btrfs_create_block_group_cache(info, key->objectid);
    2235       29306 :         if (!cache)
    2236             :                 return -ENOMEM;
    2237             : 
    2238       29306 :         cache->length = key->offset;
    2239       29306 :         cache->used = btrfs_stack_block_group_used(bgi);
    2240       29306 :         cache->commit_used = cache->used;
    2241       29306 :         cache->flags = btrfs_stack_block_group_flags(bgi);
    2242       29306 :         cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi);
    2243             : 
    2244       29306 :         set_free_space_tree_thresholds(cache);
    2245             : 
    2246       29306 :         if (need_clear) {
    2247             :                 /*
    2248             :                  * When we mount with an old space cache, we need to
    2249             :                  * set BTRFS_DC_CLEAR and set the dirty flag.
    2250             :                  *
    2251             :                  * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
    2252             :                  *    truncate the old free space cache inode and
    2253             :                  *    setup a new one.
    2254             :                  * b) Setting 'dirty flag' makes sure that we flush
    2255             :                  *    the new space cache info onto disk.
    2256             :                  */
    2257         361 :                 if (btrfs_test_opt(info, SPACE_CACHE))
    2258         343 :                         cache->disk_cache_state = BTRFS_DC_CLEAR;
    2259             :         }
    2260       29306 :         if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
    2261             :             (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
    2262           0 :                         btrfs_err(info,
    2263             : "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
    2264             :                                   cache->start);
    2265           0 :                         ret = -EINVAL;
    2266           0 :                         goto error;
    2267             :         }
    2268             : 
    2269       29306 :         ret = btrfs_load_block_group_zone_info(cache, false);
    2270       29306 :         if (ret) {
    2271             :                 btrfs_err(info, "zoned: failed to load zone info of bg %llu",
    2272             :                           cache->start);
    2273             :                 goto error;
    2274             :         }
    2275             : 
    2276             :         /*
    2277             :          * We need to exclude the super stripes now so that the space info has
    2278             :          * super bytes accounted for, otherwise we'll think we have more space
    2279             :          * than we actually do.
    2280             :          */
    2281       29306 :         ret = exclude_super_stripes(cache);
    2282       29306 :         if (ret) {
    2283             :                 /* We may have excluded something, so call this just in case. */
    2284           0 :                 btrfs_free_excluded_extents(cache);
    2285           0 :                 goto error;
    2286             :         }
    2287             : 
    2288             :         /*
    2289             :          * For a zoned filesystem, space after the allocation offset is the only
    2290             :          * free space for a block group. So, we don't need any caching work.
    2291             :          * btrfs_calc_zone_unusable() will set the amount of free space and
    2292             :          * zone_unusable space.
    2293             :          *
    2294             :          * For a regular filesystem, check for two cases: either we are full, and
    2295             :          * therefore don't need to bother with the caching work since we won't
    2296             :          * find any space, or we are empty, and we can just add all the space
    2297             :          * in and be done with it.  This saves us _a_lot_ of time, particularly
    2298             :          * in the full case.
    2299             :          */
    2300       29306 :         if (btrfs_is_zoned(info)) {
    2301             :                 btrfs_calc_zone_unusable(cache);
    2302             :                 /* Should not have any excluded extents. Just in case, though. */
    2303             :                 btrfs_free_excluded_extents(cache);
    2304       29306 :         } else if (cache->length == cache->used) {
    2305         414 :                 cache->cached = BTRFS_CACHE_FINISHED;
    2306         414 :                 btrfs_free_excluded_extents(cache);
    2307       28892 :         } else if (cache->used == 0) {
    2308       18670 :                 cache->cached = BTRFS_CACHE_FINISHED;
    2309       18670 :                 ret = add_new_free_space(cache, cache->start,
    2310       18670 :                                          cache->start + cache->length, NULL);
    2311       18670 :                 btrfs_free_excluded_extents(cache);
    2312       18670 :                 if (ret)
    2313           0 :                         goto error;
    2314             :         }
    2315             : 
    2316       29306 :         ret = btrfs_add_block_group_cache(info, cache);
    2317       29306 :         if (ret) {
    2318           0 :                 btrfs_remove_free_space_cache(cache);
    2319           0 :                 goto error;
    2320             :         }
    2321       29306 :         trace_btrfs_add_block_group(info, cache, 0);
    2322       29306 :         btrfs_add_bg_to_space_info(info, cache);
    2323             : 
    2324       29306 :         set_avail_alloc_bits(info, cache->flags);
    2325       29306 :         if (btrfs_chunk_writeable(info, cache->start)) {
    2326       29306 :                 if (cache->used == 0) {
    2327       18670 :                         ASSERT(list_empty(&cache->bg_list));
    2328       18670 :                         if (btrfs_test_opt(info, DISCARD_ASYNC))
    2329           1 :                                 btrfs_discard_queue_work(&info->discard_ctl, cache);
    2330             :                         else
    2331       18669 :                                 btrfs_mark_bg_unused(cache);
    2332             :                 }
    2333             :         } else {
    2334           0 :                 inc_block_group_ro(cache, 1);
    2335             :         }
    2336             : 
    2337             :         return 0;
    2338           0 : error:
    2339           0 :         btrfs_put_block_group(cache);
    2340           0 :         return ret;
    2341             : }
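
The full/empty shortcut described in the comment above reduces, for the regular (non-zoned) case, to a three-way decision on cache->used versus cache->length. A compact sketch of just that decision (names hypothetical, not btrfs API):

    #include <stdio.h>
    #include <stdint.h>

    enum cache_action { NEEDS_CACHING, MARK_FINISHED_FULL, MARK_FINISHED_EMPTY };

    /* The regular (non-zoned) decision made by read_one_block_group(), simplified. */
    static enum cache_action caching_shortcut(uint64_t length, uint64_t used)
    {
            if (used == length)
                    return MARK_FINISHED_FULL;   /* nothing free, skip caching entirely */
            if (used == 0)
                    return MARK_FINISHED_EMPTY;  /* everything free, add it all up front */
            return NEEDS_CACHING;                /* partially used, cache lazily later */
    }

    int main(void)
    {
            printf("%d %d %d\n",
                   caching_shortcut(1 << 30, 1 << 30),
                   caching_shortcut(1 << 30, 0),
                   caching_shortcut(1 << 30, 123456));
            return 0;
    }
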
    2342             : 
    2343           0 : static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
    2344             : {
    2345           0 :         struct extent_map_tree *em_tree = &fs_info->mapping_tree;
    2346           0 :         struct rb_node *node;
    2347           0 :         int ret = 0;
    2348             : 
    2349           0 :         for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
    2350           0 :                 struct extent_map *em;
    2351           0 :                 struct map_lookup *map;
    2352           0 :                 struct btrfs_block_group *bg;
    2353             : 
    2354           0 :                 em = rb_entry(node, struct extent_map, rb_node);
    2355           0 :                 map = em->map_lookup;
    2356           0 :                 bg = btrfs_create_block_group_cache(fs_info, em->start);
    2357           0 :                 if (!bg) {
    2358             :                         ret = -ENOMEM;
    2359             :                         break;
    2360             :                 }
    2361             : 
    2362             :                 /* Fill dummy cache as FULL */
    2363           0 :                 bg->length = em->len;
    2364           0 :                 bg->flags = map->type;
    2365           0 :                 bg->cached = BTRFS_CACHE_FINISHED;
    2366           0 :                 bg->used = em->len;
    2368           0 :                 ret = btrfs_add_block_group_cache(fs_info, bg);
    2369             :                 /*
    2370             :                  * We may have some valid block group cache added already, in
    2371             :                  * that case we skip to the next one.
    2372             :                  */
    2373           0 :                 if (ret == -EEXIST) {
    2374           0 :                         ret = 0;
    2375           0 :                         btrfs_put_block_group(bg);
    2376           0 :                         continue;
    2377             :                 }
    2378             : 
    2379           0 :                 if (ret) {
    2380           0 :                         btrfs_remove_free_space_cache(bg);
    2381           0 :                         btrfs_put_block_group(bg);
    2382           0 :                         break;
    2383             :                 }
    2384             : 
    2385           0 :                 btrfs_add_bg_to_space_info(fs_info, bg);
    2386             : 
    2387           0 :                 set_avail_alloc_bits(fs_info, bg->flags);
    2388             :         }
    2389           0 :         if (!ret)
    2390           0 :                 btrfs_init_global_block_rsv(fs_info);
    2391           0 :         return ret;
    2392             : }
    2393             : 
    2394        3217 : int btrfs_read_block_groups(struct btrfs_fs_info *info)
    2395             : {
    2396        3217 :         struct btrfs_root *root = btrfs_block_group_root(info);
    2397        3217 :         struct btrfs_path *path;
    2398        3217 :         int ret;
    2399        3217 :         struct btrfs_block_group *cache;
    2400        3217 :         struct btrfs_space_info *space_info;
    2401        3217 :         struct btrfs_key key;
    2402        3217 :         int need_clear = 0;
    2403        3217 :         u64 cache_gen;
    2404             : 
    2405             :         /*
    2406             :          * Either no extent root (with ibadroots rescue option) or we have
    2407             :          * unsupported RO options. The fs can never be mounted read-write, so no
    2408             :          * need to waste time searching block group items.
    2409             :          *
    2410             :          * This also allows new extent tree related changes to be RO compat,
    2411             :          * no need for a full incompat flag.
    2412             :          */
    2413        3217 :         if (!root || (btrfs_super_compat_ro_flags(info->super_copy) &
    2414             :                       ~BTRFS_FEATURE_COMPAT_RO_SUPP))
    2415           0 :                 return fill_dummy_bgs(info);
    2416             : 
    2417        3217 :         key.objectid = 0;
    2418        3217 :         key.offset = 0;
    2419        3217 :         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
    2420        3217 :         path = btrfs_alloc_path();
    2421        3217 :         if (!path)
    2422             :                 return -ENOMEM;
    2423             : 
    2424        3217 :         cache_gen = btrfs_super_cache_generation(info->super_copy);
    2425        3217 :         if (btrfs_test_opt(info, SPACE_CACHE) &&
    2426             :             btrfs_super_generation(info->super_copy) != cache_gen)
    2427           7 :                 need_clear = 1;
    2428        3217 :         if (btrfs_test_opt(info, CLEAR_CACHE))
    2429          10 :                 need_clear = 1;
    2430             : 
    2431       61829 :         while (1) {
    2432       32523 :                 struct btrfs_block_group_item bgi;
    2433       32523 :                 struct extent_buffer *leaf;
    2434       32523 :                 int slot;
    2435             : 
    2436       32523 :                 ret = find_first_block_group(info, path, &key);
    2437       32523 :                 if (ret > 0)
    2438             :                         break;
    2439       29306 :                 if (ret != 0)
    2440           0 :                         goto error;
    2441             : 
    2442       29306 :                 leaf = path->nodes[0];
    2443       29306 :                 slot = path->slots[0];
    2444             : 
    2445       29306 :                 read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
    2446             :                                    sizeof(bgi));
    2447             : 
    2448       29306 :                 btrfs_item_key_to_cpu(leaf, &key, slot);
    2449       29306 :                 btrfs_release_path(path);
    2450       29306 :                 ret = read_one_block_group(info, &bgi, &key, need_clear);
    2451       29306 :                 if (ret < 0)
    2452           0 :                         goto error;
    2453       29306 :                 key.objectid += key.offset;
    2454       29306 :                 key.offset = 0;
    2455             :         }
    2456        3217 :         btrfs_release_path(path);
    2457             : 
    2458       12838 :         list_for_each_entry(space_info, &info->space_info, list) {
    2459             :                 int i;
    2460             : 
    2461       96210 :                 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
    2462       86589 :                         if (list_empty(&space_info->block_groups[i]))
    2463       76968 :                                 continue;
    2464        9621 :                         cache = list_first_entry(&space_info->block_groups[i],
    2465             :                                                  struct btrfs_block_group,
    2466             :                                                  list);
    2467        9621 :                         btrfs_sysfs_add_block_group_type(cache);
    2468             :                 }
    2469             : 
    2470        9621 :                 if (!(btrfs_get_alloc_profile(info, space_info->flags) &
    2471             :                       (BTRFS_BLOCK_GROUP_RAID10 |
    2472             :                        BTRFS_BLOCK_GROUP_RAID1_MASK |
    2473             :                        BTRFS_BLOCK_GROUP_RAID56_MASK |
    2474             :                        BTRFS_BLOCK_GROUP_DUP)))
    2475        3259 :                         continue;
    2476             :                 /*
    2477             :                  * Avoid allocating from un-mirrored block groups if there are
    2478             :                  * mirrored block groups.
    2479             :                  */
    2480        6362 :                 list_for_each_entry(cache,
    2481             :                                 &space_info->block_groups[BTRFS_RAID_RAID0],
    2482             :                                 list)
    2483           0 :                         inc_block_group_ro(cache, 1);
    2484        6362 :                 list_for_each_entry(cache,
    2485             :                                 &space_info->block_groups[BTRFS_RAID_SINGLE],
    2486             :                                 list)
    2487           0 :                         inc_block_group_ro(cache, 1);
    2488             :         }
    2489             : 
    2490        3217 :         btrfs_init_global_block_rsv(info);
    2491        3217 :         ret = check_chunk_block_group_mappings(info);
    2492        3217 : error:
    2493        3217 :         btrfs_free_path(path);
    2494             :         /*
    2495             :          * We've hit some error while reading the extent tree, and have the
    2496             :          * rescue=ibadroots mount option.
    2497             :          * Try to fill the tree using dummy block groups so that the user can
    2498             :          * continue to mount and grab their data.
    2499             :          */
    2500        3217 :         if (ret && btrfs_test_opt(info, IGNOREBADROOTS))
    2501           0 :                 ret = fill_dummy_bgs(info);
    2502             :         return ret;
    2503             : }
    2504             : 
    2505             : /*
    2506             :  * This function, insert_block_group_item(), belongs to the phase 2 of chunk
    2507             :  * allocation.
    2508             :  *
    2509             :  * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
    2510             :  * phases.
    2511             :  */
    2512        1470 : static int insert_block_group_item(struct btrfs_trans_handle *trans,
    2513             :                                    struct btrfs_block_group *block_group)
    2514             : {
    2515        1470 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    2516        1470 :         struct btrfs_block_group_item bgi;
    2517        1470 :         struct btrfs_root *root = btrfs_block_group_root(fs_info);
    2518        1470 :         struct btrfs_key key;
    2519        1470 :         u64 old_commit_used;
    2520        1470 :         int ret;
    2521             : 
    2522        1470 :         spin_lock(&block_group->lock);
    2523        1470 :         btrfs_set_stack_block_group_used(&bgi, block_group->used);
    2524        1470 :         btrfs_set_stack_block_group_chunk_objectid(&bgi,
    2525             :                                                    block_group->global_root_id);
    2526        1470 :         btrfs_set_stack_block_group_flags(&bgi, block_group->flags);
    2527        1470 :         old_commit_used = block_group->commit_used;
    2528        1470 :         block_group->commit_used = block_group->used;
    2529        1470 :         key.objectid = block_group->start;
    2530        1470 :         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
    2531        1470 :         key.offset = block_group->length;
    2532        1470 :         spin_unlock(&block_group->lock);
    2533             : 
    2534        1470 :         ret = btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
    2535        1470 :         if (ret < 0) {
    2536           0 :                 spin_lock(&block_group->lock);
    2537           0 :                 block_group->commit_used = old_commit_used;
    2538           0 :                 spin_unlock(&block_group->lock);
    2539             :         }
    2540             : 
    2541        1470 :         return ret;
    2542             : }
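
insert_block_group_item() snapshots commit_used under the lock before optimistically updating it, so the old value can be restored if btrfs_insert_item() fails. A minimal sketch of that snapshot-and-roll-back-on-error pattern, with a simulated failure standing in for the real tree insertion:

    #include <stdio.h>
    #include <stdint.h>

    struct group { uint64_t used, commit_used; };

    /* Returns 0 on success, negative on failure (simulated by 'fail'). */
    static int commit_usage(struct group *g, int fail)
    {
            uint64_t old_commit_used = g->commit_used;

            g->commit_used = g->used;              /* optimistic update before the insert */
            if (fail) {
                    g->commit_used = old_commit_used;  /* roll back on error */
                    return -1;
            }
            return 0;
    }

    int main(void)
    {
            struct group g = { .used = 4096, .commit_used = 0 };

            commit_usage(&g, 1);
            printf("after failed insert: %llu\n", (unsigned long long)g.commit_used);
            commit_usage(&g, 0);
            printf("after good insert:   %llu\n", (unsigned long long)g.commit_used);
            return 0;
    }
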
    2543             : 
    2544        1800 : static int insert_dev_extent(struct btrfs_trans_handle *trans,
    2545             :                             struct btrfs_device *device, u64 chunk_offset,
    2546             :                             u64 start, u64 num_bytes)
    2547             : {
    2548        1800 :         struct btrfs_fs_info *fs_info = device->fs_info;
    2549        1800 :         struct btrfs_root *root = fs_info->dev_root;
    2550        1800 :         struct btrfs_path *path;
    2551        1800 :         struct btrfs_dev_extent *extent;
    2552        1800 :         struct extent_buffer *leaf;
    2553        1800 :         struct btrfs_key key;
    2554        1800 :         int ret;
    2555             : 
    2556        1800 :         WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
    2557        1800 :         WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
    2558        1800 :         path = btrfs_alloc_path();
    2559        1800 :         if (!path)
    2560             :                 return -ENOMEM;
    2561             : 
    2562        1800 :         key.objectid = device->devid;
    2563        1800 :         key.type = BTRFS_DEV_EXTENT_KEY;
    2564        1800 :         key.offset = start;
    2565        1800 :         ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent));
    2566        1800 :         if (ret)
    2567           0 :                 goto out;
    2568             : 
    2569        1800 :         leaf = path->nodes[0];
    2570        1800 :         extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
    2571        1800 :         btrfs_set_dev_extent_chunk_tree(leaf, extent, BTRFS_CHUNK_TREE_OBJECTID);
    2572        1800 :         btrfs_set_dev_extent_chunk_objectid(leaf, extent,
    2573             :                                             BTRFS_FIRST_CHUNK_TREE_OBJECTID);
    2574        1800 :         btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
    2575             : 
    2576        1800 :         btrfs_set_dev_extent_length(leaf, extent, num_bytes);
    2577        1800 :         btrfs_mark_buffer_dirty(leaf);
    2578        1800 : out:
    2579        1800 :         btrfs_free_path(path);
    2580        1800 :         return ret;
    2581             : }
    2582             : 
    2583             : /*
    2584             :  * This function belongs to phase 2.
    2585             :  *
    2586             :  * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
    2587             :  * phases.
    2588             :  */
    2589        1470 : static int insert_dev_extents(struct btrfs_trans_handle *trans,
    2590             :                                    u64 chunk_offset, u64 chunk_size)
    2591             : {
    2592        1470 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    2593        1470 :         struct btrfs_device *device;
    2594        1470 :         struct extent_map *em;
    2595        1470 :         struct map_lookup *map;
    2596        1470 :         u64 dev_offset;
    2597        1470 :         u64 stripe_size;
    2598        1470 :         int i;
    2599        1470 :         int ret = 0;
    2600             : 
    2601        1470 :         em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
    2602        1470 :         if (IS_ERR(em))
    2603           0 :                 return PTR_ERR(em);
    2604             : 
    2605        1470 :         map = em->map_lookup;
    2606        1470 :         stripe_size = em->orig_block_len;
    2607             : 
    2608             :         /*
    2609             :          * Take the device list mutex to prevent races with the final phase of
    2610             :          * a device replace operation that replaces the device object associated
    2611             :          * with the map's stripes, because the device object's id can change
    2612             :          * at any time during that final phase of the device replace operation
    2613             :          * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
    2614             :          * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
    2615             :          * resulting in persisting a device extent item with such ID.
    2616             :          */
    2617        1470 :         mutex_lock(&fs_info->fs_devices->device_list_mutex);
    2618        4740 :         for (i = 0; i < map->num_stripes; i++) {
    2619        1800 :                 device = map->stripes[i].dev;
    2620        1800 :                 dev_offset = map->stripes[i].physical;
    2621             : 
    2622        1800 :                 ret = insert_dev_extent(trans, device, chunk_offset, dev_offset,
    2623             :                                        stripe_size);
    2624        1800 :                 if (ret)
    2625             :                         break;
    2626             :         }
    2627        1470 :         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
    2628             : 
    2629        1470 :         free_extent_map(em);
    2630        1470 :         return ret;
    2631             : }
    2632             : 
    2633             : /*
    2634             :  * This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of
    2635             :  * chunk allocation.
    2636             :  *
    2637             :  * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
    2638             :  * phases.
    2639             :  */
    2640    56660928 : void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
    2641             : {
    2642    56660928 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    2643    56660928 :         struct btrfs_block_group *block_group;
    2644    56660928 :         int ret = 0;
    2645             : 
    2646    56660631 :         while (!list_empty(&trans->new_bgs)) {
    2647        1470 :                 int index;
    2648             : 
    2649        1470 :                 block_group = list_first_entry(&trans->new_bgs,
    2650             :                                                struct btrfs_block_group,
    2651             :                                                bg_list);
    2652        1470 :                 if (ret)
    2653           0 :                         goto next;
    2654             : 
    2655        1470 :                 index = btrfs_bg_flags_to_raid_index(block_group->flags);
    2656             : 
    2657        1470 :                 ret = insert_block_group_item(trans, block_group);
    2658        1470 :                 if (ret)
    2659           0 :                         btrfs_abort_transaction(trans, ret);
    2660        1470 :                 if (!test_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED,
    2661             :                               &block_group->runtime_flags)) {
    2662           0 :                         mutex_lock(&fs_info->chunk_mutex);
    2663           0 :                         ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group);
    2664           0 :                         mutex_unlock(&fs_info->chunk_mutex);
    2665           0 :                         if (ret)
    2666           0 :                                 btrfs_abort_transaction(trans, ret);
    2667             :                 }
    2668        1470 :                 ret = insert_dev_extents(trans, block_group->start,
    2669             :                                          block_group->length);
    2670        1470 :                 if (ret)
    2671           0 :                         btrfs_abort_transaction(trans, ret);
    2672        1470 :                 add_block_group_free_space(trans, block_group);
    2673             : 
    2674             :                 /*
    2675             :                  * If we restriped during balance, we may have added a new raid
    2676             :                  * type, so now add the sysfs entries when it is safe to do so.
    2677             :                  * We don't have to worry about locking here as it's handled in
    2678             :                  * btrfs_sysfs_add_block_group_type.
    2679             :                  */
    2680        1470 :                 if (block_group->space_info->block_group_kobjs[index] == NULL)
    2681           0 :                         btrfs_sysfs_add_block_group_type(block_group);
    2682             : 
    2683             :                 /* Already aborted the transaction if it failed. */
    2684        1470 : next:
    2685        1470 :                 btrfs_delayed_refs_rsv_release(fs_info, 1);
    2686        1470 :                 list_del_init(&block_group->bg_list);
    2687        1470 :                 clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags);
    2688             :         }
    2689    56659161 :         btrfs_trans_release_chunk_metadata(trans);
    2690    56656539 : }
    2691             : 
    2692             : /*
    2693             :  * For extent tree v2 we use the block_group_item->chunk_offset to point at our
    2694             :  * global root id.  For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID.
    2695             :  */
    2696             : static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset)
    2697             : {
    2698        1470 :         u64 div = SZ_1G;
    2699        1470 :         u64 index;
    2700             : 
    2701        1470 :         if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
    2702             :                 return BTRFS_FIRST_CHUNK_TREE_OBJECTID;
    2703             : 
    2704             :         /* If we have a smaller fs, base the index on 128MiB instead of 1GiB. */
    2705           0 :         if (btrfs_super_total_bytes(fs_info->super_copy) <= (SZ_1G * 10ULL))
    2706           0 :                 div = SZ_128M;
    2707             : 
    2708           0 :         offset = div64_u64(offset, div);
    2709           0 :         div64_u64_rem(offset, fs_info->nr_global_roots, &index);
    2710           0 :         return index;
    2711             : }
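
A worked example of the arithmetic in calculate_global_root_id() for the extent-tree-v2 case, with made-up numbers: a 1TiB filesystem keeps the 1GiB divisor, so a chunk at logical offset 5GiB with 4 global roots lands on index 5 % 4 = 1.

    #include <stdio.h>
    #include <stdint.h>

    #define SZ_1G   (1024ULL * 1024 * 1024)
    #define SZ_128M (128ULL * 1024 * 1024)

    /* Same arithmetic as calculate_global_root_id() in the extent tree v2 case. */
    static uint64_t global_root_id(uint64_t total_bytes, uint64_t nr_global_roots,
                                   uint64_t offset)
    {
            uint64_t div = SZ_1G;

            if (total_bytes <= SZ_1G * 10)   /* small fs: index in 128MiB units */
                    div = SZ_128M;

            return (offset / div) % nr_global_roots;
    }

    int main(void)
    {
            /* Hypothetical 1TiB fs, 4 global roots, chunk at logical 5GiB -> index 1. */
            printf("%llu\n", (unsigned long long)
                   global_root_id(1024 * SZ_1G, 4, 5 * SZ_1G));
            return 0;
    }
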
    2712             : 
    2713        1470 : struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
    2714             :                                                  u64 type,
    2715             :                                                  u64 chunk_offset, u64 size)
    2716             : {
    2717        1470 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    2718        1470 :         struct btrfs_block_group *cache;
    2719        1470 :         int ret;
    2720             : 
    2721        1470 :         btrfs_set_log_full_commit(trans);
    2722             : 
    2723        1470 :         cache = btrfs_create_block_group_cache(fs_info, chunk_offset);
    2724        1470 :         if (!cache)
    2725             :                 return ERR_PTR(-ENOMEM);
    2726             : 
    2727             :         /*
    2728             :          * Mark it as new before adding it to the rbtree of block groups or any
    2729             :          * list, so that no other task finds it and calls btrfs_mark_bg_unused()
    2730             :          * before the new flag is set.
    2731             :          */
    2732        1470 :         set_bit(BLOCK_GROUP_FLAG_NEW, &cache->runtime_flags);
    2733             : 
    2734        1470 :         cache->length = size;
    2735        1470 :         set_free_space_tree_thresholds(cache);
    2736        1470 :         cache->flags = type;
    2737        1470 :         cache->cached = BTRFS_CACHE_FINISHED;
    2738        1470 :         cache->global_root_id = calculate_global_root_id(fs_info, cache->start);
    2739             : 
    2740        1470 :         if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
    2741        1469 :                 set_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &cache->runtime_flags);
    2742             : 
    2743        1470 :         ret = btrfs_load_block_group_zone_info(cache, true);
    2744        1470 :         if (ret) {
    2745             :                 btrfs_put_block_group(cache);
    2746             :                 return ERR_PTR(ret);
    2747             :         }
    2748             : 
    2749        1470 :         ret = exclude_super_stripes(cache);
    2750        1470 :         if (ret) {
    2751             :                 /* We may have excluded something, so call this just in case */
    2752           0 :                 btrfs_free_excluded_extents(cache);
    2753           0 :                 btrfs_put_block_group(cache);
    2754           0 :                 return ERR_PTR(ret);
    2755             :         }
    2756             : 
    2757        1470 :         ret = add_new_free_space(cache, chunk_offset, chunk_offset + size, NULL);
    2758        1470 :         btrfs_free_excluded_extents(cache);
    2759        1470 :         if (ret) {
    2760           0 :                 btrfs_put_block_group(cache);
    2761           0 :                 return ERR_PTR(ret);
    2762             :         }
    2763             : 
    2764             :         /*
    2765             :          * Ensure the corresponding space_info object is created and
    2766             :          * assigned to our block group. We want our bg to be added to the rbtree
    2767             :          * with its ->space_info set.
    2768             :          */
    2769        1470 :         cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
    2770        1470 :         ASSERT(cache->space_info);
    2771             : 
    2772        1470 :         ret = btrfs_add_block_group_cache(fs_info, cache);
    2773        1470 :         if (ret) {
    2774           0 :                 btrfs_remove_free_space_cache(cache);
    2775           0 :                 btrfs_put_block_group(cache);
    2776           0 :                 return ERR_PTR(ret);
    2777             :         }
    2778             : 
    2779             :         /*
    2780             :          * Now that our block group has its ->space_info set and is inserted in
    2781             :          * the rbtree, update the space info's counters.
    2782             :          */
    2783        1470 :         trace_btrfs_add_block_group(fs_info, cache, 1);
    2784        1470 :         btrfs_add_bg_to_space_info(fs_info, cache);
    2785        1470 :         btrfs_update_global_block_rsv(fs_info);
    2786             : 
    2787             : #ifdef CONFIG_BTRFS_DEBUG
    2788             :         if (btrfs_should_fragment_free_space(cache)) {
    2789             :                 cache->space_info->bytes_used += size >> 1;
    2790             :                 fragment_free_space(cache);
    2791             :         }
    2792             : #endif
    2793             : 
    2794        1470 :         list_add_tail(&cache->bg_list, &trans->new_bgs);
    2795        1470 :         trans->delayed_ref_updates++;
    2796        1470 :         btrfs_update_delayed_refs_rsv(trans);
    2797             : 
    2798        1470 :         set_avail_alloc_bits(fs_info, type);
    2799        1470 :         return cache;
    2800             : }
    2801             : 
    2802             : /*
    2803             :  * Mark one block group as read-only (RO); this can be called several times
    2804             :  * for the same block group.
    2805             :  *
    2806             :  * @cache:              the destination block group
    2807             :  * @do_chunk_alloc:     whether we need to do chunk pre-allocation; this is to
    2808             :  *                      ensure we still have some free space after marking this
    2809             :  *                      block group RO.
    2810             :  */
    2811         545 : int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
    2812             :                              bool do_chunk_alloc)
    2813             : {
    2814         545 :         struct btrfs_fs_info *fs_info = cache->fs_info;
    2815         545 :         struct btrfs_trans_handle *trans;
    2816         545 :         struct btrfs_root *root = btrfs_block_group_root(fs_info);
    2817         545 :         u64 alloc_flags;
    2818         545 :         int ret;
    2819         545 :         bool dirty_bg_running;
    2820             : 
    2821             :         /*
    2822             :          * This can only happen when we are doing read-only scrub on read-only
    2823             :          * mount.
    2824             :          * In that case we should not start a new transaction on read-only fs.
    2825             :          * Thus here we skip all chunk allocations.
    2826             :          */
    2827         545 :         if (sb_rdonly(fs_info->sb)) {
    2828           0 :                 mutex_lock(&fs_info->ro_block_group_mutex);
    2829           0 :                 ret = inc_block_group_ro(cache, 0);
    2830           0 :                 mutex_unlock(&fs_info->ro_block_group_mutex);
    2831           0 :                 return ret;
    2832             :         }
    2833             : 
    2834         546 :         do {
    2835         546 :                 trans = btrfs_join_transaction(root);
    2836         546 :                 if (IS_ERR(trans))
    2837           0 :                         return PTR_ERR(trans);
    2838             : 
    2839         546 :                 dirty_bg_running = false;
    2840             : 
    2841             :                 /*
    2842             :                  * We're not allowed to set block groups readonly after the dirty
    2843             :                  * block group cache has started writing.  If it already started,
    2844             :                  * back off and let this transaction commit.
    2845             :                  */
    2846         546 :                 mutex_lock(&fs_info->ro_block_group_mutex);
    2847        1092 :                 if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
    2848           1 :                         u64 transid = trans->transid;
    2849             : 
    2850           1 :                         mutex_unlock(&fs_info->ro_block_group_mutex);
    2851           1 :                         btrfs_end_transaction(trans);
    2852             : 
    2853           1 :                         ret = btrfs_wait_for_commit(fs_info, transid);
    2854           1 :                         if (ret)
    2855           0 :                                 return ret;
    2856             :                         dirty_bg_running = true;
    2857             :                 }
    2858         545 :         } while (dirty_bg_running);
    2859             : 
    2860         545 :         if (do_chunk_alloc) {
    2861             :                 /*
    2862             :                  * If we are changing raid levels, try to allocate a
    2863             :                  * corresponding block group with the new raid level.
    2864             :                  */
    2865         523 :                 alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
    2866         523 :                 if (alloc_flags != cache->flags) {
    2867           0 :                         ret = btrfs_chunk_alloc(trans, alloc_flags,
    2868             :                                                 CHUNK_ALLOC_FORCE);
    2869             :                         /*
    2870             :                          * ENOSPC is allowed here, we may have enough space
    2871             :                          * already allocated at the new raid level to carry on
    2872             :                          */
    2873           0 :                         if (ret == -ENOSPC)
    2874             :                                 ret = 0;
    2875           0 :                         if (ret < 0)
    2876           0 :                                 goto out;
    2877             :                 }
    2878             :         }
    2879             : 
    2880         545 :         ret = inc_block_group_ro(cache, 0);
    2881         545 :         if (!ret)
    2882         512 :                 goto out;
    2883          33 :         if (ret == -ETXTBSY)
    2884           0 :                 goto unlock_out;
    2885             : 
    2886             :         /*
    2887             :          * Skip chunk allocation if the bg is SYSTEM; this is to avoid a storm of
    2888             :          * system chunk allocations exhausting the system chunk array.  Otherwise
    2889             :          * we still want to try our best to mark the block group read-only.
    2890             :          */
    2891          33 :         if (!do_chunk_alloc && ret == -ENOSPC &&
    2892           0 :             (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM))
    2893           0 :                 goto unlock_out;
    2894             : 
    2895          33 :         alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags);
    2896          33 :         ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
    2897          33 :         if (ret < 0)
    2898           0 :                 goto out;
    2899             :         /*
    2900             :          * We have allocated a new chunk. We also need to activate that chunk to
    2901             :          * grant metadata tickets on a zoned filesystem.
    2902             :          */
    2903          33 :         ret = btrfs_zoned_activate_one_bg(fs_info, cache->space_info, true);
    2904          33 :         if (ret < 0)
    2905             :                 goto out;
    2906             : 
    2907          33 :         ret = inc_block_group_ro(cache, 0);
    2908          33 :         if (ret == -ETXTBSY)
    2909           0 :                 goto unlock_out;
    2910          33 : out:
    2911         545 :         if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
    2912          98 :                 alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
    2913          98 :                 mutex_lock(&fs_info->chunk_mutex);
    2914          98 :                 check_system_chunk(trans, alloc_flags);
    2915          98 :                 mutex_unlock(&fs_info->chunk_mutex);
    2916             :         }
    2917         447 : unlock_out:
    2918         545 :         mutex_unlock(&fs_info->ro_block_group_mutex);
    2919             : 
    2920         545 :         btrfs_end_transaction(trans);
    2921         545 :         return ret;
    2922             : }
    2923             : 
    2924          24 : void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
    2925             : {
    2926          24 :         struct btrfs_space_info *sinfo = cache->space_info;
    2927          24 :         u64 num_bytes;
    2928             : 
    2929          24 :         BUG_ON(!cache->ro);
    2930             : 
    2931          24 :         spin_lock(&sinfo->lock);
    2932          24 :         spin_lock(&cache->lock);
    2933          24 :         if (!--cache->ro) {
    2934          24 :                 if (btrfs_is_zoned(cache->fs_info)) {
    2935             :                         /* Migrate zone_unusable bytes back */
    2936             :                         cache->zone_unusable =
    2937             :                                 (cache->alloc_offset - cache->used) +
    2938             :                                 (cache->length - cache->zone_capacity);
    2939             :                         sinfo->bytes_zone_unusable += cache->zone_unusable;
    2940             :                         sinfo->bytes_readonly -= cache->zone_unusable;
    2941             :                 }
    2942          24 :                 num_bytes = cache->length - cache->reserved -
    2943          24 :                             cache->pinned - cache->bytes_super -
    2944          24 :                             cache->zone_unusable - cache->used;
    2945          24 :                 sinfo->bytes_readonly -= num_bytes;
    2946          24 :                 list_del_init(&cache->ro_list);
    2947             :         }
    2948          24 :         spin_unlock(&cache->lock);
    2949          24 :         spin_unlock(&sinfo->lock);
    2950          24 : }
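                     : /*
                     :  * Illustrative sketch: callers of the two helpers above, such as scrub or
                     :  * relocation, typically pair them around work that must not race with new
                     :  * allocations in the block group, e.g.:
                     :  *
                     :  *      ret = btrfs_inc_block_group_ro(cache, true);
                     :  *      if (ret)
                     :  *              return ret;
                     :  *      ... work on the now read-only block group ...
                     :  *      btrfs_dec_block_group_ro(cache);
                     :  */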
    2951             : 
    2952      543996 : static int update_block_group_item(struct btrfs_trans_handle *trans,
    2953             :                                    struct btrfs_path *path,
    2954             :                                    struct btrfs_block_group *cache)
    2955             : {
    2956      543996 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    2957      543996 :         int ret;
    2958      543996 :         struct btrfs_root *root = btrfs_block_group_root(fs_info);
    2959      543996 :         unsigned long bi;
    2960      543996 :         struct extent_buffer *leaf;
    2961      543996 :         struct btrfs_block_group_item bgi;
    2962      543996 :         struct btrfs_key key;
    2963      543996 :         u64 old_commit_used;
    2964      543996 :         u64 used;
    2965             : 
    2966             :         /*
    2967             :          * Block group item updates can be triggered outside of the commit
    2968             :          * transaction critical section, thus we need a consistent view of used bytes.
    2969             :          * We cannot use cache->used directly outside of the spin lock, as it
    2970             :          * may be changed.
    2971             :          */
    2972      543996 :         spin_lock(&cache->lock);
    2973      543996 :         old_commit_used = cache->commit_used;
    2974      543996 :         used = cache->used;
    2975             :         /* No change in used bytes, can safely skip it. */
    2976      543996 :         if (cache->commit_used == used) {
    2977      244726 :                 spin_unlock(&cache->lock);
    2978      244726 :                 return 0;
    2979             :         }
    2980      299270 :         cache->commit_used = used;
    2981      299270 :         spin_unlock(&cache->lock);
    2982             : 
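                     :         /*
                     :          * The block group item is keyed by (objectid = block group start,
                     :          * type = BTRFS_BLOCK_GROUP_ITEM_KEY, offset = block group length)
                     :          * in the tree returned by btrfs_block_group_root().
                     :          */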
    2983      299270 :         key.objectid = cache->start;
    2984      299270 :         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
    2985      299270 :         key.offset = cache->length;
    2986             : 
    2987      299270 :         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
    2988      299270 :         if (ret) {
    2989           5 :                 if (ret > 0)
    2990           5 :                         ret = -ENOENT;
    2991           5 :                 goto fail;
    2992             :         }
    2993             : 
    2994      299265 :         leaf = path->nodes[0];
    2995      299265 :         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
    2996      299265 :         btrfs_set_stack_block_group_used(&bgi, used);
    2997      299265 :         btrfs_set_stack_block_group_chunk_objectid(&bgi,
    2998             :                                                    cache->global_root_id);
    2999      299265 :         btrfs_set_stack_block_group_flags(&bgi, cache->flags);
    3000      299265 :         write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
    3001      299265 :         btrfs_mark_buffer_dirty(leaf);
    3002      299270 : fail:
    3003      299270 :         btrfs_release_path(path);
    3004             :         /* We didn't update the block group item, need to revert @commit_used. */
    3005      299270 :         if (ret < 0) {
    3006           5 :                 spin_lock(&cache->lock);
    3007           5 :                 cache->commit_used = old_commit_used;
    3008           5 :                 spin_unlock(&cache->lock);
    3009             :         }
    3010             :         return ret;
    3011             : 
    3012             : }
    3013             : 
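                     : /*
                     :  * Prepare the v1 free space cache (the SPACE_CACHE mount option) for one
                     :  * block group in the given transaction: block groups smaller than 100 MiB
                     :  * are simply marked BTRFS_DC_WRITTEN; otherwise the free space cache inode
                     :  * is looked up or created, its generation is zeroed, any old cache contents
                     :  * are truncated, and a contiguous region sized relative to the block group
                     :  * is preallocated.  On success disk_cache_state is left at BTRFS_DC_SETUP so
                     :  * the cache gets written out later; on failure it stays at BTRFS_DC_ERROR.
                     :  */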
    3014      543999 : static int cache_save_setup(struct btrfs_block_group *block_group,
    3015             :                             struct btrfs_trans_handle *trans,
    3016             :                             struct btrfs_path *path)
    3017             : {
    3018      543999 :         struct btrfs_fs_info *fs_info = block_group->fs_info;
    3019      543999 :         struct btrfs_root *root = fs_info->tree_root;
    3020      543999 :         struct inode *inode = NULL;
    3021      543999 :         struct extent_changeset *data_reserved = NULL;
    3022      543999 :         u64 alloc_hint = 0;
    3023      543999 :         int dcs = BTRFS_DC_ERROR;
    3024      543999 :         u64 cache_size = 0;
    3025      543999 :         int retries = 0;
    3026      543999 :         int ret = 0;
    3027             : 
    3028      543999 :         if (!btrfs_test_opt(fs_info, SPACE_CACHE))
    3029             :                 return 0;
    3030             : 
    3031             :         /*
    3032             :          * If this block group is smaller than 100 MiB, don't bother caching the
    3033             :          * block group.
    3034             :          */
    3035          84 :         if (block_group->length < (100 * SZ_1M)) {
    3036          43 :                 spin_lock(&block_group->lock);
    3037          43 :                 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
    3038          43 :                 spin_unlock(&block_group->lock);
    3039          43 :                 return 0;
    3040             :         }
    3041             : 
    3042          41 :         if (TRANS_ABORTED(trans))
    3043             :                 return 0;
    3044          41 : again:
    3045          46 :         inode = lookup_free_space_inode(block_group, path);
    3046          46 :         if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
    3047           0 :                 ret = PTR_ERR(inode);
    3048           0 :                 btrfs_release_path(path);
    3049           0 :                 goto out;
    3050             :         }
    3051             : 
    3052          46 :         if (IS_ERR(inode)) {
    3053           5 :                 BUG_ON(retries);
    3054           5 :                 retries++;
    3055             : 
    3056           5 :                 if (block_group->ro)
    3057           0 :                         goto out_free;
    3058             : 
    3059           5 :                 ret = create_free_space_inode(trans, block_group, path);
    3060           5 :                 if (ret)
    3061           0 :                         goto out_free;
    3062           5 :                 goto again;
    3063             :         }
    3064             : 
    3065             :         /*
    3066             :          * We want to set the generation to 0 so that, if anything goes wrong
    3067             :          * from here on out, we know not to trust this cache when we load it up
    3068             :          * next time.
    3069             :          */
    3070          41 :         BTRFS_I(inode)->generation = 0;
    3071          41 :         ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
    3072          41 :         if (ret) {
    3073             :                 /*
    3074             :                  * So theoretically we could recover from this, simply set the
    3075             :                  * super cache generation to 0 so we know to invalidate the
    3076             :                  * cache, but then we'd have to keep track of the block groups
    3077             :                  * that fail this way so we know we _have_ to reset this cache
    3078             :                  * before the next commit or risk reading stale cache.  So to
    3079             :                  * limit our exposure to horrible edge cases, let's just abort the
    3080             :                  * transaction; this only happens in really bad situations
    3081             :                  * anyway.
    3082             :                  */
    3083           0 :                 btrfs_abort_transaction(trans, ret);
    3084           0 :                 goto out_put;
    3085             :         }
    3086          41 :         WARN_ON(ret);
    3087             : 
    3088             :         /* We've already setup this transaction, go ahead and exit */
    3089          41 :         if (block_group->cache_generation == trans->transid &&
    3090             :             i_size_read(inode)) {
    3091          24 :                 dcs = BTRFS_DC_SETUP;
    3092          24 :                 goto out_put;
    3093             :         }
    3094             : 
    3095          17 :         if (i_size_read(inode) > 0) {
    3096          12 :                 ret = btrfs_check_trunc_cache_free_space(fs_info,
    3097             :                                         &fs_info->global_block_rsv);
    3098          12 :                 if (ret)
    3099           0 :                         goto out_put;
    3100             : 
    3101          12 :                 ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
    3102          12 :                 if (ret)
    3103           0 :                         goto out_put;
    3104             :         }
    3105             : 
    3106          17 :         spin_lock(&block_group->lock);
    3107          17 :         if (block_group->cached != BTRFS_CACHE_FINISHED ||
    3108          17 :             !btrfs_test_opt(fs_info, SPACE_CACHE)) {
    3109             :                 /*
    3110             :                  * don't bother trying to write stuff out _if_
    3111             :                  * a) we're not cached,
    3112             :                  * b) we're using the nospace_cache mount option,
    3113             :                  * c) we're using the v2 space cache (FREE_SPACE_TREE).
    3114             :                  */
    3115           0 :                 dcs = BTRFS_DC_WRITTEN;
    3116           0 :                 spin_unlock(&block_group->lock);
    3117           0 :                 goto out_put;
    3118             :         }
    3119          17 :         spin_unlock(&block_group->lock);
    3120             : 
    3121             :         /*
    3122             :          * We hit an ENOSPC when setting up the cache in this transaction, so just
    3123             :          * skip doing the setup; we've already cleared the cache, so we're safe.
    3124             :          */
    3125          34 :         if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
    3126           0 :                 ret = -ENOSPC;
    3127           0 :                 goto out_put;
    3128             :         }
    3129             : 
    3130             :         /*
    3131             :          * Try to preallocate enough space based on how big the block group is.
    3132             :          * Keep in mind this has to include any pinned space which could end up
    3133             :          * taking up quite a bit since it's not folded into the other space
    3134             :          * cache.
    3135             :          */
    3136          17 :         cache_size = div_u64(block_group->length, SZ_256M);
    3137          17 :         if (!cache_size)
    3138           0 :                 cache_size = 1;
    3139             : 
    3140          17 :         cache_size *= 16;
    3141          17 :         cache_size *= fs_info->sectorsize;
    3142             : 
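                     :         /*
                     :          * For example, with a 4 KiB sectorsize and a 1 GiB block group:
                     :          * div_u64(SZ_1G, SZ_256M) = 4, then 4 * 16 = 64, and finally
                     :          * 64 * 4096 = 256 KiB preallocated for the cache file.
                     :          */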
    3143          17 :         ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0,
    3144             :                                           cache_size, false);
    3145          17 :         if (ret)
    3146           0 :                 goto out_put;
    3147             : 
    3148          17 :         ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, cache_size,
    3149             :                                               cache_size, cache_size,
    3150             :                                               &alloc_hint);
    3151             :         /*
    3152             :          * Our cache requires contiguous chunks so that we don't modify a bunch
    3153             :          * of metadata or split extents when writing the cache out, which means
    3154             :          * we can hit ENOSPC if we are heavily fragmented, in addition to the normal
    3155             :          * out of space conditions.  So if we hit this, just skip setting up any
    3156             :          * other block groups for this transaction; maybe we'll unpin enough
    3157             :          * space the next time around.
    3158             :          */
    3159          17 :         if (!ret)
    3160             :                 dcs = BTRFS_DC_SETUP;
    3161           0 :         else if (ret == -ENOSPC)
    3162           0 :                 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
    3163             : 
    3164           0 : out_put:
    3165          41 :         iput(inode);
    3166          41 : out_free:
    3167          41 :         btrfs_release_path(path);
    3168          41 : out:
    3169          41 :         spin_lock(&block_group->lock);
    3170          41 :         if (!ret && dcs == BTRFS_DC_SETUP)
    3171          41 :                 block_group->cache_generation = trans->transid;
    3172          41 :         block_group->disk_cache_state = dcs;
    3173          41 :         spin_unlock(&block_group->lock);
    3174             : 
    3175          41 :         extent_changeset_free(data_reserved);
    3176          41 :         return ret;
    3177             : }
    3178             : 
    3179      206349 : int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
    3180             : {
    3181      206349 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    3182      206349 :         struct btrfs_block_group *cache, *tmp;
    3183      206349 :         struct btrfs_transaction *cur_trans = trans->transaction;
    3184      206349 :         struct btrfs_path *path;
    3185             : 
    3186      206349 :         if (list_empty(&cur_trans->dirty_bgs) ||
    3187       57596 :             !btrfs_test_opt(fs_info, SPACE_CACHE))
    3188             :                 return 0;
    3189             : 
    3190           6 :         path = btrfs_alloc_path();
    3191           6 :         if (!path)
    3192             :                 return -ENOMEM;
    3193             : 
    3194             :         /* Could add new block groups, use _safe just in case */
    3195          13 :         list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
    3196             :                                  dirty_list) {
    3197           7 :                 if (cache->disk_cache_state == BTRFS_DC_CLEAR)
    3198           3 :                         cache_save_setup(cache, trans, path);
    3199             :         }
    3200             : 
    3201           6 :         btrfs_free_path(path);
    3202           6 :         return 0;
    3203             : }
    3204             : 
    3205             : /*
    3206             :  * Transaction commit does final block group cache writeback during a critical
    3207             :  * section where nothing is allowed to change the FS.  This is required in
    3208             :  * order for the cache to actually match the block group, but can introduce a
    3209             :  * lot of latency into the commit.
    3210             :  *
    3211             :  * So, btrfs_start_dirty_block_groups is here to kick off block group cache IO.
    3212             :  * There's a chance we'll have to redo some of it if the block group changes
    3213             :  * again during the commit, but it greatly reduces the commit latency by
    3214             :  * getting rid of the easy block groups while we're still allowing others to
    3215             :  * join the commit.
    3216             :  */
    3217      206186 : int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
    3218             : {
    3219      206186 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    3220      206186 :         struct btrfs_block_group *cache;
    3221      206186 :         struct btrfs_transaction *cur_trans = trans->transaction;
    3222      206186 :         int ret = 0;
    3223      206186 :         int should_put;
    3224      206186 :         struct btrfs_path *path = NULL;
    3225      206186 :         LIST_HEAD(dirty);
    3226      206186 :         struct list_head *io = &cur_trans->io_bgs;
    3227      206186 :         int loops = 0;
    3228             : 
    3229      206186 :         spin_lock(&cur_trans->dirty_bgs_lock);
    3230      206186 :         if (list_empty(&cur_trans->dirty_bgs)) {
    3231       67649 :                 spin_unlock(&cur_trans->dirty_bgs_lock);
    3232       67649 :                 return 0;
    3233             :         }
    3234      138537 :         list_splice_init(&cur_trans->dirty_bgs, &dirty);
    3235      138537 :         spin_unlock(&cur_trans->dirty_bgs_lock);
    3236             : 
    3237      251271 : again:
    3238             :         /* Make sure all the block groups on our dirty list actually exist */
    3239      251271 :         btrfs_create_pending_block_groups(trans);
    3240             : 
    3241      251271 :         if (!path) {
    3242      138537 :                 path = btrfs_alloc_path();
    3243      138537 :                 if (!path) {
    3244           0 :                         ret = -ENOMEM;
    3245           0 :                         goto out;
    3246             :                 }
    3247             :         }
    3248             : 
    3249             :         /*
    3250             :          * cache_write_mutex is here only to save us from balance, or the automatic
    3251             :          * removal of empty block groups, deleting this block group while we are
    3252             :          * writing out the cache.
    3253             :          */
    3254      251271 :         mutex_lock(&trans->transaction->cache_write_mutex);
    3255      569314 :         while (!list_empty(&dirty)) {
    3256      318043 :                 bool drop_reserve = true;
    3257             : 
    3258      318043 :                 cache = list_first_entry(&dirty, struct btrfs_block_group,
    3259             :                                          dirty_list);
    3260             :                 /*
    3261             :                  * This can happen if something re-dirties a block group that
    3262             :                  * is already under IO.  Just wait for it to finish and then do
    3263             :                  * it all again
    3264             :                  */
    3265      318043 :                 if (!list_empty(&cache->io_list)) {
    3266           8 :                         list_del_init(&cache->io_list);
    3267           8 :                         btrfs_wait_cache_io(trans, cache, path);
    3268           8 :                         btrfs_put_block_group(cache);
    3269             :                 }
    3270             : 
    3271             : 
    3272             :                 /*
    3273             :                  * btrfs_wait_cache_io uses the cache->dirty_list to decide if
    3274             :                  * it should update the cache_state.  Don't delete until after
    3275             :                  * we wait.
    3276             :                  *
    3277             :                  * Since we're not running in the commit critical section,
    3278             :                  * we need the dirty_bgs_lock to protect us from update_block_group().
    3279             :                  */
    3280      318043 :                 spin_lock(&cur_trans->dirty_bgs_lock);
    3281      318043 :                 list_del_init(&cache->dirty_list);
    3282      318043 :                 spin_unlock(&cur_trans->dirty_bgs_lock);
    3283             : 
    3284      318043 :                 should_put = 1;
    3285             : 
    3286      318043 :                 cache_save_setup(cache, trans, path);
    3287             : 
    3288      318043 :                 if (cache->disk_cache_state == BTRFS_DC_SETUP) {
    3289          17 :                         cache->io_ctl.inode = NULL;
    3290          17 :                         ret = btrfs_write_out_cache(trans, cache, path);
    3291          17 :                         if (ret == 0 && cache->io_ctl.inode) {
    3292          17 :                                 should_put = 0;
    3293             : 
    3294             :                                 /*
    3295             :                                  * The cache_write_mutex is protecting the
    3296             :                                  * io_list; also refer to the definition of
    3297             :                                  * btrfs_transaction::io_bgs for more details.
    3298             :                                  */
    3299          17 :                                 list_add_tail(&cache->io_list, io);
    3300             :                         } else {
    3301             :                                 /*
    3302             :                                  * If we failed to write the cache, the
    3303             :                                  * generation will be bad and life goes on
    3304             :                                  */
    3305             :                                 ret = 0;
    3306             :                         }
    3307             :                 }
    3308             :                 if (!ret) {
    3309      318043 :                         ret = update_block_group_item(trans, path, cache);
    3310             :                         /*
    3311             :                          * Our block group might still be attached to the list
    3312             :                          * of new block groups in the transaction handle of some
    3313             :                          * other task (struct btrfs_trans_handle->new_bgs). This
    3314             :                          * means its block group item isn't yet in the extent
    3315             :                          * tree. If this happens ignore the error, as we will
    3316             :                          * try again later in the critical section of the
    3317             :                          * transaction commit.
    3318             :                          */
    3319      318043 :                         if (ret == -ENOENT) {
    3320           5 :                                 ret = 0;
    3321           5 :                                 spin_lock(&cur_trans->dirty_bgs_lock);
    3322           5 :                                 if (list_empty(&cache->dirty_list)) {
    3323           5 :                                         list_add_tail(&cache->dirty_list,
    3324             :                                                       &cur_trans->dirty_bgs);
    3325           5 :                                         btrfs_get_block_group(cache);
    3326           5 :                                         drop_reserve = false;
    3327             :                                 }
    3328           5 :                                 spin_unlock(&cur_trans->dirty_bgs_lock);
    3329      318038 :                         } else if (ret) {
    3330           0 :                                 btrfs_abort_transaction(trans, ret);
    3331             :                         }
    3332             :                 }
    3333             : 
    3334             :                 /* If it's not on the io list, we need to put the block group */
    3335      318043 :                 if (should_put)
    3336      318026 :                         btrfs_put_block_group(cache);
    3337      318043 :                 if (drop_reserve)
    3338      318038 :                         btrfs_delayed_refs_rsv_release(fs_info, 1);
    3339             :                 /*
    3340             :                  * Avoid blocking other tasks for too long. It might even save
    3341             :                  * us from writing caches for block groups that are going to be
    3342             :                  * removed.
    3343             :                  */
    3344      318043 :                 mutex_unlock(&trans->transaction->cache_write_mutex);
    3345      318043 :                 if (ret)
    3346           0 :                         goto out;
    3347      318043 :                 mutex_lock(&trans->transaction->cache_write_mutex);
    3348             :         }
    3349      251271 :         mutex_unlock(&trans->transaction->cache_write_mutex);
    3350             : 
    3351             :         /*
    3352             :          * Go through delayed refs for all the stuff we've just kicked off
    3353             :          * and then loop back (just once)
    3354             :          */
    3355      251271 :         if (!ret)
    3356      251271 :                 ret = btrfs_run_delayed_refs(trans, 0);
    3357      251271 :         if (!ret && loops == 0) {
    3358      138537 :                 loops++;
    3359      138537 :                 spin_lock(&cur_trans->dirty_bgs_lock);
    3360      138537 :                 list_splice_init(&cur_trans->dirty_bgs, &dirty);
    3361             :                 /*
    3362             :                  * dirty_bgs_lock protects us from concurrent block group
    3363             :                  * deletes too (not just cache_write_mutex).
    3364             :                  */
    3365      138537 :                 if (!list_empty(&dirty)) {
    3366      112734 :                         spin_unlock(&cur_trans->dirty_bgs_lock);
    3367      112734 :                         goto again;
    3368             :                 }
    3369       25803 :                 spin_unlock(&cur_trans->dirty_bgs_lock);
    3370             :         }
    3371      112734 : out:
    3372      138537 :         if (ret < 0) {
    3373           0 :                 spin_lock(&cur_trans->dirty_bgs_lock);
    3374           0 :                 list_splice_init(&dirty, &cur_trans->dirty_bgs);
    3375           0 :                 spin_unlock(&cur_trans->dirty_bgs_lock);
    3376           0 :                 btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
    3377             :         }
    3378             : 
    3379      138537 :         btrfs_free_path(path);
    3380      138537 :         return ret;
    3381             : }
    3382             : 
    3383      206739 : int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
    3384             : {
    3385      206739 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    3386      206739 :         struct btrfs_block_group *cache;
    3387      206739 :         struct btrfs_transaction *cur_trans = trans->transaction;
    3388      206739 :         int ret = 0;
    3389      206739 :         int should_put;
    3390      206739 :         struct btrfs_path *path;
    3391      206739 :         struct list_head *io = &cur_trans->io_bgs;
    3392             : 
    3393      206739 :         path = btrfs_alloc_path();
    3394      206739 :         if (!path)
    3395             :                 return -ENOMEM;
    3396             : 
    3397             :         /*
    3398             :          * Even though we are in the critical section of the transaction commit,
    3399             :          * we can still have concurrent tasks adding elements to this
    3400             :          * transaction's list of dirty block groups. These tasks correspond to
    3401             :          * endio free space workers started when writeback finishes for a
    3402             :          * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
    3403             :          * allocate new block groups as a result of COWing nodes of the root
    3404             :          * tree when updating the free space inode. The writeback for the space
    3405             :          * caches is triggered by an earlier call to
    3406             :          * btrfs_start_dirty_block_groups() and iterations of the following
    3407             :          * loop.
    3408             :          * Also we want to do the cache_save_setup first and then run the
    3409             :          * delayed refs to make sure we have the best chance at doing this all
    3410             :          * in one shot.
    3411             :          */
    3412      206739 :         spin_lock(&cur_trans->dirty_bgs_lock);
    3413      432692 :         while (!list_empty(&cur_trans->dirty_bgs)) {
    3414      225953 :                 cache = list_first_entry(&cur_trans->dirty_bgs,
    3415             :                                          struct btrfs_block_group,
    3416             :                                          dirty_list);
    3417             : 
    3418             :                 /*
    3419             :                  * This can happen if cache_save_setup re-dirties a block group
    3420             :                  * that is already under IO.  Just wait for it to finish and
    3421             :                  * then do it all again
    3422             :                  */
    3423      225953 :                 if (!list_empty(&cache->io_list)) {
    3424          16 :                         spin_unlock(&cur_trans->dirty_bgs_lock);
    3425          16 :                         list_del_init(&cache->io_list);
    3426          16 :                         btrfs_wait_cache_io(trans, cache, path);
    3427          16 :                         btrfs_put_block_group(cache);
    3428          16 :                         spin_lock(&cur_trans->dirty_bgs_lock);
    3429             :                 }
    3430             : 
    3431             :                 /*
    3432             :                  * Don't remove from the dirty list until after we've waited on
    3433             :                  * any pending IO
    3434             :                  */
    3435      225953 :                 list_del_init(&cache->dirty_list);
    3436      225953 :                 spin_unlock(&cur_trans->dirty_bgs_lock);
    3437      225953 :                 should_put = 1;
    3438             : 
    3439      225953 :                 cache_save_setup(cache, trans, path);
    3440             : 
    3441      225953 :                 if (!ret)
    3442      225953 :                         ret = btrfs_run_delayed_refs(trans,
    3443             :                                                      (unsigned long) -1);
    3444             : 
    3445      225953 :                 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
    3446          24 :                         cache->io_ctl.inode = NULL;
    3447          24 :                         ret = btrfs_write_out_cache(trans, cache, path);
    3448          24 :                         if (ret == 0 && cache->io_ctl.inode) {
    3449          24 :                                 should_put = 0;
    3450          24 :                                 list_add_tail(&cache->io_list, io);
    3451             :                         } else {
    3452             :                                 /*
    3453             :                                  * If we failed to write the cache, the
    3454             :                                  * generation will be bad and life goes on
    3455             :                                  */
    3456             :                                 ret = 0;
    3457             :                         }
    3458             :                 }
    3459      225953 :                 if (!ret) {
    3460      225953 :                         ret = update_block_group_item(trans, path, cache);
    3461             :                         /*
    3462             :                          * One of the free space endio workers might have
    3463             :                          * created a new block group while updating a free space
    3464             :                          * cache's inode (at inode.c:btrfs_finish_ordered_io())
    3465             :                          * and hasn't released its transaction handle yet, in
    3466             :                          * which case the new block group is still attached to
    3467             :                          * its transaction handle and its creation has not
    3468             :                          * finished yet (no block group item in the extent tree
    3469             :                          * yet, etc). If this is the case, wait for all free
    3470             :                          * space endio workers to finish and retry. This is a
    3471             :                          * very rare case so no need for a more efficient and
    3472             :                          * complex approach.
    3473             :                          */
    3474      225953 :                         if (ret == -ENOENT) {
    3475           0 :                                 wait_event(cur_trans->writer_wait,
    3476             :                                    atomic_read(&cur_trans->num_writers) == 1);
    3477           0 :                                 ret = update_block_group_item(trans, path, cache);
    3478             :                         }
    3479      225953 :                         if (ret)
    3480           0 :                                 btrfs_abort_transaction(trans, ret);
    3481             :                 }
    3482             : 
    3483             :                 /* If it's not on the io list, we need to put the block group */
    3484      225953 :                 if (should_put)
    3485      225929 :                         btrfs_put_block_group(cache);
    3486      225953 :                 btrfs_delayed_refs_rsv_release(fs_info, 1);
    3487      225953 :                 spin_lock(&cur_trans->dirty_bgs_lock);
    3488             :         }
    3489      206739 :         spin_unlock(&cur_trans->dirty_bgs_lock);
    3490             : 
    3491             :         /*
    3492             :          * Refer to the definition of the io_bgs member for details on why it's
    3493             :          * safe to use it without any locking.
    3494             :          */
    3495      206756 :         while (!list_empty(io)) {
    3496          17 :                 cache = list_first_entry(io, struct btrfs_block_group,
    3497             :                                          io_list);
    3498          17 :                 list_del_init(&cache->io_list);
    3499          17 :                 btrfs_wait_cache_io(trans, cache, path);
    3500          17 :                 btrfs_put_block_group(cache);
    3501             :         }
    3502             : 
    3503      206739 :         btrfs_free_path(path);
    3504      206739 :         return ret;
    3505             : }
    3506             : 
    3507    18582674 : int btrfs_update_block_group(struct btrfs_trans_handle *trans,
    3508             :                              u64 bytenr, u64 num_bytes, bool alloc)
    3509             : {
    3510    18582674 :         struct btrfs_fs_info *info = trans->fs_info;
    3511    18582674 :         struct btrfs_block_group *cache = NULL;
    3512    18582674 :         u64 total = num_bytes;
    3513    18582674 :         u64 old_val;
    3514    18582674 :         u64 byte_in_group;
    3515    18582674 :         int factor;
    3516    18582674 :         int ret = 0;
    3517             : 
    3518             :         /* Block accounting for super block */
    3519    18582674 :         spin_lock(&info->delalloc_root_lock);
    3520    18582674 :         old_val = btrfs_super_bytes_used(info->super_copy);
    3521    18582674 :         if (alloc)
    3522    10019351 :                 old_val += num_bytes;
    3523             :         else
    3524     8563323 :                 old_val -= num_bytes;
    3525    18582674 :         btrfs_set_super_bytes_used(info->super_copy, old_val);
    3526    18582674 :         spin_unlock(&info->delalloc_root_lock);
    3527             : 
    3528    37165351 :         while (total) {
    3529    18582676 :                 struct btrfs_space_info *space_info;
    3530    18582676 :                 bool reclaim = false;
    3531             : 
    3532    18582676 :                 cache = btrfs_lookup_block_group(info, bytenr);
    3533    18582676 :                 if (!cache) {
    3534             :                         ret = -ENOENT;
    3535             :                         break;
    3536             :                 }
    3537    18582676 :                 space_info = cache->space_info;
    3538    18582676 :                 factor = btrfs_bg_type_to_factor(cache->flags);
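                     :                 /*
                     :                  * The factor is the number of on-disk copies for the block
                     :                  * group's profile (e.g. 2 for DUP or RAID1), so disk_used
                     :                  * below is adjusted by num_bytes * factor.
                     :                  */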
    3539             : 
    3540             :                 /*
    3541             :                  * If this block group has its free space cache written out, we
    3542             :                  * need to make sure to load it if we are removing space.  This
    3543             :                  * is because we need the unpinning stage to actually add the
    3544             :                  * space back to the block group; otherwise we will leak space.
    3545             :                  */
    3546    27145998 :                 if (!alloc && !btrfs_block_group_done(cache))
    3547         145 :                         btrfs_cache_block_group(cache, true);
    3548             : 
    3549    18582675 :                 byte_in_group = bytenr - cache->start;
    3550    18582675 :                 WARN_ON(byte_in_group > cache->length);
    3551             : 
    3552    18582675 :                 spin_lock(&space_info->lock);
    3553    18582677 :                 spin_lock(&cache->lock);
    3554             : 
    3555    18582677 :                 if (btrfs_test_opt(info, SPACE_CACHE) &&
    3556         490 :                     cache->disk_cache_state < BTRFS_DC_CLEAR)
    3557          43 :                         cache->disk_cache_state = BTRFS_DC_CLEAR;
    3558             : 
    3559    18582677 :                 old_val = cache->used;
    3560    18582677 :                 num_bytes = min(total, cache->length - byte_in_group);
    3561    18582677 :                 if (alloc) {
    3562    10019354 :                         old_val += num_bytes;
    3563    10019354 :                         cache->used = old_val;
    3564    10019354 :                         cache->reserved -= num_bytes;
    3565    10019354 :                         space_info->bytes_reserved -= num_bytes;
    3566    10019354 :                         space_info->bytes_used += num_bytes;
    3567    10019354 :                         space_info->disk_used += num_bytes * factor;
    3568    10019354 :                         spin_unlock(&cache->lock);
    3569    10019354 :                         spin_unlock(&space_info->lock);
    3570             :                 } else {
    3571     8563323 :                         old_val -= num_bytes;
    3572     8563323 :                         cache->used = old_val;
    3573     8563323 :                         cache->pinned += num_bytes;
    3574     8563323 :                         btrfs_space_info_update_bytes_pinned(info, space_info,
    3575             :                                                              num_bytes);
    3576     8563323 :                         space_info->bytes_used -= num_bytes;
    3577     8563323 :                         space_info->disk_used -= num_bytes * factor;
    3578             : 
    3579     8563323 :                         reclaim = should_reclaim_block_group(cache, num_bytes);
    3580             : 
    3581     8563323 :                         spin_unlock(&cache->lock);
    3582     8563323 :                         spin_unlock(&space_info->lock);
    3583             : 
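                     :                         /*
                     :                          * Record the freed range as pinned; the space is
                     :                          * only handed back to the free space accounting
                     :                          * once pinned extents are unpinned at transaction
                     :                          * commit.
                     :                          */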
    3584     8563323 :                         set_extent_bit(&trans->transaction->pinned_extents,
    3585     8563323 :                                        bytenr, bytenr + num_bytes - 1,
    3586             :                                        EXTENT_DIRTY, NULL);
    3587             :                 }
    3588             : 
    3589    18582674 :                 spin_lock(&trans->transaction->dirty_bgs_lock);
    3590    18582674 :                 if (list_empty(&cache->dirty_list)) {
    3591      543999 :                         list_add_tail(&cache->dirty_list,
    3592      543999 :                                       &trans->transaction->dirty_bgs);
    3593      543999 :                         trans->delayed_ref_updates++;
    3594      543999 :                         btrfs_get_block_group(cache);
    3595             :                 }
    3596    18582674 :                 spin_unlock(&trans->transaction->dirty_bgs_lock);
    3597             : 
    3598             :                 /*
    3599             :                  * If there are no used bytes left in this block group, queue it
    3600             :                  * for deletion. We do this after adding the block group to the
    3601             :                  * dirty list to avoid races between the cleaner kthread and space
    3602             :                  * cache writeout.
    3603             :                  */
    3604    18582674 :                 if (!alloc && old_val == 0) {
    3605      123052 :                         if (!btrfs_test_opt(info, DISCARD_ASYNC))
    3606           6 :                                 btrfs_mark_bg_unused(cache);
    3607    18459622 :                 } else if (!alloc && reclaim) {
    3608           0 :                         btrfs_mark_bg_to_reclaim(cache);
    3609             :                 }
    3610             : 
    3611    18582674 :                 btrfs_put_block_group(cache);
    3612    18582674 :                 total -= num_bytes;
    3613    18582674 :                 bytenr += num_bytes;
    3614             :         }
    3615             : 
    3616             :         /* Modified block groups are accounted for in the delayed_refs_rsv. */
    3617    18582675 :         btrfs_update_delayed_refs_rsv(trans);
    3618    18582674 :         return ret;
    3619             : }
    3620             : 
    3621             : /*
    3622             :  * Update the block_group and space info counters.
    3623             :  *
    3624             :  * @cache:      The cache we are manipulating
    3625             :  * @ram_bytes:  The number of bytes of file content, and will be the same as
    3626             :  *              @num_bytes except for the compression path.
    3627             :  * @num_bytes:  The number of bytes in question
    3628             :  * @delalloc:   The blocks are allocated for the delalloc write
    3629             :  *
    3630             :  * This is called by the allocator when it reserves space. If this is a
    3631             :  * reservation and the block group has become read only we cannot make the
    3632             :  * reservation and return -EAGAIN, otherwise this function always succeeds.
    3633             :  */
    3634    13262459 : int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
    3635             :                              u64 ram_bytes, u64 num_bytes, int delalloc,
    3636             :                              bool force_wrong_size_class)
    3637             : {
    3638    13262459 :         struct btrfs_space_info *space_info = cache->space_info;
    3639    13262459 :         enum btrfs_block_group_size_class size_class;
    3640    13262459 :         int ret = 0;
    3641             : 
    3642    13262459 :         spin_lock(&space_info->lock);
    3643    13262552 :         spin_lock(&cache->lock);
    3644    13262559 :         if (cache->ro) {
    3645           0 :                 ret = -EAGAIN;
    3646           0 :                 goto out;
    3647             :         }
    3648             : 
    3649    13262559 :         if (btrfs_block_group_should_use_size_class(cache)) {
    3650     3707031 :                 size_class = btrfs_calc_block_group_size_class(num_bytes);
    3651     3707031 :                 ret = btrfs_use_block_group_size_class(cache, size_class, force_wrong_size_class);
    3652        1577 :                 if (ret)
    3653           0 :                         goto out;
    3654             :         }
    3655    13262559 :         cache->reserved += num_bytes;
    3656    13262559 :         space_info->bytes_reserved += num_bytes;
    3657    13262559 :         trace_btrfs_space_reservation(cache->fs_info, "space_info",
    3658             :                                       space_info->flags, num_bytes, 1);
    3659    13262446 :         btrfs_space_info_update_bytes_may_use(cache->fs_info,
    3660    13262446 :                                               space_info, -ram_bytes);
    3661    13262414 :         if (delalloc)
    3662     3377667 :                 cache->delalloc_bytes += num_bytes;
    3663             : 
    3664             :         /*
    3665             :          * Compression can use less space than we reserved, so wake tickets if
    3666             :          * that happens.
    3667             :          */
    3668    13262414 :         if (num_bytes < ram_bytes)
    3669      158278 :                 btrfs_try_granting_tickets(cache->fs_info, space_info);
    3670    13104136 : out:
    3671    13262414 :         spin_unlock(&cache->lock);
    3672    13262534 :         spin_unlock(&space_info->lock);
    3673    13262540 :         return ret;
    3674             : }
    3675             : 
    3676             : /*
    3677             :  * Update the block_group and space info counters.
    3678             :  *
    3679             :  * @cache:      The cache we are manipulating
    3680             :  * @num_bytes:  The number of bytes in question
    3681             :  * @delalloc:   The blocks are allocated for the delalloc write
    3682             :  *
    3683             :  * This is called when freeing space that was never actually used on disk.
    3684             :  * For example, if you reserve some space for a new leaf in transaction A
    3685             :  * and, before transaction A commits, you free that leaf, you call this to
    3686             :  * clear the reservation.
    3687             :  */
    3688       44105 : void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
    3689             :                                u64 num_bytes, int delalloc)
    3690             : {
    3691       44105 :         struct btrfs_space_info *space_info = cache->space_info;
    3692             : 
    3693       44105 :         spin_lock(&space_info->lock);
    3694       44106 :         spin_lock(&cache->lock);
    3695       44106 :         if (cache->ro)
    3696           0 :                 space_info->bytes_readonly += num_bytes;
    3697       44106 :         cache->reserved -= num_bytes;
    3698       44106 :         space_info->bytes_reserved -= num_bytes;
    3699       44106 :         space_info->max_extent_size = 0;
    3700             : 
    3701       44106 :         if (delalloc)
    3702         169 :                 cache->delalloc_bytes -= num_bytes;
    3703       44106 :         spin_unlock(&cache->lock);
    3704             : 
    3705       44106 :         btrfs_try_granting_tickets(cache->fs_info, space_info);
    3706       44106 :         spin_unlock(&space_info->lock);
    3707       44106 : }
    3708             : 
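/*
 * A minimal illustrative sketch (not taken from this file; the helper name is
 * hypothetical): the pair above is meant to be balanced.  A caller that
 * reserved space for an extent it ends up not writing returns it with
 * btrfs_free_reserved_bytes(), using the same num_bytes/delalloc values it
 * passed to btrfs_add_reserved_bytes().
 */
static int example_reserve_then_drop(struct btrfs_block_group *bg,
                                     u64 num_bytes, int delalloc)
{
        int ret;

        /* ram_bytes == num_bytes: no compression involved. */
        ret = btrfs_add_reserved_bytes(bg, num_bytes, num_bytes, delalloc, false);
        if (ret)
                return ret;     /* -EAGAIN: group went RO or lost a size class race */

        /* ... the caller decides not to use the space after all ... */

        btrfs_free_reserved_bytes(bg, num_bytes, delalloc);
        return 0;
}
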
    3709           0 : static void force_metadata_allocation(struct btrfs_fs_info *info)
    3710             : {
    3711           0 :         struct list_head *head = &info->space_info;
    3712           0 :         struct btrfs_space_info *found;
    3713             : 
    3714           0 :         list_for_each_entry(found, head, list) {
    3715           0 :                 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
    3716           0 :                         found->force_alloc = CHUNK_ALLOC_FORCE;
    3717             :         }
    3718           0 : }
    3719             : 
    3720      272878 : static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
    3721             :                               struct btrfs_space_info *sinfo, int force)
    3722             : {
    3723      272878 :         u64 bytes_used = btrfs_space_info_used(sinfo, false);
    3724      272878 :         u64 thresh;
    3725             : 
    3726      272878 :         if (force == CHUNK_ALLOC_FORCE)
    3727             :                 return 1;
    3728             : 
    3729             :         /*
    3730             :          * in limited mode, we want to have some free space up to
    3731             :          * about 1% of the FS size.
    3732             :          */
    3733       99828 :         if (force == CHUNK_ALLOC_LIMITED) {
    3734           0 :                 thresh = btrfs_super_total_bytes(fs_info->super_copy);
    3735           0 :                 thresh = max_t(u64, SZ_64M, mult_perc(thresh, 1));
    3736             : 
    3737           0 :                 if (sinfo->total_bytes - bytes_used < thresh)
    3738             :                         return 1;
    3739             :         }
    3740             : 
    3741       99828 :         if (bytes_used + SZ_2M < mult_perc(sinfo->total_bytes, 80))
    3742       27986 :                 return 0;
    3743             :         return 1;
    3744             : }
    3745             : 
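/*
 * Worked example for should_alloc_chunk() above (illustrative numbers): with
 * CHUNK_ALLOC_NO_FORCE and a space_info of total_bytes = 100 GiB, a new chunk
 * is requested once bytes_used + 2 MiB reaches 80 GiB (the 80% threshold).
 * With CHUNK_ALLOC_LIMITED on a 1 TiB filesystem, the floor is
 * max(64 MiB, 1% of 1 TiB) ~= 10 GiB, so a chunk is requested as soon as the
 * space_info has less than that much unused space.
 */
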
    3746          23 : int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
    3747             : {
    3748          23 :         u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type);
    3749             : 
    3750          23 :         return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
    3751             : }
    3752             : 
    3753        1427 : static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
    3754             : {
    3755        1427 :         struct btrfs_block_group *bg;
    3756        1427 :         int ret;
    3757             : 
    3758             :         /*
    3759             :          * Check if we have enough space in the system space info because we
    3760             :          * will need to update device items in the chunk btree and insert a new
    3761             :          * chunk item in the chunk btree as well. This will allocate a new
    3762             :          * system block group if needed.
    3763             :          */
    3764        1427 :         check_system_chunk(trans, flags);
    3765             : 
    3766        1427 :         bg = btrfs_create_chunk(trans, flags);
    3767        1427 :         if (IS_ERR(bg)) {
    3768          50 :                 ret = PTR_ERR(bg);
    3769          50 :                 goto out;
    3770             :         }
    3771             : 
    3772        1377 :         ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
    3773             :         /*
    3774             :          * Normally we are not expected to fail with -ENOSPC here, since we have
    3775             :          * previously reserved space in the system space_info and allocated one
    3776             :          * new system chunk if necessary. However there are three exceptions:
    3777             :          *
    3778             :          * 1) We may have enough free space in the system space_info but all the
    3779             :          *    existing system block groups have a profile which can not be used
    3780             :          *    for extent allocation.
    3781             :          *
    3782             :          *    This happens when mounting in degraded mode. For example we have a
    3783             :          *    RAID1 filesystem with 2 devices, lose one device and mount the fs
    3784             :          *    using the other device in degraded mode. If we then allocate a chunk,
    3785             :          *    we may have enough free space in the existing system space_info, but
    3786             :          *    none of the block groups can be used for extent allocation since they
    3787             :          *    have a RAID1 profile, and because we are in degraded mode with a
    3788             :          *    single device, we are forced to allocate a new system chunk with a
    3789             :          *    SINGLE profile. Making check_system_chunk() iterate over all system
    3790             :          *    block groups and check if they have a usable profile and enough space
    3791             :          *    can be slow on very large filesystems, so we tolerate the -ENOSPC and
    3792             :          *    try again after forcing allocation of a new system chunk. Like this
    3793             :          *    we avoid paying the cost of that search in normal circumstances, when
    3794             :          *    we were not mounted in degraded mode;
    3795             :          *
    3796             :          * 2) We had enough free space in the system space_info, and one suitable
    3797             :          *    block group to allocate from when we called check_system_chunk()
    3798             :          *    above. However right after we called it, the only system block group
    3799             :          *    with enough free space got turned into RO mode by a running scrub,
    3800             :          *    and in this case we have to allocate a new one and retry. We only
    3801             :          *    need to do this allocation and retry once, since we have a transaction
    3802             :          *    handle and scrub uses the commit root to search for block groups;
    3803             :          *
    3804             :          * 3) We had one system block group with enough free space when we called
    3805             :          *    check_system_chunk(), but after that, right before we tried to
    3806             :          *    allocate the last extent buffer we needed, a discard operation came
    3807             :          *    in and it temporarily removed the last free space entry from the
    3808             :          *    block group (discard removes a free space entry, discards it, and
    3809             :          *    then adds back the entry to the block group cache).
    3810             :          */
    3811        1377 :         if (ret == -ENOSPC) {
    3812           0 :                 const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
    3813           0 :                 struct btrfs_block_group *sys_bg;
    3814             : 
    3815           0 :                 sys_bg = btrfs_create_chunk(trans, sys_flags);
    3816           0 :                 if (IS_ERR(sys_bg)) {
    3817           0 :                         ret = PTR_ERR(sys_bg);
    3818           0 :                         btrfs_abort_transaction(trans, ret);
    3819           0 :                         goto out;
    3820             :                 }
    3821             : 
    3822           0 :                 ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
    3823           0 :                 if (ret) {
    3824           0 :                         btrfs_abort_transaction(trans, ret);
    3825           0 :                         goto out;
    3826             :                 }
    3827             : 
    3828           0 :                 ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
    3829           0 :                 if (ret) {
    3830           0 :                         btrfs_abort_transaction(trans, ret);
    3831           0 :                         goto out;
    3832             :                 }
    3833        1377 :         } else if (ret) {
    3834           0 :                 btrfs_abort_transaction(trans, ret);
    3835           0 :                 goto out;
    3836             :         }
    3837        1377 : out:
    3838        1427 :         btrfs_trans_release_chunk_metadata(trans);
    3839             : 
    3840        1427 :         if (ret)
    3841          50 :                 return ERR_PTR(ret);
    3842             : 
    3843        1377 :         btrfs_get_block_group(bg);
    3844        1377 :         return bg;
    3845             : }
    3846             : 
    3847             : /*
    3848             :  * Chunk allocation is done in 2 phases:
    3849             :  *
    3850             :  * 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for
    3851             :  *    the chunk, the chunk mapping, create its block group and add the items
    3852             :  *    that belong in the chunk btree to it - more specifically, we need to
    3853             :  *    update device items in the chunk btree and add a new chunk item to it.
    3854             :  *
    3855             :  * 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block
    3856             :  *    group item to the extent btree and the device extent items to the devices
    3857             :  *    btree.
    3858             :  *
    3859             :  * This is done to prevent deadlocks. For example when COWing a node from the
    3860             :  * extent btree we are holding a write lock on the node's parent and if we
    3861             :  * trigger chunk allocation and attempt to insert the new block group item
    3862             :  * in the extent btree right away, we could deadlock because the path for the
    3863             :  * insertion can include that parent node. At first glance it seems impossible
    3864             :  * to trigger chunk allocation after starting a transaction since tasks should
    3865             :  * reserve enough transaction units (metadata space); however, while that is true
    3866             :  * most of the time, chunk allocation may still be triggered for several reasons:
    3867             :  *
    3868             :  * 1) When reserving metadata, we check if there is enough free space in the
    3869             :  *    metadata space_info and therefore don't trigger allocation of a new chunk.
    3870             :  *    However later when the task actually tries to COW an extent buffer from
    3871             :  *    the extent btree or from the device btree for example, it is forced to
    3872             :  *    allocate a new block group (chunk) because the only one that had enough
    3873             :  *    free space was just turned to RO mode by a running scrub for example (or
    3874             :  *    device replace, block group reclaim thread, etc), so we can not use it
    3875             :  *    for allocating an extent and end up being forced to allocate a new one;
    3876             :  *
    3877             :  * 2) Because we only check that the metadata space_info has enough free bytes,
    3878             :  *    we end up not allocating a new metadata chunk in that case. However if
    3879             :  *    the filesystem was mounted in degraded mode, none of the existing block
    3880             :  *    groups might be suitable for extent allocation due to their incompatible
    3881             :  *    profile (e.g. mounting a 2-device filesystem, where all block groups
    3882             :  *    use a RAID1 profile, in degraded mode using a single device). In this case
    3883             :  *    when the task attempts to COW some extent buffer of the extent btree for
    3884             :  *    example, it will trigger allocation of a new metadata block group with a
    3885             :  *    suitable profile (SINGLE profile in the example of the degraded mount of
    3886             :  *    the RAID1 filesystem);
    3887             :  *
    3888             :  * 3) The task has reserved enough transaction units / metadata space, but when
    3889             :  *    it attempts to COW an extent buffer from the extent or device btree for
    3890             :  *    example, it does not find any free extent in any metadata block group,
    3891             :  *    and is therefore forced to try to allocate a new metadata block group.
    3892             :  *    This is because some other task allocated all available extents in the
    3893             :  *    meanwhile - this typically happens with tasks that don't reserve space
    3894             :  *    properly, either intentionally or as a bug. One example where this is
    3895             :  *    done intentionally is fsync, as it does not reserve any transaction units
    3896             :  *    and ends up allocating a variable number of metadata extents for log
    3897             :  *    tree extent buffers;
    3898             :  *
    3899             :  * 4) The task has reserved enough transaction units / metadata space, but right
    3900             :  *    before it tries to allocate the last extent buffer it needs, a discard
    3901             :  *    operation comes in and, temporarily, removes the last free space entry from
    3902             :  *    the only metadata block group that had free space (discard starts by
    3903             :  *    removing a free space entry from a block group, then does the discard
    3904             :  *    operation and, once it's done, it adds back the free space entry to the
    3905             :  *    block group).
    3906             :  *
    3907             :  * We also need this two-phase setup when adding a device to a filesystem with
    3908             :  * a seed device - we must create new metadata and system chunks without adding
    3909             :  * any of the block group items to the chunk, extent and device btrees. If we
    3910             :  * did not do it this way, we would get ENOSPC when attempting to update those
    3911             :  * btrees, since all the chunks from the seed device are read-only.
    3912             :  *
    3913             :  * Phase 1 does the updates and insertions to the chunk btree because if we had
    3914             :  * it done in phase 2 and have a thundering herd of tasks allocating chunks in
    3915             :  * parallel, we risk having too many system chunks allocated by many tasks if
    3916             :  * many tasks reach phase 1 without the previous ones completing phase 2. In the
    3917             :  * extreme case this leads to exhaustion of the system chunk array in the
    3918             :  * superblock. This is easier to trigger if using a btree node/leaf size of 64K
    3919             :  * and with RAID filesystems (so we have more device items in the chunk btree).
    3920             :  * This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
    3921             :  * the system chunk array due to concurrent allocations") provides more details.
    3922             :  *
    3923             :  * Allocation of system chunks does not happen through this function. A task that
    3924             :  * needs to update the chunk btree (the only btree that uses system chunks), must
    3925             :  * preallocate chunk space by calling either check_system_chunk() or
    3926             :  * btrfs_reserve_chunk_metadata() - the former is used when allocating a data or
    3927             :  * metadata chunk or when removing a chunk, while the latter is used before doing
    3928             :  * a modification to the chunk btree - use cases for the latter are adding,
    3929             :  * removing and resizing a device as well as relocation of a system chunk.
    3930             :  * See the comment below for more details.
    3931             :  *
    3932             :  * The reservation of system space, done through check_system_chunk(), as well
    3933             :  * as all the updates and insertions into the chunk btree must be done while
    3934             :  * holding fs_info->chunk_mutex. This is important to guarantee that while COWing
    3935             :  * an extent buffer from the chunk btree we never trigger allocation of a new
    3936             :  * system chunk, which would result in a deadlock (trying to lock an extent
    3937             :  * buffer of the chunk btree twice: the first time before triggering the chunk
    3938             :  * allocation and the second time during chunk allocation while attempting to
    3939             :  * update the chunk btree). The system chunk array is also updated while holding
    3940             :  * that mutex. The same logic applies to removing chunks - we must reserve system
    3941             :  * space, update the chunk btree and the system chunk array in the superblock
    3942             :  * while holding fs_info->chunk_mutex.
    3943             :  *
    3944             :  * This function, btrfs_chunk_alloc(), belongs to phase 1.
    3945             :  *
    3946             :  * If @force is CHUNK_ALLOC_FORCE:
    3947             :  *    - return 1 if it successfully allocates a chunk,
    3948             :  *    - return errors including -ENOSPC otherwise.
    3949             :  * If @force is NOT CHUNK_ALLOC_FORCE:
    3950             :  *    - return 0 if it doesn't need to allocate a new chunk,
    3951             :  *    - return 1 if it successfully allocates a chunk,
    3952             :  *    - return errors including -ENOSPC otherwise.
    3953             :  */
    3954      272872 : int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
    3955             :                       enum btrfs_chunk_alloc_enum force)
    3956             : {
    3957      272872 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    3958      272872 :         struct btrfs_space_info *space_info;
    3959      272872 :         struct btrfs_block_group *ret_bg;
    3960      272872 :         bool wait_for_alloc = false;
    3961      272872 :         bool should_alloc = false;
    3962      272872 :         bool from_extent_allocation = false;
    3963      272872 :         int ret = 0;
    3964             : 
    3965      272872 :         if (force == CHUNK_ALLOC_FORCE_FOR_EXTENT) {
    3966       54405 :                 from_extent_allocation = true;
    3967       54405 :                 force = CHUNK_ALLOC_FORCE;
    3968             :         }
    3969             : 
    3970             :         /* Don't re-enter if we're already allocating a chunk */
    3971      272872 :         if (trans->allocating_chunk)
    3972             :                 return -ENOSPC;
    3973             :         /*
    3974             :          * Allocation of system chunks can not happen through this path, as we
    3975             :          * could end up in a deadlock if we are allocating a data or metadata
    3976             :          * chunk and there is another task modifying the chunk btree.
    3977             :          *
    3978             :          * This is because while we are holding the chunk mutex, we will attempt
    3979             :          * to add the new chunk item to the chunk btree or update an existing
    3980             :          * device item in the chunk btree, while the other task that is modifying
    3981             :          * the chunk btree is attempting to COW an extent buffer while holding a
    3982             :          * lock on it and on its parent - if the COW operation triggers a system
    3983             :          * chunk allocation, then we can deadlock because we are holding the
    3984             :          * chunk mutex and we may need to access that extent buffer or its parent
    3985             :          * in order to add the chunk item or update a device item.
    3986             :          *
    3987             :          * Tasks that want to modify the chunk tree should reserve system space
    3988             :          * before updating the chunk btree, by calling either
    3989             :          * btrfs_reserve_chunk_metadata() or check_system_chunk().
    3990             :          * It's possible that after a task reserves the space, it still ends up
    3991             :          * here - this happens in the cases described above at do_chunk_alloc().
    3992             :          * The task will have to either retry or fail.
    3993             :          */
    3994      272872 :         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
    3995             :                 return -ENOSPC;
    3996             : 
    3997      272872 :         space_info = btrfs_find_space_info(fs_info, flags);
    3998      272877 :         ASSERT(space_info);
    3999             : 
    4000      272877 :         do {
    4001      272877 :                 spin_lock(&space_info->lock);
    4002      272878 :                 if (force < space_info->force_alloc)
    4003             :                         force = space_info->force_alloc;
    4004      272878 :                 should_alloc = should_alloc_chunk(fs_info, space_info, force);
    4005      272878 :                 if (space_info->full) {
    4006             :                         /* No more free physical space */
    4007      252191 :                         if (should_alloc)
    4008             :                                 ret = -ENOSPC;
    4009             :                         else
    4010        8732 :                                 ret = 0;
    4011      252191 :                         spin_unlock(&space_info->lock);
    4012      252191 :                         return ret;
    4013       20687 :                 } else if (!should_alloc) {
    4014       19254 :                         spin_unlock(&space_info->lock);
    4015       19254 :                         return 0;
    4016        1433 :                 } else if (space_info->chunk_alloc) {
    4017             :                         /*
    4018             :                          * Someone is already allocating, so we need to block
    4019             :                          * until this someone is finished and then loop to
    4020             :                          * recheck if we should continue with our allocation
    4021             :                          * attempt.
    4022             :                          */
    4023           6 :                         wait_for_alloc = true;
    4024           6 :                         force = CHUNK_ALLOC_NO_FORCE;
    4025           6 :                         spin_unlock(&space_info->lock);
    4026           6 :                         mutex_lock(&fs_info->chunk_mutex);
    4027           6 :                         mutex_unlock(&fs_info->chunk_mutex);
    4028             :                 } else {
    4029             :                         /* Proceed with allocation */
    4030        1427 :                         space_info->chunk_alloc = 1;
    4031        1427 :                         wait_for_alloc = false;
    4032        1427 :                         spin_unlock(&space_info->lock);
    4033             :                 }
    4034             : 
    4035        1433 :                 cond_resched();
    4036        1433 :         } while (wait_for_alloc);
    4037             : 
    4038        1427 :         mutex_lock(&fs_info->chunk_mutex);
    4039        1427 :         trans->allocating_chunk = true;
    4040             : 
    4041             :         /*
    4042             :          * If we have mixed data/metadata chunks we want to make sure we keep
    4043             :          * allocating mixed chunks instead of individual chunks.
    4044             :          */
    4045        1427 :         if (btrfs_mixed_space_info(space_info))
    4046         146 :                 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
    4047             : 
    4048             :         /*
    4049             :          * if we're doing a data chunk, go ahead and make sure that
    4050             :          * we keep a reasonable number of metadata chunks allocated in the
    4051             :          * FS as well.
    4052             :          */
    4053        1427 :         if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
    4054           0 :                 fs_info->data_chunk_allocations++;
    4055           0 :                 if (!(fs_info->data_chunk_allocations %
    4056             :                       fs_info->metadata_ratio))
    4057           0 :                         force_metadata_allocation(fs_info);
    4058             :         }
    4059             : 
    4060        1427 :         ret_bg = do_chunk_alloc(trans, flags);
    4061        1427 :         trans->allocating_chunk = false;
    4062             : 
    4063        1427 :         if (IS_ERR(ret_bg)) {
    4064          50 :                 ret = PTR_ERR(ret_bg);
    4065             :         } else if (from_extent_allocation) {
    4066             :                 /*
    4067             :                  * New block group is likely to be used soon. Try to activate
    4068             :                  * it now. Failure is OK for now.
    4069             :                  */
    4070             :                 btrfs_zone_activate(ret_bg);
    4071             :         }
    4072             : 
    4073          50 :         if (!ret)
    4074        1377 :                 btrfs_put_block_group(ret_bg);
    4075             : 
    4076        1427 :         spin_lock(&space_info->lock);
    4077        1427 :         if (ret < 0) {
    4078          50 :                 if (ret == -ENOSPC)
    4079          50 :                         space_info->full = 1;
    4080             :                 else
    4081           0 :                         goto out;
    4082             :         } else {
    4083        1377 :                 ret = 1;
    4084        1377 :                 space_info->max_extent_size = 0;
    4085             :         }
    4086             : 
    4087        1427 :         space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
    4088        1427 : out:
    4089        1427 :         space_info->chunk_alloc = 0;
    4090        1427 :         spin_unlock(&space_info->lock);
    4091        1427 :         mutex_unlock(&fs_info->chunk_mutex);
    4092             : 
    4093        1427 :         return ret;
    4094             : }
    4095             : 
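/*
 * A minimal illustrative sketch (helper name hypothetical) of the return
 * convention documented above: 0 means no chunk was needed, 1 means a chunk
 * was allocated, and negative values (including -ENOSPC) are errors.
 */
static int example_maybe_alloc_data_chunk(struct btrfs_trans_handle *trans)
{
        u64 flags = btrfs_get_alloc_profile(trans->fs_info, BTRFS_BLOCK_GROUP_DATA);
        int ret;

        ret = btrfs_chunk_alloc(trans, flags, CHUNK_ALLOC_NO_FORCE);
        if (ret < 0)
                return ret;     /* includes -ENOSPC when the profile is full */

        return 0;               /* 0: nothing to do, 1: a new chunk exists */
}
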
    4096        2058 : static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
    4097             : {
    4098        2058 :         u64 num_dev;
    4099             : 
    4100        2058 :         num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;
    4101        2058 :         if (!num_dev)
    4102           0 :                 num_dev = fs_info->fs_devices->rw_devices;
    4103             : 
    4104        2058 :         return num_dev;
    4105             : }
    4106             : 
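/*
 * Example (illustrative): a RAID1 chunk has devs_max == 2 in btrfs_raid_array,
 * so two device items will need updating, while striped profiles such as
 * RAID0 report devs_max == 0 and fall back to the current number of rw
 * devices.
 */
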
    4107        2120 : static void reserve_chunk_space(struct btrfs_trans_handle *trans,
    4108             :                                 u64 bytes,
    4109             :                                 u64 type)
    4110             : {
    4111        2120 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    4112        2120 :         struct btrfs_space_info *info;
    4113        2120 :         u64 left;
    4114        2120 :         int ret = 0;
    4115             : 
    4116             :         /*
    4117             :          * The chunk mutex is needed because we can end up allocating a system
    4118             :          * chunk, and to make the reservation in the chunk block reserve race free.
    4119             :          */
    4120        2120 :         lockdep_assert_held(&fs_info->chunk_mutex);
    4121             : 
    4122        2120 :         info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
    4123        2120 :         spin_lock(&info->lock);
    4124        2120 :         left = info->total_bytes - btrfs_space_info_used(info, true);
    4125        2120 :         spin_unlock(&info->lock);
    4126             : 
    4127        2120 :         if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
    4128           0 :                 btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
    4129             :                            left, bytes, type);
    4130           0 :                 btrfs_dump_space_info(fs_info, info, 0, 0);
    4131             :         }
    4132             : 
    4133        2120 :         if (left < bytes) {
    4134          93 :                 u64 flags = btrfs_system_alloc_profile(fs_info);
    4135          93 :                 struct btrfs_block_group *bg;
    4136             : 
    4137             :                 /*
    4138             :                  * Ignore failure to create system chunk. We might end up not
    4139             :                  * needing it, as we might not need to COW all nodes/leafs from
    4140             :                  * the paths we visit in the chunk tree (they were already COWed
    4141             :                  * or created in the current transaction for example).
    4142             :                  */
    4143          93 :                 bg = btrfs_create_chunk(trans, flags);
    4144          93 :                 if (IS_ERR(bg)) {
    4145           0 :                         ret = PTR_ERR(bg);
    4146             :                 } else {
    4147             :                         /*
    4148             :                          * We have a new chunk. We also need to activate it for
    4149             :                          * zoned filesystem.
    4150             :                          */
    4151          93 :                         ret = btrfs_zoned_activate_one_bg(fs_info, info, true);
    4152          93 :                         if (ret < 0)
    4153             :                                 return;
    4154             : 
    4155             :                         /*
    4156             :                          * If we fail to add the chunk item here, we end up
    4157             :                          * trying again at phase 2 of chunk allocation, at
    4158             :                          * btrfs_create_pending_block_groups(). So ignore
    4159             :                          * any error here. An ENOSPC here could happen, due to
    4160             :                          * the cases described at do_chunk_alloc() - the system
    4161             :                          * block group we just created was just turned into RO
    4162             :                          * mode by a scrub for example, or a running discard
    4163             :                          * temporarily removed its free space entries, etc.
    4164             :                          */
    4165          93 :                         btrfs_chunk_alloc_add_chunk_item(trans, bg);
    4166             :                 }
    4167             :         }
    4168             : 
    4169          93 :         if (!ret) {
    4170        2120 :                 ret = btrfs_block_rsv_add(fs_info,
    4171             :                                           &fs_info->chunk_block_rsv,
    4172             :                                           bytes, BTRFS_RESERVE_NO_FLUSH);
    4173        2120 :                 if (!ret)
    4174        2120 :                         trans->chunk_bytes_reserved += bytes;
    4175             :         }
    4176             : }
    4177             : 
    4178             : /*
    4179             :  * Reserve space in the system space for allocating or removing a chunk.
    4180             :  * The caller must be holding fs_info->chunk_mutex.
    4181             :  */
    4182        2058 : void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
    4183             : {
    4184        2058 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    4185        2058 :         const u64 num_devs = get_profile_num_devs(fs_info, type);
    4186        2058 :         u64 bytes;
    4187             : 
    4188             :         /* num_devs device items to update and 1 chunk item to add or remove. */
    4189        2058 :         bytes = btrfs_calc_metadata_size(fs_info, num_devs) +
    4190             :                 btrfs_calc_insert_metadata_size(fs_info, 1);
    4191             : 
    4192        2058 :         reserve_chunk_space(trans, bytes, type);
    4193        2058 : }
    4194             : 
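/*
 * Worked example (illustrative, and assuming the usual definitions of
 * btrfs_calc_metadata_size() and btrfs_calc_insert_metadata_size(), i.e.
 * roughly nodesize * BTRFS_MAX_LEVEL per item and twice that per insertion):
 * with nodesize = 16K and a RAID1 profile (num_devs == 2), the reservation is
 * about 2 * 16K * 8 = 256K for the device item updates plus 16K * 8 * 2 = 256K
 * for the chunk item insertion, i.e. roughly 512K of system metadata space.
 */
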
    4195             : /*
    4196             :  * Reserve space in the system space, if needed, for doing a modification to the
    4197             :  * chunk btree.
    4198             :  *
    4199             :  * @trans:              A transaction handle.
    4200             :  * @is_item_insertion:  Indicate if the modification is for inserting a new item
    4201             :  *                      in the chunk btree or if it's for the deletion or update
    4202             :  *                      of an existing item.
    4203             :  *
    4204             :  * This is used in a context where we need to update the chunk btree outside
    4205             :  * block group allocation and removal, to avoid a deadlock with a concurrent
    4206             :  * task that is allocating a metadata or data block group and therefore needs to
    4207             :  * update the chunk btree while holding the chunk mutex. After the update to the
    4208             :  * chunk btree is done, btrfs_trans_release_chunk_metadata() should be called.
    4209             :  *
    4210             :  */
    4211          62 : void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
    4212             :                                   bool is_item_insertion)
    4213             : {
    4214          62 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    4215          62 :         u64 bytes;
    4216             : 
    4217          62 :         if (is_item_insertion)
    4218           0 :                 bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
    4219             :         else
    4220          62 :                 bytes = btrfs_calc_metadata_size(fs_info, 1);
    4221             : 
    4222          62 :         mutex_lock(&fs_info->chunk_mutex);
    4223          62 :         reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM);
    4224          62 :         mutex_unlock(&fs_info->chunk_mutex);
    4225          62 : }
    4226             : 
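/*
 * A minimal illustrative sketch (helper name hypothetical): per the comment
 * above, a chunk btree modification done outside block group allocation or
 * removal brackets the update with the reservation and its release.
 */
static void example_update_chunk_btree_item(struct btrfs_trans_handle *trans)
{
        btrfs_reserve_chunk_metadata(trans, false);
        /* ... update or delete an existing chunk btree item here ... */
        btrfs_trans_release_chunk_metadata(trans);
}
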
    4227        3217 : void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
    4228             : {
    4229        3217 :         struct btrfs_block_group *block_group;
    4230             : 
    4231        3217 :         block_group = btrfs_lookup_first_block_group(info, 0);
    4232       33427 :         while (block_group) {
    4233       30210 :                 btrfs_wait_block_group_cache_done(block_group);
    4234       30210 :                 spin_lock(&block_group->lock);
    4235       30210 :                 if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF,
    4236       30210 :                                        &block_group->runtime_flags)) {
    4237           6 :                         struct inode *inode = block_group->inode;
    4238             : 
    4239           6 :                         block_group->inode = NULL;
    4240           6 :                         spin_unlock(&block_group->lock);
    4241             : 
    4242           6 :                         ASSERT(block_group->io_ctl.inode == NULL);
    4243           6 :                         iput(inode);
    4244             :                 } else {
    4245       30204 :                         spin_unlock(&block_group->lock);
    4246             :                 }
    4247       30210 :                 block_group = btrfs_next_block_group(block_group);
    4248             :         }
    4249        3217 : }
    4250             : 
    4251             : /*
    4252             :  * Must be called only after stopping all workers, since we could have block
    4253             :  * group caching kthreads running, and therefore they could race with us if we
    4254             :  * freed the block groups before stopping them.
    4255             :  */
    4256        3218 : int btrfs_free_block_groups(struct btrfs_fs_info *info)
    4257             : {
    4258        3218 :         struct btrfs_block_group *block_group;
    4259        3218 :         struct btrfs_space_info *space_info;
    4260        3218 :         struct btrfs_caching_control *caching_ctl;
    4261        3218 :         struct rb_node *n;
    4262             : 
    4263        3218 :         write_lock(&info->block_group_cache_lock);
    4264        8451 :         while (!list_empty(&info->caching_block_groups)) {
    4265        5233 :                 caching_ctl = list_entry(info->caching_block_groups.next,
    4266             :                                          struct btrfs_caching_control, list);
    4267        5233 :                 list_del(&caching_ctl->list);
    4268        5233 :                 btrfs_put_caching_control(caching_ctl);
    4269             :         }
    4270        3218 :         write_unlock(&info->block_group_cache_lock);
    4271             : 
    4272        3218 :         spin_lock(&info->unused_bgs_lock);
    4273        3495 :         while (!list_empty(&info->unused_bgs)) {
    4274         277 :                 block_group = list_first_entry(&info->unused_bgs,
    4275             :                                                struct btrfs_block_group,
    4276             :                                                bg_list);
    4277         277 :                 list_del_init(&block_group->bg_list);
    4278         277 :                 btrfs_put_block_group(block_group);
    4279             :         }
    4280             : 
    4281        3218 :         while (!list_empty(&info->reclaim_bgs)) {
    4282           0 :                 block_group = list_first_entry(&info->reclaim_bgs,
    4283             :                                                struct btrfs_block_group,
    4284             :                                                bg_list);
    4285           0 :                 list_del_init(&block_group->bg_list);
    4286           0 :                 btrfs_put_block_group(block_group);
    4287             :         }
    4288        3218 :         spin_unlock(&info->unused_bgs_lock);
    4289             : 
    4290        3218 :         spin_lock(&info->zone_active_bgs_lock);
    4291        3218 :         while (!list_empty(&info->zone_active_bgs)) {
    4292           0 :                 block_group = list_first_entry(&info->zone_active_bgs,
    4293             :                                                struct btrfs_block_group,
    4294             :                                                active_bg_list);
    4295           0 :                 list_del_init(&block_group->active_bg_list);
    4296           0 :                 btrfs_put_block_group(block_group);
    4297             :         }
    4298        3218 :         spin_unlock(&info->zone_active_bgs_lock);
    4299             : 
    4300        3218 :         write_lock(&info->block_group_cache_lock);
    4301       33428 :         while ((n = rb_last(&info->block_group_cache_tree.rb_root)) != NULL) {
    4302       30210 :                 block_group = rb_entry(n, struct btrfs_block_group,
    4303             :                                        cache_node);
    4304       30210 :                 rb_erase_cached(&block_group->cache_node,
    4305             :                                 &info->block_group_cache_tree);
    4306       30210 :                 RB_CLEAR_NODE(&block_group->cache_node);
    4307       30210 :                 write_unlock(&info->block_group_cache_lock);
    4308             : 
    4309       30210 :                 down_write(&block_group->space_info->groups_sem);
    4310       30210 :                 list_del(&block_group->list);
    4311       30210 :                 up_write(&block_group->space_info->groups_sem);
    4312             : 
    4313             :                 /*
    4314             :                  * We haven't cached this block group, which means we could
    4315             :                  * possibly have excluded extents on this block group.
    4316             :                  */
    4317       30210 :                 if (block_group->cached == BTRFS_CACHE_NO ||
    4318             :                     block_group->cached == BTRFS_CACHE_ERROR)
    4319        4961 :                         btrfs_free_excluded_extents(block_group);
    4320             : 
    4321       30210 :                 btrfs_remove_free_space_cache(block_group);
    4322       30210 :                 ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
    4323       30210 :                 ASSERT(list_empty(&block_group->dirty_list));
    4324       30210 :                 ASSERT(list_empty(&block_group->io_list));
    4325       30210 :                 ASSERT(list_empty(&block_group->bg_list));
    4326       30210 :                 ASSERT(refcount_read(&block_group->refs) == 1);
    4327       30210 :                 ASSERT(block_group->swap_extents == 0);
    4328       30210 :                 btrfs_put_block_group(block_group);
    4329             : 
    4330       30210 :                 write_lock(&info->block_group_cache_lock);
    4331             :         }
    4332        3218 :         write_unlock(&info->block_group_cache_lock);
    4333             : 
    4334        3218 :         btrfs_release_global_block_rsv(info);
    4335             : 
    4336       12839 :         while (!list_empty(&info->space_info)) {
    4337        9621 :                 space_info = list_entry(info->space_info.next,
    4338             :                                         struct btrfs_space_info,
    4339             :                                         list);
    4340             : 
    4341             :                 /*
    4342             :                  * Do not hide this behind enospc_debug, this is actually
    4343             :                  * important and indicates a real bug if this happens.
    4344             :                  */
    4345       19242 :                 if (WARN_ON(space_info->bytes_pinned > 0 ||
    4346             :                             space_info->bytes_may_use > 0))
    4347           0 :                         btrfs_dump_space_info(info, space_info, 0, 0);
    4348             : 
    4349             :                 /*
    4350             :                  * If there was a failure to cleanup a log tree, very likely due
    4351             :                  * to an IO failure on a writeback attempt of one or more of its
    4352             :                  * extent buffers, we could not do proper (and cheap) unaccounting
    4353             :                  * of their reserved space, so don't warn on bytes_reserved > 0 in
    4354             :                  * that case.
    4355             :                  */
    4356        9621 :                 if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) ||
    4357        3217 :                     !BTRFS_FS_LOG_CLEANUP_ERROR(info)) {
    4358        9620 :                         if (WARN_ON(space_info->bytes_reserved > 0))
    4359           0 :                                 btrfs_dump_space_info(info, space_info, 0, 0);
    4360             :                 }
    4361             : 
    4362        9621 :                 WARN_ON(space_info->reclaim_size > 0);
    4363        9621 :                 list_del(&space_info->list);
    4364        9621 :                 btrfs_sysfs_remove_space_info(space_info);
    4365             :         }
    4366        3218 :         return 0;
    4367             : }
    4368             : 
    4369        5071 : void btrfs_freeze_block_group(struct btrfs_block_group *cache)
    4370             : {
    4371        5071 :         atomic_inc(&cache->frozen);
    4372           0 : }
    4373             : 
    4374        5071 : void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
    4375             : {
    4376        5071 :         struct btrfs_fs_info *fs_info = block_group->fs_info;
    4377        5071 :         struct extent_map_tree *em_tree;
    4378        5071 :         struct extent_map *em;
    4379        5071 :         bool cleanup;
    4380             : 
    4381        5071 :         spin_lock(&block_group->lock);
    4382        9702 :         cleanup = (atomic_dec_and_test(&block_group->frozen) &&
    4383        4631 :                    test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags));
    4384        5071 :         spin_unlock(&block_group->lock);
    4385             : 
    4386        5071 :         if (cleanup) {
    4387           0 :                 em_tree = &fs_info->mapping_tree;
    4388           0 :                 write_lock(&em_tree->lock);
    4389           0 :                 em = lookup_extent_mapping(em_tree, block_group->start,
    4390             :                                            1);
    4391           0 :                 BUG_ON(!em); /* logic error, can't happen */
    4392           0 :                 remove_extent_mapping(em_tree, em);
    4393           0 :                 write_unlock(&em_tree->lock);
    4394             : 
    4395             :                 /* once for us and once for the tree */
    4396           0 :                 free_extent_map(em);
    4397           0 :                 free_extent_map(em);
    4398             : 
    4399             :                 /*
    4400             :                  * We may have left one free space entry, and other tasks
    4401             :                  * trimming this block group may have each left one entry too.
    4402             :                  * Free them if any.
    4403             :                  */
    4404           0 :                 btrfs_remove_free_space_cache(block_group);
    4405             :         }
    4406        5071 : }
    4407             : 
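/*
 * A minimal illustrative sketch (helper name hypothetical): freeze/unfreeze
 * bracket work that must not see the block group's chunk mapping go away,
 * such as trimming.  The final unfreeze of an already removed group performs
 * the deferred cleanup shown in btrfs_unfreeze_block_group() above.
 */
static void example_do_frozen_work(struct btrfs_block_group *bg)
{
        btrfs_freeze_block_group(bg);
        /* ... work that relies on the block group's extent mapping ... */
        btrfs_unfreeze_block_group(bg);
}
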
    4408         553 : bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg)
    4409             : {
    4410         553 :         bool ret = true;
    4411             : 
    4412         553 :         spin_lock(&bg->lock);
    4413         553 :         if (bg->ro)
    4414             :                 ret = false;
    4415             :         else
    4416         553 :                 bg->swap_extents++;
    4417         553 :         spin_unlock(&bg->lock);
    4418             : 
    4419         553 :         return ret;
    4420             : }
    4421             : 
    4422          40 : void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount)
    4423             : {
    4424          40 :         spin_lock(&bg->lock);
    4425          40 :         ASSERT(!bg->ro);
    4426          40 :         ASSERT(bg->swap_extents >= amount);
    4427          40 :         bg->swap_extents -= amount;
    4428          40 :         spin_unlock(&bg->lock);
    4429          40 : }
    4430             : 
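/*
 * A minimal illustrative sketch (helper name hypothetical): swapfile
 * activation pins a block group by bumping its swap extent count, which fails
 * if the group is already read only; deactivation drops the count again.
 */
static bool example_swap_pin_then_unpin(struct btrfs_block_group *bg)
{
        if (!btrfs_inc_block_group_swap_extents(bg))
                return false;   /* group is read only, pick another one */

        /* ... the extent is used as part of an active swapfile ... */

        btrfs_dec_block_group_swap_extents(bg, 1);
        return true;
}
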
    4431    13274440 : enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size)
    4432             : {
    4433    16986623 :         if (size <= SZ_128K)
    4434             :                 return BTRFS_BG_SZ_SMALL;
    4435      155608 :         if (size <= SZ_8M)
    4436      143089 :                 return BTRFS_BG_SZ_MEDIUM;
    4437             :         return BTRFS_BG_SZ_LARGE;
    4438             : }
    4439             : 
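/*
 * Example (illustrative): the thresholds above classify allocation sizes as
 *
 *     btrfs_calc_block_group_size_class(SZ_64K);   -> BTRFS_BG_SZ_SMALL
 *     btrfs_calc_block_group_size_class(SZ_1M);    -> BTRFS_BG_SZ_MEDIUM
 *     btrfs_calc_block_group_size_class(SZ_16M);   -> BTRFS_BG_SZ_LARGE
 *
 * i.e. up to 128K is small, up to 8M is medium and anything larger is large.
 */
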
    4440             : /*
    4441             :  * Handle a block group allocating an extent in a size class
    4442             :  *
    4443             :  * @bg:                         The block group we allocated in.
    4444             :  * @size_class:                 The size class of the allocation.
    4445             :  * @force_wrong_size_class:     Whether we are desperate enough to allow
    4446             :  *                              mismatched size classes.
    4447             :  *
    4448             :  * Returns: 0 if the size class was valid for this block_group, -EAGAIN in the
    4449             :  * case of a race that leads to the wrong size class without
    4450             :  * force_wrong_size_class set.
    4451             :  *
    4452             :  * find_free_extent will skip block groups with a mismatched size class until
    4453             :  * it really needs to avoid ENOSPC. In that case it will set
    4454             :  * force_wrong_size_class. However, if a block group is newly allocated and
    4455             :  * doesn't yet have a size class, then it is possible for two allocations of
    4456             :  * different sizes to race and both try to use it. The loser is caught here and
    4457             :  * has to retry.
    4458             :  */
    4459           0 : int btrfs_use_block_group_size_class(struct btrfs_block_group *bg,
    4460             :                                      enum btrfs_block_group_size_class size_class,
    4461             :                                      bool force_wrong_size_class)
    4462             : {
    4463     3707031 :         ASSERT(size_class != BTRFS_BG_SZ_NONE);
    4464             : 
    4465             :         /* The new allocation is in the right size class, do nothing */
    4466     3707031 :         if (bg->size_class == size_class)
    4467             :                 return 0;
    4468             :         /*
    4469             :          * The new allocation is in a mismatched size class.
    4470             :          * This means one of two things:
    4471             :          *
    4472             :          * 1. Two tasks in find_free_extent for different size_classes raced
    4473             :          *    and hit the same empty block_group. Make the loser try again.
    4474             :          * 2. A call to find_free_extent got desperate enough to set
    4475             :  *    'force_wrong_size_class'. Don't change the size_class, but allow the
    4476             :          *    allocation.
    4477             :          */
    4478       50514 :         if (bg->size_class != BTRFS_BG_SZ_NONE) {
    4479       48937 :                 if (force_wrong_size_class)
    4480             :                         return 0;
    4481           0 :                 return -EAGAIN;
    4482             :         }
    4483             :         /*
    4484             :          * The happy new block group case: the new allocation is the first
    4485             :          * one in the block_group so we set size_class.
    4486             :          */
    4487        1577 :         bg->size_class = size_class;
    4488             : 
    4489        1577 :         return 0;
    4490             : }
    4491             : 
    4492    22812344 : bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg)
    4493             : {
    4494    36080161 :         if (btrfs_is_zoned(bg->fs_info))
    4495             :                 return false;
    4496    36080161 :         if (!btrfs_is_block_group_data_only(bg))
    4497    11113092 :                 return false;
    4498             :         return true;
    4499             : }

Generated by: LCOV version 1.14