LCOV - code coverage report
Current view: top level - fs/btrfs - block-group.c (source / functions)
Test: fstests of 6.5.0-rc3-djwa @ Mon Jul 31 20:08:17 PDT 2023
Date: 2023-07-31 20:08:17

             Hit   Total   Coverage
Lines:         0    2113     0.0 %
Functions:     0      83     0.0 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0
       2             : 
       3             : #include <linux/sizes.h>
       4             : #include <linux/list_sort.h>
       5             : #include "misc.h"
       6             : #include "ctree.h"
       7             : #include "block-group.h"
       8             : #include "space-info.h"
       9             : #include "disk-io.h"
      10             : #include "free-space-cache.h"
      11             : #include "free-space-tree.h"
      12             : #include "volumes.h"
      13             : #include "transaction.h"
      14             : #include "ref-verify.h"
      15             : #include "sysfs.h"
      16             : #include "tree-log.h"
      17             : #include "delalloc-space.h"
      18             : #include "discard.h"
      19             : #include "raid56.h"
      20             : #include "zoned.h"
      21             : #include "fs.h"
      22             : #include "accessors.h"
      23             : #include "extent-tree.h"
      24             : 
      25             : #ifdef CONFIG_BTRFS_DEBUG
      26             : int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group)
      27             : {
      28             :         struct btrfs_fs_info *fs_info = block_group->fs_info;
      29             : 
      30             :         return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) &&
      31             :                 block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
      32             :                (btrfs_test_opt(fs_info, FRAGMENT_DATA) &&
       33             :                 block_group->flags & BTRFS_BLOCK_GROUP_DATA);
      34             : }
      35             : #endif
      36             : 
      37             : /*
      38             :  * Return target flags in extended format or 0 if restripe for this chunk_type
      39             :  * is not in progress
      40             :  *
      41             :  * Should be called with balance_lock held
      42             :  */
      43           0 : static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
      44             : {
      45           0 :         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
      46           0 :         u64 target = 0;
      47             : 
      48           0 :         if (!bctl)
      49             :                 return 0;
      50             : 
      51           0 :         if (flags & BTRFS_BLOCK_GROUP_DATA &&
      52           0 :             bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
      53           0 :                 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
      54           0 :         } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
      55           0 :                    bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
      56           0 :                 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
      57           0 :         } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
      58           0 :                    bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
      59           0 :                 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
      60             :         }
      61             : 
      62             :         return target;
      63             : }
      64             : 
      65             : /*
      66             :  * @flags: available profiles in extended format (see ctree.h)
      67             :  *
       68             :  * Return the reduced profile in chunk format.  If a profile change is in
       69             :  * progress (either running or paused), pick the target profile (if it's
       70             :  * already available); otherwise fall back to plain reducing.
      71             :  */
      72           0 : static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
      73             : {
      74           0 :         u64 num_devices = fs_info->fs_devices->rw_devices;
      75           0 :         u64 target;
      76           0 :         u64 raid_type;
      77           0 :         u64 allowed = 0;
      78             : 
      79             :         /*
       80             :          * See if restripe for this chunk_type is in progress; if so, try to
       81             :          * reduce to the target profile.
      82             :          */
      83           0 :         spin_lock(&fs_info->balance_lock);
      84           0 :         target = get_restripe_target(fs_info, flags);
      85           0 :         if (target) {
      86           0 :                 spin_unlock(&fs_info->balance_lock);
      87           0 :                 return extended_to_chunk(target);
      88             :         }
      89           0 :         spin_unlock(&fs_info->balance_lock);
      90             : 
      91             :         /* First, mask out the RAID levels which aren't possible */
      92           0 :         for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
      93           0 :                 if (num_devices >= btrfs_raid_array[raid_type].devs_min)
      94           0 :                         allowed |= btrfs_raid_array[raid_type].bg_flag;
      95             :         }
      96           0 :         allowed &= flags;
      97             : 
      98             :         /* Select the highest-redundancy RAID level. */
      99           0 :         if (allowed & BTRFS_BLOCK_GROUP_RAID1C4)
     100             :                 allowed = BTRFS_BLOCK_GROUP_RAID1C4;
     101           0 :         else if (allowed & BTRFS_BLOCK_GROUP_RAID6)
     102             :                 allowed = BTRFS_BLOCK_GROUP_RAID6;
     103           0 :         else if (allowed & BTRFS_BLOCK_GROUP_RAID1C3)
     104             :                 allowed = BTRFS_BLOCK_GROUP_RAID1C3;
     105           0 :         else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
     106             :                 allowed = BTRFS_BLOCK_GROUP_RAID5;
     107           0 :         else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
     108             :                 allowed = BTRFS_BLOCK_GROUP_RAID10;
     109           0 :         else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
     110             :                 allowed = BTRFS_BLOCK_GROUP_RAID1;
     111           0 :         else if (allowed & BTRFS_BLOCK_GROUP_DUP)
     112             :                 allowed = BTRFS_BLOCK_GROUP_DUP;
     113           0 :         else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
     114           0 :                 allowed = BTRFS_BLOCK_GROUP_RAID0;
     115             : 
     116           0 :         flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
     117             : 
     118           0 :         return extended_to_chunk(flags | allowed);
     119             : }
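
The if/else ladder above is effectively a priority list: among the profiles that survive the device-count and requested-flags masking, the most redundant one wins. A minimal standalone sketch of that selection follows; the flag values are made up for the example and are not the real BTRFS_BLOCK_GROUP_* bits.

#include <stdint.h>
#include <stdio.h>

/* Illustrative profile bits only; the real BTRFS_BLOCK_GROUP_* values differ. */
#define BG_RAID1C4  (1ULL << 0)
#define BG_RAID6    (1ULL << 1)
#define BG_RAID1C3  (1ULL << 2)
#define BG_RAID5    (1ULL << 3)
#define BG_RAID10   (1ULL << 4)
#define BG_RAID1    (1ULL << 5)
#define BG_DUP      (1ULL << 6)
#define BG_RAID0    (1ULL << 7)

/* Pick the highest-redundancy profile bit still present in @allowed,
 * mirroring the if/else chain in btrfs_reduce_alloc_profile(). */
static uint64_t pick_profile(uint64_t allowed)
{
        static const uint64_t priority[] = {
                BG_RAID1C4, BG_RAID6, BG_RAID1C3, BG_RAID5,
                BG_RAID10, BG_RAID1, BG_DUP, BG_RAID0,
        };

        for (size_t i = 0; i < sizeof(priority) / sizeof(priority[0]); i++)
                if (allowed & priority[i])
                        return priority[i];
        return 0;       /* no profile bit left: plain single copies */
}

int main(void)
{
        /* RAID1 and RAID0 are both possible: RAID1 wins. */
        printf("picked 0x%llx\n",
               (unsigned long long)pick_profile(BG_RAID1 | BG_RAID0));
        return 0;
}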
     120             : 
     121           0 : u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
     122             : {
     123           0 :         unsigned seq;
     124           0 :         u64 flags;
     125             : 
     126           0 :         do {
     127           0 :                 flags = orig_flags;
     128           0 :                 seq = read_seqbegin(&fs_info->profiles_lock);
     129             : 
     130           0 :                 if (flags & BTRFS_BLOCK_GROUP_DATA)
     131           0 :                         flags |= fs_info->avail_data_alloc_bits;
     132           0 :                 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
     133           0 :                         flags |= fs_info->avail_system_alloc_bits;
     134           0 :                 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
     135           0 :                         flags |= fs_info->avail_metadata_alloc_bits;
     136           0 :         } while (read_seqretry(&fs_info->profiles_lock, seq));
     137             : 
     138           0 :         return btrfs_reduce_alloc_profile(fs_info, flags);
     139             : }
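
btrfs_get_alloc_profile() uses the standard seqlock read pattern: snapshot the data, then retry if a writer raced with the read. A rough userspace analogue of that retry loop is sketched below; the counter and field names are illustrative, and the real seqlock_t adds the memory barriers and annotated accesses that this sketch glosses over.

#include <stdatomic.h>
#include <stdint.h>

/* Illustrative stand-ins for fs_info->profiles_lock and the avail_*_alloc_bits. */
static atomic_uint seq;          /* even = stable, odd = writer in progress */
static uint64_t avail_bits;      /* data guarded by the sequence counter    */

static uint64_t read_avail_bits(void)
{
        unsigned int start;
        uint64_t val;

        do {
                /* Wait until no writer is active, then snapshot. */
                do {
                        start = atomic_load(&seq);
                } while (start & 1);
                val = avail_bits;
                /* If the counter moved, a writer ran concurrently: retry. */
        } while (atomic_load(&seq) != start);

        return val;
}

static void write_avail_bits(uint64_t new_bits)
{
        atomic_fetch_add(&seq, 1);      /* odd: readers will retry */
        avail_bits = new_bits;
        atomic_fetch_add(&seq, 1);      /* even again: snapshot is stable */
}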
     140             : 
     141           0 : void btrfs_get_block_group(struct btrfs_block_group *cache)
     142             : {
     143           0 :         refcount_inc(&cache->refs);
     144           0 : }
     145             : 
     146           0 : void btrfs_put_block_group(struct btrfs_block_group *cache)
     147             : {
     148           0 :         if (refcount_dec_and_test(&cache->refs)) {
     149           0 :                 WARN_ON(cache->pinned > 0);
     150             :                 /*
     151             :                  * If there was a failure to cleanup a log tree, very likely due
     152             :                  * to an IO failure on a writeback attempt of one or more of its
     153             :                  * extent buffers, we could not do proper (and cheap) unaccounting
     154             :                  * of their reserved space, so don't warn on reserved > 0 in that
     155             :                  * case.
     156             :                  */
     157           0 :                 if (!(cache->flags & BTRFS_BLOCK_GROUP_METADATA) ||
     158           0 :                     !BTRFS_FS_LOG_CLEANUP_ERROR(cache->fs_info))
     159           0 :                         WARN_ON(cache->reserved > 0);
     160             : 
     161             :                 /*
     162             :                  * A block_group shouldn't be on the discard_list anymore.
     163             :                  * Remove the block_group from the discard_list to prevent us
     164             :                  * from causing a panic due to NULL pointer dereference.
     165             :                  */
     166           0 :                 if (WARN_ON(!list_empty(&cache->discard_list)))
     167           0 :                         btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
     168             :                                                   cache);
     169             : 
     170           0 :                 kfree(cache->free_space_ctl);
     171           0 :                 kfree(cache->physical_map);
     172           0 :                 kfree(cache);
     173             :         }
     174           0 : }
     175             : 
     176             : /*
     177             :  * This adds the block group to the fs_info rb tree for the block group cache
     178             :  */
     179           0 : static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
     180             :                                        struct btrfs_block_group *block_group)
     181             : {
     182           0 :         struct rb_node **p;
     183           0 :         struct rb_node *parent = NULL;
     184           0 :         struct btrfs_block_group *cache;
     185           0 :         bool leftmost = true;
     186             : 
     187           0 :         ASSERT(block_group->length != 0);
     188             : 
     189           0 :         write_lock(&info->block_group_cache_lock);
     190           0 :         p = &info->block_group_cache_tree.rb_root.rb_node;
     191             : 
     192           0 :         while (*p) {
     193           0 :                 parent = *p;
     194           0 :                 cache = rb_entry(parent, struct btrfs_block_group, cache_node);
     195           0 :                 if (block_group->start < cache->start) {
     196           0 :                         p = &(*p)->rb_left;
     197           0 :                 } else if (block_group->start > cache->start) {
     198           0 :                         p = &(*p)->rb_right;
     199           0 :                         leftmost = false;
     200             :                 } else {
     201           0 :                         write_unlock(&info->block_group_cache_lock);
     202           0 :                         return -EEXIST;
     203             :                 }
     204             :         }
     205             : 
     206           0 :         rb_link_node(&block_group->cache_node, parent, p);
     207           0 :         rb_insert_color_cached(&block_group->cache_node,
     208             :                                &info->block_group_cache_tree, leftmost);
     209             : 
     210           0 :         write_unlock(&info->block_group_cache_lock);
     211             : 
     212           0 :         return 0;
     213             : }
     214             : 
     215             : /*
     216             :  * This will return the block group at or after bytenr if contains is 0, else
     217             :  * it will return the block group that contains the bytenr
     218             :  */
     219           0 : static struct btrfs_block_group *block_group_cache_tree_search(
     220             :                 struct btrfs_fs_info *info, u64 bytenr, int contains)
     221             : {
     222           0 :         struct btrfs_block_group *cache, *ret = NULL;
     223           0 :         struct rb_node *n;
     224           0 :         u64 end, start;
     225             : 
     226           0 :         read_lock(&info->block_group_cache_lock);
     227           0 :         n = info->block_group_cache_tree.rb_root.rb_node;
     228             : 
     229           0 :         while (n) {
     230           0 :                 cache = rb_entry(n, struct btrfs_block_group, cache_node);
     231           0 :                 end = cache->start + cache->length - 1;
     232           0 :                 start = cache->start;
     233             : 
     234           0 :                 if (bytenr < start) {
     235           0 :                         if (!contains && (!ret || start < ret->start))
     236           0 :                                 ret = cache;
     237           0 :                         n = n->rb_left;
     238           0 :                 } else if (bytenr > start) {
     239           0 :                         if (contains && bytenr <= end) {
     240             :                                 ret = cache;
     241             :                                 break;
     242             :                         }
     243           0 :                         n = n->rb_right;
     244             :                 } else {
     245             :                         ret = cache;
     246             :                         break;
     247             :                 }
     248             :         }
     249           0 :         if (ret)
     250           0 :                 btrfs_get_block_group(ret);
     251           0 :         read_unlock(&info->block_group_cache_lock);
     252             : 
     253           0 :         return ret;
     254             : }
     255             : 
     256             : /*
     257             :  * Return the block group that starts at or after bytenr
     258             :  */
     259           0 : struct btrfs_block_group *btrfs_lookup_first_block_group(
     260             :                 struct btrfs_fs_info *info, u64 bytenr)
     261             : {
     262           0 :         return block_group_cache_tree_search(info, bytenr, 0);
     263             : }
     264             : 
     265             : /*
     266             :  * Return the block group that contains the given bytenr
     267             :  */
     268           0 : struct btrfs_block_group *btrfs_lookup_block_group(
     269             :                 struct btrfs_fs_info *info, u64 bytenr)
     270             : {
     271           0 :         return block_group_cache_tree_search(info, bytenr, 1);
     272             : }
     273             : 
     274           0 : struct btrfs_block_group *btrfs_next_block_group(
     275             :                 struct btrfs_block_group *cache)
     276             : {
     277           0 :         struct btrfs_fs_info *fs_info = cache->fs_info;
     278           0 :         struct rb_node *node;
     279             : 
     280           0 :         read_lock(&fs_info->block_group_cache_lock);
     281             : 
     282             :         /* If our block group was removed, we need a full search. */
     283           0 :         if (RB_EMPTY_NODE(&cache->cache_node)) {
     284           0 :                 const u64 next_bytenr = cache->start + cache->length;
     285             : 
     286           0 :                 read_unlock(&fs_info->block_group_cache_lock);
     287           0 :                 btrfs_put_block_group(cache);
     288           0 :                 return btrfs_lookup_first_block_group(fs_info, next_bytenr);
     289             :         }
     290           0 :         node = rb_next(&cache->cache_node);
     291           0 :         btrfs_put_block_group(cache);
     292           0 :         if (node) {
     293           0 :                 cache = rb_entry(node, struct btrfs_block_group, cache_node);
     294           0 :                 btrfs_get_block_group(cache);
     295             :         } else
     296             :                 cache = NULL;
     297           0 :         read_unlock(&fs_info->block_group_cache_lock);
     298           0 :         return cache;
     299             : }
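
Taken together, btrfs_lookup_first_block_group() and btrfs_next_block_group() support the usual "walk every block group" pattern. Below is a hedged caller sketch, not a copy of any in-tree loop; note that btrfs_next_block_group() drops the reference on the group it was handed, so a loop that runs to completion has nothing left to put, while a loop that breaks early must call btrfs_put_block_group() on the group it stopped at.

/* Sketch only: visit every block group in the filesystem. */
static void walk_all_block_groups(struct btrfs_fs_info *fs_info)
{
        struct btrfs_block_group *bg;

        for (bg = btrfs_lookup_first_block_group(fs_info, 0);
             bg;
             bg = btrfs_next_block_group(bg)) {
                /* ... inspect bg->start, bg->length, bg->flags ... */
        }
        /*
         * btrfs_next_block_group() released each previous reference and
         * returned NULL at the end, so nothing is left to put here.
         */
}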
     300             : 
     301             : /*
     302             :  * Check if we can do a NOCOW write for a given extent.
     303             :  *
     304             :  * @fs_info:       The filesystem information object.
     305             :  * @bytenr:        Logical start address of the extent.
     306             :  *
      307             :  * Check if we can do a NOCOW write for the given extent, and increment the
     308             :  * number of NOCOW writers in the block group that contains the extent, as long
     309             :  * as the block group exists and it's currently not in read-only mode.
     310             :  *
      311             :  * Returns: A non-NULL block group pointer if we can do a NOCOW write; the caller
     312             :  *          is responsible for calling btrfs_dec_nocow_writers() later.
     313             :  *
      314             :  *          Or NULL if we cannot do a NOCOW write.
     315             :  */
     316           0 : struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
     317             :                                                   u64 bytenr)
     318             : {
     319           0 :         struct btrfs_block_group *bg;
     320           0 :         bool can_nocow = true;
     321             : 
     322           0 :         bg = btrfs_lookup_block_group(fs_info, bytenr);
     323           0 :         if (!bg)
     324             :                 return NULL;
     325             : 
     326           0 :         spin_lock(&bg->lock);
     327           0 :         if (bg->ro)
     328             :                 can_nocow = false;
     329             :         else
     330           0 :                 atomic_inc(&bg->nocow_writers);
     331           0 :         spin_unlock(&bg->lock);
     332             : 
     333           0 :         if (!can_nocow) {
     334           0 :                 btrfs_put_block_group(bg);
     335           0 :                 return NULL;
     336             :         }
     337             : 
     338             :         /* No put on block group, done by btrfs_dec_nocow_writers(). */
     339             :         return bg;
     340             : }
     341             : 
     342             : /*
     343             :  * Decrement the number of NOCOW writers in a block group.
     344             :  *
     345             :  * This is meant to be called after a previous call to btrfs_inc_nocow_writers(),
     346             :  * and on the block group returned by that call. Typically this is called after
     347             :  * creating an ordered extent for a NOCOW write, to prevent races with scrub and
     348             :  * relocation.
     349             :  *
      350             :  * After this call, the caller should not use the block group anymore. If it wants
     351             :  * to use it, then it should get a reference on it before calling this function.
     352             :  */
     353           0 : void btrfs_dec_nocow_writers(struct btrfs_block_group *bg)
     354             : {
     355           0 :         if (atomic_dec_and_test(&bg->nocow_writers))
     356           0 :                 wake_up_var(&bg->nocow_writers);
     357             : 
     358             :         /* For the lookup done by a previous call to btrfs_inc_nocow_writers(). */
     359           0 :         btrfs_put_block_group(bg);
     360           0 : }
     361             : 
     362           0 : void btrfs_wait_nocow_writers(struct btrfs_block_group *bg)
     363             : {
     364           0 :         wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
     365           0 : }
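
Based on the comments above, a NOCOW write path is expected to bracket its work with this pair. The sketch below is illustrative; the helper name, the error code and the "do the write" step are placeholders, not actual kernel code.

/* Sketch only: hold the block group's NOCOW writer count while the
 * ordered extent for a NOCOW write is being set up. */
static int try_nocow_write(struct btrfs_fs_info *fs_info, u64 bytenr)
{
        struct btrfs_block_group *bg;

        bg = btrfs_inc_nocow_writers(fs_info, bytenr);
        if (!bg)
                return -EAGAIN; /* group missing or read-only: fall back to COW */

        /* ... create the ordered extent / submit the NOCOW write ... */

        /* Drops both the writer count and the lookup reference. */
        btrfs_dec_nocow_writers(bg);
        return 0;
}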
     366             : 
     367           0 : void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
     368             :                                         const u64 start)
     369             : {
     370           0 :         struct btrfs_block_group *bg;
     371             : 
     372           0 :         bg = btrfs_lookup_block_group(fs_info, start);
     373           0 :         ASSERT(bg);
     374           0 :         if (atomic_dec_and_test(&bg->reservations))
     375           0 :                 wake_up_var(&bg->reservations);
     376           0 :         btrfs_put_block_group(bg);
     377           0 : }
     378             : 
     379           0 : void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg)
     380             : {
     381           0 :         struct btrfs_space_info *space_info = bg->space_info;
     382             : 
     383           0 :         ASSERT(bg->ro);
     384             : 
     385           0 :         if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
     386             :                 return;
     387             : 
     388             :         /*
     389             :          * Our block group is read only but before we set it to read only,
      390             :          * some task might have allocated an extent from it already, but it
     391             :          * has not yet created a respective ordered extent (and added it to a
     392             :          * root's list of ordered extents).
     393             :          * Therefore wait for any task currently allocating extents, since the
     394             :          * block group's reservations counter is incremented while a read lock
     395             :          * on the groups' semaphore is held and decremented after releasing
     396             :          * the read access on that semaphore and creating the ordered extent.
     397             :          */
     398           0 :         down_write(&space_info->groups_sem);
     399           0 :         up_write(&space_info->groups_sem);
     400             : 
     401           0 :         wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
     402             : }
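
The empty down_write()/up_write() pair above is the classic "wait for existing readers" barrier: taking the write side blocks until every task currently holding the read side of groups_sem releases it, and nothing is done while the write side is held. A small userspace analogue of the same idiom with a POSIX rwlock (illustrative only, the kernel uses a rw_semaphore):

#include <pthread.h>

static pthread_rwlock_t groups_sem = PTHREAD_RWLOCK_INITIALIZER;

/* Block until every thread currently holding the read lock has dropped it.
 * The write lock is not kept; it is only used as a drain barrier. */
static void wait_for_existing_readers(void)
{
        pthread_rwlock_wrlock(&groups_sem);
        pthread_rwlock_unlock(&groups_sem);
}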
     403             : 
     404           0 : struct btrfs_caching_control *btrfs_get_caching_control(
     405             :                 struct btrfs_block_group *cache)
     406             : {
     407           0 :         struct btrfs_caching_control *ctl;
     408             : 
     409           0 :         spin_lock(&cache->lock);
     410           0 :         if (!cache->caching_ctl) {
     411           0 :                 spin_unlock(&cache->lock);
     412           0 :                 return NULL;
     413             :         }
     414             : 
     415           0 :         ctl = cache->caching_ctl;
     416           0 :         refcount_inc(&ctl->count);
     417           0 :         spin_unlock(&cache->lock);
     418           0 :         return ctl;
     419             : }
     420             : 
     421           0 : void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
     422             : {
     423           0 :         if (refcount_dec_and_test(&ctl->count))
     424           0 :                 kfree(ctl);
     425           0 : }
     426             : 
     427             : /*
      428             :  * When we wait for progress in the block group caching, it's because our
     429             :  * allocation attempt failed at least once.  So, we must sleep and let some
     430             :  * progress happen before we try again.
     431             :  *
     432             :  * This function will sleep at least once waiting for new free space to show
     433             :  * up, and then it will check the block group free space numbers for our min
     434             :  * num_bytes.  Another option is to have it go ahead and look in the rbtree for
     435             :  * a free extent of a given size, but this is a good start.
     436             :  *
     437             :  * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
     438             :  * any of the information in this block group.
     439             :  */
     440           0 : void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
     441             :                                            u64 num_bytes)
     442             : {
     443           0 :         struct btrfs_caching_control *caching_ctl;
     444             : 
     445           0 :         caching_ctl = btrfs_get_caching_control(cache);
     446           0 :         if (!caching_ctl)
     447             :                 return;
     448             : 
     449           0 :         wait_event(caching_ctl->wait, btrfs_block_group_done(cache) ||
     450             :                    (cache->free_space_ctl->free_space >= num_bytes));
     451             : 
     452           0 :         btrfs_put_caching_control(caching_ctl);
     453             : }
     454             : 
     455           0 : static int btrfs_caching_ctl_wait_done(struct btrfs_block_group *cache,
     456             :                                        struct btrfs_caching_control *caching_ctl)
     457             : {
     458           0 :         wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
     459           0 :         return cache->cached == BTRFS_CACHE_ERROR ? -EIO : 0;
     460             : }
     461             : 
     462           0 : static int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
     463             : {
     464           0 :         struct btrfs_caching_control *caching_ctl;
     465           0 :         int ret;
     466             : 
     467           0 :         caching_ctl = btrfs_get_caching_control(cache);
     468           0 :         if (!caching_ctl)
     469           0 :                 return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
     470           0 :         ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
     471           0 :         btrfs_put_caching_control(caching_ctl);
     472           0 :         return ret;
     473             : }
     474             : 
     475             : #ifdef CONFIG_BTRFS_DEBUG
     476             : static void fragment_free_space(struct btrfs_block_group *block_group)
     477             : {
     478             :         struct btrfs_fs_info *fs_info = block_group->fs_info;
     479             :         u64 start = block_group->start;
     480             :         u64 len = block_group->length;
     481             :         u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
     482             :                 fs_info->nodesize : fs_info->sectorsize;
     483             :         u64 step = chunk << 1;
     484             : 
     485             :         while (len > chunk) {
     486             :                 btrfs_remove_free_space(block_group, start, chunk);
     487             :                 start += step;
     488             :                 if (len < step)
     489             :                         len = 0;
     490             :                 else
     491             :                         len -= step;
     492             :         }
     493             : }
     494             : #endif
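
For reference, the stepping in fragment_free_space() removes every other chunk-sized piece of the group, leaving a deliberately fragmented free-space layout for debugging. A standalone sketch of the same arithmetic, with illustrative sizes:

#include <stdint.h>
#include <stdio.h>

/* Print the ranges fragment_free_space() would carve out of a block group
 * starting at @start with length @len, removing one @chunk every 2 * @chunk. */
static void show_fragment_holes(uint64_t start, uint64_t len, uint64_t chunk)
{
        uint64_t step = chunk * 2;

        while (len > chunk) {
                printf("remove [%llu, %llu)\n",
                       (unsigned long long)start,
                       (unsigned long long)(start + chunk));
                start += step;
                len = (len < step) ? 0 : len - step;
        }
}

int main(void)
{
        show_fragment_holes(0, 1024 * 1024, 64 * 1024); /* 1 MiB group, 64 KiB chunks */
        return 0;
}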
     495             : 
     496             : /*
      497             :  * This is only called by btrfs_cache_block_group().  Since we could have freed
      498             :  * extents, we need to check the pinned_extents for any extents that can't be
      499             :  * used yet, since their free space will be released as soon as the transaction
     500             :  * commits.
     501             :  */
     502           0 : u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end)
     503             : {
     504           0 :         struct btrfs_fs_info *info = block_group->fs_info;
     505           0 :         u64 extent_start, extent_end, size, total_added = 0;
     506           0 :         int ret;
     507             : 
     508           0 :         while (start < end) {
     509           0 :                 ret = find_first_extent_bit(&info->excluded_extents, start,
     510             :                                             &extent_start, &extent_end,
     511             :                                             EXTENT_DIRTY | EXTENT_UPTODATE,
     512             :                                             NULL);
     513           0 :                 if (ret)
     514             :                         break;
     515             : 
     516           0 :                 if (extent_start <= start) {
     517           0 :                         start = extent_end + 1;
     518           0 :                 } else if (extent_start > start && extent_start < end) {
     519           0 :                         size = extent_start - start;
     520           0 :                         total_added += size;
     521           0 :                         ret = btrfs_add_free_space_async_trimmed(block_group,
     522             :                                                                  start, size);
     523           0 :                         BUG_ON(ret); /* -ENOMEM or logic error */
     524           0 :                         start = extent_end + 1;
     525             :                 } else {
     526             :                         break;
     527             :                 }
     528             :         }
     529             : 
     530           0 :         if (start < end) {
     531           0 :                 size = end - start;
     532           0 :                 total_added += size;
     533           0 :                 ret = btrfs_add_free_space_async_trimmed(block_group, start,
     534             :                                                          size);
     535           0 :                 BUG_ON(ret); /* -ENOMEM or logic error */
     536             :         }
     537             : 
     538           0 :         return total_added;
     539             : }
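
The loop above reports only the gaps between excluded (e.g. pinned) extents as free space. Below is a standalone sketch of the same gap walking over a sorted array of excluded ranges; the types and names are illustrative, and the excluded ranges use inclusive ends like the extent bits above.

#include <stdint.h>
#include <stdio.h>

struct range { uint64_t start, end; };  /* inclusive end, like the extent bits */

/* Report the free gaps inside [start, end) that are not covered by any of the
 * @n sorted, non-overlapping excluded ranges; return the total free size. */
static uint64_t add_free_gaps(const struct range *excluded, int n,
                              uint64_t start, uint64_t end)
{
        uint64_t total = 0;

        for (int i = 0; i < n && start < end; i++) {
                if (excluded[i].end < start)
                        continue;                               /* already behind us */
                if (excluded[i].start <= start) {
                        start = excluded[i].end + 1;            /* skip past it */
                } else if (excluded[i].start < end) {
                        total += excluded[i].start - start;     /* gap before it */
                        printf("free [%llu, %llu)\n",
                               (unsigned long long)start,
                               (unsigned long long)excluded[i].start);
                        start = excluded[i].end + 1;
                } else {
                        break;
                }
        }
        if (start < end) {
                total += end - start;
                printf("free [%llu, %llu)\n",
                       (unsigned long long)start, (unsigned long long)end);
        }
        return total;
}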
     540             : 
     541             : /*
      542             :  * Get an arbitrary extent item roughly index / max_index of the way through the block group
     543             :  *
      544             :  * @block_group:  the block group to sample from
     545             :  * @index:        the integral step through the block group to grab from
     546             :  * @max_index:    the granularity of the sampling
     547             :  * @key:          return value parameter for the item we find
     548             :  *
     549             :  * Pre-conditions on indices:
     550             :  * 0 <= index <= max_index
     551             :  * 0 < max_index
     552             :  *
     553             :  * Returns: 0 on success, 1 if the search didn't yield a useful item, negative
     554             :  * error code on error.
     555             :  */
     556           0 : static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl,
     557             :                                           struct btrfs_block_group *block_group,
     558             :                                           int index, int max_index,
     559             :                                           struct btrfs_key *found_key)
     560             : {
     561           0 :         struct btrfs_fs_info *fs_info = block_group->fs_info;
     562           0 :         struct btrfs_root *extent_root;
     563           0 :         u64 search_offset;
     564           0 :         u64 search_end = block_group->start + block_group->length;
     565           0 :         struct btrfs_path *path;
     566           0 :         struct btrfs_key search_key;
     567           0 :         int ret = 0;
     568             : 
     569           0 :         ASSERT(index >= 0);
     570           0 :         ASSERT(index <= max_index);
     571           0 :         ASSERT(max_index > 0);
     572           0 :         lockdep_assert_held(&caching_ctl->mutex);
     573           0 :         lockdep_assert_held_read(&fs_info->commit_root_sem);
     574             : 
     575           0 :         path = btrfs_alloc_path();
     576           0 :         if (!path)
     577             :                 return -ENOMEM;
     578             : 
     579           0 :         extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start,
     580             :                                                        BTRFS_SUPER_INFO_OFFSET));
     581             : 
     582           0 :         path->skip_locking = 1;
     583           0 :         path->search_commit_root = 1;
     584           0 :         path->reada = READA_FORWARD;
     585             : 
     586           0 :         search_offset = index * div_u64(block_group->length, max_index);
     587           0 :         search_key.objectid = block_group->start + search_offset;
     588           0 :         search_key.type = BTRFS_EXTENT_ITEM_KEY;
     589           0 :         search_key.offset = 0;
     590             : 
     591           0 :         btrfs_for_each_slot(extent_root, &search_key, found_key, path, ret) {
     592             :                 /* Success; sampled an extent item in the block group */
     593           0 :                 if (found_key->type == BTRFS_EXTENT_ITEM_KEY &&
     594           0 :                     found_key->objectid >= block_group->start &&
     595           0 :                     found_key->objectid + found_key->offset <= search_end)
     596             :                         break;
     597             : 
     598             :                 /* We can't possibly find a valid extent item anymore */
     599           0 :                 if (found_key->objectid >= search_end) {
     600             :                         ret = 1;
     601             :                         break;
     602             :                 }
     603             :         }
     604             : 
     605           0 :         lockdep_assert_held(&caching_ctl->mutex);
     606           0 :         lockdep_assert_held_read(&fs_info->commit_root_sem);
     607           0 :         btrfs_free_path(path);
     608           0 :         return ret;
     609             : }
     610             : 
     611             : /*
     612             :  * Best effort attempt to compute a block group's size class while caching it.
     613             :  *
     614             :  * @block_group: the block group we are caching
     615             :  *
     616             :  * We cannot infer the size class while adding free space extents, because that
     617             :  * logic doesn't care about contiguous file extents (it doesn't differentiate
     618             :  * between a 100M extent and 100 contiguous 1M extents). So we need to read the
     619             :  * file extent items. Reading all of them is quite wasteful, because usually
     620             :  * only a handful are enough to give a good answer. Therefore, we just grab 5 of
     621             :  * them at even steps through the block group and pick the smallest size class
     622             :  * we see. Since size class is best effort, and not guaranteed in general,
     623             :  * inaccuracy is acceptable.
     624             :  *
     625             :  * To be more explicit about why this algorithm makes sense:
     626             :  *
     627             :  * If we are caching in a block group from disk, then there are three major cases
     628             :  * to consider:
     629             :  * 1. the block group is well behaved and all extents in it are the same size
     630             :  *    class.
     631             :  * 2. the block group is mostly one size class with rare exceptions for last
     632             :  *    ditch allocations
     633             :  * 3. the block group was populated before size classes and can have a totally
     634             :  *    arbitrary mix of size classes.
     635             :  *
     636             :  * In case 1, looking at any extent in the block group will yield the correct
     637             :  * result. For the mixed cases, taking the minimum size class seems like a good
     638             :  * approximation, since gaps from frees will be usable to the size class. For
     639             :  * 2., a small handful of file extents is likely to yield the right answer. For
     640             :  * 3, we can either read every file extent, or admit that this is best effort
     641             :  * anyway and try to stay fast.
     642             :  *
     643             :  * Returns: 0 on success, negative error code on error.
     644             :  */
     645           0 : static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl,
     646             :                                        struct btrfs_block_group *block_group)
     647             : {
     648           0 :         struct btrfs_fs_info *fs_info = block_group->fs_info;
     649           0 :         struct btrfs_key key;
     650           0 :         int i;
     651           0 :         u64 min_size = block_group->length;
     652           0 :         enum btrfs_block_group_size_class size_class = BTRFS_BG_SZ_NONE;
     653           0 :         int ret;
     654             : 
     655           0 :         if (!btrfs_block_group_should_use_size_class(block_group))
     656             :                 return 0;
     657             : 
     658             :         lockdep_assert_held(&caching_ctl->mutex);
     659             :         lockdep_assert_held_read(&fs_info->commit_root_sem);
     660           0 :         for (i = 0; i < 5; ++i) {
     661           0 :                 ret = sample_block_group_extent_item(caching_ctl, block_group, i, 5, &key);
     662           0 :                 if (ret < 0)
     663           0 :                         goto out;
     664           0 :                 if (ret > 0)
     665           0 :                         continue;
     666           0 :                 min_size = min_t(u64, min_size, key.offset);
     667           0 :                 size_class = btrfs_calc_block_group_size_class(min_size);
     668             :         }
     669           0 :         if (size_class != BTRFS_BG_SZ_NONE) {
     670           0 :                 spin_lock(&block_group->lock);
     671           0 :                 block_group->size_class = size_class;
     672           0 :                 spin_unlock(&block_group->lock);
     673             :         }
     674           0 : out:
     675             :         return ret;
     676             : }
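
A standalone sketch of the sampling strategy: probe five evenly spaced offsets and keep the smallest extent size seen. The thresholds in size_class_of() are hypothetical, chosen only for the example; the kernel's cutoffs live in btrfs_calc_block_group_size_class() and may differ.

#include <stdint.h>

enum size_class { SZ_NONE, SZ_SMALL, SZ_MEDIUM, SZ_LARGE };

/* Hypothetical thresholds for the example only. */
static enum size_class size_class_of(uint64_t size)
{
        if (size <= 128 * 1024)
                return SZ_SMALL;
        if (size <= 8 * 1024 * 1024)
                return SZ_MEDIUM;
        return SZ_LARGE;
}

/* Probe 5 evenly spaced offsets and keep the smallest extent size seen,
 * mirroring the sampling loop in load_block_group_size_class().
 * @extent_size_at returns the length of the sampled extent, or 0 if the
 * sample at that offset found nothing useful. */
static enum size_class sample_size_class(uint64_t bg_length,
                                         uint64_t (*extent_size_at)(uint64_t))
{
        uint64_t min_size = bg_length;
        enum size_class class = SZ_NONE;

        for (int i = 0; i < 5; i++) {
                uint64_t size = extent_size_at(i * (bg_length / 5));

                if (!size)
                        continue;
                if (size < min_size)
                        min_size = size;
                class = size_class_of(min_size);
        }
        return class;
}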
     677             : 
     678           0 : static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
     679             : {
     680           0 :         struct btrfs_block_group *block_group = caching_ctl->block_group;
     681           0 :         struct btrfs_fs_info *fs_info = block_group->fs_info;
     682           0 :         struct btrfs_root *extent_root;
     683           0 :         struct btrfs_path *path;
     684           0 :         struct extent_buffer *leaf;
     685           0 :         struct btrfs_key key;
     686           0 :         u64 total_found = 0;
     687           0 :         u64 last = 0;
     688           0 :         u32 nritems;
     689           0 :         int ret;
     690           0 :         bool wakeup = true;
     691             : 
     692           0 :         path = btrfs_alloc_path();
     693           0 :         if (!path)
     694             :                 return -ENOMEM;
     695             : 
     696           0 :         last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET);
     697           0 :         extent_root = btrfs_extent_root(fs_info, last);
     698             : 
     699             : #ifdef CONFIG_BTRFS_DEBUG
     700             :         /*
     701             :          * If we're fragmenting we don't want to make anybody think we can
     702             :          * allocate from this block group until we've had a chance to fragment
     703             :          * the free space.
     704             :          */
     705             :         if (btrfs_should_fragment_free_space(block_group))
     706             :                 wakeup = false;
     707             : #endif
     708             :         /*
     709             :          * We don't want to deadlock with somebody trying to allocate a new
     710             :          * extent for the extent root while also trying to search the extent
     711             :          * root to add free space.  So we skip locking and search the commit
      712             :          * root, since it's read-only.
     713             :          */
     714           0 :         path->skip_locking = 1;
     715           0 :         path->search_commit_root = 1;
     716           0 :         path->reada = READA_FORWARD;
     717             : 
     718           0 :         key.objectid = last;
     719           0 :         key.offset = 0;
     720           0 :         key.type = BTRFS_EXTENT_ITEM_KEY;
     721             : 
     722             : next:
     723           0 :         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
     724           0 :         if (ret < 0)
     725           0 :                 goto out;
     726             : 
     727           0 :         leaf = path->nodes[0];
     728           0 :         nritems = btrfs_header_nritems(leaf);
     729             : 
     730           0 :         while (1) {
     731           0 :                 if (btrfs_fs_closing(fs_info) > 1) {
     732             :                         last = (u64)-1;
     733             :                         break;
     734             :                 }
     735             : 
     736           0 :                 if (path->slots[0] < nritems) {
     737           0 :                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
     738             :                 } else {
     739           0 :                         ret = btrfs_find_next_key(extent_root, path, &key, 0, 0);
     740           0 :                         if (ret)
     741             :                                 break;
     742             : 
     743           0 :                         if (need_resched() ||
     744             :                             rwsem_is_contended(&fs_info->commit_root_sem)) {
     745           0 :                                 btrfs_release_path(path);
     746           0 :                                 up_read(&fs_info->commit_root_sem);
     747           0 :                                 mutex_unlock(&caching_ctl->mutex);
     748           0 :                                 cond_resched();
     749           0 :                                 mutex_lock(&caching_ctl->mutex);
     750           0 :                                 down_read(&fs_info->commit_root_sem);
     751           0 :                                 goto next;
     752             :                         }
     753             : 
     754           0 :                         ret = btrfs_next_leaf(extent_root, path);
     755           0 :                         if (ret < 0)
     756           0 :                                 goto out;
     757           0 :                         if (ret)
     758             :                                 break;
     759           0 :                         leaf = path->nodes[0];
     760           0 :                         nritems = btrfs_header_nritems(leaf);
     761           0 :                         continue;
     762             :                 }
     763             : 
     764           0 :                 if (key.objectid < last) {
     765           0 :                         key.objectid = last;
     766           0 :                         key.offset = 0;
     767           0 :                         key.type = BTRFS_EXTENT_ITEM_KEY;
     768           0 :                         btrfs_release_path(path);
     769           0 :                         goto next;
     770             :                 }
     771             : 
     772           0 :                 if (key.objectid < block_group->start) {
     773           0 :                         path->slots[0]++;
     774           0 :                         continue;
     775             :                 }
     776             : 
     777           0 :                 if (key.objectid >= block_group->start + block_group->length)
     778             :                         break;
     779             : 
     780           0 :                 if (key.type == BTRFS_EXTENT_ITEM_KEY ||
     781             :                     key.type == BTRFS_METADATA_ITEM_KEY) {
     782           0 :                         total_found += add_new_free_space(block_group, last,
     783             :                                                           key.objectid);
     784           0 :                         if (key.type == BTRFS_METADATA_ITEM_KEY)
     785           0 :                                 last = key.objectid +
     786           0 :                                         fs_info->nodesize;
     787             :                         else
     788           0 :                                 last = key.objectid + key.offset;
     789             : 
     790           0 :                         if (total_found > CACHING_CTL_WAKE_UP) {
     791           0 :                                 total_found = 0;
     792           0 :                                 if (wakeup)
     793           0 :                                         wake_up(&caching_ctl->wait);
     794             :                         }
     795             :                 }
     796           0 :                 path->slots[0]++;
     797             :         }
     798           0 :         ret = 0;
     799             : 
     800           0 :         total_found += add_new_free_space(block_group, last,
     801           0 :                                 block_group->start + block_group->length);
     802             : 
     803           0 : out:
     804           0 :         btrfs_free_path(path);
     805           0 :         return ret;
     806             : }
     807             : 
     808           0 : static noinline void caching_thread(struct btrfs_work *work)
     809             : {
     810           0 :         struct btrfs_block_group *block_group;
     811           0 :         struct btrfs_fs_info *fs_info;
     812           0 :         struct btrfs_caching_control *caching_ctl;
     813           0 :         int ret;
     814             : 
     815           0 :         caching_ctl = container_of(work, struct btrfs_caching_control, work);
     816           0 :         block_group = caching_ctl->block_group;
     817           0 :         fs_info = block_group->fs_info;
     818             : 
     819           0 :         mutex_lock(&caching_ctl->mutex);
     820           0 :         down_read(&fs_info->commit_root_sem);
     821             : 
     822           0 :         load_block_group_size_class(caching_ctl, block_group);
     823           0 :         if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
     824           0 :                 ret = load_free_space_cache(block_group);
     825           0 :                 if (ret == 1) {
     826           0 :                         ret = 0;
     827           0 :                         goto done;
     828             :                 }
     829             : 
     830             :                 /*
     831             :                  * We failed to load the space cache, set ourselves to
     832             :                  * CACHE_STARTED and carry on.
     833             :                  */
     834           0 :                 spin_lock(&block_group->lock);
     835           0 :                 block_group->cached = BTRFS_CACHE_STARTED;
     836           0 :                 spin_unlock(&block_group->lock);
     837           0 :                 wake_up(&caching_ctl->wait);
     838             :         }
     839             : 
     840             :         /*
     841             :          * If we are in the transaction that populated the free space tree we
     842             :          * can't actually cache from the free space tree as our commit root and
     843             :          * real root are the same, so we could change the contents of the blocks
     844             :          * while caching.  Instead do the slow caching in this case, and after
     845             :          * the transaction has committed we will be safe.
     846             :          */
     847           0 :         if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
     848           0 :             !(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags)))
     849           0 :                 ret = load_free_space_tree(caching_ctl);
     850             :         else
     851           0 :                 ret = load_extent_tree_free(caching_ctl);
     852           0 : done:
     853           0 :         spin_lock(&block_group->lock);
     854           0 :         block_group->caching_ctl = NULL;
     855           0 :         block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
     856           0 :         spin_unlock(&block_group->lock);
     857             : 
     858             : #ifdef CONFIG_BTRFS_DEBUG
     859             :         if (btrfs_should_fragment_free_space(block_group)) {
     860             :                 u64 bytes_used;
     861             : 
     862             :                 spin_lock(&block_group->space_info->lock);
     863             :                 spin_lock(&block_group->lock);
     864             :                 bytes_used = block_group->length - block_group->used;
     865             :                 block_group->space_info->bytes_used += bytes_used >> 1;
     866             :                 spin_unlock(&block_group->lock);
     867             :                 spin_unlock(&block_group->space_info->lock);
     868             :                 fragment_free_space(block_group);
     869             :         }
     870             : #endif
     871             : 
     872           0 :         up_read(&fs_info->commit_root_sem);
     873           0 :         btrfs_free_excluded_extents(block_group);
     874           0 :         mutex_unlock(&caching_ctl->mutex);
     875             : 
     876           0 :         wake_up(&caching_ctl->wait);
     877             : 
     878           0 :         btrfs_put_caching_control(caching_ctl);
     879           0 :         btrfs_put_block_group(block_group);
     880           0 : }
     881             : 
     882           0 : int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
     883             : {
     884           0 :         struct btrfs_fs_info *fs_info = cache->fs_info;
     885           0 :         struct btrfs_caching_control *caching_ctl = NULL;
     886           0 :         int ret = 0;
     887             : 
     888             :         /* Allocator for zoned filesystems does not use the cache at all */
     889           0 :         if (btrfs_is_zoned(fs_info))
     890             :                 return 0;
     891             : 
     892           0 :         caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
     893           0 :         if (!caching_ctl)
     894             :                 return -ENOMEM;
     895             : 
     896           0 :         INIT_LIST_HEAD(&caching_ctl->list);
     897           0 :         mutex_init(&caching_ctl->mutex);
     898           0 :         init_waitqueue_head(&caching_ctl->wait);
     899           0 :         caching_ctl->block_group = cache;
     900           0 :         refcount_set(&caching_ctl->count, 2);
     901           0 :         btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
     902             : 
     903           0 :         spin_lock(&cache->lock);
     904           0 :         if (cache->cached != BTRFS_CACHE_NO) {
     905           0 :                 kfree(caching_ctl);
     906             : 
     907           0 :                 caching_ctl = cache->caching_ctl;
     908           0 :                 if (caching_ctl)
     909           0 :                         refcount_inc(&caching_ctl->count);
     910           0 :                 spin_unlock(&cache->lock);
     911           0 :                 goto out;
     912             :         }
     913           0 :         WARN_ON(cache->caching_ctl);
     914           0 :         cache->caching_ctl = caching_ctl;
     915           0 :         cache->cached = BTRFS_CACHE_STARTED;
     916           0 :         spin_unlock(&cache->lock);
     917             : 
     918           0 :         write_lock(&fs_info->block_group_cache_lock);
     919           0 :         refcount_inc(&caching_ctl->count);
     920           0 :         list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
     921           0 :         write_unlock(&fs_info->block_group_cache_lock);
     922             : 
     923           0 :         btrfs_get_block_group(cache);
     924             : 
     925           0 :         btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
     926           0 : out:
     927           0 :         if (wait && caching_ctl)
     928           0 :                 ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
     929           0 :         if (caching_ctl)
     930           0 :                 btrfs_put_caching_control(caching_ctl);
     931             : 
     932             :         return ret;
     933             : }
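
A hedged sketch of a typical caller: start (or join) the caching work with wait set, then perform the BTRFS_CACHE_ERROR check that the comment above btrfs_wait_block_group_cache_progress() asks for. The helper name and the error handling are placeholders, not actual kernel code.

/* Sketch only: make sure a block group's free space is cached before use. */
static int ensure_block_group_cached(struct btrfs_block_group *cache)
{
        int ret;

        /* Kick off caching (or reuse an in-flight control) and wait for it. */
        ret = btrfs_cache_block_group(cache, true);
        if (ret)
                return ret;

        /* Callers must not trust the free space numbers on a caching error. */
        if (cache->cached == BTRFS_CACHE_ERROR)
                return -EIO;

        return 0;
}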
     934             : 
     935           0 : static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
     936             : {
     937           0 :         u64 extra_flags = chunk_to_extended(flags) &
     938             :                                 BTRFS_EXTENDED_PROFILE_MASK;
     939             : 
     940           0 :         write_seqlock(&fs_info->profiles_lock);
     941           0 :         if (flags & BTRFS_BLOCK_GROUP_DATA)
     942           0 :                 fs_info->avail_data_alloc_bits &= ~extra_flags;
     943           0 :         if (flags & BTRFS_BLOCK_GROUP_METADATA)
     944           0 :                 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
     945           0 :         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
     946           0 :                 fs_info->avail_system_alloc_bits &= ~extra_flags;
     947           0 :         write_sequnlock(&fs_info->profiles_lock);
     948           0 : }
     949             : 
     950             : /*
     951             :  * Clear incompat bits for the following feature(s):
     952             :  *
     953             :  * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
     954             :  *            in the whole filesystem
     955             :  *
     956             :  * - RAID1C34 - same as above for RAID1C3 and RAID1C4 block groups
     957             :  */
     958           0 : static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
     959             : {
     960           0 :         bool found_raid56 = false;
     961           0 :         bool found_raid1c34 = false;
     962             : 
     963           0 :         if ((flags & BTRFS_BLOCK_GROUP_RAID56_MASK) ||
     964           0 :             (flags & BTRFS_BLOCK_GROUP_RAID1C3) ||
     965             :             (flags & BTRFS_BLOCK_GROUP_RAID1C4)) {
     966           0 :                 struct list_head *head = &fs_info->space_info;
     967           0 :                 struct btrfs_space_info *sinfo;
     968             : 
     969           0 :                 list_for_each_entry_rcu(sinfo, head, list) {
     970           0 :                         down_read(&sinfo->groups_sem);
     971           0 :                         if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
     972           0 :                                 found_raid56 = true;
     973           0 :                         if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
     974           0 :                                 found_raid56 = true;
     975           0 :                         if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C3]))
     976           0 :                                 found_raid1c34 = true;
     977           0 :                         if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C4]))
     978           0 :                                 found_raid1c34 = true;
     979           0 :                         up_read(&sinfo->groups_sem);
     980             :                 }
     981           0 :                 if (!found_raid56)
     982           0 :                         btrfs_clear_fs_incompat(fs_info, RAID56);
     983           0 :                 if (!found_raid1c34)
     984           0 :                         btrfs_clear_fs_incompat(fs_info, RAID1C34);
     985             :         }
     986           0 : }
     987             : 
     988           0 : static int remove_block_group_item(struct btrfs_trans_handle *trans,
     989             :                                    struct btrfs_path *path,
     990             :                                    struct btrfs_block_group *block_group)
     991             : {
     992           0 :         struct btrfs_fs_info *fs_info = trans->fs_info;
     993           0 :         struct btrfs_root *root;
     994           0 :         struct btrfs_key key;
     995           0 :         int ret;
     996             : 
     997           0 :         root = btrfs_block_group_root(fs_info);
     998           0 :         key.objectid = block_group->start;
     999           0 :         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
    1000           0 :         key.offset = block_group->length;
    1001             : 
    1002           0 :         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
    1003           0 :         if (ret > 0)
    1004             :                 ret = -ENOENT;
    1005           0 :         if (ret < 0)
    1006           0 :                 return ret;
    1007             : 
    1008           0 :         ret = btrfs_del_item(trans, root, path);
    1009           0 :         return ret;
    1010             : }
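
remove_block_group_item() locates the on-disk block group item with a key whose objectid is the group's logical start and whose offset field, unusually, holds the group's length rather than an offset. Here is a self-contained sketch of how such a key would be filled in, using made-up values and a stand-in for the real BTRFS_BLOCK_GROUP_ITEM_KEY constant.

    #include <stdint.h>

    /* Stand-in for the real BTRFS_BLOCK_GROUP_ITEM_KEY type value. */
    #define EXAMPLE_BLOCK_GROUP_ITEM_KEY 192

    struct example_key {
            uint64_t objectid;      /* logical start of the block group */
            uint8_t  type;          /* item type */
            uint64_t offset;        /* length of the block group, not an offset */
    };

    /* e.g. a block group starting at 1 GiB with a length of 256 MiB */
    static struct example_key key_for_block_group(uint64_t start, uint64_t length)
    {
            struct example_key key = {
                    .objectid = start,
                    .type = EXAMPLE_BLOCK_GROUP_ITEM_KEY,
                    .offset = length,
            };

            return key;
    }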
    1011             : 
    1012           0 : int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
    1013             :                              u64 group_start, struct extent_map *em)
    1014             : {
    1015           0 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    1016           0 :         struct btrfs_path *path;
    1017           0 :         struct btrfs_block_group *block_group;
    1018           0 :         struct btrfs_free_cluster *cluster;
    1019           0 :         struct inode *inode;
    1020           0 :         struct kobject *kobj = NULL;
    1021           0 :         int ret;
    1022           0 :         int index;
    1023           0 :         int factor;
    1024           0 :         struct btrfs_caching_control *caching_ctl = NULL;
    1025           0 :         bool remove_em;
    1026           0 :         bool remove_rsv = false;
    1027             : 
    1028           0 :         block_group = btrfs_lookup_block_group(fs_info, group_start);
    1029           0 :         BUG_ON(!block_group);
    1030           0 :         BUG_ON(!block_group->ro);
    1031             : 
    1032           0 :         trace_btrfs_remove_block_group(block_group);
    1033             :         /*
    1034             :          * Free the reserved super bytes from this block group before
    1035             :          * removing it.
    1036             :          */
    1037           0 :         btrfs_free_excluded_extents(block_group);
    1038           0 :         btrfs_free_ref_tree_range(fs_info, block_group->start,
    1039             :                                   block_group->length);
    1040             : 
    1041           0 :         index = btrfs_bg_flags_to_raid_index(block_group->flags);
    1042           0 :         factor = btrfs_bg_type_to_factor(block_group->flags);
    1043             : 
    1044             :         /* make sure this block group isn't part of an allocation cluster */
    1045           0 :         cluster = &fs_info->data_alloc_cluster;
    1046           0 :         spin_lock(&cluster->refill_lock);
    1047           0 :         btrfs_return_cluster_to_free_space(block_group, cluster);
    1048           0 :         spin_unlock(&cluster->refill_lock);
    1049             : 
    1050             :         /*
    1051             :          * make sure this block group isn't part of a metadata
    1052             :          * allocation cluster
    1053             :          */
    1054           0 :         cluster = &fs_info->meta_alloc_cluster;
    1055           0 :         spin_lock(&cluster->refill_lock);
    1056           0 :         btrfs_return_cluster_to_free_space(block_group, cluster);
    1057           0 :         spin_unlock(&cluster->refill_lock);
    1058             : 
    1059           0 :         btrfs_clear_treelog_bg(block_group);
    1060           0 :         btrfs_clear_data_reloc_bg(block_group);
    1061             : 
    1062           0 :         path = btrfs_alloc_path();
    1063           0 :         if (!path) {
    1064           0 :                 ret = -ENOMEM;
    1065           0 :                 goto out;
    1066             :         }
    1067             : 
    1068             :         /*
    1069             :          * get the inode first so any iput calls done for the io_list
    1070             :          * aren't the final iput (no unlinks allowed now)
    1071             :          */
    1072           0 :         inode = lookup_free_space_inode(block_group, path);
    1073             : 
    1074           0 :         mutex_lock(&trans->transaction->cache_write_mutex);
    1075             :         /*
    1076             :          * Make sure our free space cache IO is done before removing the
    1077             :          * free space inode
    1078             :          */
    1079           0 :         spin_lock(&trans->transaction->dirty_bgs_lock);
    1080           0 :         if (!list_empty(&block_group->io_list)) {
    1081           0 :                 list_del_init(&block_group->io_list);
    1082             : 
    1083           0 :                 WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
    1084             : 
    1085           0 :                 spin_unlock(&trans->transaction->dirty_bgs_lock);
    1086           0 :                 btrfs_wait_cache_io(trans, block_group, path);
    1087           0 :                 btrfs_put_block_group(block_group);
    1088           0 :                 spin_lock(&trans->transaction->dirty_bgs_lock);
    1089             :         }
    1090             : 
    1091           0 :         if (!list_empty(&block_group->dirty_list)) {
    1092           0 :                 list_del_init(&block_group->dirty_list);
    1093           0 :                 remove_rsv = true;
    1094           0 :                 btrfs_put_block_group(block_group);
    1095             :         }
    1096           0 :         spin_unlock(&trans->transaction->dirty_bgs_lock);
    1097           0 :         mutex_unlock(&trans->transaction->cache_write_mutex);
    1098             : 
    1099           0 :         ret = btrfs_remove_free_space_inode(trans, inode, block_group);
    1100           0 :         if (ret)
    1101           0 :                 goto out;
    1102             : 
    1103           0 :         write_lock(&fs_info->block_group_cache_lock);
    1104           0 :         rb_erase_cached(&block_group->cache_node,
    1105             :                         &fs_info->block_group_cache_tree);
    1106           0 :         RB_CLEAR_NODE(&block_group->cache_node);
    1107             : 
    1108             :         /* Once for the block groups rbtree */
    1109           0 :         btrfs_put_block_group(block_group);
    1110             : 
    1111           0 :         write_unlock(&fs_info->block_group_cache_lock);
    1112             : 
    1113           0 :         down_write(&block_group->space_info->groups_sem);
    1114             :         /*
    1115             :          * we must use list_del_init so people can check to see if they
    1116             :          * are still on the list after taking the semaphore
    1117             :          */
    1118           0 :         list_del_init(&block_group->list);
    1119           0 :         if (list_empty(&block_group->space_info->block_groups[index])) {
    1120           0 :                 kobj = block_group->space_info->block_group_kobjs[index];
    1121           0 :                 block_group->space_info->block_group_kobjs[index] = NULL;
    1122           0 :                 clear_avail_alloc_bits(fs_info, block_group->flags);
    1123             :         }
    1124           0 :         up_write(&block_group->space_info->groups_sem);
    1125           0 :         clear_incompat_bg_bits(fs_info, block_group->flags);
    1126           0 :         if (kobj) {
    1127           0 :                 kobject_del(kobj);
    1128           0 :                 kobject_put(kobj);
    1129             :         }
    1130             : 
    1131           0 :         if (block_group->cached == BTRFS_CACHE_STARTED)
    1132           0 :                 btrfs_wait_block_group_cache_done(block_group);
    1133             : 
    1134           0 :         write_lock(&fs_info->block_group_cache_lock);
    1135           0 :         caching_ctl = btrfs_get_caching_control(block_group);
    1136           0 :         if (!caching_ctl) {
    1137           0 :                 struct btrfs_caching_control *ctl;
    1138             : 
    1139           0 :                 list_for_each_entry(ctl, &fs_info->caching_block_groups, list) {
    1140           0 :                         if (ctl->block_group == block_group) {
    1141           0 :                                 caching_ctl = ctl;
    1142           0 :                                 refcount_inc(&caching_ctl->count);
    1143             :                                 break;
    1144             :                         }
    1145             :                 }
    1146             :         }
    1147           0 :         if (caching_ctl)
    1148           0 :                 list_del_init(&caching_ctl->list);
    1149           0 :         write_unlock(&fs_info->block_group_cache_lock);
    1150             : 
    1151           0 :         if (caching_ctl) {
    1152             :                 /* Once for the caching bgs list and once for us. */
    1153           0 :                 btrfs_put_caching_control(caching_ctl);
    1154           0 :                 btrfs_put_caching_control(caching_ctl);
    1155             :         }
    1156             : 
    1157           0 :         spin_lock(&trans->transaction->dirty_bgs_lock);
    1158           0 :         WARN_ON(!list_empty(&block_group->dirty_list));
    1159           0 :         WARN_ON(!list_empty(&block_group->io_list));
    1160           0 :         spin_unlock(&trans->transaction->dirty_bgs_lock);
    1161             : 
    1162           0 :         btrfs_remove_free_space_cache(block_group);
    1163             : 
    1164           0 :         spin_lock(&block_group->space_info->lock);
    1165           0 :         list_del_init(&block_group->ro_list);
    1166             : 
    1167           0 :         if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
    1168           0 :                 WARN_ON(block_group->space_info->total_bytes
    1169             :                         < block_group->length);
    1170           0 :                 WARN_ON(block_group->space_info->bytes_readonly
    1171             :                         < block_group->length - block_group->zone_unusable);
    1172           0 :                 WARN_ON(block_group->space_info->bytes_zone_unusable
    1173             :                         < block_group->zone_unusable);
    1174           0 :                 WARN_ON(block_group->space_info->disk_total
    1175             :                         < block_group->length * factor);
    1176             :         }
    1177           0 :         block_group->space_info->total_bytes -= block_group->length;
    1178           0 :         block_group->space_info->bytes_readonly -=
    1179           0 :                 (block_group->length - block_group->zone_unusable);
    1180           0 :         block_group->space_info->bytes_zone_unusable -=
    1181           0 :                 block_group->zone_unusable;
    1182           0 :         block_group->space_info->disk_total -= block_group->length * factor;
    1183             : 
    1184           0 :         spin_unlock(&block_group->space_info->lock);
    1185             : 
    1186             :         /*
    1187             :          * Remove the free space for the block group from the free space tree
    1188             :          * and the block group's item from the extent tree before marking the
    1189             :          * block group as removed. This is to prevent races with tasks that
    1190             :          * freeze and unfreeze a block group, this task and another task
    1191             :          * allocating a new block group - the unfreeze task ends up removing
    1192             :          * the block group's extent map before the task calling this function
    1193             :          * deletes the block group item from the extent tree, allowing for
    1194             :          * another task to attempt to create another block group with the same
    1195             :          * item key (and failing with -EEXIST and a transaction abort).
    1196             :          */
    1197           0 :         ret = remove_block_group_free_space(trans, block_group);
    1198           0 :         if (ret)
    1199           0 :                 goto out;
    1200             : 
    1201           0 :         ret = remove_block_group_item(trans, path, block_group);
    1202           0 :         if (ret < 0)
    1203           0 :                 goto out;
    1204             : 
    1205           0 :         spin_lock(&block_group->lock);
    1206           0 :         set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags);
    1207             : 
    1208             :         /*
    1209             :          * At this point trimming or scrub can't start on this block group,
    1210             :          * because we removed the block group from the rbtree
    1211             :          * fs_info->block_group_cache_tree so no one can find it anymore and
    1212             :          * even if someone already got this block group before we removed it
    1213             :          * from the rbtree, they have already incremented block_group->frozen -
    1214             :          * if they didn't, for the trimming case they won't find any free space
    1215             :          * entries because we already removed them all when we called
    1216             :          * btrfs_remove_free_space_cache().
    1217             :          *
    1218             :          * And we must not remove the extent map from the fs_info->mapping_tree
    1219             :          * to prevent the same logical address range and physical device space
    1220             :          * ranges from being reused for a new block group. This is needed to
    1221             :          * avoid races with trimming and scrub.
    1222             :          *
    1223             :          * An fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
    1224             :          * completely transactionless, so while it is trimming a range the
    1225             :          * currently running transaction might finish and a new one start,
    1226             :          * allowing for new block groups to be created that can reuse the same
    1227             :          * physical device locations unless we take this special care.
    1228             :          *
    1229             :          * There may also be an implicit trim operation if the file system
    1230             :          * is mounted with -odiscard. The same protections must remain
    1231             :          * in place until the extents have been discarded completely when
    1232             :          * the transaction commit has completed.
    1233             :          */
    1234           0 :         remove_em = (atomic_read(&block_group->frozen) == 0);
    1235           0 :         spin_unlock(&block_group->lock);
    1236             : 
    1237           0 :         if (remove_em) {
    1238           0 :                 struct extent_map_tree *em_tree;
    1239             : 
    1240           0 :                 em_tree = &fs_info->mapping_tree;
    1241           0 :                 write_lock(&em_tree->lock);
    1242           0 :                 remove_extent_mapping(em_tree, em);
    1243           0 :                 write_unlock(&em_tree->lock);
    1244             :                 /* once for the tree */
    1245           0 :                 free_extent_map(em);
    1246             :         }
    1247             : 
    1248           0 : out:
    1249             :         /* Once for the lookup reference */
    1250           0 :         btrfs_put_block_group(block_group);
    1251           0 :         if (remove_rsv)
    1252           0 :                 btrfs_delayed_refs_rsv_release(fs_info, 1);
    1253           0 :         btrfs_free_path(path);
    1254           0 :         return ret;
    1255             : }
    1256             : 
    1257           0 : struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
    1258             :                 struct btrfs_fs_info *fs_info, const u64 chunk_offset)
    1259             : {
    1260           0 :         struct btrfs_root *root = btrfs_block_group_root(fs_info);
    1261           0 :         struct extent_map_tree *em_tree = &fs_info->mapping_tree;
    1262           0 :         struct extent_map *em;
    1263           0 :         struct map_lookup *map;
    1264           0 :         unsigned int num_items;
    1265             : 
    1266           0 :         read_lock(&em_tree->lock);
    1267           0 :         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
    1268           0 :         read_unlock(&em_tree->lock);
    1269           0 :         ASSERT(em && em->start == chunk_offset);
    1270             : 
    1271             :         /*
    1272             :          * We need to reserve 3 + N units from the metadata space info in order
    1273             :          * to remove a block group (done at btrfs_remove_chunk() and at
    1274             :          * btrfs_remove_block_group()), which are used for:
    1275             :          *
    1276             :          * 1 unit for adding the free space inode's orphan (located in the tree
    1277             :          * of tree roots).
    1278             :          * 1 unit for deleting the block group item (located in the extent
    1279             :          * tree).
    1280             :          * 1 unit for deleting the free space item (located in tree of tree
    1281             :          * roots).
    1282             :          * N units for deleting N device extent items corresponding to each
    1283             :          * stripe (located in the device tree).
    1284             :          *
    1285             :          * In order to remove a block group we also need to reserve units in the
    1286             :          * system space info in order to update the chunk tree (update one or
    1287             :          * more device items and remove one chunk item), but this is done at
    1288             :          * btrfs_remove_chunk() through a call to check_system_chunk().
    1289             :          */
    1290           0 :         map = em->map_lookup;
    1291           0 :         num_items = 3 + map->num_stripes;
    1292           0 :         free_extent_map(em);
    1293             : 
    1294           0 :         return btrfs_start_transaction_fallback_global_rsv(root, num_items);
    1295             : }
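
As a quick check of the reservation math described in the comment above, here is a standalone sketch with made-up stripe counts; items_for_removal() is an illustrative helper, not kernel API.

    #include <stdio.h>

    /*
     * 3 fixed items (free space inode orphan, block group item, free space
     * item) plus one device extent item per stripe, as described above.
     */
    static unsigned int items_for_removal(unsigned int num_stripes)
    {
            return 3 + num_stripes;
    }

    int main(void)
    {
            printf("%u\n", items_for_removal(2));   /* 2-stripe RAID1 chunk: 5 */
            printf("%u\n", items_for_removal(4));   /* 4-stripe RAID10 chunk: 7 */
            return 0;
    }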
    1296             : 
    1297             : /*
    1298             :  * Mark block group @cache read-only, so later writes won't happen to block
    1299             :  * group @cache.
    1300             :  *
    1301             :  * If @force is not set, this function will only mark the block group readonly
    1302             :  * if we have enough free space (1M) in other metadata/system block groups.
    1303             :  * If @force is set, this function will mark the block group readonly
    1304             :  * without checking free space.
    1305             :  *
    1306             :  * NOTE: This function doesn't care if other block groups can contain all the
    1307             :  * data in this block group. That check should be done by relocation routine,
    1308             :  * not this function.
    1309             :  */
    1310           0 : static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
    1311             : {
    1312           0 :         struct btrfs_space_info *sinfo = cache->space_info;
    1313           0 :         u64 num_bytes;
    1314           0 :         int ret = -ENOSPC;
    1315             : 
    1316           0 :         spin_lock(&sinfo->lock);
    1317           0 :         spin_lock(&cache->lock);
    1318             : 
    1319           0 :         if (cache->swap_extents) {
    1320           0 :                 ret = -ETXTBSY;
    1321           0 :                 goto out;
    1322             :         }
    1323             : 
    1324           0 :         if (cache->ro) {
    1325           0 :                 cache->ro++;
    1326           0 :                 ret = 0;
    1327           0 :                 goto out;
    1328             :         }
    1329             : 
    1330           0 :         num_bytes = cache->length - cache->reserved - cache->pinned -
    1331           0 :                     cache->bytes_super - cache->zone_unusable - cache->used;
    1332             : 
    1333             :         /*
    1334             :          * Data never overcommits, even in mixed mode, so just do the straight
    1335             :          * check of leftover space against how much we have allocated.
    1336             :          */
    1337           0 :         if (force) {
    1338             :                 ret = 0;
    1339           0 :         } else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) {
    1340           0 :                 u64 sinfo_used = btrfs_space_info_used(sinfo, true);
    1341             : 
    1342             :                 /*
    1343             :                  * Here we make sure if we mark this bg RO, we still have enough
    1344             :                  * Here we make sure that if we mark this bg RO, we still have
    1345             :                  * enough free space left as a buffer.
    1346           0 :                 if (sinfo_used + num_bytes <= sinfo->total_bytes)
    1347             :                         ret = 0;
    1348             :         } else {
    1349             :                 /*
    1350             :                  * We overcommit metadata, so we need to do the
    1351             :                  * btrfs_can_overcommit check here, and we need to pass in
    1352             :                  * BTRFS_RESERVE_NO_FLUSH to give ourselves the most leeway to
    1353             :                  * mark this block group as read-only.
    1354             :                  */
    1355           0 :                 if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes,
    1356             :                                          BTRFS_RESERVE_NO_FLUSH))
    1357             :                         ret = 0;
    1358             :         }
    1359             : 
    1360             :         if (!ret) {
    1361           0 :                 sinfo->bytes_readonly += num_bytes;
    1362           0 :                 if (btrfs_is_zoned(cache->fs_info)) {
    1363             :                         /* Migrate zone_unusable bytes to readonly */
    1364           0 :                         sinfo->bytes_readonly += cache->zone_unusable;
    1365           0 :                         sinfo->bytes_zone_unusable -= cache->zone_unusable;
    1366           0 :                         cache->zone_unusable = 0;
    1367             :                 }
    1368           0 :                 cache->ro++;
    1369           0 :                 list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
    1370             :         }
    1371           0 : out:
    1372           0 :         spin_unlock(&cache->lock);
    1373           0 :         spin_unlock(&sinfo->lock);
    1374           0 :         if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
    1375           0 :                 btrfs_info(cache->fs_info,
    1376             :                         "unable to make block group %llu ro", cache->start);
    1377           0 :                 btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
    1378             :         }
    1379           0 :         return ret;
    1380             : }
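
To make the read-only accounting above concrete, here is a hedged standalone sketch of the two checks; unaccounted_bytes() and can_set_data_bg_ro() are illustrative names, not kernel API. For example, a 1 GiB data block group with 700 MiB used and nothing reserved, pinned or lost to super/zone bookkeeping has 324 MiB unaccounted, and may only go read-only if the data space info still has at least that much headroom.

    #include <stdbool.h>
    #include <stdint.h>

    /*
     * Bytes in the block group that are not used, reserved, pinned or
     * otherwise accounted: flipping the group to read-only takes them out of
     * circulation.
     */
    static uint64_t unaccounted_bytes(uint64_t length, uint64_t reserved,
                                      uint64_t pinned, uint64_t bytes_super,
                                      uint64_t zone_unusable, uint64_t used)
    {
            return length - reserved - pinned - bytes_super - zone_unusable - used;
    }

    /*
     * Data never overcommits, so a data block group may only go read-only if
     * the rest of the space info can still absorb its unaccounted bytes.
     */
    static bool can_set_data_bg_ro(uint64_t sinfo_used, uint64_t sinfo_total,
                                   uint64_t num_bytes)
    {
            return sinfo_used + num_bytes <= sinfo_total;
    }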
    1381             : 
    1382           0 : static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
    1383             :                                  struct btrfs_block_group *bg)
    1384             : {
    1385           0 :         struct btrfs_fs_info *fs_info = bg->fs_info;
    1386           0 :         struct btrfs_transaction *prev_trans = NULL;
    1387           0 :         const u64 start = bg->start;
    1388           0 :         const u64 end = start + bg->length - 1;
    1389           0 :         int ret;
    1390             : 
    1391           0 :         spin_lock(&fs_info->trans_lock);
    1392           0 :         if (trans->transaction->list.prev != &fs_info->trans_list) {
    1393           0 :                 prev_trans = list_last_entry(&trans->transaction->list,
    1394             :                                              struct btrfs_transaction, list);
    1395           0 :                 refcount_inc(&prev_trans->use_count);
    1396             :         }
    1397           0 :         spin_unlock(&fs_info->trans_lock);
    1398             : 
    1399             :         /*
    1400             :          * Hold the unused_bg_unpin_mutex lock to avoid racing with
    1401             :          * btrfs_finish_extent_commit(). If we are at transaction N, another
    1402             :          * task might be running finish_extent_commit() for the previous
    1403             :          * transaction N - 1, and have seen a range belonging to the block
    1404             :          * group in pinned_extents before we were able to clear the whole block
    1405             :          * group range from pinned_extents. This means that task can look up
    1406             :          * the block group after we unpinned it from pinned_extents and removed
    1407             :          * it, leading to a BUG_ON() at unpin_extent_range().
    1408             :          */
    1409           0 :         mutex_lock(&fs_info->unused_bg_unpin_mutex);
    1410           0 :         if (prev_trans) {
    1411           0 :                 ret = clear_extent_bits(&prev_trans->pinned_extents, start, end,
    1412             :                                         EXTENT_DIRTY);
    1413           0 :                 if (ret)
    1414           0 :                         goto out;
    1415             :         }
    1416             : 
    1417           0 :         ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end,
    1418             :                                 EXTENT_DIRTY);
    1419           0 : out:
    1420           0 :         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
    1421           0 :         if (prev_trans)
    1422           0 :                 btrfs_put_transaction(prev_trans);
    1423             : 
    1424           0 :         return ret == 0;
    1425             : }
    1426             : 
    1427             : /*
    1428             :  * Process the unused_bgs list and remove any that don't have any allocated
    1429             :  * space inside of them.
    1430             :  */
    1431           0 : void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
    1432             : {
    1433           0 :         struct btrfs_block_group *block_group;
    1434           0 :         struct btrfs_space_info *space_info;
    1435           0 :         struct btrfs_trans_handle *trans;
    1436           0 :         const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC);
    1437           0 :         int ret = 0;
    1438             : 
    1439           0 :         if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
    1440             :                 return;
    1441             : 
    1442           0 :         if (btrfs_fs_closing(fs_info))
    1443             :                 return;
    1444             : 
    1445             :         /*
    1446             :          * Long running balances can keep us blocked here for eternity, so
    1447             :          * simply skip deletion if we're unable to get the mutex.
    1448             :          */
    1449           0 :         if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
    1450             :                 return;
    1451             : 
    1452           0 :         spin_lock(&fs_info->unused_bgs_lock);
    1453           0 :         while (!list_empty(&fs_info->unused_bgs)) {
    1454           0 :                 int trimming;
    1455             : 
    1456           0 :                 block_group = list_first_entry(&fs_info->unused_bgs,
    1457             :                                                struct btrfs_block_group,
    1458             :                                                bg_list);
    1459           0 :                 list_del_init(&block_group->bg_list);
    1460             : 
    1461           0 :                 space_info = block_group->space_info;
    1462             : 
    1463           0 :                 if (ret || btrfs_mixed_space_info(space_info)) {
    1464           0 :                         btrfs_put_block_group(block_group);
    1465           0 :                         continue;
    1466             :                 }
    1467           0 :                 spin_unlock(&fs_info->unused_bgs_lock);
    1468             : 
    1469           0 :                 btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
    1470             : 
    1471             :                 /* Don't want to race with allocators so take the groups_sem */
    1472           0 :                 down_write(&space_info->groups_sem);
    1473             : 
    1474             :                 /*
    1475             :                  * Async discard moves the final block group discard to be prior
    1476             :                  * to the unused_bgs code path.  Therefore, if it's not fully
    1477             :                  * trimmed, punt it back to the async discard lists.
    1478             :                  */
    1479           0 :                 if (btrfs_test_opt(fs_info, DISCARD_ASYNC) &&
    1480           0 :                     !btrfs_is_free_space_trimmed(block_group)) {
    1481           0 :                         trace_btrfs_skip_unused_block_group(block_group);
    1482           0 :                         up_write(&space_info->groups_sem);
    1483             :                         /* Requeue if we failed because of async discard */
    1484           0 :                         btrfs_discard_queue_work(&fs_info->discard_ctl,
    1485             :                                                  block_group);
    1486           0 :                         goto next;
    1487             :                 }
    1488             : 
    1489           0 :                 spin_lock(&block_group->lock);
    1490           0 :                 if (block_group->reserved || block_group->pinned ||
    1491           0 :                     block_group->used || block_group->ro ||
    1492           0 :                     list_is_singular(&block_group->list)) {
    1493             :                         /*
    1494             :                          * We want to bail if we made new allocations or have
    1495             :                          * outstanding allocations in this block group.  We do
    1496             :                          * the ro check in case balance is currently acting on
    1497             :                          * this block group.
    1498             :                          */
    1499           0 :                         trace_btrfs_skip_unused_block_group(block_group);
    1500           0 :                         spin_unlock(&block_group->lock);
    1501           0 :                         up_write(&space_info->groups_sem);
    1502           0 :                         goto next;
    1503             :                 }
    1504           0 :                 spin_unlock(&block_group->lock);
    1505             : 
    1506             :                 /* We don't want to force the issue, only flip if it's ok. */
    1507           0 :                 ret = inc_block_group_ro(block_group, 0);
    1508           0 :                 up_write(&space_info->groups_sem);
    1509           0 :                 if (ret < 0) {
    1510           0 :                         ret = 0;
    1511           0 :                         goto next;
    1512             :                 }
    1513             : 
    1514           0 :                 ret = btrfs_zone_finish(block_group);
    1515           0 :                 if (ret < 0) {
    1516           0 :                         btrfs_dec_block_group_ro(block_group);
    1517           0 :                         if (ret == -EAGAIN)
    1518           0 :                                 ret = 0;
    1519           0 :                         goto next;
    1520             :                 }
    1521             : 
    1522             :                 /*
    1523             :                  * Want to do this before we do anything else so we can recover
    1524             :                  * properly if we fail to join the transaction.
    1525             :                  */
    1526           0 :                 trans = btrfs_start_trans_remove_block_group(fs_info,
    1527             :                                                      block_group->start);
    1528           0 :                 if (IS_ERR(trans)) {
    1529           0 :                         btrfs_dec_block_group_ro(block_group);
    1530           0 :                         ret = PTR_ERR(trans);
    1531           0 :                         goto next;
    1532             :                 }
    1533             : 
    1534             :                 /*
    1535             :                  * We could have pending pinned extents for this block group,
    1536             :                  * just delete them, we don't care about them anymore.
    1537             :                  */
    1538           0 :                 if (!clean_pinned_extents(trans, block_group)) {
    1539           0 :                         btrfs_dec_block_group_ro(block_group);
    1540           0 :                         goto end_trans;
    1541             :                 }
    1542             : 
    1543             :                 /*
    1544             :                  * At this point, the block_group is read only and should fail
    1545             :                  * new allocations.  However, btrfs_finish_extent_commit() can
    1546             :                  * cause this block_group to be placed back on the discard
    1547             :                  * lists because now the block_group isn't fully discarded.
    1548             :                  * Bail here and try again later after discarding everything.
    1549             :                  */
    1550           0 :                 spin_lock(&fs_info->discard_ctl.lock);
    1551           0 :                 if (!list_empty(&block_group->discard_list)) {
    1552           0 :                         spin_unlock(&fs_info->discard_ctl.lock);
    1553           0 :                         btrfs_dec_block_group_ro(block_group);
    1554           0 :                         btrfs_discard_queue_work(&fs_info->discard_ctl,
    1555             :                                                  block_group);
    1556           0 :                         goto end_trans;
    1557             :                 }
    1558           0 :                 spin_unlock(&fs_info->discard_ctl.lock);
    1559             : 
    1560             :                 /* Reset pinned so btrfs_put_block_group doesn't complain */
    1561           0 :                 spin_lock(&space_info->lock);
    1562           0 :                 spin_lock(&block_group->lock);
    1563             : 
    1564           0 :                 btrfs_space_info_update_bytes_pinned(fs_info, space_info,
    1565           0 :                                                      -block_group->pinned);
    1566           0 :                 space_info->bytes_readonly += block_group->pinned;
    1567           0 :                 block_group->pinned = 0;
    1568             : 
    1569           0 :                 spin_unlock(&block_group->lock);
    1570           0 :                 spin_unlock(&space_info->lock);
    1571             : 
    1572             :                 /*
    1573             :                  * Normally an unused block group is passed in here, and
    1574             :                  * then trimming is handled in the transaction commit path.
    1575             :                  * Async discard interposes before this to do the trimming
    1576             :                  * before coming down the unused block group path as trimming
    1577             :                  * will no longer be done later in the transaction commit path.
    1578             :                  */
    1579           0 :                 if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
    1580           0 :                         goto flip_async;
    1581             : 
    1582             :                 /*
    1583             :                  * DISCARD can flip during remount. On zoned filesystems, we
    1584             :                  * need to reset sequential-required zones.
    1585             :                  */
    1586           0 :                 trimming = btrfs_test_opt(fs_info, DISCARD_SYNC) ||
    1587             :                                 btrfs_is_zoned(fs_info);
    1588             : 
    1589             :                 /* Implicit trim during transaction commit. */
    1590           0 :                 if (trimming)
    1591           0 :                         btrfs_freeze_block_group(block_group);
    1592             : 
    1593             :                 /*
    1594             :                  * btrfs_remove_chunk() will abort the transaction if things go
    1595             :                  * horribly wrong.
    1596             :                  */
    1597           0 :                 ret = btrfs_remove_chunk(trans, block_group->start);
    1598             : 
    1599           0 :                 if (ret) {
    1600           0 :                         if (trimming)
    1601           0 :                                 btrfs_unfreeze_block_group(block_group);
    1602           0 :                         goto end_trans;
    1603             :                 }
    1604             : 
    1605             :                 /*
    1606             :                  * If we're not mounted with -odiscard, we can just forget
    1607             :                  * about this block group. Otherwise we'll need to wait
    1608             :                  * until transaction commit to do the actual discard.
    1609             :                  */
    1610           0 :                 if (trimming) {
    1611           0 :                         spin_lock(&fs_info->unused_bgs_lock);
    1612             :                         /*
    1613             :                          * A concurrent scrub might have added us to the list
    1614             :                          * fs_info->unused_bgs, so use a list_move operation
    1615             :                          * to add the block group to the deleted_bgs list.
    1616             :                          */
    1617           0 :                         list_move(&block_group->bg_list,
    1618           0 :                                   &trans->transaction->deleted_bgs);
    1619           0 :                         spin_unlock(&fs_info->unused_bgs_lock);
    1620           0 :                         btrfs_get_block_group(block_group);
    1621             :                 }
    1622           0 : end_trans:
    1623           0 :                 btrfs_end_transaction(trans);
    1624           0 : next:
    1625           0 :                 btrfs_put_block_group(block_group);
    1626           0 :                 spin_lock(&fs_info->unused_bgs_lock);
    1627             :         }
    1628           0 :         spin_unlock(&fs_info->unused_bgs_lock);
    1629           0 :         mutex_unlock(&fs_info->reclaim_bgs_lock);
    1630           0 :         return;
    1631             : 
    1632             : flip_async:
    1633           0 :         btrfs_end_transaction(trans);
    1634           0 :         mutex_unlock(&fs_info->reclaim_bgs_lock);
    1635           0 :         btrfs_put_block_group(block_group);
    1636           0 :         btrfs_discard_punt_unused_bgs_list(fs_info);
    1637             : }
    1638             : 
    1639           0 : void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
    1640             : {
    1641           0 :         struct btrfs_fs_info *fs_info = bg->fs_info;
    1642             : 
    1643           0 :         spin_lock(&fs_info->unused_bgs_lock);
    1644           0 :         if (list_empty(&bg->bg_list)) {
    1645           0 :                 btrfs_get_block_group(bg);
    1646           0 :                 trace_btrfs_add_unused_block_group(bg);
    1647           0 :                 list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
    1648           0 :         } else if (!test_bit(BLOCK_GROUP_FLAG_NEW, &bg->runtime_flags)) {
    1649             :                 /* Pull out the block group from the reclaim_bgs list. */
    1650           0 :                 trace_btrfs_add_unused_block_group(bg);
    1651           0 :                 list_move_tail(&bg->bg_list, &fs_info->unused_bgs);
    1652             :         }
    1653           0 :         spin_unlock(&fs_info->unused_bgs_lock);
    1654           0 : }
    1655             : 
    1656             : /*
    1657             :  * We want block groups with a low number of used bytes to be at the beginning
    1658             :  * of the list, so they will get reclaimed first.
    1659             :  */
    1660           0 : static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
    1661             :                            const struct list_head *b)
    1662             : {
    1663           0 :         const struct btrfs_block_group *bg1, *bg2;
    1664             : 
    1665           0 :         bg1 = list_entry(a, struct btrfs_block_group, bg_list);
    1666           0 :         bg2 = list_entry(b, struct btrfs_block_group, bg_list);
    1667             : 
    1668           0 :         return bg1->used > bg2->used;
    1669             : }
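
list_sort() treats a positive comparator return as "a sorts after b", so the comparator above yields an ascending order by used bytes. The following standalone illustration shows the resulting order using plain arrays and qsort(3) in place of list_sort().

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Ascending by used bytes: least-used block groups are reclaimed first. */
    static int cmp_used(const void *a, const void *b)
    {
            uint64_t ua = *(const uint64_t *)a;
            uint64_t ub = *(const uint64_t *)b;

            return (ua > ub) - (ua < ub);
    }

    int main(void)
    {
            uint64_t used[] = { 900ULL << 20, 100ULL << 20, 400ULL << 20 };

            qsort(used, 3, sizeof(used[0]), cmp_used);
            for (int i = 0; i < 3; i++)
                    printf("%llu MiB\n", (unsigned long long)(used[i] >> 20));
            return 0;       /* prints 100, 400, 900 */
    }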
    1670             : 
    1671             : static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info)
    1672             : {
    1673           0 :         if (btrfs_is_zoned(fs_info))
    1674           0 :                 return btrfs_zoned_should_reclaim(fs_info);
    1675             :         return true;
    1676             : }
    1677             : 
    1678           0 : static bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 bytes_freed)
    1679             : {
    1680           0 :         const struct btrfs_space_info *space_info = bg->space_info;
    1681           0 :         const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold);
    1682           0 :         const u64 new_val = bg->used;
    1683           0 :         const u64 old_val = new_val + bytes_freed;
    1684           0 :         u64 thresh;
    1685             : 
    1686           0 :         if (reclaim_thresh == 0)
    1687             :                 return false;
    1688             : 
    1689           0 :         thresh = mult_perc(bg->length, reclaim_thresh);
    1690             : 
    1691             :         /*
    1692             :          * If we were below the threshold before, don't reclaim: we are likely a
    1693             :          * brand new block group and we don't want to relocate new block groups.
    1694             :          */
    1695           0 :         if (old_val < thresh)
    1696             :                 return false;
    1697           0 :         if (new_val >= thresh)
    1698           0 :                 return false;
    1699             :         return true;
    1700             : }
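
Here is a standalone sketch of the same decision with made-up numbers; should_reclaim() below only mirrors the logic above and is not the kernel function.

    #include <stdbool.h>
    #include <stdint.h>

    static bool should_reclaim(uint64_t length, uint64_t used_now,
                               uint64_t bytes_freed, unsigned int thresh_percent)
    {
            uint64_t thresh, used_before;

            if (thresh_percent == 0)
                    return false;

            thresh = length * thresh_percent / 100; /* stands in for mult_perc() */
            used_before = used_now + bytes_freed;

            /* Reclaim only when this free crossed the threshold downwards. */
            return used_before >= thresh && used_now < thresh;
    }

    /*
     * Example: a 1 GiB block group with a 75% threshold gives thresh = 768 MiB.
     * Freeing 100 MiB that drops usage from 800 MiB to 700 MiB crosses 768 MiB
     * and queues the group for reclaim; dropping from 700 MiB to 600 MiB does
     * not.
     */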
    1701             : 
    1702           0 : void btrfs_reclaim_bgs_work(struct work_struct *work)
    1703             : {
    1704           0 :         struct btrfs_fs_info *fs_info =
    1705           0 :                 container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
    1706           0 :         struct btrfs_block_group *bg;
    1707           0 :         struct btrfs_space_info *space_info;
    1708             : 
    1709           0 :         if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
    1710             :                 return;
    1711             : 
    1712           0 :         if (btrfs_fs_closing(fs_info))
    1713             :                 return;
    1714             : 
    1715           0 :         if (!btrfs_should_reclaim(fs_info))
    1716             :                 return;
    1717             : 
    1718           0 :         sb_start_write(fs_info->sb);
    1719             : 
    1720           0 :         if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
    1721           0 :                 sb_end_write(fs_info->sb);
    1722           0 :                 return;
    1723             :         }
    1724             : 
    1725             :         /*
    1726             :          * Long running balances can keep us blocked here for eternity, so
    1727             :          * simply skip reclaim if we're unable to get the mutex.
    1728             :          */
    1729           0 :         if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
    1730           0 :                 btrfs_exclop_finish(fs_info);
    1731           0 :                 sb_end_write(fs_info->sb);
    1732           0 :                 return;
    1733             :         }
    1734             : 
    1735           0 :         spin_lock(&fs_info->unused_bgs_lock);
    1736             :         /*
    1737             :          * Sort happens under lock because we can't simply splice it and sort.
    1738             :          * The block groups might still be in use and reachable via bg_list,
    1739             :          * and their presence in the reclaim_bgs list must be preserved.
    1740             :          */
    1741           0 :         list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
    1742           0 :         while (!list_empty(&fs_info->reclaim_bgs)) {
    1743           0 :                 u64 zone_unusable;
    1744           0 :                 int ret = 0;
    1745             : 
    1746           0 :                 bg = list_first_entry(&fs_info->reclaim_bgs,
    1747             :                                       struct btrfs_block_group,
    1748             :                                       bg_list);
    1749           0 :                 list_del_init(&bg->bg_list);
    1750             : 
    1751           0 :                 space_info = bg->space_info;
    1752           0 :                 spin_unlock(&fs_info->unused_bgs_lock);
    1753             : 
    1754             :                 /* Don't race with allocators so take the groups_sem */
    1755           0 :                 down_write(&space_info->groups_sem);
    1756             : 
    1757           0 :                 spin_lock(&bg->lock);
    1758           0 :                 if (bg->reserved || bg->pinned || bg->ro) {
    1759             :                         /*
    1760             :                          * We want to bail if we made new allocations or have
    1761             :                          * outstanding allocations in this block group.  We do
    1762             :                          * the ro check in case balance is currently acting on
    1763             :                          * this block group.
    1764             :                          */
    1765           0 :                         spin_unlock(&bg->lock);
    1766           0 :                         up_write(&space_info->groups_sem);
    1767           0 :                         goto next;
    1768             :                 }
    1769           0 :                 if (bg->used == 0) {
    1770             :                         /*
    1771             :                          * It is possible that we trigger relocation on a block
    1772             :                          * group as its extents are deleted and it first goes
    1773             :                          * below the threshold, then shortly after goes empty.
    1774             :                          *
    1775             :                          * In this case, relocating it does delete it, but has
    1776             :                          * some overhead in relocation specific metadata, looking
    1777             :                          * for the non-existent extents and running some extra
    1778             :                          * transactions, which we can avoid by using one of the
    1779             :                          * other mechanisms for dealing with empty block groups.
    1780             :                          */
    1781           0 :                         if (!btrfs_test_opt(fs_info, DISCARD_ASYNC))
    1782           0 :                                 btrfs_mark_bg_unused(bg);
    1783           0 :                         spin_unlock(&bg->lock);
    1784           0 :                         up_write(&space_info->groups_sem);
    1785           0 :                         goto next;
    1786             : 
    1787             :                 }
    1788             :                 /*
    1789             :                  * The block group might no longer meet the reclaim condition by
    1790             :                  * the time we get around to reclaiming it, so to avoid
    1791             :                  * reclaiming overly full block_groups, skip reclaiming them.
    1792             :                  *
    1793             :                  * Since the decision making process also depends on the amount
    1794             :                  * being freed, pass in a fake giant value to skip that extra
    1795             :                  * check, which is more meaningful when adding to the list in
    1796             :                  * the first place.
    1797             :                  */
    1798           0 :                 if (!should_reclaim_block_group(bg, bg->length)) {
    1799           0 :                         spin_unlock(&bg->lock);
    1800           0 :                         up_write(&space_info->groups_sem);
    1801           0 :                         goto next;
    1802             :                 }
    1803           0 :                 spin_unlock(&bg->lock);
    1804             : 
    1805             :                 /*
    1806             :                  * Get out fast, in case we're read-only or unmounting the
    1807             :                  * filesystem. It is OK to drop block groups from the list even
    1808             :                  * for the read-only case. As we did sb_start_write(),
    1809             :                  * "mount -o remount,ro" won't happen and a read-only filesystem
    1810             :                  * means it is forced read-only due to a fatal error. So, it
    1811             :                  * never gets back to read-write to let us reclaim again.
    1812             :                  */
    1813           0 :                 if (btrfs_need_cleaner_sleep(fs_info)) {
    1814           0 :                         up_write(&space_info->groups_sem);
    1815           0 :                         goto next;
    1816             :                 }
    1817             : 
    1818             :                 /*
    1819             :                  * Cache the zone_unusable value before turning the block group
    1820             :                  * read-only. As soon as the block group is read-only, its
    1821             :                  * zone_unusable value gets moved to the block group's read-only
    1822             :                  * bytes and isn't available for calculations anymore.
    1823             :                  */
    1824           0 :                 zone_unusable = bg->zone_unusable;
    1825           0 :                 ret = inc_block_group_ro(bg, 0);
    1826           0 :                 up_write(&space_info->groups_sem);
    1827           0 :                 if (ret < 0)
    1828           0 :                         goto next;
    1829             : 
    1830           0 :                 btrfs_info(fs_info,
    1831             :                         "reclaiming chunk %llu with %llu%% used %llu%% unusable",
    1832             :                                 bg->start,
    1833             :                                 div64_u64(bg->used * 100, bg->length),
    1834             :                                 div64_u64(zone_unusable * 100, bg->length));
    1835           0 :                 trace_btrfs_reclaim_block_group(bg);
    1836           0 :                 ret = btrfs_relocate_chunk(fs_info, bg->start);
    1837           0 :                 if (ret) {
    1838           0 :                         btrfs_dec_block_group_ro(bg);
    1839           0 :                         btrfs_err(fs_info, "error relocating chunk %llu",
    1840             :                                   bg->start);
    1841             :                 }
    1842             : 
    1843           0 : next:
    1844           0 :                 if (ret)
    1845           0 :                         btrfs_mark_bg_to_reclaim(bg);
    1846           0 :                 btrfs_put_block_group(bg);
    1847             : 
    1848           0 :                 mutex_unlock(&fs_info->reclaim_bgs_lock);
    1849             :                 /*
     1850             :                  * Reclaiming all the block groups in the list can take a really
     1851             :                  * long time.  Prioritize cleaning up unused block groups.
    1852             :                  */
    1853           0 :                 btrfs_delete_unused_bgs(fs_info);
    1854             :                 /*
    1855             :                  * If we are interrupted by a balance, we can just bail out. The
     1856             :                  * cleaner thread will restart the work again if necessary.
    1857             :                  */
    1858           0 :                 if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
    1859           0 :                         goto end;
    1860           0 :                 spin_lock(&fs_info->unused_bgs_lock);
    1861             :         }
    1862           0 :         spin_unlock(&fs_info->unused_bgs_lock);
    1863           0 :         mutex_unlock(&fs_info->reclaim_bgs_lock);
    1864           0 : end:
    1865           0 :         btrfs_exclop_finish(fs_info);
    1866           0 :         sb_end_write(fs_info->sb);
    1867             : }
    1868             : 
    1869           0 : void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
    1870             : {
    1871           0 :         spin_lock(&fs_info->unused_bgs_lock);
    1872           0 :         if (!list_empty(&fs_info->reclaim_bgs))
    1873           0 :                 queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work);
    1874           0 :         spin_unlock(&fs_info->unused_bgs_lock);
    1875           0 : }
    1876             : 
    1877           0 : void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
    1878             : {
    1879           0 :         struct btrfs_fs_info *fs_info = bg->fs_info;
    1880             : 
    1881           0 :         spin_lock(&fs_info->unused_bgs_lock);
    1882           0 :         if (list_empty(&bg->bg_list)) {
    1883           0 :                 btrfs_get_block_group(bg);
    1884           0 :                 trace_btrfs_add_reclaim_block_group(bg);
    1885           0 :                 list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs);
    1886             :         }
    1887           0 :         spin_unlock(&fs_info->unused_bgs_lock);
    1888           0 : }
    1889             : 
    1890           0 : static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
    1891             :                            struct btrfs_path *path)
    1892             : {
    1893           0 :         struct extent_map_tree *em_tree;
    1894           0 :         struct extent_map *em;
    1895           0 :         struct btrfs_block_group_item bg;
    1896           0 :         struct extent_buffer *leaf;
    1897           0 :         int slot;
    1898           0 :         u64 flags;
    1899           0 :         int ret = 0;
    1900             : 
    1901           0 :         slot = path->slots[0];
    1902           0 :         leaf = path->nodes[0];
    1903             : 
    1904           0 :         em_tree = &fs_info->mapping_tree;
    1905           0 :         read_lock(&em_tree->lock);
    1906           0 :         em = lookup_extent_mapping(em_tree, key->objectid, key->offset);
    1907           0 :         read_unlock(&em_tree->lock);
    1908           0 :         if (!em) {
    1909           0 :                 btrfs_err(fs_info,
    1910             :                           "logical %llu len %llu found bg but no related chunk",
    1911             :                           key->objectid, key->offset);
    1912           0 :                 return -ENOENT;
    1913             :         }
    1914             : 
    1915           0 :         if (em->start != key->objectid || em->len != key->offset) {
    1916           0 :                 btrfs_err(fs_info,
    1917             :                         "block group %llu len %llu mismatch with chunk %llu len %llu",
    1918             :                         key->objectid, key->offset, em->start, em->len);
    1919           0 :                 ret = -EUCLEAN;
    1920           0 :                 goto out_free_em;
    1921             :         }
    1922             : 
    1923           0 :         read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot),
    1924             :                            sizeof(bg));
    1925           0 :         flags = btrfs_stack_block_group_flags(&bg) &
    1926             :                 BTRFS_BLOCK_GROUP_TYPE_MASK;
    1927             : 
    1928           0 :         if (flags != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
    1929           0 :                 btrfs_err(fs_info,
    1930             : "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
    1931             :                           key->objectid, key->offset, flags,
    1932             :                           (BTRFS_BLOCK_GROUP_TYPE_MASK & em->map_lookup->type));
    1933           0 :                 ret = -EUCLEAN;
    1934             :         }
    1935             : 
    1936           0 : out_free_em:
    1937           0 :         free_extent_map(em);
    1938           0 :         return ret;
    1939             : }
    1940             : 
    1941           0 : static int find_first_block_group(struct btrfs_fs_info *fs_info,
    1942             :                                   struct btrfs_path *path,
    1943             :                                   struct btrfs_key *key)
    1944             : {
    1945           0 :         struct btrfs_root *root = btrfs_block_group_root(fs_info);
    1946           0 :         int ret;
    1947           0 :         struct btrfs_key found_key;
    1948             : 
    1949           0 :         btrfs_for_each_slot(root, key, &found_key, path, ret) {
    1950           0 :                 if (found_key.objectid >= key->objectid &&
    1951           0 :                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
    1952           0 :                         return read_bg_from_eb(fs_info, &found_key, path);
    1953             :                 }
    1954             :         }
    1955             :         return ret;
    1956             : }
    1957             : 
    1958           0 : static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
    1959             : {
    1960           0 :         u64 extra_flags = chunk_to_extended(flags) &
    1961             :                                 BTRFS_EXTENDED_PROFILE_MASK;
    1962             : 
    1963           0 :         write_seqlock(&fs_info->profiles_lock);
    1964           0 :         if (flags & BTRFS_BLOCK_GROUP_DATA)
    1965           0 :                 fs_info->avail_data_alloc_bits |= extra_flags;
    1966           0 :         if (flags & BTRFS_BLOCK_GROUP_METADATA)
    1967           0 :                 fs_info->avail_metadata_alloc_bits |= extra_flags;
    1968           0 :         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
    1969           0 :                 fs_info->avail_system_alloc_bits |= extra_flags;
    1970           0 :         write_sequnlock(&fs_info->profiles_lock);
    1971           0 : }
    1972             : 
    1973             : /*
    1974             :  * Map a physical disk address to a list of logical addresses.
    1975             :  *
    1976             :  * @fs_info:       the filesystem
    1977             :  * @chunk_start:   logical address of block group
    1978             :  * @physical:      physical address to map to logical addresses
    1979             :  * @logical:       return array of logical addresses which map to @physical
    1980             :  * @naddrs:        length of @logical
    1981             :  * @stripe_len:    size of IO stripe for the given block group
    1982             :  *
    1983             :  * Maps a particular @physical disk address to a list of @logical addresses.
    1984             :  * Used primarily to exclude those portions of a block group that contain super
    1985             :  * block copies.
    1986             :  */
    1987           0 : int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
    1988             :                      u64 physical, u64 **logical, int *naddrs, int *stripe_len)
    1989             : {
    1990           0 :         struct extent_map *em;
    1991           0 :         struct map_lookup *map;
    1992           0 :         u64 *buf;
    1993           0 :         u64 bytenr;
    1994           0 :         u64 data_stripe_length;
    1995           0 :         u64 io_stripe_size;
    1996           0 :         int i, nr = 0;
    1997           0 :         int ret = 0;
    1998             : 
    1999           0 :         em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
    2000           0 :         if (IS_ERR(em))
    2001             :                 return -EIO;
    2002             : 
    2003           0 :         map = em->map_lookup;
    2004           0 :         data_stripe_length = em->orig_block_len;
    2005           0 :         io_stripe_size = BTRFS_STRIPE_LEN;
    2006           0 :         chunk_start = em->start;
    2007             : 
    2008             :         /* For RAID5/6 adjust to a full IO stripe length */
    2009           0 :         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
    2010           0 :                 io_stripe_size = btrfs_stripe_nr_to_offset(nr_data_stripes(map));
    2011             : 
    2012           0 :         buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
    2013           0 :         if (!buf) {
    2014           0 :                 ret = -ENOMEM;
    2015           0 :                 goto out;
    2016             :         }
    2017             : 
    2018           0 :         for (i = 0; i < map->num_stripes; i++) {
    2019           0 :                 bool already_inserted = false;
    2020           0 :                 u32 stripe_nr;
    2021           0 :                 u32 offset;
    2022           0 :                 int j;
    2023             : 
    2024           0 :                 if (!in_range(physical, map->stripes[i].physical,
    2025             :                               data_stripe_length))
    2026           0 :                         continue;
    2027             : 
    2028           0 :                 stripe_nr = (physical - map->stripes[i].physical) >>
    2029             :                             BTRFS_STRIPE_LEN_SHIFT;
    2030           0 :                 offset = (physical - map->stripes[i].physical) &
    2031             :                          BTRFS_STRIPE_LEN_MASK;
    2032             : 
    2033           0 :                 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
    2034             :                                  BTRFS_BLOCK_GROUP_RAID10))
    2035           0 :                         stripe_nr = div_u64(stripe_nr * map->num_stripes + i,
    2036           0 :                                             map->sub_stripes);
    2037             :                 /*
     2038             :                  * The remaining case is RAID56, where the multiplication by
     2039             :                  * nr_data_stripes() is already folded into io_stripe_size
     2040             :                  * above, so no further adjustment of stripe_nr is needed.
    2041             :                  */
    2042           0 :                 bytenr = chunk_start + stripe_nr * io_stripe_size + offset;
    2043             : 
    2044             :                 /* Ensure we don't add duplicate addresses */
    2045           0 :                 for (j = 0; j < nr; j++) {
    2046           0 :                         if (buf[j] == bytenr) {
    2047             :                                 already_inserted = true;
    2048             :                                 break;
    2049             :                         }
    2050             :                 }
    2051             : 
    2052           0 :                 if (!already_inserted)
    2053           0 :                         buf[nr++] = bytenr;
    2054             :         }
    2055             : 
    2056           0 :         *logical = buf;
    2057           0 :         *naddrs = nr;
    2058           0 :         *stripe_len = io_stripe_size;
    2059           0 : out:
    2060           0 :         free_extent_map(em);
    2061           0 :         return ret;
    2062             : }
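                      : 
                      : /*
                      :  * Editorial illustration (not part of the original source): a worked
                      :  * example of the mapping above, with assumed values.  Take a RAID10
                      :  * chunk with map->num_stripes = 4 and map->sub_stripes = 2, and a
                      :  * @physical that falls into stripe i = 2 at local stripe number 5
                      :  * (stripe_nr = 5) with a 4KiB remainder (offset = 0x1000):
                      :  *
                      :  *     stripe_nr = (5 * 4 + 2) / 2 = 11
                      :  *     bytenr    = chunk_start + 11 * BTRFS_STRIPE_LEN + 0x1000
                      :  *
                      :  * For a RAID1 or DUP chunk both stripes resolve to the same bytenr,
                      :  * which is why the duplicate check above only inserts it once.
                      :  */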
    2063             : 
    2064           0 : static int exclude_super_stripes(struct btrfs_block_group *cache)
    2065             : {
    2066           0 :         struct btrfs_fs_info *fs_info = cache->fs_info;
    2067           0 :         const bool zoned = btrfs_is_zoned(fs_info);
    2068           0 :         u64 bytenr;
    2069           0 :         u64 *logical;
    2070           0 :         int stripe_len;
    2071           0 :         int i, nr, ret;
    2072             : 
    2073           0 :         if (cache->start < BTRFS_SUPER_INFO_OFFSET) {
    2074           0 :                 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start;
    2075           0 :                 cache->bytes_super += stripe_len;
    2076           0 :                 ret = btrfs_add_excluded_extent(fs_info, cache->start,
    2077             :                                                 stripe_len);
    2078           0 :                 if (ret)
    2079             :                         return ret;
    2080             :         }
    2081             : 
    2082           0 :         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
    2083           0 :                 bytenr = btrfs_sb_offset(i);
    2084           0 :                 ret = btrfs_rmap_block(fs_info, cache->start,
    2085             :                                        bytenr, &logical, &nr, &stripe_len);
    2086           0 :                 if (ret)
    2087           0 :                         return ret;
    2088             : 
    2089             :                 /* Shouldn't have super stripes in sequential zones */
    2090           0 :                 if (zoned && nr) {
    2091           0 :                         kfree(logical);
    2092           0 :                         btrfs_err(fs_info,
    2093             :                         "zoned: block group %llu must not contain super block",
    2094             :                                   cache->start);
    2095           0 :                         return -EUCLEAN;
    2096             :                 }
    2097             : 
    2098           0 :                 while (nr--) {
    2099           0 :                         u64 len = min_t(u64, stripe_len,
    2100             :                                 cache->start + cache->length - logical[nr]);
    2101             : 
    2102           0 :                         cache->bytes_super += len;
    2103           0 :                         ret = btrfs_add_excluded_extent(fs_info, logical[nr],
    2104             :                                                         len);
    2105           0 :                         if (ret) {
    2106           0 :                                 kfree(logical);
    2107           0 :                                 return ret;
    2108             :                         }
    2109             :                 }
    2110             : 
    2111           0 :                 kfree(logical);
    2112             :         }
    2113             :         return 0;
    2114             : }
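                      : 
                      : /*
                      :  * Editorial note (not part of the original source): as a concrete
                      :  * illustration of what gets excluded above, btrfs keeps up to
                      :  * BTRFS_SUPER_MIRROR_MAX (3) super block copies at fixed offsets:
                      :  *
                      :  *     btrfs_sb_offset(0) = 64KiB  (BTRFS_SUPER_INFO_OFFSET)
                      :  *     btrfs_sb_offset(1) = 64MiB
                      :  *     btrfs_sb_offset(2) = 256GiB
                      :  *
                      :  * Only block groups whose stripes cover these physical offsets get
                      :  * bytes_super bumped and the corresponding ranges marked as excluded
                      :  * extents.
                      :  */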
    2115             : 
    2116           0 : static struct btrfs_block_group *btrfs_create_block_group_cache(
    2117             :                 struct btrfs_fs_info *fs_info, u64 start)
    2118             : {
    2119           0 :         struct btrfs_block_group *cache;
    2120             : 
    2121           0 :         cache = kzalloc(sizeof(*cache), GFP_NOFS);
    2122           0 :         if (!cache)
    2123             :                 return NULL;
    2124             : 
    2125           0 :         cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
    2126             :                                         GFP_NOFS);
    2127           0 :         if (!cache->free_space_ctl) {
    2128           0 :                 kfree(cache);
    2129           0 :                 return NULL;
    2130             :         }
    2131             : 
    2132           0 :         cache->start = start;
    2133             : 
    2134           0 :         cache->fs_info = fs_info;
    2135           0 :         cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
    2136             : 
    2137           0 :         cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
    2138             : 
    2139           0 :         refcount_set(&cache->refs, 1);
    2140           0 :         spin_lock_init(&cache->lock);
    2141           0 :         init_rwsem(&cache->data_rwsem);
    2142           0 :         INIT_LIST_HEAD(&cache->list);
    2143           0 :         INIT_LIST_HEAD(&cache->cluster_list);
    2144           0 :         INIT_LIST_HEAD(&cache->bg_list);
    2145           0 :         INIT_LIST_HEAD(&cache->ro_list);
    2146           0 :         INIT_LIST_HEAD(&cache->discard_list);
    2147           0 :         INIT_LIST_HEAD(&cache->dirty_list);
    2148           0 :         INIT_LIST_HEAD(&cache->io_list);
    2149           0 :         INIT_LIST_HEAD(&cache->active_bg_list);
    2150           0 :         btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
    2151           0 :         atomic_set(&cache->frozen, 0);
    2152           0 :         mutex_init(&cache->free_space_lock);
    2153             : 
    2154           0 :         return cache;
    2155             : }
    2156             : 
    2157             : /*
    2158             :  * Iterate all chunks and verify that each of them has the corresponding block
    2159             :  * group
    2160             :  */
    2161           0 : static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
    2162             : {
    2163           0 :         struct extent_map_tree *map_tree = &fs_info->mapping_tree;
    2164           0 :         struct extent_map *em;
    2165           0 :         struct btrfs_block_group *bg;
    2166           0 :         u64 start = 0;
    2167           0 :         int ret = 0;
    2168             : 
    2169           0 :         while (1) {
    2170           0 :                 read_lock(&map_tree->lock);
    2171             :                 /*
    2172             :                  * lookup_extent_mapping will return the first extent map
    2173             :                  * intersecting the range, so setting @len to 1 is enough to
    2174             :                  * get the first chunk.
    2175             :                  */
    2176           0 :                 em = lookup_extent_mapping(map_tree, start, 1);
    2177           0 :                 read_unlock(&map_tree->lock);
    2178           0 :                 if (!em)
    2179             :                         break;
    2180             : 
    2181           0 :                 bg = btrfs_lookup_block_group(fs_info, em->start);
    2182           0 :                 if (!bg) {
    2183           0 :                         btrfs_err(fs_info,
    2184             :         "chunk start=%llu len=%llu doesn't have corresponding block group",
    2185             :                                      em->start, em->len);
    2186           0 :                         ret = -EUCLEAN;
    2187           0 :                         free_extent_map(em);
    2188           0 :                         break;
    2189             :                 }
    2190           0 :                 if (bg->start != em->start || bg->length != em->len ||
    2191           0 :                     (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
    2192           0 :                     (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
    2193           0 :                         btrfs_err(fs_info,
    2194             : "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
    2195             :                                 em->start, em->len,
    2196             :                                 em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
    2197             :                                 bg->start, bg->length,
    2198             :                                 bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
    2199           0 :                         ret = -EUCLEAN;
    2200           0 :                         free_extent_map(em);
    2201           0 :                         btrfs_put_block_group(bg);
    2202           0 :                         break;
    2203             :                 }
    2204           0 :                 start = em->start + em->len;
    2205           0 :                 free_extent_map(em);
    2206           0 :                 btrfs_put_block_group(bg);
    2207             :         }
    2208           0 :         return ret;
    2209             : }
    2210             : 
    2211           0 : static int read_one_block_group(struct btrfs_fs_info *info,
    2212             :                                 struct btrfs_block_group_item *bgi,
    2213             :                                 const struct btrfs_key *key,
    2214             :                                 int need_clear)
    2215             : {
    2216           0 :         struct btrfs_block_group *cache;
    2217           0 :         const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS);
    2218           0 :         int ret;
    2219             : 
    2220           0 :         ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY);
    2221             : 
    2222           0 :         cache = btrfs_create_block_group_cache(info, key->objectid);
    2223           0 :         if (!cache)
    2224             :                 return -ENOMEM;
    2225             : 
    2226           0 :         cache->length = key->offset;
    2227           0 :         cache->used = btrfs_stack_block_group_used(bgi);
    2228           0 :         cache->commit_used = cache->used;
    2229           0 :         cache->flags = btrfs_stack_block_group_flags(bgi);
    2230           0 :         cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi);
    2231             : 
    2232           0 :         set_free_space_tree_thresholds(cache);
    2233             : 
    2234           0 :         if (need_clear) {
    2235             :                 /*
     2236             :                  * When we mount with an old space cache, we need to
     2237             :                  * set BTRFS_DC_CLEAR and set the dirty flag.
     2238             :                  *
     2239             :                  * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
     2240             :                  *    truncate the old free space cache inode and
     2241             :                  *    set up a new one.
     2242             :                  * b) Setting the 'dirty flag' makes sure that we flush
     2243             :                  *    the new space cache info onto disk.
    2244             :                  */
    2245           0 :                 if (btrfs_test_opt(info, SPACE_CACHE))
    2246           0 :                         cache->disk_cache_state = BTRFS_DC_CLEAR;
    2247             :         }
    2248           0 :         if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
    2249             :             (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
    2250           0 :                         btrfs_err(info,
    2251             : "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
    2252             :                                   cache->start);
    2253           0 :                         ret = -EINVAL;
    2254           0 :                         goto error;
    2255             :         }
    2256             : 
    2257           0 :         ret = btrfs_load_block_group_zone_info(cache, false);
    2258           0 :         if (ret) {
    2259           0 :                 btrfs_err(info, "zoned: failed to load zone info of bg %llu",
    2260             :                           cache->start);
    2261           0 :                 goto error;
    2262             :         }
    2263             : 
    2264             :         /*
    2265             :          * We need to exclude the super stripes now so that the space info has
    2266             :          * super bytes accounted for, otherwise we'll think we have more space
    2267             :          * than we actually do.
    2268             :          */
    2269           0 :         ret = exclude_super_stripes(cache);
    2270           0 :         if (ret) {
    2271             :                 /* We may have excluded something, so call this just in case. */
    2272           0 :                 btrfs_free_excluded_extents(cache);
    2273           0 :                 goto error;
    2274             :         }
    2275             : 
    2276             :         /*
     2277             :          * For a zoned filesystem, space after the allocation offset is the only
    2278             :          * free space for a block group. So, we don't need any caching work.
    2279             :          * btrfs_calc_zone_unusable() will set the amount of free space and
    2280             :          * zone_unusable space.
    2281             :          *
     2282             :          * For a regular filesystem, check for two cases: either we are full, and
    2283             :          * therefore don't need to bother with the caching work since we won't
    2284             :          * find any space, or we are empty, and we can just add all the space
    2285             :          * in and be done with it.  This saves us _a_lot_ of time, particularly
    2286             :          * in the full case.
    2287             :          */
    2288           0 :         if (btrfs_is_zoned(info)) {
    2289           0 :                 btrfs_calc_zone_unusable(cache);
    2290             :                 /* Should not have any excluded extents. Just in case, though. */
    2291           0 :                 btrfs_free_excluded_extents(cache);
    2292           0 :         } else if (cache->length == cache->used) {
    2293           0 :                 cache->cached = BTRFS_CACHE_FINISHED;
    2294           0 :                 btrfs_free_excluded_extents(cache);
    2295           0 :         } else if (cache->used == 0) {
    2296           0 :                 cache->cached = BTRFS_CACHE_FINISHED;
    2297           0 :                 add_new_free_space(cache, cache->start,
    2298             :                                    cache->start + cache->length);
    2299           0 :                 btrfs_free_excluded_extents(cache);
    2300             :         }
    2301             : 
    2302           0 :         ret = btrfs_add_block_group_cache(info, cache);
    2303           0 :         if (ret) {
    2304           0 :                 btrfs_remove_free_space_cache(cache);
    2305           0 :                 goto error;
    2306             :         }
    2307           0 :         trace_btrfs_add_block_group(info, cache, 0);
    2308           0 :         btrfs_add_bg_to_space_info(info, cache);
    2309             : 
    2310           0 :         set_avail_alloc_bits(info, cache->flags);
    2311           0 :         if (btrfs_chunk_writeable(info, cache->start)) {
    2312           0 :                 if (cache->used == 0) {
    2313           0 :                         ASSERT(list_empty(&cache->bg_list));
    2314           0 :                         if (btrfs_test_opt(info, DISCARD_ASYNC))
    2315           0 :                                 btrfs_discard_queue_work(&info->discard_ctl, cache);
    2316             :                         else
    2317           0 :                                 btrfs_mark_bg_unused(cache);
    2318             :                 }
    2319             :         } else {
    2320           0 :                 inc_block_group_ro(cache, 1);
    2321             :         }
    2322             : 
    2323             :         return 0;
    2324           0 : error:
    2325           0 :         btrfs_put_block_group(cache);
    2326           0 :         return ret;
    2327             : }
    2328             : 
    2329           0 : static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
    2330             : {
    2331           0 :         struct extent_map_tree *em_tree = &fs_info->mapping_tree;
    2332           0 :         struct rb_node *node;
    2333           0 :         int ret = 0;
    2334             : 
    2335           0 :         for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
    2336           0 :                 struct extent_map *em;
    2337           0 :                 struct map_lookup *map;
    2338           0 :                 struct btrfs_block_group *bg;
    2339             : 
    2340           0 :                 em = rb_entry(node, struct extent_map, rb_node);
    2341           0 :                 map = em->map_lookup;
    2342           0 :                 bg = btrfs_create_block_group_cache(fs_info, em->start);
    2343           0 :                 if (!bg) {
    2344             :                         ret = -ENOMEM;
    2345             :                         break;
    2346             :                 }
    2347             : 
    2348             :                 /* Fill dummy cache as FULL */
    2349           0 :                 bg->length = em->len;
    2350           0 :                 bg->flags = map->type;
    2351           0 :                 bg->cached = BTRFS_CACHE_FINISHED;
    2352           0 :                 bg->used = em->len;
    2354           0 :                 ret = btrfs_add_block_group_cache(fs_info, bg);
    2355             :                 /*
    2356             :                  * We may have some valid block group cache added already, in
    2357             :                  * that case we skip to the next one.
    2358             :                  */
    2359           0 :                 if (ret == -EEXIST) {
    2360           0 :                         ret = 0;
    2361           0 :                         btrfs_put_block_group(bg);
    2362           0 :                         continue;
    2363             :                 }
    2364             : 
    2365           0 :                 if (ret) {
    2366           0 :                         btrfs_remove_free_space_cache(bg);
    2367           0 :                         btrfs_put_block_group(bg);
    2368           0 :                         break;
    2369             :                 }
    2370             : 
    2371           0 :                 btrfs_add_bg_to_space_info(fs_info, bg);
    2372             : 
    2373           0 :                 set_avail_alloc_bits(fs_info, bg->flags);
    2374             :         }
    2375           0 :         if (!ret)
    2376           0 :                 btrfs_init_global_block_rsv(fs_info);
    2377           0 :         return ret;
    2378             : }
    2379             : 
    2380           0 : int btrfs_read_block_groups(struct btrfs_fs_info *info)
    2381             : {
    2382           0 :         struct btrfs_root *root = btrfs_block_group_root(info);
    2383           0 :         struct btrfs_path *path;
    2384           0 :         int ret;
    2385           0 :         struct btrfs_block_group *cache;
    2386           0 :         struct btrfs_space_info *space_info;
    2387           0 :         struct btrfs_key key;
    2388           0 :         int need_clear = 0;
    2389           0 :         u64 cache_gen;
    2390             : 
    2391             :         /*
     2392             :          * Either there is no extent root (with the ibadroots rescue option) or
     2393             :          * we have unsupported RO options. The fs can never be mounted read-write, so no
    2394             :          * need to waste time searching block group items.
    2395             :          *
    2396             :          * This also allows new extent tree related changes to be RO compat,
    2397             :          * no need for a full incompat flag.
    2398             :          */
    2399           0 :         if (!root || (btrfs_super_compat_ro_flags(info->super_copy) &
    2400             :                       ~BTRFS_FEATURE_COMPAT_RO_SUPP))
    2401           0 :                 return fill_dummy_bgs(info);
    2402             : 
    2403           0 :         key.objectid = 0;
    2404           0 :         key.offset = 0;
    2405           0 :         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
    2406           0 :         path = btrfs_alloc_path();
    2407           0 :         if (!path)
    2408             :                 return -ENOMEM;
    2409             : 
    2410           0 :         cache_gen = btrfs_super_cache_generation(info->super_copy);
    2411           0 :         if (btrfs_test_opt(info, SPACE_CACHE) &&
    2412             :             btrfs_super_generation(info->super_copy) != cache_gen)
    2413           0 :                 need_clear = 1;
    2414           0 :         if (btrfs_test_opt(info, CLEAR_CACHE))
    2415           0 :                 need_clear = 1;
    2416             : 
    2417           0 :         while (1) {
    2418           0 :                 struct btrfs_block_group_item bgi;
    2419           0 :                 struct extent_buffer *leaf;
    2420           0 :                 int slot;
    2421             : 
    2422           0 :                 ret = find_first_block_group(info, path, &key);
    2423           0 :                 if (ret > 0)
    2424             :                         break;
    2425           0 :                 if (ret != 0)
    2426           0 :                         goto error;
    2427             : 
    2428           0 :                 leaf = path->nodes[0];
    2429           0 :                 slot = path->slots[0];
    2430             : 
    2431           0 :                 read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
    2432             :                                    sizeof(bgi));
    2433             : 
    2434           0 :                 btrfs_item_key_to_cpu(leaf, &key, slot);
    2435           0 :                 btrfs_release_path(path);
    2436           0 :                 ret = read_one_block_group(info, &bgi, &key, need_clear);
    2437           0 :                 if (ret < 0)
    2438           0 :                         goto error;
    2439           0 :                 key.objectid += key.offset;
    2440           0 :                 key.offset = 0;
    2441             :         }
    2442           0 :         btrfs_release_path(path);
    2443             : 
    2444           0 :         list_for_each_entry(space_info, &info->space_info, list) {
    2445             :                 int i;
    2446             : 
    2447           0 :                 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
    2448           0 :                         if (list_empty(&space_info->block_groups[i]))
    2449           0 :                                 continue;
    2450           0 :                         cache = list_first_entry(&space_info->block_groups[i],
    2451             :                                                  struct btrfs_block_group,
    2452             :                                                  list);
    2453           0 :                         btrfs_sysfs_add_block_group_type(cache);
    2454             :                 }
    2455             : 
    2456           0 :                 if (!(btrfs_get_alloc_profile(info, space_info->flags) &
    2457             :                       (BTRFS_BLOCK_GROUP_RAID10 |
    2458             :                        BTRFS_BLOCK_GROUP_RAID1_MASK |
    2459             :                        BTRFS_BLOCK_GROUP_RAID56_MASK |
    2460             :                        BTRFS_BLOCK_GROUP_DUP)))
    2461           0 :                         continue;
    2462             :                 /*
    2463             :                  * Avoid allocating from un-mirrored block group if there are
    2464             :                  * mirrored block groups.
    2465             :                  */
    2466           0 :                 list_for_each_entry(cache,
    2467             :                                 &space_info->block_groups[BTRFS_RAID_RAID0],
    2468             :                                 list)
    2469           0 :                         inc_block_group_ro(cache, 1);
    2470           0 :                 list_for_each_entry(cache,
    2471             :                                 &space_info->block_groups[BTRFS_RAID_SINGLE],
    2472             :                                 list)
    2473           0 :                         inc_block_group_ro(cache, 1);
    2474             :         }
    2475             : 
    2476           0 :         btrfs_init_global_block_rsv(info);
    2477           0 :         ret = check_chunk_block_group_mappings(info);
    2478           0 : error:
    2479           0 :         btrfs_free_path(path);
    2480             :         /*
     2481             :          * We've hit some error while reading the extent tree and have the
     2482             :          * rescue=ibadroots mount option set.
    2483             :          * Try to fill the tree using dummy block groups so that the user can
    2484             :          * continue to mount and grab their data.
    2485             :          */
    2486           0 :         if (ret && btrfs_test_opt(info, IGNOREBADROOTS))
    2487           0 :                 ret = fill_dummy_bgs(info);
    2488             :         return ret;
    2489             : }
    2490             : 
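                      : /*
                      :  * Editorial note (not part of the original source): the "phases"
                      :  * referenced by the next few functions are described in the comment at
                      :  * btrfs_chunk_alloc(), which is not part of this excerpt.  Roughly,
                      :  * phase 1 reserves device space and creates the block group in memory,
                      :  * adding it to the transaction's new_bgs list; phase 2, driven by
                      :  * btrfs_create_pending_block_groups() below, persists the block group
                      :  * item, the chunk item and the device extent items to their btrees.
                      :  */
                      : 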
    2491             : /*
    2492             :  * This function, insert_block_group_item(), belongs to the phase 2 of chunk
    2493             :  * allocation.
    2494             :  *
    2495             :  * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
    2496             :  * phases.
    2497             :  */
    2498           0 : static int insert_block_group_item(struct btrfs_trans_handle *trans,
    2499             :                                    struct btrfs_block_group *block_group)
    2500             : {
    2501           0 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    2502           0 :         struct btrfs_block_group_item bgi;
    2503           0 :         struct btrfs_root *root = btrfs_block_group_root(fs_info);
    2504           0 :         struct btrfs_key key;
    2505           0 :         u64 old_commit_used;
    2506           0 :         int ret;
    2507             : 
    2508           0 :         spin_lock(&block_group->lock);
    2509           0 :         btrfs_set_stack_block_group_used(&bgi, block_group->used);
    2510           0 :         btrfs_set_stack_block_group_chunk_objectid(&bgi,
    2511             :                                                    block_group->global_root_id);
    2512           0 :         btrfs_set_stack_block_group_flags(&bgi, block_group->flags);
    2513           0 :         old_commit_used = block_group->commit_used;
    2514           0 :         block_group->commit_used = block_group->used;
    2515           0 :         key.objectid = block_group->start;
    2516           0 :         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
    2517           0 :         key.offset = block_group->length;
    2518           0 :         spin_unlock(&block_group->lock);
    2519             : 
    2520           0 :         ret = btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
    2521           0 :         if (ret < 0) {
    2522           0 :                 spin_lock(&block_group->lock);
    2523           0 :                 block_group->commit_used = old_commit_used;
    2524           0 :                 spin_unlock(&block_group->lock);
    2525             :         }
    2526             : 
    2527           0 :         return ret;
    2528             : }
    2529             : 
    2530           0 : static int insert_dev_extent(struct btrfs_trans_handle *trans,
    2531             :                             struct btrfs_device *device, u64 chunk_offset,
    2532             :                             u64 start, u64 num_bytes)
    2533             : {
    2534           0 :         struct btrfs_fs_info *fs_info = device->fs_info;
    2535           0 :         struct btrfs_root *root = fs_info->dev_root;
    2536           0 :         struct btrfs_path *path;
    2537           0 :         struct btrfs_dev_extent *extent;
    2538           0 :         struct extent_buffer *leaf;
    2539           0 :         struct btrfs_key key;
    2540           0 :         int ret;
    2541             : 
    2542           0 :         WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
    2543           0 :         WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
    2544           0 :         path = btrfs_alloc_path();
    2545           0 :         if (!path)
    2546             :                 return -ENOMEM;
    2547             : 
    2548           0 :         key.objectid = device->devid;
    2549           0 :         key.type = BTRFS_DEV_EXTENT_KEY;
    2550           0 :         key.offset = start;
    2551           0 :         ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent));
    2552           0 :         if (ret)
    2553           0 :                 goto out;
    2554             : 
    2555           0 :         leaf = path->nodes[0];
    2556           0 :         extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
    2557           0 :         btrfs_set_dev_extent_chunk_tree(leaf, extent, BTRFS_CHUNK_TREE_OBJECTID);
    2558           0 :         btrfs_set_dev_extent_chunk_objectid(leaf, extent,
    2559             :                                             BTRFS_FIRST_CHUNK_TREE_OBJECTID);
    2560           0 :         btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
    2561             : 
    2562           0 :         btrfs_set_dev_extent_length(leaf, extent, num_bytes);
    2563           0 :         btrfs_mark_buffer_dirty(leaf);
    2564           0 : out:
    2565           0 :         btrfs_free_path(path);
    2566           0 :         return ret;
    2567             : }
    2568             : 
    2569             : /*
    2570             :  * This function belongs to phase 2.
    2571             :  *
    2572             :  * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
    2573             :  * phases.
    2574             :  */
    2575           0 : static int insert_dev_extents(struct btrfs_trans_handle *trans,
    2576             :                                    u64 chunk_offset, u64 chunk_size)
    2577             : {
    2578           0 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    2579           0 :         struct btrfs_device *device;
    2580           0 :         struct extent_map *em;
    2581           0 :         struct map_lookup *map;
    2582           0 :         u64 dev_offset;
    2583           0 :         u64 stripe_size;
    2584           0 :         int i;
    2585           0 :         int ret = 0;
    2586             : 
    2587           0 :         em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
    2588           0 :         if (IS_ERR(em))
    2589           0 :                 return PTR_ERR(em);
    2590             : 
    2591           0 :         map = em->map_lookup;
    2592           0 :         stripe_size = em->orig_block_len;
    2593             : 
    2594             :         /*
     2595             :          * Take the device list mutex to prevent races with the final phase of
     2596             :          * a device replace operation that replaces the device object associated
     2597             :          * with the map's stripes. The device object's id can change at any time
     2598             :          * during that final phase of the device replace operation
     2599             :          * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
     2600             :          * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
     2601             :          * resulting in persisting a device extent item with such an ID.
    2602             :          */
    2603           0 :         mutex_lock(&fs_info->fs_devices->device_list_mutex);
    2604           0 :         for (i = 0; i < map->num_stripes; i++) {
    2605           0 :                 device = map->stripes[i].dev;
    2606           0 :                 dev_offset = map->stripes[i].physical;
    2607             : 
    2608           0 :                 ret = insert_dev_extent(trans, device, chunk_offset, dev_offset,
    2609             :                                        stripe_size);
    2610           0 :                 if (ret)
    2611             :                         break;
    2612             :         }
    2613           0 :         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
    2614             : 
    2615           0 :         free_extent_map(em);
    2616           0 :         return ret;
    2617             : }
    2618             : 
    2619             : /*
    2620             :  * This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of
    2621             :  * chunk allocation.
    2622             :  *
    2623             :  * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
    2624             :  * phases.
    2625             :  */
    2626           0 : void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
    2627             : {
    2628           0 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    2629           0 :         struct btrfs_block_group *block_group;
    2630           0 :         int ret = 0;
    2631             : 
    2632           0 :         while (!list_empty(&trans->new_bgs)) {
    2633           0 :                 int index;
    2634             : 
    2635           0 :                 block_group = list_first_entry(&trans->new_bgs,
    2636             :                                                struct btrfs_block_group,
    2637             :                                                bg_list);
    2638           0 :                 if (ret)
    2639           0 :                         goto next;
    2640             : 
    2641           0 :                 index = btrfs_bg_flags_to_raid_index(block_group->flags);
    2642             : 
    2643           0 :                 ret = insert_block_group_item(trans, block_group);
    2644           0 :                 if (ret)
    2645           0 :                         btrfs_abort_transaction(trans, ret);
    2646           0 :                 if (!test_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED,
    2647             :                               &block_group->runtime_flags)) {
    2648           0 :                         mutex_lock(&fs_info->chunk_mutex);
    2649           0 :                         ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group);
    2650           0 :                         mutex_unlock(&fs_info->chunk_mutex);
    2651           0 :                         if (ret)
    2652           0 :                                 btrfs_abort_transaction(trans, ret);
    2653             :                 }
    2654           0 :                 ret = insert_dev_extents(trans, block_group->start,
    2655             :                                          block_group->length);
    2656           0 :                 if (ret)
    2657           0 :                         btrfs_abort_transaction(trans, ret);
    2658           0 :                 add_block_group_free_space(trans, block_group);
    2659             : 
    2660             :                 /*
    2661             :                  * If we restriped during balance, we may have added a new raid
    2662             :                  * type, so now add the sysfs entries when it is safe to do so.
    2663             :                  * We don't have to worry about locking here as it's handled in
    2664             :                  * btrfs_sysfs_add_block_group_type.
    2665             :                  */
    2666           0 :                 if (block_group->space_info->block_group_kobjs[index] == NULL)
    2667           0 :                         btrfs_sysfs_add_block_group_type(block_group);
    2668             : 
    2669             :                 /* Already aborted the transaction if it failed. */
    2670           0 : next:
    2671           0 :                 btrfs_delayed_refs_rsv_release(fs_info, 1);
    2672           0 :                 list_del_init(&block_group->bg_list);
    2673           0 :                 clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags);
    2674             :         }
    2675           0 :         btrfs_trans_release_chunk_metadata(trans);
    2676           0 : }
    2677             : 
    2678             : /*
    2679             :  * For extent tree v2 we use the block_group_item->chunk_offset to point at our
    2680             :  * global root id.  For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID.
    2681             :  */
    2682             : static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset)
    2683             : {
    2684           0 :         u64 div = SZ_1G;
    2685           0 :         u64 index;
    2686             : 
    2687           0 :         if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
    2688             :                 return BTRFS_FIRST_CHUNK_TREE_OBJECTID;
    2689             : 
     2690             :         /* For a smaller fs (<= 10GiB), base the index on 128MiB instead. */
    2691           0 :         if (btrfs_super_total_bytes(fs_info->super_copy) <= (SZ_1G * 10ULL))
    2692           0 :                 div = SZ_128M;
    2693             : 
    2694           0 :         offset = div64_u64(offset, div);
    2695           0 :         div64_u64_rem(offset, fs_info->nr_global_roots, &index);
    2696           0 :         return index;
    2697             : }
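                      : 
                      : /*
                      :  * Editorial illustration (not part of the original source), with assumed
                      :  * numbers: on an EXTENT_TREE_V2 filesystem larger than 10GiB, div stays
                      :  * at 1GiB.  With a hypothetical nr_global_roots = 4 and a block group at
                      :  * offset 5.5GiB:
                      :  *
                      :  *     index = (5.5GiB / 1GiB) % 4 = 5 % 4 = 1
                      :  *
                      :  * so the block group is assigned global root id 1.  Without the
                      :  * EXTENT_TREE_V2 incompat flag the result is always
                      :  * BTRFS_FIRST_CHUNK_TREE_OBJECTID.
                      :  */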
    2698             : 
    2699           0 : struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
    2700             :                                                  u64 type,
    2701             :                                                  u64 chunk_offset, u64 size)
    2702             : {
    2703           0 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    2704           0 :         struct btrfs_block_group *cache;
    2705           0 :         int ret;
    2706             : 
    2707           0 :         btrfs_set_log_full_commit(trans);
    2708             : 
    2709           0 :         cache = btrfs_create_block_group_cache(fs_info, chunk_offset);
    2710           0 :         if (!cache)
    2711             :                 return ERR_PTR(-ENOMEM);
    2712             : 
    2713             :         /*
    2714             :          * Mark it as new before adding it to the rbtree of block groups or any
    2715             :          * list, so that no other task finds it and calls btrfs_mark_bg_unused()
    2716             :          * before the new flag is set.
    2717             :          */
    2718           0 :         set_bit(BLOCK_GROUP_FLAG_NEW, &cache->runtime_flags);
    2719             : 
    2720           0 :         cache->length = size;
    2721           0 :         set_free_space_tree_thresholds(cache);
    2722           0 :         cache->flags = type;
    2723           0 :         cache->cached = BTRFS_CACHE_FINISHED;
    2724           0 :         cache->global_root_id = calculate_global_root_id(fs_info, cache->start);
    2725             : 
    2726           0 :         if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
    2727           0 :                 set_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &cache->runtime_flags);
    2728             : 
    2729           0 :         ret = btrfs_load_block_group_zone_info(cache, true);
    2730           0 :         if (ret) {
    2731           0 :                 btrfs_put_block_group(cache);
    2732           0 :                 return ERR_PTR(ret);
    2733             :         }
    2734             : 
    2735           0 :         ret = exclude_super_stripes(cache);
    2736           0 :         if (ret) {
    2737             :                 /* We may have excluded something, so call this just in case */
    2738           0 :                 btrfs_free_excluded_extents(cache);
    2739           0 :                 btrfs_put_block_group(cache);
    2740           0 :                 return ERR_PTR(ret);
    2741             :         }
    2742             : 
    2743           0 :         add_new_free_space(cache, chunk_offset, chunk_offset + size);
    2744             : 
    2745           0 :         btrfs_free_excluded_extents(cache);
    2746             : 
    2747             :         /*
    2748             :          * Ensure the corresponding space_info object is created and
    2749             :          * assigned to our block group. We want our bg to be added to the rbtree
    2750             :          * with its ->space_info set.
    2751             :          */
    2752           0 :         cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
    2753           0 :         ASSERT(cache->space_info);
    2754             : 
    2755           0 :         ret = btrfs_add_block_group_cache(fs_info, cache);
    2756           0 :         if (ret) {
    2757           0 :                 btrfs_remove_free_space_cache(cache);
    2758           0 :                 btrfs_put_block_group(cache);
    2759           0 :                 return ERR_PTR(ret);
    2760             :         }
    2761             : 
    2762             :         /*
    2763             :          * Now that our block group has its ->space_info set and is inserted in
    2764             :          * the rbtree, update the space info's counters.
    2765             :          */
    2766           0 :         trace_btrfs_add_block_group(fs_info, cache, 1);
    2767           0 :         btrfs_add_bg_to_space_info(fs_info, cache);
    2768           0 :         btrfs_update_global_block_rsv(fs_info);
    2769             : 
    2770             : #ifdef CONFIG_BTRFS_DEBUG
    2771             :         if (btrfs_should_fragment_free_space(cache)) {
    2772             :                 cache->space_info->bytes_used += size >> 1;
    2773             :                 fragment_free_space(cache);
    2774             :         }
    2775             : #endif
    2776             : 
    2777           0 :         list_add_tail(&cache->bg_list, &trans->new_bgs);
    2778           0 :         trans->delayed_ref_updates++;
    2779           0 :         btrfs_update_delayed_refs_rsv(trans);
    2780             : 
    2781           0 :         set_avail_alloc_bits(fs_info, type);
    2782           0 :         return cache;
    2783             : }
    2784             : 
    2785             : /*
     2786             :  * Mark one block group RO; it can be called several times for the same block
    2787             :  * group.
    2788             :  *
    2789             :  * @cache:              the destination block group
     2790             :  * @do_chunk_alloc:     whether we need to do chunk pre-allocation; this is to
    2791             :  *                      ensure we still have some free space after marking this
    2792             :  *                      block group RO.
    2793             :  */
    2794           0 : int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
    2795             :                              bool do_chunk_alloc)
    2796             : {
    2797           0 :         struct btrfs_fs_info *fs_info = cache->fs_info;
    2798           0 :         struct btrfs_trans_handle *trans;
    2799           0 :         struct btrfs_root *root = btrfs_block_group_root(fs_info);
    2800           0 :         u64 alloc_flags;
    2801           0 :         int ret;
    2802           0 :         bool dirty_bg_running;
    2803             : 
    2804             :         /*
     2805             :          * This can only happen when we are doing a read-only scrub on a
     2806             :          * read-only mount.
     2807             :          * In that case we should not start a new transaction on a read-only
     2808             :          * fs.  Thus here we skip all chunk allocations.
    2809             :          */
    2810           0 :         if (sb_rdonly(fs_info->sb)) {
    2811           0 :                 mutex_lock(&fs_info->ro_block_group_mutex);
    2812           0 :                 ret = inc_block_group_ro(cache, 0);
    2813           0 :                 mutex_unlock(&fs_info->ro_block_group_mutex);
    2814           0 :                 return ret;
    2815             :         }
    2816             : 
    2817           0 :         do {
    2818           0 :                 trans = btrfs_join_transaction(root);
    2819           0 :                 if (IS_ERR(trans))
    2820           0 :                         return PTR_ERR(trans);
    2821             : 
    2822           0 :                 dirty_bg_running = false;
    2823             : 
    2824             :                 /*
    2825             :                  * We're not allowed to set block groups readonly after the dirty
    2826             :                  * block group cache has started writing.  If it already started,
    2827             :                  * back off and let this transaction commit.
    2828             :                  */
    2829           0 :                 mutex_lock(&fs_info->ro_block_group_mutex);
    2830           0 :                 if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
    2831           0 :                         u64 transid = trans->transid;
    2832             : 
    2833           0 :                         mutex_unlock(&fs_info->ro_block_group_mutex);
    2834           0 :                         btrfs_end_transaction(trans);
    2835             : 
    2836           0 :                         ret = btrfs_wait_for_commit(fs_info, transid);
    2837           0 :                         if (ret)
    2838           0 :                                 return ret;
    2839             :                         dirty_bg_running = true;
    2840             :                 }
    2841           0 :         } while (dirty_bg_running);
    2842             : 
    2843           0 :         if (do_chunk_alloc) {
    2844             :                 /*
    2845             :                  * If we are changing raid levels, try to allocate a
    2846             :                  * corresponding block group with the new raid level.
    2847             :                  */
    2848           0 :                 alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
    2849           0 :                 if (alloc_flags != cache->flags) {
    2850           0 :                         ret = btrfs_chunk_alloc(trans, alloc_flags,
    2851             :                                                 CHUNK_ALLOC_FORCE);
    2852             :                         /*
    2853             :                          * ENOSPC is allowed here, we may have enough space
    2854             :                          * already allocated at the new raid level to carry on
    2855             :                          */
    2856           0 :                         if (ret == -ENOSPC)
    2857             :                                 ret = 0;
    2858           0 :                         if (ret < 0)
    2859           0 :                                 goto out;
    2860             :                 }
    2861             :         }
    2862             : 
    2863           0 :         ret = inc_block_group_ro(cache, 0);
    2864           0 :         if (!ret)
    2865           0 :                 goto out;
    2866           0 :         if (ret == -ETXTBSY)
    2867           0 :                 goto unlock_out;
    2868             : 
    2869             :         /*
     2870             :          * Skip chunk allocation if the bg is SYSTEM, to avoid a system chunk
     2871             :          * allocation storm exhausting the system chunk array.  Otherwise
    2872             :          * we still want to try our best to mark the block group read-only.
    2873             :          */
    2874           0 :         if (!do_chunk_alloc && ret == -ENOSPC &&
    2875           0 :             (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM))
    2876           0 :                 goto unlock_out;
    2877             : 
    2878           0 :         alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags);
    2879           0 :         ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
    2880           0 :         if (ret < 0)
    2881           0 :                 goto out;
    2882             :         /*
    2883             :          * We have allocated a new chunk. We also need to activate that chunk to
     2884             :          * grant metadata tickets on a zoned filesystem.
    2885             :          */
    2886           0 :         ret = btrfs_zoned_activate_one_bg(fs_info, cache->space_info, true);
    2887           0 :         if (ret < 0)
    2888           0 :                 goto out;
    2889             : 
    2890           0 :         ret = inc_block_group_ro(cache, 0);
    2891           0 :         if (ret == -ETXTBSY)
    2892           0 :                 goto unlock_out;
    2893           0 : out:
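                      :         /*
                      :          * Marking a SYSTEM block group read-only shrinks the space available
                      :          * for chunk btree modifications, so make sure the system space_info
                      :          * still has enough reserved for them (check_system_chunk() allocates
                      :          * a new system chunk if needed).
                      :          */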
    2894           0 :         if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
    2895           0 :                 alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
    2896           0 :                 mutex_lock(&fs_info->chunk_mutex);
    2897           0 :                 check_system_chunk(trans, alloc_flags);
    2898           0 :                 mutex_unlock(&fs_info->chunk_mutex);
    2899             :         }
    2900           0 : unlock_out:
    2901           0 :         mutex_unlock(&fs_info->ro_block_group_mutex);
    2902             : 
    2903           0 :         btrfs_end_transaction(trans);
    2904           0 :         return ret;
    2905             : }
    2906             : 
    2907           0 : void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
    2908             : {
    2909           0 :         struct btrfs_space_info *sinfo = cache->space_info;
    2910           0 :         u64 num_bytes;
    2911             : 
    2912           0 :         BUG_ON(!cache->ro);
    2913             : 
    2914           0 :         spin_lock(&sinfo->lock);
    2915           0 :         spin_lock(&cache->lock);
    2916           0 :         if (!--cache->ro) {
    2917           0 :                 if (btrfs_is_zoned(cache->fs_info)) {
    2918             :                         /* Migrate zone_unusable bytes back */
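                      :                         /*
                      :                          * Roughly: bytes written and later freed
                      :                          * (alloc_offset - used) plus the space past the
                      :                          * usable zone capacity (length - zone_capacity)
                      :                          * cannot be reused until the zone is reset, so
                      :                          * move them from bytes_readonly back to the
                      :                          * zone_unusable counters.
                      :                          */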
    2919           0 :                         cache->zone_unusable =
    2920           0 :                                 (cache->alloc_offset - cache->used) +
    2921           0 :                                 (cache->length - cache->zone_capacity);
    2922           0 :                         sinfo->bytes_zone_unusable += cache->zone_unusable;
    2923           0 :                         sinfo->bytes_readonly -= cache->zone_unusable;
    2924             :                 }
    2925           0 :                 num_bytes = cache->length - cache->reserved -
    2926           0 :                             cache->pinned - cache->bytes_super -
    2927           0 :                             cache->zone_unusable - cache->used;
    2928           0 :                 sinfo->bytes_readonly -= num_bytes;
    2929           0 :                 list_del_init(&cache->ro_list);
    2930             :         }
    2931           0 :         spin_unlock(&cache->lock);
    2932           0 :         spin_unlock(&sinfo->lock);
    2933           0 : }
    2934             : 
    2935           0 : static int update_block_group_item(struct btrfs_trans_handle *trans,
    2936             :                                    struct btrfs_path *path,
    2937             :                                    struct btrfs_block_group *cache)
    2938             : {
    2939           0 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    2940           0 :         int ret;
    2941           0 :         struct btrfs_root *root = btrfs_block_group_root(fs_info);
    2942           0 :         unsigned long bi;
    2943           0 :         struct extent_buffer *leaf;
    2944           0 :         struct btrfs_block_group_item bgi;
    2945           0 :         struct btrfs_key key;
    2946           0 :         u64 old_commit_used;
    2947           0 :         u64 used;
    2948             : 
    2949             :         /*
     2950             :          * Block group item updates can be triggered outside of the commit
     2951             :          * transaction critical section, thus we need a consistent view of used bytes.
    2952             :          * We cannot use cache->used directly outside of the spin lock, as it
    2953             :          * may be changed.
    2954             :          */
    2955           0 :         spin_lock(&cache->lock);
    2956           0 :         old_commit_used = cache->commit_used;
    2957           0 :         used = cache->used;
    2958             :         /* No change in used bytes, can safely skip it. */
    2959           0 :         if (cache->commit_used == used) {
    2960           0 :                 spin_unlock(&cache->lock);
    2961           0 :                 return 0;
    2962             :         }
    2963           0 :         cache->commit_used = used;
    2964           0 :         spin_unlock(&cache->lock);
    2965             : 
    2966           0 :         key.objectid = cache->start;
    2967           0 :         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
    2968           0 :         key.offset = cache->length;
    2969             : 
    2970           0 :         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
    2971           0 :         if (ret) {
    2972           0 :                 if (ret > 0)
    2973           0 :                         ret = -ENOENT;
    2974           0 :                 goto fail;
    2975             :         }
    2976             : 
    2977           0 :         leaf = path->nodes[0];
    2978           0 :         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
    2979           0 :         btrfs_set_stack_block_group_used(&bgi, used);
    2980           0 :         btrfs_set_stack_block_group_chunk_objectid(&bgi,
    2981             :                                                    cache->global_root_id);
    2982           0 :         btrfs_set_stack_block_group_flags(&bgi, cache->flags);
    2983           0 :         write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
    2984           0 :         btrfs_mark_buffer_dirty(leaf);
    2985           0 : fail:
    2986           0 :         btrfs_release_path(path);
    2987             :         /* We didn't update the block group item, need to revert @commit_used. */
    2988           0 :         if (ret < 0) {
    2989           0 :                 spin_lock(&cache->lock);
    2990           0 :                 cache->commit_used = old_commit_used;
    2991           0 :                 spin_unlock(&cache->lock);
    2992             :         }
    2993             :         return ret;
    2994             : 
    2995             : }
    2996             : 
    2997           0 : static int cache_save_setup(struct btrfs_block_group *block_group,
    2998             :                             struct btrfs_trans_handle *trans,
    2999             :                             struct btrfs_path *path)
    3000             : {
    3001           0 :         struct btrfs_fs_info *fs_info = block_group->fs_info;
    3002           0 :         struct btrfs_root *root = fs_info->tree_root;
    3003           0 :         struct inode *inode = NULL;
    3004           0 :         struct extent_changeset *data_reserved = NULL;
    3005           0 :         u64 alloc_hint = 0;
    3006           0 :         int dcs = BTRFS_DC_ERROR;
    3007           0 :         u64 cache_size = 0;
    3008           0 :         int retries = 0;
    3009           0 :         int ret = 0;
    3010             : 
    3011           0 :         if (!btrfs_test_opt(fs_info, SPACE_CACHE))
    3012             :                 return 0;
    3013             : 
    3014             :         /*
     3015             :          * If this block group is smaller than 100 megs, don't bother caching
     3016             :          * the block group.
    3017             :          */
    3018           0 :         if (block_group->length < (100 * SZ_1M)) {
    3019           0 :                 spin_lock(&block_group->lock);
    3020           0 :                 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
    3021           0 :                 spin_unlock(&block_group->lock);
    3022           0 :                 return 0;
    3023             :         }
    3024             : 
    3025           0 :         if (TRANS_ABORTED(trans))
    3026             :                 return 0;
    3027           0 : again:
    3028           0 :         inode = lookup_free_space_inode(block_group, path);
    3029           0 :         if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
    3030           0 :                 ret = PTR_ERR(inode);
    3031           0 :                 btrfs_release_path(path);
    3032           0 :                 goto out;
    3033             :         }
    3034             : 
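                      :         /*
                      :          * The only error left here is -ENOENT: the free space inode does
                      :          * not exist yet, so create it and retry the lookup once.
                      :          */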
    3035           0 :         if (IS_ERR(inode)) {
    3036           0 :                 BUG_ON(retries);
    3037           0 :                 retries++;
    3038             : 
    3039           0 :                 if (block_group->ro)
    3040           0 :                         goto out_free;
    3041             : 
    3042           0 :                 ret = create_free_space_inode(trans, block_group, path);
    3043           0 :                 if (ret)
    3044           0 :                         goto out_free;
    3045           0 :                 goto again;
    3046             :         }
    3047             : 
    3048             :         /*
     3049             :          * We want to set the generation to 0, so that if anything goes wrong
     3050             :          * from here on out we know not to trust this cache when we load it up
     3051             :          * next time.
    3052             :          */
    3053           0 :         BTRFS_I(inode)->generation = 0;
    3054           0 :         ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
    3055           0 :         if (ret) {
    3056             :                 /*
     3057             :                  * Theoretically we could recover from this: simply set the
     3058             :                  * super cache generation to 0 so we know to invalidate the
     3059             :                  * cache, but then we'd have to keep track of the block groups
     3060             :                  * that fail this way so we know we _have_ to reset this cache
     3061             :                  * before the next commit or risk reading a stale cache.  So to
     3062             :                  * limit our exposure to horrible edge cases, let's just abort
     3063             :                  * the transaction; this only happens in really bad situations
     3064             :                  * anyway.
    3065             :                  */
    3066           0 :                 btrfs_abort_transaction(trans, ret);
    3067           0 :                 goto out_put;
    3068             :         }
    3069           0 :         WARN_ON(ret);
    3070             : 
    3071             :         /* We've already setup this transaction, go ahead and exit */
    3072           0 :         if (block_group->cache_generation == trans->transid &&
    3073             :             i_size_read(inode)) {
    3074           0 :                 dcs = BTRFS_DC_SETUP;
    3075           0 :                 goto out_put;
    3076             :         }
    3077             : 
    3078           0 :         if (i_size_read(inode) > 0) {
    3079           0 :                 ret = btrfs_check_trunc_cache_free_space(fs_info,
    3080             :                                         &fs_info->global_block_rsv);
    3081           0 :                 if (ret)
    3082           0 :                         goto out_put;
    3083             : 
    3084           0 :                 ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
    3085           0 :                 if (ret)
    3086           0 :                         goto out_put;
    3087             :         }
    3088             : 
    3089           0 :         spin_lock(&block_group->lock);
    3090           0 :         if (block_group->cached != BTRFS_CACHE_FINISHED ||
    3091           0 :             !btrfs_test_opt(fs_info, SPACE_CACHE)) {
    3092             :                 /*
     3093             :                  * Don't bother trying to write stuff out _if_:
     3094             :                  * a) we're not cached,
     3095             :                  * b) we're mounted with the nospace_cache option,
     3096             :                  * c) we're using the v2 space cache (FREE_SPACE_TREE).
    3097             :                  */
    3098           0 :                 dcs = BTRFS_DC_WRITTEN;
    3099           0 :                 spin_unlock(&block_group->lock);
    3100           0 :                 goto out_put;
    3101             :         }
    3102           0 :         spin_unlock(&block_group->lock);
    3103             : 
    3104             :         /*
     3105             :          * We hit an ENOSPC when setting up the cache in this transaction, so just
     3106             :          * skip doing the setup; we've already cleared the cache so we're safe.
    3107             :          */
    3108           0 :         if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
    3109           0 :                 ret = -ENOSPC;
    3110           0 :                 goto out_put;
    3111             :         }
    3112             : 
    3113             :         /*
    3114             :          * Try to preallocate enough space based on how big the block group is.
    3115             :          * Keep in mind this has to include any pinned space which could end up
    3116             :          * taking up quite a bit since it's not folded into the other space
    3117             :          * cache.
    3118             :          */
    3119           0 :         cache_size = div_u64(block_group->length, SZ_256M);
    3120           0 :         if (!cache_size)
    3121           0 :                 cache_size = 1;
    3122             : 
    3123           0 :         cache_size *= 16;
    3124           0 :         cache_size *= fs_info->sectorsize;
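                      :         /*
                      :          * E.g. with a 4K sectorsize this preallocates 16 * 4K = 64K of
                      :          * cache space for every 256M of block group length, with a 64K
                      :          * minimum for block groups smaller than 256M.
                      :          */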
    3125             : 
    3126           0 :         ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0,
    3127             :                                           cache_size, false);
    3128           0 :         if (ret)
    3129           0 :                 goto out_put;
    3130             : 
    3131           0 :         ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, cache_size,
    3132             :                                               cache_size, cache_size,
    3133             :                                               &alloc_hint);
    3134             :         /*
    3135             :          * Our cache requires contiguous chunks so that we don't modify a bunch
    3136             :          * of metadata or split extents when writing the cache out, which means
     3137             :          * we can hit ENOSPC if we are heavily fragmented, in addition to just
     3138             :          * normal out of space conditions.  So if we hit this, just skip setting
     3139             :          * up any other block groups for this transaction; maybe we'll unpin enough
    3140             :          * space the next time around.
    3141             :          */
    3142           0 :         if (!ret)
    3143             :                 dcs = BTRFS_DC_SETUP;
    3144           0 :         else if (ret == -ENOSPC)
    3145           0 :                 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
    3146             : 
    3147           0 : out_put:
    3148           0 :         iput(inode);
    3149           0 : out_free:
    3150           0 :         btrfs_release_path(path);
    3151           0 : out:
    3152           0 :         spin_lock(&block_group->lock);
    3153           0 :         if (!ret && dcs == BTRFS_DC_SETUP)
    3154           0 :                 block_group->cache_generation = trans->transid;
    3155           0 :         block_group->disk_cache_state = dcs;
    3156           0 :         spin_unlock(&block_group->lock);
    3157             : 
    3158           0 :         extent_changeset_free(data_reserved);
    3159           0 :         return ret;
    3160             : }
    3161             : 
    3162           0 : int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
    3163             : {
    3164           0 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    3165           0 :         struct btrfs_block_group *cache, *tmp;
    3166           0 :         struct btrfs_transaction *cur_trans = trans->transaction;
    3167           0 :         struct btrfs_path *path;
    3168             : 
    3169           0 :         if (list_empty(&cur_trans->dirty_bgs) ||
    3170           0 :             !btrfs_test_opt(fs_info, SPACE_CACHE))
    3171             :                 return 0;
    3172             : 
    3173           0 :         path = btrfs_alloc_path();
    3174           0 :         if (!path)
    3175             :                 return -ENOMEM;
    3176             : 
    3177             :         /* Could add new block groups, use _safe just in case */
    3178           0 :         list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
    3179             :                                  dirty_list) {
    3180           0 :                 if (cache->disk_cache_state == BTRFS_DC_CLEAR)
    3181           0 :                         cache_save_setup(cache, trans, path);
    3182             :         }
    3183             : 
    3184           0 :         btrfs_free_path(path);
    3185           0 :         return 0;
    3186             : }
    3187             : 
    3188             : /*
    3189             :  * Transaction commit does final block group cache writeback during a critical
    3190             :  * section where nothing is allowed to change the FS.  This is required in
    3191             :  * order for the cache to actually match the block group, but can introduce a
    3192             :  * lot of latency into the commit.
    3193             :  *
    3194             :  * So, btrfs_start_dirty_block_groups is here to kick off block group cache IO.
    3195             :  * There's a chance we'll have to redo some of it if the block group changes
    3196             :  * again during the commit, but it greatly reduces the commit latency by
    3197             :  * getting rid of the easy block groups while we're still allowing others to
    3198             :  * join the commit.
    3199             :  */
    3200           0 : int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
    3201             : {
    3202           0 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    3203           0 :         struct btrfs_block_group *cache;
    3204           0 :         struct btrfs_transaction *cur_trans = trans->transaction;
    3205           0 :         int ret = 0;
    3206           0 :         int should_put;
    3207           0 :         struct btrfs_path *path = NULL;
    3208           0 :         LIST_HEAD(dirty);
    3209           0 :         struct list_head *io = &cur_trans->io_bgs;
    3210           0 :         int loops = 0;
    3211             : 
    3212           0 :         spin_lock(&cur_trans->dirty_bgs_lock);
    3213           0 :         if (list_empty(&cur_trans->dirty_bgs)) {
    3214           0 :                 spin_unlock(&cur_trans->dirty_bgs_lock);
    3215           0 :                 return 0;
    3216             :         }
    3217           0 :         list_splice_init(&cur_trans->dirty_bgs, &dirty);
    3218           0 :         spin_unlock(&cur_trans->dirty_bgs_lock);
    3219             : 
    3220           0 : again:
    3221             :         /* Make sure all the block groups on our dirty list actually exist */
    3222           0 :         btrfs_create_pending_block_groups(trans);
    3223             : 
    3224           0 :         if (!path) {
    3225           0 :                 path = btrfs_alloc_path();
    3226           0 :                 if (!path) {
    3227           0 :                         ret = -ENOMEM;
    3228           0 :                         goto out;
    3229             :                 }
    3230             :         }
    3231             : 
    3232             :         /*
     3233             :          * cache_write_mutex is here only to protect us from balance or the
     3234             :          * automatic removal of empty block groups deleting this block group
     3235             :          * while we are writing out the cache.
    3236             :          */
    3237           0 :         mutex_lock(&trans->transaction->cache_write_mutex);
    3238           0 :         while (!list_empty(&dirty)) {
    3239           0 :                 bool drop_reserve = true;
    3240             : 
    3241           0 :                 cache = list_first_entry(&dirty, struct btrfs_block_group,
    3242             :                                          dirty_list);
    3243             :                 /*
    3244             :                  * This can happen if something re-dirties a block group that
    3245             :                  * is already under IO.  Just wait for it to finish and then do
    3246             :                  * it all again
    3247             :                  */
    3248           0 :                 if (!list_empty(&cache->io_list)) {
    3249           0 :                         list_del_init(&cache->io_list);
    3250           0 :                         btrfs_wait_cache_io(trans, cache, path);
    3251           0 :                         btrfs_put_block_group(cache);
    3252             :                 }
    3253             : 
    3254             : 
    3255             :                 /*
    3256             :                  * btrfs_wait_cache_io uses the cache->dirty_list to decide if
    3257             :                  * it should update the cache_state.  Don't delete until after
    3258             :                  * we wait.
    3259             :                  *
    3260             :                  * Since we're not running in the commit critical section
    3261             :                  * we need the dirty_bgs_lock to protect from update_block_group
    3262             :                  */
    3263           0 :                 spin_lock(&cur_trans->dirty_bgs_lock);
    3264           0 :                 list_del_init(&cache->dirty_list);
    3265           0 :                 spin_unlock(&cur_trans->dirty_bgs_lock);
    3266             : 
    3267           0 :                 should_put = 1;
    3268             : 
    3269           0 :                 cache_save_setup(cache, trans, path);
    3270             : 
    3271           0 :                 if (cache->disk_cache_state == BTRFS_DC_SETUP) {
    3272           0 :                         cache->io_ctl.inode = NULL;
    3273           0 :                         ret = btrfs_write_out_cache(trans, cache, path);
    3274           0 :                         if (ret == 0 && cache->io_ctl.inode) {
    3275           0 :                                 should_put = 0;
    3276             : 
    3277             :                                 /*
    3278             :                                  * The cache_write_mutex is protecting the
     3279             :                                  * io_list; also refer to the definition of
    3280             :                                  * btrfs_transaction::io_bgs for more details
    3281             :                                  */
    3282           0 :                                 list_add_tail(&cache->io_list, io);
    3283             :                         } else {
    3284             :                                 /*
    3285             :                                  * If we failed to write the cache, the
    3286             :                                  * generation will be bad and life goes on
    3287             :                                  */
    3288             :                                 ret = 0;
    3289             :                         }
    3290             :                 }
    3291             :                 if (!ret) {
    3292           0 :                         ret = update_block_group_item(trans, path, cache);
    3293             :                         /*
    3294             :                          * Our block group might still be attached to the list
    3295             :                          * of new block groups in the transaction handle of some
    3296             :                          * other task (struct btrfs_trans_handle->new_bgs). This
    3297             :                          * means its block group item isn't yet in the extent
    3298             :                          * tree. If this happens ignore the error, as we will
    3299             :                          * try again later in the critical section of the
    3300             :                          * transaction commit.
    3301             :                          */
    3302           0 :                         if (ret == -ENOENT) {
    3303           0 :                                 ret = 0;
    3304           0 :                                 spin_lock(&cur_trans->dirty_bgs_lock);
    3305           0 :                                 if (list_empty(&cache->dirty_list)) {
    3306           0 :                                         list_add_tail(&cache->dirty_list,
    3307             :                                                       &cur_trans->dirty_bgs);
    3308           0 :                                         btrfs_get_block_group(cache);
    3309           0 :                                         drop_reserve = false;
    3310             :                                 }
    3311           0 :                                 spin_unlock(&cur_trans->dirty_bgs_lock);
    3312           0 :                         } else if (ret) {
    3313           0 :                                 btrfs_abort_transaction(trans, ret);
    3314             :                         }
    3315             :                 }
    3316             : 
    3317             :                 /* If it's not on the io list, we need to put the block group */
    3318           0 :                 if (should_put)
    3319           0 :                         btrfs_put_block_group(cache);
    3320           0 :                 if (drop_reserve)
    3321           0 :                         btrfs_delayed_refs_rsv_release(fs_info, 1);
    3322             :                 /*
    3323             :                  * Avoid blocking other tasks for too long. It might even save
    3324             :                  * us from writing caches for block groups that are going to be
    3325             :                  * removed.
    3326             :                  */
    3327           0 :                 mutex_unlock(&trans->transaction->cache_write_mutex);
    3328           0 :                 if (ret)
    3329           0 :                         goto out;
    3330           0 :                 mutex_lock(&trans->transaction->cache_write_mutex);
    3331             :         }
    3332           0 :         mutex_unlock(&trans->transaction->cache_write_mutex);
    3333             : 
    3334             :         /*
    3335             :          * Go through delayed refs for all the stuff we've just kicked off
    3336             :          * and then loop back (just once)
    3337             :          */
    3338           0 :         if (!ret)
    3339           0 :                 ret = btrfs_run_delayed_refs(trans, 0);
    3340           0 :         if (!ret && loops == 0) {
    3341           0 :                 loops++;
    3342           0 :                 spin_lock(&cur_trans->dirty_bgs_lock);
    3343           0 :                 list_splice_init(&cur_trans->dirty_bgs, &dirty);
    3344             :                 /*
    3345             :                  * dirty_bgs_lock protects us from concurrent block group
    3346             :                  * deletes too (not just cache_write_mutex).
    3347             :                  */
    3348           0 :                 if (!list_empty(&dirty)) {
    3349           0 :                         spin_unlock(&cur_trans->dirty_bgs_lock);
    3350           0 :                         goto again;
    3351             :                 }
    3352           0 :                 spin_unlock(&cur_trans->dirty_bgs_lock);
    3353             :         }
    3354           0 : out:
    3355           0 :         if (ret < 0) {
    3356           0 :                 spin_lock(&cur_trans->dirty_bgs_lock);
    3357           0 :                 list_splice_init(&dirty, &cur_trans->dirty_bgs);
    3358           0 :                 spin_unlock(&cur_trans->dirty_bgs_lock);
    3359           0 :                 btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
    3360             :         }
    3361             : 
    3362           0 :         btrfs_free_path(path);
    3363           0 :         return ret;
    3364             : }
    3365             : 
    3366           0 : int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
    3367             : {
    3368           0 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    3369           0 :         struct btrfs_block_group *cache;
    3370           0 :         struct btrfs_transaction *cur_trans = trans->transaction;
    3371           0 :         int ret = 0;
    3372           0 :         int should_put;
    3373           0 :         struct btrfs_path *path;
    3374           0 :         struct list_head *io = &cur_trans->io_bgs;
    3375             : 
    3376           0 :         path = btrfs_alloc_path();
    3377           0 :         if (!path)
    3378             :                 return -ENOMEM;
    3379             : 
    3380             :         /*
    3381             :          * Even though we are in the critical section of the transaction commit,
    3382             :          * we can still have concurrent tasks adding elements to this
    3383             :          * transaction's list of dirty block groups. These tasks correspond to
    3384             :          * endio free space workers started when writeback finishes for a
    3385             :          * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
    3386             :          * allocate new block groups as a result of COWing nodes of the root
    3387             :          * tree when updating the free space inode. The writeback for the space
    3388             :          * caches is triggered by an earlier call to
    3389             :          * btrfs_start_dirty_block_groups() and iterations of the following
    3390             :          * loop.
    3391             :          * Also we want to do the cache_save_setup first and then run the
    3392             :          * delayed refs to make sure we have the best chance at doing this all
    3393             :          * in one shot.
    3394             :          */
    3395           0 :         spin_lock(&cur_trans->dirty_bgs_lock);
    3396           0 :         while (!list_empty(&cur_trans->dirty_bgs)) {
    3397           0 :                 cache = list_first_entry(&cur_trans->dirty_bgs,
    3398             :                                          struct btrfs_block_group,
    3399             :                                          dirty_list);
    3400             : 
    3401             :                 /*
    3402             :                  * This can happen if cache_save_setup re-dirties a block group
    3403             :                  * that is already under IO.  Just wait for it to finish and
    3404             :                  * then do it all again
    3405             :                  */
    3406           0 :                 if (!list_empty(&cache->io_list)) {
    3407           0 :                         spin_unlock(&cur_trans->dirty_bgs_lock);
    3408           0 :                         list_del_init(&cache->io_list);
    3409           0 :                         btrfs_wait_cache_io(trans, cache, path);
    3410           0 :                         btrfs_put_block_group(cache);
    3411           0 :                         spin_lock(&cur_trans->dirty_bgs_lock);
    3412             :                 }
    3413             : 
    3414             :                 /*
    3415             :                  * Don't remove from the dirty list until after we've waited on
    3416             :                  * any pending IO
    3417             :                  */
    3418           0 :                 list_del_init(&cache->dirty_list);
    3419           0 :                 spin_unlock(&cur_trans->dirty_bgs_lock);
    3420           0 :                 should_put = 1;
    3421             : 
    3422           0 :                 cache_save_setup(cache, trans, path);
    3423             : 
    3424           0 :                 if (!ret)
    3425           0 :                         ret = btrfs_run_delayed_refs(trans,
    3426             :                                                      (unsigned long) -1);
    3427             : 
    3428           0 :                 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
    3429           0 :                         cache->io_ctl.inode = NULL;
    3430           0 :                         ret = btrfs_write_out_cache(trans, cache, path);
    3431           0 :                         if (ret == 0 && cache->io_ctl.inode) {
    3432           0 :                                 should_put = 0;
    3433           0 :                                 list_add_tail(&cache->io_list, io);
    3434             :                         } else {
    3435             :                                 /*
    3436             :                                  * If we failed to write the cache, the
    3437             :                                  * generation will be bad and life goes on
    3438             :                                  */
    3439             :                                 ret = 0;
    3440             :                         }
    3441             :                 }
    3442           0 :                 if (!ret) {
    3443           0 :                         ret = update_block_group_item(trans, path, cache);
    3444             :                         /*
    3445             :                          * One of the free space endio workers might have
    3446             :                          * created a new block group while updating a free space
    3447             :                          * cache's inode (at inode.c:btrfs_finish_ordered_io())
    3448             :                          * and hasn't released its transaction handle yet, in
    3449             :                          * which case the new block group is still attached to
    3450             :                          * its transaction handle and its creation has not
    3451             :                          * finished yet (no block group item in the extent tree
    3452             :                          * yet, etc). If this is the case, wait for all free
    3453             :                          * space endio workers to finish and retry. This is a
    3454             :                          * very rare case so no need for a more efficient and
    3455             :                          * complex approach.
    3456             :                          */
    3457           0 :                         if (ret == -ENOENT) {
    3458           0 :                                 wait_event(cur_trans->writer_wait,
    3459             :                                    atomic_read(&cur_trans->num_writers) == 1);
    3460           0 :                                 ret = update_block_group_item(trans, path, cache);
    3461             :                         }
    3462           0 :                         if (ret)
    3463           0 :                                 btrfs_abort_transaction(trans, ret);
    3464             :                 }
    3465             : 
     3466             :                 /* If it's not on the io list, we need to put the block group */
    3467           0 :                 if (should_put)
    3468           0 :                         btrfs_put_block_group(cache);
    3469           0 :                 btrfs_delayed_refs_rsv_release(fs_info, 1);
    3470           0 :                 spin_lock(&cur_trans->dirty_bgs_lock);
    3471             :         }
    3472           0 :         spin_unlock(&cur_trans->dirty_bgs_lock);
    3473             : 
    3474             :         /*
     3475             :          * Refer to the definition of the io_bgs member for details on why it's safe
    3476             :          * to use it without any locking
    3477             :          */
    3478           0 :         while (!list_empty(io)) {
    3479           0 :                 cache = list_first_entry(io, struct btrfs_block_group,
    3480             :                                          io_list);
    3481           0 :                 list_del_init(&cache->io_list);
    3482           0 :                 btrfs_wait_cache_io(trans, cache, path);
    3483           0 :                 btrfs_put_block_group(cache);
    3484             :         }
    3485             : 
    3486           0 :         btrfs_free_path(path);
    3487           0 :         return ret;
    3488             : }
    3489             : 
    3490           0 : int btrfs_update_block_group(struct btrfs_trans_handle *trans,
    3491             :                              u64 bytenr, u64 num_bytes, bool alloc)
    3492             : {
    3493           0 :         struct btrfs_fs_info *info = trans->fs_info;
    3494           0 :         struct btrfs_block_group *cache = NULL;
    3495           0 :         u64 total = num_bytes;
    3496           0 :         u64 old_val;
    3497           0 :         u64 byte_in_group;
    3498           0 :         int factor;
    3499           0 :         int ret = 0;
    3500             : 
    3501             :         /* Block accounting for super block */
    3502           0 :         spin_lock(&info->delalloc_root_lock);
    3503           0 :         old_val = btrfs_super_bytes_used(info->super_copy);
    3504           0 :         if (alloc)
    3505           0 :                 old_val += num_bytes;
    3506             :         else
    3507           0 :                 old_val -= num_bytes;
    3508           0 :         btrfs_set_super_bytes_used(info->super_copy, old_val);
    3509           0 :         spin_unlock(&info->delalloc_root_lock);
    3510             : 
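                      :         /*
                      :          * Walk the [bytenr, bytenr + num_bytes) range one block group at
                      :          * a time, updating each group's used bytes and dirtying it so its
                      :          * block group item gets written out.
                      :          */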
    3511           0 :         while (total) {
    3512           0 :                 struct btrfs_space_info *space_info;
    3513           0 :                 bool reclaim = false;
    3514             : 
    3515           0 :                 cache = btrfs_lookup_block_group(info, bytenr);
    3516           0 :                 if (!cache) {
    3517             :                         ret = -ENOENT;
    3518             :                         break;
    3519             :                 }
    3520           0 :                 space_info = cache->space_info;
    3521           0 :                 factor = btrfs_bg_type_to_factor(cache->flags);
    3522             : 
    3523             :                 /*
     3524             :                  * If this block group has its free space cache written out, we
    3525             :                  * need to make sure to load it if we are removing space.  This
    3526             :                  * is because we need the unpinning stage to actually add the
    3527             :                  * space back to the block group, otherwise we will leak space.
    3528             :                  */
    3529           0 :                 if (!alloc && !btrfs_block_group_done(cache))
    3530           0 :                         btrfs_cache_block_group(cache, true);
    3531             : 
    3532           0 :                 byte_in_group = bytenr - cache->start;
    3533           0 :                 WARN_ON(byte_in_group > cache->length);
    3534             : 
    3535           0 :                 spin_lock(&space_info->lock);
    3536           0 :                 spin_lock(&cache->lock);
    3537             : 
    3538           0 :                 if (btrfs_test_opt(info, SPACE_CACHE) &&
    3539           0 :                     cache->disk_cache_state < BTRFS_DC_CLEAR)
    3540           0 :                         cache->disk_cache_state = BTRFS_DC_CLEAR;
    3541             : 
    3542           0 :                 old_val = cache->used;
    3543           0 :                 num_bytes = min(total, cache->length - byte_in_group);
    3544           0 :                 if (alloc) {
    3545           0 :                         old_val += num_bytes;
    3546           0 :                         cache->used = old_val;
    3547           0 :                         cache->reserved -= num_bytes;
    3548           0 :                         space_info->bytes_reserved -= num_bytes;
    3549           0 :                         space_info->bytes_used += num_bytes;
    3550           0 :                         space_info->disk_used += num_bytes * factor;
    3551           0 :                         spin_unlock(&cache->lock);
    3552           0 :                         spin_unlock(&space_info->lock);
    3553             :                 } else {
    3554           0 :                         old_val -= num_bytes;
    3555           0 :                         cache->used = old_val;
    3556           0 :                         cache->pinned += num_bytes;
    3557           0 :                         btrfs_space_info_update_bytes_pinned(info, space_info,
    3558             :                                                              num_bytes);
    3559           0 :                         space_info->bytes_used -= num_bytes;
    3560           0 :                         space_info->disk_used -= num_bytes * factor;
    3561             : 
    3562           0 :                         reclaim = should_reclaim_block_group(cache, num_bytes);
    3563             : 
    3564           0 :                         spin_unlock(&cache->lock);
    3565           0 :                         spin_unlock(&space_info->lock);
    3566             : 
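                      :                         /*
                      :                          * Record the freed range as pinned; it becomes
                      :                          * allocatable again only once the transaction
                      :                          * commits and unpins it.
                      :                          */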
    3567           0 :                         set_extent_bit(&trans->transaction->pinned_extents,
    3568           0 :                                        bytenr, bytenr + num_bytes - 1,
    3569             :                                        EXTENT_DIRTY, NULL);
    3570             :                 }
    3571             : 
    3572           0 :                 spin_lock(&trans->transaction->dirty_bgs_lock);
    3573           0 :                 if (list_empty(&cache->dirty_list)) {
    3574           0 :                         list_add_tail(&cache->dirty_list,
    3575           0 :                                       &trans->transaction->dirty_bgs);
    3576           0 :                         trans->delayed_ref_updates++;
    3577           0 :                         btrfs_get_block_group(cache);
    3578             :                 }
    3579           0 :                 spin_unlock(&trans->transaction->dirty_bgs_lock);
    3580             : 
    3581             :                 /*
    3582             :                  * No longer have used bytes in this block group, queue it for
    3583             :                  * deletion. We do this after adding the block group to the
    3584             :                  * dirty list to avoid races between cleaner kthread and space
    3585             :                  * cache writeout.
    3586             :                  */
    3587           0 :                 if (!alloc && old_val == 0) {
    3588           0 :                         if (!btrfs_test_opt(info, DISCARD_ASYNC))
    3589           0 :                                 btrfs_mark_bg_unused(cache);
    3590           0 :                 } else if (!alloc && reclaim) {
    3591           0 :                         btrfs_mark_bg_to_reclaim(cache);
    3592             :                 }
    3593             : 
    3594           0 :                 btrfs_put_block_group(cache);
    3595           0 :                 total -= num_bytes;
    3596           0 :                 bytenr += num_bytes;
    3597             :         }
    3598             : 
    3599             :         /* Modified block groups are accounted for in the delayed_refs_rsv. */
    3600           0 :         btrfs_update_delayed_refs_rsv(trans);
    3601           0 :         return ret;
    3602             : }
    3603             : 
    3604             : /*
    3605             :  * Update the block_group and space info counters.
    3606             :  *
    3607             :  * @cache:      The cache we are manipulating
     3608             :  * @ram_bytes:  The number of bytes of file content, and will be the same as
     3609             :  *              @num_bytes except for the compression path.
    3610             :  * @num_bytes:  The number of bytes in question
    3611             :  * @delalloc:   The blocks are allocated for the delalloc write
    3612             :  *
    3613             :  * This is called by the allocator when it reserves space. If this is a
     3614             :  * reservation and the block group has become read-only, we cannot make the
     3615             :  * reservation and return -EAGAIN; otherwise this function always succeeds.
    3616             :  */
    3617           0 : int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
    3618             :                              u64 ram_bytes, u64 num_bytes, int delalloc,
    3619             :                              bool force_wrong_size_class)
    3620             : {
    3621           0 :         struct btrfs_space_info *space_info = cache->space_info;
    3622           0 :         enum btrfs_block_group_size_class size_class;
    3623           0 :         int ret = 0;
    3624             : 
    3625           0 :         spin_lock(&space_info->lock);
    3626           0 :         spin_lock(&cache->lock);
    3627           0 :         if (cache->ro) {
    3628           0 :                 ret = -EAGAIN;
    3629           0 :                 goto out;
    3630             :         }
    3631             : 
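                      :         /*
                      :          * Data block groups track an extent size class: the first
                      :          * reservation sets it, and later reservations with a different
                      :          * size class are rejected unless the caller explicitly forces a
                      :          * mismatch.
                      :          */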
    3632           0 :         if (btrfs_block_group_should_use_size_class(cache)) {
    3633           0 :                 size_class = btrfs_calc_block_group_size_class(num_bytes);
    3634           0 :                 ret = btrfs_use_block_group_size_class(cache, size_class, force_wrong_size_class);
    3635           0 :                 if (ret)
    3636           0 :                         goto out;
    3637             :         }
    3638           0 :         cache->reserved += num_bytes;
    3639           0 :         space_info->bytes_reserved += num_bytes;
    3640           0 :         trace_btrfs_space_reservation(cache->fs_info, "space_info",
    3641             :                                       space_info->flags, num_bytes, 1);
    3642           0 :         btrfs_space_info_update_bytes_may_use(cache->fs_info,
    3643           0 :                                               space_info, -ram_bytes);
    3644           0 :         if (delalloc)
    3645           0 :                 cache->delalloc_bytes += num_bytes;
    3646             : 
    3647             :         /*
    3648             :          * Compression can use less space than we reserved, so wake tickets if
    3649             :          * that happens.
    3650             :          */
    3651           0 :         if (num_bytes < ram_bytes)
    3652           0 :                 btrfs_try_granting_tickets(cache->fs_info, space_info);
    3653           0 : out:
    3654           0 :         spin_unlock(&cache->lock);
    3655           0 :         spin_unlock(&space_info->lock);
    3656           0 :         return ret;
    3657             : }
    3658             : 
    3659             : /*
    3660             :  * Update the block_group and space info counters.
    3661             :  *
    3662             :  * @cache:      The cache we are manipulating
    3663             :  * @num_bytes:  The number of bytes in question
    3664             :  * @delalloc:   The blocks are allocated for the delalloc write
    3665             :  *
    3666             :  * This is called by somebody who is freeing space that was never actually used
     3667             :  * on disk.  For example, if you reserve some space for a new leaf in transaction
    3668             :  * A and before transaction A commits you free that leaf, you call this with
    3669             :  * reserve set to 0 in order to clear the reservation.
    3670             :  */
    3671           0 : void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
    3672             :                                u64 num_bytes, int delalloc)
    3673             : {
    3674           0 :         struct btrfs_space_info *space_info = cache->space_info;
    3675             : 
    3676           0 :         spin_lock(&space_info->lock);
    3677           0 :         spin_lock(&cache->lock);
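                      :         /*
                      :          * If the block group became read-only while this reservation was
                      :          * outstanding, the freed bytes now count as read-only space.
                      :          */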
    3678           0 :         if (cache->ro)
    3679           0 :                 space_info->bytes_readonly += num_bytes;
    3680           0 :         cache->reserved -= num_bytes;
    3681           0 :         space_info->bytes_reserved -= num_bytes;
    3682           0 :         space_info->max_extent_size = 0;
    3683             : 
    3684           0 :         if (delalloc)
    3685           0 :                 cache->delalloc_bytes -= num_bytes;
    3686           0 :         spin_unlock(&cache->lock);
    3687             : 
    3688           0 :         btrfs_try_granting_tickets(cache->fs_info, space_info);
    3689           0 :         spin_unlock(&space_info->lock);
    3690           0 : }
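/*
 * [Editor's illustrative sketch -- not part of block-group.c. It only shows
 * the pattern described in the comment above: dropping a reservation for
 * space that never hit disk. The function name is hypothetical, and the
 * reservation is assumed to have been taken earlier through the reserve path
 * at the top of this file.]
 */
static void example_drop_unused_leaf_reservation(struct btrfs_block_group *cache,
						 u64 leaf_size)
{
	/*
	 * The leaf was reserved in this transaction but freed again before
	 * the transaction committed, so nothing was ever written to disk.
	 */
	btrfs_free_reserved_bytes(cache, leaf_size, 0 /* not a delalloc write */);
}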
    3691             : 
    3692           0 : static void force_metadata_allocation(struct btrfs_fs_info *info)
    3693             : {
    3694           0 :         struct list_head *head = &info->space_info;
    3695           0 :         struct btrfs_space_info *found;
    3696             : 
    3697           0 :         list_for_each_entry(found, head, list) {
    3698           0 :                 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
    3699           0 :                         found->force_alloc = CHUNK_ALLOC_FORCE;
    3700             :         }
    3701           0 : }
    3702             : 
    3703           0 : static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
    3704             :                               struct btrfs_space_info *sinfo, int force)
    3705             : {
    3706           0 :         u64 bytes_used = btrfs_space_info_used(sinfo, false);
    3707           0 :         u64 thresh;
    3708             : 
    3709           0 :         if (force == CHUNK_ALLOC_FORCE)
    3710             :                 return 1;
    3711             : 
    3712             :         /*
    3713             :          * In limited mode, we want to have some free space up to
    3714             :          * about 1% of the FS size.
    3715             :          */
    3716           0 :         if (force == CHUNK_ALLOC_LIMITED) {
    3717           0 :                 thresh = btrfs_super_total_bytes(fs_info->super_copy);
    3718           0 :                 thresh = max_t(u64, SZ_64M, mult_perc(thresh, 1));
    3719             : 
    3720           0 :                 if (sinfo->total_bytes - bytes_used < thresh)
    3721             :                         return 1;
    3722             :         }
    3723             : 
    3724           0 :         if (bytes_used + SZ_2M < mult_perc(sinfo->total_bytes, 80))
    3725           0 :                 return 0;
    3726             :         return 1;
    3727             : }
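/*
 * [Editor's note: worked example for the heuristic above, with illustrative
 * numbers. In CHUNK_ALLOC_LIMITED mode on a 1 TiB filesystem the slack
 * threshold is max(64M, 1% of 1 TiB) ~= 10 GiB, so a chunk is allocated as
 * soon as the space_info has less unused room than that. Outside limited
 * mode, allocation only happens once bytes_used gets within ~2M of 80% of
 * the space_info's total_bytes.]
 */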
    3728             : 
    3729           0 : int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
    3730             : {
    3731           0 :         u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type);
    3732             : 
    3733           0 :         return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
    3734             : }
    3735             : 
    3736           0 : static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
    3737             : {
    3738           0 :         struct btrfs_block_group *bg;
    3739           0 :         int ret;
    3740             : 
    3741             :         /*
    3742             :          * Check if we have enough space in the system space info because we
    3743             :          * will need to update device items in the chunk btree and insert a new
    3744             :          * chunk item in the chunk btree as well. This will allocate a new
    3745             :          * system block group if needed.
    3746             :          */
    3747           0 :         check_system_chunk(trans, flags);
    3748             : 
    3749           0 :         bg = btrfs_create_chunk(trans, flags);
    3750           0 :         if (IS_ERR(bg)) {
    3751           0 :                 ret = PTR_ERR(bg);
    3752           0 :                 goto out;
    3753             :         }
    3754             : 
    3755           0 :         ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
    3756             :         /*
    3757             :          * Normally we are not expected to fail with -ENOSPC here, since we have
    3758             :          * previously reserved space in the system space_info and allocated one
    3759             :          * new system chunk if necessary. However there are three exceptions:
    3760             :          *
    3761             :          * 1) We may have enough free space in the system space_info but all the
    3762             :          *    existing system block groups have a profile which can not be used
    3763             :          *    for extent allocation.
    3764             :          *
    3765             :          *    This happens when mounting in degraded mode. For example we have a
    3766             :          *    RAID1 filesystem with 2 devices, lose one device and mount the fs
    3767             :          *    using the other device in degraded mode. If we then allocate a chunk,
    3768             :          *    we may have enough free space in the existing system space_info, but
    3769             :          *    none of the block groups can be used for extent allocation since they
    3770             :          *    have a RAID1 profile, and because we are in degraded mode with a
    3771             :          *    single device, we are forced to allocate a new system chunk with a
    3772             :          *    SINGLE profile. Making check_system_chunk() iterate over all system
    3773             :          *    block groups and check if they have a usable profile and enough space
    3774             :          *    can be slow on very large filesystems, so we tolerate the -ENOSPC and
    3775             :          *    try again after forcing allocation of a new system chunk. This way
    3776             :          *    we avoid paying the cost of that search in normal circumstances, when
    3777             :          *    we were not mounted in degraded mode;
    3778             :          *
    3779             :          * 2) We had enough free space in the system space_info, and one suitable
    3780             :          *    block group to allocate from when we called check_system_chunk()
    3781             :          *    above. However right after we called it, the only system block group
    3782             :          *    with enough free space got turned into RO mode by a running scrub,
    3783             :          *    and in this case we have to allocate a new one and retry. We only
    3784             :          *    need to do this allocation and retry once, since we have a transaction
    3785             :          *    handle and scrub uses the commit root to search for block groups;
    3786             :          *
    3787             :          * 3) We had one system block group with enough free space when we called
    3788             :          *    check_system_chunk(), but after that, right before we tried to
    3789             :          *    allocate the last extent buffer we needed, a discard operation came
    3790             :          *    in and it temporarily removed the last free space entry from the
    3791             :          *    block group (discard removes a free space entry, discards it, and
    3792             :          *    then adds back the entry to the block group cache).
    3793             :          */
    3794           0 :         if (ret == -ENOSPC) {
    3795           0 :                 const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
    3796           0 :                 struct btrfs_block_group *sys_bg;
    3797             : 
    3798           0 :                 sys_bg = btrfs_create_chunk(trans, sys_flags);
    3799           0 :                 if (IS_ERR(sys_bg)) {
    3800           0 :                         ret = PTR_ERR(sys_bg);
    3801           0 :                         btrfs_abort_transaction(trans, ret);
    3802           0 :                         goto out;
    3803             :                 }
    3804             : 
    3805           0 :                 ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
    3806           0 :                 if (ret) {
    3807           0 :                         btrfs_abort_transaction(trans, ret);
    3808           0 :                         goto out;
    3809             :                 }
    3810             : 
    3811           0 :                 ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
    3812           0 :                 if (ret) {
    3813           0 :                         btrfs_abort_transaction(trans, ret);
    3814           0 :                         goto out;
    3815             :                 }
    3816           0 :         } else if (ret) {
    3817           0 :                 btrfs_abort_transaction(trans, ret);
    3818           0 :                 goto out;
    3819             :         }
    3820           0 : out:
    3821           0 :         btrfs_trans_release_chunk_metadata(trans);
    3822             : 
    3823           0 :         if (ret)
    3824           0 :                 return ERR_PTR(ret);
    3825             : 
    3826           0 :         btrfs_get_block_group(bg);
    3827           0 :         return bg;
    3828             : }
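/*
 * [Editor's illustrative sketch -- not part of block-group.c. It shows how a
 * caller is expected to treat do_chunk_alloc()'s return value: an ERR_PTR on
 * failure, otherwise a block group with an extra reference taken that the
 * caller must drop. The function name is hypothetical.]
 */
static int example_alloc_one_chunk(struct btrfs_trans_handle *trans, u64 flags)
{
	struct btrfs_block_group *bg = do_chunk_alloc(trans, flags);

	if (IS_ERR(bg))
		return PTR_ERR(bg);

	/* ... use the new block group here ... */

	/* Drop the reference taken by do_chunk_alloc() for us. */
	btrfs_put_block_group(bg);
	return 0;
}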
    3829             : 
    3830             : /*
    3831             :  * Chunk allocation is done in 2 phases:
    3832             :  *
    3833             :  * 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for
    3834             :  *    the chunk, the chunk mapping, create its block group and add the items
    3835             :  *    that belong in the chunk btree to it - more specifically, we need to
    3836             :  *    update device items in the chunk btree and add a new chunk item to it.
    3837             :  *
    3838             :  * 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block
    3839             :  *    group item to the extent btree and the device extent items to the devices
    3840             :  *    btree.
    3841             :  *
    3842             :  * This is done to prevent deadlocks. For example when COWing a node from the
    3843             :  * extent btree we are holding a write lock on the node's parent and if we
    3844             :  * trigger chunk allocation and attempt to insert the new block group item
    3845             :  * in the extent btree right away, we could deadlock because the path for the
    3846             :  * insertion can include that parent node. At first glance it seems impossible
    3847             :  * to trigger chunk allocation after starting a transaction since tasks should
    3848             :  * reserve enough transaction units (metadata space); however, while that is true
    3849             :  * most of the time, chunk allocation may still be triggered for several reasons:
    3850             :  *
    3851             :  * 1) When reserving metadata, we check if there is enough free space in the
    3852             :  *    metadata space_info and therefore don't trigger allocation of a new chunk.
    3853             :  *    However later when the task actually tries to COW an extent buffer from
    3854             :  *    the extent btree or from the device btree for example, it is forced to
    3855             :  *    allocate a new block group (chunk) because the only one that had enough
    3856             :  *    free space was just turned to RO mode by a running scrub for example (or
    3857             :  *    device replace, block group reclaim thread, etc), so we can not use it
    3858             :  *    for allocating an extent and end up being forced to allocate a new one;
    3859             :  *
    3860             :  * 2) Because we only check that the metadata space_info has enough free bytes,
    3861             :  *    we end up not allocating a new metadata chunk in that case. However if
    3862             :  *    the filesystem was mounted in degraded mode, none of the existing block
    3863             :  *    groups might be suitable for extent allocation due to their incompatible
    3864             :  *    profile (e.g. mounting a two-device filesystem, where all block groups
    3865             :  *    use a RAID1 profile, in degraded mode using a single device). In this case
    3866             :  *    when the task attempts to COW some extent buffer of the extent btree for
    3867             :  *    example, it will trigger allocation of a new metadata block group with a
    3868             :  *    suitable profile (SINGLE profile in the example of the degraded mount of
    3869             :  *    the RAID1 filesystem);
    3870             :  *
    3871             :  * 3) The task has reserved enough transaction units / metadata space, but when
    3872             :  *    it attempts to COW an extent buffer from the extent or device btree for
    3873             :  *    example, it does not find any free extent in any metadata block group,
    3874             :  *    and is therefore forced to try to allocate a new metadata block group.
    3875             :  *    This is because some other task allocated all available extents in the
    3876             :  *    meanwhile - this typically happens with tasks that don't reserve space
    3877             :  *    properly, either intentionally or as a bug. One example where this is
    3878             :  *    done intentionally is fsync, as it does not reserve any transaction units
    3879             :  *    and ends up allocating a variable number of metadata extents for log
    3880             :  *    tree extent buffers;
    3881             :  *
    3882             :  * 4) The task has reserved enough transaction units / metadata space, but right
    3883             :  *    before it tries to allocate the last extent buffer it needs, a discard
    3884             :  *    operation comes in and, temporarily, removes the last free space entry from
    3885             :  *    the only metadata block group that had free space (discard starts by
    3886             :  *    removing a free space entry from a block group, then does the discard
    3887             :  *    operation and, once it's done, it adds back the free space entry to the
    3888             :  *    block group).
    3889             :  *
    3890             :  * We also need this 2-phase setup when adding a device to a filesystem with
    3891             :  * a seed device - we must create new metadata and system chunks without adding
    3892             :  * any of the block group items to the chunk, extent and device btrees. If we
    3893             :  * did not do it this way, we would get ENOSPC when attempting to update those
    3894             :  * btrees, since all the chunks from the seed device are read-only.
    3895             :  *
    3896             :  * Phase 1 does the updates and insertions to the chunk btree because if we had
    3897             :  * it done in phase 2 and have a thundering herd of tasks allocating chunks in
    3898             :  * parallel, we risk having too many system chunks allocated by many tasks if
    3899             :  * many tasks reach phase 1 without the previous ones completing phase 2. In the
    3900             :  * extreme case this leads to exhaustion of the system chunk array in the
    3901             :  * superblock. This is easier to trigger if using a btree node/leaf size of 64K
    3902             :  * and with RAID filesystems (so we have more device items in the chunk btree).
    3903             :  * This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
    3904             :  * the system chunk array due to concurrent allocations") provides more details.
    3905             :  *
    3906             :  * Allocation of system chunks does not happen through this function. A task that
    3907             :  * needs to update the chunk btree (the only btree that uses system chunks), must
    3908             :  * preallocate chunk space by calling either check_system_chunk() or
    3909             :  * btrfs_reserve_chunk_metadata() - the former is used when allocating a data or
    3910             :  * metadata chunk or when removing a chunk, while the latter is used before doing
    3911             :  * a modification to the chunk btree - use cases for the latter are adding,
    3912             :  * removing and resizing a device as well as relocation of a system chunk.
    3913             :  * See the comment below for more details.
    3914             :  *
    3915             :  * The reservation of system space, done through check_system_chunk(), as well
    3916             :  * as all the updates and insertions into the chunk btree must be done while
    3917             :  * holding fs_info->chunk_mutex. This is important to guarantee that while COWing
    3918             :  * an extent buffer from the chunks btree we never trigger allocation of a new
    3919             :  * system chunk, which would result in a deadlock (trying to lock an extent
    3920             :  * buffer of the chunk btree twice, first time before triggering the chunk
    3921             :  * allocation and the second time during chunk allocation while attempting to
    3922             :  * update the chunks btree). The system chunk array is also updated while holding
    3923             :  * that mutex. The same logic applies to removing chunks - we must reserve system
    3924             :  * space, update the chunk btree and the system chunk array in the superblock
    3925             :  * while holding fs_info->chunk_mutex.
    3926             :  *
    3927             :  * This function, btrfs_chunk_alloc(), belongs to phase 1.
    3928             :  *
    3929             :  * If @force is CHUNK_ALLOC_FORCE:
    3930             :  *    - return 1 if it successfully allocates a chunk,
    3931             :  *    - return errors including -ENOSPC otherwise.
    3932             :  * If @force is NOT CHUNK_ALLOC_FORCE:
    3933             :  *    - return 0 if it doesn't need to allocate a new chunk,
    3934             :  *    - return 1 if it successfully allocates a chunk,
    3935             :  *    - return errors including -ENOSPC otherwise.
    3936             :  */
    3937           0 : int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
    3938             :                       enum btrfs_chunk_alloc_enum force)
    3939             : {
    3940           0 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    3941           0 :         struct btrfs_space_info *space_info;
    3942           0 :         struct btrfs_block_group *ret_bg;
    3943           0 :         bool wait_for_alloc = false;
    3944           0 :         bool should_alloc = false;
    3945           0 :         bool from_extent_allocation = false;
    3946           0 :         int ret = 0;
    3947             : 
    3948           0 :         if (force == CHUNK_ALLOC_FORCE_FOR_EXTENT) {
    3949           0 :                 from_extent_allocation = true;
    3950           0 :                 force = CHUNK_ALLOC_FORCE;
    3951             :         }
    3952             : 
    3953             :         /* Don't re-enter if we're already allocating a chunk */
    3954           0 :         if (trans->allocating_chunk)
    3955             :                 return -ENOSPC;
    3956             :         /*
    3957             :          * Allocation of system chunks can not happen through this path, as we
    3958             :          * could end up in a deadlock if we are allocating a data or metadata
    3959             :          * chunk and there is another task modifying the chunk btree.
    3960             :          *
    3961             :          * This is because while we are holding the chunk mutex, we will attempt
    3962             :          * to add the new chunk item to the chunk btree or update an existing
    3963             :          * device item in the chunk btree, while the other task that is modifying
    3964             :          * the chunk btree is attempting to COW an extent buffer while holding a
    3965             :          * lock on it and on its parent - if the COW operation triggers a system
    3966             :          * chunk allocation, then we can deadlock because we are holding the
    3967             :          * chunk mutex and we may need to access that extent buffer or its parent
    3968             :          * in order to add the chunk item or update a device item.
    3969             :          *
    3970             :          * Tasks that want to modify the chunk tree should reserve system space
    3971             :          * before updating the chunk btree, by calling either
    3972             :          * btrfs_reserve_chunk_metadata() or check_system_chunk().
    3973             :          * It's possible that after a task reserves the space, it still ends up
    3974             :          * here - this happens in the cases described above at do_chunk_alloc().
    3975             :          * The task will have to either retry or fail.
    3976             :          */
    3977           0 :         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
    3978             :                 return -ENOSPC;
    3979             : 
    3980           0 :         space_info = btrfs_find_space_info(fs_info, flags);
    3981           0 :         ASSERT(space_info);
    3982             : 
    3983           0 :         do {
    3984           0 :                 spin_lock(&space_info->lock);
    3985           0 :                 if (force < space_info->force_alloc)
    3986             :                         force = space_info->force_alloc;
    3987           0 :                 should_alloc = should_alloc_chunk(fs_info, space_info, force);
    3988           0 :                 if (space_info->full) {
    3989             :                         /* No more free physical space */
    3990           0 :                         if (should_alloc)
    3991             :                                 ret = -ENOSPC;
    3992             :                         else
    3993           0 :                                 ret = 0;
    3994           0 :                         spin_unlock(&space_info->lock);
    3995           0 :                         return ret;
    3996           0 :                 } else if (!should_alloc) {
    3997           0 :                         spin_unlock(&space_info->lock);
    3998           0 :                         return 0;
    3999           0 :                 } else if (space_info->chunk_alloc) {
    4000             :                         /*
    4001             :                          * Someone is already allocating, so we need to block
    4002             :                          * until this someone is finished and then loop to
    4003             :                          * recheck if we should continue with our allocation
    4004             :                          * attempt.
    4005             :                          */
    4006           0 :                         wait_for_alloc = true;
    4007           0 :                         force = CHUNK_ALLOC_NO_FORCE;
    4008           0 :                         spin_unlock(&space_info->lock);
    4009           0 :                         mutex_lock(&fs_info->chunk_mutex);
    4010           0 :                         mutex_unlock(&fs_info->chunk_mutex);
    4011             :                 } else {
    4012             :                         /* Proceed with allocation */
    4013           0 :                         space_info->chunk_alloc = 1;
    4014           0 :                         wait_for_alloc = false;
    4015           0 :                         spin_unlock(&space_info->lock);
    4016             :                 }
    4017             : 
    4018           0 :                 cond_resched();
    4019           0 :         } while (wait_for_alloc);
    4020             : 
    4021           0 :         mutex_lock(&fs_info->chunk_mutex);
    4022           0 :         trans->allocating_chunk = true;
    4023             : 
    4024             :         /*
    4025             :          * If we have mixed data/metadata chunks we want to make sure we keep
    4026             :          * allocating mixed chunks instead of individual chunks.
    4027             :          */
    4028           0 :         if (btrfs_mixed_space_info(space_info))
    4029           0 :                 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
    4030             : 
    4031             :         /*
    4032             :          * If we're doing a data chunk, go ahead and make sure that
    4033             :          * we keep a reasonable number of metadata chunks allocated in the
    4034             :          * FS as well.
    4035             :          */
    4036           0 :         if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
    4037           0 :                 fs_info->data_chunk_allocations++;
    4038           0 :                 if (!(fs_info->data_chunk_allocations %
    4039             :                       fs_info->metadata_ratio))
    4040           0 :                         force_metadata_allocation(fs_info);
    4041             :         }
    4042             : 
    4043           0 :         ret_bg = do_chunk_alloc(trans, flags);
    4044           0 :         trans->allocating_chunk = false;
    4045             : 
    4046           0 :         if (IS_ERR(ret_bg)) {
    4047           0 :                 ret = PTR_ERR(ret_bg);
    4048           0 :         } else if (from_extent_allocation) {
    4049             :                 /*
    4050             :                  * New block group is likely to be used soon. Try to activate
    4051             :                  * it now. Failure is OK for now.
    4052             :                  */
    4053           0 :                 btrfs_zone_activate(ret_bg);
    4054             :         }
    4055             : 
    4056           0 :         if (!ret)
    4057           0 :                 btrfs_put_block_group(ret_bg);
    4058             : 
    4059           0 :         spin_lock(&space_info->lock);
    4060           0 :         if (ret < 0) {
    4061           0 :                 if (ret == -ENOSPC)
    4062           0 :                         space_info->full = 1;
    4063             :                 else
    4064           0 :                         goto out;
    4065             :         } else {
    4066           0 :                 ret = 1;
    4067           0 :                 space_info->max_extent_size = 0;
    4068             :         }
    4069             : 
    4070           0 :         space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
    4071           0 : out:
    4072           0 :         space_info->chunk_alloc = 0;
    4073           0 :         spin_unlock(&space_info->lock);
    4074           0 :         mutex_unlock(&fs_info->chunk_mutex);
    4075             : 
    4076           0 :         return ret;
    4077             : }
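/*
 * [Editor's illustrative sketch -- not part of block-group.c. It demonstrates
 * the return convention documented above btrfs_chunk_alloc(): 0 means no new
 * chunk was needed, 1 means a chunk was allocated, negative values are
 * errors. The function name is hypothetical.]
 */
static int example_ensure_metadata_space(struct btrfs_trans_handle *trans)
{
	u64 flags = btrfs_get_alloc_profile(trans->fs_info,
					    BTRFS_BLOCK_GROUP_METADATA);
	int ret = btrfs_chunk_alloc(trans, flags, CHUNK_ALLOC_NO_FORCE);

	if (ret < 0)
		return ret;		/* includes -ENOSPC */

	/* ret == 0: enough free space already, ret == 1: new chunk created. */
	return 0;
}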
    4078             : 
    4079           0 : static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
    4080             : {
    4081           0 :         u64 num_dev;
    4082             : 
    4083           0 :         num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;
    4084           0 :         if (!num_dev)
    4085           0 :                 num_dev = fs_info->fs_devices->rw_devices;
    4086             : 
    4087           0 :         return num_dev;
    4088             : }
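/*
 * [Editor's note: illustrative values, assuming the usual btrfs_raid_array
 * limits. A RAID1 chunk touches at most two devices and DUP/SINGLE touch
 * one, while striped profiles such as RAID0 report no fixed maximum
 * (devs_max == 0) and therefore fall back to the number of writable devices.]
 */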
    4089             : 
    4090           0 : static void reserve_chunk_space(struct btrfs_trans_handle *trans,
    4091             :                                 u64 bytes,
    4092             :                                 u64 type)
    4093             : {
    4094           0 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    4095           0 :         struct btrfs_space_info *info;
    4096           0 :         u64 left;
    4097           0 :         int ret = 0;
    4098             : 
    4099             :         /*
    4100             :          * Needed because we can end up allocating a system chunk and need an
    4101             :          * atomic and race-free space reservation in the chunk block reserve.
    4102             :          */
    4103           0 :         lockdep_assert_held(&fs_info->chunk_mutex);
    4104             : 
    4105           0 :         info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
    4106           0 :         spin_lock(&info->lock);
    4107           0 :         left = info->total_bytes - btrfs_space_info_used(info, true);
    4108           0 :         spin_unlock(&info->lock);
    4109             : 
    4110           0 :         if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
    4111           0 :                 btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
    4112             :                            left, bytes, type);
    4113           0 :                 btrfs_dump_space_info(fs_info, info, 0, 0);
    4114             :         }
    4115             : 
    4116           0 :         if (left < bytes) {
    4117           0 :                 u64 flags = btrfs_system_alloc_profile(fs_info);
    4118           0 :                 struct btrfs_block_group *bg;
    4119             : 
    4120             :                 /*
    4121             :                  * Ignore failure to create system chunk. We might end up not
    4122             :                  * needing it, as we might not need to COW all nodes/leaves from
    4123             :                  * the paths we visit in the chunk tree (they were already COWed
    4124             :                  * or created in the current transaction for example).
    4125             :                  */
    4126           0 :                 bg = btrfs_create_chunk(trans, flags);
    4127           0 :                 if (IS_ERR(bg)) {
    4128           0 :                         ret = PTR_ERR(bg);
    4129             :                 } else {
    4130             :                         /*
    4131             :                          * We have a new chunk. We also need to activate it for
    4132             :                          * a zoned filesystem.
    4133             :                          */
    4134           0 :                         ret = btrfs_zoned_activate_one_bg(fs_info, info, true);
    4135           0 :                         if (ret < 0)
    4136             :                                 return;
    4137             : 
    4138             :                         /*
    4139             :                          * If we fail to add the chunk item here, we end up
    4140             :                          * trying again at phase 2 of chunk allocation, at
    4141             :                          * btrfs_create_pending_block_groups(). So ignore
    4142             :                          * any error here. An ENOSPC here could happen, due to
    4143             :                          * the cases described at do_chunk_alloc() - the system
    4144             :                          * block group we just created was just turned into RO
    4145             :                          * mode by a scrub for example, or a running discard
    4146             :                          * temporarily removed its free space entries, etc.
    4147             :                          */
    4148           0 :                         btrfs_chunk_alloc_add_chunk_item(trans, bg);
    4149             :                 }
    4150             :         }
    4151             : 
    4152           0 :         if (!ret) {
    4153           0 :                 ret = btrfs_block_rsv_add(fs_info,
    4154             :                                           &fs_info->chunk_block_rsv,
    4155             :                                           bytes, BTRFS_RESERVE_NO_FLUSH);
    4156           0 :                 if (!ret)
    4157           0 :                         trans->chunk_bytes_reserved += bytes;
    4158             :         }
    4159             : }
    4160             : 
    4161             : /*
    4162             :  * Reserve space in the system space for allocating or removing a chunk.
    4163             :  * The caller must be holding fs_info->chunk_mutex.
    4164             :  */
    4165           0 : void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
    4166             : {
    4167           0 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    4168           0 :         const u64 num_devs = get_profile_num_devs(fs_info, type);
    4169           0 :         u64 bytes;
    4170             : 
    4171             :         /* num_devs device items to update and 1 chunk item to add or remove. */
    4172           0 :         bytes = btrfs_calc_metadata_size(fs_info, num_devs) +
    4173             :                 btrfs_calc_insert_metadata_size(fs_info, 1);
    4174             : 
    4175           0 :         reserve_chunk_space(trans, bytes, type);
    4176           0 : }
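/*
 * [Editor's note: rough worked example for the reservation above, assuming
 * the common sizing helpers reserve about nodesize * BTRFS_MAX_LEVEL per item
 * to update and roughly twice that per item to insert. With a 16K nodesize
 * and a RAID1 chunk (num_devs == 2) that is ~256K for the device item updates
 * plus ~256K for the new chunk item, i.e. about 512K put into the chunk block
 * reserve. Numbers are illustrative, not exact.]
 */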
    4177             : 
    4178             : /*
    4179             :  * Reserve space in the system space, if needed, for doing a modification to the
    4180             :  * chunk btree.
    4181             :  *
    4182             :  * @trans:              A transaction handle.
    4183             :  * @is_item_insertion:  Indicate if the modification is for inserting a new item
    4184             :  *                      in the chunk btree or if it's for the deletion or update
    4185             :  *                      of an existing item.
    4186             :  *
    4187             :  * This is used in a context where we need to update the chunk btree outside
    4188             :  * block group allocation and removal, to avoid a deadlock with a concurrent
    4189             :  * task that is allocating a metadata or data block group and therefore needs to
    4190             :  * update the chunk btree while holding the chunk mutex. After the update to the
    4191             :  * chunk btree is done, btrfs_trans_release_chunk_metadata() should be called.
    4192             :  *
    4193             :  */
    4194           0 : void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
    4195             :                                   bool is_item_insertion)
    4196             : {
    4197           0 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    4198           0 :         u64 bytes;
    4199             : 
    4200           0 :         if (is_item_insertion)
    4201           0 :                 bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
    4202             :         else
    4203           0 :                 bytes = btrfs_calc_metadata_size(fs_info, 1);
    4204             : 
    4205           0 :         mutex_lock(&fs_info->chunk_mutex);
    4206           0 :         reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM);
    4207           0 :         mutex_unlock(&fs_info->chunk_mutex);
    4208           0 : }
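/*
 * [Editor's illustrative sketch -- not part of block-group.c. It shows the
 * pattern the comment above describes: reserve system space, modify the chunk
 * btree, then release the reservation. The function name and the elided
 * modification are hypothetical.]
 */
static void example_modify_chunk_btree(struct btrfs_trans_handle *trans)
{
	/* Reserve system space for updating an existing chunk btree item. */
	btrfs_reserve_chunk_metadata(trans, false);

	/* ... update the chunk btree item here ... */

	/* Release whatever is still reserved for chunk btree modifications. */
	btrfs_trans_release_chunk_metadata(trans);
}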
    4209             : 
    4210           0 : void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
    4211             : {
    4212           0 :         struct btrfs_block_group *block_group;
    4213             : 
    4214           0 :         block_group = btrfs_lookup_first_block_group(info, 0);
    4215           0 :         while (block_group) {
    4216           0 :                 btrfs_wait_block_group_cache_done(block_group);
    4217           0 :                 spin_lock(&block_group->lock);
    4218           0 :                 if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF,
    4219           0 :                                        &block_group->runtime_flags)) {
    4220           0 :                         struct inode *inode = block_group->inode;
    4221             : 
    4222           0 :                         block_group->inode = NULL;
    4223           0 :                         spin_unlock(&block_group->lock);
    4224             : 
    4225           0 :                         ASSERT(block_group->io_ctl.inode == NULL);
    4226           0 :                         iput(inode);
    4227             :                 } else {
    4228           0 :                         spin_unlock(&block_group->lock);
    4229             :                 }
    4230           0 :                 block_group = btrfs_next_block_group(block_group);
    4231             :         }
    4232           0 : }
    4233             : 
    4234             : /*
    4235             :  * Must be called only after stopping all workers, since we could have block
    4236             :  * group caching kthreads running, and therefore they could race with us if we
    4237             :  * freed the block groups before stopping them.
    4238             :  */
    4239           0 : int btrfs_free_block_groups(struct btrfs_fs_info *info)
    4240             : {
    4241           0 :         struct btrfs_block_group *block_group;
    4242           0 :         struct btrfs_space_info *space_info;
    4243           0 :         struct btrfs_caching_control *caching_ctl;
    4244           0 :         struct rb_node *n;
    4245             : 
    4246           0 :         write_lock(&info->block_group_cache_lock);
    4247           0 :         while (!list_empty(&info->caching_block_groups)) {
    4248           0 :                 caching_ctl = list_entry(info->caching_block_groups.next,
    4249             :                                          struct btrfs_caching_control, list);
    4250           0 :                 list_del(&caching_ctl->list);
    4251           0 :                 btrfs_put_caching_control(caching_ctl);
    4252             :         }
    4253           0 :         write_unlock(&info->block_group_cache_lock);
    4254             : 
    4255           0 :         spin_lock(&info->unused_bgs_lock);
    4256           0 :         while (!list_empty(&info->unused_bgs)) {
    4257           0 :                 block_group = list_first_entry(&info->unused_bgs,
    4258             :                                                struct btrfs_block_group,
    4259             :                                                bg_list);
    4260           0 :                 list_del_init(&block_group->bg_list);
    4261           0 :                 btrfs_put_block_group(block_group);
    4262             :         }
    4263             : 
    4264           0 :         while (!list_empty(&info->reclaim_bgs)) {
    4265           0 :                 block_group = list_first_entry(&info->reclaim_bgs,
    4266             :                                                struct btrfs_block_group,
    4267             :                                                bg_list);
    4268           0 :                 list_del_init(&block_group->bg_list);
    4269           0 :                 btrfs_put_block_group(block_group);
    4270             :         }
    4271           0 :         spin_unlock(&info->unused_bgs_lock);
    4272             : 
    4273           0 :         spin_lock(&info->zone_active_bgs_lock);
    4274           0 :         while (!list_empty(&info->zone_active_bgs)) {
    4275           0 :                 block_group = list_first_entry(&info->zone_active_bgs,
    4276             :                                                struct btrfs_block_group,
    4277             :                                                active_bg_list);
    4278           0 :                 list_del_init(&block_group->active_bg_list);
    4279           0 :                 btrfs_put_block_group(block_group);
    4280             :         }
    4281           0 :         spin_unlock(&info->zone_active_bgs_lock);
    4282             : 
    4283           0 :         write_lock(&info->block_group_cache_lock);
    4284           0 :         while ((n = rb_last(&info->block_group_cache_tree.rb_root)) != NULL) {
    4285           0 :                 block_group = rb_entry(n, struct btrfs_block_group,
    4286             :                                        cache_node);
    4287           0 :                 rb_erase_cached(&block_group->cache_node,
    4288             :                                 &info->block_group_cache_tree);
    4289           0 :                 RB_CLEAR_NODE(&block_group->cache_node);
    4290           0 :                 write_unlock(&info->block_group_cache_lock);
    4291             : 
    4292           0 :                 down_write(&block_group->space_info->groups_sem);
    4293           0 :                 list_del(&block_group->list);
    4294           0 :                 up_write(&block_group->space_info->groups_sem);
    4295             : 
    4296             :                 /*
    4297             :                  * We haven't cached this block group, which means we could
    4298             :                  * possibly have excluded extents on this block group.
    4299             :                  */
    4300           0 :                 if (block_group->cached == BTRFS_CACHE_NO ||
    4301             :                     block_group->cached == BTRFS_CACHE_ERROR)
    4302           0 :                         btrfs_free_excluded_extents(block_group);
    4303             : 
    4304           0 :                 btrfs_remove_free_space_cache(block_group);
    4305           0 :                 ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
    4306           0 :                 ASSERT(list_empty(&block_group->dirty_list));
    4307           0 :                 ASSERT(list_empty(&block_group->io_list));
    4308           0 :                 ASSERT(list_empty(&block_group->bg_list));
    4309           0 :                 ASSERT(refcount_read(&block_group->refs) == 1);
    4310           0 :                 ASSERT(block_group->swap_extents == 0);
    4311           0 :                 btrfs_put_block_group(block_group);
    4312             : 
    4313           0 :                 write_lock(&info->block_group_cache_lock);
    4314             :         }
    4315           0 :         write_unlock(&info->block_group_cache_lock);
    4316             : 
    4317           0 :         btrfs_release_global_block_rsv(info);
    4318             : 
    4319           0 :         while (!list_empty(&info->space_info)) {
    4320           0 :                 space_info = list_entry(info->space_info.next,
    4321             :                                         struct btrfs_space_info,
    4322             :                                         list);
    4323             : 
    4324             :                 /*
    4325             :                  * Do not hide this behind enospc_debug, this is actually
    4326             :                  * important and indicates a real bug if this happens.
    4327             :                  */
    4328           0 :                 if (WARN_ON(space_info->bytes_pinned > 0 ||
    4329             :                             space_info->bytes_may_use > 0))
    4330           0 :                         btrfs_dump_space_info(info, space_info, 0, 0);
    4331             : 
    4332             :                 /*
    4333             :                  * If there was a failure to clean up a log tree, very likely due
    4334             :                  * to an IO failure on a writeback attempt of one or more of its
    4335             :                  * extent buffers, we could not do proper (and cheap) unaccounting
    4336             :                  * of their reserved space, so don't warn on bytes_reserved > 0 in
    4337             :                  * that case.
    4338             :                  */
    4339           0 :                 if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) ||
    4340           0 :                     !BTRFS_FS_LOG_CLEANUP_ERROR(info)) {
    4341           0 :                         if (WARN_ON(space_info->bytes_reserved > 0))
    4342           0 :                                 btrfs_dump_space_info(info, space_info, 0, 0);
    4343             :                 }
    4344             : 
    4345           0 :                 WARN_ON(space_info->reclaim_size > 0);
    4346           0 :                 list_del(&space_info->list);
    4347           0 :                 btrfs_sysfs_remove_space_info(space_info);
    4348             :         }
    4349           0 :         return 0;
    4350             : }
    4351             : 
    4352           0 : void btrfs_freeze_block_group(struct btrfs_block_group *cache)
    4353             : {
    4354           0 :         atomic_inc(&cache->frozen);
    4355           0 : }
    4356             : 
    4357           0 : void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
    4358             : {
    4359           0 :         struct btrfs_fs_info *fs_info = block_group->fs_info;
    4360           0 :         struct extent_map_tree *em_tree;
    4361           0 :         struct extent_map *em;
    4362           0 :         bool cleanup;
    4363             : 
    4364           0 :         spin_lock(&block_group->lock);
    4365           0 :         cleanup = (atomic_dec_and_test(&block_group->frozen) &&
    4366           0 :                    test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags));
    4367           0 :         spin_unlock(&block_group->lock);
    4368             : 
    4369           0 :         if (cleanup) {
    4370           0 :                 em_tree = &fs_info->mapping_tree;
    4371           0 :                 write_lock(&em_tree->lock);
    4372           0 :                 em = lookup_extent_mapping(em_tree, block_group->start,
    4373             :                                            1);
    4374           0 :                 BUG_ON(!em); /* logic error, can't happen */
    4375           0 :                 remove_extent_mapping(em_tree, em);
    4376           0 :                 write_unlock(&em_tree->lock);
    4377             : 
    4378             :                 /* once for us and once for the tree */
    4379           0 :                 free_extent_map(em);
    4380           0 :                 free_extent_map(em);
    4381             : 
    4382             :                 /*
    4383             :                  * We may have left one free space entry, and other tasks
    4384             :                  * trimming this block group may each have left one entry.
    4385             :                  * Free them if any.
    4386             :                  */
    4387           0 :                 btrfs_remove_free_space_cache(block_group);
    4388             :         }
    4389           0 : }
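/*
 * [Editor's illustrative sketch -- not part of block-group.c. It shows the
 * freeze/unfreeze pairing: a long-running operation (such as trimming, which
 * the comment above mentions) freezes the block group so its chunk mapping
 * survives a concurrent removal, and the last unfreeze after removal does the
 * cleanup. The function name is hypothetical.]
 */
static void example_long_running_bg_operation(struct btrfs_block_group *bg)
{
	btrfs_freeze_block_group(bg);

	/* ... operate on the block group, e.g. trim its free space ... */

	btrfs_unfreeze_block_group(bg);
}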
    4390             : 
    4391           0 : bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg)
    4392             : {
    4393           0 :         bool ret = true;
    4394             : 
    4395           0 :         spin_lock(&bg->lock);
    4396           0 :         if (bg->ro)
    4397             :                 ret = false;
    4398             :         else
    4399           0 :                 bg->swap_extents++;
    4400           0 :         spin_unlock(&bg->lock);
    4401             : 
    4402           0 :         return ret;
    4403             : }
    4404             : 
    4405           0 : void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount)
    4406             : {
    4407           0 :         spin_lock(&bg->lock);
    4408           0 :         ASSERT(!bg->ro);
    4409           0 :         ASSERT(bg->swap_extents >= amount);
    4410           0 :         bg->swap_extents -= amount;
    4411           0 :         spin_unlock(&bg->lock);
    4412           0 : }
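/*
 * [Editor's illustrative sketch -- not part of block-group.c. It pairs the
 * swap extent accounting helpers: the increment refuses read-only block
 * groups, and every successful increment needs a matching decrement once the
 * swapfile extent is released. The function name is hypothetical.]
 */
static bool example_account_swapfile_extent(struct btrfs_block_group *bg)
{
	if (!btrfs_inc_block_group_swap_extents(bg))
		return false;	/* block group is read-only, cannot be used */

	/* ... the extent is now pinned for swapfile use ... */

	btrfs_dec_block_group_swap_extents(bg, 1);
	return true;
}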
    4413             : 
    4414           0 : enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size)
    4415             : {
    4416           0 :         if (size <= SZ_128K)
    4417             :                 return BTRFS_BG_SZ_SMALL;
    4418           0 :         if (size <= SZ_8M)
    4419           0 :                 return BTRFS_BG_SZ_MEDIUM;
    4420             :         return BTRFS_BG_SZ_LARGE;
    4421             : }
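/*
 * [Editor's note: illustrative mapping of the thresholds above. A 64K
 * allocation falls into BTRFS_BG_SZ_SMALL, a 1M allocation into
 * BTRFS_BG_SZ_MEDIUM, and anything above 8M (say a 16M extent) into
 * BTRFS_BG_SZ_LARGE.]
 */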
    4422             : 
    4423             : /*
    4424             :  * Handle a block group allocating an extent in a size class
    4425             :  *
    4426             :  * @bg:                         The block group we allocated in.
    4427             :  * @size_class:                 The size class of the allocation.
    4428             :  * @force_wrong_size_class:     Whether we are desperate enough to allow
    4429             :  *                              mismatched size classes.
    4430             :  *
    4431             :  * Returns: 0 if the size class was valid for this block_group, -EAGAIN in the
    4432             :  * case of a race that leads to the wrong size class without
    4433             :  * force_wrong_size_class set.
    4434             :  *
    4435             :  * find_free_extent will skip block groups with a mismatched size class until
    4436             :  * it really needs to avoid ENOSPC. In that case it will set
    4437             :  * force_wrong_size_class. However, if a block group is newly allocated and
    4438             :  * doesn't yet have a size class, then it is possible for two allocations of
    4439             :  * different sizes to race and both try to use it. The loser is caught here and
    4440             :  * has to retry.
    4441             :  */
    4442           0 : int btrfs_use_block_group_size_class(struct btrfs_block_group *bg,
    4443             :                                      enum btrfs_block_group_size_class size_class,
    4444             :                                      bool force_wrong_size_class)
    4445             : {
    4446           0 :         ASSERT(size_class != BTRFS_BG_SZ_NONE);
    4447             : 
    4448             :         /* The new allocation is in the right size class, do nothing */
    4449           0 :         if (bg->size_class == size_class)
    4450             :                 return 0;
    4451             :         /*
    4452             :          * The new allocation is in a mismatched size class.
    4453             :          * This means one of two things:
    4454             :          *
    4455             :          * 1. Two tasks in find_free_extent for different size_classes raced
    4456             :          *    and hit the same empty block_group. Make the loser try again.
    4457             :          * 2. A call to find_free_extent got desperate enough to set
    4458             :          *    'force_wrong_size_class'. Don't change the size_class, but allow the
    4459             :          *    allocation.
    4460             :          */
    4461           0 :         if (bg->size_class != BTRFS_BG_SZ_NONE) {
    4462           0 :                 if (force_wrong_size_class)
    4463             :                         return 0;
    4464           0 :                 return -EAGAIN;
    4465             :         }
    4466             :         /*
    4467             :          * The happy new block group case: the new allocation is the first
    4468             :          * one in the block_group so we set size_class.
    4469             :          */
    4470           0 :         bg->size_class = size_class;
    4471             : 
    4472           0 :         return 0;
    4473             : }
    4474             : 
    4475           0 : bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg)
    4476             : {
    4477           0 :         if (btrfs_is_zoned(bg->fs_info))
    4478             :                 return false;
    4479           0 :         if (!btrfs_is_block_group_data_only(bg))
    4480           0 :                 return false;
    4481             :         return true;
    4482             : }

Generated by: LCOV version 1.14