LCOV - code coverage report
Current view:  top level - fs/btrfs - block-group.c (source / functions)
Test:          fstests of 6.5.0-rc4-xfsa @ Mon Jul 31 20:08:27 PDT 2023
Date:          2023-07-31 20:08:27
               Hit    Total   Coverage
Lines:         0      2125    0.0 %
Functions:     0      83      0.0 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0
       2             : 
       3             : #include <linux/sizes.h>
       4             : #include <linux/list_sort.h>
       5             : #include "misc.h"
       6             : #include "ctree.h"
       7             : #include "block-group.h"
       8             : #include "space-info.h"
       9             : #include "disk-io.h"
      10             : #include "free-space-cache.h"
      11             : #include "free-space-tree.h"
      12             : #include "volumes.h"
      13             : #include "transaction.h"
      14             : #include "ref-verify.h"
      15             : #include "sysfs.h"
      16             : #include "tree-log.h"
      17             : #include "delalloc-space.h"
      18             : #include "discard.h"
      19             : #include "raid56.h"
      20             : #include "zoned.h"
      21             : #include "fs.h"
      22             : #include "accessors.h"
      23             : #include "extent-tree.h"
      24             : 
      25             : #ifdef CONFIG_BTRFS_DEBUG
      26             : int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group)
      27             : {
      28             :         struct btrfs_fs_info *fs_info = block_group->fs_info;
      29             : 
      30             :         return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) &&
      31             :                 block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
      32             :                (btrfs_test_opt(fs_info, FRAGMENT_DATA) &&
       33             :                 block_group->flags & BTRFS_BLOCK_GROUP_DATA);
      34             : }
      35             : #endif
      36             : 
      37             : /*
      38             :  * Return target flags in extended format or 0 if restripe for this chunk_type
      39             :  * is not in progress
      40             :  *
      41             :  * Should be called with balance_lock held
      42             :  */
      43           0 : static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
      44             : {
      45           0 :         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
      46           0 :         u64 target = 0;
      47             : 
      48           0 :         if (!bctl)
      49             :                 return 0;
      50             : 
      51           0 :         if (flags & BTRFS_BLOCK_GROUP_DATA &&
      52           0 :             bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
      53           0 :                 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
      54           0 :         } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
      55           0 :                    bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
      56           0 :                 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
      57           0 :         } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
      58           0 :                    bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
      59           0 :                 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
      60             :         }
      61             : 
      62             :         return target;
      63             : }
      64             : 
      65             : /*
      66             :  * @flags: available profiles in extended format (see ctree.h)
      67             :  *
       68             :  * Return the reduced profile in chunk format.  If a profile change is in
       69             :  * progress (either running or paused), pick the target profile (if it's
       70             :  * already available); otherwise fall back to plain reducing.
      71             :  */
      72           0 : static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
      73             : {
      74           0 :         u64 num_devices = fs_info->fs_devices->rw_devices;
      75           0 :         u64 target;
      76           0 :         u64 raid_type;
      77           0 :         u64 allowed = 0;
      78             : 
      79             :         /*
       80             :          * See if a restripe for this chunk_type is in progress; if so, try
       81             :          * to reduce to the target profile.
      82             :          */
      83           0 :         spin_lock(&fs_info->balance_lock);
      84           0 :         target = get_restripe_target(fs_info, flags);
      85           0 :         if (target) {
      86           0 :                 spin_unlock(&fs_info->balance_lock);
      87           0 :                 return extended_to_chunk(target);
      88             :         }
      89           0 :         spin_unlock(&fs_info->balance_lock);
      90             : 
      91             :         /* First, mask out the RAID levels which aren't possible */
      92           0 :         for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
      93           0 :                 if (num_devices >= btrfs_raid_array[raid_type].devs_min)
      94           0 :                         allowed |= btrfs_raid_array[raid_type].bg_flag;
      95             :         }
      96           0 :         allowed &= flags;
      97             : 
      98             :         /* Select the highest-redundancy RAID level. */
      99           0 :         if (allowed & BTRFS_BLOCK_GROUP_RAID1C4)
     100             :                 allowed = BTRFS_BLOCK_GROUP_RAID1C4;
     101           0 :         else if (allowed & BTRFS_BLOCK_GROUP_RAID6)
     102             :                 allowed = BTRFS_BLOCK_GROUP_RAID6;
     103           0 :         else if (allowed & BTRFS_BLOCK_GROUP_RAID1C3)
     104             :                 allowed = BTRFS_BLOCK_GROUP_RAID1C3;
     105           0 :         else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
     106             :                 allowed = BTRFS_BLOCK_GROUP_RAID5;
     107           0 :         else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
     108             :                 allowed = BTRFS_BLOCK_GROUP_RAID10;
     109           0 :         else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
     110             :                 allowed = BTRFS_BLOCK_GROUP_RAID1;
     111           0 :         else if (allowed & BTRFS_BLOCK_GROUP_DUP)
     112             :                 allowed = BTRFS_BLOCK_GROUP_DUP;
     113           0 :         else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
     114           0 :                 allowed = BTRFS_BLOCK_GROUP_RAID0;
     115             : 
     116           0 :         flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
     117             : 
     118           0 :         return extended_to_chunk(flags | allowed);
     119             : }
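/*
 * Editor's illustrative sketch, not part of the original file: a worked
 * example of the reduction above.  Assuming two rw devices and no restripe in
 * progress, both RAID1 (devs_min == 2) and RAID0 survive the devs_min mask,
 * and the if/else ladder keeps only the most redundant survivor, so an
 * extended DATA|RAID1|RAID0 input reduces to DATA|RAID1 in chunk format.
 */
static u64 __maybe_unused example_reduce_profile(struct btrfs_fs_info *fs_info)
{
        const u64 flags = BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_RAID1 |
                          BTRFS_BLOCK_GROUP_RAID0;

        /* Expected: BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_RAID1. */
        return btrfs_reduce_alloc_profile(fs_info, flags);
}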
     120             : 
     121           0 : u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
     122             : {
     123           0 :         unsigned seq;
     124           0 :         u64 flags;
     125             : 
     126           0 :         do {
     127           0 :                 flags = orig_flags;
     128           0 :                 seq = read_seqbegin(&fs_info->profiles_lock);
     129             : 
     130           0 :                 if (flags & BTRFS_BLOCK_GROUP_DATA)
     131           0 :                         flags |= fs_info->avail_data_alloc_bits;
     132           0 :                 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
     133           0 :                         flags |= fs_info->avail_system_alloc_bits;
     134           0 :                 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
     135           0 :                         flags |= fs_info->avail_metadata_alloc_bits;
     136           0 :         } while (read_seqretry(&fs_info->profiles_lock, seq));
     137             : 
     138           0 :         return btrfs_reduce_alloc_profile(fs_info, flags);
     139             : }
     140             : 
     141           0 : void btrfs_get_block_group(struct btrfs_block_group *cache)
     142             : {
     143           0 :         refcount_inc(&cache->refs);
     144           0 : }
     145             : 
     146           0 : void btrfs_put_block_group(struct btrfs_block_group *cache)
     147             : {
     148           0 :         if (refcount_dec_and_test(&cache->refs)) {
     149           0 :                 WARN_ON(cache->pinned > 0);
     150             :                 /*
     151             :                  * If there was a failure to cleanup a log tree, very likely due
     152             :                  * to an IO failure on a writeback attempt of one or more of its
     153             :                  * extent buffers, we could not do proper (and cheap) unaccounting
     154             :                  * of their reserved space, so don't warn on reserved > 0 in that
     155             :                  * case.
     156             :                  */
     157           0 :                 if (!(cache->flags & BTRFS_BLOCK_GROUP_METADATA) ||
     158           0 :                     !BTRFS_FS_LOG_CLEANUP_ERROR(cache->fs_info))
     159           0 :                         WARN_ON(cache->reserved > 0);
     160             : 
     161             :                 /*
     162             :                  * A block_group shouldn't be on the discard_list anymore.
     163             :                  * Remove the block_group from the discard_list to prevent us
     164             :                  * from causing a panic due to NULL pointer dereference.
     165             :                  */
     166           0 :                 if (WARN_ON(!list_empty(&cache->discard_list)))
     167           0 :                         btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
     168             :                                                   cache);
     169             : 
     170           0 :                 kfree(cache->free_space_ctl);
     171           0 :                 kfree(cache->physical_map);
     172           0 :                 kfree(cache);
     173             :         }
     174           0 : }
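/*
 * Editor's illustrative sketch, not in the original file: every block group
 * lookup in this file returns a referenced object, so a typical caller pairs
 * the lookup with btrfs_put_block_group() once it is done with the group.
 */
static void __maybe_unused example_ref_pairing(struct btrfs_fs_info *fs_info,
                                               u64 bytenr)
{
        struct btrfs_block_group *bg;

        bg = btrfs_lookup_block_group(fs_info, bytenr);
        if (!bg)
                return;
        /* ... use bg while holding the reference ... */
        btrfs_put_block_group(bg);
}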
     175             : 
     176             : /*
     177             :  * This adds the block group to the fs_info rb tree for the block group cache
     178             :  */
     179           0 : static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
     180             :                                        struct btrfs_block_group *block_group)
     181             : {
     182           0 :         struct rb_node **p;
     183           0 :         struct rb_node *parent = NULL;
     184           0 :         struct btrfs_block_group *cache;
     185           0 :         bool leftmost = true;
     186             : 
     187           0 :         ASSERT(block_group->length != 0);
     188             : 
     189           0 :         write_lock(&info->block_group_cache_lock);
     190           0 :         p = &info->block_group_cache_tree.rb_root.rb_node;
     191             : 
     192           0 :         while (*p) {
     193           0 :                 parent = *p;
     194           0 :                 cache = rb_entry(parent, struct btrfs_block_group, cache_node);
     195           0 :                 if (block_group->start < cache->start) {
     196           0 :                         p = &(*p)->rb_left;
     197           0 :                 } else if (block_group->start > cache->start) {
     198           0 :                         p = &(*p)->rb_right;
     199           0 :                         leftmost = false;
     200             :                 } else {
     201           0 :                         write_unlock(&info->block_group_cache_lock);
     202           0 :                         return -EEXIST;
     203             :                 }
     204             :         }
     205             : 
     206           0 :         rb_link_node(&block_group->cache_node, parent, p);
     207           0 :         rb_insert_color_cached(&block_group->cache_node,
     208             :                                &info->block_group_cache_tree, leftmost);
     209             : 
     210           0 :         write_unlock(&info->block_group_cache_lock);
     211             : 
     212           0 :         return 0;
     213             : }
     214             : 
     215             : /*
     216             :  * This will return the block group at or after bytenr if contains is 0, else
     217             :  * it will return the block group that contains the bytenr
     218             :  */
     219           0 : static struct btrfs_block_group *block_group_cache_tree_search(
     220             :                 struct btrfs_fs_info *info, u64 bytenr, int contains)
     221             : {
     222           0 :         struct btrfs_block_group *cache, *ret = NULL;
     223           0 :         struct rb_node *n;
     224           0 :         u64 end, start;
     225             : 
     226           0 :         read_lock(&info->block_group_cache_lock);
     227           0 :         n = info->block_group_cache_tree.rb_root.rb_node;
     228             : 
     229           0 :         while (n) {
     230           0 :                 cache = rb_entry(n, struct btrfs_block_group, cache_node);
     231           0 :                 end = cache->start + cache->length - 1;
     232           0 :                 start = cache->start;
     233             : 
     234           0 :                 if (bytenr < start) {
     235           0 :                         if (!contains && (!ret || start < ret->start))
     236           0 :                                 ret = cache;
     237           0 :                         n = n->rb_left;
     238           0 :                 } else if (bytenr > start) {
     239           0 :                         if (contains && bytenr <= end) {
     240             :                                 ret = cache;
     241             :                                 break;
     242             :                         }
     243           0 :                         n = n->rb_right;
     244             :                 } else {
     245             :                         ret = cache;
     246             :                         break;
     247             :                 }
     248             :         }
     249           0 :         if (ret)
     250           0 :                 btrfs_get_block_group(ret);
     251           0 :         read_unlock(&info->block_group_cache_lock);
     252             : 
     253           0 :         return ret;
     254             : }
     255             : 
     256             : /*
     257             :  * Return the block group that starts at or after bytenr
     258             :  */
     259           0 : struct btrfs_block_group *btrfs_lookup_first_block_group(
     260             :                 struct btrfs_fs_info *info, u64 bytenr)
     261             : {
     262           0 :         return block_group_cache_tree_search(info, bytenr, 0);
     263             : }
     264             : 
     265             : /*
     266             :  * Return the block group that contains the given bytenr
     267             :  */
     268           0 : struct btrfs_block_group *btrfs_lookup_block_group(
     269             :                 struct btrfs_fs_info *info, u64 bytenr)
     270             : {
     271           0 :         return block_group_cache_tree_search(info, bytenr, 1);
     272             : }
     273             : 
     274           0 : struct btrfs_block_group *btrfs_next_block_group(
     275             :                 struct btrfs_block_group *cache)
     276             : {
     277           0 :         struct btrfs_fs_info *fs_info = cache->fs_info;
     278           0 :         struct rb_node *node;
     279             : 
     280           0 :         read_lock(&fs_info->block_group_cache_lock);
     281             : 
     282             :         /* If our block group was removed, we need a full search. */
     283           0 :         if (RB_EMPTY_NODE(&cache->cache_node)) {
     284           0 :                 const u64 next_bytenr = cache->start + cache->length;
     285             : 
     286           0 :                 read_unlock(&fs_info->block_group_cache_lock);
     287           0 :                 btrfs_put_block_group(cache);
     288           0 :                 return btrfs_lookup_first_block_group(fs_info, next_bytenr);
     289             :         }
     290           0 :         node = rb_next(&cache->cache_node);
     291           0 :         btrfs_put_block_group(cache);
     292           0 :         if (node) {
     293           0 :                 cache = rb_entry(node, struct btrfs_block_group, cache_node);
     294           0 :                 btrfs_get_block_group(cache);
     295             :         } else
     296             :                 cache = NULL;
     297           0 :         read_unlock(&fs_info->block_group_cache_lock);
     298           0 :         return cache;
     299             : }
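/*
 * Editor's illustrative sketch, not in the original file: because
 * btrfs_next_block_group() drops the reference on the group it was given and
 * returns the next group already referenced, iterating all block groups needs
 * no explicit put except on an early break.
 */
static void __maybe_unused example_iterate_block_groups(struct btrfs_fs_info *fs_info)
{
        struct btrfs_block_group *bg;

        for (bg = btrfs_lookup_first_block_group(fs_info, 0); bg;
             bg = btrfs_next_block_group(bg)) {
                /* Exactly one reference is held on bg in each iteration. */
        }
}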
     300             : 
     301             : /*
     302             :  * Check if we can do a NOCOW write for a given extent.
     303             :  *
     304             :  * @fs_info:       The filesystem information object.
     305             :  * @bytenr:        Logical start address of the extent.
     306             :  *
      307             :  * Check if we can do a NOCOW write for the given extent and, if so, increment
      308             :  * the number of NOCOW writers in the block group that contains the extent, as
      309             :  * long as the block group exists and is currently not in read-only mode.
      310             :  *
      311             :  * Returns: A non-NULL block group pointer if we can do a NOCOW write; the caller
      312             :  *          is responsible for calling btrfs_dec_nocow_writers() later.
      313             :  *
      314             :  *          Or NULL if we cannot do a NOCOW write.
     315             :  */
     316           0 : struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
     317             :                                                   u64 bytenr)
     318             : {
     319           0 :         struct btrfs_block_group *bg;
     320           0 :         bool can_nocow = true;
     321             : 
     322           0 :         bg = btrfs_lookup_block_group(fs_info, bytenr);
     323           0 :         if (!bg)
     324             :                 return NULL;
     325             : 
     326           0 :         spin_lock(&bg->lock);
     327           0 :         if (bg->ro)
     328             :                 can_nocow = false;
     329             :         else
     330           0 :                 atomic_inc(&bg->nocow_writers);
     331           0 :         spin_unlock(&bg->lock);
     332             : 
     333           0 :         if (!can_nocow) {
     334           0 :                 btrfs_put_block_group(bg);
     335           0 :                 return NULL;
     336             :         }
     337             : 
     338             :         /* No put on block group, done by btrfs_dec_nocow_writers(). */
     339             :         return bg;
     340             : }
     341             : 
     342             : /*
     343             :  * Decrement the number of NOCOW writers in a block group.
     344             :  *
     345             :  * This is meant to be called after a previous call to btrfs_inc_nocow_writers(),
     346             :  * and on the block group returned by that call. Typically this is called after
     347             :  * creating an ordered extent for a NOCOW write, to prevent races with scrub and
     348             :  * relocation.
     349             :  *
      350             :  * After this call, the caller should not use the block group anymore. If it wants
     351             :  * to use it, then it should get a reference on it before calling this function.
     352             :  */
     353           0 : void btrfs_dec_nocow_writers(struct btrfs_block_group *bg)
     354             : {
     355           0 :         if (atomic_dec_and_test(&bg->nocow_writers))
     356           0 :                 wake_up_var(&bg->nocow_writers);
     357             : 
     358             :         /* For the lookup done by a previous call to btrfs_inc_nocow_writers(). */
     359           0 :         btrfs_put_block_group(bg);
     360           0 : }
     361             : 
     362           0 : void btrfs_wait_nocow_writers(struct btrfs_block_group *bg)
     363             : {
     364           0 :         wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
     365           0 : }
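/*
 * Editor's illustrative sketch, not in the original file: the intended
 * inc/dec pairing for NOCOW writes described in the comments above.
 */
static bool __maybe_unused example_nocow_write(struct btrfs_fs_info *fs_info,
                                               u64 bytenr)
{
        struct btrfs_block_group *bg;

        bg = btrfs_inc_nocow_writers(fs_info, bytenr);
        if (!bg)
                return false;   /* Block group missing or read-only: must COW. */

        /* ... create the ordered extent for the NOCOW write here ... */

        /* Also drops the block group reference taken by the lookup. */
        btrfs_dec_nocow_writers(bg);
        return true;
}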
     366             : 
     367           0 : void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
     368             :                                         const u64 start)
     369             : {
     370           0 :         struct btrfs_block_group *bg;
     371             : 
     372           0 :         bg = btrfs_lookup_block_group(fs_info, start);
     373           0 :         ASSERT(bg);
     374           0 :         if (atomic_dec_and_test(&bg->reservations))
     375           0 :                 wake_up_var(&bg->reservations);
     376           0 :         btrfs_put_block_group(bg);
     377           0 : }
     378             : 
     379           0 : void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg)
     380             : {
     381           0 :         struct btrfs_space_info *space_info = bg->space_info;
     382             : 
     383           0 :         ASSERT(bg->ro);
     384             : 
     385           0 :         if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
     386             :                 return;
     387             : 
     388             :         /*
      389             :          * Our block group is read-only, but before we set it to read-only
      390             :          * some task might have allocated an extent from it already, but it
     391             :          * has not yet created a respective ordered extent (and added it to a
     392             :          * root's list of ordered extents).
     393             :          * Therefore wait for any task currently allocating extents, since the
     394             :          * block group's reservations counter is incremented while a read lock
     395             :          * on the groups' semaphore is held and decremented after releasing
     396             :          * the read access on that semaphore and creating the ordered extent.
     397             :          */
     398           0 :         down_write(&space_info->groups_sem);
     399           0 :         up_write(&space_info->groups_sem);
     400             : 
     401           0 :         wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
     402             : }
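/*
 * Editor's illustrative sketch, not in the original file: the empty
 * write-lock/unlock pair above is a general rwsem idiom.  down_write() only
 * returns once every current reader has left the read section, so the pair
 * acts as a barrier against in-flight readers without excluding new ones
 * afterwards.
 */
static void __maybe_unused example_rwsem_barrier(struct rw_semaphore *sem)
{
        down_write(sem);        /* Wait for all current readers to finish. */
        up_write(sem);          /* We never needed the write side itself. */
}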
     403             : 
     404           0 : struct btrfs_caching_control *btrfs_get_caching_control(
     405             :                 struct btrfs_block_group *cache)
     406             : {
     407           0 :         struct btrfs_caching_control *ctl;
     408             : 
     409           0 :         spin_lock(&cache->lock);
     410           0 :         if (!cache->caching_ctl) {
     411           0 :                 spin_unlock(&cache->lock);
     412           0 :                 return NULL;
     413             :         }
     414             : 
     415           0 :         ctl = cache->caching_ctl;
     416           0 :         refcount_inc(&ctl->count);
     417           0 :         spin_unlock(&cache->lock);
     418           0 :         return ctl;
     419             : }
     420             : 
     421           0 : void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
     422             : {
     423           0 :         if (refcount_dec_and_test(&ctl->count))
     424           0 :                 kfree(ctl);
     425           0 : }
     426             : 
     427             : /*
      428             :  * When we wait for progress in the block group caching, it's because our
     429             :  * allocation attempt failed at least once.  So, we must sleep and let some
     430             :  * progress happen before we try again.
     431             :  *
     432             :  * This function will sleep at least once waiting for new free space to show
     433             :  * up, and then it will check the block group free space numbers for our min
     434             :  * num_bytes.  Another option is to have it go ahead and look in the rbtree for
     435             :  * a free extent of a given size, but this is a good start.
     436             :  *
     437             :  * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
     438             :  * any of the information in this block group.
     439             :  */
     440           0 : void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
     441             :                                            u64 num_bytes)
     442             : {
     443           0 :         struct btrfs_caching_control *caching_ctl;
     444             : 
     445           0 :         caching_ctl = btrfs_get_caching_control(cache);
     446           0 :         if (!caching_ctl)
     447             :                 return;
     448             : 
     449           0 :         wait_event(caching_ctl->wait, btrfs_block_group_done(cache) ||
     450             :                    (cache->free_space_ctl->free_space >= num_bytes));
     451             : 
     452           0 :         btrfs_put_caching_control(caching_ctl);
     453             : }
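/*
 * Editor's illustrative sketch, not in the original file: per the comment
 * above, callers must re-check the caching state after waiting for progress.
 */
static int __maybe_unused example_wait_progress(struct btrfs_block_group *cache,
                                                u64 num_bytes)
{
        btrfs_wait_block_group_cache_progress(cache, num_bytes);
        if (cache->cached == BTRFS_CACHE_ERROR)
                return -EIO;    /* Don't trust the free space numbers. */
        return 0;
}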
     454             : 
     455           0 : static int btrfs_caching_ctl_wait_done(struct btrfs_block_group *cache,
     456             :                                        struct btrfs_caching_control *caching_ctl)
     457             : {
     458           0 :         wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
     459           0 :         return cache->cached == BTRFS_CACHE_ERROR ? -EIO : 0;
     460             : }
     461             : 
     462           0 : static int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
     463             : {
     464           0 :         struct btrfs_caching_control *caching_ctl;
     465           0 :         int ret;
     466             : 
     467           0 :         caching_ctl = btrfs_get_caching_control(cache);
     468           0 :         if (!caching_ctl)
     469           0 :                 return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
     470           0 :         ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
     471           0 :         btrfs_put_caching_control(caching_ctl);
     472           0 :         return ret;
     473             : }
     474             : 
     475             : #ifdef CONFIG_BTRFS_DEBUG
     476             : static void fragment_free_space(struct btrfs_block_group *block_group)
     477             : {
     478             :         struct btrfs_fs_info *fs_info = block_group->fs_info;
     479             :         u64 start = block_group->start;
     480             :         u64 len = block_group->length;
     481             :         u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
     482             :                 fs_info->nodesize : fs_info->sectorsize;
     483             :         u64 step = chunk << 1;
     484             : 
     485             :         while (len > chunk) {
     486             :                 btrfs_remove_free_space(block_group, start, chunk);
     487             :                 start += step;
     488             :                 if (len < step)
     489             :                         len = 0;
     490             :                 else
     491             :                         len -= step;
     492             :         }
     493             : }
     494             : #endif
     495             : 
     496             : /*
      497             :  * This is only called by btrfs_cache_block_group. Since we could have freed
      498             :  * extents, we need to check the pinned_extents for any extents that can't be
      499             :  * used yet because their free space will be released as soon as the transaction
     500             :  * commits.
     501             :  */
     502           0 : int add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end,
     503             :                        u64 *total_added_ret)
     504             : {
     505           0 :         struct btrfs_fs_info *info = block_group->fs_info;
     506           0 :         u64 extent_start, extent_end, size;
     507           0 :         int ret;
     508             : 
     509           0 :         if (total_added_ret)
     510           0 :                 *total_added_ret = 0;
     511             : 
     512           0 :         while (start < end) {
     513           0 :                 ret = find_first_extent_bit(&info->excluded_extents, start,
     514             :                                             &extent_start, &extent_end,
     515             :                                             EXTENT_DIRTY | EXTENT_UPTODATE,
     516             :                                             NULL);
     517           0 :                 if (ret)
     518             :                         break;
     519             : 
     520           0 :                 if (extent_start <= start) {
     521           0 :                         start = extent_end + 1;
     522           0 :                 } else if (extent_start > start && extent_start < end) {
     523           0 :                         size = extent_start - start;
     524           0 :                         ret = btrfs_add_free_space_async_trimmed(block_group,
     525             :                                                                  start, size);
     526           0 :                         if (ret)
     527           0 :                                 return ret;
     528           0 :                         if (total_added_ret)
     529           0 :                                 *total_added_ret += size;
     530           0 :                         start = extent_end + 1;
     531             :                 } else {
     532             :                         break;
     533             :                 }
     534             :         }
     535             : 
     536           0 :         if (start < end) {
     537           0 :                 size = end - start;
     538           0 :                 ret = btrfs_add_free_space_async_trimmed(block_group, start,
     539             :                                                          size);
     540           0 :                 if (ret)
     541             :                         return ret;
     542           0 :                 if (total_added_ret)
     543           0 :                         *total_added_ret += size;
     544             :         }
     545             : 
     546             :         return 0;
     547             : }
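/*
 * Editor's worked example with hypothetical numbers, not in the original
 * file: for a block group spanning [1G, 1G + 256M) with a single excluded
 * extent at [1G + 32M, 1G + 48M), the loop above adds [1G, 1G + 32M) as free
 * space, the tail branch adds [1G + 48M, 1G + 256M), and *total_added_ret
 * ends up as 32M + 208M = 240M.
 */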
     548             : 
     549             : /*
      550             :  * Sample an arbitrary extent item at step index / max_index through the block group
      551             :  *
      552             :  * @block_group:  the block group to sample from
      553             :  * @index:        the integral step through the block group to grab from
      554             :  * @max_index:    the granularity of the sampling
      555             :  * @found_key:    return value parameter for the item we find
     556             :  *
     557             :  * Pre-conditions on indices:
     558             :  * 0 <= index <= max_index
     559             :  * 0 < max_index
     560             :  *
     561             :  * Returns: 0 on success, 1 if the search didn't yield a useful item, negative
     562             :  * error code on error.
     563             :  */
     564           0 : static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl,
     565             :                                           struct btrfs_block_group *block_group,
     566             :                                           int index, int max_index,
     567             :                                           struct btrfs_key *found_key)
     568             : {
     569           0 :         struct btrfs_fs_info *fs_info = block_group->fs_info;
     570           0 :         struct btrfs_root *extent_root;
     571           0 :         u64 search_offset;
     572           0 :         u64 search_end = block_group->start + block_group->length;
     573           0 :         struct btrfs_path *path;
     574           0 :         struct btrfs_key search_key;
     575           0 :         int ret = 0;
     576             : 
     577           0 :         ASSERT(index >= 0);
     578           0 :         ASSERT(index <= max_index);
     579           0 :         ASSERT(max_index > 0);
     580           0 :         lockdep_assert_held(&caching_ctl->mutex);
     581           0 :         lockdep_assert_held_read(&fs_info->commit_root_sem);
     582             : 
     583           0 :         path = btrfs_alloc_path();
     584           0 :         if (!path)
     585             :                 return -ENOMEM;
     586             : 
     587           0 :         extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start,
     588             :                                                        BTRFS_SUPER_INFO_OFFSET));
     589             : 
     590           0 :         path->skip_locking = 1;
     591           0 :         path->search_commit_root = 1;
     592           0 :         path->reada = READA_FORWARD;
     593             : 
     594           0 :         search_offset = index * div_u64(block_group->length, max_index);
     595           0 :         search_key.objectid = block_group->start + search_offset;
     596           0 :         search_key.type = BTRFS_EXTENT_ITEM_KEY;
     597           0 :         search_key.offset = 0;
     598             : 
     599           0 :         btrfs_for_each_slot(extent_root, &search_key, found_key, path, ret) {
     600             :                 /* Success; sampled an extent item in the block group */
     601           0 :                 if (found_key->type == BTRFS_EXTENT_ITEM_KEY &&
     602           0 :                     found_key->objectid >= block_group->start &&
     603           0 :                     found_key->objectid + found_key->offset <= search_end)
     604             :                         break;
     605             : 
     606             :                 /* We can't possibly find a valid extent item anymore */
     607           0 :                 if (found_key->objectid >= search_end) {
     608             :                         ret = 1;
     609             :                         break;
     610             :                 }
     611             :         }
     612             : 
     613           0 :         lockdep_assert_held(&caching_ctl->mutex);
     614           0 :         lockdep_assert_held_read(&fs_info->commit_root_sem);
     615           0 :         btrfs_free_path(path);
     616           0 :         return ret;
     617             : }
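/*
 * Editor's worked example, not in the original file: with max_index == 5 the
 * searches start at even fifths of the block group, i.e. for a 1G block group
 * indices 0..4 begin at start + 0, start + ~204.8M, start + ~409.6M,
 * start + ~614.4M and start + ~819.2M, and each search runs forward from
 * there until it leaves the block group.
 */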
     618             : 
     619             : /*
     620             :  * Best effort attempt to compute a block group's size class while caching it.
     621             :  *
     622             :  * @block_group: the block group we are caching
     623             :  *
     624             :  * We cannot infer the size class while adding free space extents, because that
     625             :  * logic doesn't care about contiguous file extents (it doesn't differentiate
     626             :  * between a 100M extent and 100 contiguous 1M extents). So we need to read the
     627             :  * file extent items. Reading all of them is quite wasteful, because usually
     628             :  * only a handful are enough to give a good answer. Therefore, we just grab 5 of
     629             :  * them at even steps through the block group and pick the smallest size class
     630             :  * we see. Since size class is best effort, and not guaranteed in general,
     631             :  * inaccuracy is acceptable.
     632             :  *
     633             :  * To be more explicit about why this algorithm makes sense:
     634             :  *
     635             :  * If we are caching in a block group from disk, then there are three major cases
     636             :  * to consider:
     637             :  * 1. the block group is well behaved and all extents in it are the same size
     638             :  *    class.
      639             :  * 2. the block group is mostly one size class with rare exceptions for
      640             :  *    last-ditch allocations.
      641             :  * 3. the block group was populated before size classes and can have a totally
      642             :  *    arbitrary mix of size classes.
      643             :  *
      644             :  * In case 1, looking at any extent in the block group will yield the correct
      645             :  * result. For the mixed cases, taking the minimum size class seems like a good
      646             :  * approximation, since gaps left by frees will still be usable by that size
      647             :  * class. For case 2, a small handful of file extents is likely to yield the
      648             :  * right answer. For case 3, we can either read every file extent, or admit
      649             :  * that this is best effort anyway and try to stay fast.
     650             :  *
     651             :  * Returns: 0 on success, negative error code on error.
     652             :  */
     653           0 : static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl,
     654             :                                        struct btrfs_block_group *block_group)
     655             : {
     656           0 :         struct btrfs_fs_info *fs_info = block_group->fs_info;
     657           0 :         struct btrfs_key key;
     658           0 :         int i;
     659           0 :         u64 min_size = block_group->length;
     660           0 :         enum btrfs_block_group_size_class size_class = BTRFS_BG_SZ_NONE;
     661           0 :         int ret;
     662             : 
     663           0 :         if (!btrfs_block_group_should_use_size_class(block_group))
     664             :                 return 0;
     665             : 
     666             :         lockdep_assert_held(&caching_ctl->mutex);
     667             :         lockdep_assert_held_read(&fs_info->commit_root_sem);
     668           0 :         for (i = 0; i < 5; ++i) {
     669           0 :                 ret = sample_block_group_extent_item(caching_ctl, block_group, i, 5, &key);
     670           0 :                 if (ret < 0)
     671           0 :                         goto out;
     672           0 :                 if (ret > 0)
     673           0 :                         continue;
     674           0 :                 min_size = min_t(u64, min_size, key.offset);
     675           0 :                 size_class = btrfs_calc_block_group_size_class(min_size);
     676             :         }
     677           0 :         if (size_class != BTRFS_BG_SZ_NONE) {
     678           0 :                 spin_lock(&block_group->lock);
     679           0 :                 block_group->size_class = size_class;
     680           0 :                 spin_unlock(&block_group->lock);
     681             :         }
     682           0 : out:
     683             :         return ret;
     684             : }
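/*
 * Editor's worked example, not in the original file: if the five samples hit
 * extents of 16M, 1M, 16M, 8M and 12M, min_size settles at 1M and the block
 * group is assigned whatever size class btrfs_calc_block_group_size_class()
 * maps a 1M extent to; a single small stray sample is enough to lower the
 * class, which is the intended "take the minimum" behaviour.
 */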
     685             : 
     686           0 : static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
     687             : {
     688           0 :         struct btrfs_block_group *block_group = caching_ctl->block_group;
     689           0 :         struct btrfs_fs_info *fs_info = block_group->fs_info;
     690           0 :         struct btrfs_root *extent_root;
     691           0 :         struct btrfs_path *path;
     692           0 :         struct extent_buffer *leaf;
     693           0 :         struct btrfs_key key;
     694           0 :         u64 total_found = 0;
     695           0 :         u64 last = 0;
     696           0 :         u32 nritems;
     697           0 :         int ret;
     698           0 :         bool wakeup = true;
     699             : 
     700           0 :         path = btrfs_alloc_path();
     701           0 :         if (!path)
     702             :                 return -ENOMEM;
     703             : 
     704           0 :         last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET);
     705           0 :         extent_root = btrfs_extent_root(fs_info, last);
     706             : 
     707             : #ifdef CONFIG_BTRFS_DEBUG
     708             :         /*
     709             :          * If we're fragmenting we don't want to make anybody think we can
     710             :          * allocate from this block group until we've had a chance to fragment
     711             :          * the free space.
     712             :          */
     713             :         if (btrfs_should_fragment_free_space(block_group))
     714             :                 wakeup = false;
     715             : #endif
     716             :         /*
     717             :          * We don't want to deadlock with somebody trying to allocate a new
     718             :          * extent for the extent root while also trying to search the extent
     719             :          * root to add free space.  So we skip locking and search the commit
      720             :          * root, since it's read-only.
     721             :          */
     722           0 :         path->skip_locking = 1;
     723           0 :         path->search_commit_root = 1;
     724           0 :         path->reada = READA_FORWARD;
     725             : 
     726           0 :         key.objectid = last;
     727           0 :         key.offset = 0;
     728           0 :         key.type = BTRFS_EXTENT_ITEM_KEY;
     729             : 
     730             : next:
     731           0 :         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
     732           0 :         if (ret < 0)
     733           0 :                 goto out;
     734             : 
     735           0 :         leaf = path->nodes[0];
     736           0 :         nritems = btrfs_header_nritems(leaf);
     737             : 
     738           0 :         while (1) {
     739           0 :                 if (btrfs_fs_closing(fs_info) > 1) {
     740             :                         last = (u64)-1;
     741             :                         break;
     742             :                 }
     743             : 
     744           0 :                 if (path->slots[0] < nritems) {
     745           0 :                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
     746             :                 } else {
     747           0 :                         ret = btrfs_find_next_key(extent_root, path, &key, 0, 0);
     748           0 :                         if (ret)
     749             :                                 break;
     750             : 
     751           0 :                         if (need_resched() ||
     752             :                             rwsem_is_contended(&fs_info->commit_root_sem)) {
     753           0 :                                 btrfs_release_path(path);
     754           0 :                                 up_read(&fs_info->commit_root_sem);
     755           0 :                                 mutex_unlock(&caching_ctl->mutex);
     756           0 :                                 cond_resched();
     757           0 :                                 mutex_lock(&caching_ctl->mutex);
     758           0 :                                 down_read(&fs_info->commit_root_sem);
     759           0 :                                 goto next;
     760             :                         }
     761             : 
     762           0 :                         ret = btrfs_next_leaf(extent_root, path);
     763           0 :                         if (ret < 0)
     764           0 :                                 goto out;
     765           0 :                         if (ret)
     766             :                                 break;
     767           0 :                         leaf = path->nodes[0];
     768           0 :                         nritems = btrfs_header_nritems(leaf);
     769           0 :                         continue;
     770             :                 }
     771             : 
     772           0 :                 if (key.objectid < last) {
     773           0 :                         key.objectid = last;
     774           0 :                         key.offset = 0;
     775           0 :                         key.type = BTRFS_EXTENT_ITEM_KEY;
     776           0 :                         btrfs_release_path(path);
     777           0 :                         goto next;
     778             :                 }
     779             : 
     780           0 :                 if (key.objectid < block_group->start) {
     781           0 :                         path->slots[0]++;
     782           0 :                         continue;
     783             :                 }
     784             : 
     785           0 :                 if (key.objectid >= block_group->start + block_group->length)
     786             :                         break;
     787             : 
     788           0 :                 if (key.type == BTRFS_EXTENT_ITEM_KEY ||
     789             :                     key.type == BTRFS_METADATA_ITEM_KEY) {
     790           0 :                         u64 space_added;
     791             : 
     792           0 :                         ret = add_new_free_space(block_group, last, key.objectid,
     793             :                                                  &space_added);
     794           0 :                         if (ret)
     795           0 :                                 goto out;
     796           0 :                         total_found += space_added;
     797           0 :                         if (key.type == BTRFS_METADATA_ITEM_KEY)
     798           0 :                                 last = key.objectid +
     799           0 :                                         fs_info->nodesize;
     800             :                         else
     801           0 :                                 last = key.objectid + key.offset;
     802             : 
     803           0 :                         if (total_found > CACHING_CTL_WAKE_UP) {
     804           0 :                                 total_found = 0;
     805           0 :                                 if (wakeup)
     806           0 :                                         wake_up(&caching_ctl->wait);
     807             :                         }
     808             :                 }
     809           0 :                 path->slots[0]++;
     810             :         }
     811             : 
     812           0 :         ret = add_new_free_space(block_group, last,
     813           0 :                                  block_group->start + block_group->length,
     814             :                                  NULL);
     815           0 : out:
     816           0 :         btrfs_free_path(path);
     817           0 :         return ret;
     818             : }
     819             : 
     820           0 : static noinline void caching_thread(struct btrfs_work *work)
     821             : {
     822           0 :         struct btrfs_block_group *block_group;
     823           0 :         struct btrfs_fs_info *fs_info;
     824           0 :         struct btrfs_caching_control *caching_ctl;
     825           0 :         int ret;
     826             : 
     827           0 :         caching_ctl = container_of(work, struct btrfs_caching_control, work);
     828           0 :         block_group = caching_ctl->block_group;
     829           0 :         fs_info = block_group->fs_info;
     830             : 
     831           0 :         mutex_lock(&caching_ctl->mutex);
     832           0 :         down_read(&fs_info->commit_root_sem);
     833             : 
     834           0 :         load_block_group_size_class(caching_ctl, block_group);
     835           0 :         if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
     836           0 :                 ret = load_free_space_cache(block_group);
     837           0 :                 if (ret == 1) {
     838           0 :                         ret = 0;
     839           0 :                         goto done;
     840             :                 }
     841             : 
     842             :                 /*
      843             :                  * We failed to load the space cache, so set ourselves to
      844             :                  * BTRFS_CACHE_STARTED and carry on.
     845             :                  */
     846           0 :                 spin_lock(&block_group->lock);
     847           0 :                 block_group->cached = BTRFS_CACHE_STARTED;
     848           0 :                 spin_unlock(&block_group->lock);
     849           0 :                 wake_up(&caching_ctl->wait);
     850             :         }
     851             : 
     852             :         /*
     853             :          * If we are in the transaction that populated the free space tree we
     854             :          * can't actually cache from the free space tree as our commit root and
     855             :          * real root are the same, so we could change the contents of the blocks
     856             :          * while caching.  Instead do the slow caching in this case, and after
     857             :          * the transaction has committed we will be safe.
     858             :          */
     859           0 :         if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
     860           0 :             !(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags)))
     861           0 :                 ret = load_free_space_tree(caching_ctl);
     862             :         else
     863           0 :                 ret = load_extent_tree_free(caching_ctl);
     864           0 : done:
     865           0 :         spin_lock(&block_group->lock);
     866           0 :         block_group->caching_ctl = NULL;
     867           0 :         block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
     868           0 :         spin_unlock(&block_group->lock);
     869             : 
     870             : #ifdef CONFIG_BTRFS_DEBUG
     871             :         if (btrfs_should_fragment_free_space(block_group)) {
     872             :                 u64 bytes_used;
     873             : 
     874             :                 spin_lock(&block_group->space_info->lock);
     875             :                 spin_lock(&block_group->lock);
     876             :                 bytes_used = block_group->length - block_group->used;
     877             :                 block_group->space_info->bytes_used += bytes_used >> 1;
     878             :                 spin_unlock(&block_group->lock);
     879             :                 spin_unlock(&block_group->space_info->lock);
     880             :                 fragment_free_space(block_group);
     881             :         }
     882             : #endif
     883             : 
     884           0 :         up_read(&fs_info->commit_root_sem);
     885           0 :         btrfs_free_excluded_extents(block_group);
     886           0 :         mutex_unlock(&caching_ctl->mutex);
     887             : 
     888           0 :         wake_up(&caching_ctl->wait);
     889             : 
     890           0 :         btrfs_put_caching_control(caching_ctl);
     891           0 :         btrfs_put_block_group(block_group);
     892           0 : }
     893             : 
     894           0 : int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
     895             : {
     896           0 :         struct btrfs_fs_info *fs_info = cache->fs_info;
     897           0 :         struct btrfs_caching_control *caching_ctl = NULL;
     898           0 :         int ret = 0;
     899             : 
     900             :         /* Allocator for zoned filesystems does not use the cache at all */
     901           0 :         if (btrfs_is_zoned(fs_info))
     902             :                 return 0;
     903             : 
     904           0 :         caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
     905           0 :         if (!caching_ctl)
     906             :                 return -ENOMEM;
     907             : 
     908           0 :         INIT_LIST_HEAD(&caching_ctl->list);
     909           0 :         mutex_init(&caching_ctl->mutex);
     910           0 :         init_waitqueue_head(&caching_ctl->wait);
     911           0 :         caching_ctl->block_group = cache;
     912           0 :         refcount_set(&caching_ctl->count, 2);
     913           0 :         btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
     914             : 
     915           0 :         spin_lock(&cache->lock);
     916           0 :         if (cache->cached != BTRFS_CACHE_NO) {
     917           0 :                 kfree(caching_ctl);
     918             : 
     919           0 :                 caching_ctl = cache->caching_ctl;
     920           0 :                 if (caching_ctl)
     921           0 :                         refcount_inc(&caching_ctl->count);
     922           0 :                 spin_unlock(&cache->lock);
     923           0 :                 goto out;
     924             :         }
     925           0 :         WARN_ON(cache->caching_ctl);
     926           0 :         cache->caching_ctl = caching_ctl;
     927           0 :         cache->cached = BTRFS_CACHE_STARTED;
     928           0 :         spin_unlock(&cache->lock);
     929             : 
     930           0 :         write_lock(&fs_info->block_group_cache_lock);
     931           0 :         refcount_inc(&caching_ctl->count);
     932           0 :         list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
     933           0 :         write_unlock(&fs_info->block_group_cache_lock);
     934             : 
     935           0 :         btrfs_get_block_group(cache);
     936             : 
     937           0 :         btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
     938           0 : out:
     939           0 :         if (wait && caching_ctl)
     940           0 :                 ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
     941           0 :         if (caching_ctl)
     942           0 :                 btrfs_put_caching_control(caching_ctl);
     943             : 
     944             :         return ret;
     945             : }
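/*
 * Editor's illustrative sketch, not in the original file: a synchronous
 * caller simply passes wait == true and gets -EIO back if caching failed.
 */
static int __maybe_unused example_cache_and_wait(struct btrfs_block_group *bg)
{
        return btrfs_cache_block_group(bg, true);
}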
     946             : 
     947           0 : static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
     948             : {
     949           0 :         u64 extra_flags = chunk_to_extended(flags) &
     950             :                                 BTRFS_EXTENDED_PROFILE_MASK;
     951             : 
     952           0 :         write_seqlock(&fs_info->profiles_lock);
     953           0 :         if (flags & BTRFS_BLOCK_GROUP_DATA)
     954           0 :                 fs_info->avail_data_alloc_bits &= ~extra_flags;
     955           0 :         if (flags & BTRFS_BLOCK_GROUP_METADATA)
     956           0 :                 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
     957           0 :         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
     958           0 :                 fs_info->avail_system_alloc_bits &= ~extra_flags;
     959           0 :         write_sequnlock(&fs_info->profiles_lock);
     960           0 : }
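
/*
 * Editorial note: chunk_to_extended() maps the implicit SINGLE profile
 * (no profile bit set in the flags) to BTRFS_AVAIL_ALLOC_BIT_SINGLE, so
 * the mask clearing above also covers single-profile block groups.  The
 * seqlock lets lockless readers of the avail_*_alloc_bits fields simply
 * retry if they race with this update.
 */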
     961             : 
     962             : /*
     963             :  * Clear incompat bits for the following feature(s):
     964             :  *
     965             :  * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
     966             :  *            in the whole filesystem
     967             :  *
     968             :  * - RAID1C34 - same as above for RAID1C3 and RAID1C4 block groups
     969             :  */
     970           0 : static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
     971             : {
     972           0 :         bool found_raid56 = false;
     973           0 :         bool found_raid1c34 = false;
     974             : 
     975           0 :         if ((flags & BTRFS_BLOCK_GROUP_RAID56_MASK) ||
     976           0 :             (flags & BTRFS_BLOCK_GROUP_RAID1C3) ||
     977             :             (flags & BTRFS_BLOCK_GROUP_RAID1C4)) {
     978           0 :                 struct list_head *head = &fs_info->space_info;
     979           0 :                 struct btrfs_space_info *sinfo;
     980             : 
     981           0 :                 list_for_each_entry_rcu(sinfo, head, list) {
     982           0 :                         down_read(&sinfo->groups_sem);
     983           0 :                         if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
     984           0 :                                 found_raid56 = true;
     985           0 :                         if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
     986           0 :                                 found_raid56 = true;
     987           0 :                         if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C3]))
     988           0 :                                 found_raid1c34 = true;
     989           0 :                         if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C4]))
     990           0 :                                 found_raid1c34 = true;
     991           0 :                         up_read(&sinfo->groups_sem);
     992             :                 }
     993           0 :                 if (!found_raid56)
     994           0 :                         btrfs_clear_fs_incompat(fs_info, RAID56);
     995           0 :                 if (!found_raid1c34)
     996           0 :                         btrfs_clear_fs_incompat(fs_info, RAID1C34);
     997             :         }
     998           0 : }
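
/*
 * Editorial note: the scan above walks every space_info because RAID5/6
 * and RAID1C3/4 block groups may exist under data, metadata or system
 * space independently; an incompat bit may only be dropped once no block
 * group of that profile remains anywhere in the filesystem.
 */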
     999             : 
    1000           0 : static int remove_block_group_item(struct btrfs_trans_handle *trans,
    1001             :                                    struct btrfs_path *path,
    1002             :                                    struct btrfs_block_group *block_group)
    1003             : {
    1004           0 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    1005           0 :         struct btrfs_root *root;
    1006           0 :         struct btrfs_key key;
    1007           0 :         int ret;
    1008             : 
    1009           0 :         root = btrfs_block_group_root(fs_info);
    1010           0 :         key.objectid = block_group->start;
    1011           0 :         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
    1012           0 :         key.offset = block_group->length;
    1013             : 
    1014           0 :         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
    1015           0 :         if (ret > 0)
    1016             :                 ret = -ENOENT;
    1017           0 :         if (ret < 0)
    1018           0 :                 return ret;
    1019             : 
    1020           0 :         ret = btrfs_del_item(trans, root, path);
    1021           0 :         return ret;
    1022             : }
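
/*
 * Editorial note: btrfs_search_slot() returns 1 when the exact key is
 * not found (the path then points at the insertion slot), which
 * remove_block_group_item() maps to -ENOENT above; negative returns
 * (errors) are passed through unchanged.
 */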
    1023             : 
    1024           0 : int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
    1025             :                              u64 group_start, struct extent_map *em)
    1026             : {
    1027           0 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    1028           0 :         struct btrfs_path *path;
    1029           0 :         struct btrfs_block_group *block_group;
    1030           0 :         struct btrfs_free_cluster *cluster;
    1031           0 :         struct inode *inode;
    1032           0 :         struct kobject *kobj = NULL;
    1033           0 :         int ret;
    1034           0 :         int index;
    1035           0 :         int factor;
    1036           0 :         struct btrfs_caching_control *caching_ctl = NULL;
    1037           0 :         bool remove_em;
    1038           0 :         bool remove_rsv = false;
    1039             : 
    1040           0 :         block_group = btrfs_lookup_block_group(fs_info, group_start);
    1041           0 :         BUG_ON(!block_group);
    1042           0 :         BUG_ON(!block_group->ro);
    1043             : 
    1044           0 :         trace_btrfs_remove_block_group(block_group);
    1045             :         /*
    1046             :          * Free the reserved super bytes from this block group before
     1047             :          * removing it.
    1048             :          */
    1049           0 :         btrfs_free_excluded_extents(block_group);
    1050           0 :         btrfs_free_ref_tree_range(fs_info, block_group->start,
    1051             :                                   block_group->length);
    1052             : 
    1053           0 :         index = btrfs_bg_flags_to_raid_index(block_group->flags);
    1054           0 :         factor = btrfs_bg_type_to_factor(block_group->flags);
    1055             : 
    1056             :         /* make sure this block group isn't part of an allocation cluster */
    1057           0 :         cluster = &fs_info->data_alloc_cluster;
    1058           0 :         spin_lock(&cluster->refill_lock);
    1059           0 :         btrfs_return_cluster_to_free_space(block_group, cluster);
    1060           0 :         spin_unlock(&cluster->refill_lock);
    1061             : 
    1062             :         /*
    1063             :          * make sure this block group isn't part of a metadata
    1064             :          * allocation cluster
    1065             :          */
    1066           0 :         cluster = &fs_info->meta_alloc_cluster;
    1067           0 :         spin_lock(&cluster->refill_lock);
    1068           0 :         btrfs_return_cluster_to_free_space(block_group, cluster);
    1069           0 :         spin_unlock(&cluster->refill_lock);
    1070             : 
    1071           0 :         btrfs_clear_treelog_bg(block_group);
    1072           0 :         btrfs_clear_data_reloc_bg(block_group);
    1073             : 
    1074           0 :         path = btrfs_alloc_path();
    1075           0 :         if (!path) {
    1076           0 :                 ret = -ENOMEM;
    1077           0 :                 goto out;
    1078             :         }
    1079             : 
    1080             :         /*
    1081             :          * get the inode first so any iput calls done for the io_list
    1082             :          * aren't the final iput (no unlinks allowed now)
    1083             :          */
    1084           0 :         inode = lookup_free_space_inode(block_group, path);
    1085             : 
    1086           0 :         mutex_lock(&trans->transaction->cache_write_mutex);
    1087             :         /*
    1088             :          * Make sure our free space cache IO is done before removing the
    1089             :          * free space inode
    1090             :          */
    1091           0 :         spin_lock(&trans->transaction->dirty_bgs_lock);
    1092           0 :         if (!list_empty(&block_group->io_list)) {
    1093           0 :                 list_del_init(&block_group->io_list);
    1094             : 
    1095           0 :                 WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
    1096             : 
    1097           0 :                 spin_unlock(&trans->transaction->dirty_bgs_lock);
    1098           0 :                 btrfs_wait_cache_io(trans, block_group, path);
    1099           0 :                 btrfs_put_block_group(block_group);
    1100           0 :                 spin_lock(&trans->transaction->dirty_bgs_lock);
    1101             :         }
    1102             : 
    1103           0 :         if (!list_empty(&block_group->dirty_list)) {
    1104           0 :                 list_del_init(&block_group->dirty_list);
    1105           0 :                 remove_rsv = true;
    1106           0 :                 btrfs_put_block_group(block_group);
    1107             :         }
    1108           0 :         spin_unlock(&trans->transaction->dirty_bgs_lock);
    1109           0 :         mutex_unlock(&trans->transaction->cache_write_mutex);
    1110             : 
    1111           0 :         ret = btrfs_remove_free_space_inode(trans, inode, block_group);
    1112           0 :         if (ret)
    1113           0 :                 goto out;
    1114             : 
    1115           0 :         write_lock(&fs_info->block_group_cache_lock);
    1116           0 :         rb_erase_cached(&block_group->cache_node,
    1117             :                         &fs_info->block_group_cache_tree);
    1118           0 :         RB_CLEAR_NODE(&block_group->cache_node);
    1119             : 
    1120             :         /* Once for the block groups rbtree */
    1121           0 :         btrfs_put_block_group(block_group);
    1122             : 
    1123           0 :         write_unlock(&fs_info->block_group_cache_lock);
    1124             : 
    1125           0 :         down_write(&block_group->space_info->groups_sem);
    1126             :         /*
    1127             :          * we must use list_del_init so people can check to see if they
    1128             :          * are still on the list after taking the semaphore
    1129             :          */
    1130           0 :         list_del_init(&block_group->list);
    1131           0 :         if (list_empty(&block_group->space_info->block_groups[index])) {
    1132           0 :                 kobj = block_group->space_info->block_group_kobjs[index];
    1133           0 :                 block_group->space_info->block_group_kobjs[index] = NULL;
    1134           0 :                 clear_avail_alloc_bits(fs_info, block_group->flags);
    1135             :         }
    1136           0 :         up_write(&block_group->space_info->groups_sem);
    1137           0 :         clear_incompat_bg_bits(fs_info, block_group->flags);
    1138           0 :         if (kobj) {
    1139           0 :                 kobject_del(kobj);
    1140           0 :                 kobject_put(kobj);
    1141             :         }
    1142             : 
    1143           0 :         if (block_group->cached == BTRFS_CACHE_STARTED)
    1144           0 :                 btrfs_wait_block_group_cache_done(block_group);
    1145             : 
    1146           0 :         write_lock(&fs_info->block_group_cache_lock);
    1147           0 :         caching_ctl = btrfs_get_caching_control(block_group);
    1148           0 :         if (!caching_ctl) {
    1149           0 :                 struct btrfs_caching_control *ctl;
    1150             : 
    1151           0 :                 list_for_each_entry(ctl, &fs_info->caching_block_groups, list) {
    1152           0 :                         if (ctl->block_group == block_group) {
    1153           0 :                                 caching_ctl = ctl;
    1154           0 :                                 refcount_inc(&caching_ctl->count);
    1155             :                                 break;
    1156             :                         }
    1157             :                 }
    1158             :         }
    1159           0 :         if (caching_ctl)
    1160           0 :                 list_del_init(&caching_ctl->list);
    1161           0 :         write_unlock(&fs_info->block_group_cache_lock);
    1162             : 
    1163           0 :         if (caching_ctl) {
    1164             :                 /* Once for the caching bgs list and once for us. */
    1165           0 :                 btrfs_put_caching_control(caching_ctl);
    1166           0 :                 btrfs_put_caching_control(caching_ctl);
    1167             :         }
    1168             : 
    1169           0 :         spin_lock(&trans->transaction->dirty_bgs_lock);
    1170           0 :         WARN_ON(!list_empty(&block_group->dirty_list));
    1171           0 :         WARN_ON(!list_empty(&block_group->io_list));
    1172           0 :         spin_unlock(&trans->transaction->dirty_bgs_lock);
    1173             : 
    1174           0 :         btrfs_remove_free_space_cache(block_group);
    1175             : 
    1176           0 :         spin_lock(&block_group->space_info->lock);
    1177           0 :         list_del_init(&block_group->ro_list);
    1178             : 
    1179           0 :         if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
    1180           0 :                 WARN_ON(block_group->space_info->total_bytes
    1181             :                         < block_group->length);
    1182           0 :                 WARN_ON(block_group->space_info->bytes_readonly
    1183             :                         < block_group->length - block_group->zone_unusable);
    1184           0 :                 WARN_ON(block_group->space_info->bytes_zone_unusable
    1185             :                         < block_group->zone_unusable);
    1186           0 :                 WARN_ON(block_group->space_info->disk_total
    1187             :                         < block_group->length * factor);
    1188             :         }
    1189           0 :         block_group->space_info->total_bytes -= block_group->length;
    1190           0 :         block_group->space_info->bytes_readonly -=
    1191           0 :                 (block_group->length - block_group->zone_unusable);
    1192           0 :         block_group->space_info->bytes_zone_unusable -=
    1193           0 :                 block_group->zone_unusable;
    1194           0 :         block_group->space_info->disk_total -= block_group->length * factor;
    1195             : 
    1196           0 :         spin_unlock(&block_group->space_info->lock);
    1197             : 
    1198             :         /*
    1199             :          * Remove the free space for the block group from the free space tree
    1200             :          * and the block group's item from the extent tree before marking the
     1201             :          * block group as removed. This is to prevent races between tasks that
     1202             :          * freeze and unfreeze a block group, this task, and another task
     1203             :          * allocating a new block group: the unfreeze task ends up removing
    1204             :          * the block group's extent map before the task calling this function
    1205             :          * deletes the block group item from the extent tree, allowing for
    1206             :          * another task to attempt to create another block group with the same
    1207             :          * item key (and failing with -EEXIST and a transaction abort).
    1208             :          */
    1209           0 :         ret = remove_block_group_free_space(trans, block_group);
    1210           0 :         if (ret)
    1211           0 :                 goto out;
    1212             : 
    1213           0 :         ret = remove_block_group_item(trans, path, block_group);
    1214           0 :         if (ret < 0)
    1215           0 :                 goto out;
    1216             : 
    1217           0 :         spin_lock(&block_group->lock);
    1218           0 :         set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags);
    1219             : 
    1220             :         /*
    1221             :          * At this point trimming or scrub can't start on this block group,
    1222             :          * because we removed the block group from the rbtree
     1223             :          * fs_info->block_group_cache_tree, so no one can find it anymore.
     1224             :          * Even if someone already got this block group before we removed it
     1225             :          * from the rbtree, they have already incremented block_group->frozen;
     1226             :          * if they didn't, for the trimming case they won't find any free space
     1227             :          * entries because we already removed them all when we called
     1228             :          * btrfs_remove_free_space_cache().
    1229             :          *
    1230             :          * And we must not remove the extent map from the fs_info->mapping_tree
    1231             :          * to prevent the same logical address range and physical device space
    1232             :          * ranges from being reused for a new block group. This is needed to
    1233             :          * avoid races with trimming and scrub.
    1234             :          *
    1235             :          * An fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
    1236             :          * completely transactionless, so while it is trimming a range the
    1237             :          * currently running transaction might finish and a new one start,
    1238             :          * allowing for new block groups to be created that can reuse the same
    1239             :          * physical device locations unless we take this special care.
    1240             :          *
    1241             :          * There may also be an implicit trim operation if the file system
    1242             :          * is mounted with -odiscard. The same protections must remain
    1243             :          * in place until the extents have been discarded completely when
    1244             :          * the transaction commit has completed.
    1245             :          */
    1246           0 :         remove_em = (atomic_read(&block_group->frozen) == 0);
    1247           0 :         spin_unlock(&block_group->lock);
    1248             : 
    1249           0 :         if (remove_em) {
    1250           0 :                 struct extent_map_tree *em_tree;
    1251             : 
    1252           0 :                 em_tree = &fs_info->mapping_tree;
    1253           0 :                 write_lock(&em_tree->lock);
    1254           0 :                 remove_extent_mapping(em_tree, em);
    1255           0 :                 write_unlock(&em_tree->lock);
    1256             :                 /* once for the tree */
    1257           0 :                 free_extent_map(em);
    1258             :         }
    1259             : 
    1260           0 : out:
    1261             :         /* Once for the lookup reference */
    1262           0 :         btrfs_put_block_group(block_group);
    1263           0 :         if (remove_rsv)
    1264           0 :                 btrfs_delayed_refs_rsv_release(fs_info, 1);
    1265           0 :         btrfs_free_path(path);
    1266           0 :         return ret;
    1267             : }
    1268             : 
    1269           0 : struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
    1270             :                 struct btrfs_fs_info *fs_info, const u64 chunk_offset)
    1271             : {
    1272           0 :         struct btrfs_root *root = btrfs_block_group_root(fs_info);
    1273           0 :         struct extent_map_tree *em_tree = &fs_info->mapping_tree;
    1274           0 :         struct extent_map *em;
    1275           0 :         struct map_lookup *map;
    1276           0 :         unsigned int num_items;
    1277             : 
    1278           0 :         read_lock(&em_tree->lock);
    1279           0 :         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
    1280           0 :         read_unlock(&em_tree->lock);
    1281           0 :         ASSERT(em && em->start == chunk_offset);
    1282             : 
    1283             :         /*
    1284             :          * We need to reserve 3 + N units from the metadata space info in order
    1285             :          * to remove a block group (done at btrfs_remove_chunk() and at
    1286             :          * btrfs_remove_block_group()), which are used for:
    1287             :          *
    1288             :          * 1 unit for adding the free space inode's orphan (located in the tree
    1289             :          * of tree roots).
    1290             :          * 1 unit for deleting the block group item (located in the extent
    1291             :          * tree).
    1292             :          * 1 unit for deleting the free space item (located in tree of tree
    1293             :          * roots).
    1294             :          * N units for deleting N device extent items corresponding to each
    1295             :          * stripe (located in the device tree).
    1296             :          *
    1297             :          * In order to remove a block group we also need to reserve units in the
    1298             :          * system space info in order to update the chunk tree (update one or
    1299             :          * more device items and remove one chunk item), but this is done at
    1300             :          * btrfs_remove_chunk() through a call to check_system_chunk().
    1301             :          */
    1302           0 :         map = em->map_lookup;
    1303           0 :         num_items = 3 + map->num_stripes;
    1304           0 :         free_extent_map(em);
    1305             : 
    1306           0 :         return btrfs_start_transaction_fallback_global_rsv(root, num_items);
    1307             : }
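
/*
 * Editorial example: removing a chunk striped across two devices
 * (map->num_stripes == 2) therefore starts a transaction sized for
 * 3 + 2 = 5 metadata items.
 */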
    1308             : 
    1309             : /*
     1310             :  * Mark block group @cache read-only, so later writes won't happen to block
    1311             :  * group @cache.
    1312             :  *
    1313             :  * If @force is not set, this function will only mark the block group readonly
    1314             :  * if we have enough free space (1M) in other metadata/system block groups.
     1315             :  * If @force is set, this function will mark the block group readonly
    1316             :  * without checking free space.
    1317             :  *
    1318             :  * NOTE: This function doesn't care if other block groups can contain all the
    1319             :  * data in this block group. That check should be done by relocation routine,
    1320             :  * not this function.
    1321             :  */
    1322           0 : static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
    1323             : {
    1324           0 :         struct btrfs_space_info *sinfo = cache->space_info;
    1325           0 :         u64 num_bytes;
    1326           0 :         int ret = -ENOSPC;
    1327             : 
    1328           0 :         spin_lock(&sinfo->lock);
    1329           0 :         spin_lock(&cache->lock);
    1330             : 
    1331           0 :         if (cache->swap_extents) {
    1332           0 :                 ret = -ETXTBSY;
    1333           0 :                 goto out;
    1334             :         }
    1335             : 
    1336           0 :         if (cache->ro) {
    1337           0 :                 cache->ro++;
    1338           0 :                 ret = 0;
    1339           0 :                 goto out;
    1340             :         }
    1341             : 
    1342           0 :         num_bytes = cache->length - cache->reserved - cache->pinned -
    1343           0 :                     cache->bytes_super - cache->zone_unusable - cache->used;
    1344             : 
    1345             :         /*
     1346             :          * Data never overcommits, even in mixed mode, so just do the straight
     1347             :          * check of leftover space against how much we have allocated.
    1348             :          */
    1349           0 :         if (force) {
    1350             :                 ret = 0;
    1351           0 :         } else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) {
    1352           0 :                 u64 sinfo_used = btrfs_space_info_used(sinfo, true);
    1353             : 
    1354             :                 /*
     1355             :                  * Here we make sure that if we mark this bg RO, we still have
     1356             :                  * enough free space left as a buffer.
    1357             :                  */
    1358           0 :                 if (sinfo_used + num_bytes <= sinfo->total_bytes)
    1359             :                         ret = 0;
    1360             :         } else {
    1361             :                 /*
    1362             :                  * We overcommit metadata, so we need to do the
    1363             :                  * btrfs_can_overcommit check here, and we need to pass in
    1364             :                  * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of
    1365             :                  * leeway to allow us to mark this block group as read only.
    1366             :                  */
    1367           0 :                 if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes,
    1368             :                                          BTRFS_RESERVE_NO_FLUSH))
    1369             :                         ret = 0;
    1370             :         }
    1371             : 
    1372             :         if (!ret) {
    1373           0 :                 sinfo->bytes_readonly += num_bytes;
    1374           0 :                 if (btrfs_is_zoned(cache->fs_info)) {
    1375             :                         /* Migrate zone_unusable bytes to readonly */
    1376           0 :                         sinfo->bytes_readonly += cache->zone_unusable;
    1377           0 :                         sinfo->bytes_zone_unusable -= cache->zone_unusable;
    1378           0 :                         cache->zone_unusable = 0;
    1379             :                 }
    1380           0 :                 cache->ro++;
    1381           0 :                 list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
    1382             :         }
    1383           0 : out:
    1384           0 :         spin_unlock(&cache->lock);
    1385           0 :         spin_unlock(&sinfo->lock);
    1386           0 :         if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
    1387           0 :                 btrfs_info(cache->fs_info,
    1388             :                         "unable to make block group %llu ro", cache->start);
    1389           0 :                 btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
    1390             :         }
    1391           0 :         return ret;
    1392             : }
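
/*
 * Editorial sketch (hypothetical, uninstrumented): the pattern used by
 * the reclaim path below, reduced to its essentials.  A successful
 * relocation deletes the block group, so the RO bump is only undone on
 * failure.  Simplified: the real caller also holds
 * space_info->groups_sem while flipping the group read-only.
 */
static int __maybe_unused ro_guarded_relocate(struct btrfs_block_group *bg)
{
        int ret;

        ret = inc_block_group_ro(bg, 0);        /* no force: check space */
        if (ret < 0)
                return ret;

        ret = btrfs_relocate_chunk(bg->fs_info, bg->start);
        if (ret)
                btrfs_dec_block_group_ro(bg);   /* undo the RO bump */
        return ret;
}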
    1393             : 
    1394           0 : static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
    1395             :                                  struct btrfs_block_group *bg)
    1396             : {
    1397           0 :         struct btrfs_fs_info *fs_info = bg->fs_info;
    1398           0 :         struct btrfs_transaction *prev_trans = NULL;
    1399           0 :         const u64 start = bg->start;
    1400           0 :         const u64 end = start + bg->length - 1;
    1401           0 :         int ret;
    1402             : 
    1403           0 :         spin_lock(&fs_info->trans_lock);
    1404           0 :         if (trans->transaction->list.prev != &fs_info->trans_list) {
    1405           0 :                 prev_trans = list_last_entry(&trans->transaction->list,
    1406             :                                              struct btrfs_transaction, list);
    1407           0 :                 refcount_inc(&prev_trans->use_count);
    1408             :         }
    1409           0 :         spin_unlock(&fs_info->trans_lock);
    1410             : 
    1411             :         /*
    1412             :          * Hold the unused_bg_unpin_mutex lock to avoid racing with
    1413             :          * btrfs_finish_extent_commit(). If we are at transaction N, another
    1414             :          * task might be running finish_extent_commit() for the previous
    1415             :          * transaction N - 1, and have seen a range belonging to the block
    1416             :          * group in pinned_extents before we were able to clear the whole block
    1417             :          * group range from pinned_extents. This means that task can lookup for
    1418             :          * the block group after we unpinned it from pinned_extents and removed
    1419             :          * it, leading to a BUG_ON() at unpin_extent_range().
    1420             :          */
    1421           0 :         mutex_lock(&fs_info->unused_bg_unpin_mutex);
    1422           0 :         if (prev_trans) {
    1423           0 :                 ret = clear_extent_bits(&prev_trans->pinned_extents, start, end,
    1424             :                                         EXTENT_DIRTY);
    1425           0 :                 if (ret)
    1426           0 :                         goto out;
    1427             :         }
    1428             : 
    1429           0 :         ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end,
    1430             :                                 EXTENT_DIRTY);
    1431           0 : out:
    1432           0 :         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
    1433           0 :         if (prev_trans)
    1434           0 :                 btrfs_put_transaction(prev_trans);
    1435             : 
    1436           0 :         return ret == 0;
    1437             : }
    1438             : 
    1439             : /*
    1440             :  * Process the unused_bgs list and remove any that don't have any allocated
    1441             :  * space inside of them.
    1442             :  */
    1443           0 : void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
    1444             : {
    1445           0 :         struct btrfs_block_group *block_group;
    1446           0 :         struct btrfs_space_info *space_info;
    1447           0 :         struct btrfs_trans_handle *trans;
    1448           0 :         const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC);
    1449           0 :         int ret = 0;
    1450             : 
    1451           0 :         if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
    1452             :                 return;
    1453             : 
    1454           0 :         if (btrfs_fs_closing(fs_info))
    1455             :                 return;
    1456             : 
    1457             :         /*
    1458             :          * Long running balances can keep us blocked here for eternity, so
    1459             :          * simply skip deletion if we're unable to get the mutex.
    1460             :          */
    1461           0 :         if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
    1462             :                 return;
    1463             : 
    1464           0 :         spin_lock(&fs_info->unused_bgs_lock);
    1465           0 :         while (!list_empty(&fs_info->unused_bgs)) {
    1466           0 :                 int trimming;
    1467             : 
    1468           0 :                 block_group = list_first_entry(&fs_info->unused_bgs,
    1469             :                                                struct btrfs_block_group,
    1470             :                                                bg_list);
    1471           0 :                 list_del_init(&block_group->bg_list);
    1472             : 
    1473           0 :                 space_info = block_group->space_info;
    1474             : 
    1475           0 :                 if (ret || btrfs_mixed_space_info(space_info)) {
    1476           0 :                         btrfs_put_block_group(block_group);
    1477           0 :                         continue;
    1478             :                 }
    1479           0 :                 spin_unlock(&fs_info->unused_bgs_lock);
    1480             : 
    1481           0 :                 btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
    1482             : 
    1483             :                 /* Don't want to race with allocators so take the groups_sem */
    1484           0 :                 down_write(&space_info->groups_sem);
    1485             : 
    1486             :                 /*
    1487             :                  * Async discard moves the final block group discard to be prior
    1488             :                  * to the unused_bgs code path.  Therefore, if it's not fully
    1489             :                  * trimmed, punt it back to the async discard lists.
    1490             :                  */
    1491           0 :                 if (btrfs_test_opt(fs_info, DISCARD_ASYNC) &&
    1492           0 :                     !btrfs_is_free_space_trimmed(block_group)) {
    1493           0 :                         trace_btrfs_skip_unused_block_group(block_group);
    1494           0 :                         up_write(&space_info->groups_sem);
    1495             :                         /* Requeue if we failed because of async discard */
    1496           0 :                         btrfs_discard_queue_work(&fs_info->discard_ctl,
    1497             :                                                  block_group);
    1498           0 :                         goto next;
    1499             :                 }
    1500             : 
    1501           0 :                 spin_lock(&block_group->lock);
    1502           0 :                 if (block_group->reserved || block_group->pinned ||
    1503           0 :                     block_group->used || block_group->ro ||
    1504           0 :                     list_is_singular(&block_group->list)) {
    1505             :                         /*
    1506             :                          * We want to bail if we made new allocations or have
    1507             :                          * outstanding allocations in this block group.  We do
    1508             :                          * the ro check in case balance is currently acting on
    1509             :                          * this block group.
    1510             :                          */
    1511           0 :                         trace_btrfs_skip_unused_block_group(block_group);
    1512           0 :                         spin_unlock(&block_group->lock);
    1513           0 :                         up_write(&space_info->groups_sem);
    1514           0 :                         goto next;
    1515             :                 }
    1516           0 :                 spin_unlock(&block_group->lock);
    1517             : 
    1518             :                 /* We don't want to force the issue, only flip if it's ok. */
    1519           0 :                 ret = inc_block_group_ro(block_group, 0);
    1520           0 :                 up_write(&space_info->groups_sem);
    1521           0 :                 if (ret < 0) {
    1522           0 :                         ret = 0;
    1523           0 :                         goto next;
    1524             :                 }
    1525             : 
    1526           0 :                 ret = btrfs_zone_finish(block_group);
    1527           0 :                 if (ret < 0) {
    1528           0 :                         btrfs_dec_block_group_ro(block_group);
    1529           0 :                         if (ret == -EAGAIN)
    1530           0 :                                 ret = 0;
    1531           0 :                         goto next;
    1532             :                 }
    1533             : 
    1534             :                 /*
     1535             :                  * We want to do this before anything else so we can recover
    1536             :                  * properly if we fail to join the transaction.
    1537             :                  */
    1538           0 :                 trans = btrfs_start_trans_remove_block_group(fs_info,
    1539             :                                                      block_group->start);
    1540           0 :                 if (IS_ERR(trans)) {
    1541           0 :                         btrfs_dec_block_group_ro(block_group);
    1542           0 :                         ret = PTR_ERR(trans);
    1543           0 :                         goto next;
    1544             :                 }
    1545             : 
    1546             :                 /*
    1547             :                  * We could have pending pinned extents for this block group,
    1548             :                  * just delete them, we don't care about them anymore.
    1549             :                  */
    1550           0 :                 if (!clean_pinned_extents(trans, block_group)) {
    1551           0 :                         btrfs_dec_block_group_ro(block_group);
    1552           0 :                         goto end_trans;
    1553             :                 }
    1554             : 
    1555             :                 /*
    1556             :                  * At this point, the block_group is read only and should fail
    1557             :                  * new allocations.  However, btrfs_finish_extent_commit() can
    1558             :                  * cause this block_group to be placed back on the discard
    1559             :                  * lists because now the block_group isn't fully discarded.
    1560             :                  * Bail here and try again later after discarding everything.
    1561             :                  */
    1562           0 :                 spin_lock(&fs_info->discard_ctl.lock);
    1563           0 :                 if (!list_empty(&block_group->discard_list)) {
    1564           0 :                         spin_unlock(&fs_info->discard_ctl.lock);
    1565           0 :                         btrfs_dec_block_group_ro(block_group);
    1566           0 :                         btrfs_discard_queue_work(&fs_info->discard_ctl,
    1567             :                                                  block_group);
    1568           0 :                         goto end_trans;
    1569             :                 }
    1570           0 :                 spin_unlock(&fs_info->discard_ctl.lock);
    1571             : 
    1572             :                 /* Reset pinned so btrfs_put_block_group doesn't complain */
    1573           0 :                 spin_lock(&space_info->lock);
    1574           0 :                 spin_lock(&block_group->lock);
    1575             : 
    1576           0 :                 btrfs_space_info_update_bytes_pinned(fs_info, space_info,
    1577           0 :                                                      -block_group->pinned);
    1578           0 :                 space_info->bytes_readonly += block_group->pinned;
    1579           0 :                 block_group->pinned = 0;
    1580             : 
    1581           0 :                 spin_unlock(&block_group->lock);
    1582           0 :                 spin_unlock(&space_info->lock);
    1583             : 
    1584             :                 /*
     1585             :                  * On the normal path an unused block group is passed here, and
     1586             :                  * trimming is then handled in the transaction commit path.
     1587             :                  * Async discard interposes before this to do the trimming before
     1588             :                  * coming down the unused block group path, as trimming will no
     1589             :                  * longer be done later in the transaction commit path.
    1590             :                  */
    1591           0 :                 if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
    1592           0 :                         goto flip_async;
    1593             : 
    1594             :                 /*
    1595             :                  * DISCARD can flip during remount. On zoned filesystems, we
    1596             :                  * need to reset sequential-required zones.
    1597             :                  */
    1598           0 :                 trimming = btrfs_test_opt(fs_info, DISCARD_SYNC) ||
    1599             :                                 btrfs_is_zoned(fs_info);
    1600             : 
    1601             :                 /* Implicit trim during transaction commit. */
    1602           0 :                 if (trimming)
    1603           0 :                         btrfs_freeze_block_group(block_group);
    1604             : 
    1605             :                 /*
     1606             :                  * btrfs_remove_chunk() will abort the transaction if things go
    1607             :                  * horribly wrong.
    1608             :                  */
    1609           0 :                 ret = btrfs_remove_chunk(trans, block_group->start);
    1610             : 
    1611           0 :                 if (ret) {
    1612           0 :                         if (trimming)
    1613           0 :                                 btrfs_unfreeze_block_group(block_group);
    1614           0 :                         goto end_trans;
    1615             :                 }
    1616             : 
    1617             :                 /*
    1618             :                  * If we're not mounted with -odiscard, we can just forget
    1619             :                  * about this block group. Otherwise we'll need to wait
    1620             :                  * until transaction commit to do the actual discard.
    1621             :                  */
    1622           0 :                 if (trimming) {
    1623           0 :                         spin_lock(&fs_info->unused_bgs_lock);
    1624             :                         /*
    1625             :                          * A concurrent scrub might have added us to the list
    1626             :                          * fs_info->unused_bgs, so use a list_move operation
    1627             :                          * to add the block group to the deleted_bgs list.
    1628             :                          */
    1629           0 :                         list_move(&block_group->bg_list,
    1630           0 :                                   &trans->transaction->deleted_bgs);
    1631           0 :                         spin_unlock(&fs_info->unused_bgs_lock);
    1632           0 :                         btrfs_get_block_group(block_group);
    1633             :                 }
    1634           0 : end_trans:
    1635           0 :                 btrfs_end_transaction(trans);
    1636           0 : next:
    1637           0 :                 btrfs_put_block_group(block_group);
    1638           0 :                 spin_lock(&fs_info->unused_bgs_lock);
    1639             :         }
    1640           0 :         spin_unlock(&fs_info->unused_bgs_lock);
    1641           0 :         mutex_unlock(&fs_info->reclaim_bgs_lock);
    1642           0 :         return;
    1643             : 
    1644             : flip_async:
    1645           0 :         btrfs_end_transaction(trans);
    1646           0 :         mutex_unlock(&fs_info->reclaim_bgs_lock);
    1647           0 :         btrfs_put_block_group(block_group);
    1648           0 :         btrfs_discard_punt_unused_bgs_list(fs_info);
    1649             : }
    1650             : 
    1651           0 : void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
    1652             : {
    1653           0 :         struct btrfs_fs_info *fs_info = bg->fs_info;
    1654             : 
    1655           0 :         spin_lock(&fs_info->unused_bgs_lock);
    1656           0 :         if (list_empty(&bg->bg_list)) {
    1657           0 :                 btrfs_get_block_group(bg);
    1658           0 :                 trace_btrfs_add_unused_block_group(bg);
    1659           0 :                 list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
    1660           0 :         } else if (!test_bit(BLOCK_GROUP_FLAG_NEW, &bg->runtime_flags)) {
    1661             :                 /* Pull out the block group from the reclaim_bgs list. */
    1662           0 :                 trace_btrfs_add_unused_block_group(bg);
    1663           0 :                 list_move_tail(&bg->bg_list, &fs_info->unused_bgs);
    1664             :         }
    1665           0 :         spin_unlock(&fs_info->unused_bgs_lock);
    1666           0 : }
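
/*
 * Editorial note: bg->bg_list is a single link shared by several
 * fs-level lists, so per the in-source comment above, a group that is
 * already linked here is taken to sit on the reclaim_bgs list and is
 * moved to unused_bgs, unless it is still flagged BLOCK_GROUP_FLAG_NEW.
 */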
    1667             : 
    1668             : /*
     1669             :  * We want block groups with a low number of used bytes to be at the beginning
    1670             :  * of the list, so they will get reclaimed first.
    1671             :  */
    1672           0 : static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
    1673             :                            const struct list_head *b)
    1674             : {
    1675           0 :         const struct btrfs_block_group *bg1, *bg2;
    1676             : 
    1677           0 :         bg1 = list_entry(a, struct btrfs_block_group, bg_list);
    1678           0 :         bg2 = list_entry(b, struct btrfs_block_group, bg_list);
    1679             : 
    1680           0 :         return bg1->used > bg2->used;
    1681             : }
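
/*
 * Editorial note: list_sort() treats a positive comparator return as
 * "a sorts after b", so returning bg1->used > bg2->used produces an
 * ascending sort by used bytes - the emptiest block groups end up at
 * the front, matching the comment above.
 */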
    1682             : 
    1683             : static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info)
    1684             : {
    1685           0 :         if (btrfs_is_zoned(fs_info))
    1686           0 :                 return btrfs_zoned_should_reclaim(fs_info);
    1687             :         return true;
    1688             : }
    1689             : 
    1690           0 : static bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 bytes_freed)
    1691             : {
    1692           0 :         const struct btrfs_space_info *space_info = bg->space_info;
    1693           0 :         const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold);
    1694           0 :         const u64 new_val = bg->used;
    1695           0 :         const u64 old_val = new_val + bytes_freed;
    1696           0 :         u64 thresh;
    1697             : 
    1698           0 :         if (reclaim_thresh == 0)
    1699             :                 return false;
    1700             : 
    1701           0 :         thresh = mult_perc(bg->length, reclaim_thresh);
    1702             : 
    1703             :         /*
    1704             :          * If we were below the threshold before don't reclaim, we are likely a
    1705             :          * brand new block group and we don't want to relocate new block groups.
    1706             :          */
    1707           0 :         if (old_val < thresh)
    1708             :                 return false;
    1709           0 :         if (new_val >= thresh)
    1710           0 :                 return false;
    1711             :         return true;
    1712             : }
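
/*
 * Editorial example with illustrative numbers: for a 1 GiB block group
 * and bg_reclaim_threshold = 75, thresh = mult_perc(1 GiB, 75) =
 * 768 MiB.  A free dropping bg->used from 800 MiB (old_val, above the
 * threshold) to 500 MiB (new_val, below it) returns true; a group that
 * was already below 768 MiB before the free is skipped as likely brand
 * new.
 */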
    1713             : 
    1714           0 : void btrfs_reclaim_bgs_work(struct work_struct *work)
    1715             : {
    1716           0 :         struct btrfs_fs_info *fs_info =
    1717           0 :                 container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
    1718           0 :         struct btrfs_block_group *bg;
    1719           0 :         struct btrfs_space_info *space_info;
    1720             : 
    1721           0 :         if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
    1722             :                 return;
    1723             : 
    1724           0 :         if (btrfs_fs_closing(fs_info))
    1725             :                 return;
    1726             : 
    1727           0 :         if (!btrfs_should_reclaim(fs_info))
    1728             :                 return;
    1729             : 
    1730           0 :         sb_start_write(fs_info->sb);
    1731             : 
    1732           0 :         if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
    1733           0 :                 sb_end_write(fs_info->sb);
    1734           0 :                 return;
    1735             :         }
    1736             : 
    1737             :         /*
    1738             :          * Long running balances can keep us blocked here for eternity, so
    1739             :          * simply skip reclaim if we're unable to get the mutex.
    1740             :          */
    1741           0 :         if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
    1742           0 :                 btrfs_exclop_finish(fs_info);
    1743           0 :                 sb_end_write(fs_info->sb);
    1744           0 :                 return;
    1745             :         }
    1746             : 
    1747           0 :         spin_lock(&fs_info->unused_bgs_lock);
    1748             :         /*
    1749             :          * Sort happens under lock because we can't simply splice it and sort.
    1750             :          * The block groups might still be in use and reachable via bg_list,
    1751             :          * and their presence in the reclaim_bgs list must be preserved.
    1752             :          */
    1753           0 :         list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
    1754           0 :         while (!list_empty(&fs_info->reclaim_bgs)) {
    1755           0 :                 u64 zone_unusable;
    1756           0 :                 int ret = 0;
    1757             : 
    1758           0 :                 bg = list_first_entry(&fs_info->reclaim_bgs,
    1759             :                                       struct btrfs_block_group,
    1760             :                                       bg_list);
    1761           0 :                 list_del_init(&bg->bg_list);
    1762             : 
    1763           0 :                 space_info = bg->space_info;
    1764           0 :                 spin_unlock(&fs_info->unused_bgs_lock);
    1765             : 
    1766             :                 /* Don't race with allocators so take the groups_sem */
    1767           0 :                 down_write(&space_info->groups_sem);
    1768             : 
    1769           0 :                 spin_lock(&bg->lock);
    1770           0 :                 if (bg->reserved || bg->pinned || bg->ro) {
    1771             :                         /*
    1772             :                          * We want to bail if we made new allocations or have
    1773             :                          * outstanding allocations in this block group.  We do
    1774             :                          * the ro check in case balance is currently acting on
    1775             :                          * this block group.
    1776             :                          */
    1777           0 :                         spin_unlock(&bg->lock);
    1778           0 :                         up_write(&space_info->groups_sem);
    1779           0 :                         goto next;
    1780             :                 }
    1781           0 :                 if (bg->used == 0) {
    1782             :                         /*
    1783             :                          * It is possible that we trigger relocation on a block
    1784             :                          * group as its extents are deleted and it first goes
    1785             :                          * below the threshold, then shortly after goes empty.
    1786             :                          *
    1787             :                          * In this case, relocating it does delete it, but has
    1788             :                          * some overhead in relocation specific metadata, looking
    1789             :                          * for the non-existent extents and running some extra
    1790             :                          * transactions, which we can avoid by using one of the
    1791             :                          * other mechanisms for dealing with empty block groups.
    1792             :                          */
    1793           0 :                         if (!btrfs_test_opt(fs_info, DISCARD_ASYNC))
    1794           0 :                                 btrfs_mark_bg_unused(bg);
    1795           0 :                         spin_unlock(&bg->lock);
    1796           0 :                         up_write(&space_info->groups_sem);
    1797           0 :                         goto next;
    1798             : 
    1799             :                 }
    1800             :                 /*
    1801             :                  * The block group might no longer meet the reclaim condition by
    1802             :                  * the time we get around to reclaiming it, so to avoid
    1803             :                  * reclaiming overly full block_groups, skip reclaiming them.
    1804             :                  *
     1805             :                  * Since the decision-making process also depends on the amount
    1806             :                  * being freed, pass in a fake giant value to skip that extra
    1807             :                  * check, which is more meaningful when adding to the list in
    1808             :                  * the first place.
    1809             :                  */
    1810           0 :                 if (!should_reclaim_block_group(bg, bg->length)) {
    1811           0 :                         spin_unlock(&bg->lock);
    1812           0 :                         up_write(&space_info->groups_sem);
    1813           0 :                         goto next;
    1814             :                 }
    1815           0 :                 spin_unlock(&bg->lock);
    1816             : 
    1817             :                 /*
    1818             :                  * Get out fast, in case we're read-only or unmounting the
    1819             :                  * filesystem. It is OK to drop block groups from the list even
    1820             :                  * for the read-only case. As we did sb_start_write(),
     1821             :                  * "mount -o remount,ro" won't happen and a read-only filesystem
    1822             :                  * means it is forced read-only due to a fatal error. So, it
    1823             :                  * never gets back to read-write to let us reclaim again.
    1824             :                  */
    1825           0 :                 if (btrfs_need_cleaner_sleep(fs_info)) {
    1826           0 :                         up_write(&space_info->groups_sem);
    1827           0 :                         goto next;
    1828             :                 }
    1829             : 
    1830             :                 /*
    1831             :                  * Cache the zone_unusable value before turning the block group
     1832             :                  * read-only. As soon as the block group is read-only, its
    1833             :                  * zone_unusable value gets moved to the block group's read-only
    1834             :                  * bytes and isn't available for calculations anymore.
    1835             :                  */
    1836           0 :                 zone_unusable = bg->zone_unusable;
    1837           0 :                 ret = inc_block_group_ro(bg, 0);
    1838           0 :                 up_write(&space_info->groups_sem);
    1839           0 :                 if (ret < 0)
    1840           0 :                         goto next;
    1841             : 
    1842           0 :                 btrfs_info(fs_info,
    1843             :                         "reclaiming chunk %llu with %llu%% used %llu%% unusable",
    1844             :                                 bg->start,
    1845             :                                 div64_u64(bg->used * 100, bg->length),
    1846             :                                 div64_u64(zone_unusable * 100, bg->length));
    1847           0 :                 trace_btrfs_reclaim_block_group(bg);
    1848           0 :                 ret = btrfs_relocate_chunk(fs_info, bg->start);
    1849           0 :                 if (ret) {
    1850           0 :                         btrfs_dec_block_group_ro(bg);
    1851           0 :                         btrfs_err(fs_info, "error relocating chunk %llu",
    1852             :                                   bg->start);
    1853             :                 }
    1854             : 
    1855           0 : next:
    1856           0 :                 if (ret)
    1857           0 :                         btrfs_mark_bg_to_reclaim(bg);
    1858           0 :                 btrfs_put_block_group(bg);
    1859             : 
    1860           0 :                 mutex_unlock(&fs_info->reclaim_bgs_lock);
    1861             :                 /*
    1862             :                  * Reclaiming all the block groups in the list can take a really
    1863             :                  * long time.  Prioritize cleaning up unused block groups.
    1864             :                  */
    1865           0 :                 btrfs_delete_unused_bgs(fs_info);
    1866             :                 /*
    1867             :                  * If we are interrupted by a balance, we can just bail out. The
    1868             :                  * cleaner thread will restart it again if necessary.
    1869             :                  */
    1870           0 :                 if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
    1871           0 :                         goto end;
    1872           0 :                 spin_lock(&fs_info->unused_bgs_lock);
    1873             :         }
    1874           0 :         spin_unlock(&fs_info->unused_bgs_lock);
    1875           0 :         mutex_unlock(&fs_info->reclaim_bgs_lock);
    1876           0 : end:
    1877           0 :         btrfs_exclop_finish(fs_info);
    1878           0 :         sb_end_write(fs_info->sb);
    1879             : }
    1880             : 
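                     : /*
                     :  * A standalone userspace sketch (not kernel code) of the trylock pattern
                     :  * the reclaim worker above relies on: drop reclaim_bgs_lock after each
                     :  * block group and continue only if the lock can be retaken immediately,
                     :  * yielding to a concurrent balance.  reclaim_loop() and the pthread mutex
                     :  * are illustrative stand-ins, not btrfs APIs; note pthread_mutex_trylock()
                     :  * returns 0 on success, the opposite of the kernel's mutex_trylock().
                     :  */
                     : #include <pthread.h>
                     : 
                     : static pthread_mutex_t reclaim_lock = PTHREAD_MUTEX_INITIALIZER;
                     : 
                     : static void reclaim_loop(int nr_items)
                     : {
                     :         if (pthread_mutex_trylock(&reclaim_lock))
                     :                 return;                 /* lock busy: bail out */
                     :         while (nr_items--) {
                     :                 /* ... reclaim one block group here ... */
                     :                 pthread_mutex_unlock(&reclaim_lock);
                     :                 /* Give a concurrent balance a chance to take the lock. */
                     :                 if (pthread_mutex_trylock(&reclaim_lock))
                     :                         return;         /* interrupted: bail out */
                     :         }
                     :         pthread_mutex_unlock(&reclaim_lock);
                     : }
                     : 
                     : int main(void)
                     : {
                     :         reclaim_loop(3);
                     :         return 0;
                     : }
                     : 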
    1881           0 : void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
    1882             : {
    1883           0 :         spin_lock(&fs_info->unused_bgs_lock);
    1884           0 :         if (!list_empty(&fs_info->reclaim_bgs))
    1885           0 :                 queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work);
    1886           0 :         spin_unlock(&fs_info->unused_bgs_lock);
    1887           0 : }
    1888             : 
    1889           0 : void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
    1890             : {
    1891           0 :         struct btrfs_fs_info *fs_info = bg->fs_info;
    1892             : 
    1893           0 :         spin_lock(&fs_info->unused_bgs_lock);
    1894           0 :         if (list_empty(&bg->bg_list)) {
    1895           0 :                 btrfs_get_block_group(bg);
    1896           0 :                 trace_btrfs_add_reclaim_block_group(bg);
    1897           0 :                 list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs);
    1898             :         }
    1899           0 :         spin_unlock(&fs_info->unused_bgs_lock);
    1900           0 : }
    1901             : 
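                     : /*
                     :  * A standalone userspace model (not kernel code) of why
                     :  * btrfs_mark_bg_to_reclaim() above is idempotent: the emptiness check,
                     :  * the reference bump and the queueing all happen under one lock.
                     :  * 'struct item' and mark_for_reclaim() are hypothetical stand-ins for
                     :  * the block group and its bg_list linkage.
                     :  */
                     : #include <pthread.h>
                     : #include <stdbool.h>
                     : 
                     : struct item {
                     :         pthread_mutex_t lock;   /* protects 'queued' and the ref count */
                     :         int refs;
                     :         bool queued;            /* stands in for !list_empty(&bg->bg_list) */
                     : };
                     : 
                     : /* Safe to call from any number of threads; the item is queued only once. */
                     : static void mark_for_reclaim(struct item *it)
                     : {
                     :         pthread_mutex_lock(&it->lock);
                     :         if (!it->queued) {
                     :                 it->refs++;     /* the reclaim list owns one reference */
                     :                 it->queued = true;
                     :         }
                     :         pthread_mutex_unlock(&it->lock);
                     : }
                     : 
                     : int main(void)
                     : {
                     :         struct item it = { PTHREAD_MUTEX_INITIALIZER, 0, false };
                     : 
                     :         mark_for_reclaim(&it);
                     :         mark_for_reclaim(&it);  /* no-op: already queued */
                     :         return it.refs == 1 ? 0 : 1;
                     : }
                     : 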
    1902           0 : static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
    1903             :                            struct btrfs_path *path)
    1904             : {
    1905           0 :         struct extent_map_tree *em_tree;
    1906           0 :         struct extent_map *em;
    1907           0 :         struct btrfs_block_group_item bg;
    1908           0 :         struct extent_buffer *leaf;
    1909           0 :         int slot;
    1910           0 :         u64 flags;
    1911           0 :         int ret = 0;
    1912             : 
    1913           0 :         slot = path->slots[0];
    1914           0 :         leaf = path->nodes[0];
    1915             : 
    1916           0 :         em_tree = &fs_info->mapping_tree;
    1917           0 :         read_lock(&em_tree->lock);
    1918           0 :         em = lookup_extent_mapping(em_tree, key->objectid, key->offset);
    1919           0 :         read_unlock(&em_tree->lock);
    1920           0 :         if (!em) {
    1921           0 :                 btrfs_err(fs_info,
    1922             :                           "logical %llu len %llu found bg but no related chunk",
    1923             :                           key->objectid, key->offset);
    1924           0 :                 return -ENOENT;
    1925             :         }
    1926             : 
    1927           0 :         if (em->start != key->objectid || em->len != key->offset) {
    1928           0 :                 btrfs_err(fs_info,
    1929             :                         "block group %llu len %llu mismatch with chunk %llu len %llu",
    1930             :                         key->objectid, key->offset, em->start, em->len);
    1931           0 :                 ret = -EUCLEAN;
    1932           0 :                 goto out_free_em;
    1933             :         }
    1934             : 
    1935           0 :         read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot),
    1936             :                            sizeof(bg));
    1937           0 :         flags = btrfs_stack_block_group_flags(&bg) &
    1938             :                 BTRFS_BLOCK_GROUP_TYPE_MASK;
    1939             : 
    1940           0 :         if (flags != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
    1941           0 :                 btrfs_err(fs_info,
    1942             : "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
    1943             :                           key->objectid, key->offset, flags,
    1944             :                           (BTRFS_BLOCK_GROUP_TYPE_MASK & em->map_lookup->type));
    1945           0 :                 ret = -EUCLEAN;
    1946             :         }
    1947             : 
    1948           0 : out_free_em:
    1949           0 :         free_extent_map(em);
    1950           0 :         return ret;
    1951             : }
    1952             : 
    1953           0 : static int find_first_block_group(struct btrfs_fs_info *fs_info,
    1954             :                                   struct btrfs_path *path,
    1955             :                                   struct btrfs_key *key)
    1956             : {
    1957           0 :         struct btrfs_root *root = btrfs_block_group_root(fs_info);
    1958           0 :         int ret;
    1959           0 :         struct btrfs_key found_key;
    1960             : 
    1961           0 :         btrfs_for_each_slot(root, key, &found_key, path, ret) {
    1962           0 :                 if (found_key.objectid >= key->objectid &&
    1963           0 :                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
    1964           0 :                         return read_bg_from_eb(fs_info, &found_key, path);
    1965             :                 }
    1966             :         }
    1967             :         return ret;
    1968             : }
    1969             : 
    1970           0 : static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
    1971             : {
    1972           0 :         u64 extra_flags = chunk_to_extended(flags) &
    1973             :                                 BTRFS_EXTENDED_PROFILE_MASK;
    1974             : 
    1975           0 :         write_seqlock(&fs_info->profiles_lock);
    1976           0 :         if (flags & BTRFS_BLOCK_GROUP_DATA)
    1977           0 :                 fs_info->avail_data_alloc_bits |= extra_flags;
    1978           0 :         if (flags & BTRFS_BLOCK_GROUP_METADATA)
    1979           0 :                 fs_info->avail_metadata_alloc_bits |= extra_flags;
    1980           0 :         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
    1981           0 :                 fs_info->avail_system_alloc_bits |= extra_flags;
    1982           0 :         write_sequnlock(&fs_info->profiles_lock);
    1983           0 : }
    1984             : 
    1985             : /*
    1986             :  * Map a physical disk address to a list of logical addresses.
    1987             :  *
    1988             :  * @fs_info:       the filesystem
    1989             :  * @chunk_start:   logical address of block group
    1990             :  * @physical:      physical address to map to logical addresses
    1991             :  * @logical:       return array of logical addresses which map to @physical
    1992             :  * @naddrs:        length of @logical
    1993             :  * @stripe_len:    size of IO stripe for the given block group
    1994             :  *
    1995             :  * Maps a particular @physical disk address to a list of @logical addresses.
    1996             :  * Used primarily to exclude those portions of a block group that contain super
    1997             :  * block copies.
    1998             :  */
    1999           0 : int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
    2000             :                      u64 physical, u64 **logical, int *naddrs, int *stripe_len)
    2001             : {
    2002           0 :         struct extent_map *em;
    2003           0 :         struct map_lookup *map;
    2004           0 :         u64 *buf;
    2005           0 :         u64 bytenr;
    2006           0 :         u64 data_stripe_length;
    2007           0 :         u64 io_stripe_size;
    2008           0 :         int i, nr = 0;
    2009           0 :         int ret = 0;
    2010             : 
    2011           0 :         em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
    2012           0 :         if (IS_ERR(em))
    2013             :                 return -EIO;
    2014             : 
    2015           0 :         map = em->map_lookup;
    2016           0 :         data_stripe_length = em->orig_block_len;
    2017           0 :         io_stripe_size = BTRFS_STRIPE_LEN;
    2018           0 :         chunk_start = em->start;
    2019             : 
    2020             :         /* For RAID5/6 adjust to a full IO stripe length */
    2021           0 :         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
    2022           0 :                 io_stripe_size = btrfs_stripe_nr_to_offset(nr_data_stripes(map));
    2023             : 
    2024           0 :         buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
    2025           0 :         if (!buf) {
    2026           0 :                 ret = -ENOMEM;
    2027           0 :                 goto out;
    2028             :         }
    2029             : 
    2030           0 :         for (i = 0; i < map->num_stripes; i++) {
    2031           0 :                 bool already_inserted = false;
    2032           0 :                 u32 stripe_nr;
    2033           0 :                 u32 offset;
    2034           0 :                 int j;
    2035             : 
    2036           0 :                 if (!in_range(physical, map->stripes[i].physical,
    2037             :                               data_stripe_length))
    2038           0 :                         continue;
    2039             : 
    2040           0 :                 stripe_nr = (physical - map->stripes[i].physical) >>
    2041             :                             BTRFS_STRIPE_LEN_SHIFT;
    2042           0 :                 offset = (physical - map->stripes[i].physical) &
    2043             :                          BTRFS_STRIPE_LEN_MASK;
    2044             : 
    2045           0 :                 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
    2046             :                                  BTRFS_BLOCK_GROUP_RAID10))
    2047           0 :                         stripe_nr = div_u64(stripe_nr * map->num_stripes + i,
    2048           0 :                                             map->sub_stripes);
    2049             :                 /*
    2050             :                  * The remaining case is RAID56: no extra adjustment of
    2051             :                  * stripe_nr is needed, since io_stripe_size was already
    2052             :                  * expanded to the full data stripe width above.
    2053             :                  */
    2054           0 :                 bytenr = chunk_start + stripe_nr * io_stripe_size + offset;
    2055             : 
    2056             :                 /* Ensure we don't add duplicate addresses */
    2057           0 :                 for (j = 0; j < nr; j++) {
    2058           0 :                         if (buf[j] == bytenr) {
    2059             :                                 already_inserted = true;
    2060             :                                 break;
    2061             :                         }
    2062             :                 }
    2063             : 
    2064           0 :                 if (!already_inserted)
    2065           0 :                         buf[nr++] = bytenr;
    2066             :         }
    2067             : 
    2068           0 :         *logical = buf;
    2069           0 :         *naddrs = nr;
    2070           0 :         *stripe_len = io_stripe_size;
    2071           0 : out:
    2072           0 :         free_extent_map(em);
    2073           0 :         return ret;
    2074             : }
    2075             : 
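                     : /*
                     :  * A standalone sketch (not kernel code) of the stripe arithmetic
                     :  * btrfs_rmap_block() uses above, assuming the 64KiB BTRFS_STRIPE_LEN of
                     :  * this kernel.  It models only the RAID0/RAID10 rotation branch (DUP and
                     :  * single profiles skip that division); rmap_one() is a hypothetical
                     :  * helper, and the result is chunk-relative rather than absolute.
                     :  */
                     : #include <stdint.h>
                     : #include <stdio.h>
                     : 
                     : #define STRIPE_SHIFT 16                         /* 64KiB stripes */
                     : #define STRIPE_MASK  ((1ULL << STRIPE_SHIFT) - 1)
                     : 
                     : /* Map a physical offset inside stripe copy i to a chunk-relative logical. */
                     : static uint64_t rmap_one(uint64_t physical, uint64_t stripe_phys,
                     :                          uint64_t i, uint64_t num_stripes,
                     :                          uint64_t sub_stripes)
                     : {
                     :         uint64_t stripe_nr = (physical - stripe_phys) >> STRIPE_SHIFT;
                     :         uint64_t offset = (physical - stripe_phys) & STRIPE_MASK;
                     : 
                     :         /* RAID0/RAID10 rotate consecutive stripes across devices. */
                     :         stripe_nr = (stripe_nr * num_stripes + i) / sub_stripes;
                     :         return (stripe_nr << STRIPE_SHIFT) + offset;
                     : }
                     : 
                     : int main(void)
                     : {
                     :         /* RAID0 over 2 devices: 1KiB into physical stripe 3 of device 0
                     :          * maps to 1KiB into logical stripe 6 of the chunk (394240). */
                     :         printf("%llu\n", (unsigned long long)
                     :                rmap_one((3ULL << STRIPE_SHIFT) + 1024, 0, 0, 2, 1));
                     :         return 0;
                     : }
                     : 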
    2076           0 : static int exclude_super_stripes(struct btrfs_block_group *cache)
    2077             : {
    2078           0 :         struct btrfs_fs_info *fs_info = cache->fs_info;
    2079           0 :         const bool zoned = btrfs_is_zoned(fs_info);
    2080           0 :         u64 bytenr;
    2081           0 :         u64 *logical;
    2082           0 :         int stripe_len;
    2083           0 :         int i, nr, ret;
    2084             : 
    2085           0 :         if (cache->start < BTRFS_SUPER_INFO_OFFSET) {
    2086           0 :                 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start;
    2087           0 :                 cache->bytes_super += stripe_len;
    2088           0 :                 ret = btrfs_add_excluded_extent(fs_info, cache->start,
    2089             :                                                 stripe_len);
    2090           0 :                 if (ret)
    2091             :                         return ret;
    2092             :         }
    2093             : 
    2094           0 :         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
    2095           0 :                 bytenr = btrfs_sb_offset(i);
    2096           0 :                 ret = btrfs_rmap_block(fs_info, cache->start,
    2097             :                                        bytenr, &logical, &nr, &stripe_len);
    2098           0 :                 if (ret)
    2099           0 :                         return ret;
    2100             : 
    2101             :                 /* Shouldn't have super stripes in sequential zones */
    2102           0 :                 if (zoned && nr) {
    2103           0 :                         kfree(logical);
    2104           0 :                         btrfs_err(fs_info,
    2105             :                         "zoned: block group %llu must not contain super block",
    2106             :                                   cache->start);
    2107           0 :                         return -EUCLEAN;
    2108             :                 }
    2109             : 
    2110           0 :                 while (nr--) {
    2111           0 :                         u64 len = min_t(u64, stripe_len,
    2112             :                                 cache->start + cache->length - logical[nr]);
    2113             : 
    2114           0 :                         cache->bytes_super += len;
    2115           0 :                         ret = btrfs_add_excluded_extent(fs_info, logical[nr],
    2116             :                                                         len);
    2117           0 :                         if (ret) {
    2118           0 :                                 kfree(logical);
    2119           0 :                                 return ret;
    2120             :                         }
    2121             :                 }
    2122             : 
    2123           0 :                 kfree(logical);
    2124             :         }
    2125             :         return 0;
    2126             : }
    2127             : 
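                     : /*
                     :  * A userspace model of btrfs_sb_offset(), which exclude_super_stripes()
                     :  * above consults: in this on-disk format the primary super block sits at
                     :  * BTRFS_SUPER_INFO_OFFSET (64KiB) and mirror i at 16KiB << (12 * i),
                     :  * i.e. 64MiB and 256GiB.  sb_offset() is an illustrative stand-in.
                     :  */
                     : #include <stdint.h>
                     : #include <stdio.h>
                     : 
                     : static uint64_t sb_offset(int mirror)
                     : {
                     :         if (mirror)
                     :                 return 16384ULL << (12 * mirror);
                     :         return 65536ULL;        /* primary super block at 64KiB */
                     : }
                     : 
                     : int main(void)
                     : {
                     :         for (int i = 0; i < 3; i++)     /* 64KiB, 64MiB, 256GiB */
                     :                 printf("super copy %d at byte %llu\n", i,
                     :                        (unsigned long long)sb_offset(i));
                     :         return 0;
                     : }
                     : 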
    2128           0 : static struct btrfs_block_group *btrfs_create_block_group_cache(
    2129             :                 struct btrfs_fs_info *fs_info, u64 start)
    2130             : {
    2131           0 :         struct btrfs_block_group *cache;
    2132             : 
    2133           0 :         cache = kzalloc(sizeof(*cache), GFP_NOFS);
    2134           0 :         if (!cache)
    2135             :                 return NULL;
    2136             : 
    2137           0 :         cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
    2138             :                                         GFP_NOFS);
    2139           0 :         if (!cache->free_space_ctl) {
    2140           0 :                 kfree(cache);
    2141           0 :                 return NULL;
    2142             :         }
    2143             : 
    2144           0 :         cache->start = start;
    2145             : 
    2146           0 :         cache->fs_info = fs_info;
    2147           0 :         cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
    2148             : 
    2149           0 :         cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
    2150             : 
    2151           0 :         refcount_set(&cache->refs, 1);
    2152           0 :         spin_lock_init(&cache->lock);
    2153           0 :         init_rwsem(&cache->data_rwsem);
    2154           0 :         INIT_LIST_HEAD(&cache->list);
    2155           0 :         INIT_LIST_HEAD(&cache->cluster_list);
    2156           0 :         INIT_LIST_HEAD(&cache->bg_list);
    2157           0 :         INIT_LIST_HEAD(&cache->ro_list);
    2158           0 :         INIT_LIST_HEAD(&cache->discard_list);
    2159           0 :         INIT_LIST_HEAD(&cache->dirty_list);
    2160           0 :         INIT_LIST_HEAD(&cache->io_list);
    2161           0 :         INIT_LIST_HEAD(&cache->active_bg_list);
    2162           0 :         btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
    2163           0 :         atomic_set(&cache->frozen, 0);
    2164           0 :         mutex_init(&cache->free_space_lock);
    2165             : 
    2166           0 :         return cache;
    2167             : }
    2168             : 
    2169             : /*
    2170             :  * Iterate all chunks and verify that each of them has the corresponding block
    2171             :  * group
    2172             :  */
    2173           0 : static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
    2174             : {
    2175           0 :         struct extent_map_tree *map_tree = &fs_info->mapping_tree;
    2176           0 :         struct extent_map *em;
    2177           0 :         struct btrfs_block_group *bg;
    2178           0 :         u64 start = 0;
    2179           0 :         int ret = 0;
    2180             : 
    2181           0 :         while (1) {
    2182           0 :                 read_lock(&map_tree->lock);
    2183             :                 /*
    2184             :                  * lookup_extent_mapping will return the first extent map
    2185             :                  * intersecting the range, so setting @len to 1 is enough to
    2186             :                  * get the first chunk.
    2187             :                  */
    2188           0 :                 em = lookup_extent_mapping(map_tree, start, 1);
    2189           0 :                 read_unlock(&map_tree->lock);
    2190           0 :                 if (!em)
    2191             :                         break;
    2192             : 
    2193           0 :                 bg = btrfs_lookup_block_group(fs_info, em->start);
    2194           0 :                 if (!bg) {
    2195           0 :                         btrfs_err(fs_info,
    2196             :         "chunk start=%llu len=%llu doesn't have corresponding block group",
    2197             :                                      em->start, em->len);
    2198           0 :                         ret = -EUCLEAN;
    2199           0 :                         free_extent_map(em);
    2200           0 :                         break;
    2201             :                 }
    2202           0 :                 if (bg->start != em->start || bg->length != em->len ||
    2203           0 :                     (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
    2204           0 :                     (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
    2205           0 :                         btrfs_err(fs_info,
    2206             : "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
    2207             :                                 em->start, em->len,
    2208             :                                 em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
    2209             :                                 bg->start, bg->length,
    2210             :                                 bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
    2211           0 :                         ret = -EUCLEAN;
    2212           0 :                         free_extent_map(em);
    2213           0 :                         btrfs_put_block_group(bg);
    2214           0 :                         break;
    2215             :                 }
    2216           0 :                 start = em->start + em->len;
    2217           0 :                 free_extent_map(em);
    2218           0 :                 btrfs_put_block_group(bg);
    2219             :         }
    2220           0 :         return ret;
    2221             : }
    2222             : 
    2223           0 : static int read_one_block_group(struct btrfs_fs_info *info,
    2224             :                                 struct btrfs_block_group_item *bgi,
    2225             :                                 const struct btrfs_key *key,
    2226             :                                 int need_clear)
    2227             : {
    2228           0 :         struct btrfs_block_group *cache;
    2229           0 :         const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS);
    2230           0 :         int ret;
    2231             : 
    2232           0 :         ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY);
    2233             : 
    2234           0 :         cache = btrfs_create_block_group_cache(info, key->objectid);
    2235           0 :         if (!cache)
    2236             :                 return -ENOMEM;
    2237             : 
    2238           0 :         cache->length = key->offset;
    2239           0 :         cache->used = btrfs_stack_block_group_used(bgi);
    2240           0 :         cache->commit_used = cache->used;
    2241           0 :         cache->flags = btrfs_stack_block_group_flags(bgi);
    2242           0 :         cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi);
    2243             : 
    2244           0 :         set_free_space_tree_thresholds(cache);
    2245             : 
    2246           0 :         if (need_clear) {
    2247             :                 /*
    2248             :                  * When we mount with an old space cache, we need to
    2249             :                  * set BTRFS_DC_CLEAR and set the dirty flag.
    2250             :                  *
    2251             :                  * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
    2252             :                  *    truncate the old free space cache inode and
    2253             :                  *    setup a new one.
    2254             :                  * b) Setting 'dirty flag' makes sure that we flush
    2255             :                  *    the new space cache info onto disk.
    2256             :                  */
    2257           0 :                 if (btrfs_test_opt(info, SPACE_CACHE))
    2258           0 :                         cache->disk_cache_state = BTRFS_DC_CLEAR;
    2259             :         }
    2260           0 :         if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
    2261             :             (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
    2262           0 :                 btrfs_err(info,
    2263             : "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
    2264             :                           cache->start);
    2265           0 :                 ret = -EINVAL;
    2266           0 :                 goto error;
    2267             :         }
    2268             : 
    2269           0 :         ret = btrfs_load_block_group_zone_info(cache, false);
    2270           0 :         if (ret) {
    2271           0 :                 btrfs_err(info, "zoned: failed to load zone info of bg %llu",
    2272             :                           cache->start);
    2273           0 :                 goto error;
    2274             :         }
    2275             : 
    2276             :         /*
    2277             :          * We need to exclude the super stripes now so that the space info has
    2278             :          * super bytes accounted for, otherwise we'll think we have more space
    2279             :          * than we actually do.
    2280             :          */
    2281           0 :         ret = exclude_super_stripes(cache);
    2282           0 :         if (ret) {
    2283             :                 /* We may have excluded something, so call this just in case. */
    2284           0 :                 btrfs_free_excluded_extents(cache);
    2285           0 :                 goto error;
    2286             :         }
    2287             : 
    2288             :         /*
    2289             :          * For a zoned filesystem, space after the allocation offset is the only
    2290             :          * free space for a block group. So, we don't need any caching work.
    2291             :          * btrfs_calc_zone_unusable() will set the amount of free space and
    2292             :          * zone_unusable space.
    2293             :          *
    2294             :          * For a regular filesystem, check two cases: either we are full, and
    2295             :          * therefore don't need to bother with the caching work since we won't
    2296             :          * find any space, or we are empty, and we can just add all the space
    2297             :          * in and be done with it.  This saves us _a_lot_ of time, particularly
    2298             :          * in the full case.
    2299             :          */
    2300           0 :         if (btrfs_is_zoned(info)) {
    2301           0 :                 btrfs_calc_zone_unusable(cache);
    2302             :                 /* Should not have any excluded extents. Just in case, though. */
    2303           0 :                 btrfs_free_excluded_extents(cache);
    2304           0 :         } else if (cache->length == cache->used) {
    2305           0 :                 cache->cached = BTRFS_CACHE_FINISHED;
    2306           0 :                 btrfs_free_excluded_extents(cache);
    2307           0 :         } else if (cache->used == 0) {
    2308           0 :                 cache->cached = BTRFS_CACHE_FINISHED;
    2309           0 :                 ret = add_new_free_space(cache, cache->start,
    2310             :                                          cache->start + cache->length, NULL);
    2311           0 :                 btrfs_free_excluded_extents(cache);
    2312           0 :                 if (ret)
    2313           0 :                         goto error;
    2314             :         }
    2315             : 
    2316           0 :         ret = btrfs_add_block_group_cache(info, cache);
    2317           0 :         if (ret) {
    2318           0 :                 btrfs_remove_free_space_cache(cache);
    2319           0 :                 goto error;
    2320             :         }
    2321           0 :         trace_btrfs_add_block_group(info, cache, 0);
    2322           0 :         btrfs_add_bg_to_space_info(info, cache);
    2323             : 
    2324           0 :         set_avail_alloc_bits(info, cache->flags);
    2325           0 :         if (btrfs_chunk_writeable(info, cache->start)) {
    2326           0 :                 if (cache->used == 0) {
    2327           0 :                         ASSERT(list_empty(&cache->bg_list));
    2328           0 :                         if (btrfs_test_opt(info, DISCARD_ASYNC))
    2329           0 :                                 btrfs_discard_queue_work(&info->discard_ctl, cache);
    2330             :                         else
    2331           0 :                                 btrfs_mark_bg_unused(cache);
    2332             :                 }
    2333             :         } else {
    2334           0 :                 inc_block_group_ro(cache, 1);
    2335             :         }
    2336             : 
    2337             :         return 0;
    2338           0 : error:
    2339           0 :         btrfs_put_block_group(cache);
    2340           0 :         return ret;
    2341             : }
    2342             : 
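                     : /*
                     :  * A sketch of the caching fast paths described in the comment inside
                     :  * read_one_block_group() above: zoned, completely full and completely
                     :  * empty block groups can be marked cached immediately.  The enum and
                     :  * initial_cache_state() are hypothetical simplifications, not kernel
                     :  * types.
                     :  */
                     : #include <stdint.h>
                     : #include <stdio.h>
                     : 
                     : enum cache_state { CACHE_NEEDED, CACHE_FINISHED };
                     : 
                     : static enum cache_state initial_cache_state(uint64_t length, uint64_t used,
                     :                                             int zoned)
                     : {
                     :         if (zoned || used == length || used == 0)
                     :                 return CACHE_FINISHED;  /* nothing left to discover later */
                     :         return CACHE_NEEDED;            /* needs the real caching work */
                     : }
                     : 
                     : int main(void)
                     : {
                     :         printf("%d %d %d\n",
                     :                initial_cache_state(1024, 1024, 0),  /* full  -> FINISHED */
                     :                initial_cache_state(1024, 0, 0),     /* empty -> FINISHED */
                     :                initial_cache_state(1024, 512, 0));  /* needs caching */
                     :         return 0;
                     : }
                     : 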
    2343           0 : static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
    2344             : {
    2345           0 :         struct extent_map_tree *em_tree = &fs_info->mapping_tree;
    2346           0 :         struct rb_node *node;
    2347           0 :         int ret = 0;
    2348             : 
    2349           0 :         for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
    2350           0 :                 struct extent_map *em;
    2351           0 :                 struct map_lookup *map;
    2352           0 :                 struct btrfs_block_group *bg;
    2353             : 
    2354           0 :                 em = rb_entry(node, struct extent_map, rb_node);
    2355           0 :                 map = em->map_lookup;
    2356           0 :                 bg = btrfs_create_block_group_cache(fs_info, em->start);
    2357           0 :                 if (!bg) {
    2358             :                         ret = -ENOMEM;
    2359             :                         break;
    2360             :                 }
    2361             : 
    2362             :                 /* Fill dummy cache as FULL */
    2363           0 :                 bg->length = em->len;
    2364           0 :                 bg->flags = map->type;
    2365           0 :                 bg->cached = BTRFS_CACHE_FINISHED;
    2366           0 :                 bg->used = em->len;
    2368           0 :                 ret = btrfs_add_block_group_cache(fs_info, bg);
    2369             :                 /*
    2370             :                  * We may have some valid block group cache added already, in
    2371             :                  * that case we skip to the next one.
    2372             :                  */
    2373           0 :                 if (ret == -EEXIST) {
    2374           0 :                         ret = 0;
    2375           0 :                         btrfs_put_block_group(bg);
    2376           0 :                         continue;
    2377             :                 }
    2378             : 
    2379           0 :                 if (ret) {
    2380           0 :                         btrfs_remove_free_space_cache(bg);
    2381           0 :                         btrfs_put_block_group(bg);
    2382           0 :                         break;
    2383             :                 }
    2384             : 
    2385           0 :                 btrfs_add_bg_to_space_info(fs_info, bg);
    2386             : 
    2387           0 :                 set_avail_alloc_bits(fs_info, bg->flags);
    2388             :         }
    2389           0 :         if (!ret)
    2390           0 :                 btrfs_init_global_block_rsv(fs_info);
    2391           0 :         return ret;
    2392             : }
    2393             : 
    2394           0 : int btrfs_read_block_groups(struct btrfs_fs_info *info)
    2395             : {
    2396           0 :         struct btrfs_root *root = btrfs_block_group_root(info);
    2397           0 :         struct btrfs_path *path;
    2398           0 :         int ret;
    2399           0 :         struct btrfs_block_group *cache;
    2400           0 :         struct btrfs_space_info *space_info;
    2401           0 :         struct btrfs_key key;
    2402           0 :         int need_clear = 0;
    2403           0 :         u64 cache_gen;
    2404             : 
    2405             :         /*
    2406             :          * Either no extent root (with ibadroots rescue option) or we have
    2407             :          * unsupported RO options. The fs can never be mounted read-write, so no
    2408             :          * need to waste time searching block group items.
    2409             :          *
    2410             :          * This also allows new extent tree related changes to be RO compat,
    2411             :          * no need for a full incompat flag.
    2412             :          */
    2413           0 :         if (!root || (btrfs_super_compat_ro_flags(info->super_copy) &
    2414             :                       ~BTRFS_FEATURE_COMPAT_RO_SUPP))
    2415           0 :                 return fill_dummy_bgs(info);
    2416             : 
    2417           0 :         key.objectid = 0;
    2418           0 :         key.offset = 0;
    2419           0 :         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
    2420           0 :         path = btrfs_alloc_path();
    2421           0 :         if (!path)
    2422             :                 return -ENOMEM;
    2423             : 
    2424           0 :         cache_gen = btrfs_super_cache_generation(info->super_copy);
    2425           0 :         if (btrfs_test_opt(info, SPACE_CACHE) &&
    2426             :             btrfs_super_generation(info->super_copy) != cache_gen)
    2427           0 :                 need_clear = 1;
    2428           0 :         if (btrfs_test_opt(info, CLEAR_CACHE))
    2429           0 :                 need_clear = 1;
    2430             : 
    2431           0 :         while (1) {
    2432           0 :                 struct btrfs_block_group_item bgi;
    2433           0 :                 struct extent_buffer *leaf;
    2434           0 :                 int slot;
    2435             : 
    2436           0 :                 ret = find_first_block_group(info, path, &key);
    2437           0 :                 if (ret > 0)
    2438             :                         break;
    2439           0 :                 if (ret != 0)
    2440           0 :                         goto error;
    2441             : 
    2442           0 :                 leaf = path->nodes[0];
    2443           0 :                 slot = path->slots[0];
    2444             : 
    2445           0 :                 read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
    2446             :                                    sizeof(bgi));
    2447             : 
    2448           0 :                 btrfs_item_key_to_cpu(leaf, &key, slot);
    2449           0 :                 btrfs_release_path(path);
    2450           0 :                 ret = read_one_block_group(info, &bgi, &key, need_clear);
    2451           0 :                 if (ret < 0)
    2452           0 :                         goto error;
    2453           0 :                 key.objectid += key.offset;
    2454           0 :                 key.offset = 0;
    2455             :         }
    2456           0 :         btrfs_release_path(path);
    2457             : 
    2458           0 :         list_for_each_entry(space_info, &info->space_info, list) {
    2459             :                 int i;
    2460             : 
    2461           0 :                 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
    2462           0 :                         if (list_empty(&space_info->block_groups[i]))
    2463           0 :                                 continue;
    2464           0 :                         cache = list_first_entry(&space_info->block_groups[i],
    2465             :                                                  struct btrfs_block_group,
    2466             :                                                  list);
    2467           0 :                         btrfs_sysfs_add_block_group_type(cache);
    2468             :                 }
    2469             : 
    2470           0 :                 if (!(btrfs_get_alloc_profile(info, space_info->flags) &
    2471             :                       (BTRFS_BLOCK_GROUP_RAID10 |
    2472             :                        BTRFS_BLOCK_GROUP_RAID1_MASK |
    2473             :                        BTRFS_BLOCK_GROUP_RAID56_MASK |
    2474             :                        BTRFS_BLOCK_GROUP_DUP)))
    2475           0 :                         continue;
    2476             :                 /*
    2477             :                  * Avoid allocating from un-mirrored block groups if there are
    2478             :                  * mirrored block groups.
    2479             :                  */
    2480           0 :                 list_for_each_entry(cache,
    2481             :                                 &space_info->block_groups[BTRFS_RAID_RAID0],
    2482             :                                 list)
    2483           0 :                         inc_block_group_ro(cache, 1);
    2484           0 :                 list_for_each_entry(cache,
    2485             :                                 &space_info->block_groups[BTRFS_RAID_SINGLE],
    2486             :                                 list)
    2487           0 :                         inc_block_group_ro(cache, 1);
    2488             :         }
    2489             : 
    2490           0 :         btrfs_init_global_block_rsv(info);
    2491           0 :         ret = check_chunk_block_group_mappings(info);
    2492           0 : error:
    2493           0 :         btrfs_free_path(path);
    2494             :         /*
    2495             :          * We've hit some error while reading the extent tree, and have the
    2496             :          * rescue=ibadroots mount option set.
    2497             :          * Try to fill the tree using dummy block groups so that the user can
    2498             :          * continue to mount and grab their data.
    2499             :          */
    2500           0 :         if (ret && btrfs_test_opt(info, IGNOREBADROOTS))
    2501           0 :                 ret = fill_dummy_bgs(info);
    2502             :         return ret;
    2503             : }
    2504             : 
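                     : /*
                     :  * A model of the search-cursor advance in the read loop above: block
                     :  * group items are keyed (objectid = start, type, offset = length), so the
                     :  * next search key is objectid + offset.  The (start, length) pairs below
                     :  * are made up for illustration.
                     :  */
                     : #include <stdint.h>
                     : #include <stdio.h>
                     : 
                     : struct bg_key { uint64_t objectid, offset; };   /* start, length */
                     : 
                     : int main(void)
                     : {
                     :         static const struct bg_key items[] = {
                     :                 { 13631488, 8388608 }, { 22020096, 1073741824 },
                     :         };
                     :         uint64_t next = 0;      /* next key.objectid to search for */
                     : 
                     :         for (size_t i = 0; i < sizeof(items) / sizeof(items[0]); i++) {
                     :                 if (items[i].objectid < next)
                     :                         continue;       /* already behind the cursor */
                     :                 printf("bg at %llu, len %llu\n",
                     :                        (unsigned long long)items[i].objectid,
                     :                        (unsigned long long)items[i].offset);
                     :                 next = items[i].objectid + items[i].offset;
                     :         }
                     :         return 0;
                     : }
                     : 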
    2505             : /*
    2506             :  * This function, insert_block_group_item(), belongs to phase 2 of chunk
    2507             :  * allocation.
    2508             :  *
    2509             :  * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
    2510             :  * phases.
    2511             :  */
    2512           0 : static int insert_block_group_item(struct btrfs_trans_handle *trans,
    2513             :                                    struct btrfs_block_group *block_group)
    2514             : {
    2515           0 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    2516           0 :         struct btrfs_block_group_item bgi;
    2517           0 :         struct btrfs_root *root = btrfs_block_group_root(fs_info);
    2518           0 :         struct btrfs_key key;
    2519           0 :         u64 old_commit_used;
    2520           0 :         int ret;
    2521             : 
    2522           0 :         spin_lock(&block_group->lock);
    2523           0 :         btrfs_set_stack_block_group_used(&bgi, block_group->used);
    2524           0 :         btrfs_set_stack_block_group_chunk_objectid(&bgi,
    2525             :                                                    block_group->global_root_id);
    2526           0 :         btrfs_set_stack_block_group_flags(&bgi, block_group->flags);
    2527           0 :         old_commit_used = block_group->commit_used;
    2528           0 :         block_group->commit_used = block_group->used;
    2529           0 :         key.objectid = block_group->start;
    2530           0 :         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
    2531           0 :         key.offset = block_group->length;
    2532           0 :         spin_unlock(&block_group->lock);
    2533             : 
    2534           0 :         ret = btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
    2535           0 :         if (ret < 0) {
    2536           0 :                 spin_lock(&block_group->lock);
    2537           0 :                 block_group->commit_used = old_commit_used;
    2538           0 :                 spin_unlock(&block_group->lock);
    2539             :         }
    2540             : 
    2541           0 :         return ret;
    2542             : }
    2543             : 
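                     : /*
                     :  * A userspace sketch of the optimistic-update-with-rollback pattern
                     :  * insert_block_group_item() above applies to commit_used.  try_insert()
                     :  * and persist_group() are hypothetical stand-ins; try_insert() simulates
                     :  * a failed btrfs_insert_item().
                     :  */
                     : #include <pthread.h>
                     : #include <stdint.h>
                     : 
                     : struct group {
                     :         pthread_mutex_t lock;
                     :         uint64_t used, commit_used;
                     : };
                     : 
                     : static int try_insert(const struct group *g)
                     : {
                     :         (void)g;
                     :         return -1;              /* simulate an insertion failure */
                     : }
                     : 
                     : static int persist_group(struct group *g)
                     : {
                     :         uint64_t old_commit_used;
                     :         int ret;
                     : 
                     :         pthread_mutex_lock(&g->lock);
                     :         old_commit_used = g->commit_used;
                     :         g->commit_used = g->used;       /* optimistic update */
                     :         pthread_mutex_unlock(&g->lock);
                     : 
                     :         ret = try_insert(g);
                     :         if (ret < 0) {
                     :                 pthread_mutex_lock(&g->lock);
                     :                 g->commit_used = old_commit_used;       /* roll back */
                     :                 pthread_mutex_unlock(&g->lock);
                     :         }
                     :         return ret;
                     : }
                     : 
                     : int main(void)
                     : {
                     :         struct group g = { PTHREAD_MUTEX_INITIALIZER, 4096, 0 };
                     : 
                     :         persist_group(&g);      /* fails, so the update is rolled back */
                     :         return g.commit_used == 0 ? 0 : 1;
                     : }
                     : 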
    2544           0 : static int insert_dev_extent(struct btrfs_trans_handle *trans,
    2545             :                             struct btrfs_device *device, u64 chunk_offset,
    2546             :                             u64 start, u64 num_bytes)
    2547             : {
    2548           0 :         struct btrfs_fs_info *fs_info = device->fs_info;
    2549           0 :         struct btrfs_root *root = fs_info->dev_root;
    2550           0 :         struct btrfs_path *path;
    2551           0 :         struct btrfs_dev_extent *extent;
    2552           0 :         struct extent_buffer *leaf;
    2553           0 :         struct btrfs_key key;
    2554           0 :         int ret;
    2555             : 
    2556           0 :         WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
    2557           0 :         WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
    2558           0 :         path = btrfs_alloc_path();
    2559           0 :         if (!path)
    2560             :                 return -ENOMEM;
    2561             : 
    2562           0 :         key.objectid = device->devid;
    2563           0 :         key.type = BTRFS_DEV_EXTENT_KEY;
    2564           0 :         key.offset = start;
    2565           0 :         ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent));
    2566           0 :         if (ret)
    2567           0 :                 goto out;
    2568             : 
    2569           0 :         leaf = path->nodes[0];
    2570           0 :         extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
    2571           0 :         btrfs_set_dev_extent_chunk_tree(leaf, extent, BTRFS_CHUNK_TREE_OBJECTID);
    2572           0 :         btrfs_set_dev_extent_chunk_objectid(leaf, extent,
    2573             :                                             BTRFS_FIRST_CHUNK_TREE_OBJECTID);
    2574           0 :         btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
    2575             : 
    2576           0 :         btrfs_set_dev_extent_length(leaf, extent, num_bytes);
    2577           0 :         btrfs_mark_buffer_dirty(leaf);
    2578           0 : out:
    2579           0 :         btrfs_free_path(path);
    2580           0 :         return ret;
    2581             : }
    2582             : 
    2583             : /*
    2584             :  * This function belongs to phase 2.
    2585             :  *
    2586             :  * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
    2587             :  * phases.
    2588             :  */
    2589           0 : static int insert_dev_extents(struct btrfs_trans_handle *trans,
    2590             :                                    u64 chunk_offset, u64 chunk_size)
    2591             : {
    2592           0 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    2593           0 :         struct btrfs_device *device;
    2594           0 :         struct extent_map *em;
    2595           0 :         struct map_lookup *map;
    2596           0 :         u64 dev_offset;
    2597           0 :         u64 stripe_size;
    2598           0 :         int i;
    2599           0 :         int ret = 0;
    2600             : 
    2601           0 :         em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
    2602           0 :         if (IS_ERR(em))
    2603           0 :                 return PTR_ERR(em);
    2604             : 
    2605           0 :         map = em->map_lookup;
    2606           0 :         stripe_size = em->orig_block_len;
    2607             : 
    2608             :         /*
    2609             :          * Take the device list mutex to prevent races with the final phase of
    2610             :          * a device replace operation that replaces the device object associated
    2611             :          * with the map's stripes, because the device object's id can change
    2612             :          * at any time during that final phase of the device replace operation
    2613             :          * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
    2614             :          * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
    2615             :          * resulting in persisting a device extent item with such ID.
    2616             :          */
    2617           0 :         mutex_lock(&fs_info->fs_devices->device_list_mutex);
    2618           0 :         for (i = 0; i < map->num_stripes; i++) {
    2619           0 :                 device = map->stripes[i].dev;
    2620           0 :                 dev_offset = map->stripes[i].physical;
    2621             : 
    2622           0 :                 ret = insert_dev_extent(trans, device, chunk_offset, dev_offset,
    2623             :                                        stripe_size);
    2624           0 :                 if (ret)
    2625             :                         break;
    2626             :         }
    2627           0 :         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
    2628             : 
    2629           0 :         free_extent_map(em);
    2630           0 :         return ret;
    2631             : }
    2632             : 
    2633             : /*
    2634             :  * This function, btrfs_create_pending_block_groups(), belongs to phase 2 of
    2635             :  * chunk allocation.
    2636             :  *
    2637             :  * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
    2638             :  * phases.
    2639             :  */
    2640           0 : void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
    2641             : {
    2642           0 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    2643           0 :         struct btrfs_block_group *block_group;
    2644           0 :         int ret = 0;
    2645             : 
    2646           0 :         while (!list_empty(&trans->new_bgs)) {
    2647           0 :                 int index;
    2648             : 
    2649           0 :                 block_group = list_first_entry(&trans->new_bgs,
    2650             :                                                struct btrfs_block_group,
    2651             :                                                bg_list);
    2652           0 :                 if (ret)
    2653           0 :                         goto next;
    2654             : 
    2655           0 :                 index = btrfs_bg_flags_to_raid_index(block_group->flags);
    2656             : 
    2657           0 :                 ret = insert_block_group_item(trans, block_group);
    2658           0 :                 if (ret)
    2659           0 :                         btrfs_abort_transaction(trans, ret);
    2660           0 :                 if (!test_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED,
    2661             :                               &block_group->runtime_flags)) {
    2662           0 :                         mutex_lock(&fs_info->chunk_mutex);
    2663           0 :                         ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group);
    2664           0 :                         mutex_unlock(&fs_info->chunk_mutex);
    2665           0 :                         if (ret)
    2666           0 :                                 btrfs_abort_transaction(trans, ret);
    2667             :                 }
    2668           0 :                 ret = insert_dev_extents(trans, block_group->start,
    2669             :                                          block_group->length);
    2670           0 :                 if (ret)
    2671           0 :                         btrfs_abort_transaction(trans, ret);
    2672           0 :                 add_block_group_free_space(trans, block_group);
    2673             : 
    2674             :                 /*
    2675             :                  * If we restriped during balance, we may have added a new raid
    2676             :                  * type, so now add the sysfs entries when it is safe to do so.
    2677             :                  * We don't have to worry about locking here as it's handled in
    2678             :                  * btrfs_sysfs_add_block_group_type.
    2679             :                  */
    2680           0 :                 if (block_group->space_info->block_group_kobjs[index] == NULL)
    2681           0 :                         btrfs_sysfs_add_block_group_type(block_group);
    2682             : 
    2683             :                 /* Already aborted the transaction if it failed. */
    2684           0 : next:
    2685           0 :                 btrfs_delayed_refs_rsv_release(fs_info, 1);
    2686           0 :                 list_del_init(&block_group->bg_list);
    2687           0 :                 clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags);
    2688             :         }
    2689           0 :         btrfs_trans_release_chunk_metadata(trans);
    2690           0 : }
    2691             : 
    2692             : /*
    2693             :  * For extent tree v2 we use the block_group_item->chunk_offset to point at our
    2694             :  * global root id.  For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID.
    2695             :  */
    2696             : static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset)
    2697             : {
    2698           0 :         u64 div = SZ_1G;
    2699           0 :         u64 index;
    2700             : 
    2701           0 :         if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
    2702             :                 return BTRFS_FIRST_CHUNK_TREE_OBJECTID;
    2703             : 
    2704             :         /* For a smaller fs (10GiB or less), index based on 128MiB units. */
    2705           0 :         if (btrfs_super_total_bytes(fs_info->super_copy) <= (SZ_1G * 10ULL))
    2706           0 :                 div = SZ_128M;
    2707             : 
    2708           0 :         offset = div64_u64(offset, div);
    2709           0 :         div64_u64_rem(offset, fs_info->nr_global_roots, &index);
    2710           0 :         return index;
    2711             : }
    2712             : 
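                     : /*
                     :  * A userspace model of calculate_global_root_id() above for the
                     :  * extent-tree-v2 case: index = (offset / div) % nr_global_roots, with
                     :  * div = 1GiB, or 128MiB for filesystems of 10GiB or less.
                     :  * global_root_id() is an illustrative stand-in.
                     :  */
                     : #include <stdint.h>
                     : #include <stdio.h>
                     : 
                     : static uint64_t global_root_id(uint64_t total_bytes, uint64_t offset,
                     :                                uint64_t nr_global_roots)
                     : {
                     :         uint64_t div = 1ULL << 30;              /* SZ_1G */
                     : 
                     :         if (total_bytes <= 10ULL << 30)         /* small fs: 128MiB units */
                     :                 div = 128ULL << 20;             /* SZ_128M */
                     :         return (offset / div) % nr_global_roots;
                     : }
                     : 
                     : int main(void)
                     : {
                     :         /* 100GiB fs, chunk at 5GiB, 4 global roots: (5 % 4) = 1. */
                     :         printf("%llu\n", (unsigned long long)
                     :                global_root_id(100ULL << 30, 5ULL << 30, 4));
                     :         return 0;
                     : }
                     : 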
    2713           0 : struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
    2714             :                                                  u64 type,
    2715             :                                                  u64 chunk_offset, u64 size)
    2716             : {
    2717           0 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    2718           0 :         struct btrfs_block_group *cache;
    2719           0 :         int ret;
    2720             : 
    2721           0 :         btrfs_set_log_full_commit(trans);
    2722             : 
    2723           0 :         cache = btrfs_create_block_group_cache(fs_info, chunk_offset);
    2724           0 :         if (!cache)
    2725             :                 return ERR_PTR(-ENOMEM);
    2726             : 
    2727             :         /*
    2728             :          * Mark it as new before adding it to the rbtree of block groups or any
    2729             :          * list, so that no other task finds it and calls btrfs_mark_bg_unused()
    2730             :          * before the new flag is set.
    2731             :          */
    2732           0 :         set_bit(BLOCK_GROUP_FLAG_NEW, &cache->runtime_flags);
    2733             : 
    2734           0 :         cache->length = size;
    2735           0 :         set_free_space_tree_thresholds(cache);
    2736           0 :         cache->flags = type;
    2737           0 :         cache->cached = BTRFS_CACHE_FINISHED;
    2738           0 :         cache->global_root_id = calculate_global_root_id(fs_info, cache->start);
    2739             : 
    2740           0 :         if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
    2741           0 :                 set_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &cache->runtime_flags);
    2742             : 
    2743           0 :         ret = btrfs_load_block_group_zone_info(cache, true);
    2744           0 :         if (ret) {
    2745           0 :                 btrfs_put_block_group(cache);
    2746           0 :                 return ERR_PTR(ret);
    2747             :         }
    2748             : 
    2749           0 :         ret = exclude_super_stripes(cache);
    2750           0 :         if (ret) {
    2751             :                 /* We may have excluded something, so call this just in case */
    2752           0 :                 btrfs_free_excluded_extents(cache);
    2753           0 :                 btrfs_put_block_group(cache);
    2754           0 :                 return ERR_PTR(ret);
    2755             :         }
    2756             : 
    2757           0 :         ret = add_new_free_space(cache, chunk_offset, chunk_offset + size, NULL);
    2758           0 :         btrfs_free_excluded_extents(cache);
    2759           0 :         if (ret) {
    2760           0 :                 btrfs_put_block_group(cache);
    2761           0 :                 return ERR_PTR(ret);
    2762             :         }
    2763             : 
    2764             :         /*
    2765             :          * Ensure the corresponding space_info object is created and
    2766             :          * assigned to our block group. We want our bg to be added to the rbtree
    2767             :          * with its ->space_info set.
    2768             :          */
    2769           0 :         cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
    2770           0 :         ASSERT(cache->space_info);
    2771             : 
    2772           0 :         ret = btrfs_add_block_group_cache(fs_info, cache);
    2773           0 :         if (ret) {
    2774           0 :                 btrfs_remove_free_space_cache(cache);
    2775           0 :                 btrfs_put_block_group(cache);
    2776           0 :                 return ERR_PTR(ret);
    2777             :         }
    2778             : 
    2779             :         /*
    2780             :          * Now that our block group has its ->space_info set and is inserted in
    2781             :          * the rbtree, update the space info's counters.
    2782             :          */
    2783           0 :         trace_btrfs_add_block_group(fs_info, cache, 1);
    2784           0 :         btrfs_add_bg_to_space_info(fs_info, cache);
    2785           0 :         btrfs_update_global_block_rsv(fs_info);
    2786             : 
    2787             : #ifdef CONFIG_BTRFS_DEBUG
    2788             :         if (btrfs_should_fragment_free_space(cache)) {
    2789             :                 cache->space_info->bytes_used += size >> 1;
    2790             :                 fragment_free_space(cache);
    2791             :         }
    2792             : #endif
    2793             : 
    2794           0 :         list_add_tail(&cache->bg_list, &trans->new_bgs);
    2795           0 :         trans->delayed_ref_updates++;
    2796           0 :         btrfs_update_delayed_refs_rsv(trans);
    2797             : 
    2798           0 :         set_avail_alloc_bits(fs_info, type);
    2799           0 :         return cache;
    2800             : }
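
Editor's note: the "mark it as new before adding it to the rbtree" comment near the top of this function is the usual publish-after-init ordering: set every field that a concurrent reader may act on (here, the NEW runtime flag that btrfs_mark_bg_unused() checks) before linking the object into any shared structure. A minimal sketch of the pattern follows; the object, flag, lock, and list names are all hypothetical, not part of this file:

        /* Editor's sketch of publish-after-init; all names are hypothetical. */
        obj = alloc_object();
        if (!obj)
                return -ENOMEM;
        set_bit(OBJ_FLAG_NEW, &obj->flags);      /* init what readers check... */
        spin_lock(&shared_lock);
        list_add_tail(&obj->list, &shared_list); /* ...only then publish */
        spin_unlock(&shared_lock);
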
    2801             : 
    2802             : /*
    2803             :  * Mark one block group RO, can be called several times for the same block
    2804             :  * group.
    2805             :  *
    2806             :  * @cache:              the destination block group
     2807             :  * @do_chunk_alloc:     whether we need to do chunk pre-allocation; this is to
    2808             :  *                      ensure we still have some free space after marking this
    2809             :  *                      block group RO.
    2810             :  */
    2811           0 : int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
    2812             :                              bool do_chunk_alloc)
    2813             : {
    2814           0 :         struct btrfs_fs_info *fs_info = cache->fs_info;
    2815           0 :         struct btrfs_trans_handle *trans;
    2816           0 :         struct btrfs_root *root = btrfs_block_group_root(fs_info);
    2817           0 :         u64 alloc_flags;
    2818           0 :         int ret;
    2819           0 :         bool dirty_bg_running;
    2820             : 
    2821             :         /*
     2822             :          * This can only happen when we are doing a read-only scrub on a
     2823             :          * read-only mount.  In that case we should not start a new
     2824             :          * transaction on a read-only fs, thus we skip all chunk
     2825             :          * allocations here.
    2826             :          */
    2827           0 :         if (sb_rdonly(fs_info->sb)) {
    2828           0 :                 mutex_lock(&fs_info->ro_block_group_mutex);
    2829           0 :                 ret = inc_block_group_ro(cache, 0);
    2830           0 :                 mutex_unlock(&fs_info->ro_block_group_mutex);
    2831           0 :                 return ret;
    2832             :         }
    2833             : 
    2834           0 :         do {
    2835           0 :                 trans = btrfs_join_transaction(root);
    2836           0 :                 if (IS_ERR(trans))
    2837           0 :                         return PTR_ERR(trans);
    2838             : 
    2839           0 :                 dirty_bg_running = false;
    2840             : 
    2841             :                 /*
    2842             :                  * We're not allowed to set block groups readonly after the dirty
    2843             :                  * block group cache has started writing.  If it already started,
    2844             :                  * back off and let this transaction commit.
    2845             :                  */
    2846           0 :                 mutex_lock(&fs_info->ro_block_group_mutex);
    2847           0 :                 if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
    2848           0 :                         u64 transid = trans->transid;
    2849             : 
    2850           0 :                         mutex_unlock(&fs_info->ro_block_group_mutex);
    2851           0 :                         btrfs_end_transaction(trans);
    2852             : 
    2853           0 :                         ret = btrfs_wait_for_commit(fs_info, transid);
    2854           0 :                         if (ret)
    2855           0 :                                 return ret;
    2856             :                         dirty_bg_running = true;
    2857             :                 }
    2858           0 :         } while (dirty_bg_running);
    2859             : 
    2860           0 :         if (do_chunk_alloc) {
    2861             :                 /*
    2862             :                  * If we are changing raid levels, try to allocate a
    2863             :                  * corresponding block group with the new raid level.
    2864             :                  */
    2865           0 :                 alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
    2866           0 :                 if (alloc_flags != cache->flags) {
    2867           0 :                         ret = btrfs_chunk_alloc(trans, alloc_flags,
    2868             :                                                 CHUNK_ALLOC_FORCE);
    2869             :                         /*
    2870             :                          * ENOSPC is allowed here, we may have enough space
    2871             :                          * already allocated at the new raid level to carry on
    2872             :                          */
    2873           0 :                         if (ret == -ENOSPC)
    2874             :                                 ret = 0;
    2875           0 :                         if (ret < 0)
    2876           0 :                                 goto out;
    2877             :                 }
    2878             :         }
    2879             : 
    2880           0 :         ret = inc_block_group_ro(cache, 0);
    2881           0 :         if (!ret)
    2882           0 :                 goto out;
    2883           0 :         if (ret == -ETXTBSY)
    2884           0 :                 goto unlock_out;
    2885             : 
    2886             :         /*
     2887             :          * Skip chunk allocation if the bg is SYSTEM, to avoid a system chunk
     2888             :          * allocation storm exhausting the system chunk array.  Otherwise
    2889             :          * we still want to try our best to mark the block group read-only.
    2890             :          */
    2891           0 :         if (!do_chunk_alloc && ret == -ENOSPC &&
    2892           0 :             (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM))
    2893           0 :                 goto unlock_out;
    2894             : 
    2895           0 :         alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags);
    2896           0 :         ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
    2897           0 :         if (ret < 0)
    2898           0 :                 goto out;
    2899             :         /*
    2900             :          * We have allocated a new chunk. We also need to activate that chunk to
     2901             :          * grant metadata tickets for a zoned filesystem.
    2902             :          */
    2903           0 :         ret = btrfs_zoned_activate_one_bg(fs_info, cache->space_info, true);
    2904           0 :         if (ret < 0)
    2905           0 :                 goto out;
    2906             : 
    2907           0 :         ret = inc_block_group_ro(cache, 0);
    2908           0 :         if (ret == -ETXTBSY)
    2909           0 :                 goto unlock_out;
    2910           0 : out:
    2911           0 :         if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
    2912           0 :                 alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
    2913           0 :                 mutex_lock(&fs_info->chunk_mutex);
    2914           0 :                 check_system_chunk(trans, alloc_flags);
    2915           0 :                 mutex_unlock(&fs_info->chunk_mutex);
    2916             :         }
    2917           0 : unlock_out:
    2918           0 :         mutex_unlock(&fs_info->ro_block_group_mutex);
    2919             : 
    2920           0 :         btrfs_end_transaction(trans);
    2921           0 :         return ret;
    2922             : }
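
Editor's note: a hedged sketch of the intended pairing: callers mark a block group read-only for the duration of their work and restore it afterwards with btrfs_dec_block_group_ro(). The wrapper below is hypothetical and only illustrates the pairing and the error returns handled above:

        /* Editor's sketch; with_block_group_ro() is hypothetical. */
        static int with_block_group_ro(struct btrfs_block_group *bg,
                                       bool do_chunk_alloc,
                                       int (*work)(struct btrfs_block_group *bg))
        {
                int ret;

                ret = btrfs_inc_block_group_ro(bg, do_chunk_alloc);
                if (ret)        /* e.g. -ENOSPC, or -ETXTBSY if the group is busy */
                        return ret;

                ret = work(bg);

                /* Every successful inc must be balanced by a dec. */
                btrfs_dec_block_group_ro(bg);
                return ret;
        }
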
    2923             : 
    2924           0 : void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
    2925             : {
    2926           0 :         struct btrfs_space_info *sinfo = cache->space_info;
    2927           0 :         u64 num_bytes;
    2928             : 
    2929           0 :         BUG_ON(!cache->ro);
    2930             : 
    2931           0 :         spin_lock(&sinfo->lock);
    2932           0 :         spin_lock(&cache->lock);
    2933           0 :         if (!--cache->ro) {
    2934           0 :                 if (btrfs_is_zoned(cache->fs_info)) {
    2935             :                         /* Migrate zone_unusable bytes back */
    2936           0 :                         cache->zone_unusable =
    2937           0 :                                 (cache->alloc_offset - cache->used) +
    2938           0 :                                 (cache->length - cache->zone_capacity);
    2939           0 :                         sinfo->bytes_zone_unusable += cache->zone_unusable;
    2940           0 :                         sinfo->bytes_readonly -= cache->zone_unusable;
    2941             :                 }
    2942           0 :                 num_bytes = cache->length - cache->reserved -
    2943           0 :                             cache->pinned - cache->bytes_super -
    2944           0 :                             cache->zone_unusable - cache->used;
    2945           0 :                 sinfo->bytes_readonly -= num_bytes;
    2946           0 :                 list_del_init(&cache->ro_list);
    2947             :         }
    2948           0 :         spin_unlock(&cache->lock);
    2949           0 :         spin_unlock(&sinfo->lock);
    2950           0 : }
    2951             : 
    2952           0 : static int update_block_group_item(struct btrfs_trans_handle *trans,
    2953             :                                    struct btrfs_path *path,
    2954             :                                    struct btrfs_block_group *cache)
    2955             : {
    2956           0 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    2957           0 :         int ret;
    2958           0 :         struct btrfs_root *root = btrfs_block_group_root(fs_info);
    2959           0 :         unsigned long bi;
    2960           0 :         struct extent_buffer *leaf;
    2961           0 :         struct btrfs_block_group_item bgi;
    2962           0 :         struct btrfs_key key;
    2963           0 :         u64 old_commit_used;
    2964           0 :         u64 used;
    2965             : 
    2966             :         /*
     2967             :          * Block group item updates can be triggered outside of the transaction
     2968             :          * commit critical section, thus we need a consistent view of the used
     2969             :          * bytes.  We cannot use cache->used directly outside of the spin lock,
     2970             :          * as it may change.
    2971             :          */
    2972           0 :         spin_lock(&cache->lock);
    2973           0 :         old_commit_used = cache->commit_used;
    2974           0 :         used = cache->used;
    2975             :         /* No change in used bytes, can safely skip it. */
    2976           0 :         if (cache->commit_used == used) {
    2977           0 :                 spin_unlock(&cache->lock);
    2978           0 :                 return 0;
    2979             :         }
    2980           0 :         cache->commit_used = used;
    2981           0 :         spin_unlock(&cache->lock);
    2982             : 
    2983           0 :         key.objectid = cache->start;
    2984           0 :         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
    2985           0 :         key.offset = cache->length;
    2986             : 
    2987           0 :         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
    2988           0 :         if (ret) {
    2989           0 :                 if (ret > 0)
    2990           0 :                         ret = -ENOENT;
    2991           0 :                 goto fail;
    2992             :         }
    2993             : 
    2994           0 :         leaf = path->nodes[0];
    2995           0 :         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
    2996           0 :         btrfs_set_stack_block_group_used(&bgi, used);
    2997           0 :         btrfs_set_stack_block_group_chunk_objectid(&bgi,
    2998             :                                                    cache->global_root_id);
    2999           0 :         btrfs_set_stack_block_group_flags(&bgi, cache->flags);
    3000           0 :         write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
    3001           0 :         btrfs_mark_buffer_dirty(leaf);
    3002           0 : fail:
    3003           0 :         btrfs_release_path(path);
    3004             :         /* We didn't update the block group item, need to revert @commit_used. */
    3005           0 :         if (ret < 0) {
    3006           0 :                 spin_lock(&cache->lock);
    3007           0 :                 cache->commit_used = old_commit_used;
    3008           0 :                 spin_unlock(&cache->lock);
    3009             :         }
    3010             :         return ret;
    3012             : }
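
Editor's note: the locking discipline above is a small optimistic-update pattern: snapshot and publish the new value under the spin lock, do the slow on-disk update without it, and roll back only on failure. In outline (editor's sketch; the persist callback is a hypothetical stand-in for the btrfs_search_slot() + write_extent_buffer() sequence above):

        static int update_item_sketch(struct btrfs_block_group *cache,
                                      int (*persist)(struct btrfs_block_group *cache))
        {
                u64 old;
                int ret;

                spin_lock(&cache->lock);
                old = cache->commit_used;
                if (old == cache->used) {         /* nothing to persist */
                        spin_unlock(&cache->lock);
                        return 0;
                }
                cache->commit_used = cache->used; /* publish optimistically */
                spin_unlock(&cache->lock);

                ret = persist(cache);             /* slow path, lock dropped */
                if (ret < 0) {
                        spin_lock(&cache->lock);
                        cache->commit_used = old; /* roll back on failure */
                        spin_unlock(&cache->lock);
                }
                return ret;
        }
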
    3013             : 
    3014           0 : static int cache_save_setup(struct btrfs_block_group *block_group,
    3015             :                             struct btrfs_trans_handle *trans,
    3016             :                             struct btrfs_path *path)
    3017             : {
    3018           0 :         struct btrfs_fs_info *fs_info = block_group->fs_info;
    3019           0 :         struct btrfs_root *root = fs_info->tree_root;
    3020           0 :         struct inode *inode = NULL;
    3021           0 :         struct extent_changeset *data_reserved = NULL;
    3022           0 :         u64 alloc_hint = 0;
    3023           0 :         int dcs = BTRFS_DC_ERROR;
    3024           0 :         u64 cache_size = 0;
    3025           0 :         int retries = 0;
    3026           0 :         int ret = 0;
    3027             : 
    3028           0 :         if (!btrfs_test_opt(fs_info, SPACE_CACHE))
    3029             :                 return 0;
    3030             : 
    3031             :         /*
     3032             :          * If this block group is smaller than 100 megs, don't bother caching
     3033             :          * it.
    3034             :          */
    3035           0 :         if (block_group->length < (100 * SZ_1M)) {
    3036           0 :                 spin_lock(&block_group->lock);
    3037           0 :                 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
    3038           0 :                 spin_unlock(&block_group->lock);
    3039           0 :                 return 0;
    3040             :         }
    3041             : 
    3042           0 :         if (TRANS_ABORTED(trans))
    3043             :                 return 0;
    3044           0 : again:
    3045           0 :         inode = lookup_free_space_inode(block_group, path);
    3046           0 :         if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
    3047           0 :                 ret = PTR_ERR(inode);
    3048           0 :                 btrfs_release_path(path);
    3049           0 :                 goto out;
    3050             :         }
    3051             : 
    3052           0 :         if (IS_ERR(inode)) {
    3053           0 :                 BUG_ON(retries);
    3054           0 :                 retries++;
    3055             : 
    3056           0 :                 if (block_group->ro)
    3057           0 :                         goto out_free;
    3058             : 
    3059           0 :                 ret = create_free_space_inode(trans, block_group, path);
    3060           0 :                 if (ret)
    3061           0 :                         goto out_free;
    3062           0 :                 goto again;
    3063             :         }
    3064             : 
    3065             :         /*
     3066             :          * We want to set the generation to 0, so that if anything goes wrong
     3067             :          * from here on out we know not to trust this cache the next time we
     3068             :          * load it.
    3069             :          */
    3070           0 :         BTRFS_I(inode)->generation = 0;
    3071           0 :         ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
    3072           0 :         if (ret) {
    3073             :                 /*
     3074             :                  * Theoretically we could recover from this: simply set the
     3075             :                  * super cache generation to 0 so we know to invalidate the
     3076             :                  * cache, but then we'd have to keep track of the block groups
     3077             :                  * that fail this way so we know we _have_ to reset this cache
     3078             :                  * before the next commit or risk reading a stale cache.  So to
     3079             :                  * limit our exposure to horrible edge cases, let's just abort
     3080             :                  * the transaction; this only happens in really bad situations
     3081             :                  * anyway.
    3082             :                  */
    3083           0 :                 btrfs_abort_transaction(trans, ret);
    3084           0 :                 goto out_put;
    3085             :         }
    3086           0 :         WARN_ON(ret);
    3087             : 
    3088             :         /* We've already setup this transaction, go ahead and exit */
    3089           0 :         if (block_group->cache_generation == trans->transid &&
    3090             :             i_size_read(inode)) {
    3091           0 :                 dcs = BTRFS_DC_SETUP;
    3092           0 :                 goto out_put;
    3093             :         }
    3094             : 
    3095           0 :         if (i_size_read(inode) > 0) {
    3096           0 :                 ret = btrfs_check_trunc_cache_free_space(fs_info,
    3097             :                                         &fs_info->global_block_rsv);
    3098           0 :                 if (ret)
    3099           0 :                         goto out_put;
    3100             : 
    3101           0 :                 ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
    3102           0 :                 if (ret)
    3103           0 :                         goto out_put;
    3104             :         }
    3105             : 
    3106           0 :         spin_lock(&block_group->lock);
    3107           0 :         if (block_group->cached != BTRFS_CACHE_FINISHED ||
    3108           0 :             !btrfs_test_opt(fs_info, SPACE_CACHE)) {
    3109             :                 /*
    3110             :                  * don't bother trying to write stuff out _if_
    3111             :                  * a) we're not cached,
    3112             :                  * b) we're with nospace_cache mount option,
    3113             :                  * c) we're with v2 space_cache (FREE_SPACE_TREE).
    3114             :                  */
    3115           0 :                 dcs = BTRFS_DC_WRITTEN;
    3116           0 :                 spin_unlock(&block_group->lock);
    3117           0 :                 goto out_put;
    3118             :         }
    3119           0 :         spin_unlock(&block_group->lock);
    3120             : 
    3121             :         /*
     3122             :          * We hit an ENOSPC when setting up the cache in this transaction; just
    3123             :          * skip doing the setup, we've already cleared the cache so we're safe.
    3124             :          */
    3125           0 :         if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
    3126           0 :                 ret = -ENOSPC;
    3127           0 :                 goto out_put;
    3128             :         }
    3129             : 
    3130             :         /*
    3131             :          * Try to preallocate enough space based on how big the block group is.
    3132             :          * Keep in mind this has to include any pinned space which could end up
    3133             :          * taking up quite a bit since it's not folded into the other space
    3134             :          * cache.
    3135             :          */
    3136           0 :         cache_size = div_u64(block_group->length, SZ_256M);
    3137           0 :         if (!cache_size)
    3138           0 :                 cache_size = 1;
    3139             : 
    3140           0 :         cache_size *= 16;
    3141           0 :         cache_size *= fs_info->sectorsize;
    3142             : 
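                      :         /*
                      :          * Editor's note (worked example): a 1 GiB block group with 4 KiB
                      :          * sectors gives div_u64(SZ_1G, SZ_256M) = 4, * 16 = 64, and
                      :          * * 4096 = 256 KiB preallocated for the free space cache.
                      :          */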
    3143           0 :         ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0,
    3144             :                                           cache_size, false);
    3145           0 :         if (ret)
    3146           0 :                 goto out_put;
    3147             : 
    3148           0 :         ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, cache_size,
    3149             :                                               cache_size, cache_size,
    3150             :                                               &alloc_hint);
    3151             :         /*
    3152             :          * Our cache requires contiguous chunks so that we don't modify a bunch
    3153             :          * of metadata or split extents when writing the cache out, which means
     3154             :          * we can hit ENOSPC if we are heavily fragmented, in addition to normal
    3155             :          * out of space conditions.  So if we hit this just skip setting up any
    3156             :          * other block groups for this transaction, maybe we'll unpin enough
    3157             :          * space the next time around.
    3158             :          */
    3159           0 :         if (!ret)
    3160             :                 dcs = BTRFS_DC_SETUP;
    3161           0 :         else if (ret == -ENOSPC)
    3162           0 :                 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
    3163             : 
    3164           0 : out_put:
    3165           0 :         iput(inode);
    3166           0 : out_free:
    3167           0 :         btrfs_release_path(path);
    3168           0 : out:
    3169           0 :         spin_lock(&block_group->lock);
    3170           0 :         if (!ret && dcs == BTRFS_DC_SETUP)
    3171           0 :                 block_group->cache_generation = trans->transid;
    3172           0 :         block_group->disk_cache_state = dcs;
    3173           0 :         spin_unlock(&block_group->lock);
    3174             : 
    3175           0 :         extent_changeset_free(data_reserved);
    3176           0 :         return ret;
    3177             : }
    3178             : 
    3179           0 : int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
    3180             : {
    3181           0 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    3182           0 :         struct btrfs_block_group *cache, *tmp;
    3183           0 :         struct btrfs_transaction *cur_trans = trans->transaction;
    3184           0 :         struct btrfs_path *path;
    3185             : 
    3186           0 :         if (list_empty(&cur_trans->dirty_bgs) ||
    3187           0 :             !btrfs_test_opt(fs_info, SPACE_CACHE))
    3188             :                 return 0;
    3189             : 
    3190           0 :         path = btrfs_alloc_path();
    3191           0 :         if (!path)
    3192             :                 return -ENOMEM;
    3193             : 
    3194             :         /* Could add new block groups, use _safe just in case */
    3195           0 :         list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
    3196             :                                  dirty_list) {
    3197           0 :                 if (cache->disk_cache_state == BTRFS_DC_CLEAR)
    3198           0 :                         cache_save_setup(cache, trans, path);
    3199             :         }
    3200             : 
    3201           0 :         btrfs_free_path(path);
    3202           0 :         return 0;
    3203             : }
    3204             : 
    3205             : /*
    3206             :  * Transaction commit does final block group cache writeback during a critical
    3207             :  * section where nothing is allowed to change the FS.  This is required in
    3208             :  * order for the cache to actually match the block group, but can introduce a
    3209             :  * lot of latency into the commit.
    3210             :  *
    3211             :  * So, btrfs_start_dirty_block_groups is here to kick off block group cache IO.
    3212             :  * There's a chance we'll have to redo some of it if the block group changes
    3213             :  * again during the commit, but it greatly reduces the commit latency by
    3214             :  * getting rid of the easy block groups while we're still allowing others to
    3215             :  * join the commit.
    3216             :  */
    3217           0 : int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
    3218             : {
    3219           0 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    3220           0 :         struct btrfs_block_group *cache;
    3221           0 :         struct btrfs_transaction *cur_trans = trans->transaction;
    3222           0 :         int ret = 0;
    3223           0 :         int should_put;
    3224           0 :         struct btrfs_path *path = NULL;
    3225           0 :         LIST_HEAD(dirty);
    3226           0 :         struct list_head *io = &cur_trans->io_bgs;
    3227           0 :         int loops = 0;
    3228             : 
    3229           0 :         spin_lock(&cur_trans->dirty_bgs_lock);
    3230           0 :         if (list_empty(&cur_trans->dirty_bgs)) {
    3231           0 :                 spin_unlock(&cur_trans->dirty_bgs_lock);
    3232           0 :                 return 0;
    3233             :         }
    3234           0 :         list_splice_init(&cur_trans->dirty_bgs, &dirty);
    3235           0 :         spin_unlock(&cur_trans->dirty_bgs_lock);
    3236             : 
    3237           0 : again:
    3238             :         /* Make sure all the block groups on our dirty list actually exist */
    3239           0 :         btrfs_create_pending_block_groups(trans);
    3240             : 
    3241           0 :         if (!path) {
    3242           0 :                 path = btrfs_alloc_path();
    3243           0 :                 if (!path) {
    3244           0 :                         ret = -ENOMEM;
    3245           0 :                         goto out;
    3246             :                 }
    3247             :         }
    3248             : 
    3249             :         /*
     3250             :          * cache_write_mutex is here only to protect us from balance or the
     3251             :          * automatic removal of empty block groups deleting this block group
     3252             :          * while we are writing out the cache.
    3253             :          */
    3254           0 :         mutex_lock(&trans->transaction->cache_write_mutex);
    3255           0 :         while (!list_empty(&dirty)) {
    3256           0 :                 bool drop_reserve = true;
    3257             : 
    3258           0 :                 cache = list_first_entry(&dirty, struct btrfs_block_group,
    3259             :                                          dirty_list);
    3260             :                 /*
    3261             :                  * This can happen if something re-dirties a block group that
    3262             :                  * is already under IO.  Just wait for it to finish and then do
    3263             :                  * it all again
    3264             :                  */
    3265           0 :                 if (!list_empty(&cache->io_list)) {
    3266           0 :                         list_del_init(&cache->io_list);
    3267           0 :                         btrfs_wait_cache_io(trans, cache, path);
    3268           0 :                         btrfs_put_block_group(cache);
    3269             :                 }
    3270             : 
    3271             : 
    3272             :                 /*
    3273             :                  * btrfs_wait_cache_io uses the cache->dirty_list to decide if
    3274             :                  * it should update the cache_state.  Don't delete until after
    3275             :                  * we wait.
    3276             :                  *
    3277             :                  * Since we're not running in the commit critical section
    3278             :                  * we need the dirty_bgs_lock to protect from update_block_group
    3279             :                  */
    3280           0 :                 spin_lock(&cur_trans->dirty_bgs_lock);
    3281           0 :                 list_del_init(&cache->dirty_list);
    3282           0 :                 spin_unlock(&cur_trans->dirty_bgs_lock);
    3283             : 
    3284           0 :                 should_put = 1;
    3285             : 
    3286           0 :                 cache_save_setup(cache, trans, path);
    3287             : 
    3288           0 :                 if (cache->disk_cache_state == BTRFS_DC_SETUP) {
    3289           0 :                         cache->io_ctl.inode = NULL;
    3290           0 :                         ret = btrfs_write_out_cache(trans, cache, path);
    3291           0 :                         if (ret == 0 && cache->io_ctl.inode) {
    3292           0 :                                 should_put = 0;
    3293             : 
    3294             :                                 /*
    3295             :                                  * The cache_write_mutex is protecting the
    3296             :                                  * io_list, also refer to the definition of
    3297             :                                  * btrfs_transaction::io_bgs for more details
    3298             :                                  */
    3299           0 :                                 list_add_tail(&cache->io_list, io);
    3300             :                         } else {
    3301             :                                 /*
    3302             :                                  * If we failed to write the cache, the
    3303             :                                  * generation will be bad and life goes on
    3304             :                                  */
    3305             :                                 ret = 0;
    3306             :                         }
    3307             :                 }
    3308             :                 if (!ret) {
    3309           0 :                         ret = update_block_group_item(trans, path, cache);
    3310             :                         /*
    3311             :                          * Our block group might still be attached to the list
    3312             :                          * of new block groups in the transaction handle of some
    3313             :                          * other task (struct btrfs_trans_handle->new_bgs). This
    3314             :                          * means its block group item isn't yet in the extent
    3315             :                          * tree. If this happens ignore the error, as we will
    3316             :                          * try again later in the critical section of the
    3317             :                          * transaction commit.
    3318             :                          */
    3319           0 :                         if (ret == -ENOENT) {
    3320           0 :                                 ret = 0;
    3321           0 :                                 spin_lock(&cur_trans->dirty_bgs_lock);
    3322           0 :                                 if (list_empty(&cache->dirty_list)) {
    3323           0 :                                         list_add_tail(&cache->dirty_list,
    3324             :                                                       &cur_trans->dirty_bgs);
    3325           0 :                                         btrfs_get_block_group(cache);
    3326           0 :                                         drop_reserve = false;
    3327             :                                 }
    3328           0 :                                 spin_unlock(&cur_trans->dirty_bgs_lock);
    3329           0 :                         } else if (ret) {
    3330           0 :                                 btrfs_abort_transaction(trans, ret);
    3331             :                         }
    3332             :                 }
    3333             : 
    3334             :                 /* If it's not on the io list, we need to put the block group */
    3335           0 :                 if (should_put)
    3336           0 :                         btrfs_put_block_group(cache);
    3337           0 :                 if (drop_reserve)
    3338           0 :                         btrfs_delayed_refs_rsv_release(fs_info, 1);
    3339             :                 /*
    3340             :                  * Avoid blocking other tasks for too long. It might even save
    3341             :                  * us from writing caches for block groups that are going to be
    3342             :                  * removed.
    3343             :                  */
    3344           0 :                 mutex_unlock(&trans->transaction->cache_write_mutex);
    3345           0 :                 if (ret)
    3346           0 :                         goto out;
    3347           0 :                 mutex_lock(&trans->transaction->cache_write_mutex);
    3348             :         }
    3349           0 :         mutex_unlock(&trans->transaction->cache_write_mutex);
    3350             : 
    3351             :         /*
    3352             :          * Go through delayed refs for all the stuff we've just kicked off
    3353             :          * and then loop back (just once)
    3354             :          */
    3355           0 :         if (!ret)
    3356           0 :                 ret = btrfs_run_delayed_refs(trans, 0);
    3357           0 :         if (!ret && loops == 0) {
    3358           0 :                 loops++;
    3359           0 :                 spin_lock(&cur_trans->dirty_bgs_lock);
    3360           0 :                 list_splice_init(&cur_trans->dirty_bgs, &dirty);
    3361             :                 /*
    3362             :                  * dirty_bgs_lock protects us from concurrent block group
    3363             :                  * deletes too (not just cache_write_mutex).
    3364             :                  */
    3365           0 :                 if (!list_empty(&dirty)) {
    3366           0 :                         spin_unlock(&cur_trans->dirty_bgs_lock);
    3367           0 :                         goto again;
    3368             :                 }
    3369           0 :                 spin_unlock(&cur_trans->dirty_bgs_lock);
    3370             :         }
    3371           0 : out:
    3372           0 :         if (ret < 0) {
    3373           0 :                 spin_lock(&cur_trans->dirty_bgs_lock);
    3374           0 :                 list_splice_init(&dirty, &cur_trans->dirty_bgs);
    3375           0 :                 spin_unlock(&cur_trans->dirty_bgs_lock);
    3376           0 :                 btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
    3377             :         }
    3378             : 
    3379           0 :         btrfs_free_path(path);
    3380           0 :         return ret;
    3381             : }
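
Editor's note: putting the header comment above together with btrfs_write_dirty_block_groups() below, the intended call order during a commit is roughly the following. This is a hedged sketch; the real sequence lives in transaction.c and does much more:

        static int commit_writeback_sketch(struct btrfs_trans_handle *trans)
        {
                int ret;

                /* Phase 1: before the critical section, writers may still join. */
                ret = btrfs_start_dirty_block_groups(trans);
                if (ret)
                        return ret;

                /* ... exclude writers, enter the commit critical section ... */

                /* Phase 2: redo whatever got re-dirtied; nothing changes the fs now. */
                return btrfs_write_dirty_block_groups(trans);
        }
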
    3382             : 
    3383           0 : int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
    3384             : {
    3385           0 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    3386           0 :         struct btrfs_block_group *cache;
    3387           0 :         struct btrfs_transaction *cur_trans = trans->transaction;
    3388           0 :         int ret = 0;
    3389           0 :         int should_put;
    3390           0 :         struct btrfs_path *path;
    3391           0 :         struct list_head *io = &cur_trans->io_bgs;
    3392             : 
    3393           0 :         path = btrfs_alloc_path();
    3394           0 :         if (!path)
    3395             :                 return -ENOMEM;
    3396             : 
    3397             :         /*
    3398             :          * Even though we are in the critical section of the transaction commit,
    3399             :          * we can still have concurrent tasks adding elements to this
    3400             :          * transaction's list of dirty block groups. These tasks correspond to
    3401             :          * endio free space workers started when writeback finishes for a
    3402             :          * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
    3403             :          * allocate new block groups as a result of COWing nodes of the root
    3404             :          * tree when updating the free space inode. The writeback for the space
    3405             :          * caches is triggered by an earlier call to
    3406             :          * btrfs_start_dirty_block_groups() and iterations of the following
    3407             :          * loop.
    3408             :          * Also we want to do the cache_save_setup first and then run the
    3409             :          * delayed refs to make sure we have the best chance at doing this all
    3410             :          * in one shot.
    3411             :          */
    3412           0 :         spin_lock(&cur_trans->dirty_bgs_lock);
    3413           0 :         while (!list_empty(&cur_trans->dirty_bgs)) {
    3414           0 :                 cache = list_first_entry(&cur_trans->dirty_bgs,
    3415             :                                          struct btrfs_block_group,
    3416             :                                          dirty_list);
    3417             : 
    3418             :                 /*
    3419             :                  * This can happen if cache_save_setup re-dirties a block group
    3420             :                  * that is already under IO.  Just wait for it to finish and
    3421             :                  * then do it all again
    3422             :                  */
    3423           0 :                 if (!list_empty(&cache->io_list)) {
    3424           0 :                         spin_unlock(&cur_trans->dirty_bgs_lock);
    3425           0 :                         list_del_init(&cache->io_list);
    3426           0 :                         btrfs_wait_cache_io(trans, cache, path);
    3427           0 :                         btrfs_put_block_group(cache);
    3428           0 :                         spin_lock(&cur_trans->dirty_bgs_lock);
    3429             :                 }
    3430             : 
    3431             :                 /*
    3432             :                  * Don't remove from the dirty list until after we've waited on
    3433             :                  * any pending IO
    3434             :                  */
    3435           0 :                 list_del_init(&cache->dirty_list);
    3436           0 :                 spin_unlock(&cur_trans->dirty_bgs_lock);
    3437           0 :                 should_put = 1;
    3438             : 
    3439           0 :                 cache_save_setup(cache, trans, path);
    3440             : 
    3441           0 :                 if (!ret)
    3442           0 :                         ret = btrfs_run_delayed_refs(trans,
    3443             :                                                      (unsigned long) -1);
    3444             : 
    3445           0 :                 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
    3446           0 :                         cache->io_ctl.inode = NULL;
    3447           0 :                         ret = btrfs_write_out_cache(trans, cache, path);
    3448           0 :                         if (ret == 0 && cache->io_ctl.inode) {
    3449           0 :                                 should_put = 0;
    3450           0 :                                 list_add_tail(&cache->io_list, io);
    3451             :                         } else {
    3452             :                                 /*
    3453             :                                  * If we failed to write the cache, the
    3454             :                                  * generation will be bad and life goes on
    3455             :                                  */
    3456             :                                 ret = 0;
    3457             :                         }
    3458             :                 }
    3459           0 :                 if (!ret) {
    3460           0 :                         ret = update_block_group_item(trans, path, cache);
    3461             :                         /*
    3462             :                          * One of the free space endio workers might have
    3463             :                          * created a new block group while updating a free space
    3464             :                          * cache's inode (at inode.c:btrfs_finish_ordered_io())
    3465             :                          * and hasn't released its transaction handle yet, in
    3466             :                          * which case the new block group is still attached to
    3467             :                          * its transaction handle and its creation has not
    3468             :                          * finished yet (no block group item in the extent tree
    3469             :                          * yet, etc). If this is the case, wait for all free
    3470             :                          * space endio workers to finish and retry. This is a
    3471             :                          * very rare case so no need for a more efficient and
    3472             :                          * complex approach.
    3473             :                          */
    3474           0 :                         if (ret == -ENOENT) {
    3475           0 :                                 wait_event(cur_trans->writer_wait,
    3476             :                                    atomic_read(&cur_trans->num_writers) == 1);
    3477           0 :                                 ret = update_block_group_item(trans, path, cache);
    3478             :                         }
    3479           0 :                         if (ret)
    3480           0 :                                 btrfs_abort_transaction(trans, ret);
    3481             :                 }
    3482             : 
     3483             :                 /* If it's not on the io list, we need to put the block group */
    3484           0 :                 if (should_put)
    3485           0 :                         btrfs_put_block_group(cache);
    3486           0 :                 btrfs_delayed_refs_rsv_release(fs_info, 1);
    3487           0 :                 spin_lock(&cur_trans->dirty_bgs_lock);
    3488             :         }
    3489           0 :         spin_unlock(&cur_trans->dirty_bgs_lock);
    3490             : 
    3491             :         /*
    3492             :          * Refer to the definition of io_bgs member for details why it's safe
    3493             :          * to use it without any locking
    3494             :          */
    3495           0 :         while (!list_empty(io)) {
    3496           0 :                 cache = list_first_entry(io, struct btrfs_block_group,
    3497             :                                          io_list);
    3498           0 :                 list_del_init(&cache->io_list);
    3499           0 :                 btrfs_wait_cache_io(trans, cache, path);
    3500           0 :                 btrfs_put_block_group(cache);
    3501             :         }
    3502             : 
    3503           0 :         btrfs_free_path(path);
    3504           0 :         return ret;
    3505             : }
    3506             : 
    3507           0 : int btrfs_update_block_group(struct btrfs_trans_handle *trans,
    3508             :                              u64 bytenr, u64 num_bytes, bool alloc)
    3509             : {
    3510           0 :         struct btrfs_fs_info *info = trans->fs_info;
    3511           0 :         struct btrfs_block_group *cache = NULL;
    3512           0 :         u64 total = num_bytes;
    3513           0 :         u64 old_val;
    3514           0 :         u64 byte_in_group;
    3515           0 :         int factor;
    3516           0 :         int ret = 0;
    3517             : 
    3518             :         /* Block accounting for super block */
    3519           0 :         spin_lock(&info->delalloc_root_lock);
    3520           0 :         old_val = btrfs_super_bytes_used(info->super_copy);
    3521           0 :         if (alloc)
    3522           0 :                 old_val += num_bytes;
    3523             :         else
    3524           0 :                 old_val -= num_bytes;
    3525           0 :         btrfs_set_super_bytes_used(info->super_copy, old_val);
    3526           0 :         spin_unlock(&info->delalloc_root_lock);
    3527             : 
    3528           0 :         while (total) {
    3529           0 :                 struct btrfs_space_info *space_info;
    3530           0 :                 bool reclaim = false;
    3531             : 
    3532           0 :                 cache = btrfs_lookup_block_group(info, bytenr);
    3533           0 :                 if (!cache) {
    3534             :                         ret = -ENOENT;
    3535             :                         break;
    3536             :                 }
    3537           0 :                 space_info = cache->space_info;
    3538           0 :                 factor = btrfs_bg_type_to_factor(cache->flags);
    3539             : 
    3540             :                 /*
     3541             :                  * If this block group has its free space cache written out, we
    3542             :                  * need to make sure to load it if we are removing space.  This
    3543             :                  * is because we need the unpinning stage to actually add the
    3544             :                  * space back to the block group, otherwise we will leak space.
    3545             :                  */
    3546           0 :                 if (!alloc && !btrfs_block_group_done(cache))
    3547           0 :                         btrfs_cache_block_group(cache, true);
    3548             : 
    3549           0 :                 byte_in_group = bytenr - cache->start;
    3550           0 :                 WARN_ON(byte_in_group > cache->length);
    3551             : 
    3552           0 :                 spin_lock(&space_info->lock);
    3553           0 :                 spin_lock(&cache->lock);
    3554             : 
    3555           0 :                 if (btrfs_test_opt(info, SPACE_CACHE) &&
    3556           0 :                     cache->disk_cache_state < BTRFS_DC_CLEAR)
    3557           0 :                         cache->disk_cache_state = BTRFS_DC_CLEAR;
    3558             : 
    3559           0 :                 old_val = cache->used;
    3560           0 :                 num_bytes = min(total, cache->length - byte_in_group);
    3561           0 :                 if (alloc) {
    3562           0 :                         old_val += num_bytes;
    3563           0 :                         cache->used = old_val;
    3564           0 :                         cache->reserved -= num_bytes;
    3565           0 :                         space_info->bytes_reserved -= num_bytes;
    3566           0 :                         space_info->bytes_used += num_bytes;
    3567           0 :                         space_info->disk_used += num_bytes * factor;
    3568           0 :                         spin_unlock(&cache->lock);
    3569           0 :                         spin_unlock(&space_info->lock);
    3570             :                 } else {
    3571           0 :                         old_val -= num_bytes;
    3572           0 :                         cache->used = old_val;
    3573           0 :                         cache->pinned += num_bytes;
    3574           0 :                         btrfs_space_info_update_bytes_pinned(info, space_info,
    3575             :                                                              num_bytes);
    3576           0 :                         space_info->bytes_used -= num_bytes;
    3577           0 :                         space_info->disk_used -= num_bytes * factor;
    3578             : 
    3579           0 :                         reclaim = should_reclaim_block_group(cache, num_bytes);
    3580             : 
    3581           0 :                         spin_unlock(&cache->lock);
    3582           0 :                         spin_unlock(&space_info->lock);
    3583             : 
    3584           0 :                         set_extent_bit(&trans->transaction->pinned_extents,
    3585           0 :                                        bytenr, bytenr + num_bytes - 1,
    3586             :                                        EXTENT_DIRTY, NULL);
    3587             :                 }
    3588             : 
    3589           0 :                 spin_lock(&trans->transaction->dirty_bgs_lock);
    3590           0 :                 if (list_empty(&cache->dirty_list)) {
    3591           0 :                         list_add_tail(&cache->dirty_list,
    3592           0 :                                       &trans->transaction->dirty_bgs);
    3593           0 :                         trans->delayed_ref_updates++;
    3594           0 :                         btrfs_get_block_group(cache);
    3595             :                 }
    3596           0 :                 spin_unlock(&trans->transaction->dirty_bgs_lock);
    3597             : 
    3598             :                 /*
    3599             :                  * No longer have used bytes in this block group, queue it for
    3600             :                  * deletion. We do this after adding the block group to the
    3601             :                  * dirty list to avoid races between cleaner kthread and space
    3602             :                  * cache writeout.
    3603             :                  */
    3604           0 :                 if (!alloc && old_val == 0) {
    3605           0 :                         if (!btrfs_test_opt(info, DISCARD_ASYNC))
    3606           0 :                                 btrfs_mark_bg_unused(cache);
    3607           0 :                 } else if (!alloc && reclaim) {
    3608           0 :                         btrfs_mark_bg_to_reclaim(cache);
    3609             :                 }
    3610             : 
    3611           0 :                 btrfs_put_block_group(cache);
    3612           0 :                 total -= num_bytes;
    3613           0 :                 bytenr += num_bytes;
    3614             :         }
    3615             : 
    3616             :         /* Modified block groups are accounted for in the delayed_refs_rsv. */
    3617           0 :         btrfs_update_delayed_refs_rsv(trans);
    3618           0 :         return ret;
    3619             : }
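
Editor's note: a worked example of the accounting above, under assumed values: freeing (alloc == false) 1 MiB from a RAID1 data block group, where btrfs_bg_type_to_factor() returns 2:

        /*
         * Editor's worked example (assumed values):
         *
         *   cache->used            -= SZ_1M;      logical bytes in the group
         *   cache->pinned          += SZ_1M;      freed space stays pinned
         *   space_info->bytes_used -= SZ_1M;
         *   space_info->disk_used  -= SZ_1M * 2;  physical bytes, RAID1 factor
         *
         * The range is also marked EXTENT_DIRTY in pinned_extents, so the
         * unpin stage after commit returns it to the free space cache.
         */
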
    3620             : 
    3621             : /*
    3622             :  * Update the block_group and space info counters.
    3623             :  *
    3624             :  * @cache:      The cache we are manipulating
     3625             :  * @ram_bytes:  The number of bytes of file content; this will be the same
     3626             :  *              as @num_bytes except on the compression path.
    3627             :  * @num_bytes:  The number of bytes in question
    3628             :  * @delalloc:   The blocks are allocated for the delalloc write
    3629             :  *
    3630             :  * This is called by the allocator when it reserves space. If this is a
    3631             :  * reservation and the block group has become read only we cannot make the
    3632             :  * reservation and return -EAGAIN, otherwise this function always succeeds.
    3633             :  */
    3634           0 : int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
    3635             :                              u64 ram_bytes, u64 num_bytes, int delalloc,
    3636             :                              bool force_wrong_size_class)
    3637             : {
    3638           0 :         struct btrfs_space_info *space_info = cache->space_info;
    3639           0 :         enum btrfs_block_group_size_class size_class;
    3640           0 :         int ret = 0;
    3641             : 
    3642           0 :         spin_lock(&space_info->lock);
    3643           0 :         spin_lock(&cache->lock);
    3644           0 :         if (cache->ro) {
    3645           0 :                 ret = -EAGAIN;
    3646           0 :                 goto out;
    3647             :         }
    3648             : 
    3649           0 :         if (btrfs_block_group_should_use_size_class(cache)) {
    3650           0 :                 size_class = btrfs_calc_block_group_size_class(num_bytes);
    3651           0 :                 ret = btrfs_use_block_group_size_class(cache, size_class, force_wrong_size_class);
    3652           0 :                 if (ret)
    3653           0 :                         goto out;
    3654             :         }
    3655           0 :         cache->reserved += num_bytes;
    3656           0 :         space_info->bytes_reserved += num_bytes;
    3657           0 :         trace_btrfs_space_reservation(cache->fs_info, "space_info",
    3658             :                                       space_info->flags, num_bytes, 1);
    3659           0 :         btrfs_space_info_update_bytes_may_use(cache->fs_info,
    3660           0 :                                               space_info, -ram_bytes);
    3661           0 :         if (delalloc)
    3662           0 :                 cache->delalloc_bytes += num_bytes;
    3663             : 
    3664             :         /*
    3665             :          * Compression can use less space than we reserved, so wake tickets if
    3666             :          * that happens.
    3667             :          */
    3668           0 :         if (num_bytes < ram_bytes)
    3669           0 :                 btrfs_try_granting_tickets(cache->fs_info, space_info);
    3670           0 : out:
    3671           0 :         spin_unlock(&cache->lock);
    3672           0 :         spin_unlock(&space_info->lock);
    3673           0 :         return ret;
    3674             : }
    3675             : 
    3676             : /*
    3677             :  * Update the block_group and space info counters.
    3678             :  *
    3679             :  * @cache:      The cache we are manipulating
    3680             :  * @num_bytes:  The number of bytes in question
    3681             :  * @delalloc:   The blocks are allocated for the delalloc write
    3682             :  *
    3683             :  * This is called by somebody who is freeing space that was never actually used
    3684             :  * on disk.  For example if you reserve some space for a new leaf in transaction
    3685             :  * A and before transaction A commits you free that leaf, you call this
    3686             :  * function to clear the reservation.
    3687             :  */
    3688           0 : void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
    3689             :                                u64 num_bytes, int delalloc)
    3690             : {
    3691           0 :         struct btrfs_space_info *space_info = cache->space_info;
    3692             : 
    3693           0 :         spin_lock(&space_info->lock);
    3694           0 :         spin_lock(&cache->lock);
    3695           0 :         if (cache->ro)
    3696           0 :                 space_info->bytes_readonly += num_bytes;
    3697           0 :         cache->reserved -= num_bytes;
    3698           0 :         space_info->bytes_reserved -= num_bytes;
    3699           0 :         space_info->max_extent_size = 0;
    3700             : 
    3701           0 :         if (delalloc)
    3702           0 :                 cache->delalloc_bytes -= num_bytes;
    3703           0 :         spin_unlock(&cache->lock);
    3704             : 
    3705           0 :         btrfs_try_granting_tickets(cache->fs_info, space_info);
    3706           0 :         spin_unlock(&space_info->lock);
    3707           0 : }
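
/*
 * Illustrative sketch (not part of block-group.c): a minimal userspace model
 * of the reserve/free pair above, using hypothetical "model_*" types that
 * mirror only the fields touched here (no locking, tracing or ticket
 * granting). It shows the invariant the two functions maintain: a successful
 * reservation bumps both the per-group and the space_info counter, a
 * read-only group rejects new reservations, and freeing from a group that
 * went read-only credits the bytes back as readonly space.
 */
#include <stdbool.h>
#include <stdint.h>

struct model_space_info {
        uint64_t bytes_reserved;
        uint64_t bytes_readonly;
};

struct model_block_group {
        struct model_space_info *space_info;
        uint64_t reserved;
        uint64_t delalloc_bytes;
        bool ro;
};

static int model_add_reserved(struct model_block_group *bg, uint64_t bytes,
                              bool delalloc)
{
        if (bg->ro)
                return -1;      /* the kernel returns -EAGAIN here */
        bg->reserved += bytes;
        bg->space_info->bytes_reserved += bytes;
        if (delalloc)
                bg->delalloc_bytes += bytes;
        return 0;
}

static void model_free_reserved(struct model_block_group *bg, uint64_t bytes,
                                bool delalloc)
{
        if (bg->ro)
                bg->space_info->bytes_readonly += bytes;
        bg->reserved -= bytes;
        bg->space_info->bytes_reserved -= bytes;
        if (delalloc)
                bg->delalloc_bytes -= bytes;
}
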
    3708             : 
    3709           0 : static void force_metadata_allocation(struct btrfs_fs_info *info)
    3710             : {
    3711           0 :         struct list_head *head = &info->space_info;
    3712           0 :         struct btrfs_space_info *found;
    3713             : 
    3714           0 :         list_for_each_entry(found, head, list) {
    3715           0 :                 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
    3716           0 :                         found->force_alloc = CHUNK_ALLOC_FORCE;
    3717             :         }
    3718           0 : }
    3719             : 
    3720           0 : static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
    3721             :                               struct btrfs_space_info *sinfo, int force)
    3722             : {
    3723           0 :         u64 bytes_used = btrfs_space_info_used(sinfo, false);
    3724           0 :         u64 thresh;
    3725             : 
    3726           0 :         if (force == CHUNK_ALLOC_FORCE)
    3727             :                 return 1;
    3728             : 
    3729             :         /*
    3730             :          * in limited mode, we want to have some free space up to
    3731             :          * about 1% of the FS size.
    3732             :          */
    3733           0 :         if (force == CHUNK_ALLOC_LIMITED) {
    3734           0 :                 thresh = btrfs_super_total_bytes(fs_info->super_copy);
    3735           0 :                 thresh = max_t(u64, SZ_64M, mult_perc(thresh, 1));
    3736             : 
    3737           0 :                 if (sinfo->total_bytes - bytes_used < thresh)
    3738             :                         return 1;
    3739             :         }
    3740             : 
    3741           0 :         if (bytes_used + SZ_2M < mult_perc(sinfo->total_bytes, 80))
    3742           0 :                 return 0;
    3743             :         return 1;
    3744             : }
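
/*
 * Illustrative sketch (not part of block-group.c): the decision rule of
 * should_alloc_chunk() as standalone arithmetic, with mult_perc() replaced
 * by plain u64 math (the kernel helper is overflow-safe; this sketch is
 * not). Worked example: on a 1 TiB filesystem with CHUNK_ALLOC_LIMITED, the
 * threshold is max(64M, 1% of 1 TiB) ~= 10.2 GiB of free space; outside
 * limited mode, a new chunk is only requested once usage (plus 2M of slack)
 * crosses 80% of the space_info.
 */
#include <stdint.h>

#define MODEL_SZ_2M   (2ULL << 20)
#define MODEL_SZ_64M  (64ULL << 20)

static int model_should_alloc(uint64_t fs_total_bytes, uint64_t sinfo_total,
                              uint64_t bytes_used, int limited)
{
        if (limited) {
                uint64_t thresh = fs_total_bytes / 100; /* ~1% of fs size */

                if (thresh < MODEL_SZ_64M)
                        thresh = MODEL_SZ_64M;
                if (sinfo_total - bytes_used < thresh)
                        return 1;
        }
        return (bytes_used + MODEL_SZ_2M < sinfo_total * 80 / 100) ? 0 : 1;
}
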
    3745             : 
    3746           0 : int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
    3747             : {
    3748           0 :         u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type);
    3749             : 
    3750           0 :         return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
    3751             : }
    3752             : 
    3753           0 : static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
    3754             : {
    3755           0 :         struct btrfs_block_group *bg;
    3756           0 :         int ret;
    3757             : 
    3758             :         /*
    3759             :          * Check if we have enough space in the system space info because we
    3760             :          * will need to update device items in the chunk btree and insert a new
    3761             :          * chunk item in the chunk btree as well. This will allocate a new
    3762             :          * system block group if needed.
    3763             :          */
    3764           0 :         check_system_chunk(trans, flags);
    3765             : 
    3766           0 :         bg = btrfs_create_chunk(trans, flags);
    3767           0 :         if (IS_ERR(bg)) {
    3768           0 :                 ret = PTR_ERR(bg);
    3769           0 :                 goto out;
    3770             :         }
    3771             : 
    3772           0 :         ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
    3773             :         /*
    3774             :          * Normally we are not expected to fail with -ENOSPC here, since we have
    3775             :          * previously reserved space in the system space_info and allocated one
    3776             :          * new system chunk if necessary. However there are three exceptions:
    3777             :          *
    3778             :          * 1) We may have enough free space in the system space_info but all the
    3779             :          *    existing system block groups have a profile which cannot be used
    3780             :          *    for extent allocation.
    3781             :          *
    3782             :          *    This happens when mounting in degraded mode. For example we have a
    3783             :          *    RAID1 filesystem with 2 devices, lose one device and mount the fs
    3784             :          *    using the other device in degraded mode. If we then allocate a chunk,
    3785             :          *    we may have enough free space in the existing system space_info, but
    3786             :          *    none of the block groups can be used for extent allocation since they
    3787             :          *    have a RAID1 profile, and because we are in degraded mode with a
    3788             :          *    single device, we are forced to allocate a new system chunk with a
    3789             :          *    SINGLE profile. Making check_system_chunk() iterate over all system
    3790             :          *    block groups and check if they have a usable profile and enough space
    3791             :          *    can be slow on very large filesystems, so we tolerate the -ENOSPC and
    3792             :          *    try again after forcing allocation of a new system chunk. This way
    3793             :          *    we avoid paying the cost of that search in normal circumstances, when
    3794             :          *    we were not mounted in degraded mode;
    3795             :          *
    3796             :          * 2) We had enough free space in the system space_info, and one suitable
    3797             :          *    block group to allocate from when we called check_system_chunk()
    3798             :          *    above. However right after we called it, the only system block group
    3799             :          *    with enough free space got turned into RO mode by a running scrub,
    3800             :          *    and in this case we have to allocate a new one and retry. We only
    3801             :          *    need to do this allocation and retry once, since we have a transaction
    3802             :          *    handle and scrub uses the commit root to search for block groups;
    3803             :          *
    3804             :          * 3) We had one system block group with enough free space when we called
    3805             :          *    check_system_chunk(), but after that, right before we tried to
    3806             :          *    allocate the last extent buffer we needed, a discard operation came
    3807             :          *    in and it temporarily removed the last free space entry from the
    3808             :          *    block group (discard removes a free space entry, discards it, and
    3809             :          *    then adds back the entry to the block group cache).
    3810             :          */
    3811           0 :         if (ret == -ENOSPC) {
    3812           0 :                 const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
    3813           0 :                 struct btrfs_block_group *sys_bg;
    3814             : 
    3815           0 :                 sys_bg = btrfs_create_chunk(trans, sys_flags);
    3816           0 :                 if (IS_ERR(sys_bg)) {
    3817           0 :                         ret = PTR_ERR(sys_bg);
    3818           0 :                         btrfs_abort_transaction(trans, ret);
    3819           0 :                         goto out;
    3820             :                 }
    3821             : 
    3822           0 :                 ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
    3823           0 :                 if (ret) {
    3824           0 :                         btrfs_abort_transaction(trans, ret);
    3825           0 :                         goto out;
    3826             :                 }
    3827             : 
    3828           0 :                 ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
    3829           0 :                 if (ret) {
    3830           0 :                         btrfs_abort_transaction(trans, ret);
    3831           0 :                         goto out;
    3832             :                 }
    3833           0 :         } else if (ret) {
    3834           0 :                 btrfs_abort_transaction(trans, ret);
    3835           0 :                 goto out;
    3836             :         }
    3837           0 : out:
    3838           0 :         btrfs_trans_release_chunk_metadata(trans);
    3839             : 
    3840           0 :         if (ret)
    3841           0 :                 return ERR_PTR(ret);
    3842             : 
    3843           0 :         btrfs_get_block_group(bg);
    3844           0 :         return bg;
    3845             : }
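
/*
 * Illustrative sketch (not part of block-group.c): the -ENOSPC handling of
 * do_chunk_alloc() reduced to its control flow, with hypothetical callbacks.
 * Try the insert; on ENOSPC, provision one extra system block group and
 * retry exactly once. A single retry suffices per the comment above: the
 * transaction handle keeps what we created, and scrub searches the commit
 * root, so the new group cannot be yanked away again.
 */
#include <errno.h>

static int model_insert_with_fallback(int (*try_insert)(void *bg), void *bg,
                                      int (*provision_system_bg)(void))
{
        int ret = try_insert(bg);

        if (ret == -ENOSPC) {
                ret = provision_system_bg();
                if (ret)
                        return ret;
                ret = try_insert(bg);   /* one retry, never a loop */
        }
        return ret;
}
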
    3846             : 
    3847             : /*
    3848             :  * Chunk allocation is done in 2 phases:
    3849             :  *
    3850             :  * 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for
    3851             :  *    the chunk, the chunk mapping, create its block group and add the items
    3852             :  *    that belong in the chunk btree to it - more specifically, we need to
    3853             :  *    update device items in the chunk btree and add a new chunk item to it.
    3854             :  *
    3855             :  * 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block
    3856             :  *    group item to the extent btree and the device extent items to the devices
    3857             :  *    btree.
    3858             :  *
    3859             :  * This is done to prevent deadlocks. For example when COWing a node from the
    3860             :  * extent btree we are holding a write lock on the node's parent and if we
    3861             :  * trigger chunk allocation and attempt to insert the new block group item
    3862             :  * in the extent btree right away, we could deadlock because the path for the
    3863             :  * insertion can include that parent node. At first glance it seems impossible
    3864             :  * to trigger chunk allocation after starting a transaction since tasks should
    3865             :  * reserve enough transaction units (metadata space). However, while that is true
    3866             :  * most of the time, chunk allocation may still be triggered for several reasons:
    3867             :  *
    3868             :  * 1) When reserving metadata, we check if there is enough free space in the
    3869             :  *    metadata space_info and therefore don't trigger allocation of a new chunk.
    3870             :  *    However later when the task actually tries to COW an extent buffer from
    3871             :  *    the extent btree or from the device btree for example, it is forced to
    3872             :  *    allocate a new block group (chunk) because the only one that had enough
    3873             :  *    free space was just turned to RO mode by a running scrub for example (or
    3874             :  *    device replace, block group reclaim thread, etc), so we can not use it
    3875             :  *    for allocating an extent and end up being forced to allocate a new one;
    3876             :  *
    3877             :  * 2) Because we only check that the metadata space_info has enough free bytes,
    3878             :  *    we end up not allocating a new metadata chunk in that case. However if
    3879             :  *    the filesystem was mounted in degraded mode, none of the existing block
    3880             :  *    groups might be suitable for extent allocation due to their incompatible
    3881             :  *    profile (e.g. mounting a filesystem with 2 devices, where all block groups
    3882             :  *    use a RAID1 profile, in degraded mode using a single device). In this case
    3883             :  *    when the task attempts to COW some extent buffer of the extent btree for
    3884             :  *    example, it will trigger allocation of a new metadata block group with a
    3885             :  *    suitable profile (SINGLE profile in the example of the degraded mount of
    3886             :  *    the RAID1 filesystem);
    3887             :  *
    3888             :  * 3) The task has reserved enough transaction units / metadata space, but when
    3889             :  *    it attempts to COW an extent buffer from the extent or device btree for
    3890             :  *    example, it does not find any free extent in any metadata block group,
    3891             :  *    and is therefore forced to try to allocate a new metadata block group.
    3892             :  *    This is because some other task allocated all available extents in the
    3893             :  *    meanwhile - this typically happens with tasks that don't reserve space
    3894             :  *    properly, either intentionally or as a bug. One example where this is
    3895             :  *    done intentionally is fsync, as it does not reserve any transaction units
    3896             :  *    and ends up allocating a variable number of metadata extents for log
    3897             :  *    tree extent buffers;
    3898             :  *
    3899             :  * 4) The task has reserved enough transaction units / metadata space, but right
    3900             :  *    before it tries to allocate the last extent buffer it needs, a discard
    3901             :  *    operation comes in and, temporarily, removes the last free space entry from
    3902             :  *    the only metadata block group that had free space (discard starts by
    3903             :  *    removing a free space entry from a block group, then does the discard
    3904             :  *    operation and, once it's done, it adds back the free space entry to the
    3905             :  *    block group).
    3906             :  *
    3907             :  * We also need this 2-phase setup when adding a device to a filesystem with
    3908             :  * a seed device - we must create new metadata and system chunks without adding
    3909             :  * any of the block group items to the chunk, extent and device btrees. If we
    3910             :  * did not do it this way, we would get ENOSPC when attempting to update those
    3911             :  * btrees, since all the chunks from the seed device are read-only.
    3912             :  *
    3913             :  * Phase 1 does the updates and insertions to the chunk btree because if we had
    3914             :  * it done in phase 2 and have a thundering herd of tasks allocating chunks in
    3915             :  * parallel, we risk having too many system chunks allocated by many tasks if
    3916             :  * many tasks reach phase 1 without the previous ones completing phase 2. In the
    3917             :  * extreme case this leads to exhaustion of the system chunk array in the
    3918             :  * superblock. This is easier to trigger if using a btree node/leaf size of 64K
    3919             :  * and with RAID filesystems (so we have more device items in the chunk btree).
    3920             :  * This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
    3921             :  * the system chunk array due to concurrent allocations") provides more details.
    3922             :  *
    3923             :  * Allocation of system chunks does not happen through this function. A task that
    3924             :  * needs to update the chunk btree (the only btree that uses system chunks), must
    3925             :  * preallocate chunk space by calling either check_system_chunk() or
    3926             :  * btrfs_reserve_chunk_metadata() - the former is used when allocating a data or
    3927             :  * metadata chunk or when removing a chunk, while the latter is used before doing
    3928             :  * a modification to the chunk btree - use cases for the latter are adding,
    3929             :  * removing and resizing a device as well as relocation of a system chunk.
    3930             :  * See the comment below for more details.
    3931             :  *
    3932             :  * The reservation of system space, done through check_system_chunk(), as well
    3933             :  * as all the updates and insertions into the chunk btree must be done while
    3934             :  * holding fs_info->chunk_mutex. This is important to guarantee that while COWing
    3935             :  * an extent buffer from the chunks btree we never trigger allocation of a new
    3936             :  * system chunk, which would result in a deadlock (trying to lock an extent
    3937             :  * buffer of the chunk btree twice, the first time before triggering the chunk
    3938             :  * allocation and the second time during chunk allocation while attempting to
    3939             :  * update the chunks btree). The system chunk array is also updated while holding
    3940             :  * that mutex. The same logic applies to removing chunks - we must reserve system
    3941             :  * space, update the chunk btree and the system chunk array in the superblock
    3942             :  * while holding fs_info->chunk_mutex.
    3943             :  *
    3944             :  * This function, btrfs_chunk_alloc(), belongs to phase 1.
    3945             :  *
    3946             :  * If @force is CHUNK_ALLOC_FORCE:
    3947             :  *    - return 1 if it successfully allocates a chunk,
    3948             :  *    - return errors including -ENOSPC otherwise.
    3949             :  * If @force is NOT CHUNK_ALLOC_FORCE:
    3950             :  *    - return 0 if it doesn't need to allocate a new chunk,
    3951             :  *    - return 1 if it successfully allocates a chunk,
    3952             :  *    - return errors including -ENOSPC otherwise.
    3953             :  */
    3954           0 : int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
    3955             :                       enum btrfs_chunk_alloc_enum force)
    3956             : {
    3957           0 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    3958           0 :         struct btrfs_space_info *space_info;
    3959           0 :         struct btrfs_block_group *ret_bg;
    3960           0 :         bool wait_for_alloc = false;
    3961           0 :         bool should_alloc = false;
    3962           0 :         bool from_extent_allocation = false;
    3963           0 :         int ret = 0;
    3964             : 
    3965           0 :         if (force == CHUNK_ALLOC_FORCE_FOR_EXTENT) {
    3966           0 :                 from_extent_allocation = true;
    3967           0 :                 force = CHUNK_ALLOC_FORCE;
    3968             :         }
    3969             : 
    3970             :         /* Don't re-enter if we're already allocating a chunk */
    3971           0 :         if (trans->allocating_chunk)
    3972             :                 return -ENOSPC;
    3973             :         /*
    3974             :          * Allocation of system chunks cannot happen through this path, as we
    3975             :          * could end up in a deadlock if we are allocating a data or metadata
    3976             :          * chunk and there is another task modifying the chunk btree.
    3977             :          *
    3978             :          * This is because while we are holding the chunk mutex, we will attempt
    3979             :          * to add the new chunk item to the chunk btree or update an existing
    3980             :          * device item in the chunk btree, while the other task that is modifying
    3981             :          * the chunk btree is attempting to COW an extent buffer while holding a
    3982             :          * lock on it and on its parent - if the COW operation triggers a system
    3983             :          * chunk allocation, then we can deadlock because we are holding the
    3984             :          * chunk mutex and we may need to access that extent buffer or its parent
    3985             :          * in order to add the chunk item or update a device item.
    3986             :          *
    3987             :          * Tasks that want to modify the chunk tree should reserve system space
    3988             :          * before updating the chunk btree, by calling either
    3989             :          * btrfs_reserve_chunk_metadata() or check_system_chunk().
    3990             :          * It's possible that after a task reserves the space, it still ends up
    3991             :          * here - this happens in the cases described above at do_chunk_alloc().
    3992             :          * The task will have to either retry or fail.
    3993             :          */
    3994           0 :         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
    3995             :                 return -ENOSPC;
    3996             : 
    3997           0 :         space_info = btrfs_find_space_info(fs_info, flags);
    3998           0 :         ASSERT(space_info);
    3999             : 
    4000           0 :         do {
    4001           0 :                 spin_lock(&space_info->lock);
    4002           0 :                 if (force < space_info->force_alloc)
    4003             :                         force = space_info->force_alloc;
    4004           0 :                 should_alloc = should_alloc_chunk(fs_info, space_info, force);
    4005           0 :                 if (space_info->full) {
    4006             :                         /* No more free physical space */
    4007           0 :                         if (should_alloc)
    4008             :                                 ret = -ENOSPC;
    4009             :                         else
    4010           0 :                                 ret = 0;
    4011           0 :                         spin_unlock(&space_info->lock);
    4012           0 :                         return ret;
    4013           0 :                 } else if (!should_alloc) {
    4014           0 :                         spin_unlock(&space_info->lock);
    4015           0 :                         return 0;
    4016           0 :                 } else if (space_info->chunk_alloc) {
    4017             :                         /*
    4018             :                          * Someone is already allocating, so we need to block
    4019             :                          * until this someone is finished and then loop to
    4020             :                          * recheck if we should continue with our allocation
    4021             :                          * attempt.
    4022             :                          */
    4023           0 :                         wait_for_alloc = true;
    4024           0 :                         force = CHUNK_ALLOC_NO_FORCE;
    4025           0 :                         spin_unlock(&space_info->lock);
    4026           0 :                         mutex_lock(&fs_info->chunk_mutex);
    4027           0 :                         mutex_unlock(&fs_info->chunk_mutex);
    4028             :                 } else {
    4029             :                         /* Proceed with allocation */
    4030           0 :                         space_info->chunk_alloc = 1;
    4031           0 :                         wait_for_alloc = false;
    4032           0 :                         spin_unlock(&space_info->lock);
    4033             :                 }
    4034             : 
    4035           0 :                 cond_resched();
    4036           0 :         } while (wait_for_alloc);
    4037             : 
    4038           0 :         mutex_lock(&fs_info->chunk_mutex);
    4039           0 :         trans->allocating_chunk = true;
    4040             : 
    4041             :         /*
    4042             :          * If we have mixed data/metadata chunks we want to make sure we keep
    4043             :          * allocating mixed chunks instead of individual chunks.
    4044             :          */
    4045           0 :         if (btrfs_mixed_space_info(space_info))
    4046           0 :                 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
    4047             : 
    4048             :         /*
    4049             :          * if we're doing a data chunk, go ahead and make sure that
    4050             :          * we keep a reasonable number of metadata chunks allocated in the
    4051             :          * FS as well.
    4052             :          */
    4053           0 :         if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
    4054           0 :                 fs_info->data_chunk_allocations++;
    4055           0 :                 if (!(fs_info->data_chunk_allocations %
    4056             :                       fs_info->metadata_ratio))
    4057           0 :                         force_metadata_allocation(fs_info);
    4058             :         }
    4059             : 
    4060           0 :         ret_bg = do_chunk_alloc(trans, flags);
    4061           0 :         trans->allocating_chunk = false;
    4062             : 
    4063           0 :         if (IS_ERR(ret_bg)) {
    4064           0 :                 ret = PTR_ERR(ret_bg);
    4065           0 :         } else if (from_extent_allocation) {
    4066             :                 /*
    4067             :                  * New block group is likely to be used soon. Try to activate
    4068             :                  * it now. Failure is OK for now.
    4069             :                  */
    4070           0 :                 btrfs_zone_activate(ret_bg);
    4071             :         }
    4072             : 
    4073           0 :         if (!ret)
    4074           0 :                 btrfs_put_block_group(ret_bg);
    4075             : 
    4076           0 :         spin_lock(&space_info->lock);
    4077           0 :         if (ret < 0) {
    4078           0 :                 if (ret == -ENOSPC)
    4079           0 :                         space_info->full = 1;
    4080             :                 else
    4081           0 :                         goto out;
    4082             :         } else {
    4083           0 :                 ret = 1;
    4084           0 :                 space_info->max_extent_size = 0;
    4085             :         }
    4086             : 
    4087           0 :         space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
    4088           0 : out:
    4089           0 :         space_info->chunk_alloc = 0;
    4090           0 :         spin_unlock(&space_info->lock);
    4091           0 :         mutex_unlock(&fs_info->chunk_mutex);
    4092             : 
    4093           0 :         return ret;
    4094             : }
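
/*
 * Illustrative sketch (not part of block-group.c): the "wait for a
 * concurrent allocator" loop above. A task that finds chunk_alloc set blocks
 * on the chunk mutex (held by the allocator for the whole allocation),
 * releases it immediately, and loops to re-evaluate. A pthread rendition of
 * the same pattern with hypothetical names; locks are assumed initialized.
 */
#include <pthread.h>
#include <stdbool.h>

struct model_sinfo {
        pthread_spinlock_t lock;        /* stand-in for space_info->lock */
        bool chunk_alloc;
};

static void model_become_allocator(struct model_sinfo *s,
                                   pthread_mutex_t *chunk_mutex)
{
        bool wait;

        do {
                pthread_spin_lock(&s->lock);
                wait = s->chunk_alloc;
                if (!wait)
                        s->chunk_alloc = true;  /* we are the allocator now */
                pthread_spin_unlock(&s->lock);

                if (wait) {
                        /* Block until the current allocator is done. */
                        pthread_mutex_lock(chunk_mutex);
                        pthread_mutex_unlock(chunk_mutex);
                }
        } while (wait);
}
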
    4095             : 
    4096           0 : static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
    4097             : {
    4098           0 :         u64 num_dev;
    4099             : 
    4100           0 :         num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;
    4101           0 :         if (!num_dev)
    4102           0 :                 num_dev = fs_info->fs_devices->rw_devices;
    4103             : 
    4104           0 :         return num_dev;
    4105             : }
    4106             : 
    4107           0 : static void reserve_chunk_space(struct btrfs_trans_handle *trans,
    4108             :                                 u64 bytes,
    4109             :                                 u64 type)
    4110             : {
    4111           0 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    4112           0 :         struct btrfs_space_info *info;
    4113           0 :         u64 left;
    4114           0 :         int ret = 0;
    4115             : 
    4116             :         /*
    4117             :          * Needed because we can end up allocating a system chunk, and we need
    4118             :          * an atomic and race-free space reservation in the chunk block reserve.
    4119             :          */
    4120           0 :         lockdep_assert_held(&fs_info->chunk_mutex);
    4121             : 
    4122           0 :         info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
    4123           0 :         spin_lock(&info->lock);
    4124           0 :         left = info->total_bytes - btrfs_space_info_used(info, true);
    4125           0 :         spin_unlock(&info->lock);
    4126             : 
    4127           0 :         if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
    4128           0 :                 btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
    4129             :                            left, bytes, type);
    4130           0 :                 btrfs_dump_space_info(fs_info, info, 0, 0);
    4131             :         }
    4132             : 
    4133           0 :         if (left < bytes) {
    4134           0 :                 u64 flags = btrfs_system_alloc_profile(fs_info);
    4135           0 :                 struct btrfs_block_group *bg;
    4136             : 
    4137             :                 /*
    4138             :                  * Ignore failure to create system chunk. We might end up not
    4139             :                  * needing it, as we might not need to COW all nodes/leafs from
    4140             :                  * the paths we visit in the chunk tree (they were already COWed
    4141             :                  * or created in the current transaction for example).
    4142             :                  */
    4143           0 :                 bg = btrfs_create_chunk(trans, flags);
    4144           0 :                 if (IS_ERR(bg)) {
    4145           0 :                         ret = PTR_ERR(bg);
    4146             :                 } else {
    4147             :                         /*
    4148             :                          * We have a new chunk. We also need to activate it for
    4149             :                          * zoned filesystem.
    4150             :                          */
    4151           0 :                         ret = btrfs_zoned_activate_one_bg(fs_info, info, true);
    4152           0 :                         if (ret < 0)
    4153             :                                 return;
    4154             : 
    4155             :                         /*
    4156             :                          * If we fail to add the chunk item here, we end up
    4157             :                          * trying again at phase 2 of chunk allocation, at
    4158             :                          * btrfs_create_pending_block_groups(). So ignore
    4159             :                          * any error here. An ENOSPC here could happen, due to
    4160             :                          * the cases described at do_chunk_alloc() - the system
    4161             :                          * block group we just created was just turned into RO
    4162             :                          * mode by a scrub for example, or a running discard
    4163             :                          * temporarily removed its free space entries, etc.
    4164             :                          */
    4165           0 :                         btrfs_chunk_alloc_add_chunk_item(trans, bg);
    4166             :                 }
    4167             :         }
    4168             : 
    4169           0 :         if (!ret) {
    4170           0 :                 ret = btrfs_block_rsv_add(fs_info,
    4171             :                                           &fs_info->chunk_block_rsv,
    4172             :                                           bytes, BTRFS_RESERVE_NO_FLUSH);
    4173           0 :                 if (!ret)
    4174           0 :                         trans->chunk_bytes_reserved += bytes;
    4175             :         }
    4176             : }
    4177             : 
    4178             : /*
    4179             :  * Reserve space in the system space for allocating or removing a chunk.
    4180             :  * The caller must be holding fs_info->chunk_mutex.
    4181             :  */
    4182           0 : void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
    4183             : {
    4184           0 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    4185           0 :         const u64 num_devs = get_profile_num_devs(fs_info, type);
    4186           0 :         u64 bytes;
    4187             : 
    4188             :         /* num_devs device items to update and 1 chunk item to add or remove. */
    4189           0 :         bytes = btrfs_calc_metadata_size(fs_info, num_devs) +
    4190             :                 btrfs_calc_insert_metadata_size(fs_info, 1);
    4191             : 
    4192           0 :         reserve_chunk_space(trans, bytes, type);
    4193           0 : }
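
/*
 * Illustrative sketch (not part of block-group.c): the shape of the
 * reservation computed by check_system_chunk(). Both cost helpers below are
 * hypothetical stand-ins; in the kernel they derive from the btree node size
 * and the maximum tree height, with an insertion costed at roughly twice an
 * in-place update since it may split nodes on the way down (assumptions,
 * not quoted from the headers).
 */
#include <stdint.h>

#define MODEL_MAX_LEVEL 8       /* assumed maximum btree height */

static uint64_t model_update_cost(uint64_t nodesize, uint64_t num_items)
{
        return nodesize * MODEL_MAX_LEVEL * num_items;
}

static uint64_t model_insert_cost(uint64_t nodesize, uint64_t num_items)
{
        return 2 * model_update_cost(nodesize, num_items);
}

static uint64_t model_system_reservation(uint64_t nodesize, uint64_t num_devs)
{
        /* num_devs device items to update plus one chunk item to insert. */
        return model_update_cost(nodesize, num_devs) +
               model_insert_cost(nodesize, 1);
}
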
    4194             : 
    4195             : /*
    4196             :  * Reserve space in the system space, if needed, for doing a modification to the
    4197             :  * chunk btree.
    4198             :  *
    4199             :  * @trans:              A transaction handle.
    4200             :  * @is_item_insertion:  Indicate if the modification is for inserting a new item
    4201             :  *                      in the chunk btree or if it's for the deletion or update
    4202             :  *                      of an existing item.
    4203             :  *
    4204             :  * This is used in a context where we need to update the chunk btree outside
    4205             :  * block group allocation and removal, to avoid a deadlock with a concurrent
    4206             :  * task that is allocating a metadata or data block group and therefore needs to
    4207             :  * update the chunk btree while holding the chunk mutex. After the update to the
    4208             :  * chunk btree is done, btrfs_trans_release_chunk_metadata() should be called.
    4209             :  *
    4210             :  */
    4211           0 : void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
    4212             :                                   bool is_item_insertion)
    4213             : {
    4214           0 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    4215           0 :         u64 bytes;
    4216             : 
    4217           0 :         if (is_item_insertion)
    4218           0 :                 bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
    4219             :         else
    4220           0 :                 bytes = btrfs_calc_metadata_size(fs_info, 1);
    4221             : 
    4222           0 :         mutex_lock(&fs_info->chunk_mutex);
    4223           0 :         reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM);
    4224           0 :         mutex_unlock(&fs_info->chunk_mutex);
    4225           0 : }
    4226             : 
    4227           0 : void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
    4228             : {
    4229           0 :         struct btrfs_block_group *block_group;
    4230             : 
    4231           0 :         block_group = btrfs_lookup_first_block_group(info, 0);
    4232           0 :         while (block_group) {
    4233           0 :                 btrfs_wait_block_group_cache_done(block_group);
    4234           0 :                 spin_lock(&block_group->lock);
    4235           0 :                 if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF,
    4236           0 :                                        &block_group->runtime_flags)) {
    4237           0 :                         struct inode *inode = block_group->inode;
    4238             : 
    4239           0 :                         block_group->inode = NULL;
    4240           0 :                         spin_unlock(&block_group->lock);
    4241             : 
    4242           0 :                         ASSERT(block_group->io_ctl.inode == NULL);
    4243           0 :                         iput(inode);
    4244             :                 } else {
    4245           0 :                         spin_unlock(&block_group->lock);
    4246             :                 }
    4247           0 :                 block_group = btrfs_next_block_group(block_group);
    4248             :         }
    4249           0 : }
    4250             : 
    4251             : /*
    4252             :  * Must be called only after stopping all workers, since we could have block
    4253             :  * group caching kthreads running, and therefore they could race with us if we
    4254             :  * freed the block groups before stopping them.
    4255             :  */
    4256           0 : int btrfs_free_block_groups(struct btrfs_fs_info *info)
    4257             : {
    4258           0 :         struct btrfs_block_group *block_group;
    4259           0 :         struct btrfs_space_info *space_info;
    4260           0 :         struct btrfs_caching_control *caching_ctl;
    4261           0 :         struct rb_node *n;
    4262             : 
    4263           0 :         write_lock(&info->block_group_cache_lock);
    4264           0 :         while (!list_empty(&info->caching_block_groups)) {
    4265           0 :                 caching_ctl = list_entry(info->caching_block_groups.next,
    4266             :                                          struct btrfs_caching_control, list);
    4267           0 :                 list_del(&caching_ctl->list);
    4268           0 :                 btrfs_put_caching_control(caching_ctl);
    4269             :         }
    4270           0 :         write_unlock(&info->block_group_cache_lock);
    4271             : 
    4272           0 :         spin_lock(&info->unused_bgs_lock);
    4273           0 :         while (!list_empty(&info->unused_bgs)) {
    4274           0 :                 block_group = list_first_entry(&info->unused_bgs,
    4275             :                                                struct btrfs_block_group,
    4276             :                                                bg_list);
    4277           0 :                 list_del_init(&block_group->bg_list);
    4278           0 :                 btrfs_put_block_group(block_group);
    4279             :         }
    4280             : 
    4281           0 :         while (!list_empty(&info->reclaim_bgs)) {
    4282           0 :                 block_group = list_first_entry(&info->reclaim_bgs,
    4283             :                                                struct btrfs_block_group,
    4284             :                                                bg_list);
    4285           0 :                 list_del_init(&block_group->bg_list);
    4286           0 :                 btrfs_put_block_group(block_group);
    4287             :         }
    4288           0 :         spin_unlock(&info->unused_bgs_lock);
    4289             : 
    4290           0 :         spin_lock(&info->zone_active_bgs_lock);
    4291           0 :         while (!list_empty(&info->zone_active_bgs)) {
    4292           0 :                 block_group = list_first_entry(&info->zone_active_bgs,
    4293             :                                                struct btrfs_block_group,
    4294             :                                                active_bg_list);
    4295           0 :                 list_del_init(&block_group->active_bg_list);
    4296           0 :                 btrfs_put_block_group(block_group);
    4297             :         }
    4298           0 :         spin_unlock(&info->zone_active_bgs_lock);
    4299             : 
    4300           0 :         write_lock(&info->block_group_cache_lock);
    4301           0 :         while ((n = rb_last(&info->block_group_cache_tree.rb_root)) != NULL) {
    4302           0 :                 block_group = rb_entry(n, struct btrfs_block_group,
    4303             :                                        cache_node);
    4304           0 :                 rb_erase_cached(&block_group->cache_node,
    4305             :                                 &info->block_group_cache_tree);
    4306           0 :                 RB_CLEAR_NODE(&block_group->cache_node);
    4307           0 :                 write_unlock(&info->block_group_cache_lock);
    4308             : 
    4309           0 :                 down_write(&block_group->space_info->groups_sem);
    4310           0 :                 list_del(&block_group->list);
    4311           0 :                 up_write(&block_group->space_info->groups_sem);
    4312             : 
    4313             :                 /*
    4314             :                  * We haven't cached this block group, which means we could
    4315             :                  * possibly have excluded extents on this block group.
    4316             :                  */
    4317           0 :                 if (block_group->cached == BTRFS_CACHE_NO ||
    4318             :                     block_group->cached == BTRFS_CACHE_ERROR)
    4319           0 :                         btrfs_free_excluded_extents(block_group);
    4320             : 
    4321           0 :                 btrfs_remove_free_space_cache(block_group);
    4322           0 :                 ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
    4323           0 :                 ASSERT(list_empty(&block_group->dirty_list));
    4324           0 :                 ASSERT(list_empty(&block_group->io_list));
    4325           0 :                 ASSERT(list_empty(&block_group->bg_list));
    4326           0 :                 ASSERT(refcount_read(&block_group->refs) == 1);
    4327           0 :                 ASSERT(block_group->swap_extents == 0);
    4328           0 :                 btrfs_put_block_group(block_group);
    4329             : 
    4330           0 :                 write_lock(&info->block_group_cache_lock);
    4331             :         }
    4332           0 :         write_unlock(&info->block_group_cache_lock);
    4333             : 
    4334           0 :         btrfs_release_global_block_rsv(info);
    4335             : 
    4336           0 :         while (!list_empty(&info->space_info)) {
    4337           0 :                 space_info = list_entry(info->space_info.next,
    4338             :                                         struct btrfs_space_info,
    4339             :                                         list);
    4340             : 
    4341             :                 /*
    4342             :                  * Do not hide this behind enospc_debug; this is actually
    4343             :                  * important and indicates a real bug if this happens.
    4344             :                  */
    4345           0 :                 if (WARN_ON(space_info->bytes_pinned > 0 ||
    4346             :                             space_info->bytes_may_use > 0))
    4347           0 :                         btrfs_dump_space_info(info, space_info, 0, 0);
    4348             : 
    4349             :                 /*
    4350             :                  * If there was a failure to cleanup a log tree, very likely due
    4351             :                  * to an IO failure on a writeback attempt of one or more of its
    4352             :                  * extent buffers, we could not do proper (and cheap) unaccounting
    4353             :                  * of their reserved space, so don't warn on bytes_reserved > 0 in
    4354             :                  * that case.
    4355             :                  */
    4356           0 :                 if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) ||
    4357           0 :                     !BTRFS_FS_LOG_CLEANUP_ERROR(info)) {
    4358           0 :                         if (WARN_ON(space_info->bytes_reserved > 0))
    4359           0 :                                 btrfs_dump_space_info(info, space_info, 0, 0);
    4360             :                 }
    4361             : 
    4362           0 :                 WARN_ON(space_info->reclaim_size > 0);
    4363           0 :                 list_del(&space_info->list);
    4364           0 :                 btrfs_sysfs_remove_space_info(space_info);
    4365             :         }
    4366           0 :         return 0;
    4367             : }
    4368             : 
    4369           0 : void btrfs_freeze_block_group(struct btrfs_block_group *cache)
    4370             : {
    4371           0 :         atomic_inc(&cache->frozen);
    4372           0 : }
    4373             : 
    4374           0 : void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
    4375             : {
    4376           0 :         struct btrfs_fs_info *fs_info = block_group->fs_info;
    4377           0 :         struct extent_map_tree *em_tree;
    4378           0 :         struct extent_map *em;
    4379           0 :         bool cleanup;
    4380             : 
    4381           0 :         spin_lock(&block_group->lock);
    4382           0 :         cleanup = (atomic_dec_and_test(&block_group->frozen) &&
    4383           0 :                    test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags));
    4384           0 :         spin_unlock(&block_group->lock);
    4385             : 
    4386           0 :         if (cleanup) {
    4387           0 :                 em_tree = &fs_info->mapping_tree;
    4388           0 :                 write_lock(&em_tree->lock);
    4389           0 :                 em = lookup_extent_mapping(em_tree, block_group->start,
    4390             :                                            1);
    4391           0 :                 BUG_ON(!em); /* logic error, can't happen */
    4392           0 :                 remove_extent_mapping(em_tree, em);
    4393           0 :                 write_unlock(&em_tree->lock);
    4394             : 
    4395             :                 /* once for us and once for the tree */
    4396           0 :                 free_extent_map(em);
    4397           0 :                 free_extent_map(em);
    4398             : 
    4399             :                 /*
    4400             :                  * We may have left one free space entry, and other tasks
    4401             :                  * trimming this block group may have left one entry each.
    4402             :                  * Free them if any.
    4403             :                  */
    4404           0 :                 btrfs_remove_free_space_cache(block_group);
    4405             :         }
    4406           0 : }
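
/*
 * Illustrative sketch (not part of block-group.c): freeze/unfreeze is a
 * "last one out cleans up" refcount. A C11-atomics rendition with
 * hypothetical names; note the kernel reads both the counter and the
 * REMOVED flag under the block group's spinlock, while this lock-free
 * sketch tolerates a benign ordering caveat for brevity.
 */
#include <stdatomic.h>
#include <stdbool.h>

struct model_bg {
        atomic_int frozen;
        atomic_bool removed;
};

static void model_freeze(struct model_bg *bg)
{
        atomic_fetch_add(&bg->frozen, 1);
}

static void model_unfreeze(struct model_bg *bg,
                           void (*cleanup)(struct model_bg *))
{
        /* fetch_sub returns the old value: 1 means we dropped the last ref. */
        bool last = (atomic_fetch_sub(&bg->frozen, 1) == 1);

        if (last && atomic_load(&bg->removed))
                cleanup(bg);
}
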
    4407             : 
    4408           0 : bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg)
    4409             : {
    4410           0 :         bool ret = true;
    4411             : 
    4412           0 :         spin_lock(&bg->lock);
    4413           0 :         if (bg->ro)
    4414             :                 ret = false;
    4415             :         else
    4416           0 :                 bg->swap_extents++;
    4417           0 :         spin_unlock(&bg->lock);
    4418             : 
    4419           0 :         return ret;
    4420             : }
    4421             : 
    4422           0 : void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount)
    4423             : {
    4424           0 :         spin_lock(&bg->lock);
    4425           0 :         ASSERT(!bg->ro);
    4426           0 :         ASSERT(bg->swap_extents >= amount);
    4427           0 :         bg->swap_extents -= amount;
    4428           0 :         spin_unlock(&bg->lock);
    4429           0 : }
    4430             : 
    4431           0 : enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size)
    4432             : {
    4433           0 :         if (size <= SZ_128K)
    4434             :                 return BTRFS_BG_SZ_SMALL;
    4435           0 :         if (size <= SZ_8M)
    4436           0 :                 return BTRFS_BG_SZ_MEDIUM;
    4437             :         return BTRFS_BG_SZ_LARGE;
    4438             : }
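
/*
 * Illustrative sketch (not part of block-group.c): the size class cutoffs
 * above, restated as a tiny self-check. Allocations up to 128K are "small",
 * up to 8M "medium", anything larger "large"; a 4M extent therefore lands
 * in the medium class.
 */
#include <assert.h>
#include <stdint.h>

enum model_size_class { MODEL_SMALL, MODEL_MEDIUM, MODEL_LARGE };

static enum model_size_class model_calc_size_class(uint64_t size)
{
        if (size <= (128ULL << 10))
                return MODEL_SMALL;
        if (size <= (8ULL << 20))
                return MODEL_MEDIUM;
        return MODEL_LARGE;
}

int main(void)
{
        assert(model_calc_size_class(4ULL << 10) == MODEL_SMALL);   /* 4K  */
        assert(model_calc_size_class(4ULL << 20) == MODEL_MEDIUM);  /* 4M  */
        assert(model_calc_size_class(64ULL << 20) == MODEL_LARGE);  /* 64M */
        return 0;
}
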
    4439             : 
    4440             : /*
    4441             :  * Handle a block group allocating an extent in a size class
    4442             :  *
    4443             :  * @bg:                         The block group we allocated in.
    4444             :  * @size_class:                 The size class of the allocation.
    4445             :  * @force_wrong_size_class:     Whether we are desperate enough to allow
    4446             :  *                              mismatched size classes.
    4447             :  *
    4448             :  * Returns: 0 if the size class was valid for this block_group, -EAGAIN in the
    4449             :  * case of a race that leads to the wrong size class without
    4450             :  * force_wrong_size_class set.
    4451             :  *
    4452             :  * find_free_extent will skip block groups with a mismatched size class until
    4453             :  * it really needs to avoid ENOSPC. In that case it will set
    4454             :  * force_wrong_size_class. However, if a block group is newly allocated and
    4455             :  * doesn't yet have a size class, then it is possible for two allocations of
    4456             :  * different sizes to race and both try to use it. The loser is caught here and
    4457             :  * has to retry.
    4458             :  */
    4459           0 : int btrfs_use_block_group_size_class(struct btrfs_block_group *bg,
    4460             :                                      enum btrfs_block_group_size_class size_class,
    4461             :                                      bool force_wrong_size_class)
    4462             : {
    4463           0 :         ASSERT(size_class != BTRFS_BG_SZ_NONE);
    4464             : 
    4465             :         /* The new allocation is in the right size class, do nothing */
    4466           0 :         if (bg->size_class == size_class)
    4467             :                 return 0;
    4468             :         /*
    4469             :          * The new allocation is in a mismatched size class.
    4470             :          * This means one of two things:
    4471             :          *
    4472             :          * 1. Two tasks in find_free_extent for different size_classes raced
    4473             :          *    and hit the same empty block_group. Make the loser try again.
    4474             :          * 2. A call to find_free_extent got desperate enough to set
    4475             :          *    'force_wrong_size_class'. Don't change the size_class, but allow the
    4476             :          *    allocation.
    4477             :          */
    4478           0 :         if (bg->size_class != BTRFS_BG_SZ_NONE) {
    4479           0 :                 if (force_wrong_size_class)
    4480             :                         return 0;
    4481           0 :                 return -EAGAIN;
    4482             :         }
    4483             :         /*
    4484             :          * The happy new block group case: the new allocation is the first
    4485             :          * one in the block_group so we set size_class.
    4486             :          */
    4487           0 :         bg->size_class = size_class;
    4488             : 
    4489           0 :         return 0;
    4490             : }
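
/*
 * Illustrative sketch (not part of block-group.c): the three-way outcome of
 * btrfs_use_block_group_size_class() as a compact decision function with
 * hypothetical names. Callers such as btrfs_add_reserved_bytes() treat the
 * error case as "lost the race, pick another block group and retry".
 */
enum model_class { MODEL_NONE, MODEL_S, MODEL_M, MODEL_L };

static int model_use_size_class(enum model_class *bg_class,
                                enum model_class wanted, int force)
{
        if (*bg_class == wanted)
                return 0;               /* matching class: nothing to do    */
        if (*bg_class != MODEL_NONE)
                return force ? 0 : -1;  /* mismatch: -EAGAIN unless forced  */
        *bg_class = wanted;             /* fresh group: claim its class     */
        return 0;
}
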
    4491             : 
    4492           0 : bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg)
    4493             : {
    4494           0 :         if (btrfs_is_zoned(bg->fs_info))
    4495             :                 return false;
    4496           0 :         if (!btrfs_is_block_group_data_only(bg))
    4497           0 :                 return false;
    4498             :         return true;
    4499             : }

Generated by: LCOV version 1.14