LCOV - code coverage report
Current view: top level - fs/btrfs - volumes.c (source / functions)
Test: fstests of 6.5.0-rc3-djwx @ Mon Jul 31 20:08:22 PDT 2023
Date: 2023-07-31 20:08:22
                     Hit     Total    Coverage
Lines:               2317    4011     57.8 %
Functions:           111     162      68.5 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0
       2             : /*
       3             :  * Copyright (C) 2007 Oracle.  All rights reserved.
       4             :  */
       5             : 
       6             : #include <linux/sched.h>
       7             : #include <linux/sched/mm.h>
       8             : #include <linux/slab.h>
       9             : #include <linux/ratelimit.h>
      10             : #include <linux/kthread.h>
      11             : #include <linux/semaphore.h>
      12             : #include <linux/uuid.h>
      13             : #include <linux/list_sort.h>
      14             : #include <linux/namei.h>
      15             : #include "misc.h"
      16             : #include "ctree.h"
      17             : #include "extent_map.h"
      18             : #include "disk-io.h"
      19             : #include "transaction.h"
      20             : #include "print-tree.h"
      21             : #include "volumes.h"
      22             : #include "raid56.h"
      23             : #include "rcu-string.h"
      24             : #include "dev-replace.h"
      25             : #include "sysfs.h"
      26             : #include "tree-checker.h"
      27             : #include "space-info.h"
      28             : #include "block-group.h"
      29             : #include "discard.h"
      30             : #include "zoned.h"
      31             : #include "fs.h"
      32             : #include "accessors.h"
      33             : #include "uuid-tree.h"
      34             : #include "ioctl.h"
      35             : #include "relocation.h"
      36             : #include "scrub.h"
      37             : #include "super.h"
      38             : 
      39             : #define BTRFS_BLOCK_GROUP_STRIPE_MASK   (BTRFS_BLOCK_GROUP_RAID0 | \
      40             :                                          BTRFS_BLOCK_GROUP_RAID10 | \
      41             :                                          BTRFS_BLOCK_GROUP_RAID56_MASK)
      42             : 
      43             : const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
      44             :         [BTRFS_RAID_RAID10] = {
      45             :                 .sub_stripes    = 2,
      46             :                 .dev_stripes    = 1,
      47             :                 .devs_max       = 0,    /* 0 == as many as possible */
      48             :                 .devs_min       = 2,
      49             :                 .tolerated_failures = 1,
      50             :                 .devs_increment = 2,
      51             :                 .ncopies        = 2,
      52             :                 .nparity        = 0,
      53             :                 .raid_name      = "raid10",
      54             :                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID10,
      55             :                 .mindev_error   = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
      56             :         },
      57             :         [BTRFS_RAID_RAID1] = {
      58             :                 .sub_stripes    = 1,
      59             :                 .dev_stripes    = 1,
      60             :                 .devs_max       = 2,
      61             :                 .devs_min       = 2,
      62             :                 .tolerated_failures = 1,
      63             :                 .devs_increment = 2,
      64             :                 .ncopies        = 2,
      65             :                 .nparity        = 0,
      66             :                 .raid_name      = "raid1",
      67             :                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID1,
      68             :                 .mindev_error   = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
      69             :         },
      70             :         [BTRFS_RAID_RAID1C3] = {
      71             :                 .sub_stripes    = 1,
      72             :                 .dev_stripes    = 1,
      73             :                 .devs_max       = 3,
      74             :                 .devs_min       = 3,
      75             :                 .tolerated_failures = 2,
      76             :                 .devs_increment = 3,
      77             :                 .ncopies        = 3,
      78             :                 .nparity        = 0,
      79             :                 .raid_name      = "raid1c3",
      80             :                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID1C3,
      81             :                 .mindev_error   = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
      82             :         },
      83             :         [BTRFS_RAID_RAID1C4] = {
      84             :                 .sub_stripes    = 1,
      85             :                 .dev_stripes    = 1,
      86             :                 .devs_max       = 4,
      87             :                 .devs_min       = 4,
      88             :                 .tolerated_failures = 3,
      89             :                 .devs_increment = 4,
      90             :                 .ncopies        = 4,
      91             :                 .nparity        = 0,
      92             :                 .raid_name      = "raid1c4",
      93             :                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID1C4,
      94             :                 .mindev_error   = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
      95             :         },
      96             :         [BTRFS_RAID_DUP] = {
      97             :                 .sub_stripes    = 1,
      98             :                 .dev_stripes    = 2,
      99             :                 .devs_max       = 1,
     100             :                 .devs_min       = 1,
     101             :                 .tolerated_failures = 0,
     102             :                 .devs_increment = 1,
     103             :                 .ncopies        = 2,
     104             :                 .nparity        = 0,
     105             :                 .raid_name      = "dup",
     106             :                 .bg_flag        = BTRFS_BLOCK_GROUP_DUP,
     107             :                 .mindev_error   = 0,
     108             :         },
     109             :         [BTRFS_RAID_RAID0] = {
     110             :                 .sub_stripes    = 1,
     111             :                 .dev_stripes    = 1,
     112             :                 .devs_max       = 0,
     113             :                 .devs_min       = 1,
     114             :                 .tolerated_failures = 0,
     115             :                 .devs_increment = 1,
     116             :                 .ncopies        = 1,
     117             :                 .nparity        = 0,
     118             :                 .raid_name      = "raid0",
     119             :                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID0,
     120             :                 .mindev_error   = 0,
     121             :         },
     122             :         [BTRFS_RAID_SINGLE] = {
     123             :                 .sub_stripes    = 1,
     124             :                 .dev_stripes    = 1,
     125             :                 .devs_max       = 1,
     126             :                 .devs_min       = 1,
     127             :                 .tolerated_failures = 0,
     128             :                 .devs_increment = 1,
     129             :                 .ncopies        = 1,
     130             :                 .nparity        = 0,
     131             :                 .raid_name      = "single",
     132             :                 .bg_flag        = 0,
     133             :                 .mindev_error   = 0,
     134             :         },
     135             :         [BTRFS_RAID_RAID5] = {
     136             :                 .sub_stripes    = 1,
     137             :                 .dev_stripes    = 1,
     138             :                 .devs_max       = 0,
     139             :                 .devs_min       = 2,
     140             :                 .tolerated_failures = 1,
     141             :                 .devs_increment = 1,
     142             :                 .ncopies        = 1,
     143             :                 .nparity        = 1,
     144             :                 .raid_name      = "raid5",
     145             :                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID5,
     146             :                 .mindev_error   = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
     147             :         },
     148             :         [BTRFS_RAID_RAID6] = {
     149             :                 .sub_stripes    = 1,
     150             :                 .dev_stripes    = 1,
     151             :                 .devs_max       = 0,
     152             :                 .devs_min       = 3,
     153             :                 .tolerated_failures = 2,
     154             :                 .devs_increment = 1,
     155             :                 .ncopies        = 1,
     156             :                 .nparity        = 2,
     157             :                 .raid_name      = "raid6",
     158             :                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID6,
     159             :                 .mindev_error   = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
     160             :         },
     161             : };
     162             : 
     163             : /*
     164             :  * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
      165             :  * can be used as an index to access btrfs_raid_array[].
     166             :  */
     167   169986167 : enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
     168             : {
     169   169986167 :         const u64 profile = (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK);
     170             : 
     171   169986167 :         if (!profile)
     172             :                 return BTRFS_RAID_SINGLE;
     173             : 
     174   128685513 :         return BTRFS_BG_FLAG_TO_INDEX(profile);
     175             : }
     176             : 
     177        9615 : const char *btrfs_bg_type_to_raid_name(u64 flags)
     178             : {
     179        9615 :         const int index = btrfs_bg_flags_to_raid_index(flags);
     180             : 
     181        9615 :         if (index >= BTRFS_NR_RAID_TYPES)
     182             :                 return NULL;
     183             : 
     184        9615 :         return btrfs_raid_array[index].raid_name;
     185             : }
     186             : 
     187           0 : int btrfs_nr_parity_stripes(u64 type)
     188             : {
     189    14786463 :         enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(type);
     190             : 
     191    14786463 :         return btrfs_raid_array[index].nparity;
     192             : }
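
    All three helpers above reduce to an index into btrfs_raid_array[]:
    btrfs_bg_flags_to_raid_index() maps the profile bit in the block-group flags
    to a btrfs_raid_types value, and callers then read raid_name, ncopies or
    nparity from the table. A minimal userspace sketch of this table-lookup
    pattern, using made-up flag values and a simplified mapping rather than the
    kernel's BTRFS_BG_FLAG_TO_INDEX():

    #include <stdio.h>
    #include <stdint.h>

    /* Hypothetical profile bits; the real BTRFS_BLOCK_GROUP_* values differ. */
    #define BG_RAID0  (1ULL << 3)
    #define BG_RAID1  (1ULL << 4)
    #define BG_RAID10 (1ULL << 6)
    #define BG_PROFILE_MASK (BG_RAID0 | BG_RAID1 | BG_RAID10)

    enum raid_index { IDX_SINGLE, IDX_RAID0, IDX_RAID1, IDX_RAID10, NR_IDX };

    struct raid_attr { const char *name; int ncopies; int nparity; };

    static const struct raid_attr raid_array[NR_IDX] = {
        [IDX_SINGLE] = { "single", 1, 0 },
        [IDX_RAID0]  = { "raid0",  1, 0 },
        [IDX_RAID1]  = { "raid1",  2, 0 },
        [IDX_RAID10] = { "raid10", 2, 0 },
    };

    /* No profile bit set means "single", mirroring the kernel helper. */
    static enum raid_index flags_to_index(uint64_t flags)
    {
        uint64_t profile = flags & BG_PROFILE_MASK;

        if (!profile)
            return IDX_SINGLE;
        if (profile & BG_RAID0)
            return IDX_RAID0;
        if (profile & BG_RAID1)
            return IDX_RAID1;
        return IDX_RAID10;
    }

    int main(void)
    {
        enum raid_index i = flags_to_index(BG_RAID1);

        printf("%s: ncopies=%d nparity=%d\n",
               raid_array[i].name, raid_array[i].ncopies, raid_array[i].nparity);
        return 0;
    }

    The real kernel helper derives the index directly from the bit position of
    the profile flag; the if-chain here is only for illustration.
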
     193             : 
     194             : /*
     195             :  * Fill @buf with textual description of @bg_flags, no more than @size_buf
     196             :  * bytes including terminating null byte.
     197             :  */
     198         521 : void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
     199             : {
     200         521 :         int i;
     201         521 :         int ret;
     202         521 :         char *bp = buf;
     203         521 :         u64 flags = bg_flags;
     204         521 :         u32 size_bp = size_buf;
     205             : 
     206         521 :         if (!flags) {
     207           0 :                 strcpy(bp, "NONE");
     208           0 :                 return;
     209             :         }
     210             : 
     211             : #define DESCRIBE_FLAG(flag, desc)                                               \
     212             :         do {                                                            \
     213             :                 if (flags & (flag)) {                                       \
     214             :                         ret = snprintf(bp, size_bp, "%s|", (desc));   \
     215             :                         if (ret < 0 || ret >= size_bp)                    \
     216             :                                 goto out_overflow;                      \
     217             :                         size_bp -= ret;                                 \
     218             :                         bp += ret;                                      \
     219             :                         flags &= ~(flag);                           \
     220             :                 }                                                       \
     221             :         } while (0)
     222             : 
     223         521 :         DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
     224         521 :         DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
     225         521 :         DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
     226             : 
     227         521 :         DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
     228        5210 :         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
     229        4689 :                 DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
     230             :                               btrfs_raid_array[i].raid_name);
     231             : #undef DESCRIBE_FLAG
     232             : 
     233         521 :         if (flags) {
     234           0 :                 ret = snprintf(bp, size_bp, "0x%llx|", flags);
     235           0 :                 size_bp -= ret;
     236             :         }
     237             : 
     238         521 :         if (size_bp < size_buf)
     239         521 :                 buf[size_buf - size_bp - 1] = '\0'; /* remove last | */
     240             : 
     241             :         /*
      242             :          * The text is trimmed; it's up to the caller to provide a
      243             :          * sufficiently large buffer.
     244             :          */
     245           0 : out_overflow:;
     246             : }
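
    The DESCRIBE_FLAG() macro above is an append-with-bounds-check idiom: each
    matched flag is appended as "name|", the write pointer and the remaining
    size are advanced by snprintf()'s return value, and the trailing '|' is
    finally overwritten with a NUL. A small self-contained sketch of the same
    idiom, with hypothetical helper names:

    #include <stdio.h>
    #include <string.h>

    /* Append "desc|" to the buffer, tracking the remaining space. */
    static int append_desc(char **bp, size_t *size_bp, const char *desc)
    {
        int ret = snprintf(*bp, *size_bp, "%s|", desc);

        if (ret < 0 || (size_t)ret >= *size_bp)
            return -1;          /* would overflow, caller stops appending */
        *bp += ret;
        *size_bp -= ret;
        return 0;
    }

    int main(void)
    {
        char buf[32];
        char *bp = buf;
        size_t size_bp = sizeof(buf);

        if (append_desc(&bp, &size_bp, "data") == 0 &&
            append_desc(&bp, &size_bp, "raid1") == 0) {
            /* Drop the trailing '|', as the kernel function does. */
            if (size_bp < sizeof(buf))
                buf[sizeof(buf) - size_bp - 1] = '\0';
        }
        printf("%s\n", buf);    /* prints "data|raid1" */
        return 0;
    }
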
     247             : 
     248             : static int init_first_rw_device(struct btrfs_trans_handle *trans);
     249             : static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
     250             : static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
     251             : 
     252             : /*
     253             :  * Device locking
     254             :  * ==============
     255             :  *
     256             :  * There are several mutexes that protect manipulation of devices and low-level
     257             :  * structures like chunks but not block groups, extents or files
     258             :  *
     259             :  * uuid_mutex (global lock)
     260             :  * ------------------------
     261             :  * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
     262             :  * the SCAN_DEV ioctl registration or from mount either implicitly (the first
     263             :  * device) or requested by the device= mount option
     264             :  *
     265             :  * the mutex can be very coarse and can cover long-running operations
     266             :  *
     267             :  * protects: updates to fs_devices counters like missing devices, rw devices,
     268             :  * seeding, structure cloning, opening/closing devices at mount/umount time
     269             :  *
     270             :  * global::fs_devs - add, remove, updates to the global list
     271             :  *
     272             :  * does not protect: manipulation of the fs_devices::devices list in general
      273             :  * but in mount context it could be used to exclude list modifications by
      274             :  * e.g. the scan ioctl
     275             :  *
     276             :  * btrfs_device::name - renames (write side), read is RCU
     277             :  *
     278             :  * fs_devices::device_list_mutex (per-fs, with RCU)
     279             :  * ------------------------------------------------
     280             :  * protects updates to fs_devices::devices, ie. adding and deleting
     281             :  *
     282             :  * simple list traversal with read-only actions can be done with RCU protection
     283             :  *
     284             :  * may be used to exclude some operations from running concurrently without any
     285             :  * modifications to the list (see write_all_supers)
     286             :  *
     287             :  * Is not required at mount and close times, because our device list is
     288             :  * protected by the uuid_mutex at that point.
     289             :  *
     290             :  * balance_mutex
     291             :  * -------------
     292             :  * protects balance structures (status, state) and context accessed from
     293             :  * several places (internally, ioctl)
     294             :  *
     295             :  * chunk_mutex
     296             :  * -----------
     297             :  * protects chunks, adding or removing during allocation, trim or when a new
     298             :  * device is added/removed. Additionally it also protects post_commit_list of
     299             :  * individual devices, since they can be added to the transaction's
     300             :  * post_commit_list only with chunk_mutex held.
     301             :  *
     302             :  * cleaner_mutex
     303             :  * -------------
     304             :  * a big lock that is held by the cleaner thread and prevents running subvolume
     305             :  * cleaning together with relocation or delayed iputs
     306             :  *
     307             :  *
     308             :  * Lock nesting
     309             :  * ============
     310             :  *
     311             :  * uuid_mutex
     312             :  *   device_list_mutex
     313             :  *     chunk_mutex
     314             :  *   balance_mutex
     315             :  *
     316             :  *
     317             :  * Exclusive operations
     318             :  * ====================
     319             :  *
     320             :  * Maintains the exclusivity of the following operations that apply to the
     321             :  * whole filesystem and cannot run in parallel.
     322             :  *
     323             :  * - Balance (*)
     324             :  * - Device add
     325             :  * - Device remove
     326             :  * - Device replace (*)
     327             :  * - Resize
     328             :  *
     329             :  * The device operations (as above) can be in one of the following states:
     330             :  *
     331             :  * - Running state
     332             :  * - Paused state
     333             :  * - Completed state
     334             :  *
     335             :  * Only device operations marked with (*) can go into the Paused state for the
     336             :  * following reasons:
     337             :  *
     338             :  * - ioctl (only Balance can be Paused through ioctl)
     339             :  * - filesystem remounted as read-only
     340             :  * - filesystem unmounted and mounted as read-only
     341             :  * - system power-cycle and filesystem mounted as read-only
     342             :  * - filesystem or device errors leading to forced read-only
     343             :  *
     344             :  * The status of exclusive operation is set and cleared atomically.
     345             :  * During the course of Paused state, fs_info::exclusive_operation remains set.
     346             :  * A device operation in Paused or Running state can be canceled or resumed
     347             :  * either by ioctl (Balance only) or when remounted as read-write.
     348             :  * The exclusive status is cleared when the device operation is canceled or
     349             :  * completed.
     350             :  */
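
    The nesting diagram above is the usual deadlock-avoidance rule: whenever
    several of these mutexes are needed, they are taken in the listed
    outer-to-inner order. A minimal pthread sketch of the same discipline, with
    illustrative lock names that only stand in for the kernel mutexes:

    #include <pthread.h>

    static pthread_mutex_t uuid_lock    = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t devlist_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t chunk_lock   = PTHREAD_MUTEX_INITIALIZER;

    /* Every code path takes the locks in the same outer-to-inner order. */
    static void update_devices_and_chunks(void)
    {
        pthread_mutex_lock(&uuid_lock);
        pthread_mutex_lock(&devlist_lock);
        pthread_mutex_lock(&chunk_lock);

        /* ... modify device list and chunk state ... */

        pthread_mutex_unlock(&chunk_lock);
        pthread_mutex_unlock(&devlist_lock);
        pthread_mutex_unlock(&uuid_lock);
    }

    int main(void)
    {
        update_devices_and_chunks();
        return 0;
    }
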
     351             : 
     352             : DEFINE_MUTEX(uuid_mutex);
     353             : static LIST_HEAD(fs_uuids);
     354           0 : struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
     355             : {
     356           0 :         return &fs_uuids;
     357             : }
     358             : 
     359             : /*
     360             :  * alloc_fs_devices - allocate struct btrfs_fs_devices
     361             :  * @fsid:               if not NULL, copy the UUID to fs_devices::fsid
     362             :  * @metadata_fsid:      if not NULL, copy the UUID to fs_devices::metadata_fsid
     363             :  *
     364             :  * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
     365             :  * The returned struct is not linked onto any lists and can be destroyed with
     366             :  * kfree() right away.
     367             :  */
     368        3264 : static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
     369             :                                                  const u8 *metadata_fsid)
     370             : {
     371        3264 :         struct btrfs_fs_devices *fs_devs;
     372             : 
     373        3264 :         ASSERT(fsid || !metadata_fsid);
     374             : 
     375        3264 :         fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
     376        3264 :         if (!fs_devs)
     377             :                 return ERR_PTR(-ENOMEM);
     378             : 
     379        3264 :         mutex_init(&fs_devs->device_list_mutex);
     380             : 
     381        3264 :         INIT_LIST_HEAD(&fs_devs->devices);
     382        3264 :         INIT_LIST_HEAD(&fs_devs->alloc_list);
     383        3264 :         INIT_LIST_HEAD(&fs_devs->fs_list);
     384        3264 :         INIT_LIST_HEAD(&fs_devs->seed_list);
     385             : 
     386        3264 :         if (fsid) {
     387        6528 :                 memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
     388        6528 :                 memcpy(fs_devs->metadata_uuid,
     389             :                        metadata_fsid ?: fsid, BTRFS_FSID_SIZE);
     390             :         }
     391             : 
     392             :         return fs_devs;
     393             : }
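
    As the comment says, alloc_fs_devices() reports allocation failure with
    ERR_PTR(-ENOMEM), the kernel convention of packing an errno value into the
    returned pointer so callers test it with IS_ERR()/PTR_ERR() (from
    <linux/err.h>). A userspace sketch of that convention with simplified
    stand-in helpers:

    #include <stdio.h>
    #include <stdlib.h>
    #include <errno.h>

    /* Simplified userspace stand-ins for the kernel's ERR_PTR/IS_ERR/PTR_ERR. */
    #define MAX_ERRNO 4095
    static inline void *ERR_PTR(long err)     { return (void *)err; }
    static inline long PTR_ERR(const void *p) { return (long)p; }
    static inline int IS_ERR(const void *p)
    {
        return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
    }

    struct fs_devices { int opened; };

    static struct fs_devices *alloc_fs_devices_sketch(void)
    {
        struct fs_devices *fs_devs = calloc(1, sizeof(*fs_devs));

        if (!fs_devs)
            return ERR_PTR(-ENOMEM);
        return fs_devs;
    }

    int main(void)
    {
        struct fs_devices *fs_devs = alloc_fs_devices_sketch();

        if (IS_ERR(fs_devs)) {
            fprintf(stderr, "alloc failed: %ld\n", PTR_ERR(fs_devs));
            return 1;
        }
        free(fs_devs);
        return 0;
    }
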
     394             : 
     395        3254 : static void btrfs_free_device(struct btrfs_device *device)
     396             : {
     397        3254 :         WARN_ON(!list_empty(&device->post_commit_list));
     398        3254 :         rcu_string_free(device->name);
     399        3254 :         extent_io_tree_release(&device->alloc_state);
     400        3254 :         btrfs_destroy_dev_zone_info(device);
     401        3254 :         kfree(device);
     402        3254 : }
     403             : 
     404        3253 : static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
     405             : {
     406        3253 :         struct btrfs_device *device;
     407             : 
     408        3253 :         WARN_ON(fs_devices->opened);
     409        6493 :         while (!list_empty(&fs_devices->devices)) {
     410        3240 :                 device = list_entry(fs_devices->devices.next,
     411             :                                     struct btrfs_device, dev_list);
     412        3240 :                 list_del(&device->dev_list);
     413        3240 :                 btrfs_free_device(device);
     414             :         }
     415        3253 :         kfree(fs_devices);
     416        3253 : }
     417             : 
     418           0 : void __exit btrfs_cleanup_fs_uuids(void)
     419             : {
     420           0 :         struct btrfs_fs_devices *fs_devices;
     421             : 
     422           0 :         while (!list_empty(&fs_uuids)) {
     423           0 :                 fs_devices = list_entry(fs_uuids.next,
     424             :                                         struct btrfs_fs_devices, fs_list);
     425           0 :                 list_del(&fs_devices->fs_list);
     426           0 :                 free_fs_devices(fs_devices);
     427             :         }
     428           0 : }
     429             : 
     430        6192 : static bool match_fsid_fs_devices(const struct btrfs_fs_devices *fs_devices,
     431             :                                   const u8 *fsid, const u8 *metadata_fsid)
     432             : {
     433       12384 :         if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) != 0)
     434             :                 return false;
     435             : 
     436        3476 :         if (!metadata_fsid)
     437             :                 return true;
     438             : 
     439           0 :         if (memcmp(metadata_fsid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE) != 0)
     440           0 :                 return false;
     441             : 
     442             :         return true;
     443             : }
     444             : 
     445        6740 : static noinline struct btrfs_fs_devices *find_fsid(
     446             :                 const u8 *fsid, const u8 *metadata_fsid)
     447             : {
     448        6740 :         struct btrfs_fs_devices *fs_devices;
     449             : 
     450        6740 :         ASSERT(fsid);
     451             : 
     452             :         /* Handle non-split brain cases */
     453        9456 :         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
     454        6192 :                 if (match_fsid_fs_devices(fs_devices, fsid, metadata_fsid))
     455        3476 :                         return fs_devices;
     456             :         }
     457             :         return NULL;
     458             : }
     459             : 
     460             : /*
     461             :  * First check if the metadata_uuid is different from the fsid in the given
     462             :  * fs_devices. Then check if the given fsid is the same as the metadata_uuid
     463             :  * in the fs_devices. If it is, return true; otherwise, return false.
     464             :  */
     465           0 : static inline bool check_fsid_changed(const struct btrfs_fs_devices *fs_devices,
     466             :                                       const u8 *fsid)
     467             : {
     468           0 :         return memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
     469           0 :                       BTRFS_FSID_SIZE) != 0 &&
     470             :                memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE) == 0;
     471             : }
     472             : 
     473           0 : static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
     474             :                                 struct btrfs_super_block *disk_super)
     475             : {
     476             : 
     477           0 :         struct btrfs_fs_devices *fs_devices;
     478             : 
     479             :         /*
     480             :          * Handle scanned device having completed its fsid change but
     481             :          * belonging to a fs_devices that was created by first scanning
     482             :          * a device which didn't have its fsid/metadata_uuid changed
     483             :          * at all and the CHANGING_FSID_V2 flag set.
     484             :          */
     485           0 :         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
     486           0 :                 if (!fs_devices->fsid_change)
     487           0 :                         continue;
     488             : 
     489           0 :                 if (match_fsid_fs_devices(fs_devices, disk_super->metadata_uuid,
     490           0 :                                           fs_devices->fsid))
     491           0 :                         return fs_devices;
     492             :         }
     493             : 
     494             :         /*
     495             :          * Handle scanned device having completed its fsid change but
     496             :          * belonging to a fs_devices that was created by a device that
     497             :          * has an outdated pair of fsid/metadata_uuid and
     498             :          * CHANGING_FSID_V2 flag set.
     499             :          */
     500           0 :         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
     501           0 :                 if (!fs_devices->fsid_change)
     502           0 :                         continue;
     503             : 
     504           0 :                 if (check_fsid_changed(fs_devices, disk_super->metadata_uuid))
     505           0 :                         return fs_devices;
     506             :         }
     507             : 
     508           0 :         return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
     509             : }
     510             : 
     511             : 
     512             : static int
     513        3242 : btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
     514             :                       int flush, struct block_device **bdev,
     515             :                       struct btrfs_super_block **disk_super)
     516             : {
     517        3242 :         int ret;
     518             : 
     519        3242 :         *bdev = blkdev_get_by_path(device_path, flags, holder, NULL);
     520             : 
     521        3242 :         if (IS_ERR(*bdev)) {
     522           0 :                 ret = PTR_ERR(*bdev);
     523           0 :                 goto error;
     524             :         }
     525             : 
     526        3242 :         if (flush)
     527        3242 :                 sync_blockdev(*bdev);
     528        3242 :         ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
     529        3242 :         if (ret) {
     530           0 :                 blkdev_put(*bdev, holder);
     531           0 :                 goto error;
     532             :         }
     533        3242 :         invalidate_bdev(*bdev);
     534        3242 :         *disk_super = btrfs_read_dev_super(*bdev);
     535        3242 :         if (IS_ERR(*disk_super)) {
     536           0 :                 ret = PTR_ERR(*disk_super);
     537           0 :                 blkdev_put(*bdev, holder);
     538           0 :                 goto error;
     539             :         }
     540             : 
     541             :         return 0;
     542             : 
     543           0 : error:
     544           0 :         *bdev = NULL;
     545           0 :         return ret;
     546             : }
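
    btrfs_get_bdev_and_sb() follows the common goto-unwind style: every failure
    after the block device is opened releases what was acquired and jumps to a
    single error label that clears the output pointer. A small userspace sketch
    of the same unwind pattern, using a FILE handle as a stand-in for the block
    device (names and the demo path are illustrative):

    #include <stdio.h>
    #include <errno.h>

    /* Open a file and read a fixed-size header, unwinding on any failure. */
    static int get_file_and_header(const char *path, FILE **fp,
                                   char *hdr, size_t len)
    {
        int ret;

        *fp = fopen(path, "rb");
        if (!*fp) {
            ret = -errno;
            goto error;
        }

        if (fread(hdr, 1, len, *fp) != len) {
            ret = -EIO;
            fclose(*fp);
            goto error;
        }

        return 0;

    error:
        *fp = NULL;
        return ret;
    }

    int main(void)
    {
        FILE *fp;
        char hdr[8];

        if (get_file_and_header("/bin/sh", &fp, hdr, sizeof(hdr)) == 0)
            fclose(fp);
        return 0;
    }
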
     547             : 
     548             : /*
     549             :  *  Search and remove all stale devices (which are not mounted).  When both
     550             :  *  inputs are NULL, it will search and release all stale devices.
     551             :  *
      552             :  *  @devt:         Optional. When provided, it will release all unmounted
      553             :  *                 devices matching this devt only.
     554             :  *  @skip_device:  Optional. Will skip this device when searching for the stale
     555             :  *                 devices.
     556             :  *
     557             :  *  Return:     0 for success or if @devt is 0.
     558             :  *              -EBUSY if @devt is a mounted device.
     559             :  *              -ENOENT if @devt does not match any device in the list.
     560             :  */
     561        3270 : static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device)
     562             : {
     563        3270 :         struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
     564        3270 :         struct btrfs_device *device, *tmp_device;
     565        3270 :         int ret = 0;
     566             : 
     567        3270 :         lockdep_assert_held(&uuid_mutex);
     568             : 
     569        3270 :         if (devt)
     570        3265 :                 ret = -ENOENT;
     571             : 
     572        9257 :         list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
     573             : 
     574        5987 :                 mutex_lock(&fs_devices->device_list_mutex);
     575       12386 :                 list_for_each_entry_safe(device, tmp_device,
     576             :                                          &fs_devices->devices, dev_list) {
     577        6404 :                         if (skip_device && skip_device == device)
     578        3265 :                                 continue;
     579        3139 :                         if (devt && devt != device->devt)
     580        3120 :                                 continue;
     581          19 :                         if (fs_devices->opened) {
     582             :                                 /* for an already deleted device return 0 */
     583           5 :                                 if (devt && ret != 0)
     584           0 :                                         ret = -EBUSY;
     585             :                                 break;
     586             :                         }
     587             : 
     588             :                         /* delete the stale device */
     589          14 :                         fs_devices->num_devices--;
     590          14 :                         list_del(&device->dev_list);
     591          14 :                         btrfs_free_device(device);
     592             : 
     593          14 :                         ret = 0;
     594             :                 }
     595        5987 :                 mutex_unlock(&fs_devices->device_list_mutex);
     596             : 
     597        5987 :                 if (fs_devices->num_devices == 0) {
     598          13 :                         btrfs_sysfs_remove_fsid(fs_devices);
     599          13 :                         list_del(&fs_devices->fs_list);
     600          13 :                         free_fs_devices(fs_devices);
     601             :                 }
     602             :         }
     603             : 
     604        3270 :         return ret;
     605             : }
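
    Because btrfs_free_stale_devices() deletes entries while it walks the
    lists, it uses the _safe list iterators so the next element is fetched
    before the current one is freed. A userspace sketch of the same
    delete-while-walking idea on a plain singly linked list (a
    pointer-to-pointer walk instead of the kernel list helpers), including the
    0 / -ENOENT return convention documented above:

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct dev_node {
        int devt;
        struct dev_node *next;
    };

    /*
     * Remove every node matching devt.  Returns 0 if at least one node was
     * removed, -ENOENT if nothing matched.
     */
    static int free_stale(struct dev_node **head, int devt)
    {
        struct dev_node **pp = head;
        int ret = -ENOENT;

        while (*pp) {
            struct dev_node *cur = *pp;

            if (cur->devt == devt) {
                *pp = cur->next;    /* unlink before freeing */
                free(cur);
                ret = 0;
            } else {
                pp = &cur->next;
            }
        }
        return ret;
    }

    int main(void)
    {
        struct dev_node *head = NULL;

        for (int devt = 3; devt >= 1; devt--) {
            struct dev_node *n = malloc(sizeof(*n));

            n->devt = devt;
            n->next = head;
            head = n;
        }
        printf("remove devt 2: %d\n", free_stale(&head, 2));   /* 0 */
        printf("remove devt 9: %d\n", free_stale(&head, 9));   /* -ENOENT */
        while (head) {
            struct dev_node *next = head->next;

            free(head);
            head = next;
        }
        return 0;
    }
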
     606             : 
     607             : /*
     608             :  * This is only used on mount, and we are protected from competing things
     609             :  * messing with our fs_devices by the uuid_mutex, thus we do not need the
     610             :  * fs_devices->device_list_mutex here.
     611             :  */
     612        3242 : static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
     613             :                         struct btrfs_device *device, blk_mode_t flags,
     614             :                         void *holder)
     615             : {
     616        3242 :         struct block_device *bdev;
     617        3242 :         struct btrfs_super_block *disk_super;
     618        3242 :         u64 devid;
     619        3242 :         int ret;
     620             : 
     621        3242 :         if (device->bdev)
     622             :                 return -EINVAL;
     623        3242 :         if (!device->name)
     624             :                 return -EINVAL;
     625             : 
     626        3242 :         ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
     627             :                                     &bdev, &disk_super);
     628        3242 :         if (ret)
     629             :                 return ret;
     630             : 
     631        3242 :         devid = btrfs_stack_device_id(&disk_super->dev_item);
     632        3242 :         if (devid != device->devid)
     633           0 :                 goto error_free_page;
     634             : 
     635        6484 :         if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
     636           0 :                 goto error_free_page;
     637             : 
     638        3242 :         device->generation = btrfs_super_generation(disk_super);
     639             : 
     640        3242 :         if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
     641           0 :                 if (btrfs_super_incompat_flags(disk_super) &
     642             :                     BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
     643           0 :                         pr_err(
     644             :                 "BTRFS: Invalid seeding and uuid-changed device detected\n");
     645           0 :                         goto error_free_page;
     646             :                 }
     647             : 
     648           0 :                 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
     649           0 :                 fs_devices->seeding = true;
     650             :         } else {
     651        3242 :                 if (bdev_read_only(bdev))
     652           0 :                         clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
     653             :                 else
     654        3242 :                         set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
     655             :         }
     656             : 
     657        3242 :         if (!bdev_nonrot(bdev))
     658        3225 :                 fs_devices->rotating = true;
     659             : 
     660        3242 :         if (bdev_max_discard_sectors(bdev))
     661        3234 :                 fs_devices->discardable = true;
     662             : 
     663        3242 :         device->bdev = bdev;
     664        3242 :         clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
     665        3242 :         device->holder = holder;
     666             : 
     667        3242 :         fs_devices->open_devices++;
     668        6484 :         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
     669        3242 :             device->devid != BTRFS_DEV_REPLACE_DEVID) {
     670        3242 :                 fs_devices->rw_devices++;
     671        3242 :                 list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
     672             :         }
     673        3242 :         btrfs_release_disk_super(disk_super);
     674             : 
     675        3242 :         return 0;
     676             : 
     677           0 : error_free_page:
     678           0 :         btrfs_release_disk_super(disk_super);
     679           0 :         blkdev_put(bdev, holder);
     680             : 
     681           0 :         return -EINVAL;
     682             : }
     683             : 
     684             : /*
     685             :  * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
      686             :  * being created with a disk that has already completed its fsid change. Such a
      687             :  * disk can belong to an fs which has its FSID changed or to one which doesn't.
     688             :  * Handle both cases here.
     689             :  */
     690           0 : static struct btrfs_fs_devices *find_fsid_inprogress(
     691             :                                         struct btrfs_super_block *disk_super)
     692             : {
     693           0 :         struct btrfs_fs_devices *fs_devices;
     694             : 
     695           0 :         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
     696           0 :                 if (fs_devices->fsid_change)
     697           0 :                         continue;
     698             : 
     699           0 :                 if (check_fsid_changed(fs_devices,  disk_super->fsid))
     700           0 :                         return fs_devices;
     701             :         }
     702             : 
     703           0 :         return find_fsid(disk_super->fsid, NULL);
     704             : }
     705             : 
     706           0 : static struct btrfs_fs_devices *find_fsid_changed(
     707             :                                         struct btrfs_super_block *disk_super)
     708             : {
     709           0 :         struct btrfs_fs_devices *fs_devices;
     710             : 
     711             :         /*
     712             :          * Handles the case where scanned device is part of an fs that had
      713             :          * Handles the case where the scanned device is part of an fs that had
      714             :          * multiple successful changes of FSID but the current device didn't
      715             :          * observe it, meaning our fsid will be different than theirs. We need
     716             :          *  1 - The fs still continues to have different METADATA/FSID uuids.
     717             :          *  2 - The fs is switched back to its original FSID (METADATA/FSID
     718             :          *  are equal).
     719             :          */
     720           0 :         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
     721             :                 /* Changed UUIDs */
     722           0 :                 if (check_fsid_changed(fs_devices, disk_super->metadata_uuid) &&
     723           0 :                     memcmp(fs_devices->fsid, disk_super->fsid,
     724             :                            BTRFS_FSID_SIZE) != 0)
     725           0 :                         return fs_devices;
     726             : 
     727             :                 /* Unchanged UUIDs */
     728           0 :                 if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
     729           0 :                            BTRFS_FSID_SIZE) == 0 &&
     730             :                     memcmp(fs_devices->fsid, disk_super->metadata_uuid,
     731             :                            BTRFS_FSID_SIZE) == 0)
     732           0 :                         return fs_devices;
     733             :         }
     734             : 
     735             :         return NULL;
     736             : }
     737             : 
     738        6740 : static struct btrfs_fs_devices *find_fsid_reverted_metadata(
     739             :                                 struct btrfs_super_block *disk_super)
     740             : {
     741        6740 :         struct btrfs_fs_devices *fs_devices;
     742             : 
     743             :         /*
     744             :          * Handle the case where the scanned device is part of an fs whose last
     745             :          * metadata UUID change reverted it to the original FSID. At the same
     746             :          * time fs_devices was first created by another constituent device
     747             :          * which didn't fully observe the operation. This results in an
     748             :          * btrfs_fs_devices created with metadata/fsid different AND
     749             :          * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
     750             :          * fs_devices equal to the FSID of the disk.
     751             :          */
     752       17186 :         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
     753       10446 :                 if (!fs_devices->fsid_change)
     754       10446 :                         continue;
     755             : 
     756           0 :                 if (check_fsid_changed(fs_devices, disk_super->fsid))
     757           0 :                         return fs_devices;
     758             :         }
     759             : 
     760             :         return NULL;
     761             : }
     762             : /*
     763             :  * Add new device to list of registered devices
     764             :  *
     765             :  * Returns:
     766             :  * device pointer which was just added or updated when successful
     767             :  * error pointer when failed
     768             :  */
     769        6740 : static noinline struct btrfs_device *device_list_add(const char *path,
     770             :                            struct btrfs_super_block *disk_super,
     771             :                            bool *new_device_added)
     772             : {
     773        6740 :         struct btrfs_device *device;
     774        6740 :         struct btrfs_fs_devices *fs_devices = NULL;
     775        6740 :         struct rcu_string *name;
     776        6740 :         u64 found_transid = btrfs_super_generation(disk_super);
     777        6740 :         u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
     778        6740 :         dev_t path_devt;
     779        6740 :         int error;
     780        6740 :         bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
     781             :                 BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
     782        6740 :         bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
     783             :                                         BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
     784             : 
     785        6740 :         error = lookup_bdev(path, &path_devt);
     786        6740 :         if (error) {
     787           0 :                 btrfs_err(NULL, "failed to lookup block device for path %s: %d",
     788             :                           path, error);
     789           0 :                 return ERR_PTR(error);
     790             :         }
     791             : 
     792        6740 :         if (fsid_change_in_progress) {
     793           0 :                 if (!has_metadata_uuid)
     794           0 :                         fs_devices = find_fsid_inprogress(disk_super);
     795             :                 else
     796           0 :                         fs_devices = find_fsid_changed(disk_super);
     797        6740 :         } else if (has_metadata_uuid) {
     798           0 :                 fs_devices = find_fsid_with_metadata_uuid(disk_super);
     799             :         } else {
     800        6740 :                 fs_devices = find_fsid_reverted_metadata(disk_super);
     801        6740 :                 if (!fs_devices)
     802        6740 :                         fs_devices = find_fsid(disk_super->fsid, NULL);
     803             :         }
     804             : 
     805             : 
     806        6740 :         if (!fs_devices) {
     807        3264 :                 fs_devices = alloc_fs_devices(disk_super->fsid,
     808             :                                 has_metadata_uuid ? disk_super->metadata_uuid : NULL);
     809        3264 :                 if (IS_ERR(fs_devices))
     810             :                         return ERR_CAST(fs_devices);
     811             : 
     812        3264 :                 fs_devices->fsid_change = fsid_change_in_progress;
     813             : 
     814        3264 :                 mutex_lock(&fs_devices->device_list_mutex);
     815        3264 :                 list_add(&fs_devices->fs_list, &fs_uuids);
     816             : 
     817        3264 :                 device = NULL;
     818             :         } else {
     819        3476 :                 struct btrfs_dev_lookup_args args = {
     820             :                         .devid = devid,
     821        3476 :                         .uuid = disk_super->dev_item.uuid,
     822             :                 };
     823             : 
     824        3476 :                 mutex_lock(&fs_devices->device_list_mutex);
     825        3476 :                 device = btrfs_find_device(fs_devices, &args);
     826             : 
     827             :                 /*
     828             :                  * If this disk has been pulled into an fs devices created by
     829             :                  * a device which had the CHANGING_FSID_V2 flag then replace the
     830             :                  * metadata_uuid/fsid values of the fs_devices.
     831             :                  */
     832        3476 :                 if (fs_devices->fsid_change &&
     833           0 :                     found_transid > fs_devices->latest_generation) {
     834           0 :                         memcpy(fs_devices->fsid, disk_super->fsid,
     835             :                                         BTRFS_FSID_SIZE);
     836             : 
     837           0 :                         if (has_metadata_uuid)
     838           0 :                                 memcpy(fs_devices->metadata_uuid,
     839             :                                        disk_super->metadata_uuid,
     840             :                                        BTRFS_FSID_SIZE);
     841             :                         else
     842           0 :                                 memcpy(fs_devices->metadata_uuid,
     843             :                                        disk_super->fsid, BTRFS_FSID_SIZE);
     844             : 
     845           0 :                         fs_devices->fsid_change = false;
     846             :                 }
     847             :         }
     848             : 
     849        6740 :         if (!device) {
     850        3265 :                 unsigned int nofs_flag;
     851             : 
     852        3265 :                 if (fs_devices->opened) {
     853           0 :                         btrfs_err(NULL,
     854             :                 "device %s belongs to fsid %pU, and the fs is already mounted",
     855             :                                   path, fs_devices->fsid);
     856           0 :                         mutex_unlock(&fs_devices->device_list_mutex);
     857           0 :                         return ERR_PTR(-EBUSY);
     858             :                 }
     859             : 
     860        3265 :                 nofs_flag = memalloc_nofs_save();
     861        3265 :                 device = btrfs_alloc_device(NULL, &devid,
     862        3265 :                                             disk_super->dev_item.uuid, path);
     863        3265 :                 memalloc_nofs_restore(nofs_flag);
     864        3265 :                 if (IS_ERR(device)) {
     865           0 :                         mutex_unlock(&fs_devices->device_list_mutex);
     866             :                         /* we can safely leave the fs_devices entry around */
     867           0 :                         return device;
     868             :                 }
     869             : 
     870        3265 :                 device->devt = path_devt;
     871             : 
     872        3265 :                 list_add_rcu(&device->dev_list, &fs_devices->devices);
     873        3265 :                 fs_devices->num_devices++;
     874             : 
     875        3265 :                 device->fs_devices = fs_devices;
     876        3265 :                 *new_device_added = true;
     877             : 
     878        3265 :                 if (disk_super->label[0])
     879           2 :                         pr_info(
     880             :         "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
     881             :                                 disk_super->label, devid, found_transid, path,
     882             :                                 current->comm, task_pid_nr(current));
     883             :                 else
     884        3263 :                         pr_info(
     885             :         "BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
     886             :                                 disk_super->fsid, devid, found_transid, path,
     887             :                                 current->comm, task_pid_nr(current));
     888             : 
     889        3475 :         } else if (!device->name || strcmp(device->name->str, path)) {
     890             :                 /*
     891             :                  * When FS is already mounted.
     892             :                  * 1. If you are here and if the device->name is NULL that
     893             :                  *    means this device was missing at time of FS mount.
     894             :                  * 2. If you are here and if the device->name is different
     895             :                  *    from 'path' that means either
     896             :                  *      a. The same device disappeared and reappeared with
     897             :                  *         different name. or
      898             :                  *      b. The missing-disk-which-was-replaced has
     899             :                  *         reappeared now.
     900             :                  *
      901             :                  * We must allow 1 and 2a above, but 2b would be spurious
      902             :                  * and unintentional.
     903             :                  *
     904             :                  * Further in case of 1 and 2a above, the disk at 'path'
     905             :                  * would have missed some transaction when it was away and
     906             :                  * in case of 2a the stale bdev has to be updated as well.
      907             :                  * 2b must not be allowed at any time.
     908             :                  */
     909             : 
     910             :                 /*
     911             :                  * For now, we do allow update to btrfs_fs_device through the
     912             :                  * btrfs dev scan cli after FS has been mounted.  We're still
     913             :                  * tracking a problem where systems fail mount by subvolume id
     914             :                  * when we reject replacement on a mounted FS.
     915             :                  */
     916        1214 :                 if (!fs_devices->opened && found_transid < device->generation) {
     917             :                         /*
     918             :                          * That is if the FS is _not_ mounted and if you
      919             :                          * That is, if the FS is _not_ mounted and if you
      920             :                          * are here, that means there is more than one
      921             :                          * disk with the same uuid and devid. We keep the one
      922             :                          * with the larger generation number, or the last-in
      923             :                          * if the generations are equal.
     924           0 :                         mutex_unlock(&fs_devices->device_list_mutex);
     925           0 :                         btrfs_err(NULL,
     926             : "device %s already registered with a higher generation, found %llu expect %llu",
     927             :                                   path, found_transid, device->generation);
     928           0 :                         return ERR_PTR(-EEXIST);
     929             :                 }
     930             : 
     931             :                 /*
     932             :                  * We are going to replace the device path for a given devid,
     933             :                  * make sure it's the same device if the device is mounted
     934             :                  *
     935             :                  * NOTE: the device->fs_info may not be reliable here so pass
     936             :                  * in a NULL to message helpers instead. This avoids a possible
     937             :                  * use-after-free when the fs_info and fs_info->sb are already
     938             :                  * torn down.
     939             :                  */
     940        1214 :                 if (device->bdev) {
     941         669 :                         if (device->devt != path_devt) {
     942           3 :                                 mutex_unlock(&fs_devices->device_list_mutex);
     943           3 :                                 btrfs_warn_in_rcu(NULL,
     944             :         "duplicate device %s devid %llu generation %llu scanned by %s (%d)",
     945             :                                                   path, devid, found_transid,
     946             :                                                   current->comm,
     947             :                                                   task_pid_nr(current));
     948           3 :                                 return ERR_PTR(-EEXIST);
     949             :                         }
     950         666 :                         btrfs_info_in_rcu(NULL,
     951             :         "devid %llu device path %s changed to %s scanned by %s (%d)",
     952             :                                           devid, btrfs_dev_name(device),
     953             :                                           path, current->comm,
     954             :                                           task_pid_nr(current));
     955             :                 }
     956             : 
     957        1211 :                 name = rcu_string_strdup(path, GFP_NOFS);
     958        1211 :                 if (!name) {
     959           0 :                         mutex_unlock(&fs_devices->device_list_mutex);
     960           0 :                         return ERR_PTR(-ENOMEM);
     961             :                 }
     962        1211 :                 rcu_string_free(device->name);
     963        1211 :                 rcu_assign_pointer(device->name, name);
     964        2422 :                 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
     965           0 :                         fs_devices->missing_devices--;
     966           0 :                         clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
     967             :                 }
     968        1211 :                 device->devt = path_devt;
     969             :         }
     970             : 
     971             :         /*
      972             :          * Unmount does not free the btrfs_device struct but zeroes the
      973             :          * generation along with most of the other members. So just update
      974             :          * it back here. We need it to pick the disk with the largest
      975             :          * generation (as above).
     976             :          */
     977        6737 :         if (!fs_devices->opened) {
     978        5557 :                 device->generation = found_transid;
     979        5557 :                 fs_devices->latest_generation = max_t(u64, found_transid,
     980             :                                                 fs_devices->latest_generation);
     981             :         }
     982             : 
     983        6737 :         fs_devices->total_devices = btrfs_super_num_devices(disk_super);
     984             : 
     985        6737 :         mutex_unlock(&fs_devices->device_list_mutex);
     986        6737 :         return device;
     987             : }
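The generation comparison in device_list_add() decides which record survives when the same devid is scanned more than once while the filesystem is unmounted: a scan carrying a lower found_transid than the stored generation is rejected, and on a tie the most recently scanned path replaces the stored one. Below is a minimal userspace sketch of that selection rule, using hypothetical names (struct scanned_dev, record_scan) rather than the kernel types.

	#include <stdint.h>
	#include <stdio.h>

	/* Hypothetical stand-in for the registered-device record. */
	struct scanned_dev {
		char     path[64];
		uint64_t generation;	/* found_transid from the superblock */
	};

	/*
	 * Keep the record with the larger generation; on equal generations the
	 * last-scanned path wins, mirroring the unmounted-FS branch above.
	 */
	static void record_scan(struct scanned_dev *cur, const char *path,
				uint64_t found_transid)
	{
		if (found_transid < cur->generation) {
			fprintf(stderr, "%s: generation %llu too old, keeping %s\n",
				path, (unsigned long long)found_transid, cur->path);
			return;
		}
		snprintf(cur->path, sizeof(cur->path), "%s", path);
		cur->generation = found_transid;
	}

	int main(void)
	{
		struct scanned_dev dev = { "/dev/sdb", 100 };

		record_scan(&dev, "/dev/sdc", 99);	/* rejected: stale copy */
		record_scan(&dev, "/dev/sdd", 100);	/* tie: last-in wins */
		printf("kept %s @ gen %llu\n", dev.path,
		       (unsigned long long)dev.generation);
		return 0;
	}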
     988             : 
     989           0 : static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
     990             : {
     991           0 :         struct btrfs_fs_devices *fs_devices;
     992           0 :         struct btrfs_device *device;
     993           0 :         struct btrfs_device *orig_dev;
     994           0 :         int ret = 0;
     995             : 
     996           0 :         lockdep_assert_held(&uuid_mutex);
     997             : 
     998           0 :         fs_devices = alloc_fs_devices(orig->fsid, NULL);
     999           0 :         if (IS_ERR(fs_devices))
    1000             :                 return fs_devices;
    1001             : 
    1002           0 :         fs_devices->total_devices = orig->total_devices;
    1003             : 
    1004           0 :         list_for_each_entry(orig_dev, &orig->devices, dev_list) {
    1005           0 :                 const char *dev_path = NULL;
    1006             : 
    1007             :                 /*
     1008             :                  * This is ok to do without the RCU read lock held because we
     1009             :                  * hold the uuid_mutex, so nothing we touch here can disappear.
    1010             :                  */
    1011           0 :                 if (orig_dev->name)
    1012           0 :                         dev_path = orig_dev->name->str;
    1013             : 
    1014           0 :                 device = btrfs_alloc_device(NULL, &orig_dev->devid,
    1015           0 :                                             orig_dev->uuid, dev_path);
    1016           0 :                 if (IS_ERR(device)) {
    1017           0 :                         ret = PTR_ERR(device);
    1018           0 :                         goto error;
    1019             :                 }
    1020             : 
    1021           0 :                 if (orig_dev->zone_info) {
    1022           0 :                         struct btrfs_zoned_device_info *zone_info;
    1023             : 
    1024           0 :                         zone_info = btrfs_clone_dev_zone_info(orig_dev);
    1025           0 :                         if (!zone_info) {
    1026           0 :                                 btrfs_free_device(device);
    1027           0 :                                 ret = -ENOMEM;
    1028           0 :                                 goto error;
    1029             :                         }
    1030           0 :                         device->zone_info = zone_info;
    1031             :                 }
    1032             : 
    1033           0 :                 list_add(&device->dev_list, &fs_devices->devices);
    1034           0 :                 device->fs_devices = fs_devices;
    1035           0 :                 fs_devices->num_devices++;
    1036             :         }
    1037             :         return fs_devices;
    1038           0 : error:
    1039           0 :         free_fs_devices(fs_devices);
    1040           0 :         return ERR_PTR(ret);
    1041             : }
    1042             : 
    1043        3216 : static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
    1044             :                                       struct btrfs_device **latest_dev)
    1045             : {
    1046        3216 :         struct btrfs_device *device, *next;
    1047             : 
    1048             :         /* This is the initialized path, it is safe to release the devices. */
    1049        6433 :         list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
    1050        6434 :                 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
    1051        3217 :                         if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
    1052        6434 :                                       &device->dev_state) &&
    1053           0 :                             !test_bit(BTRFS_DEV_STATE_MISSING,
    1054        3217 :                                       &device->dev_state) &&
    1055        3217 :                             (!*latest_dev ||
    1056           1 :                              device->generation > (*latest_dev)->generation)) {
    1057        3216 :                                 *latest_dev = device;
    1058             :                         }
    1059        3217 :                         continue;
    1060             :                 }
    1061             : 
    1062             :                 /*
     1063             :                  * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID
     1064             :                  * in btrfs_init_dev_replace(), so just continue.
    1065             :                  */
    1066           0 :                 if (device->devid == BTRFS_DEV_REPLACE_DEVID)
    1067           0 :                         continue;
    1068             : 
    1069           0 :                 if (device->bdev) {
    1070           0 :                         blkdev_put(device->bdev, device->holder);
    1071           0 :                         device->bdev = NULL;
    1072           0 :                         fs_devices->open_devices--;
    1073             :                 }
    1074           0 :                 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
    1075           0 :                         list_del_init(&device->dev_alloc_list);
    1076           0 :                         clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
    1077           0 :                         fs_devices->rw_devices--;
    1078             :                 }
    1079           0 :                 list_del_init(&device->dev_list);
    1080           0 :                 fs_devices->num_devices--;
    1081           0 :                 btrfs_free_device(device);
    1082             :         }
    1083             : 
    1084        3216 : }
    1085             : 
    1086             : /*
     1087             :  * After we have read the system tree and know the devids belonging to this
     1088             :  * filesystem, remove any device that does not belong there.
    1089             :  */
    1090        3216 : void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
    1091             : {
    1092        3216 :         struct btrfs_device *latest_dev = NULL;
    1093        3216 :         struct btrfs_fs_devices *seed_dev;
    1094             : 
    1095        3216 :         mutex_lock(&uuid_mutex);
    1096        3216 :         __btrfs_free_extra_devids(fs_devices, &latest_dev);
    1097             : 
    1098        3216 :         list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
    1099           0 :                 __btrfs_free_extra_devids(seed_dev, &latest_dev);
    1100             : 
    1101        3216 :         fs_devices->latest_dev = latest_dev;
    1102             : 
    1103        3216 :         mutex_unlock(&uuid_mutex);
    1104        3216 : }
    1105             : 
    1106        3242 : static void btrfs_close_bdev(struct btrfs_device *device)
    1107             : {
    1108        3242 :         if (!device->bdev)
    1109             :                 return;
    1110             : 
    1111        6484 :         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
    1112        3242 :                 sync_blockdev(device->bdev);
    1113        3242 :                 invalidate_bdev(device->bdev);
    1114             :         }
    1115             : 
    1116        3242 :         blkdev_put(device->bdev, device->holder);
    1117             : }
    1118             : 
    1119        3242 : static void btrfs_close_one_device(struct btrfs_device *device)
    1120             : {
    1121        3242 :         struct btrfs_fs_devices *fs_devices = device->fs_devices;
    1122             : 
    1123        6484 :         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
    1124        3242 :             device->devid != BTRFS_DEV_REPLACE_DEVID) {
    1125        3242 :                 list_del_init(&device->dev_alloc_list);
    1126        3242 :                 fs_devices->rw_devices--;
    1127             :         }
    1128             : 
    1129        3242 :         if (device->devid == BTRFS_DEV_REPLACE_DEVID)
    1130           0 :                 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
    1131             : 
    1132        6484 :         if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
    1133           0 :                 clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
    1134           0 :                 fs_devices->missing_devices--;
    1135             :         }
    1136             : 
    1137        3242 :         btrfs_close_bdev(device);
    1138        3242 :         if (device->bdev) {
    1139        3242 :                 fs_devices->open_devices--;
    1140        3242 :                 device->bdev = NULL;
    1141             :         }
    1142        3242 :         clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
    1143        3242 :         btrfs_destroy_dev_zone_info(device);
    1144             : 
    1145        3242 :         device->fs_info = NULL;
    1146        3242 :         atomic_set(&device->dev_stats_ccnt, 0);
    1147        3242 :         extent_io_tree_release(&device->alloc_state);
    1148             : 
    1149             :         /*
    1150             :          * Reset the flush error record. We might have a transient flush error
    1151             :          * in this mount, and if so we aborted the current transaction and set
    1152             :          * the fs to an error state, guaranteeing no super blocks can be further
     1153             :          * committed. However, that error might be transient, and if we unmount the
    1154             :          * filesystem and mount it again, we should allow the mount to succeed
    1155             :          * (btrfs_check_rw_degradable() should not fail) - if after mounting the
    1156             :          * filesystem again we still get flush errors, then we will again abort
    1157             :          * any transaction and set the error state, guaranteeing no commits of
    1158             :          * unsafe super blocks.
    1159             :          */
    1160        3242 :         device->last_flush_error = 0;
    1161             : 
    1162             :         /* Verify the device is back in a pristine state  */
    1163        3242 :         WARN_ON(test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
    1164        3242 :         WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
    1165        3242 :         WARN_ON(!list_empty(&device->dev_alloc_list));
    1166        3242 :         WARN_ON(!list_empty(&device->post_commit_list));
    1167        3242 : }
    1168             : 
    1169        3469 : static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
    1170             : {
    1171        3469 :         struct btrfs_device *device, *tmp;
    1172             : 
    1173        3469 :         lockdep_assert_held(&uuid_mutex);
    1174             : 
    1175        3469 :         if (--fs_devices->opened > 0)
    1176             :                 return;
    1177             : 
    1178        6483 :         list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
    1179        3242 :                 btrfs_close_one_device(device);
    1180             : 
    1181        3241 :         WARN_ON(fs_devices->open_devices);
    1182        3241 :         WARN_ON(fs_devices->rw_devices);
    1183        3241 :         fs_devices->opened = 0;
    1184        3241 :         fs_devices->seeding = false;
    1185        3241 :         fs_devices->fs_info = NULL;
    1186             : }
    1187             : 
    1188        3469 : void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
    1189             : {
    1190        3469 :         LIST_HEAD(list);
    1191        3469 :         struct btrfs_fs_devices *tmp;
    1192             : 
    1193        3469 :         mutex_lock(&uuid_mutex);
    1194        3469 :         close_fs_devices(fs_devices);
    1195        3469 :         if (!fs_devices->opened) {
    1196        3241 :                 list_splice_init(&fs_devices->seed_list, &list);
    1197             : 
    1198             :                 /*
     1199             :                  * If the struct btrfs_fs_devices is not assembled with any
     1200             :                  * other device, it can be re-initialized during the next mount
     1201             :                  * without needing the device-scan step. Therefore, it can be
     1202             :                  * fully freed.
    1203             :                  */
    1204        3241 :                 if (fs_devices->num_devices == 1) {
    1205        3240 :                         list_del(&fs_devices->fs_list);
    1206        3240 :                         free_fs_devices(fs_devices);
    1207             :                 }
    1208             :         }
    1209             : 
    1210             : 
    1211        3469 :         list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
    1212           0 :                 close_fs_devices(fs_devices);
    1213           0 :                 list_del(&fs_devices->seed_list);
    1214           0 :                 free_fs_devices(fs_devices);
    1215             :         }
    1216        3469 :         mutex_unlock(&uuid_mutex);
    1217        3469 : }
    1218             : 
    1219        3241 : static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
    1220             :                                 blk_mode_t flags, void *holder)
    1221             : {
    1222        3241 :         struct btrfs_device *device;
    1223        3241 :         struct btrfs_device *latest_dev = NULL;
    1224        3241 :         struct btrfs_device *tmp_device;
    1225             : 
    1226        6483 :         list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
    1227             :                                  dev_list) {
    1228        3242 :                 int ret;
    1229             : 
    1230        3242 :                 ret = btrfs_open_one_device(fs_devices, device, flags, holder);
    1231        3242 :                 if (ret == 0 &&
    1232           1 :                     (!latest_dev || device->generation > latest_dev->generation)) {
    1233             :                         latest_dev = device;
    1234           1 :                 } else if (ret == -ENODATA) {
    1235           0 :                         fs_devices->num_devices--;
    1236           0 :                         list_del(&device->dev_list);
    1237           0 :                         btrfs_free_device(device);
    1238             :                 }
    1239             :         }
    1240        3241 :         if (fs_devices->open_devices == 0)
    1241             :                 return -EINVAL;
    1242             : 
    1243        3241 :         fs_devices->opened = 1;
    1244        3241 :         fs_devices->latest_dev = latest_dev;
    1245        3241 :         fs_devices->total_rw_bytes = 0;
    1246        3241 :         fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
    1247        3241 :         fs_devices->read_policy = BTRFS_READ_POLICY_PID;
    1248             : 
    1249        3241 :         return 0;
    1250             : }
    1251             : 
    1252           1 : static int devid_cmp(void *priv, const struct list_head *a,
    1253             :                      const struct list_head *b)
    1254             : {
    1255           1 :         const struct btrfs_device *dev1, *dev2;
    1256             : 
    1257           1 :         dev1 = list_entry(a, struct btrfs_device, dev_list);
    1258           1 :         dev2 = list_entry(b, struct btrfs_device, dev_list);
    1259             : 
    1260           1 :         if (dev1->devid < dev2->devid)
    1261             :                 return -1;
    1262           1 :         else if (dev1->devid > dev2->devid)
    1263           1 :                 return 1;
    1264             :         return 0;
    1265             : }
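devid_cmp() is a standard three-way comparator handed to list_sort() in btrfs_open_devices(), so devices are opened in ascending devid order. The contract is the same as qsort()'s comparator; a small userspace analogue follows, with a hypothetical struct dev standing in for struct btrfs_device.

	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct dev { uint64_t devid; };

	/* Three-way compare: negative, zero or positive, like devid_cmp(). */
	static int devid_cmp(const void *a, const void *b)
	{
		const struct dev *d1 = a, *d2 = b;

		if (d1->devid < d2->devid)
			return -1;
		if (d1->devid > d2->devid)
			return 1;
		return 0;
	}

	int main(void)
	{
		struct dev devs[] = { {3}, {1}, {2} };

		qsort(devs, 3, sizeof(devs[0]), devid_cmp);
		for (int i = 0; i < 3; i++)
			printf("devid %llu\n", (unsigned long long)devs[i].devid);
		return 0;
	}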
    1266             : 
    1267        3469 : int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
    1268             :                        blk_mode_t flags, void *holder)
    1269             : {
    1270        3469 :         int ret;
    1271             : 
    1272        3469 :         lockdep_assert_held(&uuid_mutex);
    1273             :         /*
    1274             :          * The device_list_mutex cannot be taken here in case opening the
    1275             :          * underlying device takes further locks like open_mutex.
    1276             :          *
    1277             :          * We also don't need the lock here as this is called during mount and
     1278             :          * exclusion is provided by uuid_mutex.
    1279             :          */
    1280             : 
    1281        3469 :         if (fs_devices->opened) {
    1282         228 :                 fs_devices->opened++;
    1283         228 :                 ret = 0;
    1284             :         } else {
    1285        3241 :                 list_sort(NULL, &fs_devices->devices, devid_cmp);
    1286        3241 :                 ret = open_fs_devices(fs_devices, flags, holder);
    1287             :         }
    1288             : 
    1289        3469 :         return ret;
    1290             : }
    1291             : 
    1292       13271 : void btrfs_release_disk_super(struct btrfs_super_block *super)
    1293             : {
    1294       13271 :         struct page *page = virt_to_page(super);
    1295             : 
    1296       13271 :         put_page(page);
    1297       13271 : }
    1298             : 
    1299        6742 : static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
    1300             :                                                        u64 bytenr, u64 bytenr_orig)
    1301             : {
    1302        6742 :         struct btrfs_super_block *disk_super;
    1303        6742 :         struct page *page;
    1304        6742 :         void *p;
    1305        6742 :         pgoff_t index;
    1306             : 
    1307             :         /* make sure our super fits in the device */
    1308        6742 :         if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev))
    1309             :                 return ERR_PTR(-EINVAL);
    1310             : 
    1311             :         /* make sure our super fits in the page */
    1312        6741 :         if (sizeof(*disk_super) > PAGE_SIZE)
    1313             :                 return ERR_PTR(-EINVAL);
    1314             : 
    1315             :         /* make sure our super doesn't straddle pages on disk */
    1316        6741 :         index = bytenr >> PAGE_SHIFT;
    1317        6741 :         if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
    1318             :                 return ERR_PTR(-EINVAL);
    1319             : 
    1320             :         /* pull in the page with our super */
    1321        6741 :         page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);
    1322             : 
    1323        6741 :         if (IS_ERR(page))
    1324             :                 return ERR_CAST(page);
    1325             : 
    1326        6741 :         p = page_address(page);
    1327             : 
    1328             :         /* align our pointer to the offset of the super block */
    1329        6741 :         disk_super = p + offset_in_page(bytenr);
    1330             : 
    1331        6741 :         if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
    1332             :             btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
    1333           1 :                 btrfs_release_disk_super(p);
    1334           1 :                 return ERR_PTR(-EINVAL);
    1335             :         }
    1336             : 
    1337        6740 :         if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
    1338           0 :                 disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;
    1339             : 
    1340             :         return disk_super;
    1341             : }
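The three sanity checks in btrfs_read_disk_super() reduce to page arithmetic: the superblock copy must lie entirely inside the device, fit in one page, and not straddle a page boundary, because only a single pagecache page is mapped. A standalone sketch of the straddle check is shown below; PAGE_SHIFT of 12 and a 4096-byte superblock size are assumptions for illustration.

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	#define PAGE_SHIFT	12			/* assumed 4 KiB pages */
	#define PAGE_SIZE	(1UL << PAGE_SHIFT)
	#define SUPER_SIZE	4096UL			/* assumed superblock size */

	/* True if [bytenr, bytenr + SUPER_SIZE) falls within a single page. */
	static bool super_fits_one_page(uint64_t bytenr)
	{
		uint64_t index = bytenr >> PAGE_SHIFT;

		return ((bytenr + SUPER_SIZE - 1) >> PAGE_SHIFT) == index;
	}

	int main(void)
	{
		/* Primary superblock mirror at 64 KiB: page aligned, fits. */
		printf("%d\n", super_fits_one_page(65536));
		/* An offset that crosses a page boundary: rejected. */
		printf("%d\n", super_fits_one_page(65536 + 100));
		return 0;
	}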
    1342             : 
    1343           5 : int btrfs_forget_devices(dev_t devt)
    1344             : {
    1345           5 :         int ret;
    1346             : 
    1347           5 :         mutex_lock(&uuid_mutex);
    1348           5 :         ret = btrfs_free_stale_devices(devt, NULL);
    1349           5 :         mutex_unlock(&uuid_mutex);
    1350             : 
    1351           5 :         return ret;
    1352             : }
    1353             : 
    1354             : /*
    1355             :  * Look for a btrfs signature on a device. This may be called out of the mount path
    1356             :  * and we are not allowed to call set_blocksize during the scan. The superblock
     1357             :  * is read via the pagecache.
    1358             :  */
    1359        6743 : struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags)
    1360             : {
    1361        6743 :         struct btrfs_super_block *disk_super;
    1362        6743 :         bool new_device_added = false;
    1363        6743 :         struct btrfs_device *device = NULL;
    1364        6743 :         struct block_device *bdev;
    1365        6743 :         u64 bytenr, bytenr_orig;
    1366        6743 :         int ret;
    1367             : 
    1368        6743 :         lockdep_assert_held(&uuid_mutex);
    1369             : 
    1370             :         /*
     1371             :          * We would like to check all the supers, but that would make
     1372             :          * a btrfs mount succeed after a mkfs from a different FS.
     1373             :          * So, we need to add a special mount option to scan for
     1374             :          * later supers, using BTRFS_SUPER_MIRROR_MAX instead.
    1375             :          */
    1376             : 
    1377             :         /*
     1378             :          * Avoid an exclusive open here, as systemd-udev may initiate a
     1379             :          * device scan that may race with the user's mount or mkfs command,
    1380             :          * resulting in failure.
    1381             :          * Since the device scan is solely for reading purposes, there is no
    1382             :          * need for an exclusive open. Additionally, the devices are read again
    1383             :          * during the mount process. It is ok to get some inconsistent
    1384             :          * values temporarily, as the device paths of the fsid are the only
    1385             :          * required information for assembling the volume.
    1386             :          */
    1387        6743 :         bdev = blkdev_get_by_path(path, flags, NULL, NULL);
    1388        6743 :         if (IS_ERR(bdev))
    1389             :                 return ERR_CAST(bdev);
    1390             : 
    1391        6742 :         bytenr_orig = btrfs_sb_offset(0);
    1392        6742 :         ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
    1393        6742 :         if (ret) {
    1394           0 :                 device = ERR_PTR(ret);
    1395           0 :                 goto error_bdev_put;
    1396             :         }
    1397             : 
    1398        6742 :         disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
    1399        6742 :         if (IS_ERR(disk_super)) {
    1400           2 :                 device = ERR_CAST(disk_super);
    1401           2 :                 goto error_bdev_put;
    1402             :         }
    1403             : 
    1404        6740 :         device = device_list_add(path, disk_super, &new_device_added);
    1405        6740 :         if (!IS_ERR(device) && new_device_added)
    1406        3265 :                 btrfs_free_stale_devices(device->devt, device);
    1407             : 
    1408        6740 :         btrfs_release_disk_super(disk_super);
    1409             : 
    1410        6742 : error_bdev_put:
    1411        6742 :         blkdev_put(bdev, NULL);
    1412             : 
    1413        6742 :         return device;
    1414             : }
    1415             : 
    1416             : /*
     1417             :  * Try to find a chunk that intersects the [start, start + len] range, and when
     1418             :  * one such is found, record the end of it in *start.
    1419             :  */
    1420        3583 : static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
    1421             :                                     u64 len)
    1422             : {
    1423        3583 :         u64 physical_start, physical_end;
    1424             : 
    1425        3583 :         lockdep_assert_held(&device->fs_info->chunk_mutex);
    1426             : 
    1427        3583 :         if (!find_first_extent_bit(&device->alloc_state, *start,
    1428             :                                    &physical_start, &physical_end,
    1429             :                                    CHUNK_ALLOCATED, NULL)) {
    1430             : 
    1431        2453 :                 if (in_range(physical_start, *start, len) ||
    1432         473 :                     in_range(*start, physical_start,
    1433             :                              physical_end - physical_start)) {
    1434         505 :                         *start = physical_end + 1;
    1435         505 :                         return true;
    1436             :                 }
    1437             :         }
    1438             :         return false;
    1439             : }
    1440             : 
    1441        1482 : static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
    1442             : {
    1443        1482 :         switch (device->fs_devices->chunk_alloc_policy) {
    1444        1482 :         case BTRFS_CHUNK_ALLOC_REGULAR:
    1445        1482 :                 return max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED);
    1446           0 :         case BTRFS_CHUNK_ALLOC_ZONED:
    1447             :                 /*
     1448             :                  * We don't care about the starting region like the regular
     1449             :                  * allocator does, because we use/reserve the first two zones
     1450             :                  * for superblock logging anyway.
    1451             :                  */
    1452           0 :                 return ALIGN(start, device->zone_info->zone_size);
    1453           0 :         default:
    1454           0 :                 BUG();
    1455             :         }
    1456             : }
    1457             : 
    1458           0 : static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
    1459             :                                         u64 *hole_start, u64 *hole_size,
    1460             :                                         u64 num_bytes)
    1461             : {
    1462           0 :         u64 zone_size = device->zone_info->zone_size;
    1463           0 :         u64 pos;
    1464           0 :         int ret;
    1465           0 :         bool changed = false;
    1466             : 
    1467           0 :         ASSERT(IS_ALIGNED(*hole_start, zone_size));
    1468             : 
    1469           0 :         while (*hole_size > 0) {
    1470           0 :                 pos = btrfs_find_allocatable_zones(device, *hole_start,
    1471           0 :                                                    *hole_start + *hole_size,
    1472             :                                                    num_bytes);
    1473           0 :                 if (pos != *hole_start) {
    1474           0 :                         *hole_size = *hole_start + *hole_size - pos;
    1475           0 :                         *hole_start = pos;
    1476           0 :                         changed = true;
    1477           0 :                         if (*hole_size < num_bytes)
    1478             :                                 break;
    1479             :                 }
    1480             : 
    1481           0 :                 ret = btrfs_ensure_empty_zones(device, pos, num_bytes);
    1482             : 
    1483             :                 /* Range is ensured to be empty */
    1484           0 :                 if (!ret)
    1485           0 :                         return changed;
    1486             : 
    1487             :                 /* Given hole range was invalid (outside of device) */
    1488           0 :                 if (ret == -ERANGE) {
    1489           0 :                         *hole_start += *hole_size;
    1490           0 :                         *hole_size = 0;
    1491           0 :                         return true;
    1492             :                 }
    1493             : 
    1494           0 :                 *hole_start += zone_size;
    1495           0 :                 *hole_size -= zone_size;
    1496           0 :                 changed = true;
    1497             :         }
    1498             : 
    1499             :         return changed;
    1500             : }
    1501             : 
    1502             : /*
    1503             :  * Check if specified hole is suitable for allocation.
    1504             :  *
     1505             :  * @device:     the device on which we have the hole
    1506             :  * @hole_start: starting position of the hole
    1507             :  * @hole_size:  the size of the hole
    1508             :  * @num_bytes:  the size of the free space that we need
    1509             :  *
    1510             :  * This function may modify @hole_start and @hole_size to reflect the suitable
     1511             :  * position for allocation. Returns true if the hole is updated, false otherwise.
    1512             :  */
    1513        3580 : static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
    1514             :                                   u64 *hole_size, u64 num_bytes)
    1515             : {
    1516        3580 :         bool changed = false;
    1517        3580 :         u64 hole_end = *hole_start + *hole_size;
    1518             : 
    1519        3580 :         for (;;) {
    1520             :                 /*
    1521             :                  * Check before we set max_hole_start, otherwise we could end up
    1522             :                  * sending back this offset anyway.
    1523             :                  */
    1524        3580 :                 if (contains_pending_extent(device, hole_start, *hole_size)) {
    1525         503 :                         if (hole_end >= *hole_start)
    1526         472 :                                 *hole_size = hole_end - *hole_start;
    1527             :                         else
    1528          31 :                                 *hole_size = 0;
    1529             :                         changed = true;
    1530             :                 }
    1531             : 
    1532        3580 :                 switch (device->fs_devices->chunk_alloc_policy) {
    1533             :                 case BTRFS_CHUNK_ALLOC_REGULAR:
    1534             :                         /* No extra check */
    1535             :                         break;
    1536           0 :                 case BTRFS_CHUNK_ALLOC_ZONED:
    1537           0 :                         if (dev_extent_hole_check_zoned(device, hole_start,
    1538             :                                                         hole_size, num_bytes)) {
    1539           0 :                                 changed = true;
    1540             :                                 /*
    1541             :                                  * The changed hole can contain pending extent.
    1542             :                                  * Loop again to check that.
    1543             :                                  */
    1544           0 :                                 continue;
    1545             :                         }
    1546             :                         break;
    1547           0 :                 default:
    1548           0 :                         BUG();
    1549             :                 }
    1550             : 
    1551        3580 :                 break;
    1552             :         }
    1553             : 
    1554        3580 :         return changed;
    1555             : }
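In the regular (non-zoned) case the loop in dev_extent_hole_check() reduces to one contains_pending_extent() lookup followed by clamping: when a pending chunk overlaps the hole, the hole start jumps past that chunk and the size shrinks, possibly to zero. A hedged userspace sketch of that clamping step follows; struct pending and clamp_hole are made-up names, and a single inclusive interval stands in for the CHUNK_ALLOCATED extent bits.

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	/* A pending chunk allocation on the device, [start, end] inclusive. */
	struct pending { uint64_t start, end; };

	/*
	 * If the pending extent intersects [*hole_start, *hole_start + *hole_size),
	 * move the hole start past it and clamp the size, as the regular branch of
	 * dev_extent_hole_check() does.
	 */
	static bool clamp_hole(const struct pending *p, uint64_t *hole_start,
			       uint64_t *hole_size)
	{
		uint64_t hole_end = *hole_start + *hole_size;

		if (p->start >= hole_end || p->end < *hole_start)
			return false;		/* no overlap, hole unchanged */

		*hole_start = p->end + 1;
		*hole_size = hole_end > *hole_start ? hole_end - *hole_start : 0;
		return true;
	}

	int main(void)
	{
		struct pending chunk = { .start = 4096, .end = 8191 };
		uint64_t start = 0, size = 16384;

		clamp_hole(&chunk, &start, &size);
		printf("hole now [%llu, +%llu)\n",
		       (unsigned long long)start, (unsigned long long)size);
		return 0;
	}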
    1556             : 
    1557             : /*
    1558             :  * Find free space in the specified device.
    1559             :  *
     1560             :  * @device:       the device in which we search for free space
    1561             :  * @num_bytes:    the size of the free space that we need
    1562             :  * @search_start: the position from which to begin the search
    1563             :  * @start:        store the start of the free space.
     1564             :  * @len:          the size of the free space that we find, or the size
     1565             :  *                of the max free space if we don't find suitable free space
    1566             :  *
    1567             :  * This does a pretty simple search, the expectation is that it is called very
    1568             :  * infrequently and that a given device has a small number of extents.
    1569             :  *
     1570             :  * @start is used to store the start of the free space if we find it. But if we
    1571             :  * don't find suitable free space, it will be used to store the start position
    1572             :  * of the max free space.
    1573             :  *
    1574             :  * @len is used to store the size of the free space that we find.
    1575             :  * But if we don't find suitable free space, it is used to store the size of
    1576             :  * the max free space.
    1577             :  *
     1578             :  * NOTE: This function will search the *commit* root of the device tree, and does
     1579             :  * an extra check to ensure dev extents are not double allocated.
     1580             :  * This makes the function safe for allocating dev extents, but it may not report
     1581             :  * correct usable device space, as device extents freed in the current transaction
     1582             :  * are not reported as available.
    1583             :  */
    1584        1482 : static int find_free_dev_extent_start(struct btrfs_device *device,
    1585             :                                 u64 num_bytes, u64 search_start, u64 *start,
    1586             :                                 u64 *len)
    1587             : {
    1588        1482 :         struct btrfs_fs_info *fs_info = device->fs_info;
    1589        1482 :         struct btrfs_root *root = fs_info->dev_root;
    1590        1482 :         struct btrfs_key key;
    1591        1482 :         struct btrfs_dev_extent *dev_extent;
    1592        1482 :         struct btrfs_path *path;
    1593        1482 :         u64 hole_size;
    1594        1482 :         u64 max_hole_start;
    1595        1482 :         u64 max_hole_size;
    1596        1482 :         u64 extent_end;
    1597        1482 :         u64 search_end = device->total_bytes;
    1598        1482 :         int ret;
    1599        1482 :         int slot;
    1600        1482 :         struct extent_buffer *l;
    1601             : 
    1602        1482 :         search_start = dev_extent_search_start(device, search_start);
    1603             : 
    1604        2964 :         WARN_ON(device->zone_info &&
    1605             :                 !IS_ALIGNED(num_bytes, device->zone_info->zone_size));
    1606             : 
    1607        1482 :         path = btrfs_alloc_path();
    1608        1482 :         if (!path)
    1609             :                 return -ENOMEM;
    1610             : 
    1611             :         max_hole_start = search_start;
    1612             :         max_hole_size = 0;
    1613             : 
    1614        1950 : again:
    1615        3868 :         if (search_start >= search_end ||
    1616        1918 :                 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
    1617          32 :                 ret = -ENOSPC;
    1618          32 :                 goto out;
    1619             :         }
    1620             : 
    1621        1918 :         path->reada = READA_FORWARD;
    1622        1918 :         path->search_commit_root = 1;
    1623        1918 :         path->skip_locking = 1;
    1624             : 
    1625        1918 :         key.objectid = device->devid;
    1626        1918 :         key.offset = search_start;
    1627        1918 :         key.type = BTRFS_DEV_EXTENT_KEY;
    1628             : 
    1629        1918 :         ret = btrfs_search_backwards(root, &key, path);
    1630        1918 :         if (ret < 0)
    1631           0 :                 goto out;
    1632             : 
    1633       11740 :         while (search_start < search_end) {
    1634       11688 :                 l = path->nodes[0];
    1635       11688 :                 slot = path->slots[0];
    1636       11688 :                 if (slot >= btrfs_header_nritems(l)) {
    1637        1596 :                         ret = btrfs_next_leaf(root, path);
    1638        1596 :                         if (ret == 0)
    1639           0 :                                 continue;
    1640        1596 :                         if (ret < 0)
    1641           0 :                                 goto out;
    1642             : 
    1643             :                         break;
    1644             :                 }
    1645       10092 :                 btrfs_item_key_to_cpu(l, &key, slot);
    1646             : 
    1647       10092 :                 if (key.objectid < device->devid)
    1648         715 :                         goto next;
    1649             : 
    1650        9377 :                 if (key.objectid > device->devid)
    1651             :                         break;
    1652             : 
    1653        9377 :                 if (key.type != BTRFS_DEV_EXTENT_KEY)
    1654           0 :                         goto next;
    1655             : 
    1656        9377 :                 if (key.offset > search_end)
    1657             :                         break;
    1658             : 
    1659        9377 :                 if (key.offset > search_start) {
    1660        1984 :                         hole_size = key.offset - search_start;
    1661        1984 :                         dev_extent_hole_check(device, &search_start, &hole_size,
    1662             :                                               num_bytes);
    1663             : 
    1664        1984 :                         if (hole_size > max_hole_size) {
    1665        1905 :                                 max_hole_start = search_start;
    1666        1905 :                                 max_hole_size = hole_size;
    1667             :                         }
    1668             : 
    1669             :                         /*
     1670             :                          * If this free space is greater than what we need,
    1671             :                          * it must be the max free space that we have found
    1672             :                          * until now, so max_hole_start must point to the start
    1673             :                          * of this free space and the length of this free space
    1674             :                          * is stored in max_hole_size. Thus, we return
    1675             :                          * max_hole_start and max_hole_size and go back to the
    1676             :                          * caller.
    1677             :                          */
    1678        1984 :                         if (hole_size >= num_bytes) {
    1679         270 :                                 ret = 0;
    1680         270 :                                 goto out;
    1681             :                         }
    1682             :                 }
    1683             : 
    1684        9107 :                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
    1685        9107 :                 extent_end = key.offset + btrfs_dev_extent_length(l,
    1686             :                                                                   dev_extent);
    1687        9107 :                 if (extent_end > search_start)
    1688        8637 :                         search_start = extent_end;
    1689         470 : next:
    1690        9822 :                 path->slots[0]++;
    1691        9822 :                 cond_resched();
    1692             :         }
    1693             : 
    1694             :         /*
    1695             :          * At this point, search_start should be the end of
    1696             :          * allocated dev extents, and when shrinking the device,
    1697             :          * search_end may be smaller than search_start.
    1698             :          */
    1699        1648 :         if (search_end > search_start) {
    1700        1596 :                 hole_size = search_end - search_start;
    1701        1596 :                 if (dev_extent_hole_check(device, &search_start, &hole_size,
    1702             :                                           num_bytes)) {
    1703         468 :                         btrfs_release_path(path);
    1704         468 :                         goto again;
    1705             :                 }
    1706             : 
    1707        1128 :                 if (hole_size > max_hole_size) {
    1708        1116 :                         max_hole_start = search_start;
    1709        1116 :                         max_hole_size = hole_size;
    1710             :                 }
    1711             :         }
    1712             : 
    1713             :         /* See above. */
    1714        1180 :         if (max_hole_size < num_bytes)
    1715             :                 ret = -ENOSPC;
    1716             :         else
    1717         670 :                 ret = 0;
    1718             : 
    1719        1180 :         ASSERT(max_hole_start + max_hole_size <= search_end);
    1720        1482 : out:
    1721        1482 :         btrfs_free_path(path);
    1722        1482 :         *start = max_hole_start;
    1723        1482 :         if (len)
    1724        1482 :                 *len = max_hole_size;
    1725             :         return ret;
    1726             : }
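Stripped of the btree walk, find_free_dev_extent_start() is a classic max-hole scan: walk the allocated extents in offset order, measure the gap before each one, return early when a gap is big enough, and otherwise remember the largest gap seen (including the tail of the device). The sketch below shows that core over a plain sorted array; the array representation and find_hole() helper are illustrative stand-ins, not the dev tree.

	#include <stdint.h>
	#include <stdio.h>

	struct extent { uint64_t start, len; };	/* allocated, sorted by start */

	/*
	 * Find the first gap of at least num_bytes between allocated extents in
	 * [search_start, dev_size); fall back to the largest gap found.
	 * Returns 0 on success, -1 (think -ENOSPC) otherwise.
	 */
	static int find_hole(const struct extent *ext, int n, uint64_t dev_size,
			     uint64_t search_start, uint64_t num_bytes,
			     uint64_t *start, uint64_t *len)
	{
		uint64_t max_start = search_start, max_size = 0;

		for (int i = 0; i < n; i++) {
			if (ext[i].start > search_start) {
				uint64_t hole = ext[i].start - search_start;

				if (hole > max_size) {
					max_start = search_start;
					max_size = hole;
				}
				if (hole >= num_bytes)
					goto out;
			}
			if (ext[i].start + ext[i].len > search_start)
				search_start = ext[i].start + ext[i].len;
		}
		/* Account for the hole between the last extent and device end. */
		if (dev_size > search_start && dev_size - search_start > max_size) {
			max_start = search_start;
			max_size = dev_size - search_start;
		}
	out:
		*start = max_start;
		*len = max_size;
		return max_size >= num_bytes ? 0 : -1;
	}

	int main(void)
	{
		struct extent ext[] = { { 1 << 20, 1 << 20 }, { 4 << 20, 1 << 20 } };
		uint64_t start, len;

		if (!find_hole(ext, 2, 16 << 20, 1 << 20, 2 << 20, &start, &len))
			printf("hole at %llu, size %llu\n",
			       (unsigned long long)start, (unsigned long long)len);
		return 0;
	}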
    1727             : 
    1728           0 : int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
    1729             :                          u64 *start, u64 *len)
    1730             : {
    1731             :         /* FIXME use last free of some kind */
    1732           0 :         return find_free_dev_extent_start(device, num_bytes, 0, start, len);
    1733             : }
    1734             : 
    1735         817 : static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
    1736             :                           struct btrfs_device *device,
    1737             :                           u64 start, u64 *dev_extent_len)
    1738             : {
    1739         817 :         struct btrfs_fs_info *fs_info = device->fs_info;
    1740         817 :         struct btrfs_root *root = fs_info->dev_root;
    1741         817 :         int ret;
    1742         817 :         struct btrfs_path *path;
    1743         817 :         struct btrfs_key key;
    1744         817 :         struct btrfs_key found_key;
    1745         817 :         struct extent_buffer *leaf = NULL;
    1746         817 :         struct btrfs_dev_extent *extent = NULL;
    1747             : 
    1748         817 :         path = btrfs_alloc_path();
    1749         817 :         if (!path)
    1750             :                 return -ENOMEM;
    1751             : 
    1752         817 :         key.objectid = device->devid;
    1753         817 :         key.offset = start;
    1754         817 :         key.type = BTRFS_DEV_EXTENT_KEY;
    1755         817 : again:
    1756         817 :         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
    1757         817 :         if (ret > 0) {
    1758           0 :                 ret = btrfs_previous_item(root, path, key.objectid,
    1759             :                                           BTRFS_DEV_EXTENT_KEY);
    1760           0 :                 if (ret)
    1761           0 :                         goto out;
    1762           0 :                 leaf = path->nodes[0];
    1763           0 :                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
    1764           0 :                 extent = btrfs_item_ptr(leaf, path->slots[0],
    1765             :                                         struct btrfs_dev_extent);
    1766           0 :                 BUG_ON(found_key.offset > start || found_key.offset +
    1767             :                        btrfs_dev_extent_length(leaf, extent) < start);
    1768           0 :                 key = found_key;
    1769           0 :                 btrfs_release_path(path);
    1770           0 :                 goto again;
    1771         817 :         } else if (ret == 0) {
    1772         817 :                 leaf = path->nodes[0];
    1773         817 :                 extent = btrfs_item_ptr(leaf, path->slots[0],
    1774             :                                         struct btrfs_dev_extent);
    1775             :         } else {
    1776           0 :                 goto out;
    1777             :         }
    1778             : 
    1779         817 :         *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
    1780             : 
    1781         817 :         ret = btrfs_del_item(trans, root, path);
    1782         817 :         if (ret == 0)
    1783         817 :                 set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
    1784           0 : out:
    1785         817 :         btrfs_free_path(path);
    1786         817 :         return ret;
    1787             : }
    1788             : 
    1789        1482 : static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
    1790             : {
    1791        1482 :         struct extent_map_tree *em_tree;
    1792        1482 :         struct extent_map *em;
    1793        1482 :         struct rb_node *n;
    1794        1482 :         u64 ret = 0;
    1795             : 
    1796        1482 :         em_tree = &fs_info->mapping_tree;
    1797        1482 :         read_lock(&em_tree->lock);
    1798        1482 :         n = rb_last(&em_tree->map.rb_root);
    1799        1482 :         if (n) {
    1800        1482 :                 em = rb_entry(n, struct extent_map, rb_node);
    1801        1482 :                 ret = em->start + em->len;
    1802             :         }
    1803        1482 :         read_unlock(&em_tree->lock);
    1804             : 
    1805        1482 :         return ret;
    1806             : }
    1807             : 
    1808           0 : static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
    1809             :                                     u64 *devid_ret)
    1810             : {
    1811           0 :         int ret;
    1812           0 :         struct btrfs_key key;
    1813           0 :         struct btrfs_key found_key;
    1814           0 :         struct btrfs_path *path;
    1815             : 
    1816           0 :         path = btrfs_alloc_path();
    1817           0 :         if (!path)
    1818             :                 return -ENOMEM;
    1819             : 
    1820           0 :         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
    1821           0 :         key.type = BTRFS_DEV_ITEM_KEY;
    1822           0 :         key.offset = (u64)-1;
    1823             : 
    1824           0 :         ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
    1825           0 :         if (ret < 0)
    1826           0 :                 goto error;
    1827             : 
    1828           0 :         if (ret == 0) {
    1829             :                 /* Corruption */
    1830           0 :                 btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
    1831           0 :                 ret = -EUCLEAN;
    1832           0 :                 goto error;
    1833             :         }
    1834             : 
    1835           0 :         ret = btrfs_previous_item(fs_info->chunk_root, path,
    1836             :                                   BTRFS_DEV_ITEMS_OBJECTID,
    1837             :                                   BTRFS_DEV_ITEM_KEY);
    1838           0 :         if (ret) {
    1839           0 :                 *devid_ret = 1;
    1840             :         } else {
    1841           0 :                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
    1842             :                                       path->slots[0]);
    1843           0 :                 *devid_ret = found_key.offset + 1;
    1844             :         }
    1845             :         ret = 0;
    1846           0 : error:
    1847           0 :         btrfs_free_path(path);
    1848           0 :         return ret;
    1849             : }
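find_next_devid() uses a common btree idiom: seek to the impossible key (devid (u64)-1), which lands just past the last DEV_ITEM, then step back one item; the next free devid is that item's offset plus one, or 1 when no device item exists. The sketch below mirrors that idea over a sorted array standing in for the chunk tree keys (purely illustrative).

	#include <stdint.h>
	#include <stdio.h>

	/*
	 * devids[] plays the role of the DEV_ITEM keys in the chunk tree,
	 * kept in ascending order.
	 */
	static uint64_t next_devid(const uint64_t *devids, int n)
	{
		/* "Previous item" after seeking to (u64)-1 is simply the last one. */
		if (n == 0)
			return 1;		/* no device item found yet */
		return devids[n - 1] + 1;
	}

	int main(void)
	{
		uint64_t devids[] = { 1, 2, 5 };

		printf("next devid: %llu\n",
		       (unsigned long long)next_devid(devids, 3));	/* 6 */
		printf("next devid: %llu\n",
		       (unsigned long long)next_devid(devids, 0));	/* 1 */
		return 0;
	}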
    1850             : 
    1851             : /*
     1852             :  * The device information is stored in the chunk root.
     1853             :  * The btrfs_device struct should be fully filled in.
    1854             :  */
    1855           0 : static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
    1856             :                             struct btrfs_device *device)
    1857             : {
    1858           0 :         int ret;
    1859           0 :         struct btrfs_path *path;
    1860           0 :         struct btrfs_dev_item *dev_item;
    1861           0 :         struct extent_buffer *leaf;
    1862           0 :         struct btrfs_key key;
    1863           0 :         unsigned long ptr;
    1864             : 
    1865           0 :         path = btrfs_alloc_path();
    1866           0 :         if (!path)
    1867             :                 return -ENOMEM;
    1868             : 
    1869           0 :         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
    1870           0 :         key.type = BTRFS_DEV_ITEM_KEY;
    1871           0 :         key.offset = device->devid;
    1872             : 
    1873           0 :         btrfs_reserve_chunk_metadata(trans, true);
    1874           0 :         ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
    1875             :                                       &key, sizeof(*dev_item));
    1876           0 :         btrfs_trans_release_chunk_metadata(trans);
    1877           0 :         if (ret)
    1878           0 :                 goto out;
    1879             : 
    1880           0 :         leaf = path->nodes[0];
    1881           0 :         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
    1882             : 
    1883           0 :         btrfs_set_device_id(leaf, dev_item, device->devid);
    1884           0 :         btrfs_set_device_generation(leaf, dev_item, 0);
    1885           0 :         btrfs_set_device_type(leaf, dev_item, device->type);
    1886           0 :         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
    1887           0 :         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
    1888           0 :         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
    1889           0 :         btrfs_set_device_total_bytes(leaf, dev_item,
    1890             :                                      btrfs_device_get_disk_total_bytes(device));
    1891           0 :         btrfs_set_device_bytes_used(leaf, dev_item,
    1892             :                                     btrfs_device_get_bytes_used(device));
    1893           0 :         btrfs_set_device_group(leaf, dev_item, 0);
    1894           0 :         btrfs_set_device_seek_speed(leaf, dev_item, 0);
    1895           0 :         btrfs_set_device_bandwidth(leaf, dev_item, 0);
    1896           0 :         btrfs_set_device_start_offset(leaf, dev_item, 0);
    1897             : 
    1898           0 :         ptr = btrfs_device_uuid(dev_item);
    1899           0 :         write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
    1900           0 :         ptr = btrfs_device_fsid(dev_item);
    1901           0 :         write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
    1902             :                             ptr, BTRFS_FSID_SIZE);
    1903           0 :         btrfs_mark_buffer_dirty(leaf);
    1904             : 
    1905           0 :         ret = 0;
    1906           0 : out:
    1907           0 :         btrfs_free_path(path);
    1908           0 :         return ret;
    1909             : }
    1910             : 
    1911             : /*
     1912             :  * Function to update the ctime/mtime for a given device path.
     1913             :  * Mainly used for ctime/mtime based probing, e.g. by libblkid.
     1914             :  *
     1915             :  * We don't care about errors here; this is just to be kind to userspace.
    1916             :  */
    1917           0 : static void update_dev_time(const char *device_path)
    1918             : {
    1919           0 :         struct path path;
    1920           0 :         struct timespec64 now;
    1921           0 :         int ret;
    1922             : 
    1923           0 :         ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
    1924           0 :         if (ret)
    1925           0 :                 return;
    1926             : 
    1927           0 :         now = current_time(d_inode(path.dentry));
    1928           0 :         inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME | S_VERSION);
    1929           0 :         path_put(&path);
    1930             : }
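As the comment above says, the timestamp bump is purely a courtesy so that ctime/mtime-based scanners such as libblkid notice the device changed. A rough user-space analogue of the same idea (an illustrative sketch, not part of this file) would be:

/* Illustrative user-space sketch: bump the timestamps on a device node so
 * that timestamp-based probes see a change.  Errors are only reported,
 * matching the best-effort spirit of update_dev_time() above. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

static int touch_device(const char *device_path)
{
	/* A NULL times pointer sets atime and mtime to the current time;
	 * ctime is updated as a side effect of the metadata change. */
	if (utimensat(AT_FDCWD, device_path, NULL, 0) != 0) {
		perror("utimensat");
		return -1;
	}
	return 0;
}

int main(int argc, char **argv)
{
	return argc > 1 ? touch_device(argv[1]) : 1;
}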
    1931             : 
    1932           0 : static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans,
    1933             :                              struct btrfs_device *device)
    1934             : {
    1935           0 :         struct btrfs_root *root = device->fs_info->chunk_root;
    1936           0 :         int ret;
    1937           0 :         struct btrfs_path *path;
    1938           0 :         struct btrfs_key key;
    1939             : 
    1940           0 :         path = btrfs_alloc_path();
    1941           0 :         if (!path)
    1942             :                 return -ENOMEM;
    1943             : 
    1944           0 :         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
    1945           0 :         key.type = BTRFS_DEV_ITEM_KEY;
    1946           0 :         key.offset = device->devid;
    1947             : 
    1948           0 :         btrfs_reserve_chunk_metadata(trans, false);
    1949           0 :         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
    1950           0 :         btrfs_trans_release_chunk_metadata(trans);
    1951           0 :         if (ret) {
    1952           0 :                 if (ret > 0)
    1953           0 :                         ret = -ENOENT;
    1954           0 :                 goto out;
    1955             :         }
    1956             : 
    1957           0 :         ret = btrfs_del_item(trans, root, path);
    1958           0 : out:
    1959           0 :         btrfs_free_path(path);
    1960           0 :         return ret;
    1961             : }
    1962             : 
    1963             : /*
    1964             :  * Verify that @num_devices satisfies the RAID profile constraints in the whole
     1965             :  * filesystem. It's up to the caller to adjust that number, e.g. to account for
     1966             :  * an ongoing device replace.
    1967             :  */
    1968           1 : static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
    1969             :                 u64 num_devices)
    1970             : {
    1971           1 :         u64 all_avail;
    1972           1 :         unsigned seq;
    1973           1 :         int i;
    1974             : 
    1975           1 :         do {
    1976           1 :                 seq = read_seqbegin(&fs_info->profiles_lock);
    1977             : 
    1978           1 :                 all_avail = fs_info->avail_data_alloc_bits |
    1979           1 :                             fs_info->avail_system_alloc_bits |
    1980           1 :                             fs_info->avail_metadata_alloc_bits;
    1981           1 :         } while (read_seqretry(&fs_info->profiles_lock, seq));
    1982             : 
    1983           4 :         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
    1984           4 :                 if (!(all_avail & btrfs_raid_array[i].bg_flag))
    1985           3 :                         continue;
    1986             : 
    1987           1 :                 if (num_devices < btrfs_raid_array[i].devs_min)
    1988           1 :                         return btrfs_raid_array[i].mindev_error;
    1989             :         }
    1990             : 
    1991             :         return 0;
    1992             : }
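The check above only looks at profiles that are actually in use (those set in all_avail) and rejects the operation as soon as one of them would fall below its minimum device count. A condensed user-space sketch of the same idea follows; the profile table is simplified, its values are illustrative only, and the all_avail filtering is omitted:

/* Sketch of a minimum-devices check over a simplified, illustrative
 * RAID profile table (values are examples, not authoritative). */
#include <stdio.h>

struct raid_profile {
	const char *name;
	unsigned int devs_min;
};

static const struct raid_profile profiles[] = {
	{ "raid1",  2 },
	{ "raid10", 2 },
	{ "raid5",  2 },
	{ "raid6",  3 },
};

/* Return the first profile whose minimum would be violated, or NULL. */
static const struct raid_profile *check_min_devices(unsigned long long num_devices)
{
	for (size_t i = 0; i < sizeof(profiles) / sizeof(profiles[0]); i++)
		if (num_devices < profiles[i].devs_min)
			return &profiles[i];
	return NULL;
}

int main(void)
{
	const struct raid_profile *bad = check_min_devices(1);

	if (bad)
		printf("too few devices for %s (needs at least %u)\n",
		       bad->name, bad->devs_min);
	return 0;
}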
    1993             : 
    1994           0 : static struct btrfs_device * btrfs_find_next_active_device(
    1995             :                 struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
    1996             : {
    1997           0 :         struct btrfs_device *next_device;
    1998             : 
    1999           0 :         list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
    2000           0 :                 if (next_device != device &&
    2001           0 :                     !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
    2002           0 :                     && next_device->bdev)
    2003           0 :                         return next_device;
    2004             :         }
    2005             : 
    2006             :         return NULL;
    2007             : }
    2008             : 
    2009             : /*
    2010             :  * Helper function to check if the given device is part of s_bdev / latest_dev
     2011             :  * and replace it with the provided or the next active device. In the context
     2012             :  * where this function is called, there should always be another device (or
     2013             :  * this_dev) which is active.
    2014             :  */
    2015           0 : void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
    2016             :                                             struct btrfs_device *next_device)
    2017             : {
    2018           0 :         struct btrfs_fs_info *fs_info = device->fs_info;
    2019             : 
    2020           0 :         if (!next_device)
    2021           0 :                 next_device = btrfs_find_next_active_device(fs_info->fs_devices,
    2022             :                                                             device);
    2023           0 :         ASSERT(next_device);
    2024             : 
    2025           0 :         if (fs_info->sb->s_bdev &&
    2026           0 :                         (fs_info->sb->s_bdev == device->bdev))
    2027           0 :                 fs_info->sb->s_bdev = next_device->bdev;
    2028             : 
    2029           0 :         if (fs_info->fs_devices->latest_dev->bdev == device->bdev)
    2030           0 :                 fs_info->fs_devices->latest_dev = next_device;
    2031           0 : }
    2032             : 
    2033             : /*
     2034             :  * Return btrfs_fs_devices::num_devices excluding the device that's currently
     2035             :  * being replaced.
    2036             :  */
    2037           1 : static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
    2038             : {
    2039           1 :         u64 num_devices = fs_info->fs_devices->num_devices;
    2040             : 
    2041           1 :         down_read(&fs_info->dev_replace.rwsem);
    2042           1 :         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
    2043           0 :                 ASSERT(num_devices > 1);
    2044           0 :                 num_devices--;
    2045             :         }
    2046           1 :         up_read(&fs_info->dev_replace.rwsem);
    2047             : 
    2048           1 :         return num_devices;
    2049             : }
    2050             : 
    2051           0 : static void btrfs_scratch_superblock(struct btrfs_fs_info *fs_info,
    2052             :                                      struct block_device *bdev, int copy_num)
    2053             : {
    2054           0 :         struct btrfs_super_block *disk_super;
    2055           0 :         const size_t len = sizeof(disk_super->magic);
    2056           0 :         const u64 bytenr = btrfs_sb_offset(copy_num);
    2057           0 :         int ret;
    2058             : 
    2059           0 :         disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr);
    2060           0 :         if (IS_ERR(disk_super))
    2061             :                 return;
    2062             : 
    2063           0 :         memset(&disk_super->magic, 0, len);
    2064           0 :         folio_mark_dirty(virt_to_folio(disk_super));
    2065           0 :         btrfs_release_disk_super(disk_super);
    2066             : 
    2067           0 :         ret = sync_blockdev_range(bdev, bytenr, bytenr + len - 1);
    2068           0 :         if (ret)
    2069           0 :                 btrfs_warn(fs_info, "error clearing superblock number %d (%d)",
    2070             :                         copy_num, ret);
    2071             : }
    2072             : 
    2073           0 : void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
    2074             :                                struct block_device *bdev,
    2075             :                                const char *device_path)
    2076             : {
    2077           0 :         int copy_num;
    2078             : 
    2079           0 :         if (!bdev)
    2080             :                 return;
    2081             : 
    2082           0 :         for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
    2083           0 :                 if (bdev_is_zoned(bdev))
    2084           0 :                         btrfs_reset_sb_log_zones(bdev, copy_num);
    2085             :                 else
    2086           0 :                         btrfs_scratch_superblock(fs_info, bdev, copy_num);
    2087             :         }
    2088             : 
    2089             :         /* Notify udev that device has changed */
    2090           0 :         btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
    2091             : 
    2092             :         /* Update ctime/mtime for device path for libblkid */
    2093           0 :         update_dev_time(device_path);
    2094             : }
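Scratching only wipes the magic in every super block copy, which is enough to stop the device from being detected as btrfs while leaving the rest of its contents untouched. Purely as an illustration of which byte offsets the loop above visits, the copy locations from the documented btrfs on-disk layout (primary copy at 64 KiB, mirror n at 16 KiB << (12 * n)) can be printed with a small sketch like this (not kernel code; the macros mirror the kernel constants only by assumption):

/* Illustrative sketch: compute the byte offsets of the btrfs super block
 * copies that the scratch loop above walks. */
#include <stdio.h>

#define SB_PRIMARY_OFFSET	(64ULL * 1024)	/* primary copy at 64 KiB */
#define SB_MIRROR_MAX		3
#define SB_MIRROR_SHIFT		12

static unsigned long long sb_offset(int copy_num)
{
	if (copy_num == 0)
		return SB_PRIMARY_OFFSET;
	return (16ULL * 1024) << (SB_MIRROR_SHIFT * copy_num);
}

int main(void)
{
	for (int copy = 0; copy < SB_MIRROR_MAX; copy++)
		printf("super block copy %d at byte offset %llu\n",
		       copy, sb_offset(copy));
	return 0;
}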
    2095             : 
    2096           1 : int btrfs_rm_device(struct btrfs_fs_info *fs_info,
    2097             :                     struct btrfs_dev_lookup_args *args,
    2098             :                     struct block_device **bdev, void **holder)
    2099             : {
    2100           1 :         struct btrfs_trans_handle *trans;
    2101           1 :         struct btrfs_device *device;
    2102           1 :         struct btrfs_fs_devices *cur_devices;
    2103           1 :         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
    2104           1 :         u64 num_devices;
    2105           1 :         int ret = 0;
    2106             : 
    2107           1 :         if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
    2108           0 :                 btrfs_err(fs_info, "device remove not supported on extent tree v2 yet");
    2109           0 :                 return -EINVAL;
    2110             :         }
    2111             : 
    2112             :         /*
    2113             :          * The device list in fs_devices is accessed without locks (neither
    2114             :          * uuid_mutex nor device_list_mutex) as it won't change on a mounted
    2115             :          * filesystem and another device rm cannot run.
    2116             :          */
    2117           1 :         num_devices = btrfs_num_devices(fs_info);
    2118             : 
    2119           1 :         ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
    2120           1 :         if (ret)
    2121             :                 return ret;
    2122             : 
    2123           1 :         device = btrfs_find_device(fs_info->fs_devices, args);
    2124           1 :         if (!device) {
    2125           1 :                 if (args->missing)
    2126             :                         ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
    2127             :                 else
    2128           1 :                         ret = -ENOENT;
    2129           1 :                 return ret;
    2130             :         }
    2131             : 
    2132           0 :         if (btrfs_pinned_by_swapfile(fs_info, device)) {
    2133           0 :                 btrfs_warn_in_rcu(fs_info,
    2134             :                   "cannot remove device %s (devid %llu) due to active swapfile",
    2135             :                                   btrfs_dev_name(device), device->devid);
    2136           0 :                 return -ETXTBSY;
    2137             :         }
    2138             : 
    2139           0 :         if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
    2140             :                 return BTRFS_ERROR_DEV_TGT_REPLACE;
    2141             : 
    2142           0 :         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
    2143           0 :             fs_info->fs_devices->rw_devices == 1)
    2144             :                 return BTRFS_ERROR_DEV_ONLY_WRITABLE;
    2145             : 
    2146           0 :         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
    2147           0 :                 mutex_lock(&fs_info->chunk_mutex);
    2148           0 :                 list_del_init(&device->dev_alloc_list);
    2149           0 :                 device->fs_devices->rw_devices--;
    2150           0 :                 mutex_unlock(&fs_info->chunk_mutex);
    2151             :         }
    2152             : 
    2153           0 :         ret = btrfs_shrink_device(device, 0);
    2154           0 :         if (ret)
    2155           0 :                 goto error_undo;
    2156             : 
    2157           0 :         trans = btrfs_start_transaction(fs_info->chunk_root, 0);
    2158           0 :         if (IS_ERR(trans)) {
    2159           0 :                 ret = PTR_ERR(trans);
    2160           0 :                 goto error_undo;
    2161             :         }
    2162             : 
    2163           0 :         ret = btrfs_rm_dev_item(trans, device);
    2164           0 :         if (ret) {
    2165             :                 /* Any error in dev item removal is critical */
    2166           0 :                 btrfs_crit(fs_info,
    2167             :                            "failed to remove device item for devid %llu: %d",
    2168             :                            device->devid, ret);
    2169           0 :                 btrfs_abort_transaction(trans, ret);
    2170           0 :                 btrfs_end_transaction(trans);
    2171           0 :                 return ret;
    2172             :         }
    2173             : 
    2174           0 :         clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
    2175           0 :         btrfs_scrub_cancel_dev(device);
    2176             : 
    2177             :         /*
     2178             :          * The device list mutex makes sure that we don't change
     2179             :          * the device list while someone else is writing out all
     2180             :          * the device supers. Whoever is writing all supers should
    2181             :          * lock the device list mutex before getting the number of
    2182             :          * devices in the super block (super_copy). Conversely,
    2183             :          * whoever updates the number of devices in the super block
    2184             :          * (super_copy) should hold the device list mutex.
    2185             :          */
    2186             : 
    2187             :         /*
     2188             :          * In normal cases cur_devices == fs_devices. But when deleting
     2189             :          * a seed device, cur_devices should point to its own fs_devices
     2190             :          * listed under fs_devices->seed_list.
    2191             :          */
    2192           0 :         cur_devices = device->fs_devices;
    2193           0 :         mutex_lock(&fs_devices->device_list_mutex);
    2194           0 :         list_del_rcu(&device->dev_list);
    2195             : 
    2196           0 :         cur_devices->num_devices--;
    2197           0 :         cur_devices->total_devices--;
     2198             :         /* Update total_devices of the parent fs_devices if it's a seed */
    2199           0 :         if (cur_devices != fs_devices)
    2200           0 :                 fs_devices->total_devices--;
    2201             : 
    2202           0 :         if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
    2203           0 :                 cur_devices->missing_devices--;
    2204             : 
    2205           0 :         btrfs_assign_next_active_device(device, NULL);
    2206             : 
    2207           0 :         if (device->bdev) {
    2208           0 :                 cur_devices->open_devices--;
    2209             :                 /* remove sysfs entry */
    2210           0 :                 btrfs_sysfs_remove_device(device);
    2211             :         }
    2212             : 
    2213           0 :         num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
    2214           0 :         btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
    2215           0 :         mutex_unlock(&fs_devices->device_list_mutex);
    2216             : 
    2217             :         /*
    2218             :          * At this point, the device is zero sized and detached from the
    2219             :          * devices list.  All that's left is to zero out the old supers and
    2220             :          * free the device.
    2221             :          *
    2222             :          * We cannot call btrfs_close_bdev() here because we're holding the sb
    2223             :          * write lock, and blkdev_put() will pull in the ->open_mutex on the
     2224             :          * block device and its dependencies.  Instead just flush the device
    2225             :          * and let the caller do the final blkdev_put.
    2226             :          */
    2227           0 :         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
    2228           0 :                 btrfs_scratch_superblocks(fs_info, device->bdev,
    2229           0 :                                           device->name->str);
    2230           0 :                 if (device->bdev) {
    2231           0 :                         sync_blockdev(device->bdev);
    2232           0 :                         invalidate_bdev(device->bdev);
    2233             :                 }
    2234             :         }
    2235             : 
    2236           0 :         *bdev = device->bdev;
    2237           0 :         *holder = device->holder;
    2238           0 :         synchronize_rcu();
    2239           0 :         btrfs_free_device(device);
    2240             : 
    2241             :         /*
    2242             :          * This can happen if cur_devices is the private seed devices list.  We
    2243             :          * cannot call close_fs_devices() here because it expects the uuid_mutex
    2244             :          * to be held, but in fact we don't need that for the private
     2245             :          * seed_devices; we can simply decrement cur_devices->opened and then
    2246             :          * remove it from our list and free the fs_devices.
    2247             :          */
    2248           0 :         if (cur_devices->num_devices == 0) {
    2249           0 :                 list_del_init(&cur_devices->seed_list);
    2250           0 :                 ASSERT(cur_devices->opened == 1);
    2251           0 :                 cur_devices->opened--;
    2252           0 :                 free_fs_devices(cur_devices);
    2253             :         }
    2254             : 
    2255           0 :         ret = btrfs_commit_transaction(trans);
    2256             : 
    2257           0 :         return ret;
    2258             : 
    2259           0 : error_undo:
    2260           0 :         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
    2261           0 :                 mutex_lock(&fs_info->chunk_mutex);
    2262           0 :                 list_add(&device->dev_alloc_list,
    2263             :                          &fs_devices->alloc_list);
    2264           0 :                 device->fs_devices->rw_devices++;
    2265           0 :                 mutex_unlock(&fs_info->chunk_mutex);
    2266             :         }
    2267             :         return ret;
    2268             : }
    2269             : 
    2270           0 : void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
    2271             : {
    2272           0 :         struct btrfs_fs_devices *fs_devices;
    2273             : 
    2274           0 :         lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);
    2275             : 
    2276             :         /*
     2277             :          * In case of an fs with no seed, srcdev->fs_devices will point
     2278             :          * to the fs_devices of fs_info. However, when the dev being replaced
     2279             :          * is a seed dev, it will point to the seed's local fs_devices. In
     2280             :          * short, srcdev will have its correct fs_devices in both cases.
    2281             :          */
    2282           0 :         fs_devices = srcdev->fs_devices;
    2283             : 
    2284           0 :         list_del_rcu(&srcdev->dev_list);
    2285           0 :         list_del(&srcdev->dev_alloc_list);
    2286           0 :         fs_devices->num_devices--;
    2287           0 :         if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
    2288           0 :                 fs_devices->missing_devices--;
    2289             : 
    2290           0 :         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
    2291           0 :                 fs_devices->rw_devices--;
    2292             : 
    2293           0 :         if (srcdev->bdev)
    2294           0 :                 fs_devices->open_devices--;
    2295           0 : }
    2296             : 
    2297           0 : void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
    2298             : {
    2299           0 :         struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
    2300             : 
    2301           0 :         mutex_lock(&uuid_mutex);
    2302             : 
    2303           0 :         btrfs_close_bdev(srcdev);
    2304           0 :         synchronize_rcu();
    2305           0 :         btrfs_free_device(srcdev);
    2306             : 
     2307             :         /* If there are no devs left, we'd rather delete the fs_devices */
    2308           0 :         if (!fs_devices->num_devices) {
    2309             :                 /*
    2310             :                  * On a mounted FS, num_devices can't be zero unless it's a
    2311             :                  * seed. In case of a seed device being replaced, the replace
     2312             :                  * target is added to the sprout FS, so there will be no more
     2313             :                  * devices left under the seed FS.
    2314             :                  */
    2315           0 :                 ASSERT(fs_devices->seeding);
    2316             : 
    2317           0 :                 list_del_init(&fs_devices->seed_list);
    2318           0 :                 close_fs_devices(fs_devices);
    2319           0 :                 free_fs_devices(fs_devices);
    2320             :         }
    2321           0 :         mutex_unlock(&uuid_mutex);
    2322           0 : }
    2323             : 
    2324           0 : void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
    2325             : {
    2326           0 :         struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;
    2327             : 
    2328           0 :         mutex_lock(&fs_devices->device_list_mutex);
    2329             : 
    2330           0 :         btrfs_sysfs_remove_device(tgtdev);
    2331             : 
    2332           0 :         if (tgtdev->bdev)
    2333           0 :                 fs_devices->open_devices--;
    2334             : 
    2335           0 :         fs_devices->num_devices--;
    2336             : 
    2337           0 :         btrfs_assign_next_active_device(tgtdev, NULL);
    2338             : 
    2339           0 :         list_del_rcu(&tgtdev->dev_list);
    2340             : 
    2341           0 :         mutex_unlock(&fs_devices->device_list_mutex);
    2342             : 
    2343           0 :         btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
    2344           0 :                                   tgtdev->name->str);
    2345             : 
    2346           0 :         btrfs_close_bdev(tgtdev);
    2347           0 :         synchronize_rcu();
    2348           0 :         btrfs_free_device(tgtdev);
    2349           0 : }
    2350             : 
    2351             : /*
    2352             :  * Populate args from device at path.
    2353             :  *
    2354             :  * @fs_info:    the filesystem
    2355             :  * @args:       the args to populate
    2356             :  * @path:       the path to the device
    2357             :  *
    2358             :  * This will read the super block of the device at @path and populate @args with
    2359             :  * the devid, fsid, and uuid.  This is meant to be used for ioctls that need to
    2360             :  * lookup a device to operate on, but need to do it before we take any locks.
     2361             :  * look up a device to operate on, but need to do it before we take any locks.
    2362             :  * and does some basic sanity checks.  The caller must make sure that @path is
    2363             :  * properly NUL terminated before calling in, and must call
    2364             :  * btrfs_put_dev_args_from_path() in order to free up the temporary fsid and
    2365             :  * uuid buffers.
    2366             :  *
    2367             :  * Return: 0 for success, -errno for failure
    2368             :  */
    2369           0 : int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
    2370             :                                  struct btrfs_dev_lookup_args *args,
    2371             :                                  const char *path)
    2372             : {
    2373           0 :         struct btrfs_super_block *disk_super;
    2374           0 :         struct block_device *bdev;
    2375           0 :         int ret;
    2376             : 
    2377           0 :         if (!path || !path[0])
    2378             :                 return -EINVAL;
    2379           0 :         if (!strcmp(path, "missing")) {
    2380           0 :                 args->missing = true;
    2381           0 :                 return 0;
    2382             :         }
    2383             : 
    2384           0 :         args->uuid = kzalloc(BTRFS_UUID_SIZE, GFP_KERNEL);
    2385           0 :         args->fsid = kzalloc(BTRFS_FSID_SIZE, GFP_KERNEL);
    2386           0 :         if (!args->uuid || !args->fsid) {
    2387           0 :                 btrfs_put_dev_args_from_path(args);
    2388           0 :                 return -ENOMEM;
    2389             :         }
    2390             : 
    2391           0 :         ret = btrfs_get_bdev_and_sb(path, BLK_OPEN_READ, NULL, 0,
    2392             :                                     &bdev, &disk_super);
    2393           0 :         if (ret) {
    2394           0 :                 btrfs_put_dev_args_from_path(args);
    2395           0 :                 return ret;
    2396             :         }
    2397             : 
    2398           0 :         args->devid = btrfs_stack_device_id(&disk_super->dev_item);
    2399           0 :         memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE);
    2400           0 :         if (btrfs_fs_incompat(fs_info, METADATA_UUID))
    2401           0 :                 memcpy(args->fsid, disk_super->metadata_uuid, BTRFS_FSID_SIZE);
    2402             :         else
    2403           0 :                 memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
    2404           0 :         btrfs_release_disk_super(disk_super);
    2405           0 :         blkdev_put(bdev, NULL);
    2406           0 :         return 0;
    2407             : }
    2408             : 
    2409             : /*
    2410             :  * Only use this jointly with btrfs_get_dev_args_from_path() because we will
     2411             :  * allocate our ->uuid and ->fsid pointers; everybody else uses local variables
    2412             :  * that don't need to be freed.
    2413             :  */
    2414           1 : void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args)
    2415             : {
    2416           1 :         kfree(args->uuid);
    2417           1 :         kfree(args->fsid);
    2418           1 :         args->uuid = NULL;
    2419           1 :         args->fsid = NULL;
    2420           1 : }
    2421             : 
    2422           0 : struct btrfs_device *btrfs_find_device_by_devspec(
    2423             :                 struct btrfs_fs_info *fs_info, u64 devid,
    2424             :                 const char *device_path)
    2425             : {
    2426           0 :         BTRFS_DEV_LOOKUP_ARGS(args);
    2427           0 :         struct btrfs_device *device;
    2428           0 :         int ret;
    2429             : 
    2430           0 :         if (devid) {
    2431           0 :                 args.devid = devid;
    2432           0 :                 device = btrfs_find_device(fs_info->fs_devices, &args);
    2433           0 :                 if (!device)
    2434             :                         return ERR_PTR(-ENOENT);
    2435           0 :                 return device;
    2436             :         }
    2437             : 
    2438           0 :         ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path);
    2439           0 :         if (ret)
    2440           0 :                 return ERR_PTR(ret);
    2441           0 :         device = btrfs_find_device(fs_info->fs_devices, &args);
    2442           0 :         btrfs_put_dev_args_from_path(&args);
    2443           0 :         if (!device)
    2444           0 :                 return ERR_PTR(-ENOENT);
    2445             :         return device;
    2446             : }
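A device can therefore be named either by a non-zero devid or by a path, and btrfs_get_dev_args_from_path() additionally accepts the literal string "missing". As a hedged user-space sketch (names are illustrative, not an existing API), a tool that takes a device specifier might classify it before handing it to the kernel like this:

/* Illustrative classification of a device specifier string into the three
 * forms handled above: a numeric devid, the literal "missing", or a path. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

enum devspec_kind { DEVSPEC_DEVID, DEVSPEC_MISSING, DEVSPEC_PATH };

static enum devspec_kind classify_devspec(const char *spec,
					  unsigned long long *devid)
{
	char *end;

	if (strcmp(spec, "missing") == 0)
		return DEVSPEC_MISSING;

	*devid = strtoull(spec, &end, 10);
	if (*devid != 0 && *end == '\0')
		return DEVSPEC_DEVID;

	return DEVSPEC_PATH;
}

int main(int argc, char **argv)
{
	unsigned long long devid = 0;

	if (argc < 2)
		return 1;

	switch (classify_devspec(argv[1], &devid)) {
	case DEVSPEC_DEVID:
		printf("look up by devid %llu\n", devid);
		break;
	case DEVSPEC_MISSING:
		printf("look up the missing device\n");
		break;
	default:
		printf("look up by path %s\n", argv[1]);
		break;
	}
	return 0;
}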
    2447             : 
    2448           0 : static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info)
    2449             : {
    2450           0 :         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
    2451           0 :         struct btrfs_fs_devices *old_devices;
    2452           0 :         struct btrfs_fs_devices *seed_devices;
    2453             : 
    2454           0 :         lockdep_assert_held(&uuid_mutex);
    2455           0 :         if (!fs_devices->seeding)
    2456             :                 return ERR_PTR(-EINVAL);
    2457             : 
    2458             :         /*
    2459             :          * Private copy of the seed devices, anchored at
    2460             :          * fs_info->fs_devices->seed_list
    2461             :          */
    2462           0 :         seed_devices = alloc_fs_devices(NULL, NULL);
    2463           0 :         if (IS_ERR(seed_devices))
    2464             :                 return seed_devices;
    2465             : 
    2466             :         /*
    2467             :          * It's necessary to retain a copy of the original seed fs_devices in
    2468             :          * fs_uuids so that filesystems which have been seeded can successfully
    2469             :          * reference the seed device from open_seed_devices. This also supports
     2470             :  * multiple seed filesystems.
    2471             :          */
    2472           0 :         old_devices = clone_fs_devices(fs_devices);
    2473           0 :         if (IS_ERR(old_devices)) {
    2474           0 :                 kfree(seed_devices);
    2475           0 :                 return old_devices;
    2476             :         }
    2477             : 
    2478           0 :         list_add(&old_devices->fs_list, &fs_uuids);
    2479             : 
    2480           0 :         memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
    2481           0 :         seed_devices->opened = 1;
    2482           0 :         INIT_LIST_HEAD(&seed_devices->devices);
    2483           0 :         INIT_LIST_HEAD(&seed_devices->alloc_list);
    2484           0 :         mutex_init(&seed_devices->device_list_mutex);
    2485             : 
    2486           0 :         return seed_devices;
    2487             : }
    2488             : 
    2489             : /*
    2490             :  * Splice seed devices into the sprout fs_devices.
    2491             :  * Generate a new fsid for the sprouted read-write filesystem.
    2492             :  */
    2493           0 : static void btrfs_setup_sprout(struct btrfs_fs_info *fs_info,
    2494             :                                struct btrfs_fs_devices *seed_devices)
    2495             : {
    2496           0 :         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
    2497           0 :         struct btrfs_super_block *disk_super = fs_info->super_copy;
    2498           0 :         struct btrfs_device *device;
    2499           0 :         u64 super_flags;
    2500             : 
    2501             :         /*
     2502             :          * We are updating the fsid; the thread leading to device_list_add()
    2503             :          * could race, so uuid_mutex is needed.
    2504             :          */
    2505           0 :         lockdep_assert_held(&uuid_mutex);
    2506             : 
    2507             :         /*
    2508             :          * The threads listed below may traverse dev_list but can do that without
    2509             :          * device_list_mutex:
    2510             :          * - All device ops and balance - as we are in btrfs_exclop_start.
    2511             :          * - Various dev_list readers - are using RCU.
    2512             :          * - btrfs_ioctl_fitrim() - is using RCU.
    2513             :          *
     2514             :          * The read-only threads below use device_list_mutex:
    2515             :          * - Readonly scrub btrfs_scrub_dev()
    2516             :          * - Readonly scrub btrfs_scrub_progress()
    2517             :          * - btrfs_get_dev_stats()
    2518             :          */
    2519           0 :         lockdep_assert_held(&fs_devices->device_list_mutex);
    2520             : 
    2521           0 :         list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
    2522             :                               synchronize_rcu);
    2523           0 :         list_for_each_entry(device, &seed_devices->devices, dev_list)
    2524           0 :                 device->fs_devices = seed_devices;
    2525             : 
    2526           0 :         fs_devices->seeding = false;
    2527           0 :         fs_devices->num_devices = 0;
    2528           0 :         fs_devices->open_devices = 0;
    2529           0 :         fs_devices->missing_devices = 0;
    2530           0 :         fs_devices->rotating = false;
    2531           0 :         list_add(&seed_devices->seed_list, &fs_devices->seed_list);
    2532             : 
    2533           0 :         generate_random_uuid(fs_devices->fsid);
    2534           0 :         memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
    2535           0 :         memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
    2536             : 
    2537           0 :         super_flags = btrfs_super_flags(disk_super) &
    2538             :                       ~BTRFS_SUPER_FLAG_SEEDING;
    2539           0 :         btrfs_set_super_flags(disk_super, super_flags);
    2540           0 : }
    2541             : 
    2542             : /*
    2543             :  * Store the expected generation for seed devices in device items.
    2544             :  */
    2545           0 : static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
    2546             : {
    2547           0 :         BTRFS_DEV_LOOKUP_ARGS(args);
    2548           0 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    2549           0 :         struct btrfs_root *root = fs_info->chunk_root;
    2550           0 :         struct btrfs_path *path;
    2551           0 :         struct extent_buffer *leaf;
    2552           0 :         struct btrfs_dev_item *dev_item;
    2553           0 :         struct btrfs_device *device;
    2554           0 :         struct btrfs_key key;
    2555           0 :         u8 fs_uuid[BTRFS_FSID_SIZE];
    2556           0 :         u8 dev_uuid[BTRFS_UUID_SIZE];
    2557           0 :         int ret;
    2558             : 
    2559           0 :         path = btrfs_alloc_path();
    2560           0 :         if (!path)
    2561             :                 return -ENOMEM;
    2562             : 
    2563           0 :         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
    2564           0 :         key.offset = 0;
    2565           0 :         key.type = BTRFS_DEV_ITEM_KEY;
    2566             : 
    2567           0 :         while (1) {
    2568           0 :                 btrfs_reserve_chunk_metadata(trans, false);
    2569           0 :                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
    2570           0 :                 btrfs_trans_release_chunk_metadata(trans);
    2571           0 :                 if (ret < 0)
    2572           0 :                         goto error;
    2573             : 
    2574           0 :                 leaf = path->nodes[0];
    2575           0 : next_slot:
    2576           0 :                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
    2577           0 :                         ret = btrfs_next_leaf(root, path);
    2578           0 :                         if (ret > 0)
    2579             :                                 break;
    2580           0 :                         if (ret < 0)
    2581           0 :                                 goto error;
    2582           0 :                         leaf = path->nodes[0];
    2583           0 :                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
    2584           0 :                         btrfs_release_path(path);
    2585           0 :                         continue;
    2586             :                 }
    2587             : 
    2588           0 :                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
    2589           0 :                 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
    2590           0 :                     key.type != BTRFS_DEV_ITEM_KEY)
    2591             :                         break;
    2592             : 
    2593           0 :                 dev_item = btrfs_item_ptr(leaf, path->slots[0],
    2594             :                                           struct btrfs_dev_item);
    2595           0 :                 args.devid = btrfs_device_id(leaf, dev_item);
    2596           0 :                 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
    2597             :                                    BTRFS_UUID_SIZE);
    2598           0 :                 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
    2599             :                                    BTRFS_FSID_SIZE);
    2600           0 :                 args.uuid = dev_uuid;
    2601           0 :                 args.fsid = fs_uuid;
    2602           0 :                 device = btrfs_find_device(fs_info->fs_devices, &args);
    2603           0 :                 BUG_ON(!device); /* Logic error */
    2604             : 
    2605           0 :                 if (device->fs_devices->seeding) {
    2606           0 :                         btrfs_set_device_generation(leaf, dev_item,
    2607             :                                                     device->generation);
    2608           0 :                         btrfs_mark_buffer_dirty(leaf);
    2609             :                 }
    2610             : 
    2611           0 :                 path->slots[0]++;
    2612           0 :                 goto next_slot;
    2613             :         }
    2614             :         ret = 0;
    2615           0 : error:
    2616           0 :         btrfs_free_path(path);
    2617           0 :         return ret;
    2618             : }
    2619             : 
    2620           0 : int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
    2621             : {
    2622           0 :         struct btrfs_root *root = fs_info->dev_root;
    2623           0 :         struct btrfs_trans_handle *trans;
    2624           0 :         struct btrfs_device *device;
    2625           0 :         struct block_device *bdev;
    2626           0 :         struct super_block *sb = fs_info->sb;
    2627           0 :         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
    2628           0 :         struct btrfs_fs_devices *seed_devices = NULL;
    2629           0 :         u64 orig_super_total_bytes;
    2630           0 :         u64 orig_super_num_devices;
    2631           0 :         int ret = 0;
    2632           0 :         bool seeding_dev = false;
    2633           0 :         bool locked = false;
    2634             : 
    2635           0 :         if (sb_rdonly(sb) && !fs_devices->seeding)
    2636             :                 return -EROFS;
    2637             : 
    2638           0 :         bdev = blkdev_get_by_path(device_path, BLK_OPEN_WRITE,
    2639             :                                   fs_info->bdev_holder, NULL);
    2640           0 :         if (IS_ERR(bdev))
    2641           0 :                 return PTR_ERR(bdev);
    2642             : 
    2643           0 :         if (!btrfs_check_device_zone_type(fs_info, bdev)) {
    2644           0 :                 ret = -EINVAL;
    2645           0 :                 goto error;
    2646             :         }
    2647             : 
    2648           0 :         if (fs_devices->seeding) {
    2649           0 :                 seeding_dev = true;
    2650           0 :                 down_write(&sb->s_umount);
    2651           0 :                 mutex_lock(&uuid_mutex);
    2652           0 :                 locked = true;
    2653             :         }
    2654             : 
    2655           0 :         sync_blockdev(bdev);
    2656             : 
    2657           0 :         rcu_read_lock();
    2658           0 :         list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
    2659           0 :                 if (device->bdev == bdev) {
    2660           0 :                         ret = -EEXIST;
    2661           0 :                         rcu_read_unlock();
    2662           0 :                         goto error;
    2663             :                 }
    2664             :         }
    2665           0 :         rcu_read_unlock();
    2666             : 
    2667           0 :         device = btrfs_alloc_device(fs_info, NULL, NULL, device_path);
    2668           0 :         if (IS_ERR(device)) {
    2669             :                 /* we can safely leave the fs_devices entry around */
    2670           0 :                 ret = PTR_ERR(device);
    2671           0 :                 goto error;
    2672             :         }
    2673             : 
    2674           0 :         device->fs_info = fs_info;
    2675           0 :         device->bdev = bdev;
    2676           0 :         ret = lookup_bdev(device_path, &device->devt);
    2677           0 :         if (ret)
    2678           0 :                 goto error_free_device;
    2679             : 
    2680           0 :         ret = btrfs_get_dev_zone_info(device, false);
    2681           0 :         if (ret)
    2682           0 :                 goto error_free_device;
    2683             : 
    2684           0 :         trans = btrfs_start_transaction(root, 0);
    2685           0 :         if (IS_ERR(trans)) {
    2686           0 :                 ret = PTR_ERR(trans);
    2687           0 :                 goto error_free_zone;
    2688             :         }
    2689             : 
    2690           0 :         set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
    2691           0 :         device->generation = trans->transid;
    2692           0 :         device->io_width = fs_info->sectorsize;
    2693           0 :         device->io_align = fs_info->sectorsize;
    2694           0 :         device->sector_size = fs_info->sectorsize;
    2695           0 :         device->total_bytes =
    2696           0 :                 round_down(bdev_nr_bytes(bdev), fs_info->sectorsize);
    2697           0 :         device->disk_total_bytes = device->total_bytes;
    2698           0 :         device->commit_total_bytes = device->total_bytes;
    2699           0 :         set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
    2700           0 :         clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
    2701           0 :         device->holder = fs_info->bdev_holder;
    2702           0 :         device->dev_stats_valid = 1;
    2703           0 :         set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
    2704             : 
    2705           0 :         if (seeding_dev) {
    2706           0 :                 btrfs_clear_sb_rdonly(sb);
    2707             : 
    2708             :                 /* GFP_KERNEL allocation must not be under device_list_mutex */
    2709           0 :                 seed_devices = btrfs_init_sprout(fs_info);
    2710           0 :                 if (IS_ERR(seed_devices)) {
    2711           0 :                         ret = PTR_ERR(seed_devices);
    2712           0 :                         btrfs_abort_transaction(trans, ret);
    2713           0 :                         goto error_trans;
    2714             :                 }
    2715             :         }
    2716             : 
    2717           0 :         mutex_lock(&fs_devices->device_list_mutex);
    2718           0 :         if (seeding_dev) {
    2719           0 :                 btrfs_setup_sprout(fs_info, seed_devices);
    2720           0 :                 btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev,
    2721             :                                                 device);
    2722             :         }
    2723             : 
    2724           0 :         device->fs_devices = fs_devices;
    2725             : 
    2726           0 :         mutex_lock(&fs_info->chunk_mutex);
    2727           0 :         list_add_rcu(&device->dev_list, &fs_devices->devices);
    2728           0 :         list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
    2729           0 :         fs_devices->num_devices++;
    2730           0 :         fs_devices->open_devices++;
    2731           0 :         fs_devices->rw_devices++;
    2732           0 :         fs_devices->total_devices++;
    2733           0 :         fs_devices->total_rw_bytes += device->total_bytes;
    2734             : 
    2735           0 :         atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
    2736             : 
    2737           0 :         if (!bdev_nonrot(bdev))
    2738           0 :                 fs_devices->rotating = true;
    2739             : 
    2740           0 :         orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
    2741           0 :         btrfs_set_super_total_bytes(fs_info->super_copy,
    2742           0 :                 round_down(orig_super_total_bytes + device->total_bytes,
    2743             :                            fs_info->sectorsize));
    2744             : 
    2745           0 :         orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
    2746           0 :         btrfs_set_super_num_devices(fs_info->super_copy,
    2747             :                                     orig_super_num_devices + 1);
    2748             : 
    2749             :         /*
     2750             :          * We've got more storage; clear any full flags on the space
     2751             :          * infos.
    2752             :          */
    2753           0 :         btrfs_clear_space_info_full(fs_info);
    2754             : 
    2755           0 :         mutex_unlock(&fs_info->chunk_mutex);
    2756             : 
    2757             :         /* Add sysfs device entry */
    2758           0 :         btrfs_sysfs_add_device(device);
    2759             : 
    2760           0 :         mutex_unlock(&fs_devices->device_list_mutex);
    2761             : 
    2762           0 :         if (seeding_dev) {
    2763           0 :                 mutex_lock(&fs_info->chunk_mutex);
    2764           0 :                 ret = init_first_rw_device(trans);
    2765           0 :                 mutex_unlock(&fs_info->chunk_mutex);
    2766           0 :                 if (ret) {
    2767           0 :                         btrfs_abort_transaction(trans, ret);
    2768           0 :                         goto error_sysfs;
    2769             :                 }
    2770             :         }
    2771             : 
    2772           0 :         ret = btrfs_add_dev_item(trans, device);
    2773           0 :         if (ret) {
    2774           0 :                 btrfs_abort_transaction(trans, ret);
    2775           0 :                 goto error_sysfs;
    2776             :         }
    2777             : 
    2778           0 :         if (seeding_dev) {
    2779           0 :                 ret = btrfs_finish_sprout(trans);
    2780           0 :                 if (ret) {
    2781           0 :                         btrfs_abort_transaction(trans, ret);
    2782           0 :                         goto error_sysfs;
    2783             :                 }
    2784             : 
    2785             :                 /*
    2786             :                  * fs_devices now represents the newly sprouted filesystem and
     2787             :                  * its fsid has been changed by btrfs_setup_sprout().
    2788             :                  */
    2789           0 :                 btrfs_sysfs_update_sprout_fsid(fs_devices);
    2790             :         }
    2791             : 
    2792           0 :         ret = btrfs_commit_transaction(trans);
    2793             : 
    2794           0 :         if (seeding_dev) {
    2795           0 :                 mutex_unlock(&uuid_mutex);
    2796           0 :                 up_write(&sb->s_umount);
    2797           0 :                 locked = false;
    2798             : 
    2799           0 :                 if (ret) /* transaction commit */
    2800             :                         return ret;
    2801             : 
    2802           0 :                 ret = btrfs_relocate_sys_chunks(fs_info);
    2803           0 :                 if (ret < 0)
    2804           0 :                         btrfs_handle_fs_error(fs_info, ret,
    2805             :                                     "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
    2806           0 :                 trans = btrfs_attach_transaction(root);
    2807           0 :                 if (IS_ERR(trans)) {
    2808           0 :                         if (PTR_ERR(trans) == -ENOENT)
    2809             :                                 return 0;
    2810           0 :                         ret = PTR_ERR(trans);
    2811           0 :                         trans = NULL;
    2812           0 :                         goto error_sysfs;
    2813             :                 }
    2814           0 :                 ret = btrfs_commit_transaction(trans);
    2815             :         }
    2816             : 
    2817             :         /*
    2818             :          * Now that we have written a new super block to this device, check all
     2819             :          * other fs_devices lists to see if device_path alienates any other
     2820             :          * scanned device.
    2821             :          * We can ignore the return value as it typically returns -EINVAL and
    2822             :          * only succeeds if the device was an alien.
    2823             :          */
    2824           0 :         btrfs_forget_devices(device->devt);
    2825             : 
    2826             :         /* Update ctime/mtime for blkid or udev */
    2827           0 :         update_dev_time(device_path);
    2828             : 
    2829           0 :         return ret;
    2830             : 
    2831           0 : error_sysfs:
    2832           0 :         btrfs_sysfs_remove_device(device);
    2833           0 :         mutex_lock(&fs_info->fs_devices->device_list_mutex);
    2834           0 :         mutex_lock(&fs_info->chunk_mutex);
    2835           0 :         list_del_rcu(&device->dev_list);
    2836           0 :         list_del(&device->dev_alloc_list);
    2837           0 :         fs_info->fs_devices->num_devices--;
    2838           0 :         fs_info->fs_devices->open_devices--;
    2839           0 :         fs_info->fs_devices->rw_devices--;
    2840           0 :         fs_info->fs_devices->total_devices--;
    2841           0 :         fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
    2842           0 :         atomic64_sub(device->total_bytes, &fs_info->free_chunk_space);
    2843           0 :         btrfs_set_super_total_bytes(fs_info->super_copy,
    2844             :                                     orig_super_total_bytes);
    2845           0 :         btrfs_set_super_num_devices(fs_info->super_copy,
    2846             :                                     orig_super_num_devices);
    2847           0 :         mutex_unlock(&fs_info->chunk_mutex);
    2848           0 :         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
    2849           0 : error_trans:
    2850           0 :         if (seeding_dev)
    2851           0 :                 btrfs_set_sb_rdonly(sb);
    2852           0 :         if (trans)
    2853           0 :                 btrfs_end_transaction(trans);
    2854           0 : error_free_zone:
    2855           0 :         btrfs_destroy_dev_zone_info(device);
    2856           0 : error_free_device:
    2857           0 :         btrfs_free_device(device);
    2858           0 : error:
    2859           0 :         blkdev_put(bdev, fs_info->bdev_holder);
    2860           0 :         if (locked) {
    2861           0 :                 mutex_unlock(&uuid_mutex);
    2862           0 :                 up_write(&sb->s_umount);
    2863             :         }
    2864             :         return ret;
    2865             : }
    2866             : 
    2867        2566 : static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
    2868             :                                         struct btrfs_device *device)
    2869             : {
    2870        2566 :         int ret;
    2871        2566 :         struct btrfs_path *path;
    2872        2566 :         struct btrfs_root *root = device->fs_info->chunk_root;
    2873        2566 :         struct btrfs_dev_item *dev_item;
    2874        2566 :         struct extent_buffer *leaf;
    2875        2566 :         struct btrfs_key key;
    2876             : 
    2877        2566 :         path = btrfs_alloc_path();
    2878        2566 :         if (!path)
    2879             :                 return -ENOMEM;
    2880             : 
    2881        2566 :         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
    2882        2566 :         key.type = BTRFS_DEV_ITEM_KEY;
    2883        2566 :         key.offset = device->devid;
    2884             : 
    2885        2566 :         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
    2886        2566 :         if (ret < 0)
    2887           0 :                 goto out;
    2888             : 
    2889        2566 :         if (ret > 0) {
    2890           0 :                 ret = -ENOENT;
    2891           0 :                 goto out;
    2892             :         }
    2893             : 
    2894        2566 :         leaf = path->nodes[0];
    2895        2566 :         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
    2896             : 
    2897        2566 :         btrfs_set_device_id(leaf, dev_item, device->devid);
    2898        2566 :         btrfs_set_device_type(leaf, dev_item, device->type);
    2899        2566 :         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
    2900        2566 :         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
    2901        2566 :         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
    2902        2566 :         btrfs_set_device_total_bytes(leaf, dev_item,
    2903             :                                      btrfs_device_get_disk_total_bytes(device));
    2904        2566 :         btrfs_set_device_bytes_used(leaf, dev_item,
    2905             :                                     btrfs_device_get_bytes_used(device));
    2906        2566 :         btrfs_mark_buffer_dirty(leaf);
    2907             : 
    2908        2566 : out:
    2909        2566 :         btrfs_free_path(path);
    2910        2566 :         return ret;
    2911             : }
    2912             : 
    2913           1 : int btrfs_grow_device(struct btrfs_trans_handle *trans,
    2914             :                       struct btrfs_device *device, u64 new_size)
    2915             : {
    2916           1 :         struct btrfs_fs_info *fs_info = device->fs_info;
    2917           1 :         struct btrfs_super_block *super_copy = fs_info->super_copy;
    2918           1 :         u64 old_total;
    2919           1 :         u64 diff;
    2920           1 :         int ret;
    2921             : 
    2922           1 :         if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
    2923             :                 return -EACCES;
    2924             : 
    2925           1 :         new_size = round_down(new_size, fs_info->sectorsize);
    2926             : 
    2927           1 :         mutex_lock(&fs_info->chunk_mutex);
    2928           1 :         old_total = btrfs_super_total_bytes(super_copy);
    2929           1 :         diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
    2930             : 
    2931           1 :         if (new_size <= device->total_bytes ||
    2932           0 :             test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
    2933           0 :                 mutex_unlock(&fs_info->chunk_mutex);
    2934           0 :                 return -EINVAL;
    2935             :         }
    2936             : 
    2937           1 :         btrfs_set_super_total_bytes(super_copy,
    2938           1 :                         round_down(old_total + diff, fs_info->sectorsize));
    2939           1 :         device->fs_devices->total_rw_bytes += diff;
    2940             : 
    2941           1 :         btrfs_device_set_total_bytes(device, new_size);
    2942           1 :         btrfs_device_set_disk_total_bytes(device, new_size);
    2943           1 :         btrfs_clear_space_info_full(device->fs_info);
    2944           1 :         if (list_empty(&device->post_commit_list))
    2945           1 :                 list_add_tail(&device->post_commit_list,
    2946           1 :                               &trans->transaction->dev_update_list);
    2947           1 :         mutex_unlock(&fs_info->chunk_mutex);
    2948             : 
    2949           1 :         btrfs_reserve_chunk_metadata(trans, false);
    2950           1 :         ret = btrfs_update_device(trans, device);
    2951           1 :         btrfs_trans_release_chunk_metadata(trans);
    2952             : 
    2953           1 :         return ret;
    2954             : }
    2955             : 
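/*
 * Editor's note: a worked example of the size accounting in
 * btrfs_grow_device() above, assuming an illustrative 4 KiB sectorsize.
 * Growing a device whose total_bytes is 10 GiB to a new_size of 20 GiB
 * gives diff = round_down(20 GiB - 10 GiB, 4096) = 10 GiB, so the
 * superblock total_bytes and fs_devices->total_rw_bytes both grow by
 * 10 GiB while the device's total_bytes and disk_total_bytes become
 * 20 GiB.  The numbers are illustrative only.
 */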
    2956         532 : static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
    2957             : {
    2958         532 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    2959         532 :         struct btrfs_root *root = fs_info->chunk_root;
    2960         532 :         int ret;
    2961         532 :         struct btrfs_path *path;
    2962         532 :         struct btrfs_key key;
    2963             : 
    2964         532 :         path = btrfs_alloc_path();
    2965         532 :         if (!path)
    2966             :                 return -ENOMEM;
    2967             : 
    2968         532 :         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
    2969         532 :         key.offset = chunk_offset;
    2970         532 :         key.type = BTRFS_CHUNK_ITEM_KEY;
    2971             : 
    2972         532 :         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
    2973         532 :         if (ret < 0)
    2974           0 :                 goto out;
    2975         532 :         else if (ret > 0) { /* Logic error or corruption */
    2976           0 :                 btrfs_handle_fs_error(fs_info, -ENOENT,
    2977             :                                       "Failed lookup while freeing chunk.");
    2978           0 :                 ret = -ENOENT;
    2979           0 :                 goto out;
    2980             :         }
    2981             : 
    2982         532 :         ret = btrfs_del_item(trans, root, path);
    2983         532 :         if (ret < 0)
    2984           0 :                 btrfs_handle_fs_error(fs_info, ret,
    2985             :                                       "Failed to delete chunk item.");
    2986         532 : out:
    2987         532 :         btrfs_free_path(path);
    2988         532 :         return ret;
    2989             : }
    2990             : 
    2991          89 : static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
    2992             : {
    2993          89 :         struct btrfs_super_block *super_copy = fs_info->super_copy;
    2994          89 :         struct btrfs_disk_key *disk_key;
    2995          89 :         struct btrfs_chunk *chunk;
    2996          89 :         u8 *ptr;
    2997          89 :         int ret = 0;
    2998          89 :         u32 num_stripes;
    2999          89 :         u32 array_size;
    3000          89 :         u32 len = 0;
    3001          89 :         u32 cur;
    3002          89 :         struct btrfs_key key;
    3003             : 
    3004          89 :         lockdep_assert_held(&fs_info->chunk_mutex);
    3005          89 :         array_size = btrfs_super_sys_array_size(super_copy);
    3006             : 
    3007          89 :         ptr = super_copy->sys_chunk_array;
    3008          89 :         cur = 0;
    3009             : 
    3010         267 :         while (cur < array_size) {
    3011         178 :                 disk_key = (struct btrfs_disk_key *)ptr;
    3012         178 :                 btrfs_disk_key_to_cpu(&key, disk_key);
    3013             : 
    3014         178 :                 len = sizeof(*disk_key);
    3015             : 
    3016         178 :                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
    3017         178 :                         chunk = (struct btrfs_chunk *)(ptr + len);
    3018         178 :                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
    3019         178 :                         len += btrfs_chunk_item_size(num_stripes);
    3020             :                 } else {
    3021             :                         ret = -EIO;
    3022             :                         break;
    3023             :                 }
    3024         178 :                 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
    3025         178 :                     key.offset == chunk_offset) {
    3026         178 :                         memmove(ptr, ptr + len, array_size - (cur + len));
    3027          89 :                         array_size -= len;
    3028          89 :                         btrfs_set_super_sys_array_size(super_copy, array_size);
    3029             :                 } else {
    3030          89 :                         ptr += len;
    3031          89 :                         cur += len;
    3032             :                 }
    3033             :         }
    3034          89 :         return ret;
    3035             : }
    3036             : 
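/*
 * Editor's note: the sys_chunk_array scanned above is a packed sequence of
 * (struct btrfs_disk_key, struct btrfs_chunk including its stripes) pairs,
 * which is why each iteration advances by sizeof(*disk_key) +
 * btrfs_chunk_item_size(num_stripes) and why the matching entry is removed
 * with a single memmove() over the remaining bytes of the array.
 */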
    3037             : /*
    3038             :  * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
    3039             :  * @logical: Logical block offset in bytes.
    3040             :  * @length: Length of extent in bytes.
    3041             :  *
    3042             :  * Return: Chunk mapping or ERR_PTR.
    3043             :  */
    3044    29778396 : struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
    3045             :                                        u64 logical, u64 length)
    3046             : {
    3047    29778396 :         struct extent_map_tree *em_tree;
    3048    29778396 :         struct extent_map *em;
    3049             : 
    3050    29778396 :         em_tree = &fs_info->mapping_tree;
    3051    29778396 :         read_lock(&em_tree->lock);
    3052    29780654 :         em = lookup_extent_mapping(em_tree, logical, length);
    3053    29780538 :         read_unlock(&em_tree->lock);
    3054             : 
    3055    29780209 :         if (!em) {
    3056           0 :                 btrfs_crit(fs_info, "unable to find logical %llu length %llu",
    3057             :                            logical, length);
    3058           0 :                 return ERR_PTR(-EINVAL);
    3059             :         }
    3060             : 
    3061    29780209 :         if (em->start > logical || em->start + em->len < logical) {
    3062           0 :                 btrfs_crit(fs_info,
    3063             :                            "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
     3064             :                            logical, logical + length, em->start, em->start + em->len);
    3065           0 :                 free_extent_map(em);
    3066           0 :                 return ERR_PTR(-EINVAL);
    3067             :         }
    3068             : 
    3069             :         /* callers are responsible for dropping em's ref. */
    3070             :         return em;
    3071             : }
    3072             : 
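/*
 * Editor's note: a minimal sketch of the expected call pattern for
 * btrfs_get_chunk_map(); the helper name sketch_chunk_num_stripes() is
 * hypothetical and only illustrates that the caller owns one reference and
 * must drop it with free_extent_map(), as btrfs_remove_chunk() below does.
 */
static int sketch_chunk_num_stripes(struct btrfs_fs_info *fs_info, u64 logical)
{
        struct extent_map *em;
        int num_stripes;

        em = btrfs_get_chunk_map(fs_info, logical, 1);
        if (IS_ERR(em))
                return PTR_ERR(em);

        num_stripes = em->map_lookup->num_stripes;

        /* Drop the reference the lookup took for us. */
        free_extent_map(em);
        return num_stripes;
}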
    3073         532 : static int remove_chunk_item(struct btrfs_trans_handle *trans,
    3074             :                              struct map_lookup *map, u64 chunk_offset)
    3075             : {
    3076         532 :         int i;
    3077             : 
    3078             :         /*
    3079             :          * Removing chunk items and updating the device items in the chunks btree
    3080             :          * requires holding the chunk_mutex.
    3081             :          * See the comment at btrfs_chunk_alloc() for the details.
    3082             :          */
    3083         532 :         lockdep_assert_held(&trans->fs_info->chunk_mutex);
    3084             : 
    3085        1349 :         for (i = 0; i < map->num_stripes; i++) {
    3086         817 :                 int ret;
    3087             : 
    3088         817 :                 ret = btrfs_update_device(trans, map->stripes[i].dev);
    3089         817 :                 if (ret)
    3090           0 :                         return ret;
    3091             :         }
    3092             : 
    3093         532 :         return btrfs_free_chunk(trans, chunk_offset);
    3094             : }
    3095             : 
    3096         532 : int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
    3097             : {
    3098         532 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    3099         532 :         struct extent_map *em;
    3100         532 :         struct map_lookup *map;
    3101         532 :         u64 dev_extent_len = 0;
    3102         532 :         int i, ret = 0;
    3103         532 :         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
    3104             : 
    3105         532 :         em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
    3106         532 :         if (IS_ERR(em)) {
    3107             :                 /*
    3108             :                  * This is a logic error, but we don't want to just rely on the
    3109             :                  * user having built with ASSERT enabled, so if ASSERT doesn't
    3110             :                  * do anything we still error out.
    3111             :                  */
    3112           0 :                 ASSERT(0);
    3113           0 :                 return PTR_ERR(em);
    3114             :         }
    3115         532 :         map = em->map_lookup;
    3116             : 
    3117             :         /*
    3118             :          * First delete the device extent items from the devices btree.
    3119             :          * We take the device_list_mutex to avoid racing with the finishing phase
    3120             :          * of a device replace operation. See the comment below before acquiring
    3121             :          * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex
    3122             :          * because that can result in a deadlock when deleting the device extent
    3123             :          * items from the devices btree - COWing an extent buffer from the btree
    3124             :          * may result in allocating a new metadata chunk, which would attempt to
    3125             :          * lock again fs_info->chunk_mutex.
    3126             :          */
    3127         532 :         mutex_lock(&fs_devices->device_list_mutex);
    3128        1881 :         for (i = 0; i < map->num_stripes; i++) {
    3129         817 :                 struct btrfs_device *device = map->stripes[i].dev;
    3130         817 :                 ret = btrfs_free_dev_extent(trans, device,
    3131             :                                             map->stripes[i].physical,
    3132             :                                             &dev_extent_len);
    3133         817 :                 if (ret) {
    3134           0 :                         mutex_unlock(&fs_devices->device_list_mutex);
    3135           0 :                         btrfs_abort_transaction(trans, ret);
    3136           0 :                         goto out;
    3137             :                 }
    3138             : 
    3139         817 :                 if (device->bytes_used > 0) {
    3140         817 :                         mutex_lock(&fs_info->chunk_mutex);
    3141         817 :                         btrfs_device_set_bytes_used(device,
    3142         817 :                                         device->bytes_used - dev_extent_len);
    3143         817 :                         atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
    3144         817 :                         btrfs_clear_space_info_full(fs_info);
    3145         817 :                         mutex_unlock(&fs_info->chunk_mutex);
    3146             :                 }
    3147             :         }
    3148         532 :         mutex_unlock(&fs_devices->device_list_mutex);
    3149             : 
    3150             :         /*
    3151             :          * We acquire fs_info->chunk_mutex for 2 reasons:
    3152             :          *
    3153             :          * 1) Just like with the first phase of the chunk allocation, we must
    3154             :          *    reserve system space, do all chunk btree updates and deletions, and
    3155             :          *    update the system chunk array in the superblock while holding this
     3156             :          *    mutex. This is for similar reasons as explained in the comment at
    3157             :          *    the top of btrfs_chunk_alloc();
    3158             :          *
    3159             :          * 2) Prevent races with the final phase of a device replace operation
    3160             :          *    that replaces the device object associated with the map's stripes,
    3161             :          *    because the device object's id can change at any time during that
    3162             :          *    final phase of the device replace operation
    3163             :          *    (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
    3164             :          *    replaced device and then see it with an ID of
    3165             :          *    BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating
     3166             :          *    the device item, which does not exist in the chunk btree.
    3167             :          *    The finishing phase of device replace acquires both the
    3168             :          *    device_list_mutex and the chunk_mutex, in that order, so we are
    3169             :          *    safe by just acquiring the chunk_mutex.
    3170             :          */
    3171         532 :         trans->removing_chunk = true;
    3172         532 :         mutex_lock(&fs_info->chunk_mutex);
    3173             : 
    3174         532 :         check_system_chunk(trans, map->type);
    3175             : 
    3176         532 :         ret = remove_chunk_item(trans, map, chunk_offset);
    3177             :         /*
    3178             :          * Normally we should not get -ENOSPC since we reserved space before
    3179             :          * through the call to check_system_chunk().
    3180             :          *
    3181             :          * Despite our system space_info having enough free space, we may not
    3182             :          * be able to allocate extents from its block groups, because all have
    3183             :          * an incompatible profile, which will force us to allocate a new system
    3184             :          * block group with the right profile, or right after we called
     3185             :          * check_system_chunk() above, a scrub turned the only system block group
     3186             :          * with enough free space into RO mode.
     3187             :          * This is explained in more detail at do_chunk_alloc().
    3188             :          *
    3189             :          * So if we get -ENOSPC, allocate a new system chunk and retry once.
    3190             :          */
    3191         532 :         if (ret == -ENOSPC) {
    3192           0 :                 const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
    3193           0 :                 struct btrfs_block_group *sys_bg;
    3194             : 
    3195           0 :                 sys_bg = btrfs_create_chunk(trans, sys_flags);
    3196           0 :                 if (IS_ERR(sys_bg)) {
    3197           0 :                         ret = PTR_ERR(sys_bg);
    3198           0 :                         btrfs_abort_transaction(trans, ret);
    3199           0 :                         goto out;
    3200             :                 }
    3201             : 
    3202           0 :                 ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
    3203           0 :                 if (ret) {
    3204           0 :                         btrfs_abort_transaction(trans, ret);
    3205           0 :                         goto out;
    3206             :                 }
    3207             : 
    3208           0 :                 ret = remove_chunk_item(trans, map, chunk_offset);
    3209           0 :                 if (ret) {
    3210           0 :                         btrfs_abort_transaction(trans, ret);
    3211           0 :                         goto out;
    3212             :                 }
    3213         532 :         } else if (ret) {
    3214           0 :                 btrfs_abort_transaction(trans, ret);
    3215           0 :                 goto out;
    3216             :         }
    3217             : 
    3218         532 :         trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
    3219             : 
    3220         532 :         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
    3221          89 :                 ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
    3222          89 :                 if (ret) {
    3223           0 :                         btrfs_abort_transaction(trans, ret);
    3224           0 :                         goto out;
    3225             :                 }
    3226             :         }
    3227             : 
    3228         532 :         mutex_unlock(&fs_info->chunk_mutex);
    3229         532 :         trans->removing_chunk = false;
    3230             : 
    3231             :         /*
    3232             :          * We are done with chunk btree updates and deletions, so release the
    3233             :          * system space we previously reserved (with check_system_chunk()).
    3234             :          */
    3235         532 :         btrfs_trans_release_chunk_metadata(trans);
    3236             : 
    3237         532 :         ret = btrfs_remove_block_group(trans, chunk_offset, em);
    3238         532 :         if (ret) {
    3239           0 :                 btrfs_abort_transaction(trans, ret);
    3240           0 :                 goto out;
    3241             :         }
    3242             : 
    3243         532 : out:
    3244         532 :         if (trans->removing_chunk) {
    3245           0 :                 mutex_unlock(&fs_info->chunk_mutex);
    3246           0 :                 trans->removing_chunk = false;
    3247             :         }
    3248             :         /* once for us */
    3249         532 :         free_extent_map(em);
    3250         532 :         return ret;
    3251             : }
    3252             : 
    3253         523 : int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
    3254             : {
    3255         523 :         struct btrfs_root *root = fs_info->chunk_root;
    3256         523 :         struct btrfs_trans_handle *trans;
    3257         523 :         struct btrfs_block_group *block_group;
    3258         523 :         u64 length;
    3259         523 :         int ret;
    3260             : 
    3261         523 :         if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
    3262           0 :                 btrfs_err(fs_info,
    3263             :                           "relocate: not supported on extent tree v2 yet");
    3264           0 :                 return -EINVAL;
    3265             :         }
    3266             : 
    3267             :         /*
    3268             :          * Prevent races with automatic removal of unused block groups.
    3269             :          * After we relocate and before we remove the chunk with offset
    3270             :          * chunk_offset, automatic removal of the block group can kick in,
    3271             :          * resulting in a failure when calling btrfs_remove_chunk() below.
    3272             :          *
    3273             :          * Make sure to acquire this mutex before doing a tree search (dev
    3274             :          * or chunk trees) to find chunks. Otherwise the cleaner kthread might
    3275             :          * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
    3276             :          * we release the path used to search the chunk/dev tree and before
    3277             :          * the current task acquires this mutex and calls us.
    3278             :          */
    3279         523 :         lockdep_assert_held(&fs_info->reclaim_bgs_lock);
    3280             : 
    3281             :         /* step one, relocate all the extents inside this chunk */
    3282         523 :         btrfs_scrub_pause(fs_info);
    3283         523 :         ret = btrfs_relocate_block_group(fs_info, chunk_offset);
    3284         523 :         btrfs_scrub_continue(fs_info);
    3285         523 :         if (ret) {
    3286             :                 /*
    3287             :                  * If we had a transaction abort, stop all running scrubs.
    3288             :                  * See transaction.c:cleanup_transaction() why we do it here.
    3289             :                  */
    3290           3 :                 if (BTRFS_FS_ERROR(fs_info))
    3291           0 :                         btrfs_scrub_cancel(fs_info);
    3292           3 :                 return ret;
    3293             :         }
    3294             : 
    3295         520 :         block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
    3296         520 :         if (!block_group)
    3297             :                 return -ENOENT;
    3298         520 :         btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
    3299         520 :         length = block_group->length;
    3300         520 :         btrfs_put_block_group(block_group);
    3301             : 
    3302             :         /*
     3303             :          * On a zoned file system, discard the whole block group, as this will
    3304             :          * trigger a REQ_OP_ZONE_RESET operation on the device zone. If
    3305             :          * resetting the zone fails, don't treat it as a fatal problem from the
    3306             :          * filesystem's point of view.
    3307             :          */
    3308         520 :         if (btrfs_is_zoned(fs_info)) {
    3309           0 :                 ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL);
    3310           0 :                 if (ret)
    3311           0 :                         btrfs_info(fs_info,
    3312             :                                 "failed to reset zone %llu after relocation",
    3313             :                                 chunk_offset);
    3314             :         }
    3315             : 
    3316         520 :         trans = btrfs_start_trans_remove_block_group(root->fs_info,
    3317             :                                                      chunk_offset);
    3318         520 :         if (IS_ERR(trans)) {
    3319           0 :                 ret = PTR_ERR(trans);
    3320           0 :                 btrfs_handle_fs_error(root->fs_info, ret, NULL);
    3321           0 :                 return ret;
    3322             :         }
    3323             : 
    3324             :         /*
    3325             :          * step two, delete the device extents and the
    3326             :          * chunk tree entries
    3327             :          */
    3328         520 :         ret = btrfs_remove_chunk(trans, chunk_offset);
    3329         520 :         btrfs_end_transaction(trans);
    3330         520 :         return ret;
    3331             : }
    3332             : 
    3333           0 : static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
    3334             : {
    3335           0 :         struct btrfs_root *chunk_root = fs_info->chunk_root;
    3336           0 :         struct btrfs_path *path;
    3337           0 :         struct extent_buffer *leaf;
    3338           0 :         struct btrfs_chunk *chunk;
    3339           0 :         struct btrfs_key key;
    3340           0 :         struct btrfs_key found_key;
    3341           0 :         u64 chunk_type;
    3342           0 :         bool retried = false;
    3343           0 :         int failed = 0;
    3344           0 :         int ret;
    3345             : 
    3346           0 :         path = btrfs_alloc_path();
    3347           0 :         if (!path)
    3348             :                 return -ENOMEM;
    3349             : 
    3350           0 : again:
    3351           0 :         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
    3352           0 :         key.offset = (u64)-1;
    3353           0 :         key.type = BTRFS_CHUNK_ITEM_KEY;
    3354             : 
    3355           0 :         while (1) {
    3356           0 :                 mutex_lock(&fs_info->reclaim_bgs_lock);
    3357           0 :                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
    3358           0 :                 if (ret < 0) {
    3359           0 :                         mutex_unlock(&fs_info->reclaim_bgs_lock);
    3360           0 :                         goto error;
    3361             :                 }
    3362           0 :                 BUG_ON(ret == 0); /* Corruption */
    3363             : 
    3364           0 :                 ret = btrfs_previous_item(chunk_root, path, key.objectid,
    3365           0 :                                           key.type);
    3366           0 :                 if (ret)
    3367           0 :                         mutex_unlock(&fs_info->reclaim_bgs_lock);
    3368           0 :                 if (ret < 0)
    3369           0 :                         goto error;
    3370           0 :                 if (ret > 0)
    3371             :                         break;
    3372             : 
    3373           0 :                 leaf = path->nodes[0];
    3374           0 :                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
    3375             : 
    3376           0 :                 chunk = btrfs_item_ptr(leaf, path->slots[0],
    3377             :                                        struct btrfs_chunk);
    3378           0 :                 chunk_type = btrfs_chunk_type(leaf, chunk);
    3379           0 :                 btrfs_release_path(path);
    3380             : 
    3381           0 :                 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
    3382           0 :                         ret = btrfs_relocate_chunk(fs_info, found_key.offset);
    3383           0 :                         if (ret == -ENOSPC)
    3384           0 :                                 failed++;
    3385             :                         else
    3386           0 :                                 BUG_ON(ret);
    3387             :                 }
    3388           0 :                 mutex_unlock(&fs_info->reclaim_bgs_lock);
    3389             : 
    3390           0 :                 if (found_key.offset == 0)
    3391             :                         break;
    3392           0 :                 key.offset = found_key.offset - 1;
    3393             :         }
    3394           0 :         ret = 0;
    3395           0 :         if (failed && !retried) {
    3396           0 :                 failed = 0;
    3397           0 :                 retried = true;
    3398           0 :                 goto again;
    3399           0 :         } else if (WARN_ON(failed && retried)) {
    3400           0 :                 ret = -ENOSPC;
    3401             :         }
    3402           0 : error:
    3403           0 :         btrfs_free_path(path);
    3404           0 :         return ret;
    3405             : }
    3406             : 
    3407             : /*
     3408             :  * return 1 : allocated a data chunk successfully,
     3409             :  * return <0: error while allocating a data chunk,
    3410             :  * return 0 : no need to allocate a data chunk.
    3411             :  */
    3412         503 : static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
    3413             :                                       u64 chunk_offset)
    3414             : {
    3415         503 :         struct btrfs_block_group *cache;
    3416         503 :         u64 bytes_used;
    3417         503 :         u64 chunk_type;
    3418             : 
    3419         503 :         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
    3420         503 :         ASSERT(cache);
    3421         503 :         chunk_type = cache->flags;
    3422         503 :         btrfs_put_block_group(cache);
    3423             : 
    3424         503 :         if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA))
    3425             :                 return 0;
    3426             : 
    3427         235 :         spin_lock(&fs_info->data_sinfo->lock);
    3428         235 :         bytes_used = fs_info->data_sinfo->bytes_used;
    3429         235 :         spin_unlock(&fs_info->data_sinfo->lock);
    3430             : 
    3431         235 :         if (!bytes_used) {
    3432          25 :                 struct btrfs_trans_handle *trans;
    3433          25 :                 int ret;
    3434             : 
    3435          25 :                 trans = btrfs_join_transaction(fs_info->tree_root);
    3436          25 :                 if (IS_ERR(trans))
    3437           0 :                         return PTR_ERR(trans);
    3438             : 
    3439          25 :                 ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
    3440          25 :                 btrfs_end_transaction(trans);
    3441          25 :                 if (ret < 0)
    3442             :                         return ret;
    3443          25 :                 return 1;
    3444             :         }
    3445             : 
    3446             :         return 0;
    3447             : }
    3448             : 
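/*
 * Editor's note: a hedged sketch of how a caller is expected to consume the
 * 1 / 0 / <0 contract documented above before relocating a data chunk; the
 * surrounding helper is hypothetical and only the return-code handling is
 * the point.
 */
static int sketch_prepare_relocation(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
        int ret;

        ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
        if (ret < 0)
                return ret;     /* allocation attempt failed */
        /* ret == 1: a fresh data chunk was allocated; ret == 0: nothing to do. */
        return 0;
}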
    3449         195 : static int insert_balance_item(struct btrfs_fs_info *fs_info,
    3450             :                                struct btrfs_balance_control *bctl)
    3451             : {
    3452         195 :         struct btrfs_root *root = fs_info->tree_root;
    3453         195 :         struct btrfs_trans_handle *trans;
    3454         195 :         struct btrfs_balance_item *item;
    3455         195 :         struct btrfs_disk_balance_args disk_bargs;
    3456         195 :         struct btrfs_path *path;
    3457         195 :         struct extent_buffer *leaf;
    3458         195 :         struct btrfs_key key;
    3459         195 :         int ret, err;
    3460             : 
    3461         195 :         path = btrfs_alloc_path();
    3462         195 :         if (!path)
    3463             :                 return -ENOMEM;
    3464             : 
    3465         195 :         trans = btrfs_start_transaction(root, 0);
    3466         195 :         if (IS_ERR(trans)) {
    3467           0 :                 btrfs_free_path(path);
    3468           0 :                 return PTR_ERR(trans);
    3469             :         }
    3470             : 
    3471         195 :         key.objectid = BTRFS_BALANCE_OBJECTID;
    3472         195 :         key.type = BTRFS_TEMPORARY_ITEM_KEY;
    3473         195 :         key.offset = 0;
    3474             : 
    3475         195 :         ret = btrfs_insert_empty_item(trans, root, path, &key,
    3476             :                                       sizeof(*item));
    3477         195 :         if (ret)
    3478           0 :                 goto out;
    3479             : 
    3480         195 :         leaf = path->nodes[0];
    3481         195 :         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
    3482             : 
    3483         195 :         memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
    3484             : 
    3485         195 :         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
    3486         195 :         btrfs_set_balance_data(leaf, item, &disk_bargs);
    3487         195 :         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
    3488         195 :         btrfs_set_balance_meta(leaf, item, &disk_bargs);
    3489         195 :         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
    3490         195 :         btrfs_set_balance_sys(leaf, item, &disk_bargs);
    3491             : 
    3492         195 :         btrfs_set_balance_flags(leaf, item, bctl->flags);
    3493             : 
    3494         195 :         btrfs_mark_buffer_dirty(leaf);
    3495         195 : out:
    3496         195 :         btrfs_free_path(path);
    3497         195 :         err = btrfs_commit_transaction(trans);
    3498         195 :         if (err && !ret)
    3499           0 :                 ret = err;
    3500             :         return ret;
    3501             : }
    3502             : 
    3503         195 : static int del_balance_item(struct btrfs_fs_info *fs_info)
    3504             : {
    3505         195 :         struct btrfs_root *root = fs_info->tree_root;
    3506         195 :         struct btrfs_trans_handle *trans;
    3507         195 :         struct btrfs_path *path;
    3508         195 :         struct btrfs_key key;
    3509         195 :         int ret, err;
    3510             : 
    3511         195 :         path = btrfs_alloc_path();
    3512         195 :         if (!path)
    3513             :                 return -ENOMEM;
    3514             : 
    3515         195 :         trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
    3516         195 :         if (IS_ERR(trans)) {
    3517           0 :                 btrfs_free_path(path);
    3518           0 :                 return PTR_ERR(trans);
    3519             :         }
    3520             : 
    3521         195 :         key.objectid = BTRFS_BALANCE_OBJECTID;
    3522         195 :         key.type = BTRFS_TEMPORARY_ITEM_KEY;
    3523         195 :         key.offset = 0;
    3524             : 
    3525         195 :         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
    3526         195 :         if (ret < 0)
    3527           0 :                 goto out;
    3528         195 :         if (ret > 0) {
    3529           0 :                 ret = -ENOENT;
    3530           0 :                 goto out;
    3531             :         }
    3532             : 
    3533         195 :         ret = btrfs_del_item(trans, root, path);
    3534         195 : out:
    3535         195 :         btrfs_free_path(path);
    3536         195 :         err = btrfs_commit_transaction(trans);
    3537         195 :         if (err && !ret)
    3538           0 :                 ret = err;
    3539             :         return ret;
    3540             : }
    3541             : 
    3542             : /*
    3543             :  * This is a heuristic used to reduce the number of chunks balanced on
    3544             :  * resume after balance was interrupted.
    3545             :  */
    3546           0 : static void update_balance_args(struct btrfs_balance_control *bctl)
    3547             : {
    3548             :         /*
    3549             :          * Turn on soft mode for chunk types that were being converted.
    3550             :          */
    3551           0 :         if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
    3552           0 :                 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
    3553           0 :         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
    3554           0 :                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
    3555           0 :         if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
    3556           0 :                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
    3557             : 
    3558             :         /*
     3559             :          * Turn on the usage filter if it is not already in use.  The idea is
    3560             :          * that chunks that we have already balanced should be
    3561             :          * reasonably full.  Don't do it for chunks that are being
    3562             :          * converted - that will keep us from relocating unconverted
    3563             :          * (albeit full) chunks.
    3564             :          */
    3565           0 :         if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
    3566           0 :             !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
    3567             :             !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
    3568           0 :                 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
    3569           0 :                 bctl->data.usage = 90;
    3570             :         }
    3571           0 :         if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
    3572           0 :             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
    3573             :             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
    3574           0 :                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
    3575           0 :                 bctl->sys.usage = 90;
    3576             :         }
    3577           0 :         if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
    3578           0 :             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
    3579             :             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
    3580           0 :                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
    3581           0 :                 bctl->meta.usage = 90;
    3582             :         }
    3583           0 : }
    3584             : 
    3585             : /*
    3586             :  * Clear the balance status in fs_info and delete the balance item from disk.
    3587             :  */
    3588         195 : static void reset_balance_state(struct btrfs_fs_info *fs_info)
    3589             : {
    3590         195 :         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
    3591         195 :         int ret;
    3592             : 
    3593         195 :         BUG_ON(!fs_info->balance_ctl);
    3594             : 
    3595         195 :         spin_lock(&fs_info->balance_lock);
    3596         195 :         fs_info->balance_ctl = NULL;
    3597         195 :         spin_unlock(&fs_info->balance_lock);
    3598             : 
    3599         195 :         kfree(bctl);
    3600         195 :         ret = del_balance_item(fs_info);
    3601         195 :         if (ret)
    3602           0 :                 btrfs_handle_fs_error(fs_info, ret, NULL);
    3603         195 : }
    3604             : 
    3605             : /*
    3606             :  * Balance filters.  Return 1 if chunk should be filtered out
    3607             :  * (should not be balanced).
    3608             :  */
    3609             : static int chunk_profiles_filter(u64 chunk_type,
    3610             :                                  struct btrfs_balance_args *bargs)
    3611             : {
    3612           0 :         chunk_type = chunk_to_extended(chunk_type) &
    3613             :                                 BTRFS_EXTENDED_PROFILE_MASK;
    3614             : 
    3615           0 :         if (bargs->profiles & chunk_type)
    3616             :                 return 0;
    3617             : 
    3618             :         return 1;
    3619             : }
    3620             : 
    3621           0 : static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
    3622             :                               struct btrfs_balance_args *bargs)
    3623             : {
    3624           0 :         struct btrfs_block_group *cache;
    3625           0 :         u64 chunk_used;
    3626           0 :         u64 user_thresh_min;
    3627           0 :         u64 user_thresh_max;
    3628           0 :         int ret = 1;
    3629             : 
    3630           0 :         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
    3631           0 :         chunk_used = cache->used;
    3632             : 
    3633           0 :         if (bargs->usage_min == 0)
    3634             :                 user_thresh_min = 0;
    3635             :         else
    3636           0 :                 user_thresh_min = mult_perc(cache->length, bargs->usage_min);
    3637             : 
    3638           0 :         if (bargs->usage_max == 0)
    3639             :                 user_thresh_max = 1;
    3640           0 :         else if (bargs->usage_max > 100)
    3641           0 :                 user_thresh_max = cache->length;
    3642             :         else
    3643           0 :                 user_thresh_max = mult_perc(cache->length, bargs->usage_max);
    3644             : 
    3645           0 :         if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
    3646           0 :                 ret = 0;
    3647             : 
    3648           0 :         btrfs_put_block_group(cache);
    3649           0 :         return ret;
    3650             : }
    3651             : 
    3652           2 : static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
    3653             :                 u64 chunk_offset, struct btrfs_balance_args *bargs)
    3654             : {
    3655           2 :         struct btrfs_block_group *cache;
    3656           2 :         u64 chunk_used, user_thresh;
    3657           2 :         int ret = 1;
    3658             : 
    3659           2 :         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
    3660           2 :         chunk_used = cache->used;
    3661             : 
    3662           2 :         if (bargs->usage_min == 0)
    3663             :                 user_thresh = 1;
    3664           0 :         else if (bargs->usage > 100)
    3665           0 :                 user_thresh = cache->length;
    3666             :         else
    3667           0 :                 user_thresh = mult_perc(cache->length, bargs->usage);
    3668             : 
    3669           2 :         if (chunk_used < user_thresh)
    3670           2 :                 ret = 0;
    3671             : 
    3672           2 :         btrfs_put_block_group(cache);
    3673           2 :         return ret;
    3674             : }
    3675             : 
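/*
 * Editor's note: a worked example of the threshold math above, with
 * illustrative numbers.  For a 1 GiB block group and bargs->usage = 50,
 * user_thresh = mult_perc(1 GiB, 50) = 512 MiB, so the chunk is balanced
 * (the filter returns 0) only while its used bytes stay below 512 MiB;
 * with bargs->usage_min == 0 the threshold collapses to 1 byte and only a
 * completely empty chunk passes.
 */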
    3676           0 : static int chunk_devid_filter(struct extent_buffer *leaf,
    3677             :                               struct btrfs_chunk *chunk,
    3678             :                               struct btrfs_balance_args *bargs)
    3679             : {
    3680           0 :         struct btrfs_stripe *stripe;
    3681           0 :         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
    3682           0 :         int i;
    3683             : 
    3684           0 :         for (i = 0; i < num_stripes; i++) {
    3685           0 :                 stripe = btrfs_stripe_nr(chunk, i);
    3686           0 :                 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
    3687             :                         return 0;
    3688             :         }
    3689             : 
    3690             :         return 1;
    3691             : }
    3692             : 
    3693       57024 : static u64 calc_data_stripes(u64 type, int num_stripes)
    3694             : {
    3695       57024 :         const int index = btrfs_bg_flags_to_raid_index(type);
    3696       57024 :         const int ncopies = btrfs_raid_array[index].ncopies;
    3697       57024 :         const int nparity = btrfs_raid_array[index].nparity;
    3698             : 
    3699       57024 :         return (num_stripes - nparity) / ncopies;
    3700             : }
    3701             : 
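/*
 * Editor's note: worked examples of the data-stripe calculation above, using
 * the standard btrfs profile parameters.  A 6-stripe RAID6 chunk has
 * nparity = 2 and ncopies = 1, so (6 - 2) / 1 = 4 data stripes; a 2-stripe
 * RAID1 chunk has nparity = 0 and ncopies = 2, so (2 - 0) / 2 = 1.
 */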
    3702             : /* [pstart, pend) */
    3703           0 : static int chunk_drange_filter(struct extent_buffer *leaf,
    3704             :                                struct btrfs_chunk *chunk,
    3705             :                                struct btrfs_balance_args *bargs)
    3706             : {
    3707           0 :         struct btrfs_stripe *stripe;
    3708           0 :         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
    3709           0 :         u64 stripe_offset;
    3710           0 :         u64 stripe_length;
    3711           0 :         u64 type;
    3712           0 :         int factor;
    3713           0 :         int i;
    3714             : 
    3715           0 :         if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
    3716             :                 return 0;
    3717             : 
    3718           0 :         type = btrfs_chunk_type(leaf, chunk);
    3719           0 :         factor = calc_data_stripes(type, num_stripes);
    3720             : 
    3721           0 :         for (i = 0; i < num_stripes; i++) {
    3722           0 :                 stripe = btrfs_stripe_nr(chunk, i);
    3723           0 :                 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
    3724           0 :                         continue;
    3725             : 
    3726           0 :                 stripe_offset = btrfs_stripe_offset(leaf, stripe);
    3727           0 :                 stripe_length = btrfs_chunk_length(leaf, chunk);
    3728           0 :                 stripe_length = div_u64(stripe_length, factor);
    3729             : 
    3730           0 :                 if (stripe_offset < bargs->pend &&
    3731           0 :                     stripe_offset + stripe_length > bargs->pstart)
    3732             :                         return 0;
    3733             :         }
    3734             : 
    3735             :         return 1;
    3736             : }
    3737             : 
    3738             : /* [vstart, vend) */
    3739           0 : static int chunk_vrange_filter(struct extent_buffer *leaf,
    3740             :                                struct btrfs_chunk *chunk,
    3741             :                                u64 chunk_offset,
    3742             :                                struct btrfs_balance_args *bargs)
    3743             : {
    3744           0 :         if (chunk_offset < bargs->vend &&
    3745           0 :             chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
    3746             :                 /* at least part of the chunk is inside this vrange */
    3747           0 :                 return 0;
    3748             : 
    3749             :         return 1;
    3750             : }
    3751             : 
    3752           0 : static int chunk_stripes_range_filter(struct extent_buffer *leaf,
    3753             :                                struct btrfs_chunk *chunk,
    3754             :                                struct btrfs_balance_args *bargs)
    3755             : {
    3756           0 :         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
    3757             : 
    3758           0 :         if (bargs->stripes_min <= num_stripes
    3759           0 :                         && num_stripes <= bargs->stripes_max)
    3760           0 :                 return 0;
    3761             : 
    3762             :         return 1;
    3763             : }
    3764             : 
    3765           0 : static int chunk_soft_convert_filter(u64 chunk_type,
    3766             :                                      struct btrfs_balance_args *bargs)
    3767             : {
    3768           0 :         if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
    3769             :                 return 0;
    3770             : 
    3771           0 :         chunk_type = chunk_to_extended(chunk_type) &
    3772             :                                 BTRFS_EXTENDED_PROFILE_MASK;
    3773             : 
    3774           0 :         if (bargs->target == chunk_type)
    3775           0 :                 return 1;
    3776             : 
    3777             :         return 0;
    3778             : }
    3779             : 
    3780        2153 : static int should_balance_chunk(struct extent_buffer *leaf,
    3781             :                                 struct btrfs_chunk *chunk, u64 chunk_offset)
    3782             : {
    3783        2153 :         struct btrfs_fs_info *fs_info = leaf->fs_info;
    3784        2153 :         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
    3785        2153 :         struct btrfs_balance_args *bargs = NULL;
    3786        2153 :         u64 chunk_type = btrfs_chunk_type(leaf, chunk);
    3787             : 
    3788             :         /* type filter */
    3789        2153 :         if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
    3790        2153 :               (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
    3791             :                 return 0;
    3792             :         }
    3793             : 
    3794        1045 :         if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
    3795         469 :                 bargs = &bctl->data;
    3796         576 :         else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
    3797         176 :                 bargs = &bctl->sys;
    3798         400 :         else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
    3799         400 :                 bargs = &bctl->meta;
    3800             : 
    3801             :         /* profiles filter */
    3802        1045 :         if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
    3803             :             chunk_profiles_filter(chunk_type, bargs)) {
    3804             :                 return 0;
    3805             :         }
    3806             : 
    3807             :         /* usage filter */
    3808        1047 :         if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
    3809           2 :             chunk_usage_filter(fs_info, chunk_offset, bargs)) {
    3810             :                 return 0;
    3811        1045 :         } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
    3812           0 :             chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
    3813             :                 return 0;
    3814             :         }
    3815             : 
    3816             :         /* devid filter */
    3817        1045 :         if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
    3818           0 :             chunk_devid_filter(leaf, chunk, bargs)) {
    3819             :                 return 0;
    3820             :         }
    3821             : 
    3822             :         /* drange filter, makes sense only with devid filter */
    3823        1045 :         if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
    3824           0 :             chunk_drange_filter(leaf, chunk, bargs)) {
    3825             :                 return 0;
    3826             :         }
    3827             : 
    3828             :         /* vrange filter */
    3829        1045 :         if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
    3830           0 :             chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
    3831             :                 return 0;
    3832             :         }
    3833             : 
    3834             :         /* stripes filter */
    3835        1045 :         if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
    3836           0 :             chunk_stripes_range_filter(leaf, chunk, bargs)) {
    3837             :                 return 0;
    3838             :         }
    3839             : 
    3840             :         /* soft profile changing mode */
    3841        1045 :         if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
    3842           0 :             chunk_soft_convert_filter(chunk_type, bargs)) {
    3843             :                 return 0;
    3844             :         }
    3845             : 
    3846             :         /*
    3847             :          * limited by count, must be the last filter
    3848             :          */
    3849        1045 :         if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
    3850           0 :                 if (bargs->limit == 0)
    3851             :                         return 0;
    3852             :                 else
    3853           0 :                         bargs->limit--;
    3854        1045 :         } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
    3855             :                 /*
    3856             :                  * Same logic as the 'limit' filter; the minimum cannot be
    3857             :                  * determined here because we do not have the global information
    3858             :                  * about the count of all chunks that satisfy the filters.
    3859             :                  */
    3860           0 :                 if (bargs->limit_max == 0)
    3861             :                         return 0;
    3862             :                 else
    3863           0 :                         bargs->limit_max--;
    3864             :         }
    3865             : 
    3866             :         return 1;
    3867             : }
    3868             : 
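/*
 * Editor's note: a hedged sketch of the kind of balance control that drives
 * the filter chain above.  The flag and field names are the ones tested in
 * should_balance_chunk(); the helper itself is hypothetical and only shows
 * which knobs the filters read.
 */
static void sketch_setup_usage_balance(struct btrfs_balance_control *bctl)
{
        /* Consider only data block groups (type filter). */
        bctl->flags = BTRFS_BALANCE_DATA;

        /* Of those, relocate only chunks that are less than 50% full. */
        bctl->data.flags = BTRFS_BALANCE_ARGS_USAGE;
        bctl->data.usage = 50;

        /* And stop after at most 8 relocated chunks (limit filter). */
        bctl->data.flags |= BTRFS_BALANCE_ARGS_LIMIT;
        bctl->data.limit = 8;
}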
    3869         195 : static int __btrfs_balance(struct btrfs_fs_info *fs_info)
    3870             : {
    3871         195 :         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
    3872         195 :         struct btrfs_root *chunk_root = fs_info->chunk_root;
    3873         195 :         u64 chunk_type;
    3874         195 :         struct btrfs_chunk *chunk;
    3875         195 :         struct btrfs_path *path = NULL;
    3876         195 :         struct btrfs_key key;
    3877         195 :         struct btrfs_key found_key;
    3878         195 :         struct extent_buffer *leaf;
    3879         195 :         int slot;
    3880         195 :         int ret;
    3881         195 :         int enospc_errors = 0;
    3882         195 :         bool counting = true;
    3883             :         /* The single value limit and min/max limits share the same bytes in the balance args */
    3884         195 :         u64 limit_data = bctl->data.limit;
    3885         195 :         u64 limit_meta = bctl->meta.limit;
    3886         195 :         u64 limit_sys = bctl->sys.limit;
    3887         195 :         u32 count_data = 0;
    3888         195 :         u32 count_meta = 0;
    3889         195 :         u32 count_sys = 0;
    3890         195 :         int chunk_reserved = 0;
    3891             : 
    3892         195 :         path = btrfs_alloc_path();
    3893         195 :         if (!path) {
    3894           0 :                 ret = -ENOMEM;
    3895           0 :                 goto error;
    3896             :         }
    3897             : 
    3898             :         /* zero out stat counters */
    3899         195 :         spin_lock(&fs_info->balance_lock);
    3900         195 :         memset(&bctl->stat, 0, sizeof(bctl->stat));
    3901         195 :         spin_unlock(&fs_info->balance_lock);
    3902         390 : again:
    3903         390 :         if (!counting) {
    3904             :                 /*
    3905             :                  * The single value limit and min/max limits share the same bytes
    3906             :                  * in the balance args, so restore the saved values here.
    3907             :                  */
    3908         195 :                 bctl->data.limit = limit_data;
    3909         195 :                 bctl->meta.limit = limit_meta;
    3910         195 :                 bctl->sys.limit = limit_sys;
    3911             :         }
    3912         390 :         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
    3913         390 :         key.offset = (u64)-1;
    3914         390 :         key.type = BTRFS_CHUNK_ITEM_KEY;
    3915             : 
    3916        4694 :         while (1) {
    3917        2542 :                 if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
    3918             :                     atomic_read(&fs_info->balance_cancel_req)) {
    3919           1 :                         ret = -ECANCELED;
    3920           1 :                         goto error;
    3921             :                 }
    3922             : 
    3923        2541 :                 mutex_lock(&fs_info->reclaim_bgs_lock);
    3924        2541 :                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
    3925        2541 :                 if (ret < 0) {
    3926           0 :                         mutex_unlock(&fs_info->reclaim_bgs_lock);
    3927           0 :                         goto error;
    3928             :                 }
    3929             : 
    3930             :                 /*
    3931             :                  * This shouldn't happen; it means the last relocation
    3932             :                  * failed.
    3933             :                  */
    3934        2541 :                 if (ret == 0)
    3935           0 :                         BUG(); /* FIXME break ? */
    3936             : 
    3937        2541 :                 ret = btrfs_previous_item(chunk_root, path, 0,
    3938             :                                           BTRFS_CHUNK_ITEM_KEY);
    3939        2541 :                 if (ret) {
    3940         388 :                         mutex_unlock(&fs_info->reclaim_bgs_lock);
    3941         388 :                         ret = 0;
    3942         388 :                         break;
    3943             :                 }
    3944             : 
    3945        2153 :                 leaf = path->nodes[0];
    3946        2153 :                 slot = path->slots[0];
    3947        2153 :                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
    3948             : 
    3949        2153 :                 if (found_key.objectid != key.objectid) {
    3950           0 :                         mutex_unlock(&fs_info->reclaim_bgs_lock);
    3951           0 :                         break;
    3952             :                 }
    3953             : 
    3954        2153 :                 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
    3955        2153 :                 chunk_type = btrfs_chunk_type(leaf, chunk);
    3956             : 
    3957        2153 :                 if (!counting) {
    3958        1073 :                         spin_lock(&fs_info->balance_lock);
    3959        1073 :                         bctl->stat.considered++;
    3960        1073 :                         spin_unlock(&fs_info->balance_lock);
    3961             :                 }
    3962             : 
    3963        2153 :                 ret = should_balance_chunk(leaf, chunk, found_key.offset);
    3964             : 
    3965        2153 :                 btrfs_release_path(path);
    3966        2153 :                 if (!ret) {
    3967        1108 :                         mutex_unlock(&fs_info->reclaim_bgs_lock);
    3968        1108 :                         goto loop;
    3969             :                 }
    3970             : 
    3971        1045 :                 if (counting) {
    3972         524 :                         mutex_unlock(&fs_info->reclaim_bgs_lock);
    3973         524 :                         spin_lock(&fs_info->balance_lock);
    3974         524 :                         bctl->stat.expected++;
    3975         524 :                         spin_unlock(&fs_info->balance_lock);
    3976             : 
    3977         524 :                         if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
    3978         236 :                                 count_data++;
    3979         288 :                         else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
    3980          88 :                                 count_sys++;
    3981         200 :                         else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
    3982         200 :                                 count_meta++;
    3983             : 
    3984         524 :                         goto loop;
    3985             :                 }
    3986             : 
    3987             :                 /*
    3988             :                  * Apply the limit_min filter; no need to check whether the
    3989             :                  * LIMITS filter is used, since limit_min is 0 by default.
    3990             :                  */
    3991         521 :                 if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
    3992         233 :                                         count_data < bctl->data.limit_min)
    3993         521 :                                 || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
    3994         200 :                                         count_meta < bctl->meta.limit_min)
    3995         521 :                                 || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
    3996          88 :                                         count_sys < bctl->sys.limit_min)) {
    3997           0 :                         mutex_unlock(&fs_info->reclaim_bgs_lock);
    3998           0 :                         goto loop;
    3999             :                 }
    4000             : 
    4001         521 :                 if (!chunk_reserved) {
    4002             :                         /*
    4003             :                          * We may be relocating the only data chunk we have,
    4004             :                          * which could potentially end up losing the data
    4005             :                          * raid profile, so let's allocate an empty one in
    4006             :                          * advance.
    4007             :                          */
    4008         501 :                         ret = btrfs_may_alloc_data_chunk(fs_info,
    4009             :                                                          found_key.offset);
    4010         501 :                         if (ret < 0) {
    4011           0 :                                 mutex_unlock(&fs_info->reclaim_bgs_lock);
    4012           0 :                                 goto error;
    4013         501 :                         } else if (ret == 1) {
    4014          25 :                                 chunk_reserved = 1;
    4015             :                         }
    4016             :                 }
    4017             : 
    4018         521 :                 ret = btrfs_relocate_chunk(fs_info, found_key.offset);
    4019         521 :                 mutex_unlock(&fs_info->reclaim_bgs_lock);
    4020         521 :                 if (ret == -ENOSPC) {
    4021           0 :                         enospc_errors++;
    4022         521 :                 } else if (ret == -ETXTBSY) {
    4023           1 :                         btrfs_info(fs_info,
    4024             :            "skipping relocation of block group %llu due to active swapfile",
    4025             :                                    found_key.offset);
    4026           1 :                         ret = 0;
    4027         520 :                 } else if (ret) {
    4028           1 :                         goto error;
    4029             :                 } else {
    4030         519 :                         spin_lock(&fs_info->balance_lock);
    4031         519 :                         bctl->stat.completed++;
    4032         519 :                         spin_unlock(&fs_info->balance_lock);
    4033             :                 }
    4034        2152 : loop:
    4035        2152 :                 if (found_key.offset == 0)
    4036             :                         break;
    4037        2152 :                 key.offset = found_key.offset - 1;
    4038             :         }
    4039             : 
    4040         388 :         if (counting) {
    4041         195 :                 btrfs_release_path(path);
    4042         195 :                 counting = false;
    4043         195 :                 goto again;
    4044             :         }
    4045         193 : error:
    4046         195 :         btrfs_free_path(path);
    4047         195 :         if (enospc_errors) {
    4048           0 :                 btrfs_info(fs_info, "%d enospc errors during balance",
    4049             :                            enospc_errors);
    4050           0 :                 if (!ret)
    4051           0 :                         ret = -ENOSPC;
    4052             :         }
    4053             : 
    4054         195 :         return ret;
    4055             : }
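                     :
                     : /*
                     :  * Summary (an editorial addition, not from the kernel source):
                     :  * __btrfs_balance() walks the chunk tree twice.  The first pass
                     :  * (counting == true) only runs the filters and fills bctl->stat.expected
                     :  * and the per-type counters; the second pass restores the saved limit
                     :  * values and relocates the matching chunks, updating stat.considered and
                     :  * stat.completed along the way.
                     :  */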
    4056             : 
    4057             : /*
    4058             :  * See if a given profile is valid and reduced.
    4059             :  *
    4060             :  * @flags:     profile to validate
    4061             :  * @extended:  if true @flags is treated as an extended profile
    4062             :  */
    4063             : static int alloc_profile_is_valid(u64 flags, int extended)
    4064             : {
    4065        1482 :         u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
    4066             :                                BTRFS_BLOCK_GROUP_PROFILE_MASK);
    4067             : 
    4068        1482 :         flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
    4069             : 
    4070             :         /* 1) check that all other bits are zeroed */
    4071        1482 :         if (flags & ~mask)
    4072             :                 return 0;
    4073             : 
    4074             :         /* 2) see if profile is reduced */
    4075        1482 :         if (flags == 0)
    4076             :                 return !extended; /* "0" is valid for usual profiles */
    4077             : 
    4078         315 :         return has_single_bit_set(flags);
    4079             : }
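                     :
                     : /*
                     :  * Worked examples (an editorial addition, not from the kernel source):
                     :  * with extended == 0, BTRFS_BLOCK_GROUP_RAID1 is valid (one profile bit)
                     :  * and 0 is valid (implicit SINGLE), while RAID1 | RAID10 is rejected by
                     :  * has_single_bit_set() because the profile is not reduced.  With
                     :  * extended == 1, 0 is rejected since an extended profile must carry an
                     :  * explicit bit such as BTRFS_AVAIL_ALLOC_BIT_SINGLE.
                     :  */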
    4080             : 
    4081             : /*
    4082             :  * Validate target profile against allowed profiles and return true if it's OK.
    4083             :  * Otherwise print the error message and return false.
    4084             :  */
    4085         585 : static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
    4086             :                 const struct btrfs_balance_args *bargs,
    4087             :                 u64 allowed, const char *type)
    4088             : {
    4089         585 :         if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
    4090             :                 return true;
    4091             : 
    4092             :         /* Profile is valid and does not have bits outside of the allowed set */
    4093           0 :         if (alloc_profile_is_valid(bargs->target, 1) &&
    4094           0 :             (bargs->target & ~allowed) == 0)
    4095             :                 return true;
    4096             : 
    4097           0 :         btrfs_err(fs_info, "balance: invalid convert %s profile %s",
    4098             :                         type, btrfs_bg_type_to_raid_name(bargs->target));
    4099           0 :         return false;
    4100             : }
    4101             : 
    4102             : /*
    4103             :  * Fill @buf with textual description of balance filter flags @bargs, up to
    4104             :  * @size_buf including the terminating null. The output may be trimmed if it
    4105             :  * does not fit into the provided buffer.
    4106             :  */
    4107         312 : static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
    4108             :                                  u32 size_buf)
    4109             : {
    4110         312 :         int ret;
    4111         312 :         u32 size_bp = size_buf;
    4112         312 :         char *bp = buf;
    4113         312 :         u64 flags = bargs->flags;
    4114         312 :         char tmp_buf[128] = {'\0'};
    4115             : 
    4116         312 :         if (!flags)
    4117         311 :                 return;
    4118             : 
    4119             : #define CHECK_APPEND_NOARG(a)                                           \
    4120             :         do {                                                            \
    4121             :                 ret = snprintf(bp, size_bp, (a));                       \
    4122             :                 if (ret < 0 || ret >= size_bp)                            \
    4123             :                         goto out_overflow;                              \
    4124             :                 size_bp -= ret;                                         \
    4125             :                 bp += ret;                                              \
    4126             :         } while (0)
    4127             : 
    4128             : #define CHECK_APPEND_1ARG(a, v1)                                        \
    4129             :         do {                                                            \
    4130             :                 ret = snprintf(bp, size_bp, (a), (v1));                 \
    4131             :                 if (ret < 0 || ret >= size_bp)                            \
    4132             :                         goto out_overflow;                              \
    4133             :                 size_bp -= ret;                                         \
    4134             :                 bp += ret;                                              \
    4135             :         } while (0)
    4136             : 
    4137             : #define CHECK_APPEND_2ARG(a, v1, v2)                                    \
    4138             :         do {                                                            \
    4139             :                 ret = snprintf(bp, size_bp, (a), (v1), (v2));           \
    4140             :                 if (ret < 0 || ret >= size_bp)                            \
    4141             :                         goto out_overflow;                              \
    4142             :                 size_bp -= ret;                                         \
    4143             :                 bp += ret;                                              \
    4144             :         } while (0)
    4145             : 
    4146           1 :         if (flags & BTRFS_BALANCE_ARGS_CONVERT)
    4147           0 :                 CHECK_APPEND_1ARG("convert=%s,",
    4148             :                                   btrfs_bg_type_to_raid_name(bargs->target));
    4149             : 
    4150           1 :         if (flags & BTRFS_BALANCE_ARGS_SOFT)
    4151           0 :                 CHECK_APPEND_NOARG("soft,");
    4152             : 
    4153           1 :         if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
    4154           0 :                 btrfs_describe_block_groups(bargs->profiles, tmp_buf,
    4155             :                                             sizeof(tmp_buf));
    4156           0 :                 CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
    4157             :         }
    4158             : 
    4159           1 :         if (flags & BTRFS_BALANCE_ARGS_USAGE)
    4160           1 :                 CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);
    4161             : 
    4162           1 :         if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
    4163           0 :                 CHECK_APPEND_2ARG("usage=%u..%u,",
    4164             :                                   bargs->usage_min, bargs->usage_max);
    4165             : 
    4166           1 :         if (flags & BTRFS_BALANCE_ARGS_DEVID)
    4167           0 :                 CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);
    4168             : 
    4169           1 :         if (flags & BTRFS_BALANCE_ARGS_DRANGE)
    4170           0 :                 CHECK_APPEND_2ARG("drange=%llu..%llu,",
    4171             :                                   bargs->pstart, bargs->pend);
    4172             : 
    4173           1 :         if (flags & BTRFS_BALANCE_ARGS_VRANGE)
    4174           0 :                 CHECK_APPEND_2ARG("vrange=%llu..%llu,",
    4175             :                                   bargs->vstart, bargs->vend);
    4176             : 
    4177           1 :         if (flags & BTRFS_BALANCE_ARGS_LIMIT)
    4178           0 :                 CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);
    4179             : 
    4180           1 :         if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
    4181           0 :                 CHECK_APPEND_2ARG("limit=%u..%u,",
    4182             :                                 bargs->limit_min, bargs->limit_max);
    4183             : 
    4184           1 :         if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
    4185           0 :                 CHECK_APPEND_2ARG("stripes=%u..%u,",
    4186             :                                   bargs->stripes_min, bargs->stripes_max);
    4187             : 
    4188             : #undef CHECK_APPEND_2ARG
    4189             : #undef CHECK_APPEND_1ARG
    4190             : #undef CHECK_APPEND_NOARG
    4191             : 
    4192           1 : out_overflow:
    4193             : 
    4194           1 :         if (size_bp < size_buf)
    4195           1 :                 buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
    4196             :         else
    4197           0 :                 buf[0] = '\0';
    4198             : }
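                     :
                     : /*
                     :  * Example output and sketch (editorial additions, not from the kernel
                     :  * source): for args with only BTRFS_BALANCE_ARGS_USAGE set and
                     :  * usage == 90, the buffer above ends up as "usage=90" once the trailing
                     :  * comma is stripped.  The CHECK_APPEND_* macros implement a bounded
                     :  * append into a fixed buffer; a plain-function version of the same
                     :  * pattern, assuming a caller-provided buffer and remaining size, could
                     :  * look roughly like the sketch below.
                     :  */
                     : #if 0   /* editorial sketch only, not part of volumes.c */
                     : static bool append_fmt(char **bp, u32 *size_bp, const char *fmt, ...)
                     : {
                     :         va_list args;
                     :         int ret;
                     : 
                     :         va_start(args, fmt);
                     :         ret = vsnprintf(*bp, *size_bp, fmt, args);
                     :         va_end(args);
                     : 
                     :         /* Treat truncation or error as overflow, like the macros do. */
                     :         if (ret < 0 || ret >= *size_bp)
                     :                 return false;
                     : 
                     :         *bp += ret;
                     :         *size_bp -= ret;
                     :         return true;
                     : }
                     : #endif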
    4199             : 
    4200         195 : static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
    4201             : {
    4202         195 :         u32 size_buf = 1024;
    4203         195 :         char tmp_buf[192] = {'\0'};
    4204         195 :         char *buf;
    4205         195 :         char *bp;
    4206         195 :         u32 size_bp = size_buf;
    4207         195 :         int ret;
    4208         195 :         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
    4209             : 
    4210         195 :         buf = kzalloc(size_buf, GFP_KERNEL);
    4211         195 :         if (!buf)
    4212           0 :                 return;
    4213             : 
    4214         195 :         bp = buf;
    4215             : 
    4216             : #define CHECK_APPEND_1ARG(a, v1)                                        \
    4217             :         do {                                                            \
    4218             :                 ret = snprintf(bp, size_bp, (a), (v1));                 \
    4219             :                 if (ret < 0 || ret >= size_bp)                            \
    4220             :                         goto out_overflow;                              \
    4221             :                 size_bp -= ret;                                         \
    4222             :                 bp += ret;                                              \
    4223             :         } while (0)
    4224             : 
    4225         195 :         if (bctl->flags & BTRFS_BALANCE_FORCE)
    4226          56 :                 CHECK_APPEND_1ARG("%s", "-f ");
    4227             : 
    4228         195 :         if (bctl->flags & BTRFS_BALANCE_DATA) {
    4229         136 :                 describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
    4230         136 :                 CHECK_APPEND_1ARG("-d%s ", tmp_buf);
    4231             :         }
    4232             : 
    4233         195 :         if (bctl->flags & BTRFS_BALANCE_METADATA) {
    4234          88 :                 describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
    4235          88 :                 CHECK_APPEND_1ARG("-m%s ", tmp_buf);
    4236             :         }
    4237             : 
    4238         195 :         if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
    4239          88 :                 describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
    4240          88 :                 CHECK_APPEND_1ARG("-s%s ", tmp_buf);
    4241             :         }
    4242             : 
    4243             : #undef CHECK_APPEND_1ARG
    4244             : 
    4245         107 : out_overflow:
    4246             : 
    4247         195 :         if (size_bp < size_buf)
    4248         195 :                 buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
    4249         390 :         btrfs_info(fs_info, "balance: %s %s",
    4250             :                    (bctl->flags & BTRFS_BALANCE_RESUME) ?
    4251             :                    "resume" : "start", buf);
    4252             : 
    4253         195 :         kfree(buf);
    4254             : }
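                     :
                     : /*
                     :  * Example (an editorial addition, not from the kernel source): a forced
                     :  * data balance with a usage filter would be reported roughly as
                     :  * "balance: start -f -dusage=90 -m -s"; which of the -d/-m/-s groups
                     :  * appear depends on the BTRFS_BALANCE_DATA/METADATA/SYSTEM bits in
                     :  * bctl->flags.
                     :  */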
    4255             : 
    4256             : /*
    4257             :  * Should be called with the balance mutex held.
    4258             :  */
    4259         195 : int btrfs_balance(struct btrfs_fs_info *fs_info,
    4260             :                   struct btrfs_balance_control *bctl,
    4261             :                   struct btrfs_ioctl_balance_args *bargs)
    4262             : {
    4263         195 :         u64 meta_target, data_target;
    4264         195 :         u64 allowed;
    4265         195 :         int mixed = 0;
    4266         195 :         int ret;
    4267         195 :         u64 num_devices;
    4268         195 :         unsigned seq;
    4269         195 :         bool reducing_redundancy;
    4270         195 :         bool paused = false;
    4271         195 :         int i;
    4272             : 
    4273         195 :         if (btrfs_fs_closing(fs_info) ||
    4274         195 :             atomic_read(&fs_info->balance_pause_req) ||
    4275         195 :             btrfs_should_cancel_balance(fs_info)) {
    4276           0 :                 ret = -EINVAL;
    4277           0 :                 goto out;
    4278             :         }
    4279             : 
    4280         195 :         allowed = btrfs_super_incompat_flags(fs_info->super_copy);
    4281         195 :         if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
    4282           0 :                 mixed = 1;
    4283             : 
    4284             :         /*
    4285             :          * In case of mixed groups both data and meta should be picked,
    4286             :          * and identical options should be given for both of them.
    4287             :          */
    4288           0 :         allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
    4289           0 :         if (mixed && (bctl->flags & allowed)) {
    4290           0 :                 if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
    4291           0 :                     !(bctl->flags & BTRFS_BALANCE_METADATA) ||
    4292           0 :                     memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
    4293           0 :                         btrfs_err(fs_info,
    4294             :           "balance: mixed groups data and metadata options must be the same");
    4295           0 :                         ret = -EINVAL;
    4296           0 :                         goto out;
    4297             :                 }
    4298             :         }
    4299             : 
    4300             :         /*
    4301             :          * rw_devices will not change at the moment because device
    4302             :          * add/delete/replace are exclusive operations.
    4303             :          */
    4304         195 :         num_devices = fs_info->fs_devices->rw_devices;
    4305             : 
    4306             :         /*
    4307             :          * SINGLE profile on-disk has no profile bit, but in-memory we have a
    4308             :          * special bit for it, to make it easier to distinguish.  Thus we need
    4309             :          * to set it manually, or balance would refuse the profile.
    4310             :          */
    4311         195 :         allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
    4312        1950 :         for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
    4313        1755 :                 if (num_devices >= btrfs_raid_array[i].devs_min)
    4314         585 :                         allowed |= btrfs_raid_array[i].bg_flag;
    4315             : 
    4316         390 :         if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") ||
    4317         390 :             !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") ||
    4318         195 :             !validate_convert_profile(fs_info, &bctl->sys,  allowed, "system")) {
    4319           0 :                 ret = -EINVAL;
    4320           0 :                 goto out;
    4321             :         }
    4322             : 
    4323             :         /*
    4324             :          * Allow reducing metadata or system integrity only if force is set,
    4325             :          * for profiles with redundancy (copies, parity).
    4326             :          */
    4327             :         allowed = 0;
    4328        1950 :         for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
    4329        1755 :                 if (btrfs_raid_array[i].ncopies >= 2 ||
    4330         780 :                     btrfs_raid_array[i].tolerated_failures >= 1)
    4331        1365 :                         allowed |= btrfs_raid_array[i].bg_flag;
    4332             :         }
    4333         195 :         do {
    4334         195 :                 seq = read_seqbegin(&fs_info->profiles_lock);
    4335             : 
    4336         195 :                 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
    4337           0 :                      (fs_info->avail_system_alloc_bits & allowed) &&
    4338           0 :                      !(bctl->sys.target & allowed)) ||
    4339         195 :                     ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
    4340           0 :                      (fs_info->avail_metadata_alloc_bits & allowed) &&
    4341           0 :                      !(bctl->meta.target & allowed)))
    4342             :                         reducing_redundancy = true;
    4343             :                 else
    4344         195 :                         reducing_redundancy = false;
    4345             : 
    4346             :                 /* if we're not converting, the target field is uninitialized */
    4347         195 :                 meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
    4348         195 :                         bctl->meta.target : fs_info->avail_metadata_alloc_bits;
    4349         195 :                 data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
    4350         195 :                         bctl->data.target : fs_info->avail_data_alloc_bits;
    4351         195 :         } while (read_seqretry(&fs_info->profiles_lock, seq));
    4352             : 
    4353         195 :         if (reducing_redundancy) {
    4354           0 :                 if (bctl->flags & BTRFS_BALANCE_FORCE) {
    4355           0 :                         btrfs_info(fs_info,
    4356             :                            "balance: force reducing metadata redundancy");
    4357             :                 } else {
    4358           0 :                         btrfs_err(fs_info,
    4359             :         "balance: reduces metadata redundancy, use --force if you want this");
    4360           0 :                         ret = -EINVAL;
    4361           0 :                         goto out;
    4362             :                 }
    4363             :         }
    4364             : 
    4365         390 :         if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
    4366         195 :                 btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
    4367           0 :                 btrfs_warn(fs_info,
    4368             :         "balance: metadata profile %s has lower redundancy than data profile %s",
    4369             :                                 btrfs_bg_type_to_raid_name(meta_target),
    4370             :                                 btrfs_bg_type_to_raid_name(data_target));
    4371             :         }
    4372             : 
    4373         195 :         ret = insert_balance_item(fs_info, bctl);
    4374         195 :         if (ret && ret != -EEXIST)
    4375           0 :                 goto out;
    4376             : 
    4377         195 :         if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
    4378         195 :                 BUG_ON(ret == -EEXIST);
    4379         195 :                 BUG_ON(fs_info->balance_ctl);
    4380         195 :                 spin_lock(&fs_info->balance_lock);
    4381         195 :                 fs_info->balance_ctl = bctl;
    4382         195 :                 spin_unlock(&fs_info->balance_lock);
    4383             :         } else {
    4384           0 :                 BUG_ON(ret != -EEXIST);
    4385           0 :                 spin_lock(&fs_info->balance_lock);
    4386           0 :                 update_balance_args(bctl);
    4387           0 :                 spin_unlock(&fs_info->balance_lock);
    4388             :         }
    4389             : 
    4390         195 :         ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
    4391         195 :         set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
    4392         195 :         describe_balance_start_or_resume(fs_info);
    4393         195 :         mutex_unlock(&fs_info->balance_mutex);
    4394             : 
    4395         195 :         ret = __btrfs_balance(fs_info);
    4396             : 
    4397         195 :         mutex_lock(&fs_info->balance_mutex);
    4398         195 :         if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) {
    4399           0 :                 btrfs_info(fs_info, "balance: paused");
    4400           0 :                 btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED);
    4401           0 :                 paused = true;
    4402             :         }
    4403             :         /*
    4404             :          * Balance can be canceled by:
    4405             :          *
    4406             :          * - Regular cancel request
    4407             :          *   Then ret == -ECANCELED and balance_cancel_req > 0
    4408             :          *
    4409             :          * - Fatal signal to "btrfs" process
    4410             :          *   Either the signal is caught by wait_reserve_ticket() and the
    4411             :          *   callers get -EINTR, or it is caught by
    4412             :          *   btrfs_should_cancel_balance() and they get -ECANCELED.
    4413             :          *   Either way, in this case balance_cancel_req = 0, and
    4414             :          *   ret == -EINTR or ret == -ECANCELED.
    4415             :          *
    4416             :          * So here we only check the return value to catch canceled balance.
    4417             :          */
    4418         195 :         else if (ret == -ECANCELED || ret == -EINTR)
    4419           2 :                 btrfs_info(fs_info, "balance: canceled");
    4420             :         else
    4421         193 :                 btrfs_info(fs_info, "balance: ended with status: %d", ret);
    4422             : 
    4423         195 :         clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
    4424             : 
    4425         195 :         if (bargs) {
    4426         195 :                 memset(bargs, 0, sizeof(*bargs));
    4427         195 :                 btrfs_update_ioctl_balance_args(fs_info, bargs);
    4428             :         }
    4429             : 
    4430             :         /* We didn't pause, we can clean everything up. */
    4431         195 :         if (!paused) {
    4432         195 :                 reset_balance_state(fs_info);
    4433         195 :                 btrfs_exclop_finish(fs_info);
    4434             :         }
    4435             : 
    4436         195 :         wake_up(&fs_info->balance_wait_q);
    4437             : 
    4438         195 :         return ret;
    4439           0 : out:
    4440           0 :         if (bctl->flags & BTRFS_BALANCE_RESUME)
    4441           0 :                 reset_balance_state(fs_info);
    4442             :         else
    4443           0 :                 kfree(bctl);
    4444           0 :         btrfs_exclop_finish(fs_info);
    4445             : 
    4446           0 :         return ret;
    4447             : }
    4448             : 
    4449           0 : static int balance_kthread(void *data)
    4450             : {
    4451           0 :         struct btrfs_fs_info *fs_info = data;
    4452           0 :         int ret = 0;
    4453             : 
    4454           0 :         sb_start_write(fs_info->sb);
    4455           0 :         mutex_lock(&fs_info->balance_mutex);
    4456           0 :         if (fs_info->balance_ctl)
    4457           0 :                 ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
    4458           0 :         mutex_unlock(&fs_info->balance_mutex);
    4459           0 :         sb_end_write(fs_info->sb);
    4460             : 
    4461           0 :         return ret;
    4462             : }
    4463             : 
    4464        3179 : int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
    4465             : {
    4466        3179 :         struct task_struct *tsk;
    4467             : 
    4468        3179 :         mutex_lock(&fs_info->balance_mutex);
    4469        3179 :         if (!fs_info->balance_ctl) {
    4470        3179 :                 mutex_unlock(&fs_info->balance_mutex);
    4471        3179 :                 return 0;
    4472             :         }
    4473           0 :         mutex_unlock(&fs_info->balance_mutex);
    4474             : 
    4475           0 :         if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
    4476           0 :                 btrfs_info(fs_info, "balance: resume skipped");
    4477           0 :                 return 0;
    4478             :         }
    4479             : 
    4480           0 :         spin_lock(&fs_info->super_lock);
    4481           0 :         ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
    4482           0 :         fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE;
    4483           0 :         spin_unlock(&fs_info->super_lock);
    4484             :         /*
    4485             :          * A ro->rw remount sequence should continue with the paused balance
    4486             :  * regardless of who paused it (the system or the user), so set
    4487             :          * the resume flag.
    4488             :          */
    4489           0 :         spin_lock(&fs_info->balance_lock);
    4490           0 :         fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
    4491           0 :         spin_unlock(&fs_info->balance_lock);
    4492             : 
    4493           0 :         tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
    4494           0 :         return PTR_ERR_OR_ZERO(tsk);
    4495             : }
    4496             : 
    4497        3215 : int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
    4498             : {
    4499        3215 :         struct btrfs_balance_control *bctl;
    4500        3215 :         struct btrfs_balance_item *item;
    4501        3215 :         struct btrfs_disk_balance_args disk_bargs;
    4502        3215 :         struct btrfs_path *path;
    4503        3215 :         struct extent_buffer *leaf;
    4504        3215 :         struct btrfs_key key;
    4505        3215 :         int ret;
    4506             : 
    4507        3215 :         path = btrfs_alloc_path();
    4508        3215 :         if (!path)
    4509             :                 return -ENOMEM;
    4510             : 
    4511        3215 :         key.objectid = BTRFS_BALANCE_OBJECTID;
    4512        3215 :         key.type = BTRFS_TEMPORARY_ITEM_KEY;
    4513        3215 :         key.offset = 0;
    4514             : 
    4515        3215 :         ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
    4516        3215 :         if (ret < 0)
    4517           0 :                 goto out;
    4518        3215 :         if (ret > 0) { /* ret = -ENOENT; */
    4519        3215 :                 ret = 0;
    4520        3215 :                 goto out;
    4521             :         }
    4522             : 
    4523           0 :         bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
    4524           0 :         if (!bctl) {
    4525           0 :                 ret = -ENOMEM;
    4526           0 :                 goto out;
    4527             :         }
    4528             : 
    4529           0 :         leaf = path->nodes[0];
    4530           0 :         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
    4531             : 
    4532           0 :         bctl->flags = btrfs_balance_flags(leaf, item);
    4533           0 :         bctl->flags |= BTRFS_BALANCE_RESUME;
    4534             : 
    4535           0 :         btrfs_balance_data(leaf, item, &disk_bargs);
    4536           0 :         btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
    4537           0 :         btrfs_balance_meta(leaf, item, &disk_bargs);
    4538           0 :         btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
    4539           0 :         btrfs_balance_sys(leaf, item, &disk_bargs);
    4540           0 :         btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
    4541             : 
    4542             :         /*
    4543             :          * This should never happen, as the paused balance state is recovered
    4544             :  * during mount without any chance for other exclusive ops to collide.
    4545             :          *
    4546             :          * This gives the exclusive op status to balance and keeps in paused
    4547             :          * state until user intervention (cancel or umount). If the ownership
    4548             :          * cannot be assigned, show a message but do not fail. The balance
    4549             :          * is in a paused state and must have fs_info::balance_ctl properly
    4550             :          * set up.
    4551             :          */
    4552           0 :         if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED))
    4553           0 :                 btrfs_warn(fs_info,
    4554             :         "balance: cannot set exclusive op status, resume manually");
    4555             : 
    4556           0 :         btrfs_release_path(path);
    4557             : 
    4558           0 :         mutex_lock(&fs_info->balance_mutex);
    4559           0 :         BUG_ON(fs_info->balance_ctl);
    4560           0 :         spin_lock(&fs_info->balance_lock);
    4561           0 :         fs_info->balance_ctl = bctl;
    4562           0 :         spin_unlock(&fs_info->balance_lock);
    4563           0 :         mutex_unlock(&fs_info->balance_mutex);
    4564        3215 : out:
    4565        3215 :         btrfs_free_path(path);
    4566        3215 :         return ret;
    4567             : }
    4568             : 
    4569        3219 : int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
    4570             : {
    4571        3219 :         int ret = 0;
    4572             : 
    4573        3219 :         mutex_lock(&fs_info->balance_mutex);
    4574        3219 :         if (!fs_info->balance_ctl) {
    4575        3219 :                 mutex_unlock(&fs_info->balance_mutex);
    4576        3219 :                 return -ENOTCONN;
    4577             :         }
    4578             : 
    4579           0 :         if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
    4580           0 :                 atomic_inc(&fs_info->balance_pause_req);
    4581           0 :                 mutex_unlock(&fs_info->balance_mutex);
    4582             : 
    4583           0 :                 wait_event(fs_info->balance_wait_q,
    4584             :                            !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
    4585             : 
    4586           0 :                 mutex_lock(&fs_info->balance_mutex);
    4587             :                 /* we are good with balance_ctl ripped off from under us */
    4588           0 :                 BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
    4589           0 :                 atomic_dec(&fs_info->balance_pause_req);
    4590             :         } else {
    4591             :                 ret = -ENOTCONN;
    4592             :         }
    4593             : 
    4594           0 :         mutex_unlock(&fs_info->balance_mutex);
    4595           0 :         return ret;
    4596             : }
    4597             : 
    4598          12 : int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
    4599             : {
    4600          12 :         mutex_lock(&fs_info->balance_mutex);
    4601          12 :         if (!fs_info->balance_ctl) {
    4602          10 :                 mutex_unlock(&fs_info->balance_mutex);
    4603          10 :                 return -ENOTCONN;
    4604             :         }
    4605             : 
    4606             :         /*
    4607             :          * A paused balance with the item stored on disk can be resumed at
    4608             :          * mount time if the mount is read-write. Otherwise it's still paused
    4609             :          * and we must not allow cancelling as it deletes the item.
    4610             :          */
    4611           2 :         if (sb_rdonly(fs_info->sb)) {
    4612           0 :                 mutex_unlock(&fs_info->balance_mutex);
    4613           0 :                 return -EROFS;
    4614             :         }
    4615             : 
    4616           2 :         atomic_inc(&fs_info->balance_cancel_req);
    4617             :         /*
    4618             :          * If balance is running, just wait for it to finish and return;
    4619             :          * the balance item is deleted by btrfs_balance() in that case.
    4620             :          */
    4621           4 :         if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
    4622           2 :                 mutex_unlock(&fs_info->balance_mutex);
    4623           4 :                 wait_event(fs_info->balance_wait_q,
    4624             :                            !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
    4625           2 :                 mutex_lock(&fs_info->balance_mutex);
    4626             :         } else {
    4627           0 :                 mutex_unlock(&fs_info->balance_mutex);
    4628             :                 /*
    4629             :                  * The lock was released to allow other waiters to continue;
    4630             :                  * reexamine the status now that we hold it again.
    4631             :                  */
    4632           0 :                 mutex_lock(&fs_info->balance_mutex);
    4633             : 
    4634           0 :                 if (fs_info->balance_ctl) {
    4635           0 :                         reset_balance_state(fs_info);
    4636           0 :                         btrfs_exclop_finish(fs_info);
    4637           0 :                         btrfs_info(fs_info, "balance: canceled");
    4638             :                 }
    4639             :         }
    4640             : 
    4641           2 :         BUG_ON(fs_info->balance_ctl ||
    4642             :                 test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
    4643           2 :         atomic_dec(&fs_info->balance_cancel_req);
    4644           2 :         mutex_unlock(&fs_info->balance_mutex);
    4645           2 :         return 0;
    4646             : }
    4647             : 
    4648         994 : int btrfs_uuid_scan_kthread(void *data)
    4649             : {
    4650         994 :         struct btrfs_fs_info *fs_info = data;
    4651         994 :         struct btrfs_root *root = fs_info->tree_root;
    4652         994 :         struct btrfs_key key;
    4653         994 :         struct btrfs_path *path = NULL;
    4654         994 :         int ret = 0;
    4655         994 :         struct extent_buffer *eb;
    4656         994 :         int slot;
    4657         994 :         struct btrfs_root_item root_item;
    4658         994 :         u32 item_size;
    4659         994 :         struct btrfs_trans_handle *trans = NULL;
    4660         994 :         bool closing = false;
    4661             : 
    4662         994 :         path = btrfs_alloc_path();
    4663         994 :         if (!path) {
    4664           0 :                 ret = -ENOMEM;
    4665           0 :                 goto out;
    4666             :         }
    4667             : 
    4668         994 :         key.objectid = 0;
    4669         994 :         key.type = BTRFS_ROOT_ITEM_KEY;
    4670         994 :         key.offset = 0;
    4671             : 
    4672       12942 :         while (1) {
    4673       12942 :                 if (btrfs_fs_closing(fs_info)) {
    4674             :                         closing = true;
    4675             :                         break;
    4676             :                 }
    4677       12942 :                 ret = btrfs_search_forward(root, &key, path,
    4678             :                                 BTRFS_OLDEST_GENERATION);
    4679       12942 :                 if (ret) {
    4680         994 :                         if (ret > 0)
    4681             :                                 ret = 0;
    4682             :                         break;
    4683             :                 }
    4684             : 
    4685       11948 :                 if (key.type != BTRFS_ROOT_ITEM_KEY ||
    4686        7947 :                     (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
    4687        2986 :                      key.objectid != BTRFS_FS_TREE_OBJECTID) ||
    4688             :                     key.objectid > BTRFS_LAST_FREE_OBJECTID)
    4689        9956 :                         goto skip;
    4690             : 
    4691        1992 :                 eb = path->nodes[0];
    4692        1992 :                 slot = path->slots[0];
    4693        1992 :                 item_size = btrfs_item_size(eb, slot);
    4694        1992 :                 if (item_size < sizeof(root_item))
    4695           0 :                         goto skip;
    4696             : 
    4697        1992 :                 read_extent_buffer(eb, &root_item,
    4698        1992 :                                    btrfs_item_ptr_offset(eb, slot),
    4699             :                                    (int)sizeof(root_item));
    4700        1992 :                 if (btrfs_root_refs(&root_item) == 0)
    4701           0 :                         goto skip;
    4702             : 
    4703        1992 :                 if (!btrfs_is_empty_uuid(root_item.uuid) ||
    4704          12 :                     !btrfs_is_empty_uuid(root_item.received_uuid)) {
    4705        1980 :                         if (trans)
    4706         990 :                                 goto update_tree;
    4707             : 
    4708         990 :                         btrfs_release_path(path);
    4709             :                         /*
    4710             :                          * 1 - subvol uuid item
    4711             :                          * 1 - received_subvol uuid item
    4712             :                          */
    4713         990 :                         trans = btrfs_start_transaction(fs_info->uuid_root, 2);
    4714         990 :                         if (IS_ERR(trans)) {
    4715           0 :                                 ret = PTR_ERR(trans);
    4716           0 :                                 break;
    4717             :                         }
    4718         990 :                         continue;
    4719             :                 } else {
    4720          12 :                         goto skip;
    4721             :                 }
    4722             : update_tree:
    4723         990 :                 btrfs_release_path(path);
    4724         990 :                 if (!btrfs_is_empty_uuid(root_item.uuid)) {
    4725         990 :                         ret = btrfs_uuid_tree_add(trans, root_item.uuid,
    4726             :                                                   BTRFS_UUID_KEY_SUBVOL,
    4727             :                                                   key.objectid);
    4728         990 :                         if (ret < 0) {
    4729           0 :                                 btrfs_warn(fs_info, "uuid_tree_add failed %d",
    4730             :                                         ret);
    4731           0 :                                 break;
    4732             :                         }
    4733             :                 }
    4734             : 
    4735         990 :                 if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
    4736           0 :                         ret = btrfs_uuid_tree_add(trans,
    4737             :                                                   root_item.received_uuid,
    4738             :                                                  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
    4739             :                                                   key.objectid);
    4740           0 :                         if (ret < 0) {
    4741           0 :                                 btrfs_warn(fs_info, "uuid_tree_add failed %d",
    4742             :                                         ret);
    4743           0 :                                 break;
    4744             :                         }
    4745             :                 }
    4746             : 
    4747         990 : skip:
    4748       10958 :                 btrfs_release_path(path);
    4749       10958 :                 if (trans) {
    4750         990 :                         ret = btrfs_end_transaction(trans);
    4751         990 :                         trans = NULL;
    4752         990 :                         if (ret)
    4753             :                                 break;
    4754             :                 }
    4755             : 
    4756       10958 :                 if (key.offset < (u64)-1) {
    4757       10958 :                         key.offset++;
    4758           0 :                 } else if (key.type < BTRFS_ROOT_ITEM_KEY) {
    4759           0 :                         key.offset = 0;
    4760           0 :                         key.type = BTRFS_ROOT_ITEM_KEY;
    4761           0 :                 } else if (key.objectid < (u64)-1) {
    4762           0 :                         key.offset = 0;
    4763           0 :                         key.type = BTRFS_ROOT_ITEM_KEY;
    4764           0 :                         key.objectid++;
    4765             :                 } else {
    4766             :                         break;
    4767             :                 }
    4768       10958 :                 cond_resched();
    4769             :         }
    4770             : 
    4771           0 : out:
    4772         994 :         btrfs_free_path(path);
    4773         994 :         if (trans && !IS_ERR(trans))
    4774           0 :                 btrfs_end_transaction(trans);
    4775         994 :         if (ret)
    4776           0 :                 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
    4777         994 :         else if (!closing)
    4778         994 :                 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
    4779         994 :         up(&fs_info->uuid_tree_rescan_sem);
    4780         994 :         return 0;
    4781             : }
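                     :
                     : /*
                     :  * Note (an editorial addition, not from the kernel source): the key
                     :  * advancement at the bottom of the scan loop above walks the key space in
                     :  * (objectid, type, offset) order: bump offset first; once it is exhausted,
                     :  * reset it and clamp the type to BTRFS_ROOT_ITEM_KEY; once that is
                     :  * exhausted too, move on to the next objectid, and stop when the objectid
                     :  * reaches its maximum.
                     :  */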
    4782             : 
    4783           3 : int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
    4784             : {
    4785           3 :         struct btrfs_trans_handle *trans;
    4786           3 :         struct btrfs_root *tree_root = fs_info->tree_root;
    4787           3 :         struct btrfs_root *uuid_root;
    4788           3 :         struct task_struct *task;
    4789           3 :         int ret;
    4790             : 
    4791             :         /*
    4792             :          * 1 - root node
    4793             :          * 1 - root item
    4794             :          */
    4795           3 :         trans = btrfs_start_transaction(tree_root, 2);
    4796           3 :         if (IS_ERR(trans))
    4797           0 :                 return PTR_ERR(trans);
    4798             : 
    4799           3 :         uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
    4800           3 :         if (IS_ERR(uuid_root)) {
    4801           0 :                 ret = PTR_ERR(uuid_root);
    4802           0 :                 btrfs_abort_transaction(trans, ret);
    4803           0 :                 btrfs_end_transaction(trans);
    4804           0 :                 return ret;
    4805             :         }
    4806             : 
    4807           3 :         fs_info->uuid_root = uuid_root;
    4808             : 
    4809           3 :         ret = btrfs_commit_transaction(trans);
    4810           3 :         if (ret)
    4811             :                 return ret;
    4812             : 
    4813           3 :         down(&fs_info->uuid_tree_rescan_sem);
    4814           3 :         task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
    4815           3 :         if (IS_ERR(task)) {
    4816             :                 /* fs_info->update_uuid_tree_gen remains 0 in all error cases */
    4817           0 :                 btrfs_warn(fs_info, "failed to start uuid_scan task");
    4818           0 :                 up(&fs_info->uuid_tree_rescan_sem);
    4819           0 :                 return PTR_ERR(task);
    4820             :         }
    4821             : 
    4822             :         return 0;
    4823             : }
    4824             : 
    4825             : /*
    4826             :  * Shrinking a device means finding all of the device extents past
    4827             :  * the new size and then following the back refs to their chunks.
    4828             :  * The chunk relocation code actually frees the device extents.
    4829             :  */
    4830           3 : int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
    4831             : {
    4832           3 :         struct btrfs_fs_info *fs_info = device->fs_info;
    4833           3 :         struct btrfs_root *root = fs_info->dev_root;
    4834           3 :         struct btrfs_trans_handle *trans;
    4835           3 :         struct btrfs_dev_extent *dev_extent = NULL;
    4836           3 :         struct btrfs_path *path;
    4837           3 :         u64 length;
    4838           3 :         u64 chunk_offset;
    4839           3 :         int ret;
    4840           3 :         int slot;
    4841           3 :         int failed = 0;
    4842           3 :         bool retried = false;
    4843           3 :         struct extent_buffer *l;
    4844           3 :         struct btrfs_key key;
    4845           3 :         struct btrfs_super_block *super_copy = fs_info->super_copy;
    4846           3 :         u64 old_total = btrfs_super_total_bytes(super_copy);
    4847           3 :         u64 old_size = btrfs_device_get_total_bytes(device);
    4848           3 :         u64 diff;
    4849           3 :         u64 start;
    4850             : 
    4851           3 :         new_size = round_down(new_size, fs_info->sectorsize);
    4852           3 :         start = new_size;
    4853           3 :         diff = round_down(old_size - new_size, fs_info->sectorsize);
    4854             : 
    4855           6 :         if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
    4856             :                 return -EINVAL;
    4857             : 
    4858           3 :         path = btrfs_alloc_path();
    4859           3 :         if (!path)
    4860             :                 return -ENOMEM;
    4861             : 
    4862           3 :         path->reada = READA_BACK;
    4863             : 
    4864           3 :         trans = btrfs_start_transaction(root, 0);
    4865           3 :         if (IS_ERR(trans)) {
    4866           0 :                 btrfs_free_path(path);
    4867           0 :                 return PTR_ERR(trans);
    4868             :         }
    4869             : 
    4870           3 :         mutex_lock(&fs_info->chunk_mutex);
    4871             : 
    4872           3 :         btrfs_device_set_total_bytes(device, new_size);
    4873           6 :         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
    4874           3 :                 device->fs_devices->total_rw_bytes -= diff;
    4875           3 :                 atomic64_sub(diff, &fs_info->free_chunk_space);
    4876             :         }
    4877             : 
    4878             :         /*
    4879             :          * Once the device's size has been set to the new size, ensure all
    4880             :          * in-memory chunks are synced to disk so that the loop below sees them
    4881             :          * and relocates them accordingly.
    4882             :          */
    4883           3 :         if (contains_pending_extent(device, &start, diff)) {
    4884           2 :                 mutex_unlock(&fs_info->chunk_mutex);
    4885           2 :                 ret = btrfs_commit_transaction(trans);
    4886           2 :                 if (ret)
    4887           0 :                         goto done;
    4888             :         } else {
    4889           1 :                 mutex_unlock(&fs_info->chunk_mutex);
    4890           1 :                 btrfs_end_transaction(trans);
    4891             :         }
    4892             : 
    4893             : again:
    4894           3 :         key.objectid = device->devid;
    4895           3 :         key.offset = (u64)-1;
    4896           3 :         key.type = BTRFS_DEV_EXTENT_KEY;
    4897             : 
    4898           4 :         do {
    4899           4 :                 mutex_lock(&fs_info->reclaim_bgs_lock);
    4900           4 :                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
    4901           4 :                 if (ret < 0) {
    4902           0 :                         mutex_unlock(&fs_info->reclaim_bgs_lock);
    4903           0 :                         goto done;
    4904             :                 }
    4905             : 
    4906           4 :                 ret = btrfs_previous_item(root, path, 0, key.type);
    4907           4 :                 if (ret) {
    4908           0 :                         mutex_unlock(&fs_info->reclaim_bgs_lock);
    4909           0 :                         if (ret < 0)
    4910           0 :                                 goto done;
    4911           0 :                         ret = 0;
    4912           0 :                         btrfs_release_path(path);
    4913           0 :                         break;
    4914             :                 }
    4915             : 
    4916           4 :                 l = path->nodes[0];
    4917           4 :                 slot = path->slots[0];
    4918           4 :                 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
    4919             : 
    4920           4 :                 if (key.objectid != device->devid) {
    4921           0 :                         mutex_unlock(&fs_info->reclaim_bgs_lock);
    4922           0 :                         btrfs_release_path(path);
    4923           0 :                         break;
    4924             :                 }
    4925             : 
    4926           4 :                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
    4927           4 :                 length = btrfs_dev_extent_length(l, dev_extent);
    4928             : 
    4929           4 :                 if (key.offset + length <= new_size) {
    4930           2 :                         mutex_unlock(&fs_info->reclaim_bgs_lock);
    4931           2 :                         btrfs_release_path(path);
    4932           2 :                         break;
    4933             :                 }
    4934             : 
    4935           2 :                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
    4936           2 :                 btrfs_release_path(path);
    4937             : 
    4938             :                 /*
    4939             :                  * We may be relocating the only data chunk we have,
    4940             :                  * which could potentially end up losing the data's
    4941             :                  * raid profile, so let's allocate an empty one in
    4942             :                  * advance.
    4943             :                  */
    4944           2 :                 ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
    4945           2 :                 if (ret < 0) {
    4946           0 :                         mutex_unlock(&fs_info->reclaim_bgs_lock);
    4947           0 :                         goto done;
    4948             :                 }
    4949             : 
    4950           2 :                 ret = btrfs_relocate_chunk(fs_info, chunk_offset);
    4951           2 :                 mutex_unlock(&fs_info->reclaim_bgs_lock);
    4952           2 :                 if (ret == -ENOSPC) {
    4953           0 :                         failed++;
    4954           2 :                 } else if (ret) {
    4955           1 :                         if (ret == -ETXTBSY) {
    4956           1 :                                 btrfs_warn(fs_info,
    4957             :                    "could not shrink block group %llu due to active swapfile",
    4958             :                                            chunk_offset);
    4959             :                         }
    4960           1 :                         goto done;
    4961             :                 }
    4962           1 :         } while (key.offset-- > 0);
    4963             : 
    4964           2 :         if (failed && !retried) {
    4965           0 :                 failed = 0;
    4966           0 :                 retried = true;
    4967           0 :                 goto again;
    4968           2 :         } else if (failed && retried) {
    4969           0 :                 ret = -ENOSPC;
    4970           0 :                 goto done;
    4971             :         }
    4972             : 
    4973             :         /* Shrinking succeeded, else we would be at "done". */
    4974           2 :         trans = btrfs_start_transaction(root, 0);
    4975           2 :         if (IS_ERR(trans)) {
    4976           0 :                 ret = PTR_ERR(trans);
    4977           0 :                 goto done;
    4978             :         }
    4979             : 
    4980           2 :         mutex_lock(&fs_info->chunk_mutex);
    4981             :         /* Clear all state bits beyond the shrunk device size */
    4982           2 :         clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
    4983             :                           CHUNK_STATE_MASK);
    4984             : 
    4985           2 :         btrfs_device_set_disk_total_bytes(device, new_size);
    4986           2 :         if (list_empty(&device->post_commit_list))
    4987           2 :                 list_add_tail(&device->post_commit_list,
    4988           2 :                               &trans->transaction->dev_update_list);
    4989             : 
    4990           2 :         WARN_ON(diff > old_total);
    4991           2 :         btrfs_set_super_total_bytes(super_copy,
    4992           2 :                         round_down(old_total - diff, fs_info->sectorsize));
    4993           2 :         mutex_unlock(&fs_info->chunk_mutex);
    4994             : 
    4995           2 :         btrfs_reserve_chunk_metadata(trans, false);
    4996             :         /* Now btrfs_update_device() will change the on-disk size. */
    4997           2 :         ret = btrfs_update_device(trans, device);
    4998           2 :         btrfs_trans_release_chunk_metadata(trans);
    4999           2 :         if (ret < 0) {
    5000           0 :                 btrfs_abort_transaction(trans, ret);
    5001           0 :                 btrfs_end_transaction(trans);
    5002             :         } else {
    5003           2 :                 ret = btrfs_commit_transaction(trans);
    5004             :         }
    5005           3 : done:
    5006           3 :         btrfs_free_path(path);
    5007           3 :         if (ret) {
    5008           1 :                 mutex_lock(&fs_info->chunk_mutex);
    5009           1 :                 btrfs_device_set_total_bytes(device, old_size);
    5010           2 :                 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
    5011           1 :                         device->fs_devices->total_rw_bytes += diff;
    5012           1 :                 atomic64_add(diff, &fs_info->free_chunk_space);
    5013           1 :                 mutex_unlock(&fs_info->chunk_mutex);
    5014             :         }
    5015             :         return ret;
    5016             : }
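A quick hand-worked check of the rounding at the top of btrfs_shrink_device(), assuming a
4 KiB sectorsize: shrinking a device whose in-memory total is 10 GiB to a requested new_size
of 6 GiB + 1000 bytes rounds new_size down to 6 GiB, and diff = round_down(10 GiB - 6 GiB,
4096) = 4 GiB. That diff is what gets subtracted from total_rw_bytes and free_chunk_space up
front, and it is exactly what the done: path adds back if any later step fails.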
    5017             : 
    5018          93 : static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
    5019             :                            struct btrfs_key *key,
    5020             :                            struct btrfs_chunk *chunk, int item_size)
    5021             : {
    5022          93 :         struct btrfs_super_block *super_copy = fs_info->super_copy;
    5023          93 :         struct btrfs_disk_key disk_key;
    5024          93 :         u32 array_size;
    5025          93 :         u8 *ptr;
    5026             : 
    5027          93 :         lockdep_assert_held(&fs_info->chunk_mutex);
    5028             : 
    5029          93 :         array_size = btrfs_super_sys_array_size(super_copy);
    5030          93 :         if (array_size + item_size + sizeof(disk_key)
    5031             :                         > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
    5032             :                 return -EFBIG;
    5033             : 
    5034          93 :         ptr = super_copy->sys_chunk_array + array_size;
    5035          93 :         btrfs_cpu_key_to_disk(&disk_key, key);
    5036         186 :         memcpy(ptr, &disk_key, sizeof(disk_key));
    5037          93 :         ptr += sizeof(disk_key);
    5038         186 :         memcpy(ptr, chunk, item_size);
    5039          93 :         item_size += sizeof(disk_key);
    5040          93 :         btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
    5041             : 
    5042          93 :         return 0;
    5043             : }
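btrfs_add_system_chunk() above appends each SYSTEM chunk to the in-memory superblock as a
back-to-back (disk key, chunk item) pair. A rough sketch of the layout it produces, assuming
some entries already sit in the array:

    super_copy->sys_chunk_array:
    | existing key/chunk pairs ... | btrfs_disk_key | btrfs_chunk + stripes |
                                   ^ old array_size ^ old array_size + sizeof(disk_key)

Afterwards btrfs_super_sys_array_size() reports old array_size + sizeof(struct
btrfs_disk_key) + item_size, and the append is refused with -EFBIG once that total would
exceed BTRFS_SYSTEM_CHUNK_ARRAY_SIZE.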
    5044             : 
    5045             : /*
    5046             :  * Sort the devices in descending order by max_avail, then by total_avail.
    5047             :  */
    5048           0 : static int btrfs_cmp_device_info(const void *a, const void *b)
    5049             : {
    5050           0 :         const struct btrfs_device_info *di_a = a;
    5051           0 :         const struct btrfs_device_info *di_b = b;
    5052             : 
    5053           0 :         if (di_a->max_avail > di_b->max_avail)
    5054             :                 return -1;
    5055           0 :         if (di_a->max_avail < di_b->max_avail)
    5056             :                 return 1;
    5057           0 :         if (di_a->total_avail > di_b->total_avail)
    5058             :                 return -1;
    5059           0 :         if (di_a->total_avail < di_b->total_avail)
    5060           0 :                 return 1;
    5061             :         return 0;
    5062             : }
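gather_device_info() below hands this comparator to sort(), so devices_info ends up ordered
largest hole first, with total_avail as the tie-breaker. A minimal user-space sketch of the
same ordering rule (hypothetical demo_* names, plain qsort() standing in for the kernel's
sort()):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_device_info {
        uint64_t max_avail;
        uint64_t total_avail;
};

/* Same descending comparison as btrfs_cmp_device_info() above. */
static int demo_cmp_device_info(const void *a, const void *b)
{
        const struct demo_device_info *di_a = a;
        const struct demo_device_info *di_b = b;

        if (di_a->max_avail > di_b->max_avail)
                return -1;
        if (di_a->max_avail < di_b->max_avail)
                return 1;
        if (di_a->total_avail > di_b->total_avail)
                return -1;
        if (di_a->total_avail < di_b->total_avail)
                return 1;
        return 0;
}

int main(void)
{
        struct demo_device_info info[] = {
                { .max_avail = 1 << 20, .total_avail = 4ULL << 20 },
                { .max_avail = 8 << 20, .total_avail = 8ULL << 20 },
                { .max_avail = 1 << 20, .total_avail = 6ULL << 20 },
        };

        qsort(info, 3, sizeof(info[0]), demo_cmp_device_info);
        /*
         * Prints the 8 MiB hole first, then the two 1 MiB holes with the
         * larger total_avail ahead of the smaller one.
         */
        for (int i = 0; i < 3; i++)
                printf("%llu %llu\n",
                       (unsigned long long)info[i].max_avail,
                       (unsigned long long)info[i].total_avail);
        return 0;
}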
    5063             : 
    5064        1436 : static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
    5065             : {
    5066        1436 :         if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
    5067             :                 return;
    5068             : 
    5069           0 :         btrfs_set_fs_incompat(info, RAID56);
    5070             : }
    5071             : 
    5072        1436 : static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
    5073             : {
    5074        1436 :         if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4)))
    5075             :                 return;
    5076             : 
    5077           0 :         btrfs_set_fs_incompat(info, RAID1C34);
    5078             : }
    5079             : 
    5080             : /*
    5081             :  * Structure used internally for btrfs_create_chunk() function.
    5082             :  * Wraps needed parameters.
    5083             :  */
    5084             : struct alloc_chunk_ctl {
    5085             :         u64 start;
    5086             :         u64 type;
    5087             :         /* Total number of stripes to allocate */
    5088             :         int num_stripes;
    5089             :         /* sub_stripes info for map */
    5090             :         int sub_stripes;
    5091             :         /* Stripes per device */
    5092             :         int dev_stripes;
    5093             :         /* Maximum number of devices to use */
    5094             :         int devs_max;
    5095             :         /* Minimum number of devices to use */
    5096             :         int devs_min;
    5097             :         /* ndevs has to be a multiple of this */
    5098             :         int devs_increment;
    5099             :         /* Number of copies */
    5100             :         int ncopies;
    5101             :         /* Number of stripes worth of bytes to store parity information */
    5102             :         int nparity;
    5103             :         u64 max_stripe_size;
    5104             :         u64 max_chunk_size;
    5105             :         u64 dev_extent_min;
    5106             :         u64 stripe_size;
    5107             :         u64 chunk_size;
    5108             :         int ndevs;
    5109             : };
    5110             : 
    5111        1482 : static void init_alloc_chunk_ctl_policy_regular(
    5112             :                                 struct btrfs_fs_devices *fs_devices,
    5113             :                                 struct alloc_chunk_ctl *ctl)
    5114             : {
    5115        1482 :         struct btrfs_space_info *space_info;
    5116             : 
    5117        1482 :         space_info = btrfs_find_space_info(fs_devices->fs_info, ctl->type);
    5118        1482 :         ASSERT(space_info);
    5119             : 
    5120        1482 :         ctl->max_chunk_size = READ_ONCE(space_info->chunk_size);
    5121        1482 :         ctl->max_stripe_size = ctl->max_chunk_size;
    5122             : 
    5123        1482 :         if (ctl->type & BTRFS_BLOCK_GROUP_SYSTEM)
    5124          93 :                 ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK);
    5125             : 
    5126             :         /* We don't want a chunk larger than 10% of writable space */
    5127        1482 :         ctl->max_chunk_size = min(mult_perc(fs_devices->total_rw_bytes, 10),
    5128             :                                   ctl->max_chunk_size);
    5129        1482 :         ctl->dev_extent_min = btrfs_stripe_nr_to_offset(ctl->dev_stripes);
    5130        1482 : }
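A hand-worked example of the clamp above, with assumed numbers: for a data space_info whose
chunk_size is 10 GiB on a filesystem with 20 GiB of writable space, mult_perc(total_rw_bytes,
10) is 2 GiB, so ctl->max_chunk_size ends up at 2 GiB while ctl->max_stripe_size keeps the
pre-clamp 10 GiB value; dev_extent_min is dev_stripes stripes' worth of bytes (64 KiB each
with the usual BTRFS_STRIPE_LEN).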
    5131             : 
    5132           0 : static void init_alloc_chunk_ctl_policy_zoned(
    5133             :                                       struct btrfs_fs_devices *fs_devices,
    5134             :                                       struct alloc_chunk_ctl *ctl)
    5135             : {
    5136           0 :         u64 zone_size = fs_devices->fs_info->zone_size;
    5137           0 :         u64 limit;
    5138           0 :         int min_num_stripes = ctl->devs_min * ctl->dev_stripes;
    5139           0 :         int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies;
    5140           0 :         u64 min_chunk_size = min_data_stripes * zone_size;
    5141           0 :         u64 type = ctl->type;
    5142             : 
    5143           0 :         ctl->max_stripe_size = zone_size;
    5144           0 :         if (type & BTRFS_BLOCK_GROUP_DATA) {
    5145           0 :                 ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE,
    5146             :                                                  zone_size);
    5147           0 :         } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
    5148           0 :                 ctl->max_chunk_size = ctl->max_stripe_size;
    5149           0 :         } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
    5150           0 :                 ctl->max_chunk_size = 2 * ctl->max_stripe_size;
    5151           0 :                 ctl->devs_max = min_t(int, ctl->devs_max,
    5152             :                                       BTRFS_MAX_DEVS_SYS_CHUNK);
    5153             :         } else {
    5154           0 :                 BUG();
    5155             :         }
    5156             : 
    5157             :         /* We don't want a chunk larger than 10% of writable space */
    5158           0 :         limit = max(round_down(mult_perc(fs_devices->total_rw_bytes, 10),
    5159             :                                zone_size),
    5160             :                     min_chunk_size);
    5161           0 :         ctl->max_chunk_size = min(limit, ctl->max_chunk_size);
    5162           0 :         ctl->dev_extent_min = zone_size * ctl->dev_stripes;
    5163           0 : }
    5164             : 
    5165        1482 : static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
    5166             :                                  struct alloc_chunk_ctl *ctl)
    5167             : {
    5168        1482 :         int index = btrfs_bg_flags_to_raid_index(ctl->type);
    5169             : 
    5170        1482 :         ctl->sub_stripes = btrfs_raid_array[index].sub_stripes;
    5171        1482 :         ctl->dev_stripes = btrfs_raid_array[index].dev_stripes;
    5172        1482 :         ctl->devs_max = btrfs_raid_array[index].devs_max;
    5173        1482 :         if (!ctl->devs_max)
    5174           0 :                 ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info);
    5175        1482 :         ctl->devs_min = btrfs_raid_array[index].devs_min;
    5176        1482 :         ctl->devs_increment = btrfs_raid_array[index].devs_increment;
    5177        1482 :         ctl->ncopies = btrfs_raid_array[index].ncopies;
    5178        1482 :         ctl->nparity = btrfs_raid_array[index].nparity;
    5179        1482 :         ctl->ndevs = 0;
    5180             : 
    5181        1482 :         switch (fs_devices->chunk_alloc_policy) {
    5182        1482 :         case BTRFS_CHUNK_ALLOC_REGULAR:
    5183        1482 :                 init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
    5184        1482 :                 break;
    5185           0 :         case BTRFS_CHUNK_ALLOC_ZONED:
    5186           0 :                 init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl);
    5187           0 :                 break;
    5188           0 :         default:
    5189           0 :                 BUG();
    5190             :         }
    5191        1482 : }
    5192             : 
    5193        1482 : static int gather_device_info(struct btrfs_fs_devices *fs_devices,
    5194             :                               struct alloc_chunk_ctl *ctl,
    5195             :                               struct btrfs_device_info *devices_info)
    5196             : {
    5197        1482 :         struct btrfs_fs_info *info = fs_devices->fs_info;
    5198        1482 :         struct btrfs_device *device;
    5199        1482 :         u64 total_avail;
    5200        1482 :         u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
    5201        1482 :         int ret;
    5202        1482 :         int ndevs = 0;
    5203        1482 :         u64 max_avail;
    5204        1482 :         u64 dev_offset;
    5205             : 
    5206             :         /*
    5207             :          * in the first pass through the devices list, we gather information
    5208             :          * about the available holes on each device.
    5209             :          */
    5210        2964 :         list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
    5211        1482 :                 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
    5212           0 :                         WARN(1, KERN_ERR
    5213             :                                "BTRFS: read-only device in alloc_list\n");
    5214           0 :                         continue;
    5215             :                 }
    5216             : 
    5217        2964 :                 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
    5218        2964 :                                         &device->dev_state) ||
    5219           0 :                     test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
    5220           0 :                         continue;
    5221             : 
    5222        1482 :                 if (device->total_bytes > device->bytes_used)
    5223        1482 :                         total_avail = device->total_bytes - device->bytes_used;
    5224             :                 else
    5225             :                         total_avail = 0;
    5226             : 
    5227             :                 /* If there is no space on this device, skip it. */
    5228        1482 :                 if (total_avail < ctl->dev_extent_min)
    5229           0 :                         continue;
    5230             : 
    5231        1482 :                 ret = find_free_dev_extent(device, dev_extent_want, &dev_offset,
    5232             :                                            &max_avail);
    5233        1482 :                 if (ret && ret != -ENOSPC)
    5234           0 :                         return ret;
    5235             : 
    5236        1482 :                 if (ret == 0)
    5237         940 :                         max_avail = dev_extent_want;
    5238             : 
    5239        1482 :                 if (max_avail < ctl->dev_extent_min) {
    5240          46 :                         if (btrfs_test_opt(info, ENOSPC_DEBUG))
    5241             :                                 btrfs_debug(info,
    5242             :                         "%s: devid %llu has no free space, have=%llu want=%llu",
    5243             :                                             __func__, device->devid, max_avail,
    5244             :                                             ctl->dev_extent_min);
    5245          46 :                         continue;
    5246             :                 }
    5247             : 
    5248        1436 :                 if (ndevs == fs_devices->rw_devices) {
    5249           0 :                         WARN(1, "%s: found more than %llu devices\n",
    5250             :                              __func__, fs_devices->rw_devices);
    5251           0 :                         break;
    5252             :                 }
    5253        1436 :                 devices_info[ndevs].dev_offset = dev_offset;
    5254        1436 :                 devices_info[ndevs].max_avail = max_avail;
    5255        1436 :                 devices_info[ndevs].total_avail = total_avail;
    5256        1436 :                 devices_info[ndevs].dev = device;
    5257        1436 :                 ++ndevs;
    5258             :         }
    5259        1482 :         ctl->ndevs = ndevs;
    5260             : 
    5261             :         /*
    5262             :          * now sort the devices by hole size / available space
    5263             :          */
    5264        1482 :         sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
    5265             :              btrfs_cmp_device_info, NULL);
    5266             : 
    5267        1482 :         return 0;
    5268             : }
    5269             : 
    5270        1436 : static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
    5271             :                                       struct btrfs_device_info *devices_info)
    5272             : {
    5273             :         /* Number of stripes that count for block group size */
    5274        1436 :         int data_stripes;
    5275             : 
    5276             :         /*
    5277             :          * The primary goal is to maximize the number of stripes, so use as
    5278             :          * many devices as possible, even if the stripes are not maximum sized.
    5279             :          *
    5280             :          * The DUP profile stores more than one stripe per device;
    5281             :          * max_avail is the total size, so we have to adjust.
    5282             :          */
    5283        1436 :         ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
    5284        1436 :                                    ctl->dev_stripes);
    5285        1436 :         ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
    5286             : 
    5287             :         /* This will have to be fixed for RAID1 and RAID10 over more drives */
    5288        1436 :         data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
    5289             : 
    5290             :         /*
    5291             :          * Use the number of data stripes to figure out how big this chunk is
    5292             :          * really going to be in terms of logical address space, and compare
    5293             :          * that answer with the max chunk size. If it's higher, we try to
    5294             :          * reduce stripe_size.
    5295             :          */
    5296        1436 :         if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
    5297             :                 /*
    5298             :                  * Reduce stripe_size, round it up to a 16MB boundary again and
    5299             :                  * then use it, unless it ends up being even bigger than the
    5300             :                  * previous value we had already.
    5301             :                  */
    5302        1054 :                 ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
    5303             :                                                         data_stripes), SZ_16M),
    5304             :                                        ctl->stripe_size);
    5305             :         }
    5306             : 
    5307             :         /* Stripe size should not go beyond 1G. */
    5308        1436 :         ctl->stripe_size = min_t(u64, ctl->stripe_size, SZ_1G);
    5309             : 
    5310             :         /* Align to BTRFS_STRIPE_LEN */
    5311        1436 :         ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
    5312        1436 :         ctl->chunk_size = ctl->stripe_size * data_stripes;
    5313             : 
    5314        1436 :         return 0;
    5315             : }
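A hand-worked trace of the function above, under assumed numbers: a SINGLE data chunk
(dev_stripes = 1, ncopies = 1, nparity = 0) on one device whose largest hole is 12 GiB, with
max_chunk_size already clamped to 2 GiB.

    stripe_size  = 12 GiB / 1 = 12 GiB
    num_stripes  = 1 * 1 = 1, so data_stripes = 1
    12 GiB * 1 > 2 GiB, so stripe_size = min(round_up(2 GiB / 1, 16 MiB), 12 GiB) = 2 GiB
    the SZ_1G cap then brings stripe_size down to 1 GiB (already BTRFS_STRIPE_LEN aligned)
    chunk_size   = 1 GiB * 1 = 1 GiB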
    5316             : 
    5317           0 : static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
    5318             :                                     struct btrfs_device_info *devices_info)
    5319             : {
    5320           0 :         u64 zone_size = devices_info[0].dev->zone_info->zone_size;
    5321             :         /* Number of stripes that count for block group size */
    5322           0 :         int data_stripes;
    5323             : 
    5324             :         /*
    5325             :          * It should hold because:
    5326             :          *    dev_extent_min == dev_extent_want == zone_size * dev_stripes
    5327             :          */
    5328           0 :         ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min);
    5329             : 
    5330           0 :         ctl->stripe_size = zone_size;
    5331           0 :         ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
    5332           0 :         data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
    5333             : 
    5334           0 :         /* stripe_size is fixed in a zoned filesystem. Reduce ndevs instead. */
    5335           0 :         if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
    5336           0 :                 ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies,
    5337           0 :                                              ctl->stripe_size) + ctl->nparity,
    5338             :                                      ctl->dev_stripes);
    5339           0 :                 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
    5340           0 :                 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
    5341           0 :                 ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size);
    5342             :         }
    5343             : 
    5344           0 :         ctl->chunk_size = ctl->stripe_size * data_stripes;
    5345             : 
    5346           0 :         return 0;
    5347             : }
    5348             : 
    5349        1482 : static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
    5350             :                               struct alloc_chunk_ctl *ctl,
    5351             :                               struct btrfs_device_info *devices_info)
    5352             : {
    5353        1482 :         struct btrfs_fs_info *info = fs_devices->fs_info;
    5354             : 
    5355             :         /*
    5356             :          * Round down to the number of usable stripes. devs_increment can be
    5357             :          * any number, so we can't use round_down(), which requires a power of
    5358             :          * 2; rounddown() is safe for any value.
    5359             :          */
    5360        1482 :         ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment);
    5361             : 
    5362        1482 :         if (ctl->ndevs < ctl->devs_min) {
    5363             :                 if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
    5364             :                         btrfs_debug(info,
    5365             :         "%s: not enough devices with free space: have=%d minimum required=%d",
    5366             :                                     __func__, ctl->ndevs, ctl->devs_min);
    5367             :                 }
    5368             :                 return -ENOSPC;
    5369             :         }
    5370             : 
    5371        1436 :         ctl->ndevs = min(ctl->ndevs, ctl->devs_max);
    5372             : 
    5373        1436 :         switch (fs_devices->chunk_alloc_policy) {
    5374        1436 :         case BTRFS_CHUNK_ALLOC_REGULAR:
    5375        1436 :                 return decide_stripe_size_regular(ctl, devices_info);
    5376           0 :         case BTRFS_CHUNK_ALLOC_ZONED:
    5377           0 :                 return decide_stripe_size_zoned(ctl, devices_info);
    5378           0 :         default:
    5379           0 :                 BUG();
    5380             :         }
    5381             : }
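For example, with devs_increment = 2, five usable devices are rounded down to ndevs = 4 in
decide_stripe_size(); and if the rounded ndevs drops below devs_min, the allocation fails
with -ENOSPC before any stripe sizing is attempted.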
    5382             : 
    5383        1436 : static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
    5384             :                         struct alloc_chunk_ctl *ctl,
    5385             :                         struct btrfs_device_info *devices_info)
    5386             : {
    5387        1436 :         struct btrfs_fs_info *info = trans->fs_info;
    5388        1436 :         struct map_lookup *map = NULL;
    5389        1436 :         struct extent_map_tree *em_tree;
    5390        1436 :         struct btrfs_block_group *block_group;
    5391        1436 :         struct extent_map *em;
    5392        1436 :         u64 start = ctl->start;
    5393        1436 :         u64 type = ctl->type;
    5394        1436 :         int ret;
    5395        1436 :         int i;
    5396        1436 :         int j;
    5397             : 
    5398        1436 :         map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
    5399        1436 :         if (!map)
    5400             :                 return ERR_PTR(-ENOMEM);
    5401        1436 :         map->num_stripes = ctl->num_stripes;
    5402             : 
    5403        2872 :         for (i = 0; i < ctl->ndevs; ++i) {
    5404        3182 :                 for (j = 0; j < ctl->dev_stripes; ++j) {
    5405        1746 :                         int s = i * ctl->dev_stripes + j;
    5406        1746 :                         map->stripes[s].dev = devices_info[i].dev;
    5407        1746 :                         map->stripes[s].physical = devices_info[i].dev_offset +
    5408        1746 :                                                    j * ctl->stripe_size;
    5409             :                 }
    5410             :         }
    5411        1436 :         map->io_align = BTRFS_STRIPE_LEN;
    5412        1436 :         map->io_width = BTRFS_STRIPE_LEN;
    5413        1436 :         map->type = type;
    5414        1436 :         map->sub_stripes = ctl->sub_stripes;
    5415             : 
    5416        1436 :         trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);
    5417             : 
    5418        1436 :         em = alloc_extent_map();
    5419        1436 :         if (!em) {
    5420           0 :                 kfree(map);
    5421           0 :                 return ERR_PTR(-ENOMEM);
    5422             :         }
    5423        1436 :         set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
    5424        1436 :         em->map_lookup = map;
    5425        1436 :         em->start = start;
    5426        1436 :         em->len = ctl->chunk_size;
    5427        1436 :         em->block_start = 0;
    5428        1436 :         em->block_len = em->len;
    5429        1436 :         em->orig_block_len = ctl->stripe_size;
    5430             : 
    5431        1436 :         em_tree = &info->mapping_tree;
    5432        1436 :         write_lock(&em_tree->lock);
    5433        1436 :         ret = add_extent_mapping(em_tree, em, 0);
    5434        1436 :         if (ret) {
    5435           0 :                 write_unlock(&em_tree->lock);
    5436           0 :                 free_extent_map(em);
    5437           0 :                 return ERR_PTR(ret);
    5438             :         }
    5439        1436 :         write_unlock(&em_tree->lock);
    5440             : 
    5441        1436 :         block_group = btrfs_make_block_group(trans, type, start, ctl->chunk_size);
    5442        1436 :         if (IS_ERR(block_group))
    5443           0 :                 goto error_del_extent;
    5444             : 
    5445        3182 :         for (i = 0; i < map->num_stripes; i++) {
    5446        1746 :                 struct btrfs_device *dev = map->stripes[i].dev;
    5447             : 
    5448        1746 :                 btrfs_device_set_bytes_used(dev,
    5449        1746 :                                             dev->bytes_used + ctl->stripe_size);
    5450        1746 :                 if (list_empty(&dev->post_commit_list))
    5451         967 :                         list_add_tail(&dev->post_commit_list,
    5452         967 :                                       &trans->transaction->dev_update_list);
    5453             :         }
    5454             : 
    5455        1436 :         atomic64_sub(ctl->stripe_size * map->num_stripes,
    5456             :                      &info->free_chunk_space);
    5457             : 
    5458        1436 :         free_extent_map(em);
    5459        1436 :         check_raid56_incompat_flag(info, type);
    5460        1436 :         check_raid1c34_incompat_flag(info, type);
    5461             : 
    5462        1436 :         return block_group;
    5463             : 
    5464             : error_del_extent:
    5465           0 :         write_lock(&em_tree->lock);
    5466           0 :         remove_extent_mapping(em_tree, em);
    5467           0 :         write_unlock(&em_tree->lock);
    5468             : 
    5469             :         /* One for our allocation */
    5470           0 :         free_extent_map(em);
    5471             :         /* One for the tree reference */
    5472           0 :         free_extent_map(em);
    5473             : 
    5474           0 :         return block_group;
    5475             : }
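To make the stripe placement loop in create_chunk() concrete, a hand-worked DUP-style case
with assumed numbers (one device, dev_stripes = 2, stripe_size = 256 MiB, dev_offset = 1 GiB):

    s = 0:  stripes[0].physical = 1 GiB + 0 * 256 MiB = 1 GiB
    s = 1:  stripes[1].physical = 1 GiB + 1 * 256 MiB = 1 GiB + 256 MiB

Both copies land back to back inside the single hole that gather_device_info() found on that
device, which is why dev_extent_want there is max_stripe_size * dev_stripes.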
    5476             : 
    5477        1482 : struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
    5478             :                                             u64 type)
    5479             : {
    5480        1482 :         struct btrfs_fs_info *info = trans->fs_info;
    5481        1482 :         struct btrfs_fs_devices *fs_devices = info->fs_devices;
    5482        1482 :         struct btrfs_device_info *devices_info = NULL;
    5483        1482 :         struct alloc_chunk_ctl ctl;
    5484        1482 :         struct btrfs_block_group *block_group;
    5485        1482 :         int ret;
    5486             : 
    5487        1482 :         lockdep_assert_held(&info->chunk_mutex);
    5488             : 
    5489        1482 :         if (!alloc_profile_is_valid(type, 0)) {
    5490             :                 ASSERT(0);
    5491             :                 return ERR_PTR(-EINVAL);
    5492             :         }
    5493             : 
    5494        1482 :         if (list_empty(&fs_devices->alloc_list)) {
    5495             :                 if (btrfs_test_opt(info, ENOSPC_DEBUG))
    5496             :                         btrfs_debug(info, "%s: no writable device", __func__);
    5497             :                 return ERR_PTR(-ENOSPC);
    5498             :         }
    5499             : 
    5500        1482 :         if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
    5501           0 :                 btrfs_err(info, "invalid chunk type 0x%llx requested", type);
    5502           0 :                 ASSERT(0);
    5503           0 :                 return ERR_PTR(-EINVAL);
    5504             :         }
    5505             : 
    5506        1482 :         ctl.start = find_next_chunk(info);
    5507        1482 :         ctl.type = type;
    5508        1482 :         init_alloc_chunk_ctl(fs_devices, &ctl);
    5509             : 
    5510        1482 :         devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
    5511             :                                GFP_NOFS);
    5512        1482 :         if (!devices_info)
    5513             :                 return ERR_PTR(-ENOMEM);
    5514             : 
    5515        1482 :         ret = gather_device_info(fs_devices, &ctl, devices_info);
    5516        1482 :         if (ret < 0) {
    5517           0 :                 block_group = ERR_PTR(ret);
    5518           0 :                 goto out;
    5519             :         }
    5520             : 
    5521        1482 :         ret = decide_stripe_size(fs_devices, &ctl, devices_info);
    5522        1482 :         if (ret < 0) {
    5523          46 :                 block_group = ERR_PTR(ret);
    5524          46 :                 goto out;
    5525             :         }
    5526             : 
    5527        1436 :         block_group = create_chunk(trans, &ctl, devices_info);
    5528             : 
    5529        1482 : out:
    5530        1482 :         kfree(devices_info);
    5531        1482 :         return block_group;
    5532             : }
    5533             : 
    5534             : /*
    5535             :  * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to
    5536             :  * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system
    5537             :  * chunks.
    5538             :  *
    5539             :  * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
    5540             :  * phases.
    5541             :  */
    5542        1436 : int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
    5543             :                                      struct btrfs_block_group *bg)
    5544             : {
    5545        1436 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    5546        1436 :         struct btrfs_root *chunk_root = fs_info->chunk_root;
    5547        1436 :         struct btrfs_key key;
    5548        1436 :         struct btrfs_chunk *chunk;
    5549        1436 :         struct btrfs_stripe *stripe;
    5550        1436 :         struct extent_map *em;
    5551        1436 :         struct map_lookup *map;
    5552        1436 :         size_t item_size;
    5553        1436 :         int i;
    5554        1436 :         int ret;
    5555             : 
    5556             :         /*
    5557             :          * We take the chunk_mutex for 2 reasons:
    5558             :          *
    5559             :          * 1) Updates and insertions in the chunk btree must be done while holding
    5560             :          *    the chunk_mutex, as well as updating the system chunk array in the
    5561             :          *    superblock. See the comment on top of btrfs_chunk_alloc() for the
    5562             :          *    details;
    5563             :          *
    5564             :          * 2) To prevent races with the final phase of a device replace operation
    5565             :          *    that replaces the device object associated with the map's stripes,
    5566             :          *    because the device object's id can change at any time during that
    5567             :          *    final phase of the device replace operation
    5568             :          *    (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
    5569             :          *    replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
    5570             :          *    which would cause a failure when updating the device item, which does
    5571             :          *    not exist, or persisting a stripe of the chunk item with such an ID.
    5572             :          *    Here we can't use the device_list_mutex because our caller already
    5573             :          *    has locked the chunk_mutex, and the final phase of device replace
    5574             :          *    acquires both mutexes - first the device_list_mutex and then the
    5575             :          *    chunk_mutex. Using any of those two mutexes protects us from a
    5576             :          *    concurrent device replace.
    5577             :          */
    5578        1436 :         lockdep_assert_held(&fs_info->chunk_mutex);
    5579             : 
    5580        1436 :         em = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
    5581        1436 :         if (IS_ERR(em)) {
    5582           0 :                 ret = PTR_ERR(em);
    5583           0 :                 btrfs_abort_transaction(trans, ret);
    5584           0 :                 return ret;
    5585             :         }
    5586             : 
    5587        1436 :         map = em->map_lookup;
    5588        1436 :         item_size = btrfs_chunk_item_size(map->num_stripes);
    5589             : 
    5590        1436 :         chunk = kzalloc(item_size, GFP_NOFS);
    5591        1436 :         if (!chunk) {
    5592           0 :                 ret = -ENOMEM;
    5593           0 :                 btrfs_abort_transaction(trans, ret);
    5594           0 :                 goto out;
    5595             :         }
    5596             : 
    5597        3182 :         for (i = 0; i < map->num_stripes; i++) {
    5598        1746 :                 struct btrfs_device *device = map->stripes[i].dev;
    5599             : 
    5600        1746 :                 ret = btrfs_update_device(trans, device);
    5601        1746 :                 if (ret)
    5602           0 :                         goto out;
    5603             :         }
    5604             : 
    5605        1436 :         stripe = &chunk->stripe;
    5606        3182 :         for (i = 0; i < map->num_stripes; i++) {
    5607        1746 :                 struct btrfs_device *device = map->stripes[i].dev;
    5608        1746 :                 const u64 dev_offset = map->stripes[i].physical;
    5609             : 
    5610        1746 :                 btrfs_set_stack_stripe_devid(stripe, device->devid);
    5611        1746 :                 btrfs_set_stack_stripe_offset(stripe, dev_offset);
    5612        3492 :                 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
    5613        1746 :                 stripe++;
    5614             :         }
    5615             : 
    5616        1436 :         btrfs_set_stack_chunk_length(chunk, bg->length);
    5617        1436 :         btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID);
    5618        1436 :         btrfs_set_stack_chunk_stripe_len(chunk, BTRFS_STRIPE_LEN);
    5619        1436 :         btrfs_set_stack_chunk_type(chunk, map->type);
    5620        1436 :         btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
    5621        1436 :         btrfs_set_stack_chunk_io_align(chunk, BTRFS_STRIPE_LEN);
    5622        1436 :         btrfs_set_stack_chunk_io_width(chunk, BTRFS_STRIPE_LEN);
    5623        1436 :         btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
    5624        1436 :         btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
    5625             : 
    5626        1436 :         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
    5627        1436 :         key.type = BTRFS_CHUNK_ITEM_KEY;
    5628        1436 :         key.offset = bg->start;
    5629             : 
    5630        1436 :         ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
    5631        1436 :         if (ret)
    5632           0 :                 goto out;
    5633             : 
    5634        1436 :         set_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, &bg->runtime_flags);
    5635             : 
    5636        1436 :         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
    5637          93 :                 ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
    5638          93 :                 if (ret)
    5639           0 :                         goto out;
    5640             :         }
    5641             : 
    5642        1436 : out:
    5643        1436 :         kfree(chunk);
    5644        1436 :         free_extent_map(em);
    5645        1436 :         return ret;
    5646             : }
    5647             : 
    5648           0 : static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
    5649             : {
    5650           0 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    5651           0 :         u64 alloc_profile;
    5652           0 :         struct btrfs_block_group *meta_bg;
    5653           0 :         struct btrfs_block_group *sys_bg;
    5654             : 
    5655             :         /*
    5656             :          * When adding a new device for sprouting, the seed device is read-only
    5657             :          * so we must first allocate a metadata and a system chunk. But before
    5658             :          * adding the block group items to the extent, device and chunk btrees,
    5659             :          * we must first:
    5660             :          *
    5661             :          * 1) Create both chunks without doing any changes to the btrees, as
    5662             :          *    otherwise we would get -ENOSPC since the block groups from the
    5663             :          *    seed device are read-only;
    5664             :          *
    5665             :          * 2) Add the device item for the new sprout device - finishing the setup
    5666             :          *    of a new block group requires updating the device item in the chunk
    5667             :          *    btree, so it must exist when we attempt to do it. The previous step
    5668             :          *    ensures this does not fail with -ENOSPC.
    5669             :          *
    5670             :          * After that we can add the block group items to their btrees:
    5671             :          * update existing device item in the chunk btree, add a new block group
    5672             :          * item to the extent btree, add a new chunk item to the chunk btree and
    5673             :          * finally add the new device extent items to the devices btree.
    5674             :          */
    5675             : 
    5676           0 :         alloc_profile = btrfs_metadata_alloc_profile(fs_info);
    5677           0 :         meta_bg = btrfs_create_chunk(trans, alloc_profile);
    5678           0 :         if (IS_ERR(meta_bg))
    5679           0 :                 return PTR_ERR(meta_bg);
    5680             : 
    5681           0 :         alloc_profile = btrfs_system_alloc_profile(fs_info);
    5682           0 :         sys_bg = btrfs_create_chunk(trans, alloc_profile);
    5683           0 :         if (IS_ERR(sys_bg))
    5684           0 :                 return PTR_ERR(sys_bg);
    5685             : 
    5686             :         return 0;
    5687             : }
    5688             : 
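Concretely, this just reads the tolerated_failures field for the chunk's profile out of
btrfs_raid_array: 0 for SINGLE/RAID0/DUP, 1 for RAID1/RAID10/RAID5, 2 for RAID6, and so on
(the btrfs_raid_array table is the authoritative source for these values).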
    5689             : static inline int btrfs_chunk_max_errors(struct map_lookup *map)
    5690             : {
    5691     8564271 :         const int index = btrfs_bg_flags_to_raid_index(map->type);
    5692             : 
    5693     8564271 :         return btrfs_raid_array[index].tolerated_failures;
    5694             : }
    5695             : 
    5696       24971 : bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset)
    5697             : {
    5698       24971 :         struct extent_map *em;
    5699       24971 :         struct map_lookup *map;
    5700       24971 :         int miss_ndevs = 0;
    5701       24971 :         int i;
    5702       24971 :         bool ret = true;
    5703             : 
    5704       24971 :         em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
    5705       24971 :         if (IS_ERR(em))
    5706             :                 return false;
    5707             : 
    5708       24971 :         map = em->map_lookup;
    5709       56998 :         for (i = 0; i < map->num_stripes; i++) {
    5710       64054 :                 if (test_bit(BTRFS_DEV_STATE_MISSING,
    5711             :                                         &map->stripes[i].dev->dev_state)) {
    5712           0 :                         miss_ndevs++;
    5713           0 :                         continue;
    5714             :                 }
    5715       32027 :                 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
    5716             :                                         &map->stripes[i].dev->dev_state)) {
    5717           0 :                         ret = false;
    5718           0 :                         goto end;
    5719             :                 }
    5720             :         }
    5721             : 
    5722             :         /*
    5723             :          * If the number of missing devices is larger than max errors, we can
    5724             :          * not write the data into that chunk successfully.
    5725             :          */
    5726       24971 :         if (miss_ndevs > btrfs_chunk_max_errors(map))
    5727           0 :                 ret = false;
    5728       24971 : end:
    5729       24971 :         free_extent_map(em);
    5730       24971 :         return ret;
    5731             : }
    5732             : 
    5733        3242 : void btrfs_mapping_tree_free(struct extent_map_tree *tree)
    5734             : {
    5735       29085 :         struct extent_map *em;
    5736             : 
    5737       54928 :         while (1) {
    5738       29085 :                 write_lock(&tree->lock);
    5739       29085 :                 em = lookup_extent_mapping(tree, 0, (u64)-1);
    5740       29085 :                 if (em)
    5741       25843 :                         remove_extent_mapping(tree, em);
    5742       29085 :                 write_unlock(&tree->lock);
    5743       29085 :                 if (!em)
    5744             :                         break;
    5745             :                 /* once for us */
    5746       25843 :                 free_extent_map(em);
    5747             :                 /* once for the tree */
    5748       25843 :                 free_extent_map(em);
    5749             :         }
    5750        3242 : }
    5751             : 
    5752    14849584 : int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
    5753             : {
    5754    14849584 :         struct extent_map *em;
    5755    14849584 :         struct map_lookup *map;
    5756    14849584 :         enum btrfs_raid_types index;
    5757    14849584 :         int ret = 1;
    5758             : 
    5759    14849584 :         em = btrfs_get_chunk_map(fs_info, logical, len);
    5760    14851898 :         if (IS_ERR(em))
    5761             :                 /*
    5762             :                  * We could return errors for these cases, but that could get
    5763             :                  * ugly, and we'd probably end up doing the same thing anyway:
    5764             :                  * nothing else but exit. So return 1 so the callers don't try
    5765             :                  * to use other copies.
    5766             :                  */
    5767             :                 return 1;
    5768             : 
    5769    14851898 :         map = em->map_lookup;
    5770    14851898 :         index = btrfs_bg_flags_to_raid_index(map->type);
    5771             : 
    5772             :         /* Non-RAID56, use their ncopies from btrfs_raid_array. */
    5773    14851898 :         if (!(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK))
    5774    14851898 :                 ret = btrfs_raid_array[index].ncopies;
    5775           0 :         else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
    5776             :                 ret = 2;
    5777           0 :         else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
    5778             :                 /*
    5779             :                  * There could be two corrupted data stripes, we need
    5780             :                  * to loop retry in order to rebuild the correct data.
    5781             :                  *
    5782             :                  * Fail a stripe at a time on every retry except the
    5783             :                  * stripe under reconstruction.
    5784             :                  */
    5785           0 :                 ret = map->num_stripes;
    5786    14851898 :         free_extent_map(em);
    5787    14851898 :         return ret;
    5788             : }
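
To make the copy counts above concrete, here is a minimal user-space sketch of the same decision. The profile names and copy counts are hard-coded assumptions mirroring btrfs_raid_array (SINGLE/RAID0 have 1 copy, DUP/RAID1/RAID10 have 2), not the kernel structures themselves:

        /*
         * Minimal user-space sketch of the copy-count logic above.  The
         * profile table is an assumption mirroring btrfs_raid_array.
         */
        #include <stdio.h>

        enum profile { SINGLE, RAID0, DUP, RAID1, RAID10, RAID5, RAID6 };

        static int num_copies(enum profile p, int num_stripes)
        {
                switch (p) {
                case DUP:
                case RAID1:
                case RAID10:
                        return 2;               /* ncopies from the raid attr table */
                case RAID5:
                        return 2;               /* data, or rebuilt from parity */
                case RAID6:
                        return num_stripes;     /* fail one stripe per retry */
                default:
                        return 1;               /* SINGLE / RAID0 */
                }
        }

        int main(void)
        {
                printf("RAID1: %d copies\n", num_copies(RAID1, 2));     /* 2 */
                printf("RAID6: %d copies\n", num_copies(RAID6, 6));     /* 6 */
                return 0;
        }
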
    5789             : 
    5790       26407 : unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
    5791             :                                     u64 logical)
    5792             : {
    5793       26407 :         struct extent_map *em;
    5794       26407 :         struct map_lookup *map;
    5795       26407 :         unsigned long len = fs_info->sectorsize;
    5796             : 
    5797       26407 :         if (!btrfs_fs_incompat(fs_info, RAID56))
    5798             :                 return len;
    5799             : 
    5800           0 :         em = btrfs_get_chunk_map(fs_info, logical, len);
    5801             : 
    5802           0 :         if (!WARN_ON(IS_ERR(em))) {
    5803           0 :                 map = em->map_lookup;
    5804           0 :                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
    5805           0 :                         len = btrfs_stripe_nr_to_offset(nr_data_stripes(map));
    5806           0 :                 free_extent_map(em);
    5807             :         }
    5808             :         return len;
    5809             : }
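
For RAID56 the full stripe length is nr_data_stripes times BTRFS_STRIPE_LEN; for everything else it is just one sector. A short worked example, assuming the 64 KiB BTRFS_STRIPE_LEN used by current kernels:

        /*
         * Worked example for the full-stripe length, assuming a 64 KiB
         * BTRFS_STRIPE_LEN (an assumption for illustration).
         */
        #include <stdio.h>

        #define STRIPE_LEN      (64 * 1024ULL)

        int main(void)
        {
                int num_stripes = 4;            /* RAID5 over 4 devices */
                int nparity = 1;                /* one parity stripe */
                int nr_data_stripes = num_stripes - nparity;

                /* 3 data stripes * 64 KiB = 196608 bytes (192 KiB) */
                printf("full stripe len = %llu\n", nr_data_stripes * STRIPE_LEN);
                return 0;
        }
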
    5810             : 
    5811           0 : int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
    5812             : {
    5813           0 :         struct extent_map *em;
    5814           0 :         struct map_lookup *map;
    5815           0 :         int ret = 0;
    5816             : 
    5817           0 :         if (!btrfs_fs_incompat(fs_info, RAID56))
    5818             :                 return 0;
    5819             : 
    5820           0 :         em = btrfs_get_chunk_map(fs_info, logical, len);
    5821             : 
    5822           0 :         if (!WARN_ON(IS_ERR(em))) {
    5823           0 :                 map = em->map_lookup;
    5824           0 :                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
    5825           0 :                         ret = 1;
    5826           0 :                 free_extent_map(em);
    5827             :         }
    5828             :         return ret;
    5829             : }
    5830             : 
    5831           9 : static int find_live_mirror(struct btrfs_fs_info *fs_info,
    5832             :                             struct map_lookup *map, int first,
    5833             :                             int dev_replace_is_ongoing)
    5834             : {
    5835           9 :         int i;
    5836           9 :         int num_stripes;
    5837           9 :         int preferred_mirror;
    5838           9 :         int tolerance;
    5839           9 :         struct btrfs_device *srcdev;
    5840             : 
    5841           9 :         ASSERT((map->type &
    5842             :                  (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));
    5843             : 
    5844           9 :         if (map->type & BTRFS_BLOCK_GROUP_RAID10)
    5845           0 :                 num_stripes = map->sub_stripes;
    5846             :         else
    5847           9 :                 num_stripes = map->num_stripes;
    5848             : 
    5849           9 :         switch (fs_info->fs_devices->read_policy) {
    5850           0 :         default:
    5851             :                 /* Shouldn't happen, just warn and use pid instead of failing */
    5852           0 :                 btrfs_warn_rl(fs_info,
    5853             :                               "unknown read_policy type %u, reset to pid",
    5854             :                               fs_info->fs_devices->read_policy);
    5855           0 :                 fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID;
    5856           9 :                 fallthrough;
    5857           9 :         case BTRFS_READ_POLICY_PID:
    5858           9 :                 preferred_mirror = first + (current->pid % num_stripes);
    5859           9 :                 break;
    5860             :         }
    5861             : 
    5862           9 :         if (dev_replace_is_ongoing &&
    5863           0 :             fs_info->dev_replace.cont_reading_from_srcdev_mode ==
    5864             :              BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
    5865           0 :                 srcdev = fs_info->dev_replace.srcdev;
    5866             :         else
    5867             :                 srcdev = NULL;
    5868             : 
    5869             :         /*
    5870             :          * Try to avoid the drive that is the source drive for a
    5871             :          * dev-replace procedure; only choose it if no other non-missing
    5872             :          * mirror is available.
    5873             :          */
    5874           9 :         for (tolerance = 0; tolerance < 2; tolerance++) {
    5875           9 :                 if (map->stripes[preferred_mirror].dev->bdev &&
    5876           9 :                     (tolerance || map->stripes[preferred_mirror].dev != srcdev))
    5877             :                         return preferred_mirror;
    5878           0 :                 for (i = first; i < first + num_stripes; i++) {
    5879           0 :                         if (map->stripes[i].dev->bdev &&
    5880           0 :                             (tolerance || map->stripes[i].dev != srcdev))
    5881           0 :                                 return i;
    5882             :                 }
    5883             :         }
    5884             : 
    5885             :         /* We couldn't find one that doesn't fail.  Just return something
    5886             :          * and the I/O error handling code will clean up eventually.
    5887             :          */
    5888             :         return preferred_mirror;
    5889             : }
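
Under the default PID read policy the preferred mirror is simply first + (pid % num_stripes), which spreads readers across mirrors by process. A small sketch of that selection, with the policy reduced to a plain function of the pid (an illustration, not the kernel API):

        /*
         * Sketch of the PID read policy above: different processes land on
         * different mirrors of a RAID1/RAID10 stripe set.
         */
        #include <stdio.h>
        #include <unistd.h>

        static int preferred_mirror(int first, int num_stripes, pid_t pid)
        {
                return first + (pid % num_stripes);
        }

        int main(void)
        {
                int num_stripes = 2;    /* e.g. RAID1 with two mirrors */

                printf("pid %d prefers mirror %d\n",
                       getpid(), preferred_mirror(0, num_stripes, getpid()));
                return 0;
        }
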
    5890             : 
    5891     8539254 : static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
    5892             :                                                        u16 total_stripes)
    5893             : {
    5894     8539254 :         struct btrfs_io_context *bioc;
    5895             : 
    5896     8539254 :         bioc = kzalloc(
    5897             :                  /* The size of btrfs_io_context */
    5898             :                 sizeof(struct btrfs_io_context) +
    5899             :                 /* Plus the variable array for the stripes */
    5900     8539254 :                 sizeof(struct btrfs_io_stripe) * (total_stripes),
    5901             :                 GFP_NOFS);
    5902             : 
    5903     8539255 :         if (!bioc)
    5904             :                 return NULL;
    5905             : 
    5906     8539255 :         refcount_set(&bioc->refs, 1);
    5907             : 
    5908     8539255 :         bioc->fs_info = fs_info;
    5909     8539255 :         bioc->replace_stripe_src = -1;
    5910     8539255 :         bioc->full_stripe_logical = (u64)-1;
    5911             : 
    5912     8539255 :         return bioc;
    5913             : }
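
The allocation above sizes the context and its stripe array in one go. A minimal user-space sketch of the same pattern, using a flexible array member and malloc in place of kzalloc (names here are illustrative, not the kernel types):

        /*
         * Sketch of a single allocation covering a fixed header plus a
         * trailing array of stripes.
         */
        #include <stdlib.h>
        #include <string.h>

        struct io_stripe { unsigned long long physical; };

        struct io_context {
                int num_stripes;
                struct io_stripe stripes[];     /* flexible array member */
        };

        static struct io_context *alloc_io_context(int total_stripes)
        {
                size_t size = sizeof(struct io_context) +
                              sizeof(struct io_stripe) * total_stripes;
                struct io_context *ctx = malloc(size);

                if (!ctx)
                        return NULL;
                memset(ctx, 0, size);           /* zeroed, like kzalloc */
                ctx->num_stripes = total_stripes;
                return ctx;
        }

        int main(void)
        {
                struct io_context *ctx = alloc_io_context(2);

                free(ctx);
                return 0;
        }
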
    5914             : 
    5915           0 : void btrfs_get_bioc(struct btrfs_io_context *bioc)
    5916             : {
    5917           0 :         WARN_ON(!refcount_read(&bioc->refs));
    5918           0 :         refcount_inc(&bioc->refs);
    5919           0 : }
    5920             : 
    5921     8539326 : void btrfs_put_bioc(struct btrfs_io_context *bioc)
    5922             : {
    5923     8539326 :         if (!bioc)
    5924             :                 return;
    5925     8539326 :         if (refcount_dec_and_test(&bioc->refs))
    5926     8539326 :                 kfree(bioc);
    5927             : }
    5928             : 
    5929             : /*
    5930             :  * Please note that discard won't be sent to the target device of a
    5931             :  * device replace.
    5932             :  */
    5933       33963 : struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
    5934             :                                                u64 logical, u64 *length_ret,
    5935             :                                                u32 *num_stripes)
    5936             : {
    5937       33963 :         struct extent_map *em;
    5938       33963 :         struct map_lookup *map;
    5939       33963 :         struct btrfs_discard_stripe *stripes;
    5940       33963 :         u64 length = *length_ret;
    5941       33963 :         u64 offset;
    5942       33963 :         u32 stripe_nr;
    5943       33963 :         u32 stripe_nr_end;
    5944       33963 :         u32 stripe_cnt;
    5945       33963 :         u64 stripe_end_offset;
    5946       33963 :         u64 stripe_offset;
    5947       33963 :         u32 stripe_index;
    5948       33963 :         u32 factor = 0;
    5949       33963 :         u32 sub_stripes = 0;
    5950       33963 :         u32 stripes_per_dev = 0;
    5951       33963 :         u32 remaining_stripes = 0;
    5952       33963 :         u32 last_stripe = 0;
    5953       33963 :         int ret;
    5954       33963 :         int i;
    5955             : 
    5956       33963 :         em = btrfs_get_chunk_map(fs_info, logical, length);
    5957       33963 :         if (IS_ERR(em))
    5958             :                 return ERR_CAST(em);
    5959             : 
    5960       33963 :         map = em->map_lookup;
    5961             : 
    5962             :         /* We don't discard RAID56 yet. */
    5963       33963 :         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
    5964           0 :                 ret = -EOPNOTSUPP;
    5965           0 :                 goto out_free_map;
    5966             :         }
    5967             : 
    5968       33963 :         offset = logical - em->start;
    5969       33963 :         length = min_t(u64, em->start + em->len - logical, length);
    5970       33963 :         *length_ret = length;
    5971             : 
    5972             :         /*
    5973             :          * stripe_nr counts the total number of stripes we have to stride
    5974             :          * to get to this block
    5975             :          */
    5976       33963 :         stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;
    5977             : 
    5978             :         /* stripe_offset is the offset of this block in its stripe */
    5979       33963 :         stripe_offset = offset - btrfs_stripe_nr_to_offset(stripe_nr);
    5980             : 
    5981       33963 :         stripe_nr_end = round_up(offset + length, BTRFS_STRIPE_LEN) >>
    5982             :                         BTRFS_STRIPE_LEN_SHIFT;
    5983       33963 :         stripe_cnt = stripe_nr_end - stripe_nr;
    5984       33963 :         stripe_end_offset = btrfs_stripe_nr_to_offset(stripe_nr_end) -
    5985             :                             (offset + length);
    5986             :         /*
    5987             :          * After this, stripe_nr is the number of stripes on this
    5988             :          * device we have to walk to find the data, and stripe_index is
    5989             :          * the number of our device in the stripe array.
    5990             :          */
    5991       33963 :         *num_stripes = 1;
    5992       33963 :         stripe_index = 0;
    5993       33963 :         if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
    5994             :                          BTRFS_BLOCK_GROUP_RAID10)) {
    5995           0 :                 if (map->type & BTRFS_BLOCK_GROUP_RAID0)
    5996             :                         sub_stripes = 1;
    5997             :                 else
    5998           0 :                         sub_stripes = map->sub_stripes;
    5999             : 
    6000           0 :                 factor = map->num_stripes / sub_stripes;
    6001           0 :                 *num_stripes = min_t(u64, map->num_stripes,
    6002             :                                     sub_stripes * stripe_cnt);
    6003           0 :                 stripe_index = stripe_nr % factor;
    6004           0 :                 stripe_nr /= factor;
    6005           0 :                 stripe_index *= sub_stripes;
    6006             : 
    6007           0 :                 remaining_stripes = stripe_cnt % factor;
    6008           0 :                 stripes_per_dev = stripe_cnt / factor;
    6009           0 :                 last_stripe = ((stripe_nr_end - 1) % factor) * sub_stripes;
    6010       33963 :         } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
    6011             :                                 BTRFS_BLOCK_GROUP_DUP)) {
    6012       11384 :                 *num_stripes = map->num_stripes;
    6013             :         } else {
    6014       22579 :                 stripe_index = stripe_nr % map->num_stripes;
    6015       22579 :                 stripe_nr /= map->num_stripes;
    6016             :         }
    6017             : 
    6018       33963 :         stripes = kcalloc(*num_stripes, sizeof(*stripes), GFP_NOFS);
    6019       33963 :         if (!stripes) {
    6020           0 :                 ret = -ENOMEM;
    6021           0 :                 goto out_free_map;
    6022             :         }
    6023             : 
    6024       79310 :         for (i = 0; i < *num_stripes; i++) {
    6025       45347 :                 stripes[i].physical =
    6026       45347 :                         map->stripes[stripe_index].physical +
    6027       45347 :                         stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr);
    6028       45347 :                 stripes[i].dev = map->stripes[stripe_index].dev;
    6029             : 
    6030       45347 :                 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
    6031             :                                  BTRFS_BLOCK_GROUP_RAID10)) {
    6032           0 :                         stripes[i].length = btrfs_stripe_nr_to_offset(stripes_per_dev);
    6033             : 
    6034           0 :                         if (i / sub_stripes < remaining_stripes)
    6035           0 :                                 stripes[i].length += BTRFS_STRIPE_LEN;
    6036             : 
    6037             :                         /*
    6038             :                          * Special for the first stripe and
    6039             :                          * the last stripe:
    6040             :                          *
    6041             :                          * |-------|...|-------|
    6042             :                          *     |----------|
    6043             :                          *    off     end_off
    6044             :                          */
    6045           0 :                         if (i < sub_stripes)
    6046           0 :                                 stripes[i].length -= stripe_offset;
    6047             : 
    6048           0 :                         if (stripe_index >= last_stripe &&
    6049           0 :                             stripe_index <= (last_stripe +
    6050           0 :                                              sub_stripes - 1))
    6051           0 :                                 stripes[i].length -= stripe_end_offset;
    6052             : 
    6053           0 :                         if (i == sub_stripes - 1)
    6054           0 :                                 stripe_offset = 0;
    6055             :                 } else {
    6056       45347 :                         stripes[i].length = length;
    6057             :                 }
    6058             : 
    6059       45347 :                 stripe_index++;
    6060       45347 :                 if (stripe_index == map->num_stripes) {
    6061       33963 :                         stripe_index = 0;
    6062       33963 :                         stripe_nr++;
    6063             :                 }
    6064             :         }
    6065             : 
    6066       33963 :         free_extent_map(em);
    6067       33963 :         return stripes;
    6068           0 : out_free_map:
    6069           0 :         free_extent_map(em);
    6070           0 :         return ERR_PTR(ret);
    6071             : }
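
The stripe bookkeeping at the top of this function is easier to follow with numbers. A worked example, assuming the 64 KiB BTRFS_STRIPE_LEN (so a shift of 16), for a discard starting 160 KiB into the chunk and covering 192 KiB:

        /*
         * Worked example of the discard stripe math above, assuming a
         * 64 KiB stripe length (shift of 16).
         */
        #include <stdio.h>

        #define STRIPE_SHIFT    16
        #define STRIPE_LEN      (1ULL << STRIPE_SHIFT)

        int main(void)
        {
                unsigned long long offset = 160 * 1024;         /* offset into the chunk */
                unsigned long long length = 192 * 1024;         /* discard length */

                unsigned int stripe_nr = offset >> STRIPE_SHIFT;                 /* 2 */
                unsigned long long stripe_offset =
                        offset - ((unsigned long long)stripe_nr << STRIPE_SHIFT); /* 32 KiB */
                unsigned int stripe_nr_end =
                        ((offset + length + STRIPE_LEN - 1) & ~(STRIPE_LEN - 1))
                        >> STRIPE_SHIFT;                                         /* 6 */
                unsigned int stripe_cnt = stripe_nr_end - stripe_nr;             /* 4 */
                unsigned long long stripe_end_offset =
                        ((unsigned long long)stripe_nr_end << STRIPE_SHIFT)
                        - (offset + length);                                     /* 32 KiB */

                printf("stripe_nr=%u stripe_offset=%llu stripe_cnt=%u end_offset=%llu\n",
                       stripe_nr, stripe_offset, stripe_cnt, stripe_end_offset);
                return 0;
        }
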
    6072             : 
    6073           0 : static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
    6074             : {
    6075           0 :         struct btrfs_block_group *cache;
    6076           0 :         bool ret;
    6077             : 
    6078             :         /* Non-zoned filesystems do not use the "to_copy" flag */
    6079           0 :         if (!btrfs_is_zoned(fs_info))
    6080             :                 return false;
    6081             : 
    6082           0 :         cache = btrfs_lookup_block_group(fs_info, logical);
    6083             : 
    6084           0 :         ret = test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
    6085             : 
    6086           0 :         btrfs_put_block_group(cache);
    6087           0 :         return ret;
    6088             : }
    6089             : 
    6090           0 : static void handle_ops_on_dev_replace(enum btrfs_map_op op,
    6091             :                                       struct btrfs_io_context *bioc,
    6092             :                                       struct btrfs_dev_replace *dev_replace,
    6093             :                                       u64 logical,
    6094             :                                       int *num_stripes_ret, int *max_errors_ret)
    6095             : {
    6096           0 :         u64 srcdev_devid = dev_replace->srcdev->devid;
    6097             :         /*
    6098             :          * At this stage, num_stripes is still the real number of stripes,
    6099             :          * excluding the duplicated stripes.
    6100             :          */
    6101           0 :         int num_stripes = *num_stripes_ret;
    6102           0 :         int nr_extra_stripes = 0;
    6103           0 :         int max_errors = *max_errors_ret;
    6104           0 :         int i;
    6105             : 
    6106             :         /*
    6107             :          * A block group which has "to_copy" set will eventually be copied by
    6108             :          * the dev-replace process. We can avoid cloning IO here.
    6109             :          */
    6110           0 :         if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical))
    6111             :                 return;
    6112             : 
    6113             :         /*
    6114             :          * Duplicate the write operations while the dev-replace procedure is
    6115             :          * running. Since the copying of the old disk to the new disk takes
    6116             :          * place at run time while the filesystem is mounted writable, the
    6117             :          * regular write operations to the old disk have to be duplicated to go
    6118             :          * to the new disk as well.
    6119             :          *
    6120             :          * Note that device->missing is handled by the caller, and that the
    6121             :          * write to the old disk is already set up in the stripes array.
    6122             :          */
    6123           0 :         for (i = 0; i < num_stripes; i++) {
    6124           0 :                 struct btrfs_io_stripe *old = &bioc->stripes[i];
    6125           0 :                 struct btrfs_io_stripe *new = &bioc->stripes[num_stripes + nr_extra_stripes];
    6126             : 
    6127           0 :                 if (old->dev->devid != srcdev_devid)
    6128           0 :                         continue;
    6129             : 
    6130           0 :                 new->physical = old->physical;
    6131           0 :                 new->dev = dev_replace->tgtdev;
    6132           0 :                 if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK)
    6133           0 :                         bioc->replace_stripe_src = i;
    6134           0 :                 nr_extra_stripes++;
    6135             :         }
    6136             : 
    6137             :         /* We can only have at most 2 extra nr_stripes (for DUP). */
    6138           0 :         ASSERT(nr_extra_stripes <= 2);
    6139             :         /*
    6140             :          * For GET_READ_MIRRORS, we can return at most 1 extra stripe for
    6141             :          * replace.
    6142             :          * If we have 2 extra stripes, only keep the one with the smaller physical.
    6143             :          */
    6144           0 :         if (op == BTRFS_MAP_GET_READ_MIRRORS && nr_extra_stripes == 2) {
    6145           0 :                 struct btrfs_io_stripe *first = &bioc->stripes[num_stripes];
    6146           0 :                 struct btrfs_io_stripe *second = &bioc->stripes[num_stripes + 1];
    6147             : 
    6148             :                 /* Only DUP can have two extra stripes. */
    6149           0 :                 ASSERT(bioc->map_type & BTRFS_BLOCK_GROUP_DUP);
    6150             : 
    6151             :                 /*
    6152             :                  * Swap the two extra stripes and reduce @nr_extra_stripes.
    6153             :                  * The extra stripe would still be there, but won't be accessed.
    6154             :                  */
    6155           0 :                 if (first->physical > second->physical) {
    6156           0 :                         swap(second->physical, first->physical);
    6157           0 :                         swap(second->dev, first->dev);
    6158           0 :                         nr_extra_stripes--;
    6159             :                 }
    6160             :         }
    6161             : 
    6162           0 :         *num_stripes_ret = num_stripes + nr_extra_stripes;
    6163           0 :         *max_errors_ret = max_errors + nr_extra_stripes;
    6164           0 :         bioc->replace_nr_stripes = nr_extra_stripes;
    6165             : }
    6166             : 
    6167    14786629 : static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op,
    6168             :                             u64 offset, u32 *stripe_nr, u64 *stripe_offset,
    6169             :                             u64 *full_stripe_start)
    6170             : {
    6171             :         /*
    6172             :          * Stripe_nr is the stripe where this block falls.  stripe_offset is
    6173             :          * the offset of this block in its stripe.
    6174             :          */
    6175    14786629 :         *stripe_offset = offset & BTRFS_STRIPE_LEN_MASK;
    6176    14786629 :         *stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;
    6177    14786629 :         ASSERT(*stripe_offset < U32_MAX);
    6178             : 
    6179    14786629 :         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
    6180           0 :                 unsigned long full_stripe_len =
    6181           0 :                         btrfs_stripe_nr_to_offset(nr_data_stripes(map));
    6182             : 
    6183             :                 /*
    6184             :                  * For the full stripe start, we use the previously calculated
    6185             :                  * @stripe_nr. Align it to nr_data_stripes, then multiply by
    6186             :                  * STRIPE_LEN.
    6187             :                  *
    6188             :                  * This way we avoid u64 division completely.  And we have
    6189             :                  * to use rounddown(), not round_down(), as nr_data_stripes is
    6190             :                  * not guaranteed to be a power of 2.
    6191             :                  */
    6192           0 :                 *full_stripe_start =
    6193             :                         btrfs_stripe_nr_to_offset(
    6194           0 :                                 rounddown(*stripe_nr, nr_data_stripes(map)));
    6195             : 
    6196           0 :                 ASSERT(*full_stripe_start + full_stripe_len > offset);
    6197           0 :                 ASSERT(*full_stripe_start <= offset);
    6198             :                 /*
    6199             :                  * For writes to RAID56, allow to write a full stripe set, but
    6200             :                  * no straddling of stripe sets.
    6201             :                  */
    6202           0 :                 if (op == BTRFS_MAP_WRITE)
    6203           0 :                         return full_stripe_len - (offset - *full_stripe_start);
    6204             :         }
    6205             : 
    6206             :         /*
    6207             :          * For other RAID types and for RAID56 reads, allow a single stripe (on
    6208             :          * a single disk).
    6209             :          */
    6210    14786629 :         if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK)
    6211           0 :                 return BTRFS_STRIPE_LEN - *stripe_offset;
    6212             :         return U64_MAX;
    6213             : }
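
For striped profiles a single mapped I/O never crosses a stripe boundary, so the returned length is BTRFS_STRIPE_LEN - stripe_offset; RAID56 writes are instead allowed to cover the rest of the full stripe. A minimal sketch of the striped-profile limit, with the stripe length assumed to be 64 KiB:

        /*
         * Sketch of the I/O length limit above for striped (non-RAID56-write)
         * profiles, assuming a 64 KiB stripe length.
         */
        #include <stdio.h>

        #define STRIPE_LEN      (64 * 1024ULL)

        static unsigned long long max_io_len(unsigned long long offset)
        {
                unsigned long long stripe_offset = offset % STRIPE_LEN;

                return STRIPE_LEN - stripe_offset;      /* stop at the stripe boundary */
        }

        int main(void)
        {
                /* 32 KiB into a stripe: at most 32 KiB before the next boundary. */
                printf("max len = %llu\n", max_io_len(160 * 1024));
                return 0;
        }
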
    6214             : 
    6215             : static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map,
    6216             :                           u32 stripe_index, u64 stripe_offset, u32 stripe_nr)
    6217             : {
    6218    23322589 :         dst->dev = map->stripes[stripe_index].dev;
    6219    23322589 :         dst->physical = map->stripes[stripe_index].physical +
    6220    23322589 :                         stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr);
    6221             : }
    6222             : 
    6223    14784407 : int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
    6224             :                     u64 logical, u64 *length,
    6225             :                     struct btrfs_io_context **bioc_ret,
    6226             :                     struct btrfs_io_stripe *smap, int *mirror_num_ret,
    6227             :                     int need_raid_map)
    6228             : {
    6229    14784407 :         struct extent_map *em;
    6230    14784407 :         struct map_lookup *map;
    6231    14784407 :         u64 map_offset;
    6232    14784407 :         u64 stripe_offset;
    6233    14784407 :         u32 stripe_nr;
    6234    14784407 :         u32 stripe_index;
    6235    14784407 :         int data_stripes;
    6236    14784407 :         int i;
    6237    14784407 :         int ret = 0;
    6238    14784407 :         int mirror_num = (mirror_num_ret ? *mirror_num_ret : 0);
    6239    14784407 :         int num_stripes;
    6240    14784407 :         int num_copies;
    6241    14784407 :         int max_errors = 0;
    6242    14784407 :         struct btrfs_io_context *bioc = NULL;
    6243    14784407 :         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
    6244    14784407 :         int dev_replace_is_ongoing = 0;
    6245    14784407 :         u16 num_alloc_stripes;
    6246    14784407 :         u64 raid56_full_stripe_start = (u64)-1;
    6247    14784407 :         u64 max_len;
    6248             : 
    6249    14784407 :         ASSERT(bioc_ret);
    6250             : 
    6251    14784407 :         num_copies = btrfs_num_copies(fs_info, logical, fs_info->sectorsize);
    6252    14786722 :         if (mirror_num > num_copies)
    6253             :                 return -EINVAL;
    6254             : 
    6255    14786722 :         em = btrfs_get_chunk_map(fs_info, logical, *length);
    6256    14786559 :         if (IS_ERR(em))
    6257           0 :                 return PTR_ERR(em);
    6258             : 
    6259    14786559 :         map = em->map_lookup;
    6260    14786559 :         data_stripes = nr_data_stripes(map);
    6261             : 
    6262    14786559 :         map_offset = logical - em->start;
    6263    14786559 :         max_len = btrfs_max_io_len(map, op, map_offset, &stripe_nr,
    6264             :                                    &stripe_offset, &raid56_full_stripe_start);
    6265    14786396 :         *length = min_t(u64, em->len - map_offset, max_len);
    6266             : 
    6267    14786396 :         down_read(&dev_replace->rwsem);
    6268    14782082 :         dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
    6269             :         /*
    6270             :          * Hold the semaphore for read during the whole operation, write is
    6271             :          * requested at commit time but must wait.
    6272             :          */
    6273    14782082 :         if (!dev_replace_is_ongoing)
    6274    14782168 :                 up_read(&dev_replace->rwsem);
    6275             : 
    6276    14783997 :         num_stripes = 1;
    6277    14783997 :         stripe_index = 0;
    6278    14783997 :         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
    6279           0 :                 stripe_index = stripe_nr % map->num_stripes;
    6280           0 :                 stripe_nr /= map->num_stripes;
    6281           0 :                 if (op == BTRFS_MAP_READ)
    6282           0 :                         mirror_num = 1;
    6283    14783997 :         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
    6284           9 :                 if (op != BTRFS_MAP_READ) {
    6285           0 :                         num_stripes = map->num_stripes;
    6286           9 :                 } else if (mirror_num) {
    6287           0 :                         stripe_index = mirror_num - 1;
    6288             :                 } else {
    6289           9 :                         stripe_index = find_live_mirror(fs_info, map, 0,
    6290             :                                             dev_replace_is_ongoing);
    6291           9 :                         mirror_num = stripe_index + 1;
    6292             :                 }
    6293             : 
    6294    14783988 :         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
    6295     8635250 :                 if (op != BTRFS_MAP_READ) {
    6296     8539256 :                         num_stripes = map->num_stripes;
    6297       95994 :                 } else if (mirror_num) {
    6298         176 :                         stripe_index = mirror_num - 1;
    6299             :                 } else {
    6300             :                         mirror_num = 1;
    6301             :                 }
    6302             : 
    6303     6148738 :         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
    6304           0 :                 u32 factor = map->num_stripes / map->sub_stripes;
    6305             : 
    6306           0 :                 stripe_index = (stripe_nr % factor) * map->sub_stripes;
    6307           0 :                 stripe_nr /= factor;
    6308             : 
    6309           0 :                 if (op != BTRFS_MAP_READ)
    6310           0 :                         num_stripes = map->sub_stripes;
    6311           0 :                 else if (mirror_num)
    6312           0 :                         stripe_index += mirror_num - 1;
    6313             :                 else {
    6314           0 :                         int old_stripe_index = stripe_index;
    6315           0 :                         stripe_index = find_live_mirror(fs_info, map,
    6316             :                                               stripe_index,
    6317             :                                               dev_replace_is_ongoing);
    6318           0 :                         mirror_num = stripe_index - old_stripe_index + 1;
    6319             :                 }
    6320             : 
    6321     6148738 :         } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
    6322           0 :                 if (need_raid_map && (op != BTRFS_MAP_READ || mirror_num > 1)) {
    6323             :                         /*
    6324             :                          * Push stripe_nr back to the start of the full stripe
    6325             :                          * For those cases needing a full stripe, @stripe_nr
    6326             :                          * is the full stripe number.
    6327             :                          *
    6328             :                          * Originally we would compute raid56_full_stripe_start / full_stripe_len,
    6329             :                          * but that can be expensive.  Here we just divide
    6330             :                          * @stripe_nr by @data_stripes.
    6331             :                          */
    6332           0 :                         stripe_nr /= data_stripes;
    6333             : 
    6334             :                         /* RAID[56] write or recovery. Return all stripes */
    6335           0 :                         num_stripes = map->num_stripes;
    6336           0 :                         max_errors = btrfs_chunk_max_errors(map);
    6337             : 
    6338             :                         /* Return the length to the full stripe end */
    6339           0 :                         *length = min(logical + *length,
    6340             :                                       raid56_full_stripe_start + em->start +
    6341           0 :                                       btrfs_stripe_nr_to_offset(data_stripes)) -
    6342             :                                   logical;
    6343           0 :                         stripe_index = 0;
    6344           0 :                         stripe_offset = 0;
    6345             :                 } else {
    6346             :                         /*
    6347             :                          * Mirror #0 or #1 means the original data block.
    6348             :                          * Mirror #2 is RAID5 parity block.
    6349             :                          * Mirror #3 is RAID6 Q block.
    6350             :                          */
    6351           0 :                         stripe_index = stripe_nr % data_stripes;
    6352           0 :                         stripe_nr /= data_stripes;
    6353           0 :                         if (mirror_num > 1)
    6354           0 :                                 stripe_index = data_stripes + mirror_num - 2;
    6355             : 
    6356             :                         /* We distribute the parity blocks across stripes */
    6357           0 :                         stripe_index = (stripe_nr + stripe_index) % map->num_stripes;
    6358           0 :                         if (op == BTRFS_MAP_READ && mirror_num <= 1)
    6359           0 :                                 mirror_num = 1;
    6360             :                 }
    6361             :         } else {
    6362             :                 /*
    6363             :                  * After this, stripe_nr is the number of stripes on this
    6364             :                  * device we have to walk to find the data, and stripe_index is
    6365             :                  * the number of our device in the stripe array
    6366             :                  */
    6367     6148738 :                 stripe_index = stripe_nr % map->num_stripes;
    6368     6148738 :                 stripe_nr /= map->num_stripes;
    6369     6148738 :                 mirror_num = stripe_index + 1;
    6370             :         }
    6371    14783997 :         if (stripe_index >= map->num_stripes) {
    6372           0 :                 btrfs_crit(fs_info,
    6373             :                            "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
    6374             :                            stripe_index, map->num_stripes);
    6375           0 :                 ret = -EINVAL;
    6376           0 :                 goto out;
    6377             :         }
    6378             : 
    6379    14783997 :         num_alloc_stripes = num_stripes;
    6380    14783997 :         if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
    6381             :             op != BTRFS_MAP_READ)
    6382             :                 /*
    6383             :                  * For replace case, we need to add extra stripes for extra
    6384             :                  * duplicated stripes.
    6385             :                  *
    6386             :                  * For both WRITE and GET_READ_MIRRORS, we may have at most
    6387             :                  * 2 more stripes (DUP types, otherwise 1).
    6388             :                  */
    6389           0 :                 num_alloc_stripes += 2;
    6390             : 
    6391             :         /*
    6392             :          * If this I/O maps to a single device, try to return the device and
    6393             :          * physical block information on the stack instead of allocating an
    6394             :          * I/O context structure.
    6395             :          */
    6396    14783997 :         if (smap && num_alloc_stripes == 1 &&
    6397     6244118 :             !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) &&
    6398     6244118 :             (op == BTRFS_MAP_READ || !dev_replace_is_ongoing ||
    6399           0 :              !dev_replace->tgtdev)) {
    6400     6244118 :                 set_io_stripe(smap, map, stripe_index, stripe_offset, stripe_nr);
    6401     6244118 :                 if (mirror_num_ret)
    6402     6244118 :                         *mirror_num_ret = mirror_num;
    6403     6244118 :                 *bioc_ret = NULL;
    6404     6244118 :                 ret = 0;
    6405     6244118 :                 goto out;
    6406             :         }
    6407             : 
    6408     8539879 :         bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes);
    6409     8539256 :         if (!bioc) {
    6410           0 :                 ret = -ENOMEM;
    6411           0 :                 goto out;
    6412             :         }
    6413     8539256 :         bioc->map_type = map->type;
    6414             : 
    6415             :         /*
    6416             :          * For RAID56 full map, we need to make sure the stripes[] follows the
    6417             :          * rule that data stripes are all ordered, then followed with P and Q
    6418             :          * (if we have).
    6419             :          *
    6420             :          * It's still mostly the same as other profiles, just with extra rotation.
    6421             :          */
    6422     8539256 :         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
    6423           0 :             (op != BTRFS_MAP_READ || mirror_num > 1)) {
    6424             :                 /*
    6425             :                  * For RAID56, @stripe_nr is already the number of full stripes
    6426             :                  * before us, which is also the rotation value (it needs to be
    6427             :                  * taken modulo num_stripes).
    6428             :                  *
    6429             :                  * In this case, we just add @i to @stripe_nr, then take the
    6430             :                  * modulo, to save one modulo call.
    6431             :                  */
    6432           0 :                 bioc->full_stripe_logical = em->start +
    6433           0 :                         btrfs_stripe_nr_to_offset(stripe_nr * data_stripes);
    6434           0 :                 for (i = 0; i < num_stripes; i++)
    6435           0 :                         set_io_stripe(&bioc->stripes[i], map,
    6436           0 :                                       (i + stripe_nr) % num_stripes,
    6437             :                                       stripe_offset, stripe_nr);
    6438             :         } else {
    6439             :                 /*
    6440             :                  * For all other non-RAID56 profiles, just copy the target
    6441             :                  * stripe into the bioc.
    6442             :                  */
    6443    25617727 :                 for (i = 0; i < num_stripes; i++) {
    6444    17078471 :                         set_io_stripe(&bioc->stripes[i], map, stripe_index,
    6445             :                                       stripe_offset, stripe_nr);
    6446    17078471 :                         stripe_index++;
    6447             :                 }
    6448             :         }
    6449             : 
    6450     8539256 :         if (op != BTRFS_MAP_READ)
    6451     8539300 :                 max_errors = btrfs_chunk_max_errors(map);
    6452             : 
    6453     8539256 :         if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
    6454             :             op != BTRFS_MAP_READ) {
    6455           0 :                 handle_ops_on_dev_replace(op, bioc, dev_replace, logical,
    6456             :                                           &num_stripes, &max_errors);
    6457             :         }
    6458             : 
    6459     8539256 :         *bioc_ret = bioc;
    6460     8539256 :         bioc->num_stripes = num_stripes;
    6461     8539256 :         bioc->max_errors = max_errors;
    6462     8539256 :         bioc->mirror_num = mirror_num;
    6463             : 
    6464    14783374 : out:
    6465    14783374 :         if (dev_replace_is_ongoing) {
    6466           0 :                 lockdep_assert_held(&dev_replace->rwsem);
    6467             :                 /* Unlock and let waiting writers proceed */
    6468           0 :                 up_read(&dev_replace->rwsem);
    6469             :         }
    6470    14783374 :         free_extent_map(em);
    6471    14783374 :         return ret;
    6472             : }
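
The RAID10 branch above maps a logical stripe number onto a device pair: factor = num_stripes / sub_stripes, the pair is chosen by stripe_nr % factor, and the stripe number on each device is stripe_nr / factor. A standalone sketch of that mapping (the concrete numbers are an illustration only):

        /*
         * Sketch of the RAID10 index math in btrfs_map_block() above:
         * pick the mirror pair, then the stripe number within that pair.
         */
        #include <stdio.h>

        int main(void)
        {
                int num_stripes = 4;    /* four devices */
                int sub_stripes = 2;    /* two mirrored copies per pair */
                int factor = num_stripes / sub_stripes;                  /* 2 pairs */
                unsigned int stripe_nr = 5;                              /* logical stripe */

                int stripe_index = (stripe_nr % factor) * sub_stripes;   /* 2 */
                unsigned int dev_stripe_nr = stripe_nr / factor;         /* 2 */

                printf("first copy on device %d, stripe %u on that device\n",
                       stripe_index, dev_stripe_nr);
                return 0;
        }
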
    6473             : 
    6474       74035 : static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
    6475             :                                       const struct btrfs_fs_devices *fs_devices)
    6476             : {
    6477       74035 :         if (args->fsid == NULL)
    6478             :                 return true;
    6479        6434 :         if (memcmp(fs_devices->metadata_uuid, args->fsid, BTRFS_FSID_SIZE) == 0)
    6480        3217 :                 return true;
    6481             :         return false;
    6482             : }
    6483             : 
    6484       74048 : static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args,
    6485             :                                   const struct btrfs_device *device)
    6486             : {
    6487       74048 :         if (args->missing) {
    6488           0 :                 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) &&
    6489           0 :                     !device->bdev)
    6490             :                         return true;
    6491           0 :                 return false;
    6492             :         }
    6493             : 
    6494       74048 :         if (device->devid != args->devid)
    6495             :                 return false;
    6496      109516 :         if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0)
    6497           0 :                 return false;
    6498             :         return true;
    6499             : }
    6500             : 
    6501             : /*
    6502             :  * Find a device specified by @devid or @uuid in the list of @fs_devices, or
    6503             :  * return NULL.
    6504             :  *
    6505             :  * If devid and uuid are both specified, the match must be exact, otherwise
    6506             :  * only devid is used.
    6507             :  */
    6508       74035 : struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices,
    6509             :                                        const struct btrfs_dev_lookup_args *args)
    6510             : {
    6511       74035 :         struct btrfs_device *device;
    6512       74035 :         struct btrfs_fs_devices *seed_devs;
    6513             : 
    6514       74035 :         if (dev_args_match_fs_devices(args, fs_devices)) {
    6515       77289 :                 list_for_each_entry(device, &fs_devices->devices, dev_list) {
    6516       74048 :                         if (dev_args_match_device(args, device))
    6517       70794 :                                 return device;
    6518             :                 }
    6519             :         }
    6520             : 
    6521        3241 :         list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
    6522           0 :                 if (!dev_args_match_fs_devices(args, seed_devs))
    6523           0 :                         continue;
    6524           0 :                 list_for_each_entry(device, &seed_devs->devices, dev_list) {
    6525           0 :                         if (dev_args_match_device(args, device))
    6526           0 :                                 return device;
    6527             :                 }
    6528             :         }
    6529             : 
    6530             :         return NULL;
    6531             : }
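
The lookup-args matching used here boils down to: "missing" selects devices with no block device, otherwise devid must match, and uuid (and fsid) only constrain the match when supplied. A simplified user-space restatement of those rules; the struct fields mirror btrfs_dev_lookup_args as an assumption and omit the in-filesystem-metadata check:

        /*
         * Simplified restatement of the device lookup matching rules above.
         */
        #include <stdbool.h>
        #include <stdint.h>
        #include <stdio.h>
        #include <string.h>

        #define UUID_SIZE 16

        struct lookup_args {
                uint64_t devid;
                const uint8_t *uuid;    /* NULL == don't care */
                bool missing;           /* match devices with no block device */
        };

        struct device {
                uint64_t devid;
                uint8_t uuid[UUID_SIZE];
                bool has_bdev;
        };

        static bool args_match_device(const struct lookup_args *args,
                                      const struct device *dev)
        {
                if (args->missing)
                        return !dev->has_bdev;
                if (dev->devid != args->devid)
                        return false;
                if (args->uuid && memcmp(dev->uuid, args->uuid, UUID_SIZE) != 0)
                        return false;
                return true;
        }

        int main(void)
        {
                struct device dev = { .devid = 1, .has_bdev = true };
                struct lookup_args by_devid = { .devid = 1 };

                printf("match by devid: %d\n", args_match_device(&by_devid, &dev));
                return 0;
        }
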
    6532             : 
    6533           0 : static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
    6534             :                                             u64 devid, u8 *dev_uuid)
    6535             : {
    6536           0 :         struct btrfs_device *device;
    6537           0 :         unsigned int nofs_flag;
    6538             : 
    6539             :         /*
    6540             :          * We call this under the chunk_mutex, so we want to use NOFS for this
    6541             :          * allocation; however, we don't want to change btrfs_alloc_device() to
    6542             :          * always do NOFS because we use it in a lot of other GFP_KERNEL safe
    6543             :          * places.
    6544             :          */
    6545             : 
    6546           0 :         nofs_flag = memalloc_nofs_save();
    6547           0 :         device = btrfs_alloc_device(NULL, &devid, dev_uuid, NULL);
    6548           0 :         memalloc_nofs_restore(nofs_flag);
    6549           0 :         if (IS_ERR(device))
    6550             :                 return device;
    6551             : 
    6552           0 :         list_add(&device->dev_list, &fs_devices->devices);
    6553           0 :         device->fs_devices = fs_devices;
    6554           0 :         fs_devices->num_devices++;
    6555             : 
    6556           0 :         set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
    6557           0 :         fs_devices->missing_devices++;
    6558             : 
    6559           0 :         return device;
    6560             : }
    6561             : 
    6562             : /*
    6563             :  * Allocate new device struct, set up devid and UUID.
    6564             :  *
    6565             :  * @fs_info:    used only for generating a new devid, can be NULL if
    6566             :  *              devid is provided (i.e. @devid != NULL).
    6567             :  * @devid:      a pointer to devid for this device.  If NULL a new devid
    6568             :  *              is generated.
    6569             :  * @uuid:       a pointer to UUID for this device.  If NULL a new UUID
    6570             :  *              is generated.
    6571             :  * @path:       a pointer to device path if available, NULL otherwise.
    6572             :  *
    6573             :  * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
    6574             :  * on error.  Returned struct is not linked onto any lists and must be
    6575             :  * destroyed with btrfs_free_device.
    6576             :  */
    6577        3265 : struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
    6578             :                                         const u64 *devid, const u8 *uuid,
    6579             :                                         const char *path)
    6580             : {
    6581        3265 :         struct btrfs_device *dev;
    6582        3265 :         u64 tmp;
    6583             : 
    6584        3265 :         if (WARN_ON(!devid && !fs_info))
    6585             :                 return ERR_PTR(-EINVAL);
    6586             : 
    6587        3265 :         dev = kzalloc(sizeof(*dev), GFP_KERNEL);
    6588        3265 :         if (!dev)
    6589             :                 return ERR_PTR(-ENOMEM);
    6590             : 
    6591        3265 :         INIT_LIST_HEAD(&dev->dev_list);
    6592        3265 :         INIT_LIST_HEAD(&dev->dev_alloc_list);
    6593        3265 :         INIT_LIST_HEAD(&dev->post_commit_list);
    6594             : 
    6595        3265 :         atomic_set(&dev->dev_stats_ccnt, 0);
    6596        3265 :         btrfs_device_data_ordered_init(dev);
    6597        3265 :         extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE);
    6598             : 
    6599        3265 :         if (devid)
    6600        3265 :                 tmp = *devid;
    6601             :         else {
    6602           0 :                 int ret;
    6603             : 
    6604           0 :                 ret = find_next_devid(fs_info, &tmp);
    6605           0 :                 if (ret) {
    6606           0 :                         btrfs_free_device(dev);
    6607           0 :                         return ERR_PTR(ret);
    6608             :                 }
    6609             :         }
    6610        3265 :         dev->devid = tmp;
    6611             : 
    6612        3265 :         if (uuid)
    6613        6530 :                 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
    6614             :         else
    6615           0 :                 generate_random_uuid(dev->uuid);
    6616             : 
    6617        3265 :         if (path) {
    6618        3265 :                 struct rcu_string *name;
    6619             : 
    6620        3265 :                 name = rcu_string_strdup(path, GFP_KERNEL);
    6621        3265 :                 if (!name) {
    6622           0 :                         btrfs_free_device(dev);
    6623           0 :                         return ERR_PTR(-ENOMEM);
    6624             :                 }
    6625        3265 :                 rcu_assign_pointer(dev->name, name);
    6626             :         }
    6627             : 
    6628             :         return dev;
    6629             : }
    6630             : 
    6631           0 : static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
    6632             :                                         u64 devid, u8 *uuid, bool error)
    6633             : {
    6634           0 :         if (error)
    6635           0 :                 btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
    6636             :                               devid, uuid);
    6637             :         else
    6638           0 :                 btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
    6639             :                               devid, uuid);
    6640           0 : }
    6641             : 
    6642       57024 : u64 btrfs_calc_stripe_length(const struct extent_map *em)
    6643             : {
    6644       57024 :         const struct map_lookup *map = em->map_lookup;
    6645       57024 :         const int data_stripes = calc_data_stripes(map->type, map->num_stripes);
    6646             : 
    6647       57024 :         return div_u64(em->len, data_stripes);
    6648             : }
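
The per-device stripe length is the chunk length divided by the number of data stripes, where the data stripe count is derived from the profile as (num_stripes - nparity) / ncopies. A worked example under that assumption:

        /*
         * Worked example for the per-device stripe length, assuming data
         * stripes are computed as (num_stripes - nparity) / ncopies.
         */
        #include <stdio.h>

        static int data_stripes(int num_stripes, int nparity, int ncopies)
        {
                return (num_stripes - nparity) / ncopies;
        }

        int main(void)
        {
                unsigned long long chunk_len = 4ULL << 30;      /* 4 GiB RAID6 chunk */
                int stripes = 6;                                /* six devices */

                /* RAID6: nparity = 2, ncopies = 1 -> 4 data stripes, 1 GiB each */
                printf("stripe length = %llu\n",
                       chunk_len / data_stripes(stripes, 2, 1));
                return 0;
        }
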
    6649             : 
    6650             : #if BITS_PER_LONG == 32
    6651             : /*
    6652             :  * Due to the page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE
    6653             :  * can't be accessed on 32bit systems.
    6654             :  *
    6655             :  * This function does a mount-time check to reject the fs if it already has
    6656             :  * a metadata chunk beyond that limit.
    6657             :  */
    6658             : static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
    6659             :                                   u64 logical, u64 length, u64 type)
    6660             : {
    6661             :         if (!(type & BTRFS_BLOCK_GROUP_METADATA))
    6662             :                 return 0;
    6663             : 
    6664             :         if (logical + length < MAX_LFS_FILESIZE)
    6665             :                 return 0;
    6666             : 
    6667             :         btrfs_err_32bit_limit(fs_info);
    6668             :         return -EOVERFLOW;
    6669             : }
    6670             : 
    6671             : /*
    6672             :  * This is to give an early warning for any metadata chunk reaching
    6673             :  * BTRFS_32BIT_EARLY_WARN_THRESHOLD.
    6674             :  * Although we can still access the metadata for now, that will no longer
    6675             :  * be possible once the limit is reached.
    6676             :  */
    6677             : static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
    6678             :                                   u64 logical, u64 length, u64 type)
    6679             : {
    6680             :         if (!(type & BTRFS_BLOCK_GROUP_METADATA))
    6681             :                 return;
    6682             : 
    6683             :         if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD)
    6684             :                 return;
    6685             : 
    6686             :         btrfs_warn_32bit_limit(fs_info);
    6687             : }
    6688             : #endif
    6689             : 
    6690           0 : static struct btrfs_device *handle_missing_device(struct btrfs_fs_info *fs_info,
    6691             :                                                   u64 devid, u8 *uuid)
    6692             : {
    6693           0 :         struct btrfs_device *dev;
    6694             : 
    6695           0 :         if (!btrfs_test_opt(fs_info, DEGRADED)) {
    6696           0 :                 btrfs_report_missing_device(fs_info, devid, uuid, true);
    6697           0 :                 return ERR_PTR(-ENOENT);
    6698             :         }
    6699             : 
    6700           0 :         dev = add_missing_dev(fs_info->fs_devices, devid, uuid);
    6701           0 :         if (IS_ERR(dev)) {
    6702           0 :                 btrfs_err(fs_info, "failed to init missing device %llu: %ld",
    6703             :                           devid, PTR_ERR(dev));
    6704           0 :                 return dev;
    6705             :         }
    6706           0 :         btrfs_report_missing_device(fs_info, devid, uuid, false);
    6707             : 
    6708           0 :         return dev;
    6709             : }
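
handle_missing_device() encodes a small mount policy: without the DEGRADED option a missing device aborts the mount, with it a placeholder device is created and only a warning is reported. A trivial, purely illustrative sketch of that decision:

    #include <stdbool.h>
    #include <stdio.h>

    /* Mirrors the branch above: strict mounts fail, degraded mounts fall
     * back to a placeholder ("missing") device. */
    static const char *missing_device_action(bool degraded_mount)
    {
            return degraded_mount ? "add placeholder device, warn"
                                  : "report and fail with -ENOENT";
    }

    int main(void)
    {
            printf("default mount: %s\n", missing_device_action(false));
            printf("-o degraded:   %s\n", missing_device_action(true));
            return 0;
    }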
    6710             : 
    6711       28195 : static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
    6712             :                           struct btrfs_chunk *chunk)
    6713             : {
    6714       28195 :         BTRFS_DEV_LOOKUP_ARGS(args);
    6715       28195 :         struct btrfs_fs_info *fs_info = leaf->fs_info;
    6716       28195 :         struct extent_map_tree *map_tree = &fs_info->mapping_tree;
    6717       28195 :         struct map_lookup *map;
    6718       28195 :         struct extent_map *em;
    6719       28195 :         u64 logical;
    6720       28195 :         u64 length;
    6721       28195 :         u64 devid;
    6722       28195 :         u64 type;
    6723       28195 :         u8 uuid[BTRFS_UUID_SIZE];
    6724       28195 :         int index;
    6725       28195 :         int num_stripes;
    6726       28195 :         int ret;
    6727       28195 :         int i;
    6728             : 
    6729       28195 :         logical = key->offset;
    6730       28195 :         length = btrfs_chunk_length(leaf, chunk);
    6731       28195 :         type = btrfs_chunk_type(leaf, chunk);
    6732       28195 :         index = btrfs_bg_flags_to_raid_index(type);
    6733       28195 :         num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
    6734             : 
    6735             : #if BITS_PER_LONG == 32
    6736             :         ret = check_32bit_meta_chunk(fs_info, logical, length, type);
    6737             :         if (ret < 0)
    6738             :                 return ret;
    6739             :         warn_32bit_meta_chunk(fs_info, logical, length, type);
    6740             : #endif
    6741             : 
    6742             :         /*
    6743             :          * Only verify the chunk item if we're reading from the sys chunk array;
    6744             :          * a chunk item in a tree block is already verified by the tree-checker.
    6745             :          */
    6746       28195 :         if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
    6747        3221 :                 ret = btrfs_check_chunk_valid(leaf, chunk, logical);
    6748        3221 :                 if (ret)
    6749             :                         return ret;
    6750             :         }
    6751             : 
    6752       28195 :         read_lock(&map_tree->lock);
    6753       28195 :         em = lookup_extent_mapping(map_tree, logical, 1);
    6754       28195 :         read_unlock(&map_tree->lock);
    6755             : 
    6756             :         /* already mapped? */
    6757       28195 :         if (em && em->start <= logical && em->start + em->len > logical) {
    6758        3221 :                 free_extent_map(em);
    6759        3221 :                 return 0;
    6760       24974 :         } else if (em) {
    6761           0 :                 free_extent_map(em);
    6762             :         }
    6763             : 
    6764       24974 :         em = alloc_extent_map();
    6765       24974 :         if (!em)
    6766             :                 return -ENOMEM;
    6767       24974 :         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
    6768       24974 :         if (!map) {
    6769           0 :                 free_extent_map(em);
    6770           0 :                 return -ENOMEM;
    6771             :         }
    6772             : 
    6773       24974 :         set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
    6774       24974 :         em->map_lookup = map;
    6775       24974 :         em->start = logical;
    6776       24974 :         em->len = length;
    6777       24974 :         em->orig_start = 0;
    6778       24974 :         em->block_start = 0;
    6779       24974 :         em->block_len = em->len;
    6780             : 
    6781       24974 :         map->num_stripes = num_stripes;
    6782       24974 :         map->io_width = btrfs_chunk_io_width(leaf, chunk);
    6783       24974 :         map->io_align = btrfs_chunk_io_align(leaf, chunk);
    6784       24974 :         map->type = type;
    6785             :         /*
    6786             :          * We can't use the on-disk sub_stripes value: for profiles other
    6787             :          * than RAID10, filesystems created by older mkfs (<v5.4) may
    6788             :          * store 0 there, which would cause divide-by-zero errors
    6789             :          * later on.
    6790             :          * Since sub_stripes is currently fixed for each profile, use the
    6791             :          * trusted value from btrfs_raid_array instead.
    6792             :          */
    6793       24974 :         map->sub_stripes = btrfs_raid_array[index].sub_stripes;
    6794       24974 :         map->verified_stripes = 0;
    6795       24974 :         em->orig_block_len = btrfs_calc_stripe_length(em);
    6796       57004 :         for (i = 0; i < num_stripes; i++) {
    6797       32030 :                 map->stripes[i].physical =
    6798             :                         btrfs_stripe_offset_nr(leaf, chunk, i);
    6799       32030 :                 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
    6800       32030 :                 args.devid = devid;
    6801       32030 :                 read_extent_buffer(leaf, uuid, (unsigned long)
    6802             :                                    btrfs_stripe_dev_uuid_nr(chunk, i),
    6803             :                                    BTRFS_UUID_SIZE);
    6804       32030 :                 args.uuid = uuid;
    6805       32030 :                 map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args);
    6806       32030 :                 if (!map->stripes[i].dev) {
    6807           0 :                         map->stripes[i].dev = handle_missing_device(fs_info,
    6808             :                                                                     devid, uuid);
    6809           0 :                         if (IS_ERR(map->stripes[i].dev)) {
    6810           0 :                                 ret = PTR_ERR(map->stripes[i].dev);
    6811           0 :                                 free_extent_map(em);
    6812           0 :                                 return ret;
    6813             :                         }
    6814             :                 }
    6815             : 
    6816       32030 :                 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
    6817       32030 :                                 &(map->stripes[i].dev->dev_state));
    6818             :         }
    6819             : 
    6820       24974 :         write_lock(&map_tree->lock);
    6821       24974 :         ret = add_extent_mapping(map_tree, em, 0);
    6822       24974 :         write_unlock(&map_tree->lock);
    6823       24974 :         if (ret < 0) {
    6824           0 :                 btrfs_err(fs_info,
    6825             :                           "failed to add chunk map, start=%llu len=%llu: %d",
    6826             :                           em->start, em->len, ret);
    6827             :         }
    6828       24974 :         free_extent_map(em);
    6829             : 
    6830       24974 :         return ret;
    6831             : }
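
One detail worth calling out in read_one_chunk(): the on-disk sub_stripes field is never trusted, because old mkfs versions wrote 0 for non-RAID10 profiles and the value is later used as a divisor. A small sketch of the table-driven fallback, with an invented profile enum standing in for btrfs_raid_array:

    #include <stdio.h>

    /* Invented profile enum; in the kernel the per-profile constants live
     * in btrfs_raid_array[]. */
    enum fake_profile { FAKE_SINGLE, FAKE_RAID1, FAKE_RAID10 };

    static int trusted_sub_stripes(enum fake_profile p)
    {
            /* Only RAID10 actually uses sub-stripes; everything else is 1. */
            return p == FAKE_RAID10 ? 2 : 1;
    }

    int main(void)
    {
            int on_disk = 0;        /* what an old mkfs may have written */
            int used = trusted_sub_stripes(FAKE_RAID1);

            /* Dividing by 'used' is safe; dividing by 'on_disk' would not be. */
            printf("on-disk %d, used %d\n", on_disk, used);
            return 0;
    }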
    6832             : 
    6833        3217 : static void fill_device_from_item(struct extent_buffer *leaf,
    6834             :                                  struct btrfs_dev_item *dev_item,
    6835             :                                  struct btrfs_device *device)
    6836             : {
    6837        3217 :         unsigned long ptr;
    6838             : 
    6839        3217 :         device->devid = btrfs_device_id(leaf, dev_item);
    6840        3217 :         device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
    6841        3217 :         device->total_bytes = device->disk_total_bytes;
    6842        3217 :         device->commit_total_bytes = device->disk_total_bytes;
    6843        3217 :         device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
    6844        3217 :         device->commit_bytes_used = device->bytes_used;
    6845        3217 :         device->type = btrfs_device_type(leaf, dev_item);
    6846        3217 :         device->io_align = btrfs_device_io_align(leaf, dev_item);
    6847        3217 :         device->io_width = btrfs_device_io_width(leaf, dev_item);
    6848        3217 :         device->sector_size = btrfs_device_sector_size(leaf, dev_item);
    6849        3217 :         WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
    6850        3217 :         clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
    6851             : 
    6852        3217 :         ptr = btrfs_device_uuid(dev_item);
    6853        3217 :         read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
    6854        3217 : }
    6855             : 
    6856           0 : static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
    6857             :                                                   u8 *fsid)
    6858             : {
    6859           0 :         struct btrfs_fs_devices *fs_devices;
    6860           0 :         int ret;
    6861             : 
    6862           0 :         lockdep_assert_held(&uuid_mutex);
    6863           0 :         ASSERT(fsid);
    6864             : 
    6865             :         /* This will match only for multi-device seed fs */
    6866           0 :         list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list)
    6867           0 :                 if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
    6868           0 :                         return fs_devices;
    6869             : 
    6870             : 
    6871           0 :         fs_devices = find_fsid(fsid, NULL);
    6872           0 :         if (!fs_devices) {
    6873           0 :                 if (!btrfs_test_opt(fs_info, DEGRADED))
    6874             :                         return ERR_PTR(-ENOENT);
    6875             : 
    6876           0 :                 fs_devices = alloc_fs_devices(fsid, NULL);
    6877           0 :                 if (IS_ERR(fs_devices))
    6878             :                         return fs_devices;
    6879             : 
    6880           0 :                 fs_devices->seeding = true;
    6881           0 :                 fs_devices->opened = 1;
    6882           0 :                 return fs_devices;
    6883             :         }
    6884             : 
    6885             :         /*
    6886             :          * Upon first call for a seed fs fsid, just create a private copy of the
    6887             :          * respective fs_devices and anchor it at fs_info->fs_devices->seed_list
    6888             :          */
    6889           0 :         fs_devices = clone_fs_devices(fs_devices);
    6890           0 :         if (IS_ERR(fs_devices))
    6891             :                 return fs_devices;
    6892             : 
    6893           0 :         ret = open_fs_devices(fs_devices, BLK_OPEN_READ, fs_info->bdev_holder);
    6894           0 :         if (ret) {
    6895           0 :                 free_fs_devices(fs_devices);
    6896           0 :                 return ERR_PTR(ret);
    6897             :         }
    6898             : 
    6899           0 :         if (!fs_devices->seeding) {
    6900           0 :                 close_fs_devices(fs_devices);
    6901           0 :                 free_fs_devices(fs_devices);
    6902           0 :                 return ERR_PTR(-EINVAL);
    6903             :         }
    6904             : 
    6905           0 :         list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);
    6906             : 
    6907           0 :         return fs_devices;
    6908             : }
    6909             : 
    6910        3217 : static int read_one_dev(struct extent_buffer *leaf,
    6911             :                         struct btrfs_dev_item *dev_item)
    6912             : {
    6913        3217 :         BTRFS_DEV_LOOKUP_ARGS(args);
    6914        3217 :         struct btrfs_fs_info *fs_info = leaf->fs_info;
    6915        3217 :         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
    6916        3217 :         struct btrfs_device *device;
    6917        3217 :         u64 devid;
    6918        3217 :         int ret;
    6919        3217 :         u8 fs_uuid[BTRFS_FSID_SIZE];
    6920        3217 :         u8 dev_uuid[BTRFS_UUID_SIZE];
    6921             : 
    6922        3217 :         devid = btrfs_device_id(leaf, dev_item);
    6923        3217 :         args.devid = devid;
    6924        3217 :         read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
    6925             :                            BTRFS_UUID_SIZE);
    6926        3217 :         read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
    6927             :                            BTRFS_FSID_SIZE);
    6928        3217 :         args.uuid = dev_uuid;
    6929        3217 :         args.fsid = fs_uuid;
    6930             : 
    6931        6434 :         if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
    6932           0 :                 fs_devices = open_seed_devices(fs_info, fs_uuid);
    6933           0 :                 if (IS_ERR(fs_devices))
    6934           0 :                         return PTR_ERR(fs_devices);
    6935             :         }
    6936             : 
    6937        3217 :         device = btrfs_find_device(fs_info->fs_devices, &args);
    6938        3217 :         if (!device) {
    6939           0 :                 if (!btrfs_test_opt(fs_info, DEGRADED)) {
    6940           0 :                         btrfs_report_missing_device(fs_info, devid,
    6941             :                                                         dev_uuid, true);
    6942           0 :                         return -ENOENT;
    6943             :                 }
    6944             : 
    6945           0 :                 device = add_missing_dev(fs_devices, devid, dev_uuid);
    6946           0 :                 if (IS_ERR(device)) {
    6947           0 :                         btrfs_err(fs_info,
    6948             :                                 "failed to add missing dev %llu: %ld",
    6949             :                                 devid, PTR_ERR(device));
    6950           0 :                         return PTR_ERR(device);
    6951             :                 }
    6952           0 :                 btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
    6953             :         } else {
    6954        3217 :                 if (!device->bdev) {
    6955           0 :                         if (!btrfs_test_opt(fs_info, DEGRADED)) {
    6956           0 :                                 btrfs_report_missing_device(fs_info,
    6957             :                                                 devid, dev_uuid, true);
    6958           0 :                                 return -ENOENT;
    6959             :                         }
    6960           0 :                         btrfs_report_missing_device(fs_info, devid,
    6961             :                                                         dev_uuid, false);
    6962             :                 }
    6963             : 
    6964        3217 :                 if (!device->bdev &&
    6965           0 :                     !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
    6966             :                         /*
    6967             :                          * This happens when a device that was properly set
    6968             :                          * up in the device info lists suddenly goes bad.
    6969             :                          * device->bdev is NULL, so we have to set the
    6970             :                          * BTRFS_DEV_STATE_MISSING bit here.
    6971             :                          */
    6972           0 :                         device->fs_devices->missing_devices++;
    6973           0 :                         set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
    6974             :                 }
    6975             : 
    6976             :                 /* Move the device to its own fs_devices */
    6977        3217 :                 if (device->fs_devices != fs_devices) {
    6978           0 :                         ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
    6979             :                                                         &device->dev_state));
    6980             : 
    6981           0 :                         list_move(&device->dev_list, &fs_devices->devices);
    6982           0 :                         device->fs_devices->num_devices--;
    6983           0 :                         fs_devices->num_devices++;
    6984             : 
    6985           0 :                         device->fs_devices->missing_devices--;
    6986           0 :                         fs_devices->missing_devices++;
    6987             : 
    6988           0 :                         device->fs_devices = fs_devices;
    6989             :                 }
    6990             :         }
    6991             : 
    6992        3217 :         if (device->fs_devices != fs_info->fs_devices) {
    6993           0 :                 BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
    6994           0 :                 if (device->generation !=
    6995             :                     btrfs_device_generation(leaf, dev_item))
    6996             :                         return -EINVAL;
    6997             :         }
    6998             : 
    6999        3217 :         fill_device_from_item(leaf, dev_item, device);
    7000        3217 :         if (device->bdev) {
    7001        3217 :                 u64 max_total_bytes = bdev_nr_bytes(device->bdev);
    7002             : 
    7003        3217 :                 if (device->total_bytes > max_total_bytes) {
    7004           0 :                         btrfs_err(fs_info,
    7005             :                         "device total_bytes should be at most %llu but found %llu",
    7006             :                                   max_total_bytes, device->total_bytes);
    7007           0 :                         return -EINVAL;
    7008             :                 }
    7009             :         }
    7010        3217 :         set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
    7011        6434 :         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
    7012           0 :            !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
    7013        3217 :                 device->fs_devices->total_rw_bytes += device->total_bytes;
    7014        3217 :                 atomic64_add(device->total_bytes - device->bytes_used,
    7015             :                                 &fs_info->free_chunk_space);
    7016             :         }
    7017             :         ret = 0;
    7018             :         return ret;
    7019             : }
    7020             : 
    7021        3216 : int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
    7022             : {
    7023        3216 :         struct btrfs_super_block *super_copy = fs_info->super_copy;
    7024        3216 :         struct extent_buffer *sb;
    7025        3216 :         struct btrfs_disk_key *disk_key;
    7026        3216 :         struct btrfs_chunk *chunk;
    7027        3216 :         u8 *array_ptr;
    7028        3216 :         unsigned long sb_array_offset;
    7029        3216 :         int ret = 0;
    7030        3216 :         u32 num_stripes;
    7031        3216 :         u32 array_size;
    7032        3216 :         u32 len = 0;
    7033        3216 :         u32 cur_offset;
    7034        3216 :         u64 type;
    7035        3216 :         struct btrfs_key key;
    7036             : 
    7037        3216 :         ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
    7038             : 
    7039             :         /*
    7040             :          * We allocated a dummy extent, just to use extent buffer accessors.
    7041             :          * There will be unused space after BTRFS_SUPER_INFO_SIZE, but
    7042             :          * that's fine, we will not go beyond system chunk array anyway.
    7043             :          */
    7044        3216 :         sb = alloc_dummy_extent_buffer(fs_info, BTRFS_SUPER_INFO_OFFSET);
    7045        3216 :         if (!sb)
    7046             :                 return -ENOMEM;
    7047        3216 :         set_extent_buffer_uptodate(sb);
    7048             : 
    7049        3216 :         write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
    7050        3216 :         array_size = btrfs_super_sys_array_size(super_copy);
    7051             : 
    7052        3216 :         array_ptr = super_copy->sys_chunk_array;
    7053        3216 :         sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
    7054        3216 :         cur_offset = 0;
    7055             : 
    7056        6437 :         while (cur_offset < array_size) {
    7057        3221 :                 disk_key = (struct btrfs_disk_key *)array_ptr;
    7058        3221 :                 len = sizeof(*disk_key);
    7059        3221 :                 if (cur_offset + len > array_size)
    7060           0 :                         goto out_short_read;
    7061             : 
    7062        3221 :                 btrfs_disk_key_to_cpu(&key, disk_key);
    7063             : 
    7064        3221 :                 array_ptr += len;
    7065        3221 :                 sb_array_offset += len;
    7066        3221 :                 cur_offset += len;
    7067             : 
    7068        3221 :                 if (key.type != BTRFS_CHUNK_ITEM_KEY) {
    7069           0 :                         btrfs_err(fs_info,
    7070             :                             "unexpected item type %u in sys_array at offset %u",
    7071             :                                   (u32)key.type, cur_offset);
    7072           0 :                         ret = -EIO;
    7073           0 :                         break;
    7074             :                 }
    7075             : 
    7076        3221 :                 chunk = (struct btrfs_chunk *)sb_array_offset;
    7077             :                 /*
    7078             :                  * At least one btrfs_chunk with one stripe must be present,
    7079             :                  * exact stripe count check comes afterwards
    7080             :                  */
    7081        3221 :                 len = btrfs_chunk_item_size(1);
    7082        3221 :                 if (cur_offset + len > array_size)
    7083           0 :                         goto out_short_read;
    7084             : 
    7085        3221 :                 num_stripes = btrfs_chunk_num_stripes(sb, chunk);
    7086        3221 :                 if (!num_stripes) {
    7087           0 :                         btrfs_err(fs_info,
    7088             :                         "invalid number of stripes %u in sys_array at offset %u",
    7089             :                                   num_stripes, cur_offset);
    7090           0 :                         ret = -EIO;
    7091           0 :                         break;
    7092             :                 }
    7093             : 
    7094        3221 :                 type = btrfs_chunk_type(sb, chunk);
    7095        3221 :                 if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
    7096           0 :                         btrfs_err(fs_info,
    7097             :                         "invalid chunk type %llu in sys_array at offset %u",
    7098             :                                   type, cur_offset);
    7099           0 :                         ret = -EIO;
    7100           0 :                         break;
    7101             :                 }
    7102             : 
    7103        3221 :                 len = btrfs_chunk_item_size(num_stripes);
    7104        3221 :                 if (cur_offset + len > array_size)
    7105           0 :                         goto out_short_read;
    7106             : 
    7107        3221 :                 ret = read_one_chunk(&key, sb, chunk);
    7108        3221 :                 if (ret)
    7109             :                         break;
    7110             : 
    7111        3221 :                 array_ptr += len;
    7112        3221 :                 sb_array_offset += len;
    7113        3221 :                 cur_offset += len;
    7114             :         }
    7115        3216 :         clear_extent_buffer_uptodate(sb);
    7116        3216 :         free_extent_buffer_stale(sb);
    7117        3216 :         return ret;
    7118             : 
    7119           0 : out_short_read:
    7120           0 :         btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
    7121             :                         len, cur_offset);
    7122           0 :         clear_extent_buffer_uptodate(sb);
    7123           0 :         free_extent_buffer_stale(sb);
    7124           0 :         return -EIO;
    7125             : }
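
btrfs_read_sys_array() is essentially a cursor walk over a byte array: check that the fixed-size key fits, read it, check that the variable-size chunk item (sized by its stripe count) fits, read it, advance. A self-contained sketch of the same bounds-checked pattern over a plain buffer, with a made-up record layout:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Hypothetical fixed-size header followed by a variable-length payload,
     * standing in for btrfs_disk_key + btrfs_chunk in the sys chunk array. */
    struct fake_hdr { uint32_t payload_len; };

    static int walk_array(const uint8_t *buf, uint32_t size)
    {
            uint32_t cur = 0;

            while (cur < size) {
                    struct fake_hdr hdr;

                    /* Reject a truncated header before touching it. */
                    if ((uint64_t)cur + sizeof(hdr) > size)
                            return -1;
                    memcpy(&hdr, buf + cur, sizeof(hdr));
                    cur += sizeof(hdr);

                    /* Reject a payload that runs past the end of the array;
                     * the 64-bit sum avoids wrap-around on huge lengths. */
                    if ((uint64_t)cur + hdr.payload_len > size)
                            return -1;
                    cur += hdr.payload_len;
            }
            return 0;
    }

    int main(void)
    {
            uint8_t buf[16] = { 0 };
            struct fake_hdr hdr = { .payload_len = 4 };

            memcpy(buf, &hdr, sizeof(hdr));
            printf("%d\n", walk_array(buf, sizeof(hdr) + hdr.payload_len));
            return 0;
    }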
    7126             : 
    7127             : /*
    7128             :  * Check if all chunks in the fs are OK for read-write degraded mount
    7129             :  *
    7130             :  * If the @failing_dev is specified, it's accounted as missing.
    7131             :  *
    7132             :  * Return true if all chunks meet the minimal RW mount requirements.
    7133             :  * Return false if any chunk doesn't meet the minimal RW mount requirements.
    7134             :  */
    7135           2 : bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
    7136             :                                         struct btrfs_device *failing_dev)
    7137             : {
    7138           2 :         struct extent_map_tree *map_tree = &fs_info->mapping_tree;
    7139           2 :         struct extent_map *em;
    7140           2 :         u64 next_start = 0;
    7141           2 :         bool ret = true;
    7142             : 
    7143           2 :         read_lock(&map_tree->lock);
    7144           2 :         em = lookup_extent_mapping(map_tree, 0, (u64)-1);
    7145           2 :         read_unlock(&map_tree->lock);
    7146             :         /* No chunk at all? Return false anyway */
    7147           2 :         if (!em) {
    7148           0 :                 ret = false;
    7149           0 :                 goto out;
    7150             :         }
    7151           8 :         while (em) {
    7152           6 :                 struct map_lookup *map;
    7153           6 :                 int missing = 0;
    7154           6 :                 int max_tolerated;
    7155           6 :                 int i;
    7156             : 
    7157           6 :                 map = em->map_lookup;
    7158           6 :                 max_tolerated =
    7159           6 :                         btrfs_get_num_tolerated_disk_barrier_failures(
    7160             :                                         map->type);
    7161          22 :                 for (i = 0; i < map->num_stripes; i++) {
    7162          10 :                         struct btrfs_device *dev = map->stripes[i].dev;
    7163             : 
    7164          20 :                         if (!dev || !dev->bdev ||
    7165          10 :                             test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
    7166          10 :                             dev->last_flush_error)
    7167           0 :                                 missing++;
    7168          10 :                         else if (failing_dev && failing_dev == dev)
    7169           0 :                                 missing++;
    7170             :                 }
    7171           6 :                 if (missing > max_tolerated) {
    7172           0 :                         if (!failing_dev)
    7173           0 :                                 btrfs_warn(fs_info,
    7174             :         "chunk %llu missing %d devices, max tolerance is %d for writable mount",
    7175             :                                    em->start, missing, max_tolerated);
    7176           0 :                         free_extent_map(em);
    7177           0 :                         ret = false;
    7178           0 :                         goto out;
    7179             :                 }
    7180           6 :                 next_start = extent_map_end(em);
    7181           6 :                 free_extent_map(em);
    7182             : 
    7183           6 :                 read_lock(&map_tree->lock);
    7184           6 :                 em = lookup_extent_mapping(map_tree, next_start,
    7185             :                                            (u64)(-1) - next_start);
    7186           6 :                 read_unlock(&map_tree->lock);
    7187             :         }
    7188           2 : out:
    7189           2 :         return ret;
    7190             : }
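
The loop above applies the same test to every chunk: count stripes whose device is absent or failed, then compare against the profile's tolerated failure count. A sketch of that per-chunk predicate with the device state reduced to a boolean array:

    #include <stdbool.h>
    #include <stdio.h>

    /* Hypothetical stand-ins: a stripe is "missing" when its device is gone
     * or failing, and max_tolerated comes from the RAID profile (e.g. 1 for
     * RAID1). */
    static bool chunk_rw_degradable(const bool *stripe_missing, int num_stripes,
                                    int max_tolerated)
    {
            int missing = 0;

            for (int i = 0; i < num_stripes; i++)
                    if (stripe_missing[i])
                            missing++;
            return missing <= max_tolerated;
    }

    int main(void)
    {
            bool raid1_one_dev_gone[2] = { true, false };

            /* RAID1 tolerates one failure, so this chunk still allows RW mount. */
            printf("%d\n", chunk_rw_degradable(raid1_one_dev_gone, 2, 1));
            return 0;
    }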
    7191             : 
    7192           1 : static void readahead_tree_node_children(struct extent_buffer *node)
    7193             : {
    7194           1 :         int i;
    7195           1 :         const int nr_items = btrfs_header_nritems(node);
    7196             : 
    7197           3 :         for (i = 0; i < nr_items; i++)
    7198           2 :                 btrfs_readahead_node_child(node, i);
    7199           1 : }
    7200             : 
    7201        3216 : int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
    7202             : {
    7203        3216 :         struct btrfs_root *root = fs_info->chunk_root;
    7204        3216 :         struct btrfs_path *path;
    7205        3216 :         struct extent_buffer *leaf;
    7206        3216 :         struct btrfs_key key;
    7207        3216 :         struct btrfs_key found_key;
    7208        3216 :         int ret;
    7209        3216 :         int slot;
    7210        3216 :         int iter_ret = 0;
    7211        3216 :         u64 total_dev = 0;
    7212        3216 :         u64 last_ra_node = 0;
    7213             : 
    7214        3216 :         path = btrfs_alloc_path();
    7215        3216 :         if (!path)
    7216             :                 return -ENOMEM;
    7217             : 
    7218             :         /*
    7219             :          * The uuid_mutex is only needed if we are mounting a sprout FS;
    7220             :          * a regular mount could do without it.
    7221             :          */
    7222        3216 :         mutex_lock(&uuid_mutex);
    7223             : 
    7224             :         /*
    7225             :          * It is possible for mount and umount to race in such a way that
    7226             :          * we execute this code path, but open_fs_devices failed to clear
    7227             :          * total_rw_bytes. We certainly want it cleared before reading the
    7228             :          * device items, so clear it here.
    7229             :          */
    7230        3216 :         fs_info->fs_devices->total_rw_bytes = 0;
    7231             : 
    7232             :         /*
    7233             :          * Lockdep complains about possible circular locking dependency between
    7234             :          * a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores
    7235             :          * used for freeze protection of a fs (struct super_block.s_writers),
    7236             :          * which we take when starting a transaction, and extent buffers of the
    7237             :          * chunk tree if we call read_one_dev() while holding a lock on an
    7238             :          * extent buffer of the chunk tree. Since we are mounting the filesystem
    7239             :          * and at this point there can't be any concurrent task modifying the
    7240             :          * chunk tree, to keep it simple, just skip locking on the chunk tree.
    7241             :          */
    7242        3216 :         ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
    7243        3216 :         path->skip_locking = 1;
    7244             : 
    7245             :         /*
    7246             :          * Read all device items, and then all the chunk items. All
    7247             :          * device items are found before any chunk item (their object id
    7248             :          * is smaller than the lowest possible object id for a chunk
    7249             :          * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
    7250             :          */
    7251        3216 :         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
    7252        3216 :         key.offset = 0;
    7253        3216 :         key.type = 0;
    7254       31407 :         btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
    7255       28191 :                 struct extent_buffer *node = path->nodes[1];
    7256             : 
    7257       28191 :                 leaf = path->nodes[0];
    7258       28191 :                 slot = path->slots[0];
    7259             : 
    7260       28191 :                 if (node) {
    7261         266 :                         if (last_ra_node != node->start) {
    7262           1 :                                 readahead_tree_node_children(node);
    7263           1 :                                 last_ra_node = node->start;
    7264             :                         }
    7265             :                 }
    7266       28191 :                 if (found_key.type == BTRFS_DEV_ITEM_KEY) {
    7267        3217 :                         struct btrfs_dev_item *dev_item;
    7268        3217 :                         dev_item = btrfs_item_ptr(leaf, slot,
    7269             :                                                   struct btrfs_dev_item);
    7270        3217 :                         ret = read_one_dev(leaf, dev_item);
    7271        3217 :                         if (ret)
    7272           0 :                                 goto error;
    7273        3217 :                         total_dev++;
    7274       24974 :                 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
    7275       24974 :                         struct btrfs_chunk *chunk;
    7276             : 
    7277             :                         /*
    7278             :                          * We are only called at mount time, so no need to take
    7279             :                          * fs_info->chunk_mutex. Plus, to avoid lockdep warnings,
    7280             :                          * we always lock first fs_info->chunk_mutex before
    7281             :                          * acquiring any locks on the chunk tree. This is a
    7282             :                          * requirement for chunk allocation, see the comment on
    7283             :                          * top of btrfs_chunk_alloc() for details.
    7284             :                          */
    7285       24974 :                         chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
    7286       24974 :                         ret = read_one_chunk(&found_key, leaf, chunk);
    7287       24974 :                         if (ret)
    7288           0 :                                 goto error;
    7289             :                 }
    7290             :         }
    7291             :         /* Catch error found during iteration */
    7292        3216 :         if (iter_ret < 0) {
    7293           0 :                 ret = iter_ret;
    7294           0 :                 goto error;
    7295             :         }
    7296             : 
    7297             :         /*
    7298             :          * After loading chunk tree, we've got all device information,
    7299             :          * do another round of validation checks.
    7300             :          */
    7301        3216 :         if (total_dev != fs_info->fs_devices->total_devices) {
    7302           0 :                 btrfs_warn(fs_info,
    7303             : "super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit",
    7304             :                           btrfs_super_num_devices(fs_info->super_copy),
    7305             :                           total_dev);
    7306           0 :                 fs_info->fs_devices->total_devices = total_dev;
    7307           0 :                 btrfs_set_super_num_devices(fs_info->super_copy, total_dev);
    7308             :         }
    7309        3216 :         if (btrfs_super_total_bytes(fs_info->super_copy) <
    7310        3216 :             fs_info->fs_devices->total_rw_bytes) {
    7311           0 :                 btrfs_err(fs_info,
    7312             :         "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
    7313             :                           btrfs_super_total_bytes(fs_info->super_copy),
    7314             :                           fs_info->fs_devices->total_rw_bytes);
    7315           0 :                 ret = -EINVAL;
    7316           0 :                 goto error;
    7317             :         }
    7318             :         ret = 0;
    7319        3216 : error:
    7320        3216 :         mutex_unlock(&uuid_mutex);
    7321             : 
    7322        3216 :         btrfs_free_path(path);
    7323        3216 :         return ret;
    7324             : }
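
The comment in btrfs_read_chunk_tree() relies on the chunk tree's key ordering: items sort by (objectid, type, offset), and device items use a smaller objectid than any chunk item, so one forward scan sees every device before the first chunk. A sketch of that comparison; the concrete objectid/type values are assumptions taken from the on-disk format, not from this file:

    #include <stdint.h>
    #include <stdio.h>

    /* Simplified btrfs key: items in a tree are ordered by
     * (objectid, type, offset), compared lexicographically. */
    struct fake_key { uint64_t objectid; uint8_t type; uint64_t offset; };

    static int key_cmp(const struct fake_key *a, const struct fake_key *b)
    {
            if (a->objectid != b->objectid)
                    return a->objectid < b->objectid ? -1 : 1;
            if (a->type != b->type)
                    return a->type < b->type ? -1 : 1;
            if (a->offset != b->offset)
                    return a->offset < b->offset ? -1 : 1;
            return 0;
    }

    int main(void)
    {
            /* Assumed values: device items live under objectid 1, chunk items
             * start at objectid 256 (BTRFS_FIRST_CHUNK_TREE_OBJECTID). */
            struct fake_key dev   = { .objectid = 1,   .type = 0xd8, .offset = 1 };
            struct fake_key chunk = { .objectid = 256, .type = 0xe4, .offset = 0 };

            printf("%d\n", key_cmp(&dev, &chunk));  /* -1: devices come first */
            return 0;
    }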
    7325             : 
    7326        3215 : int btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
    7327             : {
    7328        3215 :         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
    7329        3215 :         struct btrfs_device *device;
    7330        3215 :         int ret = 0;
    7331             : 
    7332        3215 :         fs_devices->fs_info = fs_info;
    7333             : 
    7334        3215 :         mutex_lock(&fs_devices->device_list_mutex);
    7335        6431 :         list_for_each_entry(device, &fs_devices->devices, dev_list)
    7336        3216 :                 device->fs_info = fs_info;
    7337             : 
    7338        3215 :         list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
    7339           0 :                 list_for_each_entry(device, &seed_devs->devices, dev_list) {
    7340           0 :                         device->fs_info = fs_info;
    7341           0 :                         ret = btrfs_get_dev_zone_info(device, false);
    7342           0 :                         if (ret)
    7343             :                                 break;
    7344             :                 }
    7345             : 
    7346           0 :                 seed_devs->fs_info = fs_info;
    7347             :         }
    7348        3215 :         mutex_unlock(&fs_devices->device_list_mutex);
    7349             : 
    7350        3215 :         return ret;
    7351             : }
    7352             : 
    7353             : static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
    7354             :                                  const struct btrfs_dev_stats_item *ptr,
    7355             :                                  int index)
    7356             : {
    7357       11155 :         u64 val;
    7358             : 
    7359       11155 :         read_extent_buffer(eb, &val,
    7360             :                            offsetof(struct btrfs_dev_stats_item, values) +
    7361       11155 :                             ((unsigned long)ptr) + (index * sizeof(u64)),
    7362             :                            sizeof(val));
    7363       11155 :         return val;
    7364             : }
    7365             : 
    7366             : static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
    7367             :                                       struct btrfs_dev_stats_item *ptr,
    7368             :                                       int index, u64 val)
    7369             : {
    7370       15945 :         write_extent_buffer(eb, &val,
    7371             :                             offsetof(struct btrfs_dev_stats_item, values) +
    7372       15945 :                              ((unsigned long)ptr) + (index * sizeof(u64)),
    7373             :                             sizeof(val));
    7374             : }
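
btrfs_dev_stats_value() and btrfs_set_dev_stats_value() locate values[index] inside the on-disk item by adding the item's offset in the extent buffer, the offset of the values array within the item, and index * sizeof(u64). A userspace sketch of the same arithmetic over a hypothetical packed struct (the extra header field is invented, purely to keep offsetof from being zero):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical packed item with a header followed by an array of 64-bit
     * counters; btrfs_dev_stats_item itself holds only the counters. */
    struct fake_stats_item {
            uint32_t header;
            uint64_t values[5];
    } __attribute__((packed));

    int main(void)
    {
            unsigned long item_start = 4096;        /* item offset in a buffer */
            int index = 3;

            /* Same arithmetic as the accessors above. */
            unsigned long off = item_start +
                                offsetof(struct fake_stats_item, values) +
                                index * sizeof(uint64_t);

            printf("%lu\n", off);                   /* 4096 + 4 + 24 = 4124 */
            return 0;
    }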
    7375             : 
    7376        3216 : static int btrfs_device_init_dev_stats(struct btrfs_device *device,
    7377             :                                        struct btrfs_path *path)
    7378             : {
    7379        3216 :         struct btrfs_dev_stats_item *ptr;
    7380        3216 :         struct extent_buffer *eb;
    7381        3216 :         struct btrfs_key key;
    7382        3216 :         int item_size;
    7383        3216 :         int i, ret, slot;
    7384             : 
    7385        3216 :         if (!device->fs_info->dev_root)
    7386             :                 return 0;
    7387             : 
    7388        3216 :         key.objectid = BTRFS_DEV_STATS_OBJECTID;
    7389        3216 :         key.type = BTRFS_PERSISTENT_ITEM_KEY;
    7390        3216 :         key.offset = device->devid;
    7391        3216 :         ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
    7392        3216 :         if (ret) {
    7393        5910 :                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
    7394        4925 :                         btrfs_dev_stat_set(device, i, 0);
    7395         985 :                 device->dev_stats_valid = 1;
    7396         985 :                 btrfs_release_path(path);
    7397         985 :                 return ret < 0 ? ret : 0;
    7398             :         }
    7399        2231 :         slot = path->slots[0];
    7400        2231 :         eb = path->nodes[0];
    7401        2231 :         item_size = btrfs_item_size(eb, slot);
    7402             : 
    7403        2231 :         ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);
    7404             : 
    7405       13386 :         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
    7406       11155 :                 if (item_size >= (1 + i) * sizeof(__le64))
    7407       11155 :                         btrfs_dev_stat_set(device, i,
    7408             :                                            btrfs_dev_stats_value(eb, ptr, i));
    7409             :                 else
    7410           0 :                         btrfs_dev_stat_set(device, i, 0);
    7411             :         }
    7412             : 
    7413        2231 :         device->dev_stats_valid = 1;
    7414        2231 :         btrfs_dev_stat_print_on_load(device);
    7415        2231 :         btrfs_release_path(path);
    7416             : 
    7417        2231 :         return 0;
    7418             : }
    7419             : 
    7420        3215 : int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
    7421             : {
    7422        3215 :         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
    7423        3215 :         struct btrfs_device *device;
    7424        3215 :         struct btrfs_path *path = NULL;
    7425        3215 :         int ret = 0;
    7426             : 
    7427        3215 :         path = btrfs_alloc_path();
    7428        3215 :         if (!path)
    7429             :                 return -ENOMEM;
    7430             : 
    7431        3215 :         mutex_lock(&fs_devices->device_list_mutex);
    7432        6431 :         list_for_each_entry(device, &fs_devices->devices, dev_list) {
    7433        3216 :                 ret = btrfs_device_init_dev_stats(device, path);
    7434        3216 :                 if (ret)
    7435           0 :                         goto out;
    7436             :         }
    7437        3215 :         list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
    7438           0 :                 list_for_each_entry(device, &seed_devs->devices, dev_list) {
    7439           0 :                         ret = btrfs_device_init_dev_stats(device, path);
    7440           0 :                         if (ret)
    7441           0 :                                 goto out;
    7442             :                 }
    7443             :         }
    7444        3215 : out:
    7445        3215 :         mutex_unlock(&fs_devices->device_list_mutex);
    7446             : 
    7447        3215 :         btrfs_free_path(path);
    7448        3215 :         return ret;
    7449             : }
    7450             : 
    7451        3189 : static int update_dev_stat_item(struct btrfs_trans_handle *trans,
    7452             :                                 struct btrfs_device *device)
    7453             : {
    7454        3189 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    7455        3189 :         struct btrfs_root *dev_root = fs_info->dev_root;
    7456        3189 :         struct btrfs_path *path;
    7457        3189 :         struct btrfs_key key;
    7458        3189 :         struct extent_buffer *eb;
    7459        3189 :         struct btrfs_dev_stats_item *ptr;
    7460        3189 :         int ret;
    7461        3189 :         int i;
    7462             : 
    7463        3189 :         key.objectid = BTRFS_DEV_STATS_OBJECTID;
    7464        3189 :         key.type = BTRFS_PERSISTENT_ITEM_KEY;
    7465        3189 :         key.offset = device->devid;
    7466             : 
    7467        3189 :         path = btrfs_alloc_path();
    7468        3189 :         if (!path)
    7469             :                 return -ENOMEM;
    7470        3189 :         ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
    7471        3189 :         if (ret < 0) {
    7472           0 :                 btrfs_warn_in_rcu(fs_info,
    7473             :                         "error %d while searching for dev_stats item for device %s",
    7474             :                                   ret, btrfs_dev_name(device));
    7475           0 :                 goto out;
    7476             :         }
    7477             : 
    7478        5399 :         if (ret == 0 &&
    7479        2210 :             btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
    7480             :                 /* need to delete old one and insert a new one */
    7481           0 :                 ret = btrfs_del_item(trans, dev_root, path);
    7482           0 :                 if (ret != 0) {
    7483           0 :                         btrfs_warn_in_rcu(fs_info,
    7484             :                                 "delete too small dev_stats item for device %s failed %d",
    7485             :                                           btrfs_dev_name(device), ret);
    7486           0 :                         goto out;
    7487             :                 }
    7488             :                 ret = 1;
    7489             :         }
    7490             : 
    7491        3189 :         if (ret == 1) {
    7492             :                 /* need to insert a new item */
    7493         979 :                 btrfs_release_path(path);
    7494         979 :                 ret = btrfs_insert_empty_item(trans, dev_root, path,
    7495             :                                               &key, sizeof(*ptr));
    7496         979 :                 if (ret < 0) {
    7497           0 :                         btrfs_warn_in_rcu(fs_info,
    7498             :                                 "insert dev_stats item for device %s failed %d",
    7499             :                                 btrfs_dev_name(device), ret);
    7500           0 :                         goto out;
    7501             :                 }
    7502             :         }
    7503             : 
    7504        3189 :         eb = path->nodes[0];
    7505        3189 :         ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
    7506       19134 :         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
    7507       15945 :                 btrfs_set_dev_stats_value(eb, ptr, i,
    7508             :                                           btrfs_dev_stat_read(device, i));
    7509        3189 :         btrfs_mark_buffer_dirty(eb);
    7510             : 
    7511        3189 : out:
    7512        3189 :         btrfs_free_path(path);
    7513        3189 :         return ret;
    7514             : }
    7515             : 
    7516             : /*
    7517             :  * called from commit_transaction. Writes all changed device stats to disk.
    7518             :  */
    7519      203188 : int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
    7520             : {
    7521      203188 :         struct btrfs_fs_info *fs_info = trans->fs_info;
    7522      203188 :         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
    7523      203188 :         struct btrfs_device *device;
    7524      203188 :         int stats_cnt;
    7525      203188 :         int ret = 0;
    7526             : 
    7527      203188 :         mutex_lock(&fs_devices->device_list_mutex);
    7528      406376 :         list_for_each_entry(device, &fs_devices->devices, dev_list) {
    7529      203188 :                 stats_cnt = atomic_read(&device->dev_stats_ccnt);
    7530      203188 :                 if (!device->dev_stats_valid || stats_cnt == 0)
    7531      199999 :                         continue;
    7532             : 
    7533             : 
    7534             :                 /*
    7535             :                  * There is a LOAD-LOAD control dependency between the value of
    7536             :                  * dev_stats_ccnt and updating the on-disk values which requires
    7537             :                  * reading the in-memory counters. Such control dependencies
    7538             :                  * require explicit read memory barriers.
    7539             :                  *
    7540             :                  * This memory barrier pairs with smp_mb__before_atomic in
    7541             :                  * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
    7542             :                  * barrier implied by atomic_xchg in
    7543             :                  * btrfs_dev_stats_read_and_reset.
    7544             :                  */
    7545        3189 :                 smp_rmb();
    7546             : 
    7547        3189 :                 ret = update_dev_stat_item(trans, device);
    7548        3189 :                 if (!ret)
    7549        3189 :                         atomic_sub(stats_cnt, &device->dev_stats_ccnt);
    7550             :         }
    7551      203188 :         mutex_unlock(&fs_devices->device_list_mutex);
    7552             : 
    7553      203188 :         return ret;
    7554             : }
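
The barrier comment above describes a publish/consume pattern: writers update a stat value and only then bump the change counter, while the reader loads the counter first and only afterwards reads the values it is about to persist. Below is a rough, single-threaded demo of the same ordering expressed with C11 atomics; it is an analogy only, the kernel uses its own primitives (smp_mb__before_atomic(), smp_rmb()), not these:

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    /* The writer publishes a new stat value before bumping the change
     * counter; the reader's acquire load of the counter orders its later
     * reads of the value. */
    static _Atomic uint64_t stat_value;
    static _Atomic int change_count;

    static void writer_bump(uint64_t v)
    {
            atomic_store_explicit(&stat_value, v, memory_order_relaxed);
            atomic_fetch_add_explicit(&change_count, 1, memory_order_release);
    }

    static void reader_flush(void)
    {
            int cnt = atomic_load_explicit(&change_count, memory_order_acquire);

            if (cnt == 0)
                    return;         /* nothing changed since the last flush */

            /* Guaranteed to observe at least the value published before the
             * counter increment we just saw. */
            printf("flushing value %llu\n",
                   (unsigned long long)atomic_load_explicit(&stat_value,
                                                            memory_order_relaxed));
            atomic_fetch_sub_explicit(&change_count, cnt, memory_order_relaxed);
    }

    int main(void)
    {
            writer_bump(42);
            reader_flush();
            return 0;
    }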
    7555             : 
    7556         226 : void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
    7557             : {
    7558         226 :         btrfs_dev_stat_inc(dev, index);
    7559             : 
    7560         226 :         if (!dev->dev_stats_valid)
    7561             :                 return;
    7562         226 :         btrfs_err_rl_in_rcu(dev->fs_info,
    7563             :                 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
    7564             :                            btrfs_dev_name(dev),
    7565             :                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
    7566             :                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
    7567             :                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
    7568             :                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
    7569             :                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
    7570             : }
    7571             : 
    7572        2231 : static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
    7573             : {
    7574        2231 :         int i;
    7575             : 
    7576       13329 :         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
    7577       11110 :                 if (btrfs_dev_stat_read(dev, i) != 0)
    7578             :                         break;
    7579        2231 :         if (i == BTRFS_DEV_STAT_VALUES_MAX)
    7580             :                 return; /* all values == 0, suppress message */
    7581             : 
    7582          12 :         btrfs_info_in_rcu(dev->fs_info,
    7583             :                 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
    7584             :                btrfs_dev_name(dev),
    7585             :                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
    7586             :                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
    7587             :                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
    7588             :                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
    7589             :                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
    7590             : }
    7591             : 
    7592           2 : int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
    7593             :                         struct btrfs_ioctl_get_dev_stats *stats)
    7594             : {
    7595           2 :         BTRFS_DEV_LOOKUP_ARGS(args);
    7596           2 :         struct btrfs_device *dev;
    7597           2 :         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
    7598           2 :         int i;
    7599             : 
    7600           2 :         mutex_lock(&fs_devices->device_list_mutex);
    7601           2 :         args.devid = stats->devid;
    7602           2 :         dev = btrfs_find_device(fs_info->fs_devices, &args);
    7603           2 :         mutex_unlock(&fs_devices->device_list_mutex);
    7604             : 
    7605           2 :         if (!dev) {
    7606           0 :                 btrfs_warn(fs_info, "get dev_stats failed, device not found");
    7607           0 :                 return -ENODEV;
    7608           2 :         } else if (!dev->dev_stats_valid) {
    7609           0 :                 btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
    7610           0 :                 return -ENODEV;
    7611           2 :         } else if (stats->flags & BTRFS_DEV_STATS_RESET) {
    7612           0 :                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
    7613           0 :                         if (stats->nr_items > i)
    7614           0 :                                 stats->values[i] =
    7615           0 :                                         btrfs_dev_stat_read_and_reset(dev, i);
    7616             :                         else
    7617           0 :                                 btrfs_dev_stat_set(dev, i, 0);
    7618             :                 }
    7619           0 :                 btrfs_info(fs_info, "device stats zeroed by %s (%d)",
    7620             :                            current->comm, task_pid_nr(current));
    7621             :         } else {
    7622          12 :                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
    7623          10 :                         if (stats->nr_items > i)
    7624          10 :                                 stats->values[i] = btrfs_dev_stat_read(dev, i);
    7625             :         }
    7626           2 :         if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
    7627           0 :                 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
    7628             :         return 0;
    7629             : }
    7630             : 
    7631             : /*
    7632             :  * Update the size and bytes used for each device where it changed.  This is
    7633             :  * delayed since we would otherwise get errors while writing out the
    7634             :  * superblocks.
    7635             :  *
    7636             :  * Must be invoked during transaction commit.
    7637             :  */
    7638      203014 : void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
    7639             : {
    7640      203014 :         struct btrfs_device *curr, *next;
    7641             : 
    7642      203014 :         ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);
    7643             : 
    7644      203014 :         if (list_empty(&trans->dev_update_list))
    7645             :                 return;
    7646             : 
    7647             :         /*
    7648             :          * We don't need the device_list_mutex here.  This list is owned by the
    7649             :          * transaction and the transaction must complete before the device is
    7650             :          * released.
    7651             :          */
    7652         969 :         mutex_lock(&trans->fs_info->chunk_mutex);
    7653        1938 :         list_for_each_entry_safe(curr, next, &trans->dev_update_list,
    7654             :                                  post_commit_list) {
    7655         969 :                 list_del_init(&curr->post_commit_list);
    7656         969 :                 curr->commit_total_bytes = curr->disk_total_bytes;
    7657         969 :                 curr->commit_bytes_used = curr->bytes_used;
    7658             :         }
    7659         969 :         mutex_unlock(&trans->fs_info->chunk_mutex);
    7660             : }
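                     : 
                     : /*
                     :  * Illustrative sketch of the producer side, assuming the pattern used by
                     :  * the grow/shrink paths: size fields are updated under chunk_mutex and
                     :  * the device is queued on the transaction's dev_update_list so that
                     :  * btrfs_commit_device_sizes() above copies the new values into the
                     :  * commit_* fields at commit time.  The setter is assumed to be the one
                     :  * generated in volumes.h.
                     :  */
                     : static void queue_device_size_update_sketch(struct btrfs_trans_handle *trans,
                     :                                             struct btrfs_device *device,
                     :                                             u64 new_disk_total_bytes)
                     : {
                     :         struct btrfs_fs_info *fs_info = device->fs_info;
                     : 
                     :         mutex_lock(&fs_info->chunk_mutex);
                     :         btrfs_device_set_disk_total_bytes(device, new_disk_total_bytes);
                     :         /* Picked up by btrfs_commit_device_sizes() during commit. */
                     :         if (list_empty(&device->post_commit_list))
                     :                 list_add_tail(&device->post_commit_list,
                     :                               &trans->transaction->dev_update_list);
                     :         mutex_unlock(&fs_info->chunk_mutex);
                     : }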
    7661             : 
    7662             : /*
    7663             :  * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
    7664             :  */
    7665    92353624 : int btrfs_bg_type_to_factor(u64 flags)
    7666             : {
    7667    92353624 :         const int index = btrfs_bg_flags_to_raid_index(flags);
    7668             : 
    7669    92353624 :         return btrfs_raid_array[index].ncopies;
    7670             : }
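                     : 
                     : /*
                     :  * Example: the factor is simply ncopies, e.g. RAID1, RAID10 and DUP give
                     :  * 2, RAID1C3 gives 3, SINGLE and RAID0 give 1 (parity profiles are not
                     :  * covered, their ncopies is 1).  A typical, illustrative use is turning
                     :  * raw device bytes into logical bytes:
                     :  */
                     : static u64 raw_to_logical_bytes_sketch(u64 raw_bytes, u64 bg_flags)
                     : {
                     :         return div_u64(raw_bytes, btrfs_bg_type_to_factor(bg_flags));
                     : }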
    7671             : 
    7674       32027 : static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
    7675             :                                  u64 chunk_offset, u64 devid,
    7676             :                                  u64 physical_offset, u64 physical_len)
    7677             : {
    7678       32027 :         struct btrfs_dev_lookup_args args = { .devid = devid };
    7679       32027 :         struct extent_map_tree *em_tree = &fs_info->mapping_tree;
    7680       32027 :         struct extent_map *em;
    7681       32027 :         struct map_lookup *map;
    7682       32027 :         struct btrfs_device *dev;
    7683       32027 :         u64 stripe_len;
    7684       32027 :         bool found = false;
    7685       32027 :         int ret = 0;
    7686       32027 :         int i;
    7687             : 
    7688       32027 :         read_lock(&em_tree->lock);
    7689       32027 :         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
    7690       32027 :         read_unlock(&em_tree->lock);
    7691             : 
    7692       32027 :         if (!em) {
    7693           0 :                 btrfs_err(fs_info,
    7694             : "dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
    7695             :                           physical_offset, devid);
    7696           0 :                 ret = -EUCLEAN;
    7697           0 :                 goto out;
    7698             :         }
    7699             : 
    7700       32027 :         map = em->map_lookup;
    7701       32027 :         stripe_len = btrfs_calc_stripe_length(em);
    7702       32027 :         if (physical_len != stripe_len) {
    7703           0 :                 btrfs_err(fs_info,
    7704             : "dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
    7705             :                           physical_offset, devid, em->start, physical_len,
    7706             :                           stripe_len);
    7707           0 :                 ret = -EUCLEAN;
    7708           0 :                 goto out;
    7709             :         }
    7710             : 
    7711             :         /*
    7712             :          * Very old mkfs.btrfs (before v4.1) will not respect the reserved
    7713             :          * space. Although the kernel can handle it without problems, it is
    7714             :          * better to warn the users.
    7715             :          */
    7716       32027 :         if (physical_offset < BTRFS_DEVICE_RANGE_RESERVED)
    7717           0 :                 btrfs_warn(fs_info,
    7718             :                 "devid %llu physical %llu len %llu inside the reserved space",
    7719             :                            devid, physical_offset, physical_len);
    7720             : 
    7721       39083 :         for (i = 0; i < map->num_stripes; i++) {
    7722       39083 :                 if (map->stripes[i].dev->devid == devid &&
    7723       39080 :                     map->stripes[i].physical == physical_offset) {
    7724       32027 :                         found = true;
    7725       32027 :                         if (map->verified_stripes >= map->num_stripes) {
    7726           0 :                                 btrfs_err(fs_info,
    7727             :                                 "too many dev extents for chunk %llu found",
    7728             :                                           em->start);
    7729           0 :                                 ret = -EUCLEAN;
    7730           0 :                                 goto out;
    7731             :                         }
    7732       32027 :                         map->verified_stripes++;
    7733       32027 :                         break;
    7734             :                 }
    7735             :         }
    7736       32027 :         if (!found) {
    7737           0 :                 btrfs_err(fs_info,
    7738             :         "dev extent physical offset %llu devid %llu has no corresponding chunk",
    7739             :                         physical_offset, devid);
    7740           0 :                 ret = -EUCLEAN;
    7741             :         }
    7742             : 
    7743             :         /* Make sure no dev extent is beyond device boundary */
    7744       32027 :         dev = btrfs_find_device(fs_info->fs_devices, &args);
    7745       32027 :         if (!dev) {
    7746           0 :                 btrfs_err(fs_info, "failed to find devid %llu", devid);
    7747           0 :                 ret = -EUCLEAN;
    7748           0 :                 goto out;
    7749             :         }
    7750             : 
    7751       32027 :         if (physical_offset + physical_len > dev->disk_total_bytes) {
    7752           0 :                 btrfs_err(fs_info,
    7753             : "dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
    7754             :                           devid, physical_offset, physical_len,
    7755             :                           dev->disk_total_bytes);
    7756           0 :                 ret = -EUCLEAN;
    7757           0 :                 goto out;
    7758             :         }
    7759             : 
    7760       32027 :         if (dev->zone_info) {
    7761           0 :                 u64 zone_size = dev->zone_info->zone_size;
    7762             : 
    7763           0 :                 if (!IS_ALIGNED(physical_offset, zone_size) ||
    7764           0 :                     !IS_ALIGNED(physical_len, zone_size)) {
    7765           0 :                         btrfs_err(fs_info,
    7766             : "zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
    7767             :                                   devid, physical_offset, physical_len);
    7768           0 :                         ret = -EUCLEAN;
    7769           0 :                         goto out;
    7770             :                 }
    7771             :         }
    7772             : 
    7773       32027 : out:
    7774       32027 :         free_extent_map(em);
    7775       32027 :         return ret;
    7776             : }
    7777             : 
    7778        3215 : static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
    7779             : {
    7780        3215 :         struct extent_map_tree *em_tree = &fs_info->mapping_tree;
    7781        3215 :         struct extent_map *em;
    7782        3215 :         struct rb_node *node;
    7783        3215 :         int ret = 0;
    7784             : 
    7785        3215 :         read_lock(&em_tree->lock);
    7786       28186 :         for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
    7787       24971 :                 em = rb_entry(node, struct extent_map, rb_node);
    7788       24971 :                 if (em->map_lookup->num_stripes !=
    7789       24971 :                     em->map_lookup->verified_stripes) {
    7790           0 :                         btrfs_err(fs_info,
    7791             :                         "chunk %llu has missing dev extent, have %d expect %d",
    7792             :                                   em->start, em->map_lookup->verified_stripes,
    7793             :                                   em->map_lookup->num_stripes);
    7794           0 :                         ret = -EUCLEAN;
    7795           0 :                         goto out;
    7796             :                 }
    7797             :         }
    7798        3215 : out:
    7799        3215 :         read_unlock(&em_tree->lock);
    7800        3215 :         return ret;
    7801             : }
    7802             : 
    7803             : /*
    7804             :  * Ensure that all dev extents are mapped to the correct chunk, otherwise
    7805             :  * later chunk allocation/free would cause unexpected behavior.
    7806             :  *
    7807             :  * NOTE: This will iterate through the whole device tree, which should be
    7808             :  * about the same size as the chunk tree.  This slightly increases mount time.
    7809             :  */
    7810        3215 : int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
    7811             : {
    7812        3215 :         struct btrfs_path *path;
    7813        3215 :         struct btrfs_root *root = fs_info->dev_root;
    7814        3215 :         struct btrfs_key key;
    7815        3215 :         u64 prev_devid = 0;
    7816        3215 :         u64 prev_dev_ext_end = 0;
    7817        3215 :         int ret = 0;
    7818             : 
    7819             :         /*
    7820             :          * We don't have a dev_root because we mounted with ignorebadroots and
    7821             :          * failed to load the root, so we want to skip the verification in this
    7822             :          * case for sure.
    7823             :          *
    7824             :  * However, if the dev root is fine but the tree itself is corrupted,
    7825             :  * we'd still fail to mount.  This verification is only to make sure
    7826             :          * writes can happen safely, so instead just bypass this check
    7827             :          * completely in the case of IGNOREBADROOTS.
    7828             :          */
    7829        3215 :         if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
    7830             :                 return 0;
    7831             : 
    7832        3215 :         key.objectid = 1;
    7833        3215 :         key.type = BTRFS_DEV_EXTENT_KEY;
    7834        3215 :         key.offset = 0;
    7835             : 
    7836        3215 :         path = btrfs_alloc_path();
    7837        3215 :         if (!path)
    7838             :                 return -ENOMEM;
    7839             : 
    7840        3215 :         path->reada = READA_FORWARD;
    7841        3215 :         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
    7842        3215 :         if (ret < 0)
    7843           0 :                 goto out;
    7844             : 
    7845        3215 :         if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
    7846           0 :                 ret = btrfs_next_leaf(root, path);
    7847           0 :                 if (ret < 0)
    7848           0 :                         goto out;
    7849             :                 /* No dev extents at all? Not good */
    7850           0 :                 if (ret > 0) {
    7851           0 :                         ret = -EUCLEAN;
    7852           0 :                         goto out;
    7853             :                 }
    7854             :         }
    7855       32027 :         while (1) {
    7856       32027 :                 struct extent_buffer *leaf = path->nodes[0];
    7857       32027 :                 struct btrfs_dev_extent *dext;
    7858       32027 :                 int slot = path->slots[0];
    7859       32027 :                 u64 chunk_offset;
    7860       32027 :                 u64 physical_offset;
    7861       32027 :                 u64 physical_len;
    7862       32027 :                 u64 devid;
    7863             : 
    7864       32027 :                 btrfs_item_key_to_cpu(leaf, &key, slot);
    7865       32027 :                 if (key.type != BTRFS_DEV_EXTENT_KEY)
    7866             :                         break;
    7867       32027 :                 devid = key.objectid;
    7868       32027 :                 physical_offset = key.offset;
    7869             : 
    7870       32027 :                 dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
    7871       32027 :                 chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
    7872       32027 :                 physical_len = btrfs_dev_extent_length(leaf, dext);
    7873             : 
    7874             :                 /* Check if this dev extent overlaps with the previous one */
    7875       32027 :                 if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
    7876           0 :                         btrfs_err(fs_info,
    7877             : "dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
    7878             :                                   devid, physical_offset, prev_dev_ext_end);
    7879           0 :                         ret = -EUCLEAN;
    7880           0 :                         goto out;
    7881             :                 }
    7882             : 
    7883       32027 :                 ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
    7884             :                                             physical_offset, physical_len);
    7885       32027 :                 if (ret < 0)
    7886           0 :                         goto out;
    7887       32027 :                 prev_devid = devid;
    7888       32027 :                 prev_dev_ext_end = physical_offset + physical_len;
    7889             : 
    7890       32027 :                 ret = btrfs_next_item(root, path);
    7891       32027 :                 if (ret < 0)
    7892           0 :                         goto out;
    7893       32027 :                 if (ret > 0) {
    7894             :                         ret = 0;
    7895             :                         break;
    7896             :                 }
    7897             :         }
    7898             : 
    7899             :         /* Ensure all chunks have corresponding dev extents */
    7900        3215 :         ret = verify_chunk_dev_extent_mapping(fs_info);
    7901        3215 : out:
    7902        3215 :         btrfs_free_path(path);
    7903        3215 :         return ret;
    7904             : }
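                     : 
                     : /*
                     :  * Worked example (illustrative offsets): dev extent items are keyed as
                     :  * (devid, BTRFS_DEV_EXTENT_KEY, physical_offset), so the search above
                     :  * starting at (1, DEV_EXTENT, 0) walks them ordered by devid and then by
                     :  * physical offset, which is what makes the simple prev_dev_ext_end
                     :  * overlap check sufficient.  For a two-device RAID1 chunk at logical
                     :  * 1073741824 with num_stripes == 2 the expected layout would be e.g.:
                     :  *
                     :  *   key (1 DEV_EXTENT 136708096)  chunk 1073741824  length 1073741824
                     :  *   key (2 DEV_EXTENT 136708096)  chunk 1073741824  length 1073741824
                     :  *
                     :  * verify_one_dev_extent() matches each item to the chunk stripe with the
                     :  * same (devid, physical) pair and bumps verified_stripes, and
                     :  * verify_chunk_dev_extent_mapping() then requires verified_stripes ==
                     :  * num_stripes for every chunk.
                     :  */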
    7905             : 
    7906             : /*
    7907             :  * Check whether the given block group or device is pinned by any inode being
    7908             :  * used as a swapfile.
    7909             :  */
    7910         523 : bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
    7911             : {
    7912         523 :         struct btrfs_swapfile_pin *sp;
    7913         523 :         struct rb_node *node;
    7914             : 
    7915         523 :         spin_lock(&fs_info->swapfile_pins_lock);
    7916         523 :         node = fs_info->swapfile_pins.rb_node;
    7917         545 :         while (node) {
    7918          24 :                 sp = rb_entry(node, struct btrfs_swapfile_pin, node);
    7919          24 :                 if (ptr < sp->ptr)
    7920          14 :                         node = node->rb_left;
    7921          10 :                 else if (ptr > sp->ptr)
    7922           8 :                         node = node->rb_right;
    7923             :                 else
    7924             :                         break;
    7925             :         }
    7926         523 :         spin_unlock(&fs_info->swapfile_pins_lock);
    7927         523 :         return node != NULL;
    7928             : }
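                     : 
                     : /*
                     :  * Illustrative caller sketch (not the exact kernel call sites): the
                     :  * pinned ptr is either a struct btrfs_block_group or a struct
                     :  * btrfs_device, which is why the lookup above compares raw pointers.
                     :  * Callers typically refuse operations that would move extents under an
                     :  * active swapfile, e.g.:
                     :  */
                     : static int relocate_if_not_pinned_sketch(struct btrfs_fs_info *fs_info,
                     :                                          struct btrfs_block_group *bg)
                     : {
                     :         if (btrfs_pinned_by_swapfile(fs_info, bg)) {
                     :                 btrfs_warn(fs_info,
                     :         "cannot relocate block group %llu, it is pinned by an active swapfile",
                     :                            bg->start);
                     :                 return -ETXTBSY;
                     :         }
                     :         /* ... proceed with relocation ... */
                     :         return 0;
                     : }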
    7929             : 
    7930           0 : static int relocating_repair_kthread(void *data)
    7931             : {
    7932           0 :         struct btrfs_block_group *cache = data;
    7933           0 :         struct btrfs_fs_info *fs_info = cache->fs_info;
    7934           0 :         u64 target;
    7935           0 :         int ret = 0;
    7936             : 
    7937           0 :         target = cache->start;
    7938           0 :         btrfs_put_block_group(cache);
    7939             : 
    7940           0 :         sb_start_write(fs_info->sb);
    7941           0 :         if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
    7942           0 :                 btrfs_info(fs_info,
    7943             :                            "zoned: skip relocating block group %llu to repair: EBUSY",
    7944             :                            target);
    7945           0 :                 sb_end_write(fs_info->sb);
    7946           0 :                 return -EBUSY;
    7947             :         }
    7948             : 
    7949           0 :         mutex_lock(&fs_info->reclaim_bgs_lock);
    7950             : 
    7951             :         /* Ensure block group still exists */
    7952           0 :         cache = btrfs_lookup_block_group(fs_info, target);
    7953           0 :         if (!cache)
    7954           0 :                 goto out;
    7955             : 
    7956           0 :         if (!test_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags))
    7957           0 :                 goto out;
    7958             : 
    7959           0 :         ret = btrfs_may_alloc_data_chunk(fs_info, target);
    7960           0 :         if (ret < 0)
    7961           0 :                 goto out;
    7962             : 
    7963           0 :         btrfs_info(fs_info,
    7964             :                    "zoned: relocating block group %llu to repair IO failure",
    7965             :                    target);
    7966           0 :         ret = btrfs_relocate_chunk(fs_info, target);
    7967             : 
    7968           0 : out:
    7969           0 :         if (cache)
    7970           0 :                 btrfs_put_block_group(cache);
    7971           0 :         mutex_unlock(&fs_info->reclaim_bgs_lock);
    7972           0 :         btrfs_exclop_finish(fs_info);
    7973           0 :         sb_end_write(fs_info->sb);
    7974             : 
    7975           0 :         return ret;
    7976             : }
    7977             : 
    7978           0 : bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
    7979             : {
    7980           0 :         struct btrfs_block_group *cache;
    7981             : 
    7982           0 :         if (!btrfs_is_zoned(fs_info))
    7983             :                 return false;
    7984             : 
    7985             :         /* Do not attempt to repair in degraded state */
    7986           0 :         if (btrfs_test_opt(fs_info, DEGRADED))
    7987             :                 return true;
    7988             : 
    7989           0 :         cache = btrfs_lookup_block_group(fs_info, logical);
    7990           0 :         if (!cache)
    7991             :                 return true;
    7992             : 
    7993           0 :         if (test_and_set_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags)) {
    7994           0 :                 btrfs_put_block_group(cache);
    7995           0 :                 return true;
    7996             :         }
    7997             : 
    7998           0 :         kthread_run(relocating_repair_kthread, cache,
    7999             :                     "btrfs-relocating-repair");
    8000             : 
    8001             :         return true;
    8002             : }
    8003             : 
    8004           0 : static void map_raid56_repair_block(struct btrfs_io_context *bioc,
    8005             :                                     struct btrfs_io_stripe *smap,
    8006             :                                     u64 logical)
    8007             : {
    8008           0 :         int data_stripes = nr_bioc_data_stripes(bioc);
    8009           0 :         int i;
    8010             : 
    8011           0 :         for (i = 0; i < data_stripes; i++) {
    8012           0 :                 u64 stripe_start = bioc->full_stripe_logical +
    8013             :                                    btrfs_stripe_nr_to_offset(i);
    8014             : 
    8015           0 :                 if (logical >= stripe_start &&
    8016           0 :                     logical < stripe_start + BTRFS_STRIPE_LEN)
    8017             :                         break;
    8018             :         }
    8019           0 :         ASSERT(i < data_stripes);
    8020           0 :         smap->dev = bioc->stripes[i].dev;
    8021           0 :         smap->physical = bioc->stripes[i].physical +
    8022           0 :                         ((logical - bioc->full_stripe_logical) &
    8023             :                          BTRFS_STRIPE_LEN_MASK);
    8024           0 : }
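                     : 
                     : /*
                     :  * Worked example (illustrative numbers): with BTRFS_STRIPE_LEN == 64K,
                     :  * full_stripe_logical == 1M and logical == 1M + 80K, the loop above picks
                     :  * data stripe i == 1 (it covers [1M + 64K, 1M + 128K)), so the repair
                     :  * write goes to stripes[1].dev at
                     :  *   stripes[1].physical + (80K & BTRFS_STRIPE_LEN_MASK)
                     :  *   == stripes[1].physical + 16K.
                     :  */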
    8025             : 
    8026             : /*
    8027             :  * Map a repair write into a single device.
    8028             :  *
    8029             :  * A repair write is triggered by read-time repair or scrub, and it only
    8030             :  * updates the contents of a single device.  It does not update any other
    8031             :  * mirrors nor go through the RMW path.
    8032             :  *
    8033             :  * Callers should ensure:
    8034             :  *
    8035             :  * - Call btrfs_bio_counter_inc_blocked() first
    8036             :  * - The range does not cross stripe boundary
    8037             :  * - Has a valid @mirror_num passed in.
    8038             :  */
    8039           2 : int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
    8040             :                            struct btrfs_io_stripe *smap, u64 logical,
    8041             :                            u32 length, int mirror_num)
    8042             : {
    8043           2 :         struct btrfs_io_context *bioc = NULL;
    8044           2 :         u64 map_length = length;
    8045           2 :         int mirror_ret = mirror_num;
    8046           2 :         int ret;
    8047             : 
    8048           2 :         ASSERT(mirror_num > 0);
    8049             : 
    8050           2 :         ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length,
    8051             :                               &bioc, smap, &mirror_ret, true);
    8052           2 :         if (ret < 0)
    8053             :                 return ret;
    8054             : 
    8055             :         /* The map range should not cross stripe boundary. */
    8056           2 :         ASSERT(map_length >= length);
    8057             : 
    8058             :         /* Already mapped to single stripe. */
    8059           2 :         if (!bioc)
    8060           0 :                 goto out;
    8061             : 
    8062             :         /* Map the RAID56 multi-stripe writes to a single one. */
    8063           2 :         if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
    8064           0 :                 map_raid56_repair_block(bioc, smap, logical);
    8065           0 :                 goto out;
    8066             :         }
    8067             : 
    8068           2 :         ASSERT(mirror_num <= bioc->num_stripes);
    8069           2 :         smap->dev = bioc->stripes[mirror_num - 1].dev;
    8070           2 :         smap->physical = bioc->stripes[mirror_num - 1].physical;
    8071           2 : out:
    8072           2 :         btrfs_put_bioc(bioc);
    8073           2 :         ASSERT(smap->dev);
    8074           2 :         return 0;
    8075             : }
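                     : 
                     : /*
                     :  * Illustrative caller sketch for the contract documented above
                     :  * btrfs_map_repair_block().  write_one_repair_sector() is a hypothetical
                     :  * stand-in for the bio submission done by the real read-repair and scrub
                     :  * paths.
                     :  */
                     : static int repair_one_block_sketch(struct btrfs_fs_info *fs_info,
                     :                                    u64 logical, u32 length, int mirror_num)
                     : {
                     :         struct btrfs_io_stripe smap = { 0 };
                     :         int ret;
                     : 
                     :         /* Keep dev-replace from racing with the repair write. */
                     :         btrfs_bio_counter_inc_blocked(fs_info);
                     :         ret = btrfs_map_repair_block(fs_info, &smap, logical, length,
                     :                                      mirror_num);
                     :         if (ret < 0)
                     :                 goto out;
                     : 
                     :         ret = write_one_repair_sector(smap.dev, smap.physical, length);
                     : out:
                     :         btrfs_bio_counter_dec(fs_info);
                     :         return ret;
                     : }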

Generated by: LCOV version 1.14