LCOV - code coverage report
Current view: top level - fs/btrfs - dev-replace.c (source / functions) Hit Total Coverage
Test: fstests of 6.5.0-rc4-xfsx @ Mon Jul 31 20:08:34 PDT 2023 Lines: 100 606 16.5 %
Date: 2023-07-31 20:08:34 Functions: 9 20 45.0 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0
       2             : /*
       3             :  * Copyright (C) STRATO AG 2012.  All rights reserved.
       4             :  */
       5             : 
       6             : #include <linux/sched.h>
       7             : #include <linux/bio.h>
       8             : #include <linux/slab.h>
       9             : #include <linux/blkdev.h>
      10             : #include <linux/kthread.h>
      11             : #include <linux/math64.h>
      12             : #include "misc.h"
      13             : #include "ctree.h"
      14             : #include "extent_map.h"
      15             : #include "disk-io.h"
      16             : #include "transaction.h"
      17             : #include "print-tree.h"
      18             : #include "volumes.h"
      19             : #include "async-thread.h"
      20             : #include "check-integrity.h"
      21             : #include "dev-replace.h"
      22             : #include "sysfs.h"
      23             : #include "zoned.h"
      24             : #include "block-group.h"
      25             : #include "fs.h"
      26             : #include "accessors.h"
      27             : #include "scrub.h"
      28             : 
      29             : /*
      30             :  * Device replace overview
      31             :  *
      32             :  * [Objective]
      33             :  * To copy all extents (both new and on-disk) from source device to target
      34             :  * device, while still keeping the filesystem read-write.
      35             :  *
      36             :  * [Method]
      37             :  * There are two main methods involved:
      38             :  *
      39             :  * - Write duplication
      40             :  *
      41             :  *   All new writes will be written to both target and source devices, so even
      42             :  *   if replace gets canceled, source device still contains up-to-date data.
      43             :  *
      44             :  *   Location:          handle_ops_on_dev_replace() from btrfs_map_block()
      45             :  *   Start:             btrfs_dev_replace_start()
      46             :  *   End:               btrfs_dev_replace_finishing()
      47             :  *   Content:           Latest data/metadata
      48             :  *
      49             :  * - Copy existing extents
      50             :  *
      51             :  *   This happens by re-using scrub facility, as scrub also iterates through
      52             :  *   existing extents from commit root.
      53             :  *
      54             :  *   Location:          scrub_write_block_to_dev_replace() from
      55             :  *                      scrub_block_complete()
      56             :  *   Content:           Data/meta from commit root.
      57             :  *
      58             :  * Due to the content difference, we need to avoid nocow write when dev-replace
      59             :  * is happening.  This is done by marking the block group read-only and waiting
      60             :  * for NOCOW writes.
      61             :  *
      62             :  * After replace is done, the finishing part is done by swapping the target and
      63             :  * source devices.
      64             :  *
      65             :  *   Location:          btrfs_dev_replace_update_device_in_mapping_tree() from
      66             :  *                      btrfs_dev_replace_finishing()
      67             :  */
      68             : 
      69             : static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
      70             :                                        int scrub_ret);
      71             : static int btrfs_dev_replace_kthread(void *data);
      72             : 
      73        3217 : int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
      74             : {
      75        3217 :         struct btrfs_dev_lookup_args args = { .devid = BTRFS_DEV_REPLACE_DEVID };
      76        3217 :         struct btrfs_key key;
      77        3217 :         struct btrfs_root *dev_root = fs_info->dev_root;
      78        3217 :         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
      79        3217 :         struct extent_buffer *eb;
      80        3217 :         int slot;
      81        3217 :         int ret = 0;
      82        3217 :         struct btrfs_path *path = NULL;
      83        3217 :         int item_size;
      84        3217 :         struct btrfs_dev_replace_item *ptr;
      85        3217 :         u64 src_devid;
      86             : 
      87        3217 :         if (!dev_root)
      88             :                 return 0;
      89             : 
      90        3217 :         path = btrfs_alloc_path();
      91        3217 :         if (!path) {
      92           0 :                 ret = -ENOMEM;
      93           0 :                 goto out;
      94             :         }
      95             : 
      96        3217 :         key.objectid = 0;
      97        3217 :         key.type = BTRFS_DEV_REPLACE_KEY;
      98        3217 :         key.offset = 0;
      99        3217 :         ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
     100        3217 :         if (ret) {
     101        3217 : no_valid_dev_replace_entry_found:
     102             :                 /*
     103             :                  * We don't have a replace item or it's corrupted.  If there is
     104             :                  * a replace target, fail the mount.
     105             :                  */
     106        3217 :                 if (btrfs_find_device(fs_info->fs_devices, &args)) {
     107           0 :                         btrfs_err(fs_info,
     108             :                         "found replace target device without a valid replace item");
     109           0 :                         ret = -EUCLEAN;
     110           0 :                         goto out;
     111             :                 }
     112        3217 :                 ret = 0;
     113        3217 :                 dev_replace->replace_state =
     114             :                         BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
     115        3217 :                 dev_replace->cont_reading_from_srcdev_mode =
     116             :                     BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
     117        3217 :                 dev_replace->time_started = 0;
     118        3217 :                 dev_replace->time_stopped = 0;
     119        3217 :                 atomic64_set(&dev_replace->num_write_errors, 0);
     120        3217 :                 atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
     121        3217 :                 dev_replace->cursor_left = 0;
     122        3217 :                 dev_replace->committed_cursor_left = 0;
     123        3217 :                 dev_replace->cursor_left_last_write_of_item = 0;
     124        3217 :                 dev_replace->cursor_right = 0;
     125        3217 :                 dev_replace->srcdev = NULL;
     126        3217 :                 dev_replace->tgtdev = NULL;
     127        3217 :                 dev_replace->is_valid = 0;
     128        3217 :                 dev_replace->item_needs_writeback = 0;
     129        3217 :                 goto out;
     130             :         }
     131           0 :         slot = path->slots[0];
     132           0 :         eb = path->nodes[0];
     133           0 :         item_size = btrfs_item_size(eb, slot);
     134           0 :         ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
     135             : 
     136           0 :         if (item_size != sizeof(struct btrfs_dev_replace_item)) {
     137           0 :                 btrfs_warn(fs_info,
     138             :                         "dev_replace entry found has unexpected size, ignore entry");
     139           0 :                 goto no_valid_dev_replace_entry_found;
     140             :         }
     141             : 
     142           0 :         src_devid = btrfs_dev_replace_src_devid(eb, ptr);
     143           0 :         dev_replace->cont_reading_from_srcdev_mode =
     144             :                 btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr);
     145           0 :         dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr);
     146           0 :         dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
     147           0 :         dev_replace->time_stopped =
     148           0 :                 btrfs_dev_replace_time_stopped(eb, ptr);
     149           0 :         atomic64_set(&dev_replace->num_write_errors,
     150             :                      btrfs_dev_replace_num_write_errors(eb, ptr));
     151           0 :         atomic64_set(&dev_replace->num_uncorrectable_read_errors,
     152             :                      btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr));
     153           0 :         dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr);
     154           0 :         dev_replace->committed_cursor_left = dev_replace->cursor_left;
     155           0 :         dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left;
     156           0 :         dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr);
     157           0 :         dev_replace->is_valid = 1;
     158             : 
     159           0 :         dev_replace->item_needs_writeback = 0;
     160           0 :         switch (dev_replace->replace_state) {
     161           0 :         case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
     162             :         case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
     163             :         case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
     164             :                 /*
     165             :                  * We don't have an active replace item but if there is a
     166             :                  * replace target, fail the mount.
     167             :                  */
     168           0 :                 if (btrfs_find_device(fs_info->fs_devices, &args)) {
     169           0 :                         btrfs_err(fs_info,
     170             : "replace without active item, run 'device scan --forget' on the target device");
     171           0 :                         ret = -EUCLEAN;
     172             :                 } else {
     173           0 :                         dev_replace->srcdev = NULL;
     174           0 :                         dev_replace->tgtdev = NULL;
     175             :                 }
     176             :                 break;
     177           0 :         case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
     178             :         case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
     179           0 :                 dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices, &args);
     180           0 :                 args.devid = src_devid;
     181           0 :                 dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices, &args);
     182             : 
     183             :                 /*
     184             :                  * allow 'btrfs dev replace_cancel' if src/tgt device is
     185             :                  * missing
     186             :                  */
     187           0 :                 if (!dev_replace->srcdev &&
     188           0 :                     !btrfs_test_opt(fs_info, DEGRADED)) {
     189           0 :                         ret = -EIO;
     190           0 :                         btrfs_warn(fs_info,
     191             :                            "cannot mount because device replace operation is ongoing and");
     192           0 :                         btrfs_warn(fs_info,
     193             :                            "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
     194             :                            src_devid);
     195             :                 }
     196           0 :                 if (!dev_replace->tgtdev &&
     197           0 :                     !btrfs_test_opt(fs_info, DEGRADED)) {
     198           0 :                         ret = -EIO;
     199           0 :                         btrfs_warn(fs_info,
     200             :                            "cannot mount because device replace operation is ongoing and");
     201           0 :                         btrfs_warn(fs_info,
     202             :                            "tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
     203             :                                 BTRFS_DEV_REPLACE_DEVID);
     204             :                 }
     205           0 :                 if (dev_replace->tgtdev) {
     206           0 :                         if (dev_replace->srcdev) {
     207           0 :                                 dev_replace->tgtdev->total_bytes =
     208           0 :                                         dev_replace->srcdev->total_bytes;
     209           0 :                                 dev_replace->tgtdev->disk_total_bytes =
     210           0 :                                         dev_replace->srcdev->disk_total_bytes;
     211           0 :                                 dev_replace->tgtdev->commit_total_bytes =
     212           0 :                                         dev_replace->srcdev->commit_total_bytes;
     213           0 :                                 dev_replace->tgtdev->bytes_used =
     214           0 :                                         dev_replace->srcdev->bytes_used;
     215           0 :                                 dev_replace->tgtdev->commit_bytes_used =
     216           0 :                                         dev_replace->srcdev->commit_bytes_used;
     217             :                         }
     218           0 :                         set_bit(BTRFS_DEV_STATE_REPLACE_TGT,
     219           0 :                                 &dev_replace->tgtdev->dev_state);
     220             : 
     221           0 :                         WARN_ON(fs_info->fs_devices->rw_devices == 0);
     222           0 :                         dev_replace->tgtdev->io_width = fs_info->sectorsize;
     223           0 :                         dev_replace->tgtdev->io_align = fs_info->sectorsize;
     224           0 :                         dev_replace->tgtdev->sector_size = fs_info->sectorsize;
     225           0 :                         dev_replace->tgtdev->fs_info = fs_info;
     226           0 :                         set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
     227           0 :                                 &dev_replace->tgtdev->dev_state);
     228             :                 }
     229             :                 break;
     230             :         }
     231             : 
     232        3217 : out:
     233        3217 :         btrfs_free_path(path);
     234        3217 :         return ret;
     235             : }
     236             : 
     237             : /*
     238             :  * Initialize a new device for device replace target from a given source dev
     239             :  * and path.
     240             :  *
     241             :  * Return 0 and new device in @device_out, otherwise return < 0
     242             :  */
     243           0 : static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
     244             :                                   const char *device_path,
     245             :                                   struct btrfs_device *srcdev,
     246             :                                   struct btrfs_device **device_out)
     247             : {
     248           0 :         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
     249           0 :         struct btrfs_device *device;
     250           0 :         struct block_device *bdev;
     251           0 :         u64 devid = BTRFS_DEV_REPLACE_DEVID;
     252           0 :         int ret = 0;
     253             : 
     254           0 :         *device_out = NULL;
     255           0 :         if (srcdev->fs_devices->seeding) {
     256           0 :                 btrfs_err(fs_info, "the filesystem is a seed filesystem!");
     257           0 :                 return -EINVAL;
     258             :         }
     259             : 
     260           0 :         bdev = blkdev_get_by_path(device_path, BLK_OPEN_WRITE,
     261             :                                   fs_info->bdev_holder, NULL);
     262           0 :         if (IS_ERR(bdev)) {
     263           0 :                 btrfs_err(fs_info, "target device %s is invalid!", device_path);
     264           0 :                 return PTR_ERR(bdev);
     265             :         }
     266             : 
     267           0 :         if (!btrfs_check_device_zone_type(fs_info, bdev)) {
     268             :                 btrfs_err(fs_info,
     269             :                 "dev-replace: zoned type of target device mismatch with filesystem");
     270             :                 ret = -EINVAL;
     271             :                 goto error;
     272             :         }
     273             : 
     274           0 :         sync_blockdev(bdev);
     275             : 
     276           0 :         list_for_each_entry(device, &fs_devices->devices, dev_list) {
     277           0 :                 if (device->bdev == bdev) {
     278           0 :                         btrfs_err(fs_info,
     279             :                                   "target device is in the filesystem!");
     280           0 :                         ret = -EEXIST;
     281           0 :                         goto error;
     282             :                 }
     283             :         }
     284             : 
     285             : 
     286           0 :         if (bdev_nr_bytes(bdev) < btrfs_device_get_total_bytes(srcdev)) {
     287           0 :                 btrfs_err(fs_info,
     288             :                           "target device is smaller than source device!");
     289           0 :                 ret = -EINVAL;
     290           0 :                 goto error;
     291             :         }
     292             : 
     293             : 
     294           0 :         device = btrfs_alloc_device(NULL, &devid, NULL, device_path);
     295           0 :         if (IS_ERR(device)) {
     296           0 :                 ret = PTR_ERR(device);
     297           0 :                 goto error;
     298             :         }
     299             : 
     300           0 :         ret = lookup_bdev(device_path, &device->devt);
     301           0 :         if (ret)
     302           0 :                 goto error;
     303             : 
     304           0 :         set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
     305           0 :         device->generation = 0;
     306           0 :         device->io_width = fs_info->sectorsize;
     307           0 :         device->io_align = fs_info->sectorsize;
     308           0 :         device->sector_size = fs_info->sectorsize;
     309           0 :         device->total_bytes = btrfs_device_get_total_bytes(srcdev);
     310           0 :         device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
     311           0 :         device->bytes_used = btrfs_device_get_bytes_used(srcdev);
     312           0 :         device->commit_total_bytes = srcdev->commit_total_bytes;
     313           0 :         device->commit_bytes_used = device->bytes_used;
     314           0 :         device->fs_info = fs_info;
     315           0 :         device->bdev = bdev;
     316           0 :         set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
     317           0 :         set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
     318           0 :         device->holder = fs_info->bdev_holder;
     319           0 :         device->dev_stats_valid = 1;
     320           0 :         set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
     321           0 :         device->fs_devices = fs_devices;
     322             : 
     323           0 :         ret = btrfs_get_dev_zone_info(device, false);
     324           0 :         if (ret)
     325             :                 goto error;
     326             : 
     327           0 :         mutex_lock(&fs_devices->device_list_mutex);
     328           0 :         list_add(&device->dev_list, &fs_devices->devices);
     329           0 :         fs_devices->num_devices++;
     330           0 :         fs_devices->open_devices++;
     331           0 :         mutex_unlock(&fs_devices->device_list_mutex);
     332             : 
     333           0 :         *device_out = device;
     334           0 :         return 0;
     335             : 
     336           0 : error:
     337           0 :         blkdev_put(bdev, fs_info->bdev_holder);
     338           0 :         return ret;
     339             : }
     340             : 
     341             : /*
     342             :  * called from commit_transaction. Writes changed device replace state to
     343             :  * disk.
     344             :  */
     345      206349 : int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
     346             : {
     347      206349 :         struct btrfs_fs_info *fs_info = trans->fs_info;
     348      206349 :         int ret;
     349      206349 :         struct btrfs_root *dev_root = fs_info->dev_root;
     350      206349 :         struct btrfs_path *path;
     351      206349 :         struct btrfs_key key;
     352      206349 :         struct extent_buffer *eb;
     353      206349 :         struct btrfs_dev_replace_item *ptr;
     354      206349 :         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
     355             : 
     356      206349 :         down_read(&dev_replace->rwsem);
     357      206349 :         if (!dev_replace->is_valid ||
     358           0 :             !dev_replace->item_needs_writeback) {
     359      206349 :                 up_read(&dev_replace->rwsem);
     360      206349 :                 return 0;
     361             :         }
     362           0 :         up_read(&dev_replace->rwsem);
     363             : 
     364           0 :         key.objectid = 0;
     365           0 :         key.type = BTRFS_DEV_REPLACE_KEY;
     366           0 :         key.offset = 0;
     367             : 
     368           0 :         path = btrfs_alloc_path();
     369           0 :         if (!path) {
     370           0 :                 ret = -ENOMEM;
     371           0 :                 goto out;
     372             :         }
     373           0 :         ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
     374           0 :         if (ret < 0) {
     375           0 :                 btrfs_warn(fs_info,
     376             :                            "error %d while searching for dev_replace item!",
     377             :                            ret);
     378           0 :                 goto out;
     379             :         }
     380             : 
     381           0 :         if (ret == 0 &&
     382           0 :             btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
     383             :                 /*
     384             :                  * need to delete old one and insert a new one.
     385             :                  * Since no attempt is made to recover any old state, if the
     386             :                  * dev_replace state is 'running', the data on the target
     387             :                  * drive is lost.
     388             :                  * It would be possible to recover the state: just make sure
     389             :                  * that the beginning of the item is never changed and always
     390             :                  * contains all the essential information. Then read this
     391             :                  * minimal set of information and use it as a base for the
     392             :                  * new state.
     393             :                  */
     394           0 :                 ret = btrfs_del_item(trans, dev_root, path);
     395           0 :                 if (ret != 0) {
     396           0 :                         btrfs_warn(fs_info,
     397             :                                    "delete too small dev_replace item failed %d!",
     398             :                                    ret);
     399           0 :                         goto out;
     400             :                 }
     401             :                 ret = 1;
     402             :         }
     403             : 
     404           0 :         if (ret == 1) {
     405             :                 /* need to insert a new item */
     406           0 :                 btrfs_release_path(path);
     407           0 :                 ret = btrfs_insert_empty_item(trans, dev_root, path,
     408             :                                               &key, sizeof(*ptr));
     409           0 :                 if (ret < 0) {
     410           0 :                         btrfs_warn(fs_info,
     411             :                                    "insert dev_replace item failed %d!", ret);
     412           0 :                         goto out;
     413             :                 }
     414             :         }
     415             : 
     416           0 :         eb = path->nodes[0];
     417           0 :         ptr = btrfs_item_ptr(eb, path->slots[0],
     418             :                              struct btrfs_dev_replace_item);
     419             : 
     420           0 :         down_write(&dev_replace->rwsem);
     421           0 :         if (dev_replace->srcdev)
     422           0 :                 btrfs_set_dev_replace_src_devid(eb, ptr,
     423             :                         dev_replace->srcdev->devid);
     424             :         else
     425           0 :                 btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1);
     426           0 :         btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr,
     427             :                 dev_replace->cont_reading_from_srcdev_mode);
     428           0 :         btrfs_set_dev_replace_replace_state(eb, ptr,
     429             :                 dev_replace->replace_state);
     430           0 :         btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started);
     431           0 :         btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped);
     432           0 :         btrfs_set_dev_replace_num_write_errors(eb, ptr,
     433             :                 atomic64_read(&dev_replace->num_write_errors));
     434           0 :         btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr,
     435             :                 atomic64_read(&dev_replace->num_uncorrectable_read_errors));
     436           0 :         dev_replace->cursor_left_last_write_of_item =
     437           0 :                 dev_replace->cursor_left;
     438           0 :         btrfs_set_dev_replace_cursor_left(eb, ptr,
     439             :                 dev_replace->cursor_left_last_write_of_item);
     440           0 :         btrfs_set_dev_replace_cursor_right(eb, ptr,
     441             :                 dev_replace->cursor_right);
     442           0 :         dev_replace->item_needs_writeback = 0;
     443           0 :         up_write(&dev_replace->rwsem);
     444             : 
     445           0 :         btrfs_mark_buffer_dirty(eb);
     446             : 
     447           0 : out:
     448           0 :         btrfs_free_path(path);
     449             : 
     450           0 :         return ret;
     451             : }
     452             : 
     453             : static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
     454             :                                     struct btrfs_device *src_dev)
     455             : {
     456           0 :         struct btrfs_path *path;
     457           0 :         struct btrfs_key key;
     458           0 :         struct btrfs_key found_key;
     459           0 :         struct btrfs_root *root = fs_info->dev_root;
     460           0 :         struct btrfs_dev_extent *dev_extent = NULL;
     461           0 :         struct btrfs_block_group *cache;
     462           0 :         struct btrfs_trans_handle *trans;
     463           0 :         int iter_ret = 0;
     464           0 :         int ret = 0;
     465           0 :         u64 chunk_offset;
     466             : 
     467             :         /* Do not use "to_copy" on non zoned filesystem for now */
     468           0 :         if (!btrfs_is_zoned(fs_info))
     469           0 :                 return 0;
     470             : 
     471             :         mutex_lock(&fs_info->chunk_mutex);
     472             : 
     473             :         /* Ensure we don't have pending new block group */
     474             :         spin_lock(&fs_info->trans_lock);
     475             :         while (fs_info->running_transaction &&
     476             :                !list_empty(&fs_info->running_transaction->dev_update_list)) {
     477             :                 spin_unlock(&fs_info->trans_lock);
     478             :                 mutex_unlock(&fs_info->chunk_mutex);
     479             :                 trans = btrfs_attach_transaction(root);
     480             :                 if (IS_ERR(trans)) {
     481             :                         ret = PTR_ERR(trans);
     482             :                         mutex_lock(&fs_info->chunk_mutex);
     483             :                         if (ret == -ENOENT) {
     484             :                                 spin_lock(&fs_info->trans_lock);
     485             :                                 continue;
     486             :                         } else {
     487             :                                 goto unlock;
     488             :                         }
     489             :                 }
     490             : 
     491             :                 ret = btrfs_commit_transaction(trans);
     492             :                 mutex_lock(&fs_info->chunk_mutex);
     493             :                 if (ret)
     494             :                         goto unlock;
     495             : 
     496             :                 spin_lock(&fs_info->trans_lock);
     497             :         }
     498             :         spin_unlock(&fs_info->trans_lock);
     499             : 
     500             :         path = btrfs_alloc_path();
     501             :         if (!path) {
     502             :                 ret = -ENOMEM;
     503             :                 goto unlock;
     504             :         }
     505             : 
     506             :         path->reada = READA_FORWARD;
     507             :         path->search_commit_root = 1;
     508             :         path->skip_locking = 1;
     509             : 
     510             :         key.objectid = src_dev->devid;
     511             :         key.type = BTRFS_DEV_EXTENT_KEY;
     512             :         key.offset = 0;
     513             : 
     514             :         btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
     515             :                 struct extent_buffer *leaf = path->nodes[0];
     516             : 
     517             :                 if (found_key.objectid != src_dev->devid)
     518             :                         break;
     519             : 
     520             :                 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
     521             :                         break;
     522             : 
     523             :                 if (found_key.offset < key.offset)
     524             :                         break;
     525             : 
     526             :                 dev_extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
     527             : 
     528             :                 chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dev_extent);
     529             : 
     530             :                 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
     531             :                 if (!cache)
     532             :                         continue;
     533             : 
     534             :                 set_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
     535             :                 btrfs_put_block_group(cache);
     536             :         }
     537             :         if (iter_ret < 0)
     538             :                 ret = iter_ret;
     539             : 
     540             :         btrfs_free_path(path);
     541             : unlock:
     542             :         mutex_unlock(&fs_info->chunk_mutex);
     543             : 
     544             :         return ret;
     545             : }
     546             : 
                       /*
                        * Decide whether the BLOCK_GROUP_FLAG_TO_COPY marking of a block group can
                        * be dropped for the stripe located at @physical on @srcdev.
                        *
                        * @srcdev:   device whose stripes are looked up in the chunk map
                        * @cache:    block group; cache->start is used as the chunk offset
                        * @physical: physical start of the stripe under consideration
                        *
                        * Returns true when the filesystem is not zoned, when the block group is
                        * flagged REMOVED, or after BLOCK_GROUP_FLAG_TO_COPY has been cleared here.
                        * Returns false (flag left set) when further stripes of this chunk on
                        * @srcdev are considered outstanding.
                        *
                        * NOTE(review): presumably called by the replace/scrub code after one device
                        * extent has been copied - confirm against the caller.
                        */
     547           0 : bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
     548             :                                       struct btrfs_block_group *cache,
     549             :                                       u64 physical)
     550             : {
     551           0 :         struct btrfs_fs_info *fs_info = cache->fs_info;
     552           0 :         struct extent_map *em;
     553           0 :         struct map_lookup *map;
     554           0 :         u64 chunk_offset = cache->start;
     555           0 :         int num_extents, cur_extent;
     556           0 :         int i;
     557             : 
     558             :         /* Do not use "to_copy" on non zoned filesystem for now */
     559           0 :         if (!btrfs_is_zoned(fs_info))
     560           0 :                 return true;
     561             : 
                               /*
                                * A block group already flagged REMOVED is reported as done; its
                                * TO_COPY bit is not touched.
                                */
     562             :         spin_lock(&cache->lock);
     563             :         if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) {
     564             :                 spin_unlock(&cache->lock);
     565             :                 return true;
     566             :         }
     567             :         spin_unlock(&cache->lock);
     568             : 
                               /*
                                * NOTE(review): the ASSERT below is the only guard before em is
                                * dereferenced. If ASSERT() can compile to a no-op in some builds, a
                                * failed lookup would be dereferenced here - confirm.
                                */
     569             :         em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
     570             :         ASSERT(!IS_ERR(em));
     571             :         map = em->map_lookup;
     572             : 
                               /*
                                * Count the stripes of this chunk that live on srcdev and remember
                                * which stripe matches @physical.
                                */
     573             :         num_extents = 0;
     574             :         cur_extent = 0;
     575             :         for (i = 0; i < map->num_stripes; i++) {
     576             :                 /* Only stripes that live on srcdev are of interest */
     577             :                 if (srcdev != map->stripes[i].dev)
     578             :                         continue;
     579             : 
     580             :                 num_extents++;
     581             :                 if (physical == map->stripes[i].physical)
     582             :                         cur_extent = i;
     583             :         }
     584             : 
     585             :         free_extent_map(em);
     586             : 
                               /*
                                * NOTE(review): cur_extent holds the stripe index within the whole
                                * chunk map, while num_extents counts only the stripes on srcdev.
                                * The comparison below treats both as the same numbering, which
                                * holds only if the srcdev stripes occupy the lowest indices -
                                * confirm this is guaranteed for every profile that can reach here.
                                */
     587             :         if (num_extents > 1 && cur_extent < num_extents - 1) {
     588             :                 /*
     589             :                  * Has more stripes on this device. Keep this block group
     590             :                  * readonly until we finish all the stripes.
     591             :                  */
     592             :                 return false;
     593             :         }
     594             : 
     595             :         /* Last stripe on this device */
     596             :         clear_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
     597             : 
     598             :         return true;
     599             : }
     600             : 
                       /*
                        * Start a device replace: look up the source device, set up the target
                        * device, publish the STARTED state, run the copy through btrfs_scrub_dev()
                        * and hand the result to btrfs_dev_replace_finishing().
                        *
                        * @fs_info:     filesystem to operate on
                        * @tgtdev_name: target device name, passed to btrfs_init_dev_replace_tgtdev()
                        * @srcdevid:    source devid, passed to btrfs_find_device_by_devspec()
                        * @srcdev_name: source device name, passed to btrfs_find_device_by_devspec()
                        * @read_src:    stored as dev_replace->cont_reading_from_srcdev_mode
                        *
                        * Returns the value of btrfs_dev_replace_finishing() on the normal path
                        * (-EINPROGRESS is translated to
                        * BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS), a negative errno from an
                        * earlier setup step, or BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED.
                        *
                        * Every failure after the target device has been set up goes through the
                        * "leave" label so that the target device is destroyed again.
                        */
     601           0 : static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
     602             :                 const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
     603             :                 int read_src)
     604             : {
     605           0 :         struct btrfs_root *root = fs_info->dev_root;
     606           0 :         struct btrfs_trans_handle *trans;
     607           0 :         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
     608           0 :         int ret;
     609           0 :         struct btrfs_device *tgt_device = NULL;
     610           0 :         struct btrfs_device *src_device = NULL;
     611             : 
     612           0 :         src_device = btrfs_find_device_by_devspec(fs_info, srcdevid,
     613             :                                                   srcdev_name);
     614           0 :         if (IS_ERR(src_device))
     615           0 :                 return PTR_ERR(src_device);
     616             : 
     617           0 :         if (btrfs_pinned_by_swapfile(fs_info, src_device)) {
     618           0 :                 btrfs_warn_in_rcu(fs_info,
     619             :           "cannot replace device %s (devid %llu) due to active swapfile",
     620             :                         btrfs_dev_name(src_device), src_device->devid);
     621           0 :                 return -ETXTBSY;
     622             :         }
     623             : 
     624             :         /*
     625             :          * Here we commit the transaction to make sure commit_total_bytes
     626             :          * of all the devices are updated.
     627             :          */
     628           0 :         trans = btrfs_attach_transaction(root);
     629           0 :         if (!IS_ERR(trans)) {
     630           0 :                 ret = btrfs_commit_transaction(trans);
     631           0 :                 if (ret)
     632             :                         return ret;
     633           0 :         } else if (PTR_ERR(trans) != -ENOENT) {
                                       /* -ENOENT is tolerated: nothing to commit in that case */
     634           0 :                 return PTR_ERR(trans);
     635             :         }
     636             : 
     637           0 :         ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name,
     638             :                                             src_device, &tgt_device);
     639           0 :         if (ret)
     640             :                 return ret;
     641             : 
                               /*
                                * From here on tgt_device exists, so a failure must undo it via
                                * "leave" exactly like the later error paths do, instead of
                                * returning directly and leaving the target device behind.
                                */
     642           0 :         ret = mark_block_group_to_copy(fs_info, src_device);
     643           0 :         if (ret)
     644             :                 goto leave;
     645             : 
     646           0 :         down_write(&dev_replace->rwsem);
     647           0 :         switch (dev_replace->replace_state) {
     648             :         case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
     649             :         case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
     650             :         case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
     651             :                 break;
     652           0 :         case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
     653             :         case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
                                       /* Not expected here (hence the ASSERT); still unwind cleanly */
     654           0 :                 ASSERT(0);
     655           0 :                 ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
     656           0 :                 up_write(&dev_replace->rwsem);
     657           0 :                 goto leave;
     658             :         }
     659             : 
     660           0 :         dev_replace->cont_reading_from_srcdev_mode = read_src;
     661           0 :         dev_replace->srcdev = src_device;
     662           0 :         dev_replace->tgtdev = tgt_device;
     663             : 
     664           0 :         btrfs_info_in_rcu(fs_info,
     665             :                       "dev_replace from %s (devid %llu) to %s started",
     666             :                       btrfs_dev_name(src_device),
     667             :                       src_device->devid,
     668             :                       btrfs_dev_name(tgt_device));
     669             : 
     670             :         /*
     671             :          * from now on, the writes to the srcdev are all duplicated to
     672             :          * go to the tgtdev as well (refer to btrfs_map_block()).
     673             :          */
     674           0 :         dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
     675           0 :         dev_replace->time_started = ktime_get_real_seconds();
     676           0 :         dev_replace->cursor_left = 0;
     677           0 :         dev_replace->committed_cursor_left = 0;
     678           0 :         dev_replace->cursor_left_last_write_of_item = 0;
     679           0 :         dev_replace->cursor_right = 0;
     680           0 :         dev_replace->is_valid = 1;
     681           0 :         dev_replace->item_needs_writeback = 1;
     682           0 :         atomic64_set(&dev_replace->num_write_errors, 0);
     683           0 :         atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
     684           0 :         up_write(&dev_replace->rwsem);
     685             : 
                               /* A sysfs failure is only logged; the replace carries on */
     686           0 :         ret = btrfs_sysfs_add_device(tgt_device);
     687           0 :         if (ret)
     688           0 :                 btrfs_err(fs_info, "kobj add dev failed %d", ret);
     689             : 
     690           0 :         btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
     691             : 
     692             :         /*
     693             :          * Commit dev_replace state and reserve 1 item for it.
     694             :          * This is crucial to ensure we won't miss copying extents for new block
     695             :          * groups that are allocated after we started the device replace, and
     696             :          * must be done after setting up the device replace state.
     697             :          */
     698           0 :         trans = btrfs_start_transaction(root, 1);
     699           0 :         if (IS_ERR(trans)) {
                                       /* Roll the in-memory state back before dropping the target */
     700           0 :                 ret = PTR_ERR(trans);
     701           0 :                 down_write(&dev_replace->rwsem);
     702           0 :                 dev_replace->replace_state =
     703             :                         BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
     704           0 :                 dev_replace->srcdev = NULL;
     705           0 :                 dev_replace->tgtdev = NULL;
     706           0 :                 up_write(&dev_replace->rwsem);
     707           0 :                 goto leave;
     708             :         }
     709             : 
                               /*
                                * A commit failure only triggers a warning; ret is overwritten by
                                * the scrub result right below.
                                */
     710           0 :         ret = btrfs_commit_transaction(trans);
     711           0 :         WARN_ON(ret);
     712             : 
     713             :         /* the disk copy procedure reuses the scrub code */
     714           0 :         ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
     715             :                               btrfs_device_get_total_bytes(src_device),
     716             :                               &dev_replace->scrub_progress, 0, 1);
     717             : 
     718           0 :         ret = btrfs_dev_replace_finishing(fs_info, ret);
     719           0 :         if (ret == -EINPROGRESS)
     720           0 :                 ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
     721             : 
     722             :         return ret;
     723             : 
     724           0 : leave:
     725           0 :         btrfs_destroy_dev_replace_tgtdev(tgt_device);
     726           0 :         return ret;
     727             : }
     728             : 
                       /*
                        * Ioctl entry point for starting a device replace.
                        *
                        * Rejects with -EINVAL an unknown cont_reading_from_srcdev_mode, an empty
                        * target device name, or a request that identifies the source neither by
                        * devid nor by name. Otherwise runs btrfs_dev_replace_start() and stores
                        * its return value in args->result.
                        *
                        * Returns 0 when that value is BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR or
                        * BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS, otherwise the value
                        * itself.
                        */
     729           0 : int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
     730             :                             struct btrfs_ioctl_dev_replace_args *args)
     731             : {
     732           0 :         int ret;
     733             : 
                               /* Only the two known read modes are accepted */
     734           0 :         switch (args->start.cont_reading_from_srcdev_mode) {
     735             :         case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
     736             :         case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
     737           0 :                 break;
     738             :         default:
     739             :                 return -EINVAL;
     740             :         }
     741             : 
                               /* Source needs a devid or a name; target always needs a name */
     742           0 :         if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
     743           0 :             args->start.tgtdev_name[0] == '\0')
     744             :                 return -EINVAL;
     745             : 
     746           0 :         ret = btrfs_dev_replace_start(fs_info, args->start.tgtdev_name,
     747             :                                         args->start.srcdevid,
     748           0 :                                         args->start.srcdev_name,
     749             :                                         args->start.cont_reading_from_srcdev_mode);
                               /* The raw result is always reported back through args */
     750           0 :         args->result = ret;
     751             :         /* don't warn if EINPROGRESS, someone else might be running scrub */
     752           0 :         if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS ||
     753           0 :             ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR)
     754           0 :                 return 0;
     755             : 
     756             :         return ret;
     757             : }
     758             : 
     759             : /*
     760             :  * Set BTRFS_FS_STATE_DEV_REPLACING in fs_info->fs_state, then sleep on
                        * dev_replace.replace_wait until the summed dev_replace.bio_counter reads
                        * zero, i.e. block until all in-flight bio operations are finished.
     761             :  */
     762           0 : static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
     763             : {
     764           0 :         set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
     765           0 :         wait_event(fs_info->dev_replace.replace_wait, !percpu_counter_sum(
     766             :                    &fs_info->dev_replace.bio_counter));
     767           0 : }
     768             : 
     769             : /*
     770             :  * Counterpart of btrfs_rm_dev_replace_blocked(): clear
                        * BTRFS_FS_STATE_DEV_REPLACING and wake up the waiters on
                        * dev_replace.replace_wait. Called once the device has been removed (the
                        * target on the failure path, the source on the success path of
                        * btrfs_dev_replace_finishing()), so it is safe to allow new bio requests.
     771             :  */
     772           0 : static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
     773             : {
     774           0 :         clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
     775           0 :         wake_up(&fs_info->dev_replace.replace_wait);
     776           0 : }
     777             : 
     778             : /*
     779             :  * When finishing the device replace, before swapping the source device with the
     780             :  * target device we must update the chunk allocation state in the target device,
     781             :  * as it is empty because replace works by directly copying the chunks and not
     782             :  * through the normal chunk allocation path.
                        *
                        * Every CHUNK_ALLOCATED range found in srcdev->alloc_state is set with the
                        * same bounds in tgtdev->alloc_state. The caller must hold chunk_mutex
                        * (checked via lockdep).
                        *
                        * Returns 0, or the first non-zero value returned by set_extent_bit().
     783             :  */
     784           0 : static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev,
     785             :                                         struct btrfs_device *tgtdev)
     786             : {
     787           0 :         struct extent_state *cached_state = NULL;
     788           0 :         u64 start = 0;
     789           0 :         u64 found_start;
     790           0 :         u64 found_end;
     791           0 :         int ret = 0;
     792             : 
     793           0 :         lockdep_assert_held(&srcdev->fs_info->chunk_mutex);
     794             : 
                               /*
                                * Keep going as long as find_first_extent_bit() returns zero,
                                * resuming each search right behind the previously found range.
                                */
     795           0 :         while (!find_first_extent_bit(&srcdev->alloc_state, start,
     796             :                                       &found_start, &found_end,
     797             :                                       CHUNK_ALLOCATED, &cached_state)) {
     798           0 :                 ret = set_extent_bit(&tgtdev->alloc_state, found_start,
     799             :                                      found_end, CHUNK_ALLOCATED, NULL);
     800           0 :                 if (ret)
     801             :                         break;
     802           0 :                 start = found_end + 1;
     803             :         }
     804             : 
     805           0 :         free_extent_state(cached_state);
     806           0 :         return ret;
     807             : }
     808             : 
                       /*
                        * Walk fs_info->mapping_tree and redirect every stripe that points at
                        * @srcdev to @tgtdev. The whole walk runs with em_tree->lock held for
                        * writing.
                        */
     809           0 : static void btrfs_dev_replace_update_device_in_mapping_tree(
     810             :                                                 struct btrfs_fs_info *fs_info,
     811             :                                                 struct btrfs_device *srcdev,
     812             :                                                 struct btrfs_device *tgtdev)
     813             : {
     814           0 :         struct extent_map_tree *em_tree = &fs_info->mapping_tree;
     815           0 :         struct extent_map *em;
     816           0 :         struct map_lookup *map;
     817           0 :         u64 start = 0;
     818           0 :         int i;
     819             : 
     820           0 :         write_lock(&em_tree->lock);
     821           0 :         do {
                                       /* Look up the next mapping at or after start; none left ends the walk */
     822           0 :                 em = lookup_extent_mapping(em_tree, start, (u64)-1);
     823           0 :                 if (!em)
     824             :                         break;
     825           0 :                 map = em->map_lookup;
     826           0 :                 for (i = 0; i < map->num_stripes; i++)
     827           0 :                         if (srcdev == map->stripes[i].dev)
     828           0 :                                 map->stripes[i].dev = tgtdev;
                                       /* Continue right behind this mapping; a sum of 0 ends the loop */
     829           0 :                 start = em->start + em->len;
                                       /* presumably drops the reference taken by the lookup - confirm */
     830           0 :                 free_extent_map(em);
     831           0 :         } while (start);
     832           0 :         write_unlock(&em_tree->lock);
     833           0 : }
     834             : 
                       /*
                        * Finish, or unwind, a device replace after the copy phase has returned.
                        *
                        * @fs_info:   filesystem to operate on
                        * @scrub_ret: result of the copy; 0 selects the "finished" path, any other
                        *             value marks the replace CANCELED and destroys the target device
                        *
                        * Returns 0 when the replace is not in STARTED state or when it completed,
                        * the error from flushing delalloc or from starting a transaction, or the
                        * non-zero @scrub_ret (which on the success path may also carry the error
                        * of btrfs_set_target_alloc_state()).
                        *
                        * lock_finishing_cancel_unmount is held for the whole procedure and is
                        * released on every return path.
                        */
     835           0 : static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
     836             :                                        int scrub_ret)
     837             : {
     838           0 :         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
     839           0 :         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
     840           0 :         struct btrfs_device *tgt_device;
     841           0 :         struct btrfs_device *src_device;
     842           0 :         struct btrfs_root *root = fs_info->tree_root;
     843           0 :         u8 uuid_tmp[BTRFS_UUID_SIZE];
     844           0 :         struct btrfs_trans_handle *trans;
     845           0 :         int ret = 0;
     846             : 
     847             :         /* don't allow cancel or unmount to disturb the finishing procedure */
     848           0 :         mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
     849             : 
     850           0 :         down_read(&dev_replace->rwsem);
     851             :         /* was the operation canceled, or is it finished? */
     852           0 :         if (dev_replace->replace_state !=
     853             :             BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
     854           0 :                 up_read(&dev_replace->rwsem);
     855           0 :                 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
     856           0 :                 return 0;
     857             :         }
     858             : 
                               /* Snapshot both device pointers while rwsem is held for reading */
     859           0 :         tgt_device = dev_replace->tgtdev;
     860           0 :         src_device = dev_replace->srcdev;
     861           0 :         up_read(&dev_replace->rwsem);
     862             : 
     863             :         /*
     864             :          * flush all outstanding I/O and inode extent mappings before the
     865             :          * copy operation is declared as being finished
     866             :          */
     867           0 :         ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false);
     868           0 :         if (ret) {
     869           0 :                 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
     870           0 :                 return ret;
     871             :         }
     872           0 :         btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
     873             : 
     874             :         /*
     875             :          * We have to use this loop approach because at this point src_device
     876             :          * has to be available for transaction commit to complete, yet new
     877             :          * chunks shouldn't be allocated on the device.
                                *
                                * The loop is left, with device_list_mutex and chunk_mutex both
                                * still held, once src_device->post_commit_list is found empty
                                * after a commit; otherwise both are dropped and it retries.
     878             :          */
     879           0 :         while (1) {
     880           0 :                 trans = btrfs_start_transaction(root, 0);
     881           0 :                 if (IS_ERR(trans)) {
     882           0 :                         mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
     883           0 :                         return PTR_ERR(trans);
     884             :                 }
                                       /* A commit failure only triggers a warning here */
     885           0 :                 ret = btrfs_commit_transaction(trans);
     886           0 :                 WARN_ON(ret);
     887             : 
     888             :                 /* Prevent write_all_supers() during the finishing procedure */
     889           0 :                 mutex_lock(&fs_devices->device_list_mutex);
     890             :                 /* Prevent new chunks being allocated on the source device */
     891           0 :                 mutex_lock(&fs_info->chunk_mutex);
     892             : 
     893           0 :                 if (!list_empty(&src_device->post_commit_list)) {
     894           0 :                         mutex_unlock(&fs_devices->device_list_mutex);
     895           0 :                         mutex_unlock(&fs_info->chunk_mutex);
     896             :                 } else {
     897             :                         break;
     898             :                 }
     899             :         }
     900             : 
                               /*
                                * Publish the final state: CANCELED if the copy reported anything
                                * non-zero, FINISHED otherwise, and detach both device pointers.
                                */
     901           0 :         down_write(&dev_replace->rwsem);
     902           0 :         dev_replace->replace_state =
     903             :                 scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
     904           0 :                           : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
     905           0 :         dev_replace->tgtdev = NULL;
     906           0 :         dev_replace->srcdev = NULL;
     907           0 :         dev_replace->time_stopped = ktime_get_real_seconds();
     908           0 :         dev_replace->item_needs_writeback = 1;
     909             : 
     910             :         /*
     911             :          * Update allocation state in the new device and replace the old device
     912             :          * with the new one in the mapping tree.
     913             :          */
     914           0 :         if (!scrub_ret) {
                                       /*
                                        * NOTE(review): if this fails, replace_state has already been
                                        * set to FINISHED above although the error path below
                                        * destroys the target device - confirm that is intended.
                                        */
     915           0 :                 scrub_ret = btrfs_set_target_alloc_state(src_device, tgt_device);
     916           0 :                 if (scrub_ret)
     917           0 :                         goto error;
     918           0 :                 btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
     919             :                                                                 src_device,
     920             :                                                                 tgt_device);
     921             :         } else {
                                       /* -ECANCELED is not logged as an error */
     922           0 :                 if (scrub_ret != -ECANCELED)
     923           0 :                         btrfs_err_in_rcu(fs_info,
     924             :                                  "btrfs_scrub_dev(%s, %llu, %s) failed %d",
     925             :                                  btrfs_dev_name(src_device),
     926             :                                  src_device->devid,
     927             :                                  btrfs_dev_name(tgt_device), scrub_ret);
     928           0 : error:
                                       /*
                                        * Failure exit: drop all locks, then destroy the target
                                        * device while new bios are held off.
                                        */
     929           0 :                 up_write(&dev_replace->rwsem);
     930           0 :                 mutex_unlock(&fs_info->chunk_mutex);
     931           0 :                 mutex_unlock(&fs_devices->device_list_mutex);
     932           0 :                 btrfs_rm_dev_replace_blocked(fs_info);
     933           0 :                 if (tgt_device)
     934           0 :                         btrfs_destroy_dev_replace_tgtdev(tgt_device);
     935           0 :                 btrfs_rm_dev_replace_unblocked(fs_info);
     936           0 :                 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
     937             : 
     938           0 :                 return scrub_ret;
     939             :         }
     940             : 
     941           0 :         btrfs_info_in_rcu(fs_info,
     942             :                           "dev_replace from %s (devid %llu) to %s finished",
     943             :                           btrfs_dev_name(src_device),
     944             :                           src_device->devid,
     945             :                           btrfs_dev_name(tgt_device));
                               /*
                                * The target takes over the identity of the source: devid, uuid
                                * (swapped through uuid_tmp) and the size/usage counters. The source
                                * is re-labelled with BTRFS_DEV_REPLACE_DEVID.
                                */
     946           0 :         clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &tgt_device->dev_state);
     947           0 :         tgt_device->devid = src_device->devid;
     948           0 :         src_device->devid = BTRFS_DEV_REPLACE_DEVID;
     949           0 :         memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
     950           0 :         memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
     951           0 :         memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
     952           0 :         btrfs_device_set_total_bytes(tgt_device, src_device->total_bytes);
     953           0 :         btrfs_device_set_disk_total_bytes(tgt_device,
     954             :                                           src_device->disk_total_bytes);
     955           0 :         btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used);
     956           0 :         tgt_device->commit_bytes_used = src_device->bytes_used;
     957             : 
     958           0 :         btrfs_assign_next_active_device(src_device, tgt_device);
     959             : 
                               /* The target becomes a regular writable allocation candidate */
     960           0 :         list_add(&tgt_device->dev_alloc_list, &fs_devices->alloc_list);
     961           0 :         fs_devices->rw_devices++;
     962             : 
                               /* Remove the source device while new bios are held off */
     963           0 :         up_write(&dev_replace->rwsem);
     964           0 :         btrfs_rm_dev_replace_blocked(fs_info);
     965             : 
     966           0 :         btrfs_rm_dev_replace_remove_srcdev(src_device);
     967             : 
     968           0 :         btrfs_rm_dev_replace_unblocked(fs_info);
     969             : 
     970             :         /*
     971             :          * Increment dev_stats_ccnt so that btrfs_run_dev_stats() will
     972             :          * update on-disk dev stats value during commit transaction
     973             :          */
     974           0 :         atomic_inc(&tgt_device->dev_stats_ccnt);
     975             : 
     976             :         /*
     977             :          * this is again a consistent state where no dev_replace procedure
     978             :          * is running, the target device is part of the filesystem, the
     979             :          * source device is not part of the filesystem anymore and its 1st
     980             :          * superblock is scratched out so that it is no longer marked to
     981             :          * belong to this filesystem.
     982             :          */
     983           0 :         mutex_unlock(&fs_info->chunk_mutex);
     984           0 :         mutex_unlock(&fs_devices->device_list_mutex);
     985             : 
     986             :         /* replace the sysfs entry */
     987           0 :         btrfs_sysfs_remove_device(src_device);
     988           0 :         btrfs_sysfs_update_devid(tgt_device);
                               /* Superblocks are scratched only if the source is flagged writeable */
     989           0 :         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state))
     990           0 :                 btrfs_scratch_superblocks(fs_info, src_device->bdev,
     991           0 :                                           src_device->name->str);
     992             : 
     993             :         /* write back the superblocks */
                               /* Best effort: a failure to start or commit is not reported */
     994           0 :         trans = btrfs_start_transaction(root, 0);
     995           0 :         if (!IS_ERR(trans))
     996           0 :                 btrfs_commit_transaction(trans);
     997             : 
     998           0 :         mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
     999             : 
    1000           0 :         btrfs_rm_dev_replace_free_srcdev(src_device);
    1001             : 
    1002           0 :         return 0;
    1003             : }
    1004             : 
    1005             : /*
    1006             :  * Read progress of device replace status according to the state and last
    1007             :  * stored position. The value format is the same as for
    1008             :  * btrfs_dev_replace::progress_1000
                        *
                        * Returns 0 for NEVER_STARTED and CANCELED, 1000 for FINISHED, and for
                        * STARTED and SUSPENDED cursor_left divided by one thousandth of the source
                        * device's total bytes.
                        *
                        * No lock is taken here; the caller visible in this file,
                        * btrfs_dev_replace_status(), holds dev_replace->rwsem for reading.
    1009             :  */
    1010           1 : static u64 btrfs_dev_replace_progress(struct btrfs_fs_info *fs_info)
    1011             : {
    1012           1 :         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
    1013           1 :         u64 ret = 0;
    1014             : 
    1015           1 :         switch (dev_replace->replace_state) {
    1016             :         case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
    1017             :         case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
    1018             :                 ret = 0;
    1019             :                 break;
    1020           0 :         case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
    1021           0 :                 ret = 1000;
    1022           0 :                 break;
    1023           0 :         case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
    1024             :         case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
                                       /*
                                        * NOTE(review): the divisor is total_bytes / 1000, which is 0
                                        * for a source smaller than 1000 bytes - presumably impossible
                                        * for a real device, confirm.
                                        */
    1025           0 :                 ret = div64_u64(dev_replace->cursor_left,
    1026             :                                 div_u64(btrfs_device_get_total_bytes(
    1027           0 :                                                 dev_replace->srcdev), 1000));
    1028           0 :                 break;
    1029             :         }
    1030             : 
    1031           1 :         return ret;
    1032             : }
    1033             : 
                       /*
                        * Fill @args for the replace status query: args->result is set to
                        * BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR and args->status receives the
                        * replace state, start/stop times, both error counters and the progress
                        * value from btrfs_dev_replace_progress(). Everything is read with
                        * dev_replace->rwsem held for reading.
                        */
    1034           1 : void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
    1035             :                               struct btrfs_ioctl_dev_replace_args *args)
    1036             : {
    1037           1 :         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
    1038             : 
    1039           1 :         down_read(&dev_replace->rwsem);
    1040             :         /* even if !dev_replace_is_valid, the values are good enough for
    1041             :          * the replace_status ioctl */
    1042           1 :         args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
    1043           1 :         args->status.replace_state = dev_replace->replace_state;
    1044           1 :         args->status.time_started = dev_replace->time_started;
    1045           1 :         args->status.time_stopped = dev_replace->time_stopped;
    1046           1 :         args->status.num_write_errors =
    1047           1 :                 atomic64_read(&dev_replace->num_write_errors);
    1048           1 :         args->status.num_uncorrectable_read_errors =
    1049           1 :                 atomic64_read(&dev_replace->num_uncorrectable_read_errors);
    1050           1 :         args->status.progress_1000 = btrfs_dev_replace_progress(fs_info);
    1051           1 :         up_read(&dev_replace->rwsem);
    1052           1 : }
    1053             : 
    1054           0 : int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
    1055             : {
    1056           0 :         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
    1057           0 :         struct btrfs_device *tgt_device = NULL;
    1058           0 :         struct btrfs_device *src_device = NULL;
    1059           0 :         struct btrfs_trans_handle *trans;
    1060           0 :         struct btrfs_root *root = fs_info->tree_root;
    1061           0 :         int result;
    1062           0 :         int ret;
    1063             : 
    1064           0 :         if (sb_rdonly(fs_info->sb))
    1065             :                 return -EROFS;
    1066             : 
    1067           0 :         mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
    1068           0 :         down_write(&dev_replace->rwsem);
    1069           0 :         switch (dev_replace->replace_state) {
    1070           0 :         case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
    1071             :         case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
    1072             :         case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
    1073           0 :                 result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
    1074           0 :                 up_write(&dev_replace->rwsem);
    1075           0 :                 break;
    1076           0 :         case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
    1077           0 :                 tgt_device = dev_replace->tgtdev;
    1078           0 :                 src_device = dev_replace->srcdev;
    1079           0 :                 up_write(&dev_replace->rwsem);
    1080           0 :                 ret = btrfs_scrub_cancel(fs_info);
    1081           0 :                 if (ret < 0) {
    1082             :                         result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
    1083             :                 } else {
    1084           0 :                         result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
    1085             :                         /*
    1086             :                          * btrfs_dev_replace_finishing() will handle the
    1087             :                          * cleanup part
    1088             :                          */
    1089           0 :                         btrfs_info_in_rcu(fs_info,
    1090             :                                 "dev_replace from %s (devid %llu) to %s canceled",
    1091             :                                 btrfs_dev_name(src_device), src_device->devid,
    1092             :                                 btrfs_dev_name(tgt_device));
    1093             :                 }
    1094             :                 break;
    1095           0 :         case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
    1096             :                 /*
    1097             :                  * Scrub doing the replace isn't running so we need to do the
    1098             :                  * cleanup step of btrfs_dev_replace_finishing() here
    1099             :                  */
    1100           0 :                 result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
    1101           0 :                 tgt_device = dev_replace->tgtdev;
    1102           0 :                 src_device = dev_replace->srcdev;
    1103           0 :                 dev_replace->tgtdev = NULL;
    1104           0 :                 dev_replace->srcdev = NULL;
    1105           0 :                 dev_replace->replace_state =
    1106             :                                 BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
    1107           0 :                 dev_replace->time_stopped = ktime_get_real_seconds();
    1108           0 :                 dev_replace->item_needs_writeback = 1;
    1109             : 
    1110           0 :                 up_write(&dev_replace->rwsem);
    1111             : 
    1112             :                 /* Scrub for replace must not be running in suspended state */
    1113           0 :                 btrfs_scrub_cancel(fs_info);
    1114             : 
    1115           0 :                 trans = btrfs_start_transaction(root, 0);
    1116           0 :                 if (IS_ERR(trans)) {
    1117           0 :                         mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
    1118           0 :                         return PTR_ERR(trans);
    1119             :                 }
    1120           0 :                 ret = btrfs_commit_transaction(trans);
    1121           0 :                 WARN_ON(ret);
    1122             : 
    1123           0 :                 btrfs_info_in_rcu(fs_info,
    1124             :                 "suspended dev_replace from %s (devid %llu) to %s canceled",
    1125             :                         btrfs_dev_name(src_device), src_device->devid,
    1126             :                         btrfs_dev_name(tgt_device));
    1127             : 
    1128           0 :                 if (tgt_device)
    1129           0 :                         btrfs_destroy_dev_replace_tgtdev(tgt_device);
    1130             :                 break;
    1131           0 :         default:
    1132           0 :                 up_write(&dev_replace->rwsem);
    1133           0 :                 result = -EINVAL;
    1134             :         }
    1135             : 
    1136           0 :         mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
    1137           0 :         return result;
    1138             : }
    1139             : 
    1140        3221 : void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
    1141             : {
    1142        3221 :         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
    1143             : 
    1144        3221 :         mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
    1145        3221 :         down_write(&dev_replace->rwsem);
    1146             : 
    1147        3221 :         switch (dev_replace->replace_state) {
    1148             :         case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
    1149             :         case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
    1150             :         case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
    1151             :         case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
    1152             :                 break;
    1153           0 :         case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
    1154           0 :                 dev_replace->replace_state =
    1155             :                         BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
    1156           0 :                 dev_replace->time_stopped = ktime_get_real_seconds();
    1157           0 :                 dev_replace->item_needs_writeback = 1;
    1158           0 :                 btrfs_info(fs_info, "suspending dev_replace for unmount");
    1159           0 :                 break;
    1160             :         }
    1161             : 
    1162        3221 :         up_write(&dev_replace->rwsem);
    1163        3221 :         mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
    1164        3221 : }
    1165             : 
    1166             : /* resume dev_replace procedure that was interrupted by unmount */
    1167        3181 : int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
    1168             : {
    1169        3181 :         struct task_struct *task;
    1170        3181 :         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
    1171             : 
    1172        3181 :         down_write(&dev_replace->rwsem);
    1173             : 
    1174        3181 :         switch (dev_replace->replace_state) {
    1175        3181 :         case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
    1176             :         case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
    1177             :         case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
    1178        3181 :                 up_write(&dev_replace->rwsem);
    1179        3181 :                 return 0;
    1180             :         case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
    1181             :                 break;
    1182           0 :         case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
    1183           0 :                 dev_replace->replace_state =
    1184             :                         BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
    1185           0 :                 break;
    1186             :         }
    1187           0 :         if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
    1188           0 :                 btrfs_info(fs_info,
    1189             :                            "cannot continue dev_replace, tgtdev is missing");
    1190           0 :                 btrfs_info(fs_info,
    1191             :                            "you may cancel the operation after 'mount -o degraded'");
    1192           0 :                 dev_replace->replace_state =
    1193             :                                         BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
    1194           0 :                 up_write(&dev_replace->rwsem);
    1195           0 :                 return 0;
    1196             :         }
    1197           0 :         up_write(&dev_replace->rwsem);
    1198             : 
    1199             :         /*
    1200             :          * This could collide with a paused balance, but the exclusive op logic
    1201             :          * should never allow both to start and pause. We don't want to allow
    1202             :          * dev-replace to start anyway.
    1203             :          */
    1204           0 :         if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) {
    1205           0 :                 down_write(&dev_replace->rwsem);
    1206           0 :                 dev_replace->replace_state =
    1207             :                                         BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
    1208           0 :                 up_write(&dev_replace->rwsem);
    1209           0 :                 btrfs_info(fs_info,
    1210             :                 "cannot resume dev-replace, other exclusive operation running");
    1211           0 :                 return 0;
    1212             :         }
    1213             : 
    1214           0 :         task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
    1215           0 :         return PTR_ERR_OR_ZERO(task);
    1216             : }
    1217             : 
    1218           0 : static int btrfs_dev_replace_kthread(void *data)
    1219             : {
    1220           0 :         struct btrfs_fs_info *fs_info = data;
    1221           0 :         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
    1222           0 :         u64 progress;
    1223           0 :         int ret;
    1224             : 
    1225           0 :         progress = btrfs_dev_replace_progress(fs_info);
    1226           0 :         progress = div_u64(progress, 10);
    1227           0 :         btrfs_info_in_rcu(fs_info,
    1228             :                 "continuing dev_replace from %s (devid %llu) to target %s @%u%%",
    1229             :                 btrfs_dev_name(dev_replace->srcdev),
    1230             :                 dev_replace->srcdev->devid,
    1231             :                 btrfs_dev_name(dev_replace->tgtdev),
    1232             :                 (unsigned int)progress);
    1233             : 
    1234           0 :         ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
    1235             :                               dev_replace->committed_cursor_left,
    1236           0 :                               btrfs_device_get_total_bytes(dev_replace->srcdev),
    1237             :                               &dev_replace->scrub_progress, 0, 1);
    1238           0 :         ret = btrfs_dev_replace_finishing(fs_info, ret);
    1239           0 :         WARN_ON(ret && ret != -ECANCELED);
    1240             : 
    1241           0 :         btrfs_exclop_finish(fs_info);
    1242           0 :         return 0;
    1243             : }
    1244             : 
    1245    15467812 : int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
    1246             : {
    1247    15467812 :         if (!dev_replace->is_valid)
    1248             :                 return 0;
    1249             : 
    1250           0 :         switch (dev_replace->replace_state) {
    1251             :         case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
    1252             :         case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
    1253             :         case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
    1254             :                 return 0;
    1255             :         case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
    1256             :         case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
    1257             :                 /*
    1258             :                  * return true even if tgtdev is missing (this is
    1259             :                  * something that can happen if the dev_replace
    1260             :                  * procedure is suspended by an umount and then
    1261             :                  * the tgtdev is missing (or "btrfs dev scan") was
    1262             :                  * not called and the filesystem is remounted
    1263             :                  * in degraded state. This does not stop the
    1264             :                  * dev_replace procedure. It needs to be canceled
    1265             :                  * manually if the cancellation is wanted.
    1266             :                  */
    1267             :                 break;
    1268             :         }
    1269           0 :         return 1;
    1270             : }
    1271             : 
    1272    15519082 : void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
    1273             : {
    1274    15519082 :         percpu_counter_sub(&fs_info->dev_replace.bio_counter, amount);
    1275    15519081 :         cond_wake_up_nomb(&fs_info->dev_replace.replace_wait);
    1276    15519080 : }
    1277             : 
    1278    15515934 : void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
    1279             : {
    1280    15515934 :         while (1) {
    1281    15515934 :                 percpu_counter_inc(&fs_info->dev_replace.bio_counter);
    1282    15515867 :                 if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
    1283             :                                      &fs_info->fs_state)))
    1284             :                         break;
    1285             : 
    1286           0 :                 btrfs_bio_counter_dec(fs_info);
    1287           0 :                 wait_event(fs_info->dev_replace.replace_wait,
    1288             :                            !test_bit(BTRFS_FS_STATE_DEV_REPLACING,
    1289             :                                      &fs_info->fs_state));
    1290             :         }
    1291    15515867 : }

Generated by: LCOV version 1.14