LCOV - code coverage report
Current view: top level - fs/btrfs - reflink.c (source / functions) Hit Total Coverage
Test: fstests of 6.5.0-rc3-achx @ Mon Jul 31 20:08:12 PDT 2023 Lines: 388 415 93.5 %
Date: 2023-07-31 20:08:12 Functions: 13 13 100.0 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0
       2             : 
       3             : #include <linux/blkdev.h>
       4             : #include <linux/iversion.h>
       5             : #include "ctree.h"
       6             : #include "fs.h"
       7             : #include "messages.h"
       8             : #include "compression.h"
       9             : #include "delalloc-space.h"
      10             : #include "disk-io.h"
      11             : #include "reflink.h"
      12             : #include "transaction.h"
      13             : #include "subpage.h"
      14             : #include "accessors.h"
      15             : #include "file-item.h"
      16             : #include "file.h"
      17             : #include "super.h"
      18             : 
      19             : #define BTRFS_MAX_DEDUPE_LEN    SZ_16M
      20             : 
                       /*
                        * Apply the final inode updates for a clone step and finish the transaction.
                        *
                        * @trans:           Transaction handle; always ended here (aborted first if
                        *                   the inode update fails)
                        * @inode:           Inode whose in-memory fields and inode item are updated
                        * @endoff:          Candidate new i_size, clamped to @destoff + @olen
                        * @destoff:         Used together with @olen to clamp @endoff
                        * @olen:            Used together with @destoff to clamp @endoff
                        * @no_time_update:  When non-zero, mtime and ctime are left untouched
                        *
                        * Bumps the inode's i_version, optionally sets mtime/ctime to the current
                        * time, grows i_size to the clamped @endoff if that is beyond the current
                        * i_size, and writes the inode item with btrfs_update_inode().
                        *
                        * Returns the error from btrfs_update_inode() if it failed, otherwise the
                        * return value of btrfs_end_transaction().
                        */
      21    16440599 : static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
      22             :                                      struct inode *inode,
      23             :                                      u64 endoff,
      24             :                                      const u64 destoff,
      25             :                                      const u64 olen,
      26             :                                      int no_time_update)
      27             : {
      28    16440599 :         struct btrfs_root *root = BTRFS_I(inode)->root;
      29    16440599 :         int ret;
      30             : 
      31    16440599 :         inode_inc_iversion(inode);
      32    16440648 :         if (!no_time_update) {
      33    15190667 :                 inode->i_mtime = current_time(inode);
      34    15190665 :                 inode->i_ctime = inode->i_mtime;
      35             :         }
      36             :         /*
      37             :          * We round up to the block size at eof when determining which
      38             :          * extents to clone above, but shouldn't round up the file size.
      39             :          */
      40    16440646 :         if (endoff > destoff + olen)
      41             :                 endoff = destoff + olen;
      42    16440646 :         if (endoff > inode->i_size) {
      43    14708307 :                 i_size_write(inode, endoff);
                                       /*
                                        * NOTE(review): the 0 argument presumably tells the helper to
                                        * derive the on-disk size from the i_size just written - confirm
                                        * against the helper's definition.
                                        */
      44    14708307 :                 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
      45             :         }
      46             : 
      47    16440648 :         ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
      48    16440733 :         if (ret) {
                                       /* Keep the update error; the end_transaction result is dropped. */
      49           0 :                 btrfs_abort_transaction(trans, ret);
      50           0 :                 btrfs_end_transaction(trans);
      51           0 :                 goto out;
      52             :         }
      53    16440733 :         ret = btrfs_end_transaction(trans);
      54    16440653 : out:
      55    16440653 :         return ret;
      56             : }
      57             : 
                       /*
                        * Copy the payload of an inline extent item into one block of the page
                        * cache of @inode and mark that block dirty.
                        *
                        * @inode:        Inode whose page cache receives the data
                        * @file_offset:  File offset of the target block; asserted to be aligned to
                        *                the filesystem sector size
                        * @inline_data:  Buffer holding the inline extent item; the payload starts
                        *                btrfs_file_extent_calc_inline_size(0) bytes into it
                        * @size:         Total size of @inline_data (header plus payload)
                        * @datal:        Number of data bytes to place in the page; the rest of the
                        *                block is zeroed
                        * @comp_type:    BTRFS_COMPRESS_NONE for a plain copy, otherwise the type
                        *                handed to btrfs_decompress()
                        *
                        * Reserves delalloc space for one block, marks the block range as delalloc,
                        * fills the page, then sets the block uptodate and dirty.
                        *
                        * Returns 0 on success or a negative errno. On failure the data space
                        * reserved here is released again.
                        */
      58          17 : static int copy_inline_to_page(struct btrfs_inode *inode,
      59             :                                const u64 file_offset,
      60             :                                char *inline_data,
      61             :                                const u64 size,
      62             :                                const u64 datal,
      63             :                                const u8 comp_type)
      64             : {
      65          17 :         struct btrfs_fs_info *fs_info = inode->root->fs_info;
      66          17 :         const u32 block_size = fs_info->sectorsize;
      67          17 :         const u64 range_end = file_offset + block_size - 1;
      68          17 :         const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0);
      69          17 :         char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0);
      70          17 :         struct extent_changeset *data_reserved = NULL;
      71          17 :         struct page *page = NULL;
      72          17 :         struct address_space *mapping = inode->vfs_inode.i_mapping;
      73          17 :         int ret;
      74             : 
      75          17 :         ASSERT(IS_ALIGNED(file_offset, block_size));
      76             : 
      77             :         /*
      78             :          * We have flushed and locked the ranges of the source and destination
      79             :          * inodes, we also have locked the inodes, so we are safe to do a
      80             :          * reservation here. Also we must not do the reservation while holding
      81             :          * a transaction open, otherwise we would deadlock.
      82             :          */
      83          17 :         ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset,
      84             :                                            block_size);
      85          17 :         if (ret)
      86           0 :                 goto out;
      87             : 
      88          17 :         page = find_or_create_page(mapping, file_offset >> PAGE_SHIFT,
      89             :                                    btrfs_alloc_write_mask(mapping));
      90          17 :         if (!page) {
      91           0 :                 ret = -ENOMEM;
      92           0 :                 goto out_unlock;
      93             :         }
      94             : 
      95          17 :         ret = set_page_extent_mapped(page);
      96          17 :         if (ret < 0)
      97           0 :                 goto out_unlock;
      98             : 
                               /* Drop stale delalloc/defrag state for the block before re-marking it. */
      99          17 :         clear_extent_bit(&inode->io_tree, file_offset, range_end,
     100             :                          EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
     101             :                          NULL);
     102          17 :         ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
     103          17 :         if (ret)
     104           0 :                 goto out_unlock;
     105             : 
     106             :         /*
     107             :          * After dirtying the page our caller will need to start a transaction,
     108             :          * and if we are low on metadata free space, that can cause flushing of
     109             :          * delalloc for all inodes in order to get metadata space released.
     110             :          * However we are holding the range locked for the whole duration of
     111             :          * the clone/dedupe operation, so we may deadlock if that happens and no
     112             :          * other task releases enough space. So mark this inode as not being
     113             :          * possible to flush to avoid such deadlock. We will clear that flag
     114             :          * when we finish cloning all extents, since a transaction is started
     115             :          * after finding each extent to clone.
     116             :          */
     117          17 :         set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags);
     118             : 
     119          17 :         if (comp_type == BTRFS_COMPRESS_NONE) {
     120           6 :                 memcpy_to_page(page, offset_in_page(file_offset), data_start,
     121             :                                datal);
     122             :         } else {
                                       /* Compressed payload: inline_size bytes in, datal bytes out. */
     123          11 :                 ret = btrfs_decompress(comp_type, data_start, page,
     124             :                                        offset_in_page(file_offset),
     125             :                                        inline_size, datal);
     126          11 :                 if (ret)
     127           0 :                         goto out_unlock;
     128             :                 flush_dcache_page(page);
     129             :         }
     130             : 
     131             :         /*
     132             :          * If our inline data is smaller than the block/page size, then the
     133             :          * remainder of the block/page is equivalent to zeroes. We had something
     134             :          * like the following done:
     135             :          *
     136             :          * $ xfs_io -f -c "pwrite -S 0xab 0 500" file
     137             :          * $ sync  # (or fsync)
     138             :          * $ xfs_io -c "falloc 0 4K" file
     139             :          * $ xfs_io -c "pwrite -S 0xcd 4K 4K"
     140             :          *
     141             :          * So what's in the range [500, 4095] corresponds to zeroes.
     142             :          */
                               /*
                                * NOTE(review): the zeroing below starts at page offset @datal, while
                                * the copy above starts at offset_in_page(file_offset). The two only
                                * line up when file_offset is page aligned - confirm this is intended
                                * when the block size is smaller than the page size.
                                */
     143          17 :         if (datal < block_size)
     144           8 :                 memzero_page(page, datal, block_size - datal);
     145             : 
     146          17 :         btrfs_page_set_uptodate(fs_info, page, file_offset, block_size);
     147          17 :         btrfs_page_clear_checked(fs_info, page, file_offset, block_size);
     148          17 :         btrfs_page_set_dirty(fs_info, page, file_offset, block_size);
     149          17 : out_unlock:
     150          17 :         if (page) {
     151          17 :                 unlock_page(page);
     152          17 :                 put_page(page);
     153             :         }
                               /*
                                * Only a failure gives the reserved data space back; the extents
                                * reservation is released on success and failure alike.
                                */
     154          17 :         if (ret)
     155           0 :                 btrfs_delalloc_release_space(inode, data_reserved, file_offset,
     156             :                                              block_size, true);
     157          17 :         btrfs_delalloc_release_extents(inode, block_size);
     158          17 : out:
     159          17 :         extent_changeset_free(data_reserved);
     160             : 
     161          17 :         return ret;
     162             : }
     163             : 
     164             : /*
     165             :  * Deal with cloning of inline extents. We try to copy the inline extent from
     166             :  * the source inode to destination inode when possible. When not possible we
     167             :  * copy the inline extent's data into the respective page of the inode.
                        *
                        * @dst:          Destination inode
                        * @path:         Path used to search the root of @dst; released before a
                        *                transaction is started or the data is copied to a page
                        *                following that search
                        * @new_key:      Key for the item at the destination; new_key->offset is the
                        *                destination file offset
                        * @drop_start:   Start of the range handed to btrfs_drop_extents()
                        * @datal:        Data length of the inline extent, used to compute the
                        *                sector-aligned end and compared against the i_size of @dst
                        * @size:         Size in bytes of the item stored in @inline_data
                        * @comp_type:    Compression type, forwarded to copy_inline_to_page()
                        * @inline_data:  Buffer with the source inline extent item
                        * @trans_out:    On success, receives the transaction handle started here
                        *
                        * Two strategies are used:
                        *  - copy the item itself into a leaf of the destination root, inside a
                        *    transaction started with 3 units;
                        *  - copy the data into the page cache via copy_inline_to_page() and then
                        *    start a transaction with 1 unit.
                        *
                        * Returns 0 on success or a negative errno. On failure *trans_out is left
                        * untouched and any transaction started here is aborted and ended.
     168             :  */
     169          44 : static int clone_copy_inline_extent(struct inode *dst,
     170             :                                     struct btrfs_path *path,
     171             :                                     struct btrfs_key *new_key,
     172             :                                     const u64 drop_start,
     173             :                                     const u64 datal,
     174             :                                     const u64 size,
     175             :                                     const u8 comp_type,
     176             :                                     char *inline_data,
     177             :                                     struct btrfs_trans_handle **trans_out)
     178             : {
     179          44 :         struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
     180          44 :         struct btrfs_root *root = BTRFS_I(dst)->root;
     181          44 :         const u64 aligned_end = ALIGN(new_key->offset + datal,
     182             :                                       fs_info->sectorsize);
     183          44 :         struct btrfs_trans_handle *trans = NULL;
     184          44 :         struct btrfs_drop_extents_args drop_args = { 0 };
     185          44 :         int ret;
     186          44 :         struct btrfs_key key;
     187             : 
                               /* A non-zero destination offset always goes through the page cache. */
     188          44 :         if (new_key->offset > 0) {
     189          11 :                 ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
     190             :                                           inline_data, size, datal, comp_type);
     191          11 :                 goto out;
     192             :         }
     193             : 
                               /* Look for a file extent item at offset 0 of the destination inode. */
     194          33 :         key.objectid = btrfs_ino(BTRFS_I(dst));
     195          33 :         key.type = BTRFS_EXTENT_DATA_KEY;
     196          33 :         key.offset = 0;
     197          33 :         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
     198          33 :         if (ret < 0) {
                                       /* No transaction has been started yet, nothing to undo. */
     199             :                 return ret;
     200          33 :         } else if (ret > 0) {
                                       /* No exact match: inspect the item at the returned slot. */
     201          18 :                 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
     202          17 :                         ret = btrfs_next_leaf(root, path);
     203          17 :                         if (ret < 0)
     204             :                                 return ret;
     205          17 :                         else if (ret > 0)
     206          17 :                                 goto copy_inline_extent;
     207             :                 }
     208           1 :                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
     209           1 :                 if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
     210           1 :                     key.type == BTRFS_EXTENT_DATA_KEY) {
     211             :                         /*
     212             :                          * There's an implicit hole at file offset 0, copy the
     213             :                          * inline extent's data to the page.
     214             :                          */
     215           1 :                         ASSERT(key.offset > 0);
     216           1 :                         goto copy_to_page;
     217             :                 }
     218          15 :         } else if (i_size_read(dst) <= datal) {
     219          13 :                 struct btrfs_file_extent_item *ei;
     220             : 
     221          13 :                 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
     222             :                                     struct btrfs_file_extent_item);
     223             :                 /*
     224             :                  * If it's an inline extent replace it with the source inline
     225             :                  * extent, otherwise copy the source inline extent data into
     226             :                  * the respective page at the destination inode.
     227             :                  */
     228          13 :                 if (btrfs_file_extent_type(path->nodes[0], ei) ==
     229             :                     BTRFS_FILE_EXTENT_INLINE)
     230          10 :                         goto copy_inline_extent;
     231             : 
     232           3 :                 goto copy_to_page;
     233             :         }
     234             : 
     235           2 : copy_inline_extent:
     236             :         /*
     237             :          * We have no extent items, or we have an extent at offset 0 which may
     238             :          * or may not be inlined. All these cases are dealt the same way.
     239             :          */
     240          29 :         if (i_size_read(dst) > datal) {
     241             :                 /*
     242             :                  * At the destination offset 0 we have either a hole, a regular
     243             :                  * extent or an inline extent larger than the one we want to
     244             :                  * clone. Deal with all these cases by copying the inline extent
     245             :                  * data into the respective page at the destination inode.
     246             :                  */
     247           2 :                 goto copy_to_page;
     248             :         }
     249             : 
     250             :         /*
     251             :          * Release path before starting a new transaction so we don't hold locks
     252             :          * that would confuse lockdep.
     253             :          */
     254          27 :         btrfs_release_path(path);
     255             :         /*
     256             :          * If we end up here it means we are copying the inline extent into a
     257             :          * leaf of the destination inode. We know we will drop or adjust at most one
     258             :          * extent item in the destination root.
     259             :          *
     260             :          * 1 unit - adjusting old extent (we may have to split it)
     261             :          * 1 unit - add new extent
     262             :          * 1 unit - inode update
     263             :          */
     264          27 :         trans = btrfs_start_transaction(root, 3);
     265          27 :         if (IS_ERR(trans)) {
     266           0 :                 ret = PTR_ERR(trans);
                                       /* Reset so the out label does not treat the error pointer as a handle. */
     267           0 :                 trans = NULL;
     268           0 :                 goto out;
     269             :         }
     270          27 :         drop_args.path = path;
     271          27 :         drop_args.start = drop_start;
     272          27 :         drop_args.end = aligned_end;
     273          27 :         drop_args.drop_cache = true;
     274          27 :         ret = btrfs_drop_extents(trans, root, BTRFS_I(dst), &drop_args);
     275          27 :         if (ret)
     276           0 :                 goto out;
     277          27 :         ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
     278          27 :         if (ret)
     279           0 :                 goto out;
     280             : 
                               /* Write the source item's bytes verbatim into the new leaf item. */
     281          54 :         write_extent_buffer(path->nodes[0], inline_data,
     282          27 :                             btrfs_item_ptr_offset(path->nodes[0],
     283             :                                                   path->slots[0]),
     284             :                             size);
     285          27 :         btrfs_update_inode_bytes(BTRFS_I(dst), datal, drop_args.bytes_found);
     286          27 :         btrfs_set_inode_full_sync(BTRFS_I(dst));
     287          27 :         ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end);
     288          44 : out:
     289          44 :         if (!ret && !trans) {
     290             :                 /*
     291             :                  * No transaction here means we copied the inline extent into a
     292             :                  * page of the destination inode.
     293             :                  *
     294             :                  * 1 unit to update inode item
     295             :                  */
     296          17 :                 trans = btrfs_start_transaction(root, 1);
     297          17 :                 if (IS_ERR(trans)) {
     298           0 :                         ret = PTR_ERR(trans);
     299           0 :                         trans = NULL;
     300             :                 }
     301             :         }
                               /* A failure after a transaction was started aborts and ends it here. */
     302          44 :         if (ret && trans) {
     303           0 :                 btrfs_abort_transaction(trans, ret);
     304           0 :                 btrfs_end_transaction(trans);
     305             :         }
                               /* The handle is handed to the caller only on success. */
     306          44 :         if (!ret)
     307          44 :                 *trans_out = trans;
     308             : 
     309             :         return ret;
     310             : 
     311           6 : copy_to_page:
     312             :         /*
     313             :          * Release our path because we don't need it anymore and also because
     314             :          * copy_inline_to_page() needs to reserve data and metadata, which may
     315             :          * need to flush delalloc when we are low on available space and
     316             :          * therefore cause a deadlock if writeback of an inline extent needs to
     317             :          * write to the same leaf or an ordered extent completion needs to write
     318             :          * to the same leaf.
     319             :          */
     320           6 :         btrfs_release_path(path);
     321             : 
     322           6 :         ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
     323             :                                   inline_data, size, datal, comp_type);
     324           6 :         goto out;
     325             : }
     326             : 
     327             : /*
     328             :  * Clone a file range from one inode to another.
     329             :  *
     330             :  * @src:             Inode to clone from
     331             :  * @inode:           Inode to clone to
     332             :  * @off:             Offset within source to start clone from
     333             :  * @olen:            Original length, passed by user, of range to clone
     334             :  * @olen_aligned:    Block-aligned value of olen
     335             :  * @destoff:         Offset within @inode to start clone
     336             :  * @no_time_update:  Whether to update mtime/ctime on the target inode
     337             :  */
     338     1429260 : static int btrfs_clone(struct inode *src, struct inode *inode,
     339             :                        const u64 off, const u64 olen, const u64 olen_aligned,
     340             :                        const u64 destoff, int no_time_update)
     341             : {
     342     1429260 :         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
     343     1429260 :         struct btrfs_path *path = NULL;
     344     1429260 :         struct extent_buffer *leaf;
     345     1429260 :         struct btrfs_trans_handle *trans;
     346     1429260 :         char *buf = NULL;
     347     1429260 :         struct btrfs_key key;
     348     1429260 :         u32 nritems;
     349     1429260 :         int slot;
     350     1429260 :         int ret;
     351     1429260 :         const u64 len = olen_aligned;
     352     1429260 :         u64 last_dest_end = destoff;
     353     1429260 :         u64 prev_extent_end = off;
     354             : 
     355     1429260 :         ret = -ENOMEM;
     356     1429260 :         buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
     357     1429131 :         if (!buf)
     358             :                 return ret;
     359             : 
     360     1429131 :         path = btrfs_alloc_path();
     361     1429187 :         if (!path) {
     362           0 :                 kvfree(buf);
     363           0 :                 return ret;
     364             :         }
     365             : 
     366     1429187 :         path->reada = READA_FORWARD;
     367             :         /* Clone data */
     368     1429187 :         key.objectid = btrfs_ino(BTRFS_I(src));
     369     1429187 :         key.type = BTRFS_EXTENT_DATA_KEY;
     370     1429187 :         key.offset = off;
     371             : 
     372    15013520 :         while (1) {
     373    16442707 :                 struct btrfs_file_extent_item *extent;
     374    16442707 :                 u64 extent_gen;
     375    16442707 :                 int type;
     376    16442707 :                 u32 size;
     377    16442707 :                 struct btrfs_key new_key;
     378    16442707 :                 u64 disko = 0, diskl = 0;
     379    16442707 :                 u64 datao = 0, datal = 0;
     380    16442707 :                 u8 comp;
     381    16442707 :                 u64 drop_start;
     382             : 
     383             :                 /* Note the key will change type as we walk through the tree */
     384    16442707 :                 ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
     385             :                                 0, 0);
     386    16442720 :                 if (ret < 0)
     387         306 :                         goto out;
     388             :                 /*
     389             :                  * First search, if no extent item that starts at offset off was
     390             :                  * found but the previous item is an extent item, it's possible
     391             :                  * it might overlap our target range, therefore process it.
     392             :                  */
     393    16442720 :                 if (key.offset == off && ret > 0 && path->slots[0] > 0) {
     394     1343287 :                         btrfs_item_key_to_cpu(path->nodes[0], &key,
     395             :                                               path->slots[0] - 1);
     396     1343279 :                         if (key.type == BTRFS_EXTENT_DATA_KEY)
     397     1130102 :                                 path->slots[0]--;
     398             :                 }
     399             : 
     400    16442712 :                 nritems = btrfs_header_nritems(path->nodes[0]);
     401    17321479 : process_slot:
     402    17321479 :                 if (path->slots[0] >= nritems) {
     403       32777 :                         ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
     404       32777 :                         if (ret < 0)
     405           0 :                                 goto out;
     406       32777 :                         if (ret > 0)
     407             :                                 break;
     408       12210 :                         nritems = btrfs_header_nritems(path->nodes[0]);
     409             :                 }
     410    17300912 :                 leaf = path->nodes[0];
     411    17300912 :                 slot = path->slots[0];
     412             : 
     413    17300912 :                 btrfs_item_key_to_cpu(leaf, &key, slot);
     414    17301038 :                 if (key.type > BTRFS_EXTENT_DATA_KEY ||
     415    17301051 :                     key.objectid != btrfs_ino(BTRFS_I(src)))
     416             :                         break;
     417             : 
     418    17208913 :                 ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);
     419             : 
     420    17208913 :                 extent = btrfs_item_ptr(leaf, slot,
     421             :                                         struct btrfs_file_extent_item);
     422    17208849 :                 extent_gen = btrfs_file_extent_generation(leaf, extent);
     423    17208884 :                 comp = btrfs_file_extent_compression(leaf, extent);
     424    17208809 :                 type = btrfs_file_extent_type(leaf, extent);
     425    17208827 :                 if (type == BTRFS_FILE_EXTENT_REG ||
     426             :                     type == BTRFS_FILE_EXTENT_PREALLOC) {
     427    17208779 :                         disko = btrfs_file_extent_disk_bytenr(leaf, extent);
     428    17208798 :                         diskl = btrfs_file_extent_disk_num_bytes(leaf, extent);
     429    17208799 :                         datao = btrfs_file_extent_offset(leaf, extent);
     430    17208799 :                         datal = btrfs_file_extent_num_bytes(leaf, extent);
     431          48 :                 } else if (type == BTRFS_FILE_EXTENT_INLINE) {
     432             :                         /* Take upper bound, may be compressed */
     433          48 :                         datal = btrfs_file_extent_ram_bytes(leaf, extent);
     434             :                 }
     435             : 
     436             :                 /*
     437             :                  * The first search might have left us at an extent item that
     438             :                  * ends before our target range's start, can happen if we have
     439             :                  * holes and NO_HOLES feature enabled.
     440             :                  *
     441             :                  * Subsequent searches may leave us on a file range we have
     442             :                  * processed before - this happens due to a race with ordered
     443             :                  * extent completion for a file range that is outside our source
     444             :                  * range, but that range was part of a file extent item that
     445             :                  * also covered a leading part of our source range.
     446             :                  */
     447    17208834 :                 if (key.offset + datal <= prev_extent_end) {
     448      878767 :                         path->slots[0]++;
     449      878767 :                         goto process_slot;
     450    16330067 :                 } else if (key.offset >= off + len) {
     451             :                         break;
     452             :                 }
     453             : 
     454    15358305 :                 prev_extent_end = key.offset + datal;
     455    15358305 :                 size = btrfs_item_size(leaf, slot);
     456    15358306 :                 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
     457             :                                    size);
     458             : 
     459    15358303 :                 btrfs_release_path(path);
     460             : 
     461    15358309 :                 memcpy(&new_key, &key, sizeof(new_key));
     462    15358309 :                 new_key.objectid = btrfs_ino(BTRFS_I(inode));
     463    15358309 :                 if (off <= key.offset)
     464    15107142 :                         new_key.offset = key.offset + destoff - off;
     465             :                 else
     466      251167 :                         new_key.offset = destoff;
     467             : 
     468             :                 /*
     469             :                  * Deal with a hole that doesn't have an extent item that
     470             :                  * represents it (NO_HOLES feature enabled).
     471             :                  * This hole is either in the middle of the cloning range or at
     472             :                  * the beginning (fully overlaps it or partially overlaps it).
     473             :                  */
     474    15358309 :                 if (new_key.offset != last_dest_end)
     475             :                         drop_start = last_dest_end;
     476             :                 else
     477    14086351 :                         drop_start = new_key.offset;
     478             : 
     479    15358309 :                 if (type == BTRFS_FILE_EXTENT_REG ||
     480             :                     type == BTRFS_FILE_EXTENT_PREALLOC) {
     481    15358265 :                         struct btrfs_replace_extent_info clone_info;
     482             : 
     483             :                         /*
     484             :                          *    a  | --- range to clone ---|  b
     485             :                          * | ------------- extent ------------- |
     486             :                          */
     487             : 
     488             :                         /* Subtract range b */
     489    15358265 :                         if (key.offset + datal > off + len)
     490      265507 :                                 datal = off + len - key.offset;
     491             : 
     492             :                         /* Subtract range a */
     493    15358265 :                         if (off > key.offset) {
     494      251167 :                                 datao += off - key.offset;
     495      251167 :                                 datal -= off - key.offset;
     496             :                         }
     497             : 
     498    15358265 :                         clone_info.disk_offset = disko;
     499    15358265 :                         clone_info.disk_len = diskl;
     500    15358265 :                         clone_info.data_offset = datao;
     501    15358265 :                         clone_info.data_len = datal;
     502    15358265 :                         clone_info.file_offset = new_key.offset;
     503    15358265 :                         clone_info.extent_buf = buf;
     504    15358265 :                         clone_info.is_new_extent = false;
     505    15358265 :                         clone_info.update_times = !no_time_update;
     506    15358265 :                         ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
     507    15358265 :                                         drop_start, new_key.offset + datal - 1,
     508             :                                         &clone_info, &trans);
     509    15358265 :                         if (ret)
     510         304 :                                 goto out;
     511             :                 } else {
     512          44 :                         ASSERT(type == BTRFS_FILE_EXTENT_INLINE);
     513             :                         /*
     514             :                          * Inline extents always have to start at file offset 0
     515             :                          * and can never be bigger than the sector size. We can
     516             :                          * never clone only parts of an inline extent, since all
     517             :                          * reflink operations must start at a sector size aligned
     518             :                          * offset, and the length must be aligned too or end at
     519             :                          * the i_size (which implies the whole inlined data).
     520             :                          */
     521          44 :                         ASSERT(key.offset == 0);
     522          44 :                         ASSERT(datal <= fs_info->sectorsize);
     523          44 :                         if (WARN_ON(type != BTRFS_FILE_EXTENT_INLINE) ||
     524          44 :                             WARN_ON(key.offset != 0) ||
     525          44 :                             WARN_ON(datal > fs_info->sectorsize)) {
     526           0 :                                 ret = -EUCLEAN;
     527           0 :                                 goto out;
     528             :                         }
     529             : 
     530          44 :                         ret = clone_copy_inline_extent(inode, path, &new_key,
     531             :                                                        drop_start, datal, size,
     532             :                                                        comp, buf, &trans);
     533          44 :                         if (ret)
     534           0 :                                 goto out;
     535             :                 }
     536             : 
     537    15358005 :                 btrfs_release_path(path);
     538             : 
     539             :                 /*
     540             :                  * Whenever we share an extent we update the last_reflink_trans
     541             :                  * of each inode to the current transaction. This is needed to
     542             :                  * make sure fsync does not log multiple checksum items with
     543             :                  * overlapping ranges (because some extent items might refer
     544             :                  * only to sections of the original extent). For the destination
     545             :                  * inode we do this regardless of the generation of the extents
     546             :                  * or even if they are inline extents or explicit holes, to make
     547             :                  * sure a full fsync does not skip them. For the source inode,
     548             :                  * we only need to update last_reflink_trans in case it's a new
     549             :                  * extent that is not a hole or an inline extent, to deal with
     550             :                  * the checksums problem on fsync.
     551             :                  */
     552    15358005 :                 if (extent_gen == trans->transid && disko > 0)
     553     9734840 :                         BTRFS_I(src)->last_reflink_trans = trans->transid;
     554             : 
     555    15358005 :                 BTRFS_I(inode)->last_reflink_trans = trans->transid;
     556             : 
     557    15358005 :                 last_dest_end = ALIGN(new_key.offset + datal,
     558             :                                       fs_info->sectorsize);
     559    15358005 :                 ret = clone_finish_inode_update(trans, inode, last_dest_end,
     560             :                                                 destoff, olen, no_time_update);
     561    15358002 :                 if (ret)
     562           0 :                         goto out;
     563    15358002 :                 if (new_key.offset + datal >= destoff + len)
     564             :                         break;
     565             : 
     566    15013521 :                 btrfs_release_path(path);
     567    15013523 :                 key.offset = prev_extent_end;
     568             : 
     569    15013523 :                 if (fatal_signal_pending(current)) {
     570           2 :                         ret = -EINTR;
     571           2 :                         goto out;
     572             :                 }
     573             : 
     574    15013518 :                 cond_resched();
     575             :         }
     576     1428935 :         ret = 0;
     577             : 
     578     1428935 :         if (last_dest_end < destoff + len) {
     579             :                 /*
     580             :                  * We have an implicit hole that fully or partially overlaps our
     581             :                  * cloning range at its end. This means that we either have the
     582             :                  * NO_HOLES feature enabled or the implicit hole happened due to
     583             :                  * mixing buffered and direct IO writes against this file.
     584             :                  */
     585     1084434 :                 btrfs_release_path(path);
     586             : 
     587             :                 /*
     588             :                  * When using NO_HOLES and we are cloning a range that covers
     589             :                  * only a hole (no extents) into a range beyond the current
     590             :                  * i_size, punching a hole in the target range will not create
     591             :                  * an extent map defining a hole, because the range starts at or
     592             :                  * beyond current i_size. If the file previously had an i_size
     593             :                  * greater than the new i_size set by this clone operation, we
     594             :                  * need to make sure the next fsync is a full fsync, so that it
     595             :                  * detects and logs a hole covering a range from the current
     596             :                  * i_size to the new i_size. If the clone range covers extents,
     597             :                  * besides a hole, then we know the full sync flag was already
     598             :                  * set by previous calls to btrfs_replace_file_extents() that
     599             :                  * replaced file extent items.
     600             :                  */
     601     1084475 :                 if (last_dest_end >= i_size_read(inode))
     602       38211 :                         btrfs_set_inode_full_sync(BTRFS_I(inode));
     603             : 
     604     1084474 :                 ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
     605             :                                 last_dest_end, destoff + len - 1, NULL, &trans);
     606     1084371 :                 if (ret)
     607        1744 :                         goto out;
     608             : 
     609     1082627 :                 ret = clone_finish_inode_update(trans, inode, destoff + len,
     610             :                                                 destoff, olen, no_time_update);
     611             :         }
     612             : 
     613      344501 : out:
     614     1429232 :         btrfs_free_path(path);
     615     1429197 :         kvfree(buf);
     616     1429015 :         clear_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &BTRFS_I(inode)->runtime_flags);
     617             : 
     618     1429015 :         return ret;
     619             : }
     620             : 
     /*
      * Unlock the range [loff1, loff1 + len - 1] in inode1's io_tree and the
      * range [loff2, loff2 + len - 1] in inode2's io_tree.
      *
      * The callers in this file pair it with btrfs_double_extent_lock(), passing
      * the same arguments. The two ranges are released in argument order; unlike
      * on the lock side, no ordering between the inodes is applied here.
      */
     621     1429106 : static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
     622             :                                        struct inode *inode2, u64 loff2, u64 len)
     623             : {
     624     1429106 :         unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1, NULL);
     625     1429194 :         unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1, NULL);
     626     1429220 : }
     627             : 
     /*
      * Lock the range [loff1, loff1 + len - 1] in inode1's io_tree and the range
      * [loff2, loff2 + len - 1] in inode2's io_tree.
      *
      * The two lock_extent() calls are always issued in the same order, no
      * matter how the caller ordered the arguments:
      *  - different inodes: the inode with the higher address is locked first;
      *  - same inode: the range with the lower start offset is locked first.
      * NOTE(review): presumably this fixed order exists so that concurrent
      * callers cannot deadlock on each other - confirm.
      *
      * Once both ranges are locked, btrfs_assert_inode_range_clean() is called
      * on each of them.
      */
     628     1429266 : static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
     629             :                                      struct inode *inode2, u64 loff2, u64 len)
     630             : {
     631     1429266 :         u64 range1_end = loff1 + len - 1;
     632     1429266 :         u64 range2_end = loff2 + len - 1;
     633             : 
     634     1429266 :         if (inode1 < inode2) {
     635             :                 swap(inode1, inode2);
     636             :                 swap(loff1, loff2);
     637             :                 swap(range1_end, range2_end);
     638      878251 :         } else if (inode1 == inode2 && loff2 < loff1) {
     639      131687 :                 swap(loff1, loff2);
     640      131687 :                 swap(range1_end, range2_end);
     641             :         }
     642             : 
     643     1429266 :         lock_extent(&BTRFS_I(inode1)->io_tree, loff1, range1_end, NULL);
     644     1429260 :         lock_extent(&BTRFS_I(inode2)->io_tree, loff2, range2_end, NULL);
     645             : 
     646     1429275 :         btrfs_assert_inode_range_clean(BTRFS_I(inode1), loff1, range1_end);
     647     1429256 :         btrfs_assert_inode_range_clean(BTRFS_I(inode2), loff2, range2_end);
     648     1429261 : }
     649             : 
     /*
      * Take the i_mmap_lock of both inodes for writing.
      *
      * The inode with the higher address is locked first, with down_write(). The
      * other one is then taken with down_write_nested() and
      * SINGLE_DEPTH_NESTING. The caller visible in this file only uses this
      * helper when the two inodes differ; the same-inode case takes a different
      * locking path there.
      */
     650     1816936 : static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2)
     651             : {
     652     1816936 :         if (inode1 < inode2)
     653      904590 :                 swap(inode1, inode2);
     654     1816936 :         down_write(&BTRFS_I(inode1)->i_mmap_lock);
     655     1817011 :         down_write_nested(&BTRFS_I(inode2)->i_mmap_lock, SINGLE_DEPTH_NESTING);
     656     1817011 : }
     657             : 
     /*
      * Release the i_mmap_lock write locks of both inodes, in argument order.
      * Used by the caller in this file as the counterpart of
      * btrfs_double_mmap_lock().
      */
     658             : static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2)
     659             : {
     660     1817067 :         up_write(&BTRFS_I(inode1)->i_mmap_lock);
     661     1817059 :         up_write(&BTRFS_I(inode2)->i_mmap_lock);
     662             : }
     663             : 
     /*
      * Run btrfs_clone() for a single dedupe range while holding the extent
      * locks of both the source range [loff, loff + len - 1] of src and the
      * destination range [dst_loff, dst_loff + len - 1] of dst.
      *
      * btrfs_clone() receives both len and len rounded up to the filesystem
      * block size. NOTE(review): the trailing argument 1 is presumably
      * btrfs_clone()'s no_time_update flag - confirm against its signature.
      *
      * After the ranges are unlocked, btrfs_btree_balance_dirty() is called.
      *
      * Returns the value returned by btrfs_clone().
      */
     664     1085749 : static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
     665             :                                    struct inode *dst, u64 dst_loff)
     666             : {
     667     1085749 :         struct btrfs_fs_info *fs_info = BTRFS_I(src)->root->fs_info;
     668     1085749 :         const u64 bs = fs_info->sb->s_blocksize;
     669     1085749 :         int ret;
     670             : 
     671             :         /*
     672             :          * Lock destination range to serialize with concurrent readahead() and
     673             :          * source range to serialize with relocation.
     674             :          */
     675     1085749 :         btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
     676     1085729 :         ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1);
     677     1085666 :         btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
     678             : 
     679     1085702 :         btrfs_btree_balance_dirty(fs_info);
     680             : 
     681     1085355 :         return ret;
     682             : }
     683             : 
     /*
      * Deduplicate olen bytes from src at offset loff into dst at offset
      * dst_loff.
      *
      * The request is refused with -EAGAIN (after a rate limited warning) when
      * the destination root has send operations in progress. Otherwise the
      * root's dedupe_in_progress counter is raised for the duration of the
      * call; both the check and the counter updates happen under
      * root_item_lock.
      *
      * The work is split into chunks of BTRFS_MAX_DEDUPE_LEN bytes, each handled
      * by btrfs_extent_same_range(), followed by one call for the remaining
      * tail if it is not empty. Processing stops at the first chunk that fails.
      *
      * Returns 0 on success (also when olen is 0, in which case nothing is
      * done), -EAGAIN as described above, or the error of the failing chunk.
      */
     684     1105275 : static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
     685             :                              struct inode *dst, u64 dst_loff)
     686             : {
     687     1105275 :         int ret = 0;
     688     1105275 :         u64 i, tail_len, chunk_count;
     689     1105275 :         struct btrfs_root *root_dst = BTRFS_I(dst)->root;
     690             : 
     691     1105275 :         spin_lock(&root_dst->root_item_lock);
     692     1105309 :         if (root_dst->send_in_progress) {
     693       19618 :                 btrfs_warn_rl(root_dst->fs_info,
     694             : "cannot deduplicate to root %llu while send operations are using it (%d in progress)",
     695             :                               root_dst->root_key.objectid,
     696             :                               root_dst->send_in_progress);
     697       19618 :                 spin_unlock(&root_dst->root_item_lock);
     698       19618 :                 return -EAGAIN;
     699             :         }
     700     1085691 :         root_dst->dedupe_in_progress++;
     701     1085691 :         spin_unlock(&root_dst->root_item_lock);
     702             : 
     703     1085689 :         tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
     704     1085689 :         chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
     705             : 
     706     1085783 :         for (i = 0; i < chunk_count; i++) {
     707          99 :                 ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
     708             :                                               dst, dst_loff);
     709          94 :                 if (ret)
     710           0 :                         goto out;
     711             : 
     712          94 :                 loff += BTRFS_MAX_DEDUPE_LEN;
     713          94 :                 dst_loff += BTRFS_MAX_DEDUPE_LEN;
     714             :         }
     715             : 
     716     1085684 :         if (tail_len > 0)
     717     1085664 :                 ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff);
     718          20 : out:
     719     1085360 :         spin_lock(&root_dst->root_item_lock);
     720     1085691 :         root_dst->dedupe_in_progress--;
     721     1085691 :         spin_unlock(&root_dst->root_item_lock);
     722             : 
     723     1085691 :         return ret;
     724             : }
     725             : 
     /*
      * Clone olen bytes starting at offset off of file_src into file, starting
      * at offset destoff.
      *
      * file:     destination file
      * file_src: source file
      * off:      start offset in the source
      * olen:     requested length in bytes
      * destoff:  start offset in the destination
      *
      * If the source range ends exactly at the source's i_size, the length used
      * for locking, cloning and page cache truncation is extended up to the next
      * block boundary. If destoff lies beyond the destination's i_size, the
      * destination is first expanded up to destoff with btrfs_cont_expand() and
      * the ordered extents in that area are waited for; a failure of either step
      * is returned right away.
      *
      * Returns the error of btrfs_clone() if it failed, otherwise the result of
      * the ordered extent wait that follows it. The wait, the page cache
      * truncation and btrfs_btree_balance_dirty() run even when btrfs_clone()
      * failed.
      */
     726      343830 : static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
     727             :                                         u64 off, u64 olen, u64 destoff)
     728             : {
     729      343830 :         struct inode *inode = file_inode(file);
     730      343830 :         struct inode *src = file_inode(file_src);
     731      343830 :         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
     732      343830 :         int ret;
     733      343830 :         int wb_ret;
     734      343830 :         u64 len = olen;
     735      343830 :         u64 bs = fs_info->sb->s_blocksize;
     736             : 
     737             :         /*
     738             :          * VFS's generic_remap_file_range_prep() protects us from cloning the
     739             :          * eof block into the middle of a file, which would result in corruption
     740             :          * if the file size is not blocksize aligned. So we don't need to check
     741             :          * for that case here.
     742             :          */
     743      343830 :         if (off + len == src->i_size)
     744        3940 :                 len = ALIGN(src->i_size, bs) - off;
     745             : 
     746      343830 :         if (destoff > inode->i_size) {
     747       64215 :                 const u64 wb_start = ALIGN_DOWN(inode->i_size, bs);
     748             : 
     749       64215 :                 ret = btrfs_cont_expand(BTRFS_I(inode), inode->i_size, destoff);
     750       64216 :                 if (ret)
     751             :                         return ret;
     752             :                 /*
     753             :                  * We may have truncated the last block if the inode's size is
     754             :                  * not sector size aligned, so we need to wait for writeback to
     755             :                  * complete before proceeding further, otherwise we can race
     756             :                  * with cloning and attempt to increment a reference to an
     757             :                  * extent that no longer exists (writeback completed right after
     758             :                  * we found the previous extent covering eof and before we
     759             :                  * attempted to increment its reference count).
     760             :                  */
     761       63911 :                 ret = btrfs_wait_ordered_range(inode, wb_start,
     762             :                                                destoff - wb_start);
     763       63911 :                 if (ret)
     764             :                         return ret;
     765             :         }
     766             : 
     767             :         /*
     768             :          * Lock destination range to serialize with concurrent readahead() and
     769             :          * source range to serialize with relocation.
     770             :          */
     771      343526 :         btrfs_double_extent_lock(src, off, inode, destoff, len);
     772      343527 :         ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
     773      343527 :         btrfs_double_extent_unlock(src, off, inode, destoff, len);
     774             : 
     775             :         /*
     776             :          * We may have copied an inline extent into a page of the destination
     777             :          * range, so wait for writeback to complete before truncating pages
     778             :          * from the page cache. This is a rare case.
     779             :          */
     780      343527 :         wb_ret = btrfs_wait_ordered_range(inode, destoff, len);
     781      343525 :         ret = ret ? ret : wb_ret;
     782             :         /*
     783             :          * Truncate page cache pages so that future reads will see the cloned
     784             :          * data immediately and not the previous data.
     785             :          */
     786      343525 :         truncate_inode_pages_range(&inode->i_data,
     787      343525 :                                 round_down(destoff, PAGE_SIZE),
     788      343525 :                                 round_up(destoff + len, PAGE_SIZE) - 1);
     789             : 
     790      343524 :         btrfs_btree_balance_dirty(fs_info);
     791             : 
     792      343524 :         return ret;
     793             : }
     794             : 
     /*
      * Btrfs specific checks and flushing performed before the request is handed
      * to generic_remap_file_range_prep().
      *
      * Steps, in order:
      *  - for a request without REMAP_FILE_DEDUP, fail with -EROFS if the root of
      *    the destination inode is read-only;
      *  - fail with -EINVAL if exactly one of the two inodes has
      *    BTRFS_INODE_NODATASUM set;
      *  - flush the whole mapping of the source inode;
      *  - wait for ordered extents on the block aligned source and destination
      *    ranges. The waited length is ALIGN(*len, bs), except for a non-dedupe
      *    request with *len == 0, where it spans from the block containing
      *    pos_in up to the block aligned i_size of the source.
      *
      * len is passed by pointer to generic_remap_file_range_prep(), so the
      * caller must expect *len to have been changed on return.
      *
      * Returns the negative value of the first failing step, otherwise the
      * return value of generic_remap_file_range_prep().
      */
     795     2361420 : static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
     796             :                                        struct file *file_out, loff_t pos_out,
     797             :                                        loff_t *len, unsigned int remap_flags)
     798             : {
     799     2361420 :         struct inode *inode_in = file_inode(file_in);
     800     2361420 :         struct inode *inode_out = file_inode(file_out);
     801     2361420 :         u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize;
     802     2361420 :         u64 wb_len;
     803     2361420 :         int ret;
     804             : 
     805     2361420 :         if (!(remap_flags & REMAP_FILE_DEDUP)) {
     806      490595 :                 struct btrfs_root *root_out = BTRFS_I(inode_out)->root;
     807             : 
     808      490595 :                 if (btrfs_root_readonly(root_out))
     809             :                         return -EROFS;
     810             : 
     811             :                 ASSERT(inode_in->i_sb == inode_out->i_sb);
     812             :         }
     813             : 
     814             :         /* Don't make the dst file partly checksummed */
     815     2361420 :         if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) !=
     816     2361420 :             (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) {
     817             :                 return -EINVAL;
     818             :         }
     819             : 
     820             :         /*
     821             :          * Now that the inodes are locked, we need to start writeback ourselves
     822             :          * and can not rely on the writeback from the VFS's generic helper
     823             :          * generic_remap_file_range_prep() because:
     824             :          *
     825             :          * 1) For compression we must call filemap_fdatawrite_range()
     826             :          *    twice (btrfs_fdatawrite_range() does it for us), and the generic
     827             :          *    helper only calls it once;
     828             :          *
     829             :          * 2) filemap_fdatawrite_range(), called by the generic helper only
     830             :          *    waits for the writeback to complete, i.e. for IO to be done, and
     831             :          *    not for the ordered extents to complete. We need to wait for them
     832             :          *    to complete so that new file extent items are in the fs tree.
     833             :          */
     834     2361420 :         if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP))
     835        9398 :                 wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs);
     836             :         else
     837     2352022 :                 wb_len = ALIGN(*len, bs);
     838             : 
     839             :         /*
     840             :          * Workaround to make sure NOCOW buffered writes reach disk as NOCOW.
     841             :          *
     842             :          * Btrfs' back references do not have a block level granularity, they
     843             :          * work at the whole extent level.
     844             :          * NOCOW buffered write without data space reserved may not be able
     845             :          * to fall back to CoW due to lack of data space, thus could cause
     846             :          * data loss.
     847             :          *
     848             :          * Here we take a shortcut by flushing the whole inode, so that all
     849             :          * nocow write should reach disk as nocow before we increase the
     850             :          * reference of the extent. We could do better by only flushing NOCOW
     851             :          * data, but that needs extra accounting.
     852             :          *
     853             :          * Also we don't need to check ASYNC_EXTENT, as async extent will be
     854             :          * CoWed anyway, not affecting nocow part.
     855             :          */
     856     2361420 :         ret = filemap_flush(inode_in->i_mapping);
     857     2361497 :         if (ret < 0)
     858             :                 return ret;
     859             : 
     860     2361497 :         ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs),
     861             :                                        wb_len);
     862     2361083 :         if (ret < 0)
     863             :                 return ret;
     864     2361083 :         ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs),
     865             :                                        wb_len);
     866     2361538 :         if (ret < 0)
     867             :                 return ret;
     868             : 
     869     2361538 :         return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
     870             :                                             len, remap_flags);
     871             : }
     872             : 
     /*
      * Tell whether writes to this file are expected to be synchronous.
      *
      * Returns true if the file's f_flags contain __O_SYNC or O_DSYNC, or if
      * IS_SYNC() is true for the file's inode; false otherwise.
      */
     873     2853631 : static bool file_sync_write(const struct file *file)
     874             : {
     875     2853631 :         if (file->f_flags & (__O_SYNC | O_DSYNC))
     876             :                 return true;
     877     2853628 :         if (IS_SYNC(file_inode(file)))
     878           2 :                 return true;
     879             : 
     880             :         return false;
     881             : }
     882             : 
     /*
      * Clone or deduplicate len bytes from src_file at offset off into dst_file
      * at offset destoff.
      *
      * Only REMAP_FILE_DEDUP and REMAP_FILE_ADVISORY are accepted in
      * remap_flags; any other bit yields -EINVAL. With REMAP_FILE_DEDUP the
      * range is handled by btrfs_extent_same(), otherwise by
      * btrfs_clone_files().
      *
      * Locking: when source and destination are the same inode, that inode is
      * locked with BTRFS_ILOCK_MMAP. Otherwise both inodes are locked with
      * lock_two_nondirectories() followed by btrfs_double_mmap_lock(). The locks
      * are dropped before the optional fsync at the end.
      *
      * len is handed by pointer to btrfs_remap_file_range_prep(), so the value
      * used afterwards (and returned) may differ from the one passed in. If it
      * is 0 after the preparation step, nothing is remapped.
      *
      * Returns a negative error value on failure, otherwise the (possibly
      * adjusted) length.
      */
     883     2361319 : loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
     884             :                 struct file *dst_file, loff_t destoff, loff_t len,
     885             :                 unsigned int remap_flags)
     886             : {
     887     2361319 :         struct inode *src_inode = file_inode(src_file);
     888     2361319 :         struct inode *dst_inode = file_inode(dst_file);
     889     2361319 :         bool same_inode = dst_inode == src_inode;
     890     2361319 :         int ret;
     891             : 
     892     2361319 :         if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
     893             :                 return -EINVAL;
     894             : 
     895     2361319 :         if (same_inode) {
     896      544524 :                 btrfs_inode_lock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP);
     897             :         } else {
     898     1816795 :                 lock_two_nondirectories(src_inode, dst_inode);
     899     1816938 :                 btrfs_double_mmap_lock(src_inode, dst_inode);
     900             :         }
     901             : 
     902     2361500 :         ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff,
     903             :                                           &len, remap_flags);
     904     2361574 :         if (ret < 0 || len == 0)
     905      912467 :                 goto out_unlock;
     906             : 
     907     1449107 :         if (remap_flags & REMAP_FILE_DEDUP)
     908     1105277 :                 ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff);
     909             :         else
     910      343830 :                 ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
     911             : 
     912     2361590 : out_unlock:
     913     2361590 :         if (same_inode) {
     914      544523 :                 btrfs_inode_unlock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP);
     915             :         } else {
     916     1817067 :                 btrfs_double_mmap_unlock(src_inode, dst_inode);
     917     1817057 :                 unlock_two_nondirectories(src_inode, dst_inode);
     918             :         }
     919             : 
     920             :         /*
     921             :          * If either the source or the destination file was opened with O_SYNC,
     922             :          * O_DSYNC or has the S_SYNC attribute, fsync both the destination and
     923             :          * source files/ranges, so that a successful return followed by a power
     924             :          * failure still leaves the reflinked data readable from both
     925             :          * files/ranges.
     926             :          */
     927     2361380 :         if (ret == 0 && len > 0 &&
     928     1426812 :             (file_sync_write(src_file) || file_sync_write(dst_file))) {
     929           5 :                 ret = btrfs_sync_file(src_file, off, off + len - 1, 0);
     930           5 :                 if (ret == 0)
     931           5 :                         ret = btrfs_sync_file(dst_file, destoff,
     932           5 :                                               destoff + len - 1, 0);
     933             :         }
     934             : 
     935     2361380 :         return ret < 0 ? ret : len;
     936             : }

Generated by: LCOV version 1.14