LCOV - code coverage report
Current view: top level - fs/btrfs - reflink.c (source / functions) Hit Total Coverage
Test: fstests of 6.5.0-rc3-djwx @ Mon Jul 31 20:08:22 PDT 2023 Lines: 388 415 93.5 %
Date: 2023-07-31 20:08:22 Functions: 13 13 100.0 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0
       2             : 
       3             : #include <linux/blkdev.h>
       4             : #include <linux/iversion.h>
       5             : #include "ctree.h"
       6             : #include "fs.h"
       7             : #include "messages.h"
       8             : #include "compression.h"
       9             : #include "delalloc-space.h"
      10             : #include "disk-io.h"
      11             : #include "reflink.h"
      12             : #include "transaction.h"
      13             : #include "subpage.h"
      14             : #include "accessors.h"
      15             : #include "file-item.h"
      16             : #include "file.h"
      17             : #include "super.h"
      18             : 
      19             : #define BTRFS_MAX_DEDUPE_LEN    SZ_16M
      20             : 
      21    15492753 : static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
      22             :                                      struct inode *inode,
      23             :                                      u64 endoff,
      24             :                                      const u64 destoff,
      25             :                                      const u64 olen,
      26             :                                      int no_time_update)
      27             : {
      28    15492753 :         struct btrfs_root *root = BTRFS_I(inode)->root;
      29    15492753 :         int ret;
      30             : 
      31    15492753 :         inode_inc_iversion(inode);
      32    15492773 :         if (!no_time_update) {
      33    14246147 :                 inode->i_mtime = current_time(inode);
      34    14246144 :                 inode->i_ctime = inode->i_mtime;
      35             :         }
      36             :         /*
      37             :          * We round up to the block size at eof when determining which
      38             :          * extents to clone above, but shouldn't round up the file size.
      39             :          */
      40    15492770 :         if (endoff > destoff + olen)
      41             :                 endoff = destoff + olen;
      42    15492770 :         if (endoff > inode->i_size) {
      43    13763695 :                 i_size_write(inode, endoff);
      44    13763695 :                 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
      45             :         }
      46             : 
      47    15492773 :         ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
      48    15492828 :         if (ret) {
      49           0 :                 btrfs_abort_transaction(trans, ret);
      50           0 :                 btrfs_end_transaction(trans);
      51           0 :                 goto out;
      52             :         }
      53    15492828 :         ret = btrfs_end_transaction(trans);
      54    15492785 : out:
      55    15492785 :         return ret;
      56             : }
      57             : 
      58          17 : static int copy_inline_to_page(struct btrfs_inode *inode,
      59             :                                const u64 file_offset,
      60             :                                char *inline_data,
      61             :                                const u64 size,
      62             :                                const u64 datal,
      63             :                                const u8 comp_type)
      64             : {
      65          17 :         struct btrfs_fs_info *fs_info = inode->root->fs_info;
      66          17 :         const u32 block_size = fs_info->sectorsize;
      67          17 :         const u64 range_end = file_offset + block_size - 1;
      68          17 :         const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0);
      69          17 :         char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0);
      70          17 :         struct extent_changeset *data_reserved = NULL;
      71          17 :         struct page *page = NULL;
      72          17 :         struct address_space *mapping = inode->vfs_inode.i_mapping;
      73          17 :         int ret;
      74             : 
      75          17 :         ASSERT(IS_ALIGNED(file_offset, block_size));
      76             : 
      77             :         /*
      78             :          * We have flushed and locked the ranges of the source and destination
      79             :          * inodes, we also have locked the inodes, so we are safe to do a
      80             :          * reservation here. Also we must not do the reservation while holding
      81             :          * a transaction open, otherwise we would deadlock.
      82             :          */
      83          17 :         ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset,
      84             :                                            block_size);
      85          17 :         if (ret)
      86           0 :                 goto out;
      87             : 
      88          17 :         page = find_or_create_page(mapping, file_offset >> PAGE_SHIFT,
      89             :                                    btrfs_alloc_write_mask(mapping));
      90          17 :         if (!page) {
      91           0 :                 ret = -ENOMEM;
      92           0 :                 goto out_unlock;
      93             :         }
      94             : 
      95          17 :         ret = set_page_extent_mapped(page);
      96          17 :         if (ret < 0)
      97           0 :                 goto out_unlock;
      98             : 
      99          17 :         clear_extent_bit(&inode->io_tree, file_offset, range_end,
     100             :                          EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
     101             :                          NULL);
     102          17 :         ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
     103          17 :         if (ret)
     104           0 :                 goto out_unlock;
     105             : 
     106             :         /*
     107             :          * After dirtying the page our caller will need to start a transaction,
     108             :          * and if we are low on metadata free space, that can cause flushing of
     109             :          * delalloc for all inodes in order to get metadata space released.
     110             :          * However we are holding the range locked for the whole duration of
     111             :          * the clone/dedupe operation, so we may deadlock if that happens and no
     112             :          * other task releases enough space. So mark this inode as not being
     113             :          * possible to flush to avoid such deadlock. We will clear that flag
     114             :          * when we finish cloning all extents, since a transaction is started
     115             :          * after finding each extent to clone.
     116             :          */
     117          17 :         set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags);
     118             : 
     119          17 :         if (comp_type == BTRFS_COMPRESS_NONE) {
     120           6 :                 memcpy_to_page(page, offset_in_page(file_offset), data_start,
     121             :                                datal);
     122             :         } else {
     123          11 :                 ret = btrfs_decompress(comp_type, data_start, page,
     124             :                                        offset_in_page(file_offset),
     125             :                                        inline_size, datal);
     126          11 :                 if (ret)
     127           0 :                         goto out_unlock;
     128             :                 flush_dcache_page(page);
     129             :         }
     130             : 
     131             :         /*
     132             :          * If our inline data is smaller than the block/page size, then the
     133             :          * remainder of the block/page is equivalent to zeroes. We had something
     134             :          * like the following done:
     135             :          *
     136             :          * $ xfs_io -f -c "pwrite -S 0xab 0 500" file
     137             :          * $ sync  # (or fsync)
     138             :          * $ xfs_io -c "falloc 0 4K" file
     139             :          * $ xfs_io -c "pwrite -S 0xcd 4K 4K" file
     140             :          *
     141             :          * So what's in the range [500, 4095] corresponds to zeroes.
     142             :          */
     143          17 :         if (datal < block_size)
     144           8 :                 memzero_page(page, datal, block_size - datal);
     145             : 
     146          17 :         btrfs_page_set_uptodate(fs_info, page, file_offset, block_size);
     147          17 :         btrfs_page_clear_checked(fs_info, page, file_offset, block_size);
     148          17 :         btrfs_page_set_dirty(fs_info, page, file_offset, block_size);
     149          17 : out_unlock:
     150          17 :         if (page) {
     151          17 :                 unlock_page(page);
     152          17 :                 put_page(page);
     153             :         }
     154          17 :         if (ret)
     155           0 :                 btrfs_delalloc_release_space(inode, data_reserved, file_offset,
     156             :                                              block_size, true);
     157          17 :         btrfs_delalloc_release_extents(inode, block_size);
     158          17 : out:
     159          17 :         extent_changeset_free(data_reserved);
     160             : 
     161          17 :         return ret;
     162             : }
     163             : 
     164             : /*
     165             :  * Deal with cloning of inline extents. We try to copy the inline extent from
     166             :  * the source inode to destination inode when possible. When not possible we
     167             :  * copy the inline extent's data into the respective page of the inode.
     168             :  */
     169        2262 : static int clone_copy_inline_extent(struct inode *dst,
     170             :                                     struct btrfs_path *path,
     171             :                                     struct btrfs_key *new_key,
     172             :                                     const u64 drop_start,
     173             :                                     const u64 datal,
     174             :                                     const u64 size,
     175             :                                     const u8 comp_type,
     176             :                                     char *inline_data,
     177             :                                     struct btrfs_trans_handle **trans_out)
     178             : {
     179        2262 :         struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
     180        2262 :         struct btrfs_root *root = BTRFS_I(dst)->root;
     181        2262 :         const u64 aligned_end = ALIGN(new_key->offset + datal,
     182             :                                       fs_info->sectorsize);
     183        2262 :         struct btrfs_trans_handle *trans = NULL;
     184        2262 :         struct btrfs_drop_extents_args drop_args = { 0 };
     185        2262 :         int ret;
     186        2262 :         struct btrfs_key key;
     187             : 
     188        2262 :         if (new_key->offset > 0) {
     189          11 :                 ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
     190             :                                           inline_data, size, datal, comp_type);
     191          11 :                 goto out;
     192             :         }
     193             : 
     194        2251 :         key.objectid = btrfs_ino(BTRFS_I(dst));
     195        2251 :         key.type = BTRFS_EXTENT_DATA_KEY;
     196        2251 :         key.offset = 0;
     197        2251 :         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
     198        2251 :         if (ret < 0) {
     199             :                 return ret;
     200        2251 :         } else if (ret > 0) {
     201        2236 :                 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
     202          17 :                         ret = btrfs_next_leaf(root, path);
     203          17 :                         if (ret < 0)
     204             :                                 return ret;
     205          17 :                         else if (ret > 0)
     206          17 :                                 goto copy_inline_extent;
     207             :                 }
     208        2219 :                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
     209        2219 :                 if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
     210           1 :                     key.type == BTRFS_EXTENT_DATA_KEY) {
     211             :                         /*
     212             :                          * There's an implicit hole at file offset 0, copy the
     213             :                          * inline extent's data to the page.
     214             :                          */
     215           1 :                         ASSERT(key.offset > 0);
     216           1 :                         goto copy_to_page;
     217             :                 }
     218          15 :         } else if (i_size_read(dst) <= datal) {
     219          13 :                 struct btrfs_file_extent_item *ei;
     220             : 
     221          13 :                 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
     222             :                                     struct btrfs_file_extent_item);
     223             :                 /*
     224             :                  * If it's an inline extent replace it with the source inline
     225             :                  * extent, otherwise copy the source inline extent data into
     226             :                  * the respective page at the destination inode.
     227             :                  */
     228          13 :                 if (btrfs_file_extent_type(path->nodes[0], ei) ==
     229             :                     BTRFS_FILE_EXTENT_INLINE)
     230          10 :                         goto copy_inline_extent;
     231             : 
     232           3 :                 goto copy_to_page;
     233             :         }
     234             : 
     235           2 : copy_inline_extent:
     236             :         /*
     237             :          * We have no extent items, or we have an extent at offset 0 which may
     238             :          * or may not be inlined. All these cases are dealt with the same way.
     239             :          */
     240        2247 :         if (i_size_read(dst) > datal) {
     241             :                 /*
     242             :                  * At the destination offset 0 we have either a hole, a regular
     243             :                  * extent or an inline extent larger than the one we want to
     244             :                  * clone. Deal with all these cases by copying the inline extent
     245             :                  * data into the respective page at the destination inode.
     246             :                  */
     247           2 :                 goto copy_to_page;
     248             :         }
     249             : 
     250             :         /*
     251             :          * Release path before starting a new transaction so we don't hold locks
     252             :          * that would confuse lockdep.
     253             :          */
     254        2245 :         btrfs_release_path(path);
     255             :         /*
     256             :          * If we end up here it means we are copying the inline extent into a leaf
     257             :          * of the destination inode. We know we will drop or adjust at most one
     258             :          * extent item in the destination root.
     259             :          *
     260             :          * 1 unit - adjusting old extent (we may have to split it)
     261             :          * 1 unit - add new extent
     262             :          * 1 unit - inode update
     263             :          */
     264        2245 :         trans = btrfs_start_transaction(root, 3);
     265        2245 :         if (IS_ERR(trans)) {
     266           0 :                 ret = PTR_ERR(trans);
     267           0 :                 trans = NULL;
     268           0 :                 goto out;
     269             :         }
     270        2245 :         drop_args.path = path;
     271        2245 :         drop_args.start = drop_start;
     272        2245 :         drop_args.end = aligned_end;
     273        2245 :         drop_args.drop_cache = true;
     274        2245 :         ret = btrfs_drop_extents(trans, root, BTRFS_I(dst), &drop_args);
     275        2245 :         if (ret)
     276           0 :                 goto out;
     277        2245 :         ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
     278        2245 :         if (ret)
     279           0 :                 goto out;
     280             : 
     281        4490 :         write_extent_buffer(path->nodes[0], inline_data,
     282        2245 :                             btrfs_item_ptr_offset(path->nodes[0],
     283             :                                                   path->slots[0]),
     284             :                             size);
     285        2245 :         btrfs_update_inode_bytes(BTRFS_I(dst), datal, drop_args.bytes_found);
     286        2245 :         btrfs_set_inode_full_sync(BTRFS_I(dst));
     287        2245 :         ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end);
     288        2262 : out:
     289        2262 :         if (!ret && !trans) {
     290             :                 /*
     291             :                  * No transaction here means we copied the inline extent into a
     292             :                  * page of the destination inode.
     293             :                  *
     294             :                  * 1 unit to update inode item
     295             :                  */
     296          17 :                 trans = btrfs_start_transaction(root, 1);
     297          17 :                 if (IS_ERR(trans)) {
     298           0 :                         ret = PTR_ERR(trans);
     299           0 :                         trans = NULL;
     300             :                 }
     301             :         }
     302        2262 :         if (ret && trans) {
     303           0 :                 btrfs_abort_transaction(trans, ret);
     304           0 :                 btrfs_end_transaction(trans);
     305             :         }
     306        2262 :         if (!ret)
     307        2262 :                 *trans_out = trans;
     308             : 
     309             :         return ret;
     310             : 
     311           6 : copy_to_page:
     312             :         /*
     313             :          * Release our path because we don't need it anymore and also because
     314             :          * copy_inline_to_page() needs to reserve data and metadata, which may
     315             :          * need to flush delalloc when we are low on available space and
     316             :          * therefore cause a deadlock if writeback of an inline extent needs to
     317             :          * write to the same leaf or an ordered extent completion needs to write
     318             :          * to the same leaf.
     319             :          */
     320           6 :         btrfs_release_path(path);
     321             : 
     322           6 :         ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
     323             :                                   inline_data, size, datal, comp_type);
     324           6 :         goto out;
     325             : }
     326             : 
     327             : /*
     328             :  * Clone a file range from one inode to another.
     329             :  *
     330             :  * @src:             Inode to clone from
     331             :  * @inode:           Inode to clone to
     332             :  * @off:             Offset within source to start clone from
     333             :  * @olen:            Original length, passed by user, of range to clone
     334             :  * @olen_aligned:    Block-aligned value of olen
     335             :  * @destoff:         Offset within @inode to start clone
     336             :  * @no_time_update:  If non-zero, skip updating mtime/ctime on the target inode
     337             :  */
     338     1433522 : static int btrfs_clone(struct inode *src, struct inode *inode,
     339             :                        const u64 off, const u64 olen, const u64 olen_aligned,
     340             :                        const u64 destoff, int no_time_update)
     341             : {
     342     1433522 :         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
     343     1433522 :         struct btrfs_path *path = NULL;
     344     1433522 :         struct extent_buffer *leaf;
     345     1433522 :         struct btrfs_trans_handle *trans;
     346     1433522 :         char *buf = NULL;
     347     1433522 :         struct btrfs_key key;
     348     1433522 :         u32 nritems;
     349     1433522 :         int slot;
     350     1433522 :         int ret;
     351     1433522 :         const u64 len = olen_aligned;
     352     1433522 :         u64 last_dest_end = destoff;
     353     1433522 :         u64 prev_extent_end = off;
     354             : 
     355     1433522 :         ret = -ENOMEM;
     356     1433522 :         buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
     357     1433332 :         if (!buf)
     358             :                 return ret;
     359             : 
     360     1433332 :         path = btrfs_alloc_path();
     361     1433456 :         if (!path) {
     362           0 :                 kvfree(buf);
     363           0 :                 return ret;
     364             :         }
     365             : 
     366     1433456 :         path->reada = READA_FORWARD;
     367             :         /* Clone data */
     368     1433456 :         key.objectid = btrfs_ino(BTRFS_I(src));
     369     1433456 :         key.type = BTRFS_EXTENT_DATA_KEY;
     370     1433456 :         key.offset = off;
     371             : 
     372    14063472 :         while (1) {
     373    15496928 :                 struct btrfs_file_extent_item *extent;
     374    15496928 :                 u64 extent_gen;
     375    15496928 :                 int type;
     376    15496928 :                 u32 size;
     377    15496928 :                 struct btrfs_key new_key;
     378    15496928 :                 u64 disko = 0, diskl = 0;
     379    15496928 :                 u64 datao = 0, datal = 0;
     380    15496928 :                 u8 comp;
     381    15496928 :                 u64 drop_start;
     382             : 
     383             :                 /* Note the key will change type as we walk through the tree */
     384    15496928 :                 ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
     385             :                                 0, 0);
     386    15496963 :                 if (ret < 0)
     387         219 :                         goto out;
     388             :                 /*
     389             :                  * First search, if no extent item that starts at offset off was
     390             :                  * found but the previous item is an extent item, it's possible
     391             :                  * it might overlap our target range, therefore process it.
     392             :                  */
     393    15496963 :                 if (key.offset == off && ret > 0 && path->slots[0] > 0) {
     394     1348135 :                         btrfs_item_key_to_cpu(path->nodes[0], &key,
     395             :                                               path->slots[0] - 1);
     396     1348092 :                         if (key.type == BTRFS_EXTENT_DATA_KEY)
     397     1124686 :                                 path->slots[0]--;
     398             :                 }
     399             : 
     400    15496920 :                 nritems = btrfs_header_nritems(path->nodes[0]);
     401    16363463 : process_slot:
     402    16363463 :                 if (path->slots[0] >= nritems) {
     403       27269 :                         ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
     404       27269 :                         if (ret < 0)
     405           0 :                                 goto out;
     406       27269 :                         if (ret > 0)
     407             :                                 break;
     408       12556 :                         nritems = btrfs_header_nritems(path->nodes[0]);
     409             :                 }
     410    16348750 :                 leaf = path->nodes[0];
     411    16348750 :                 slot = path->slots[0];
     412             : 
     413    16348750 :                 btrfs_item_key_to_cpu(leaf, &key, slot);
     414    16348917 :                 if (key.type > BTRFS_EXTENT_DATA_KEY ||
     415    16348928 :                     key.objectid != btrfs_ino(BTRFS_I(src)))
     416             :                         break;
     417             : 
     418    16245893 :                 ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);
     419             : 
     420    16245893 :                 extent = btrfs_item_ptr(leaf, slot,
     421             :                                         struct btrfs_file_extent_item);
     422    16245791 :                 extent_gen = btrfs_file_extent_generation(leaf, extent);
     423    16245840 :                 comp = btrfs_file_extent_compression(leaf, extent);
     424    16245803 :                 type = btrfs_file_extent_type(leaf, extent);
     425    16245825 :                 if (type == BTRFS_FILE_EXTENT_REG ||
     426             :                     type == BTRFS_FILE_EXTENT_PREALLOC) {
     427    16243559 :                         disko = btrfs_file_extent_disk_bytenr(leaf, extent);
     428    16243588 :                         diskl = btrfs_file_extent_disk_num_bytes(leaf, extent);
     429    16243528 :                         datao = btrfs_file_extent_offset(leaf, extent);
     430    16243553 :                         datal = btrfs_file_extent_num_bytes(leaf, extent);
     431        2266 :                 } else if (type == BTRFS_FILE_EXTENT_INLINE) {
     432             :                         /* Take upper bound, may be compressed */
     433        2266 :                         datal = btrfs_file_extent_ram_bytes(leaf, extent);
     434             :                 }
     435             : 
     436             :                 /*
     437             :                  * The first search might have left us at an extent item that
     438             :                  * ends before our target range's start, can happen if we have
     439             :                  * holes and NO_HOLES feature enabled.
     440             :                  *
     441             :                  * Subsequent searches may leave us on a file range we have
     442             :                  * processed before - this happens due to a race with ordered
     443             :                  * extent completion for a file range that is outside our source
     444             :                  * range, but that range was part of a file extent item that
     445             :                  * also covered a leading part of our source range.
     446             :                  */
     447    16245804 :                 if (key.offset + datal <= prev_extent_end) {
     448      866543 :                         path->slots[0]++;
     449      866543 :                         goto process_slot;
     450    15379261 :                 } else if (key.offset >= off + len) {
     451             :                         break;
     452             :                 }
     453             : 
     454    14409439 :                 prev_extent_end = key.offset + datal;
     455    14409439 :                 size = btrfs_item_size(leaf, slot);
     456    14409443 :                 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
     457             :                                    size);
     458             : 
     459    14409438 :                 btrfs_release_path(path);
     460             : 
     461    14409443 :                 memcpy(&new_key, &key, sizeof(new_key));
     462    14409443 :                 new_key.objectid = btrfs_ino(BTRFS_I(inode));
     463    14409443 :                 if (off <= key.offset)
     464    14151429 :                         new_key.offset = key.offset + destoff - off;
     465             :                 else
     466      258014 :                         new_key.offset = destoff;
     467             : 
     468             :                 /*
     469             :                  * Deal with a hole that doesn't have an extent item that
     470             :                  * represents it (NO_HOLES feature enabled).
     471             :                  * This hole is either in the middle of the cloning range or at
     472             :                  * the beginning (fully overlaps it or partially overlaps it).
     473             :                  */
     474    14409443 :                 if (new_key.offset != last_dest_end)
     475             :                         drop_start = last_dest_end;
     476             :                 else
     477    13142395 :                         drop_start = new_key.offset;
     478             : 
     479    14409443 :                 if (type == BTRFS_FILE_EXTENT_REG ||
     480             :                     type == BTRFS_FILE_EXTENT_PREALLOC) {
     481    14407181 :                         struct btrfs_replace_extent_info clone_info;
     482             : 
     483             :                         /*
     484             :                          *    a  | --- range to clone ---|  b
     485             :                          * | ------------- extent ------------- |
     486             :                          */
     487             : 
     488             :                         /* Subtract range b */
     489    14407181 :                         if (key.offset + datal > off + len)
     490      268737 :                                 datal = off + len - key.offset;
     491             : 
     492             :                         /* Subtract range a */
     493    14407181 :                         if (off > key.offset) {
     494      258016 :                                 datao += off - key.offset;
     495      258016 :                                 datal -= off - key.offset;
     496             :                         }
     497             : 
     498    14407181 :                         clone_info.disk_offset = disko;
     499    14407181 :                         clone_info.disk_len = diskl;
     500    14407181 :                         clone_info.data_offset = datao;
     501    14407181 :                         clone_info.data_len = datal;
     502    14407181 :                         clone_info.file_offset = new_key.offset;
     503    14407181 :                         clone_info.extent_buf = buf;
     504    14407181 :                         clone_info.is_new_extent = false;
     505    14407181 :                         clone_info.update_times = !no_time_update;
     506    14407181 :                         ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
     507    14407181 :                                         drop_start, new_key.offset + datal - 1,
     508             :                                         &clone_info, &trans);
     509    14407183 :                         if (ret)
     510         217 :                                 goto out;
     511             :                 } else {
     512        2262 :                         ASSERT(type == BTRFS_FILE_EXTENT_INLINE);
     513             :                         /*
     514             :                          * Inline extents always have to start at file offset 0
     515             :                          * and can never be bigger than the sector size. We can
     516             :                          * never clone only parts of an inline extent, since all
     517             :                          * reflink operations must start at a sector size aligned
     518             :                          * offset, and the length must be aligned too or end at
     519             :                          * the i_size (which implies the whole inlined data).
     520             :                          */
     521        2262 :                         ASSERT(key.offset == 0);
     522        2262 :                         ASSERT(datal <= fs_info->sectorsize);
     523        2262 :                         if (WARN_ON(type != BTRFS_FILE_EXTENT_INLINE) ||
     524        2262 :                             WARN_ON(key.offset != 0) ||
     525        2262 :                             WARN_ON(datal > fs_info->sectorsize)) {
     526           0 :                                 ret = -EUCLEAN;
     527           0 :                                 goto out;
     528             :                         }
     529             : 
     530        2262 :                         ret = clone_copy_inline_extent(inode, path, &new_key,
     531             :                                                        drop_start, datal, size,
     532             :                                                        comp, buf, &trans);
     533        2262 :                         if (ret)
     534           0 :                                 goto out;
     535             :                 }
     536             : 
     537    14409228 :                 btrfs_release_path(path);
     538             : 
     539             :                 /*
     540             :                  * Whenever we share an extent we update the last_reflink_trans
     541             :                  * of each inode to the current transaction. This is needed to
     542             :                  * make sure fsync does not log multiple checksum items with
     543             :                  * overlapping ranges (because some extent items might refer
     544             :                  * only to sections of the original extent). For the destination
     545             :                  * inode we do this regardless of the generation of the extents
     546             :                  * or even if they are inline extents or explicit holes, to make
     547             :                  * sure a full fsync does not skip them. For the source inode,
     548             :                  * we only need to update last_reflink_trans in case it's a new
     549             :                  * extent that is not a hole or an inline extent, to deal with
     550             :                  * the checksums problem on fsync.
     551             :                  */
     552    14409228 :                 if (extent_gen == trans->transid && disko > 0)
     553     9572810 :                         BTRFS_I(src)->last_reflink_trans = trans->transid;
     554             : 
     555    14409228 :                 BTRFS_I(inode)->last_reflink_trans = trans->transid;
     556             : 
     557    14409228 :                 last_dest_end = ALIGN(new_key.offset + datal,
     558             :                                       fs_info->sectorsize);
     559    14409228 :                 ret = clone_finish_inode_update(trans, inode, last_dest_end,
     560             :                                                 destoff, olen, no_time_update);
     561    14409225 :                 if (ret)
     562           0 :                         goto out;
     563    14409225 :                 if (new_key.offset + datal >= destoff + len)
     564             :                         break;
     565             : 
     566    14063474 :                 btrfs_release_path(path);
     567    14063475 :                 key.offset = prev_extent_end;
     568             : 
     569    14063475 :                 if (fatal_signal_pending(current)) {
     570           2 :                         ret = -EINTR;
     571           2 :                         goto out;
     572             :                 }
     573             : 
     574    14063472 :                 cond_resched();
     575             :         }
     576     1433310 :         ret = 0;
     577             : 
     578     1433310 :         if (last_dest_end < destoff + len) {
     579             :                 /*
     580             :                  * We have an implicit hole that fully or partially overlaps our
     581             :                  * cloning range at its end. This means that we either have the
     582             :                  * NO_HOLES feature enabled or the implicit hole happened due to
     583             :                  * mixing buffered and direct IO writes against this file.
     584             :                  */
     585     1085319 :                 btrfs_release_path(path);
     586             : 
     587             :                 /*
     588             :                  * When using NO_HOLES and we are cloning a range that covers
     589             :                  * only a hole (no extents) into a range beyond the current
     590             :                  * i_size, punching a hole in the target range will not create
     591             :                  * an extent map defining a hole, because the range starts at or
     592             :                  * beyond current i_size. If the file previously had an i_size
     593             :                  * greater than the new i_size set by this clone operation, we
     594             :                  * need to make sure the next fsync is a full fsync, so that it
     595             :                  * detects and logs a hole covering a range from the current
     596             :                  * i_size to the new i_size. If the clone range covers extents,
     597             :                  * besides a hole, then we know the full sync flag was already
     598             :                  * set by previous calls to btrfs_replace_file_extents() that
     599             :                  * replaced file extent items.
     600             :                  */
     601     1085367 :                 if (last_dest_end >= i_size_read(inode))
     602       39858 :                         btrfs_set_inode_full_sync(BTRFS_I(inode));
     603             : 
     604     1085368 :                 ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
     605             :                                 last_dest_end, destoff + len - 1, NULL, &trans);
     606     1085325 :                 if (ret)
     607        1764 :                         goto out;
     608             : 
     609     1083561 :                 ret = clone_finish_inode_update(trans, inode, destoff + len,
     610             :                                                 destoff, olen, no_time_update);
     611             :         }
     612             : 
     613      347991 : out:
     614     1433518 :         btrfs_free_path(path);
     615     1433489 :         kvfree(buf);
     616     1433234 :         clear_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &BTRFS_I(inode)->runtime_flags);
     617             : 
     618     1433234 :         return ret;
     619             : }
     620             : 
     621     1433335 : static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
     622             :                                        struct inode *inode2, u64 loff2, u64 len)
     623             : {
                               /*
                                * Unlock the inclusive byte range [loff1, loff1 + len - 1] in the
                                * io_tree of inode1 and [loff2, loff2 + len - 1] in the io_tree of
                                * inode2. This is the counterpart of btrfs_double_extent_lock();
                                * the ranges are released in argument order, without reordering.
                                */
     624     1433335 :         unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1, NULL);
     625     1433518 :         unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1, NULL);
     626     1433526 : }
     627             : 
     628     1433561 : static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
     629             :                                      struct inode *inode2, u64 loff2, u64 len)
     630             : {
                               /*
                                * Lock the inclusive byte ranges [loff1, loff1 + len - 1] of inode1
                                * and [loff2, loff2 + len - 1] of inode2 in their io_trees.
                                *
                                * Before locking, the two (inode, range) pairs are put in a fixed
                                * order: the inode with the higher pointer value goes first, and
                                * when both arguments are the same inode the lower file offset goes
                                * first. Every caller therefore acquires the two locks in the same
                                * order (presumably to rule out ABBA deadlocks between concurrent
                                * reflink/dedupe operations - confirm).
                                */
     631     1433561 :         u64 range1_end = loff1 + len - 1;
     632     1433561 :         u64 range2_end = loff2 + len - 1;
     633             : 
     634     1433561 :         if (inode1 < inode2) {
     635             :                 swap(inode1, inode2);
     636             :                 swap(loff1, loff2);
     637             :                 swap(range1_end, range2_end);
     638      872025 :         } else if (inode1 == inode2 && loff2 < loff1) {
     639      132657 :                 swap(loff1, loff2);
     640      132657 :                 swap(range1_end, range2_end);
     641             :         }
     642             : 
     643     1433561 :         lock_extent(&BTRFS_I(inode1)->io_tree, loff1, range1_end, NULL);
     644     1433533 :         lock_extent(&BTRFS_I(inode2)->io_tree, loff2, range2_end, NULL);
     645             : 
                               /* With both ranges locked, run the range-clean assertion on each. */
     646     1433569 :         btrfs_assert_inode_range_clean(BTRFS_I(inode1), loff1, range1_end);
     647     1433551 :         btrfs_assert_inode_range_clean(BTRFS_I(inode2), loff2, range2_end);
     648     1433545 : }
     649             : 
     650     1825004 : static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2)
     651             : {
                               /*
                                * Take the i_mmap_lock of both inodes for writing. The inode with
                                * the higher pointer value is always locked first so that all
                                * callers use the same acquisition order. The second lock is taken
                                * with down_write_nested() and SINGLE_DEPTH_NESTING (presumably a
                                * lockdep annotation for holding two locks of the same class -
                                * confirm).
                                *
                                * The only caller in this file, btrfs_remap_file_range(), uses this
                                * helper only when the two inodes differ.
                                */
     652     1825004 :         if (inode1 < inode2)
     653      914485 :                 swap(inode1, inode2);
     654     1825004 :         down_write(&BTRFS_I(inode1)->i_mmap_lock);
     655     1825034 :         down_write_nested(&BTRFS_I(inode2)->i_mmap_lock, SINGLE_DEPTH_NESTING);
     656     1825034 : }
     657             : 
     658             : static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2)
     659             : {
                               /*
                                * Release the write-held i_mmap_lock of both inodes, in argument
                                * order. Counterpart of btrfs_double_mmap_lock().
                                */
     660     1825095 :         up_write(&BTRFS_I(inode1)->i_mmap_lock);
     661     1825078 :         up_write(&BTRFS_I(inode2)->i_mmap_lock);
     662             : }
     663             : 
     664     1086101 : static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
     665             :                                    struct inode *dst, u64 dst_loff)
     666             : {
                               /*
                                * Deduplicate a single chunk: share @len bytes starting at @loff in
                                * @src with the range starting at @dst_loff in @dst.
                                *
                                * Returns the result of btrfs_clone(). The btree dirty balancing
                                * call at the end runs whether or not the clone succeeded.
                                */
     667     1086101 :         struct btrfs_fs_info *fs_info = BTRFS_I(src)->root->fs_info;
     668     1086101 :         const u64 bs = fs_info->sb->s_blocksize;
     669     1086101 :         int ret;
     670             : 
     671             :         /*
     672             :          * Lock destination range to serialize with concurrent readahead() and
     673             :          * source range to serialize with relocation.
     674             :          */
     675     1086101 :         btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
                               /*
                                * The unaligned length is passed together with the length rounded
                                * up to the block size. The trailing 1 is presumably the
                                * no_time_update flag of btrfs_clone() - confirm against its
                                * signature.
                                */
     676     1086057 :         ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1);
     677     1086008 :         btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
     678             : 
     679     1086044 :         btrfs_btree_balance_dirty(fs_info);
     680             : 
     681     1085888 :         return ret;
     682             : }
     683             : 
     684     1104635 : static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
     685             :                              struct inode *dst, u64 dst_loff)
     686             : {
                               /*
                                * Deduplicate @olen bytes from @src at @loff into @dst at @dst_loff.
                                *
                                * The work is split into chunks of BTRFS_MAX_DEDUPE_LEN bytes plus
                                * one final partial chunk, each handled by
                                * btrfs_extent_same_range(). Processing stops at the first chunk
                                * that fails.
                                *
                                * Returns 0 on success, -EAGAIN if a send operation is in progress
                                * on the destination root, or the error of the failing chunk.
                                */
     687     1104635 :         int ret = 0;
     688     1104635 :         u64 i, tail_len, chunk_count;
     689     1104635 :         struct btrfs_root *root_dst = BTRFS_I(dst)->root;
     690             : 
                               /*
                                * Under root_item_lock: refuse to dedupe while send is using the
                                * destination root, otherwise count this dedupe as in progress.
                                */
     691     1104635 :         spin_lock(&root_dst->root_item_lock);
     692     1104689 :         if (root_dst->send_in_progress) {
     693       18650 :                 btrfs_warn_rl(root_dst->fs_info,
     694             : "cannot deduplicate to root %llu while send operations are using it (%d in progress)",
     695             :                               root_dst->root_key.objectid,
     696             :                               root_dst->send_in_progress);
     697       18650 :                 spin_unlock(&root_dst->root_item_lock);
     698       18650 :                 return -EAGAIN;
     699             :         }
     700     1086039 :         root_dst->dedupe_in_progress++;
     701     1086039 :         spin_unlock(&root_dst->root_item_lock);
     702             : 
                               /* Number of full-size chunks and the size of the remainder. */
     703     1086039 :         tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
     704     1086039 :         chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
     705             : 
     706     1086133 :         for (i = 0; i < chunk_count; i++) {
     707          99 :                 ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
     708             :                                               dst, dst_loff);
     709          94 :                 if (ret)
     710           0 :                         goto out;
     711             : 
     712          94 :                 loff += BTRFS_MAX_DEDUPE_LEN;
     713          94 :                 dst_loff += BTRFS_MAX_DEDUPE_LEN;
     714             :         }
     715             : 
     716     1086034 :         if (tail_len > 0)
     717     1086014 :                 ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff);
     718          20 : out:
                               /* Success or failure, drop the in-progress count taken above. */
     719     1085793 :         spin_lock(&root_dst->root_item_lock);
     720     1086039 :         root_dst->dedupe_in_progress--;
     721     1086039 :         spin_unlock(&root_dst->root_item_lock);
     722             : 
     723     1086039 :         return ret;
     724             : }
     725             : 
     726      347770 : static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
     727             :                                         u64 off, u64 olen, u64 destoff)
     728             : {
                               /*
                                * Clone (reflink) @olen bytes starting at @off in @file_src into
                                * @file starting at @destoff. Note the argument order: @file is the
                                * destination and @file_src is the source.
                                *
                                * Returns 0 on success or a negative errno. An error from
                                * btrfs_cont_expand() or from the first ordered-range wait returns
                                * early, before any range lock is taken. After the clone, the
                                * error of btrfs_clone() takes precedence over the error of the
                                * final ordered-range wait.
                                */
     729      347770 :         struct inode *inode = file_inode(file);
     730      347770 :         struct inode *src = file_inode(file_src);
     731      347770 :         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
     732      347770 :         int ret;
     733      347770 :         int wb_ret;
     734      347770 :         u64 len = olen;
     735      347770 :         u64 bs = fs_info->sb->s_blocksize;
     736             : 
     737             :         /*
     738             :          * VFS's generic_remap_file_range_prep() protects us from cloning the
     739             :          * eof block into the middle of a file, which would result in corruption
     740             :          * if the file size is not blocksize aligned. So we don't need to check
     741             :          * for that case here.
     742             :          */
                               /*
                                * When the range ends exactly at the source's i_size, extend the
                                * working length so that it ends on a block boundary.
                                */
     743      347770 :         if (off + len == src->i_size)
     744        6021 :                 len = ALIGN(src->i_size, bs) - off;
     745             : 
                               /*
                                * The destination offset lies beyond the destination's current
                                * i_size: call btrfs_cont_expand() for the gap between them first.
                                */
     746      347770 :         if (destoff > inode->i_size) {
     747       66102 :                 const u64 wb_start = ALIGN_DOWN(inode->i_size, bs);
     748             : 
     749       66102 :                 ret = btrfs_cont_expand(BTRFS_I(inode), inode->i_size, destoff);
     750       66102 :                 if (ret)
     751             :                         return ret;
     752             :                 /*
     753             :                  * We may have truncated the last block if the inode's size is
     754             :                  * not sector size aligned, so we need to wait for writeback to
     755             :                  * complete before proceeding further, otherwise we can race
     756             :                  * with cloning and attempt to increment a reference to an
     757             :                  * extent that no longer exists (writeback completed right after
     758             :                  * we found the previous extent covering eof and before we
     759             :                  * attempted to increment its reference count).
     760             :                  */
     761       65809 :                 ret = btrfs_wait_ordered_range(inode, wb_start,
     762             :                                                destoff - wb_start);
     763       65809 :                 if (ret)
     764             :                         return ret;
     765             :         }
     766             : 
     767             :         /*
     768             :          * Lock destination range to serialize with concurrent readahead() and
     769             :          * source range to serialize with relocation.
     770             :          */
     771      347477 :         btrfs_double_extent_lock(src, off, inode, destoff, len);
     772      347477 :         ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
     773      347476 :         btrfs_double_extent_unlock(src, off, inode, destoff, len);
     774             : 
     775             :         /*
     776             :          * We may have copied an inline extent into a page of the destination
     777             :          * range, so wait for writeback to complete before truncating pages
     778             :          * from the page cache. This is a rare case.
     779             :          */
     780      347477 :         wb_ret = btrfs_wait_ordered_range(inode, destoff, len);
                               /* Keep the clone error if there is one, else report the wait error. */
     781      347473 :         ret = ret ? ret : wb_ret;
     782             :         /*
     783             :          * Truncate page cache pages so that future reads will see the cloned
     784             :          * data immediately and not the previous data.
     785             :          */
     786      347473 :         truncate_inode_pages_range(&inode->i_data,
     787      347473 :                                 round_down(destoff, PAGE_SIZE),
     788      347473 :                                 round_up(destoff + len, PAGE_SIZE) - 1);
     789             : 
     790      347472 :         btrfs_btree_balance_dirty(fs_info);
     791             : 
     792      347472 :         return ret;
     793             : }
     794             : 
     795     2372462 : static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
     796             :                                        struct file *file_out, loff_t pos_out,
     797             :                                        loff_t *len, unsigned int remap_flags)
     798             : {
                               /*
                                * Btrfs-specific checks and flushing done before a clone or dedupe,
                                * followed by the VFS generic_remap_file_range_prep() helper.
                                *
                                * Returns -EROFS when cloning into a read-only root, -EINVAL when
                                * the two inodes differ in BTRFS_INODE_NODATASUM, a negative error
                                * from flushing or waiting, or otherwise the result of the generic
                                * helper. @len is passed to the generic helper by pointer, so it
                                * may be modified there.
                                */
     799     2372462 :         struct inode *inode_in = file_inode(file_in);
     800     2372462 :         struct inode *inode_out = file_inode(file_out);
     801     2372462 :         u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize;
     802     2372462 :         u64 wb_len;
     803     2372462 :         int ret;
     804             : 
                               /* The read-only root check applies to clone only, not to dedupe. */
     805     2372462 :         if (!(remap_flags & REMAP_FILE_DEDUP)) {
     806      498345 :                 struct btrfs_root *root_out = BTRFS_I(inode_out)->root;
     807             : 
     808      498345 :                 if (btrfs_root_readonly(root_out))
     809             :                         return -EROFS;
     810             : 
     811             :                 ASSERT(inode_in->i_sb == inode_out->i_sb);
     812             :         }
     813             : 
     814             :         /* Don't make the dst file partly checksummed */
     815     2372462 :         if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) !=
     816     2372462 :             (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) {
     817             :                 return -EINVAL;
     818             :         }
     819             : 
     820             :         /*
     821             :          * Now that the inodes are locked, we need to start writeback ourselves
     822             :          * and can not rely on the writeback from the VFS's generic helper
     823             :          * generic_remap_file_range_prep() because:
     824             :          *
     825             :          * 1) For compression we must call filemap_fdatawrite_range()
     826             :          *    twice (btrfs_fdatawrite_range() does it for us), and the generic
     827             :          *    helper only calls it once;
     828             :          *
     829             :          * 2) filemap_fdatawrite_range(), called by the generic helper only
     830             :          *    waits for the writeback to complete, i.e. for IO to be done, and
     831             :          *    not for the ordered extents to complete. We need to wait for them
     832             :          *    to complete so that new file extent items are in the fs tree.
     833             :          */
                               /*
                                * For a clone with a zero length, wait on everything from the block
                                * containing pos_in up to the block-aligned end of the source file
                                * (a zero length presumably means "clone to EOF" - confirm against
                                * the VFS remap_file_range contract). Otherwise wait on the
                                * requested length rounded up to the block size.
                                */
     834     2372462 :         if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP))
     835        9801 :                 wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs);
     836             :         else
     837     2362661 :                 wb_len = ALIGN(*len, bs);
     838             : 
     839             :         /*
     840             :          * Workaround to make sure NOCOW buffered write reach disk as NOCOW.
     841             :          *
     842             :          * Btrfs' back references do not have a block level granularity, they
     843             :          * work at the whole extent level.
     844             :          * NOCOW buffered write without data space reserved may not be able
     845             :          * to fall back to CoW due to lack of data space, thus could cause
     846             :          * data loss.
     847             :          *
     848             :          * Here we take a shortcut by flushing the whole inode, so that all
     849             :          * nocow write should reach disk as nocow before we increase the
     850             :          * reference of the extent. We could do better by only flushing NOCOW
     851             :          * data, but that needs extra accounting.
     852             :          *
     853             :          * Also we don't need to check ASYNC_EXTENT, as async extent will be
     854             :          * CoWed anyway, not affecting nocow part.
     855             :          */
     856     2372462 :         ret = filemap_flush(inode_in->i_mapping);
     857     2372460 :         if (ret < 0)
     858             :                 return ret;
     859             : 
                               /* Wait for ordered extents in both the source and destination ranges. */
     860     2372460 :         ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs),
     861             :                                        wb_len);
     862     2372045 :         if (ret < 0)
     863             :                 return ret;
     864     2372045 :         ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs),
     865             :                                        wb_len);
     866     2372591 :         if (ret < 0)
     867             :                 return ret;
     868             : 
     869     2372591 :         return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
     870             :                                             len, remap_flags);
     871             : }
     872             : 
     873     2862536 : static bool file_sync_write(const struct file *file)
     874             : {
                               /*
                                * Return true if @file has __O_SYNC or O_DSYNC set in its f_flags,
                                * or if IS_SYNC() is true for its inode; false otherwise.
                                */
     875     2862536 :         if (file->f_flags & (__O_SYNC | O_DSYNC))
     876             :                 return true;
     877     2862533 :         if (IS_SYNC(file_inode(file)))
     878           2 :                 return true;
     879             : 
     880             :         return false;
     881             : }
     882             : 
     883     2372362 : loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
     884             :                 struct file *dst_file, loff_t destoff, loff_t len,
     885             :                 unsigned int remap_flags)
     886             : {
                               /*
                                * Entry point for clone (reflink) and dedupe of a file range from
                                * @src_file at @off to @dst_file at @destoff.
                                *
                                * Only REMAP_FILE_DEDUP and REMAP_FILE_ADVISORY are accepted in
                                * @remap_flags; any other bit yields -EINVAL. With
                                * REMAP_FILE_DEDUP the range is deduplicated, otherwise it is
                                * cloned.
                                *
                                * Returns a negative errno on failure. Otherwise returns @len as
                                * left by btrfs_remap_file_range_prep(), which may be 0, in which
                                * case no clone or dedupe is performed.
                                */
     887     2372362 :         struct inode *src_inode = file_inode(src_file);
     888     2372362 :         struct inode *dst_inode = file_inode(dst_file);
     889     2372362 :         bool same_inode = dst_inode == src_inode;
     890     2372362 :         int ret;
     891             : 
     892     2372362 :         if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
     893             :                 return -EINVAL;
     894             : 
                               /*
                                * Same inode: one btrfs_inode_lock() call with BTRFS_ILOCK_MMAP.
                                * Different inodes: lock both inodes, then both i_mmap_locks.
                                */
     895     2372362 :         if (same_inode) {
     896      547499 :                 btrfs_inode_lock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP);
     897             :         } else {
     898     1824863 :                 lock_two_nondirectories(src_inode, dst_inode);
     899     1825003 :                 btrfs_double_mmap_lock(src_inode, dst_inode);
     900             :         }
     901             : 
     902     2372512 :         ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff,
     903             :                                           &len, remap_flags);
     904     2372577 :         if (ret < 0 || len == 0)
     905      920159 :                 goto out_unlock;
     906             : 
     907     1452418 :         if (remap_flags & REMAP_FILE_DEDUP)
     908     1104649 :                 ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff);
     909             :         else
     910      347769 :                 ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
     911             : 
     912     2372594 : out_unlock:
                               /* Release in the reverse order of acquisition above. */
     913     2372594 :         if (same_inode) {
     914      547499 :                 btrfs_inode_unlock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP);
     915             :         } else {
     916     1825095 :                 btrfs_double_mmap_unlock(src_inode, dst_inode);
     917     1825069 :                 unlock_two_nondirectories(src_inode, dst_inode);
     918             :         }
     919             : 
     920             :         /*
     921             :          * If either the source or the destination file was opened with O_SYNC,
     922             :          * O_DSYNC or has the S_SYNC attribute, fsync both the destination and
     923             :          * source files/ranges, so that if a successful return is followed by
     924             :          * a power failure, the reflinked data is still readable from both
     925             :          * files/ranges.
     926             :          */
     927     2372406 :         if (ret == 0 && len > 0 &&
     928     1431272 :             (file_sync_write(src_file) || file_sync_write(dst_file))) {
     929           5 :                 ret = btrfs_sync_file(src_file, off, off + len - 1, 0);
     930           5 :                 if (ret == 0)
     931           5 :                         ret = btrfs_sync_file(dst_file, destoff,
     932           5 :                                               destoff + len - 1, 0);
     933             :         }
     934             : 
     935     2372406 :         return ret < 0 ? ret : len;
     936             : }

Generated by: LCOV version 1.14