LCOV - code coverage report
Current view: top level - fs/xfs - xfs_reflink.c (source / functions) Hit Total Coverage
Test: fstests of 6.5.0-rc3-acha @ Mon Jul 31 20:08:06 PDT 2023 Lines: 605 652 92.8 %
Date: 2023-07-31 20:08:07 Functions: 26 26 100.0 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0+
       2             : /*
       3             :  * Copyright (C) 2016 Oracle.  All Rights Reserved.
       4             :  * Author: Darrick J. Wong <darrick.wong@oracle.com>
       5             :  */
       6             : #include "xfs.h"
       7             : #include "xfs_fs.h"
       8             : #include "xfs_shared.h"
       9             : #include "xfs_format.h"
      10             : #include "xfs_log_format.h"
      11             : #include "xfs_trans_resv.h"
      12             : #include "xfs_mount.h"
      13             : #include "xfs_defer.h"
      14             : #include "xfs_inode.h"
      15             : #include "xfs_trans.h"
      16             : #include "xfs_bmap.h"
      17             : #include "xfs_bmap_util.h"
      18             : #include "xfs_trace.h"
      19             : #include "xfs_icache.h"
      20             : #include "xfs_btree.h"
      21             : #include "xfs_refcount_btree.h"
      22             : #include "xfs_refcount.h"
      23             : #include "xfs_bmap_btree.h"
      24             : #include "xfs_trans_space.h"
      25             : #include "xfs_bit.h"
      26             : #include "xfs_alloc.h"
      27             : #include "xfs_quota.h"
      28             : #include "xfs_reflink.h"
      29             : #include "xfs_iomap.h"
      30             : #include "xfs_ag.h"
      31             : #include "xfs_ag_resv.h"
      32             : #include "xfs_health.h"
      33             : 
      34             : /*
      35             :  * Copy on Write of Shared Blocks
      36             :  *
      37             :  * XFS must preserve "the usual" file semantics even when two files share
      38             :  * the same physical blocks.  This means that a write to one file must not
      39             :  * alter the blocks in a different file; the way that we'll do that is
      40             :  * through the use of a copy-on-write mechanism.  At a high level, that
      41             :  * means that when we want to write to a shared block, we allocate a new
      42             :  * block, write the data to the new block, and if that succeeds we map the
      43             :  * new block into the file.
      44             :  *
      45             :  * XFS provides a "delayed allocation" mechanism that defers the allocation
      46             :  * of disk blocks to dirty-but-not-yet-mapped file blocks as long as
      47             :  * possible.  This reduces fragmentation by enabling the filesystem to ask
      48             :  * for bigger chunks less often, which is exactly what we want for CoW.
      49             :  *
      50             :  * The delalloc mechanism begins when the kernel wants to make a block
      51             :  * writable (write_begin or page_mkwrite).  If the offset is not mapped, we
      52             :  * create a delalloc mapping, which is a regular in-core extent, but without
      53             :  * a real startblock.  (For delalloc mappings, the startblock encodes both
      54             :  * a flag that this is a delalloc mapping, and a worst-case estimate of how
      55             :  * many blocks might be required to put the mapping into the BMBT.)  delalloc
      56             :  * mappings are a reservation against the free space in the filesystem;
      57             :  * adjacent mappings can also be combined into fewer larger mappings.
      58             :  *
      59             :  * As an optimization, the CoW extent size hint (cowextsz) creates
      60             :  * outsized aligned delalloc reservations in the hope of landing out of
      61             :  * order nearby CoW writes in a single extent on disk, thereby reducing
      62             :  * fragmentation and improving future performance.
      63             :  *
      64             :  * D: --RRRRRRSSSRRRRRRRR--- (data fork)
      65             :  * C: ------DDDDDDD--------- (CoW fork)
      66             :  *
      67             :  * When dirty pages are being written out (typically in writepage), the
      68             :  * delalloc reservations are converted into unwritten mappings by
      69             :  * allocating blocks and replacing the delalloc mapping with real ones.
      70             :  * A delalloc mapping can be replaced by several unwritten ones if the
      71             :  * free space is fragmented.
      72             :  *
      73             :  * D: --RRRRRRSSSRRRRRRRR---
      74             :  * C: ------UUUUUUU---------
      75             :  *
      76             :  * We want to adapt the delalloc mechanism for copy-on-write, since the
      77             :  * write paths are similar.  The first two steps (creating the reservation
      78             :  * and allocating the blocks) are exactly the same as delalloc except that
      79             :  * the mappings must be stored in a separate CoW fork because we do not want
      80             :  * to disturb the mapping in the data fork until we're sure that the write
      81             :  * succeeded.  IO completion in this case is the process of removing the old
      82             :  * mapping from the data fork and moving the new mapping from the CoW fork to
      83             :  * the data fork.  This will be discussed shortly.
      84             :  *
      85             :  * For now, unaligned directio writes will be bounced back to the page cache.
      86             :  * Block-aligned directio writes will use the same mechanism as buffered
      87             :  * writes.
      88             :  *
      89             :  * Just prior to submitting the actual disk write requests, we convert
      90             :  * the extents representing the range of the file actually being written
      91             :  * (as opposed to extra pieces created for the cowextsize hint) to real
      92             :  * extents.  This will become important in the next step:
      93             :  *
      94             :  * D: --RRRRRRSSSRRRRRRRR---
      95             :  * C: ------UUrrUUU---------
      96             :  *
      97             :  * CoW remapping must be done after the data block write completes,
      98             :  * because we don't want to destroy the old data fork map until we're sure
      99             :  * the new block has been written.  Since the new mappings are kept in a
     100             :  * separate fork, we can simply iterate these mappings to find the ones
     101             :  * that cover the file blocks that we just CoW'd.  For each extent, simply
     102             :  * unmap the corresponding range in the data fork, map the new range into
     103             :  * the data fork, and remove the extent from the CoW fork.  Because of
     104             :  * the presence of the cowextsize hint, however, we must be careful
     105             :  * only to remap the blocks that we've actually written out --  we must
     106             :  * never remap delalloc reservations nor CoW staging blocks that have
     107             :  * yet to be written.  This corresponds exactly to the real extents in
     108             :  * the CoW fork:
     109             :  *
     110             :  * D: --RRRRRRrrSRRRRRRRR---
     111             :  * C: ------UU--UUU---------
     112             :  *
     113             :  * Since the remapping operation can be applied to an arbitrary file
     114             :  * range, we record the need for the remap step as a flag in the ioend
     115             :  * instead of declaring a new IO type.  This is required for direct io
     116             :  * because we only have ioend for the whole dio, and we have to be able to
     117             :  * remember the presence of unwritten blocks and CoW blocks with a single
     118             :  * ioend structure.  Better yet, the more ground we can cover with one
     119             :  * ioend, the better.
     120             :  */
     121             : 
     122             : /*
     123             :  * Given an AG extent, find the lowest-numbered run of shared blocks
     124             :  * within that range and return the range in fbno/flen.  If
     125             :  * find_end_of_shared is true, return the longest contiguous extent of
     126             :  * shared blocks.  If there are no shared extents, fbno and flen will
     127             :  * be set to NULLAGBLOCK and 0, respectively.
     128             :  */
     129             : static int
     130   264757935 : xfs_reflink_find_shared(
     131             :         struct xfs_perag        *pag,
     132             :         struct xfs_trans        *tp,
     133             :         xfs_agblock_t           agbno,
     134             :         xfs_extlen_t            aglen,
     135             :         xfs_agblock_t           *fbno,
     136             :         xfs_extlen_t            *flen,
     137             :         bool                    find_end_of_shared)
     138             : {
     139   264757935 :         struct xfs_buf          *agbp;
     140   264757935 :         struct xfs_btree_cur    *cur;
     141   264757935 :         int                     error;
     142             : 
     143   264757935 :         error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
     144   264759574 :         if (error)
     145             :                 return error;
     146             : 
     147   264759539 :         cur = xfs_refcountbt_init_cursor(pag->pag_mount, tp, agbp, pag);
     148             : 
     149   264759015 :         error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen,
     150             :                         find_end_of_shared);
     151             : 
     152   264756625 :         xfs_btree_del_cursor(cur, error);
     153             : 
     154   264760942 :         xfs_trans_brelse(tp, agbp);
     155   264760942 :         return error;
     156             : }
     157             : 
     158             : /*
     159             :  * Trim the mapping to the next block where there's a change in the
     160             :  * shared/unshared status.  More specifically, this means that we
     161             :  * find the lowest-numbered extent of shared blocks that coincides with
     162             :  * the given block mapping.  If the shared extent overlaps the start of
     163             :  * the mapping, trim the mapping to the end of the shared extent.  If
     164             :  * the shared extent begins partway through the mapping, trim the
     165             :  * mapping to the start of the shared extent.  If there are no shared
     166             :  * regions that overlap, just return the original extent.
     167             :  */
     168             : int
     169    15399819 : xfs_reflink_trim_around_shared(
     170             :         struct xfs_inode        *ip,
     171             :         struct xfs_bmbt_irec    *irec,
     172             :         bool                    *shared)
     173             : {
     174    15399819 :         struct xfs_mount        *mp = ip->i_mount;
     175    15399819 :         struct xfs_perag        *pag;
     176    15399819 :         xfs_agblock_t           agbno;
     177    15399819 :         xfs_extlen_t            aglen;
     178    15399819 :         xfs_agblock_t           fbno;
     179    15399819 :         xfs_extlen_t            flen;
     180    15399819 :         int                     error = 0;
     181             : 
     182             :         /* Holes, unwritten, and delalloc extents cannot be shared */
     183    15399819 :         if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) {
     184     4912331 :                 *shared = false;
     185     4912331 :                 return 0;
     186             :         }
     187             : 
     188    10487488 :         trace_xfs_reflink_trim_around_shared(ip, irec);
     189             : 
     190    10487470 :         pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, irec->br_startblock));
     191    10487460 :         agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock);
     192    10487460 :         aglen = irec->br_blockcount;
     193             : 
     194    10487460 :         error = xfs_reflink_find_shared(pag, NULL, agbno, aglen, &fbno, &flen,
     195             :                         true);
     196    10487541 :         xfs_perag_put(pag);
     197    10487535 :         if (error)
     198             :                 return error;
     199             : 
     200    10487433 :         *shared = false;
     201    10487433 :         if (fbno == NULLAGBLOCK) {
     202             :                 /* No shared blocks at all. */
     203             :                 return 0;
     204             :         }
     205             : 
     206      889813 :         if (fbno == agbno) {
     207             :                 /*
     208             :                  * The start of this extent is shared.  Truncate the
     209             :                  * mapping at the end of the shared region so that a
     210             :                  * subsequent iteration starts at the start of the
     211             :                  * unshared region.
     212             :                  */
     213      876140 :                 irec->br_blockcount = flen;
     214      876140 :                 *shared = true;
     215      876140 :                 return 0;
     216             :         }
     217             : 
     218             :         /*
     219             :          * There's a shared extent midway through this extent.
     220             :          * Truncate the mapping at the start of the shared
     221             :          * extent so that a subsequent iteration starts at the
     222             :          * start of the shared region.
     223             :          */
     224       13673 :         irec->br_blockcount = fbno - agbno;
     225       13673 :         return 0;
     226             : }
     227             : 
     228             : int
     229     8491728 : xfs_bmap_trim_cow(
     230             :         struct xfs_inode        *ip,
     231             :         struct xfs_bmbt_irec    *imap,
     232             :         bool                    *shared)
     233             : {
     234             :         /* We can't update any real extents in always COW mode. */
     235     8491728 :         if (xfs_is_always_cow_inode(ip) &&
     236           0 :             !isnullstartblock(imap->br_startblock)) {
     237           0 :                 *shared = true;
     238           0 :                 return 0;
     239             :         }
     240             : 
     241             :         /* Trim the mapping to the nearest shared extent boundary. */
     242     8491728 :         return xfs_reflink_trim_around_shared(ip, imap, shared);
     243             : }
     244             : 
     245             : static int
     246     1659111 : xfs_reflink_convert_cow_locked(
     247             :         struct xfs_inode        *ip,
     248             :         xfs_fileoff_t           offset_fsb,
     249             :         xfs_filblks_t           count_fsb)
     250             : {
     251     1659111 :         struct xfs_iext_cursor  icur;
     252     1659111 :         struct xfs_bmbt_irec    got;
     253     1659111 :         struct xfs_btree_cur    *dummy_cur = NULL;
     254     1659111 :         int                     dummy_logflags;
     255     1659111 :         int                     error = 0;
     256             : 
     257     1659111 :         if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
     258             :                 return 0;
     259             : 
     260     3223051 :         do {
     261     3223051 :                 if (got.br_startoff >= offset_fsb + count_fsb)
     262             :                         break;
     263     1660019 :                 if (got.br_state == XFS_EXT_NORM)
     264           6 :                         continue;
     265     1660013 :                 if (WARN_ON_ONCE(isnullstartblock(got.br_startblock)))
     266             :                         return -EIO;
     267             : 
     268     1660013 :                 xfs_trim_extent(&got, offset_fsb, count_fsb);
     269     1660034 :                 if (!got.br_blockcount)
     270           0 :                         continue;
     271             : 
     272     1660034 :                 got.br_state = XFS_EXT_NORM;
     273     1660034 :                 error = xfs_bmap_add_extent_unwritten_real(NULL, ip,
     274             :                                 XFS_COW_FORK, &icur, &dummy_cur, &got,
     275             :                                 &dummy_logflags);
     276     1659869 :                 if (error)
     277           0 :                         return error;
     278     1659875 :         } while (xfs_iext_next_extent(ip->i_cowfp, &icur, &got));
     279             : 
     280             :         return error;
     281             : }
     282             : 
     283             : /* Convert all of the unwritten CoW extents in a file's range to real ones. */
     284             : int
     285      256720 : xfs_reflink_convert_cow(
     286             :         struct xfs_inode        *ip,
     287             :         xfs_off_t               offset,
     288             :         xfs_off_t               count)
     289             : {
     290      256720 :         struct xfs_mount        *mp = ip->i_mount;
     291      256720 :         xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
     292      256720 :         xfs_fileoff_t           end_fsb = XFS_B_TO_FSB(mp, offset + count);
     293      256720 :         xfs_filblks_t           count_fsb = end_fsb - offset_fsb;
     294      256720 :         int                     error;
     295             : 
     296      256720 :         ASSERT(count != 0);
     297             : 
     298      256720 :         xfs_ilock(ip, XFS_ILOCK_EXCL);
     299      256511 :         error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
     300      256389 :         xfs_iunlock(ip, XFS_ILOCK_EXCL);
     301      256151 :         return error;
     302             : }
     303             : 
     304             : /*
     305             :  * Find the extent that maps the given range in the COW fork. Even if the extent
     306             :  * is not shared we might have a preallocation for it in the COW fork. If so we
     307             :  * use that rather than trigger a new allocation.
     308             :  */
     309             : static int
     310     3277436 : xfs_find_trim_cow_extent(
     311             :         struct xfs_inode        *ip,
     312             :         struct xfs_bmbt_irec    *imap,
     313             :         struct xfs_bmbt_irec    *cmap,
     314             :         bool                    *shared,
     315             :         bool                    *found)
     316             : {
     317     3277436 :         xfs_fileoff_t           offset_fsb = imap->br_startoff;
     318     3277436 :         xfs_filblks_t           count_fsb = imap->br_blockcount;
     319     3277436 :         struct xfs_iext_cursor  icur;
     320             : 
     321     3277436 :         *found = false;
     322             : 
     323             :         /*
     324             :          * If we don't find an overlapping extent, trim the range we need to
     325             :          * allocate to fit the hole we found.
     326             :          */
     327     3277436 :         if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, cmap))
     328      227054 :                 cmap->br_startoff = offset_fsb + count_fsb;
     329     3277436 :         if (cmap->br_startoff > offset_fsb) {
     330     2086450 :                 xfs_trim_extent(imap, imap->br_startoff,
     331             :                                 cmap->br_startoff - imap->br_startoff);
     332     2086450 :                 return xfs_bmap_trim_cow(ip, imap, shared);
     333             :         }
     334             : 
     335     1190986 :         *shared = true;
     336     1190986 :         if (isnullstartblock(cmap->br_startblock)) {
     337         926 :                 xfs_trim_extent(imap, cmap->br_startoff, cmap->br_blockcount);
     338         926 :                 return 0;
     339             :         }
     340             : 
     341             :         /* real extent found - no need to allocate */
     342     1190060 :         xfs_trim_extent(cmap, offset_fsb, count_fsb);
     343     1190060 :         *found = true;
     344     1190060 :         return 0;
     345             : }
     346             : 
     347             : static int
     348     1409828 : xfs_reflink_convert_unwritten(
     349             :         struct xfs_inode        *ip,
     350             :         struct xfs_bmbt_irec    *imap,
     351             :         struct xfs_bmbt_irec    *cmap,
     352             :         bool                    convert_now)
     353             : {
     354     1409828 :         xfs_fileoff_t           offset_fsb = imap->br_startoff;
     355     1409828 :         xfs_filblks_t           count_fsb = imap->br_blockcount;
     356     1409828 :         int                     error;
     357             : 
     358             :         /*
     359             :          * cmap might be larger than imap due to the cowextsize hint.
     360             :          */
     361     1409828 :         xfs_trim_extent(cmap, offset_fsb, count_fsb);
     362             : 
     363             :         /*
     364             :          * COW fork extents are supposed to remain unwritten until we're ready
     365             :          * to initiate a disk write.  For direct I/O we are going to write the
     366             :          * data and need the conversion, but for buffered writes we're done.
     367             :          */
     368     1409828 :         if (!convert_now || cmap->br_state == XFS_EXT_NORM)
     369             :                 return 0;
     370             : 
     371     1402399 :         trace_xfs_reflink_convert_cow(ip, cmap);
     372             : 
     373     1402399 :         error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
     374     1402399 :         if (!error)
     375     1402399 :                 cmap->br_state = XFS_EXT_NORM;
     376             : 
     377             :         return error;
     378             : }
     379             : 
     380             : static int
     381      219352 : xfs_reflink_fill_cow_hole(
     382             :         struct xfs_inode        *ip,
     383             :         struct xfs_bmbt_irec    *imap,
     384             :         struct xfs_bmbt_irec    *cmap,
     385             :         bool                    *shared,
     386             :         uint                    *lockmode,
     387             :         bool                    convert_now)
     388             : {
     389      219352 :         struct xfs_mount        *mp = ip->i_mount;
     390      219352 :         struct xfs_trans        *tp;
     391      219352 :         xfs_filblks_t           resaligned;
     392      219352 :         xfs_extlen_t            resblks;
     393      219352 :         int                     nimaps;
     394      219352 :         int                     error;
     395      219352 :         bool                    found;
     396             : 
     397      219352 :         resaligned = xfs_aligned_fsb_count(imap->br_startoff,
     398             :                 imap->br_blockcount, xfs_get_cowextsz_hint(ip));
     399      219352 :         resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
     400             : 
     401      219352 :         xfs_iunlock(ip, *lockmode);
     402      219352 :         *lockmode = 0;
     403             : 
     404      219352 :         error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0,
     405             :                         false, &tp);
     406      219352 :         if (error)
     407             :                 return error;
     408             : 
     409      219344 :         *lockmode = XFS_ILOCK_EXCL;
     410             : 
     411      219344 :         error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
     412      219344 :         if (error || !*shared)
     413           0 :                 goto out_trans_cancel;
     414             : 
     415      219344 :         if (found) {
     416           0 :                 xfs_trans_cancel(tp);
     417           0 :                 goto convert;
     418             :         }
     419             : 
     420             :         /* Allocate the entire reservation as unwritten blocks. */
     421      219344 :         nimaps = 1;
     422      219344 :         error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
     423             :                         XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, cmap,
     424             :                         &nimaps);
     425      219344 :         if (error)
     426          13 :                 goto out_trans_cancel;
     427             : 
     428      219331 :         xfs_inode_set_cowblocks_tag(ip);
     429      219331 :         error = xfs_trans_commit(tp);
     430      219331 :         if (error)
     431             :                 return error;
     432             : 
     433             :         /*
     434             :          * Allocation succeeded but the requested range was not even partially
     435             :          * satisfied?  Bail out!
     436             :          */
     437      219331 :         if (nimaps == 0)
     438             :                 return -ENOSPC;
     439             : 
     440      219331 : convert:
     441      219331 :         return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);
     442             : 
     443          13 : out_trans_cancel:
     444          13 :         xfs_trans_cancel(tp);
     445          13 :         return error;
     446             : }
     447             : 
     448             : static int
     449         437 : xfs_reflink_fill_delalloc(
     450             :         struct xfs_inode        *ip,
     451             :         struct xfs_bmbt_irec    *imap,
     452             :         struct xfs_bmbt_irec    *cmap,
     453             :         bool                    *shared,
     454             :         uint                    *lockmode,
     455             :         bool                    convert_now)
     456             : {
     457         437 :         struct xfs_mount        *mp = ip->i_mount;
     458         489 :         struct xfs_trans        *tp;
     459         489 :         int                     nimaps;
     460         489 :         int                     error;
     461         489 :         bool                    found;
     462             : 
     463         489 :         do {
     464         489 :                 xfs_iunlock(ip, *lockmode);
     465         489 :                 *lockmode = 0;
     466             : 
     467         489 :                 error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, 0, 0,
     468             :                                 false, &tp);
     469         489 :                 if (error)
     470           0 :                         return error;
     471             : 
     472         489 :                 *lockmode = XFS_ILOCK_EXCL;
     473             : 
     474         489 :                 error = xfs_find_trim_cow_extent(ip, imap, cmap, shared,
     475             :                                 &found);
     476         489 :                 if (error || !*shared)
     477           0 :                         goto out_trans_cancel;
     478             : 
     479         489 :                 if (found) {
     480           0 :                         xfs_trans_cancel(tp);
     481           0 :                         break;
     482             :                 }
     483             : 
     484         489 :                 ASSERT(isnullstartblock(cmap->br_startblock) ||
     485             :                        cmap->br_startblock == DELAYSTARTBLOCK);
     486             : 
     487             :                 /*
     488             :                  * Replace delalloc reservation with an unwritten extent.
     489             :                  */
     490         489 :                 nimaps = 1;
     491         489 :                 error = xfs_bmapi_write(tp, ip, cmap->br_startoff,
     492             :                                 cmap->br_blockcount,
     493             :                                 XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0,
     494             :                                 cmap, &nimaps);
     495         489 :                 if (error)
     496           0 :                         goto out_trans_cancel;
     497             : 
     498         489 :                 xfs_inode_set_cowblocks_tag(ip);
     499         489 :                 error = xfs_trans_commit(tp);
     500         489 :                 if (error)
     501           0 :                         return error;
     502             : 
     503             :                 /*
     504             :                  * Allocation succeeded but the requested range was not even
     505             :                  * partially satisfied?  Bail out!
     506             :                  */
     507         489 :                 if (nimaps == 0)
     508             :                         return -ENOSPC;
     509         489 :         } while (cmap->br_startoff + cmap->br_blockcount <= imap->br_startoff);
     510             : 
     511         437 :         return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);
     512             : 
     513           0 : out_trans_cancel:
     514           0 :         xfs_trans_cancel(tp);
     515           0 :         return error;
     516             : }
     517             : 
     518             : /* Allocate all CoW reservations covering a range of blocks in a file. */
     519             : int
     520     3057603 : xfs_reflink_allocate_cow(
     521             :         struct xfs_inode        *ip,
     522             :         struct xfs_bmbt_irec    *imap,
     523             :         struct xfs_bmbt_irec    *cmap,
     524             :         bool                    *shared,
     525             :         uint                    *lockmode,
     526             :         bool                    convert_now)
     527             : {
     528     3057603 :         int                     error;
     529     3057603 :         bool                    found;
     530             : 
     531     3057603 :         ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
     532     3057603 :         if (!ip->i_cowfp) {
     533           0 :                 ASSERT(!xfs_is_reflink_inode(ip));
     534           0 :                 xfs_ifork_init_cow(ip);
     535             :         }
     536             : 
     537     3057603 :         error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
     538     3057603 :         if (error || !*shared)
     539             :                 return error;
     540             : 
     541             :         /* CoW fork has a real extent */
     542     1409849 :         if (found)
     543     1190060 :                 return xfs_reflink_convert_unwritten(ip, imap, cmap,
     544             :                                 convert_now);
     545             : 
     546             :         /*
     547             :          * CoW fork does not have an extent and data extent is shared.
     548             :          * Allocate a real extent in the CoW fork.
     549             :          */
     550      219789 :         if (cmap->br_startoff > imap->br_startoff)
     551      219352 :                 return xfs_reflink_fill_cow_hole(ip, imap, cmap, shared,
     552             :                                 lockmode, convert_now);
     553             : 
     554             :         /*
     555             :          * CoW fork has a delalloc reservation. Replace it with a real extent.
     556             :          * There may or may not be a data fork mapping.
     557             :          */
     558         437 :         if (isnullstartblock(cmap->br_startblock) ||
     559             :             cmap->br_startblock == DELAYSTARTBLOCK)
     560         437 :                 return xfs_reflink_fill_delalloc(ip, imap, cmap, shared,
     561             :                                 lockmode, convert_now);
     562             : 
     563             :         /* Shouldn't get here. */
     564           0 :         ASSERT(0);
     565           0 :         return -EFSCORRUPTED;
     566             : }
     567             : 
     568             : /*
     569             :  * Cancel CoW reservations for some block range of an inode.
     570             :  *
     571             :  * If cancel_real is true this function cancels all COW fork extents in the
     572             :  * range; if cancel_real is false, written real extents are not cleared.
     573             :  *
     574             :  * Caller must have already joined the inode to the current transaction. The
     575             :  * inode will be joined to the transaction returned to the caller.
     576             :  */
     577             : int
     578    11071963 : xfs_reflink_cancel_cow_blocks(
     579             :         struct xfs_inode                *ip,
     580             :         struct xfs_trans                **tpp,
     581             :         xfs_fileoff_t                   offset_fsb,
     582             :         xfs_fileoff_t                   end_fsb,
     583             :         bool                            cancel_real)
     584             : {
     585    11071963 :         struct xfs_ifork                *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
     586    11071963 :         struct xfs_bmbt_irec            got, del;
     587    11071963 :         struct xfs_iext_cursor          icur;
     588    11071963 :         int                             error = 0;
     589             : 
     590    22143926 :         if (!xfs_inode_has_cow_data(ip))
     591             :                 return 0;
     592      233367 :         if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
     593             :                 return 0;
     594             : 
     595             :         /* Walk backwards until we're out of the I/O range... */
     596      336228 :         while (got.br_startoff + got.br_blockcount > offset_fsb) {
     597      165007 :                 del = got;
     598      165007 :                 xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb);
     599             : 
     600             :                 /* Extent delete may have bumped ext forward */
     601      165007 :                 if (!del.br_blockcount) {
     602        3760 :                         xfs_iext_prev(ifp, &icur);
     603        3760 :                         goto next_extent;
     604             :                 }
     605             : 
     606      161247 :                 trace_xfs_reflink_cancel_cow(ip, &del);
     607             : 
     608      161247 :                 if (isnullstartblock(del.br_startblock)) {
     609       18764 :                         error = xfs_bmap_del_extent_delay(ip, XFS_COW_FORK,
     610             :                                         &icur, &got, &del);
     611       18764 :                         if (error)
     612             :                                 break;
     613      142483 :                 } else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) {
     614      142483 :                         ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER);
     615             : 
     616             :                         /* Free the CoW orphan record. */
     617      142483 :                         xfs_refcount_free_cow_extent(*tpp, del.br_startblock,
     618      142483 :                                         del.br_blockcount);
     619             : 
     620      142483 :                         error = xfs_free_extent_later(*tpp, del.br_startblock,
     621             :                                         del.br_blockcount, NULL,
     622             :                                         XFS_AG_RESV_NONE);
     623      142483 :                         if (error)
     624             :                                 break;
     625             : 
     626             :                         /* Roll the transaction */
     627      142483 :                         error = xfs_defer_finish(tpp);
     628      142483 :                         if (error)
     629             :                                 break;
     630             : 
     631             :                         /* Remove the mapping from the CoW fork. */
     632      142474 :                         xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
     633             : 
     634             :                         /* Remove the quota reservation */
     635      142474 :                         error = xfs_quota_unreserve_blkres(ip,
     636      142474 :                                         del.br_blockcount);
     637      142474 :                         if (error)
     638             :                                 break;
     639             :                 } else {
     640             :                         /* Didn't do anything, push cursor back. */
     641           0 :                         xfs_iext_prev(ifp, &icur);
     642             :                 }
     643      164998 : next_extent:
     644      164998 :                 if (!xfs_iext_get_extent(ifp, &icur, &got))
     645             :                         break;
     646             :         }
     647             : 
     648             :         /* clear tag if cow fork is emptied */
     649      218289 :         if (!ifp->if_bytes)
     650       43948 :                 xfs_inode_clear_cowblocks_tag(ip);
     651             :         return error;
     652             : }
     653             : 
     654             : /*
     655             :  * Cancel CoW reservations for some byte range of an inode.
     656             :  *
     657             :  * If cancel_real is true this function cancels all COW fork extents in the
     658             :  * range; if cancel_real is false, written real extents are not cleared.
     659             :  */
     660             : int
     661      102284 : xfs_reflink_cancel_cow_range(
     662             :         struct xfs_inode        *ip,
     663             :         xfs_off_t               offset,
     664             :         xfs_off_t               count,
     665             :         bool                    cancel_real)
     666             : {
     667      102284 :         struct xfs_trans        *tp;
     668      102284 :         xfs_fileoff_t           offset_fsb;
     669      102284 :         xfs_fileoff_t           end_fsb;
     670      102284 :         int                     error;
     671             : 
     672      102284 :         trace_xfs_reflink_cancel_cow_range(ip, offset, count);
     673      102284 :         ASSERT(ip->i_cowfp);
     674             : 
     675      102284 :         offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
     676      102284 :         if (count == NULLFILEOFF)
     677             :                 end_fsb = NULLFILEOFF;
     678             :         else
     679       53268 :                 end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
     680             : 
     681             :         /* Start a rolling transaction to remove the mappings */
     682      102284 :         error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
     683             :                         0, 0, 0, &tp);
     684      102284 :         if (error)
     685           0 :                 goto out;
     686             : 
     687      102284 :         xfs_ilock(ip, XFS_ILOCK_EXCL);
     688      102284 :         xfs_trans_ijoin(tp, ip, 0);
     689             : 
     690             :         /* Scrape out the old CoW reservations */
     691      102284 :         error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb,
     692             :                         cancel_real);
     693      102284 :         if (error)
     694           9 :                 goto out_cancel;
     695             : 
     696      102275 :         error = xfs_trans_commit(tp);
     697             : 
     698      102275 :         xfs_iunlock(ip, XFS_ILOCK_EXCL);
     699      102275 :         return error;
     700             : 
     701             : out_cancel:
     702           9 :         xfs_trans_cancel(tp);
     703           9 :         xfs_iunlock(ip, XFS_ILOCK_EXCL);
     704           9 : out:
     705           9 :         trace_xfs_reflink_cancel_cow_range_error(ip, error, _RET_IP_);
     706           9 :         return error;
     707             : }
     708             : 
     709             : /*
     710             :  * Remap part of the CoW fork into the data fork.
     711             :  *
     712             :  * We aim to remap the range starting at @offset_fsb and ending at @end_fsb
     713             :  * into the data fork; this function will remap what it can (at the start of
     714             :  * the range) and update @offset_fsb appropriately.  Each remap gets its own
     715             :  * transaction because we can end up merging and splitting bmbt blocks for
     716             :  * every remap operation and we'd like to keep the block reservation
     717             :  * requirements as low as possible.
     718             :  */
     719             : STATIC int
     720     1840027 : xfs_reflink_end_cow_extent(
     721             :         struct xfs_inode        *ip,
     722             :         xfs_fileoff_t           *offset_fsb,
     723             :         xfs_fileoff_t           end_fsb)
     724             : {
     725     1840027 :         struct xfs_iext_cursor  icur;
     726     1840027 :         struct xfs_bmbt_irec    got, del, data;
     727     1840027 :         struct xfs_mount        *mp = ip->i_mount;
     728     1840027 :         struct xfs_trans        *tp;
     729     1840027 :         struct xfs_ifork        *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
     730     1840027 :         unsigned int            resblks;
     731     1840027 :         int                     nmaps;
     732     1840027 :         int                     error;
     733             : 
     734             :         /* No COW extents?  That's easy! */
     735     1840027 :         if (ifp->if_bytes == 0) {
     736         147 :                 *offset_fsb = end_fsb;
     737         147 :                 return 0;
     738             :         }
     739             : 
     740     1839880 :         resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
     741     1839880 :         error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
     742             :                         XFS_TRANS_RESERVE, &tp);
     743     1839880 :         if (error)
     744             :                 return error;
     745             : 
     746             :         /*
     747             :          * Lock the inode.  We have to ijoin without automatic unlock because
     748             :          * the lead transaction is the refcountbt record deletion; the data
     749             :          * fork update follows as a deferred log item.
     750             :          */
     751     1839880 :         xfs_ilock(ip, XFS_ILOCK_EXCL);
     752     1839880 :         xfs_trans_ijoin(tp, ip, 0);
     753             : 
     754     1839880 :         error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
     755             :                         XFS_IEXT_REFLINK_END_COW_CNT);
     756     1839880 :         if (error == -EFBIG)
     757           4 :                 error = xfs_iext_count_upgrade(tp, ip,
     758             :                                 XFS_IEXT_REFLINK_END_COW_CNT);
     759     1839880 :         if (error)
     760           4 :                 goto out_cancel;
     761             : 
     762             :         /*
     763             :          * In case of racing, overlapping AIO writes, no COW extents might be
     764             :          * left by the time I/O completes for the loser of the race.  In that
     765             :          * case we are done.
     766             :          */
     767     1839876 :         if (!xfs_iext_lookup_extent(ip, ifp, *offset_fsb, &icur, &got) ||
     768     1839570 :             got.br_startoff >= end_fsb) {
     769       21918 :                 *offset_fsb = end_fsb;
     770       21918 :                 goto out_cancel;
     771             :         }
     772             : 
     773             :         /*
     774             :          * Only remap real extents that contain data.  With AIO, speculative
     775             :          * preallocations can leak into the range we are called upon, and we
     776             :          * need to skip them.  Preserve @got for the eventual CoW fork
     777             :          * deletion; from now on @del represents the mapping that we're
     778             :          * actually remapping.
     779             :          */
     780     1820267 :         while (!xfs_bmap_is_written_extent(&got)) {
     781        2588 :                 if (!xfs_iext_next_extent(ifp, &icur, &got) ||
     782        2583 :                     got.br_startoff >= end_fsb) {
     783         279 :                         *offset_fsb = end_fsb;
     784         279 :                         goto out_cancel;
     785             :                 }
     786             :         }
     787     1817679 :         del = got;
     788             : 
     789             :         /* Grab the corresponding mapping in the data fork. */
     790     1817679 :         nmaps = 1;
     791     1817679 :         error = xfs_bmapi_read(ip, del.br_startoff, del.br_blockcount, &data,
     792             :                         &nmaps, 0);
     793     1817679 :         if (error)
     794          44 :                 goto out_cancel;
     795             : 
     796             :         /* We can only remap the smaller of the two extent sizes. */
     797     1817635 :         data.br_blockcount = min(data.br_blockcount, del.br_blockcount);
     798     1817635 :         del.br_blockcount = data.br_blockcount;
     799             : 
     800     1817635 :         trace_xfs_reflink_cow_remap_from(ip, &del);
     801     1817635 :         trace_xfs_reflink_cow_remap_to(ip, &data);
     802             : 
     803     3546777 :         if (xfs_bmap_is_real_extent(&data)) {
     804             :                 /*
     805             :                  * If the extent we're remapping is backed by storage (written
     806             :                  * or not), unmap the extent and drop its refcount.
     807             :                  */
     808     1729142 :                 xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data);
     809     1729142 :                 xfs_refcount_decrease_extent(tp, &data);
     810     1729142 :                 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
     811     1729142 :                                 -data.br_blockcount);
     812       88493 :         } else if (data.br_startblock == DELAYSTARTBLOCK) {
     813       17991 :                 int             done;
     814             : 
     815             :                 /*
     816             :                  * If the extent we're remapping is a delalloc reservation,
     817             :                  * we can use the regular bunmapi function to release the
     818             :                  * incore state.  Dropping the delalloc reservation takes care
     819             :                  * of the quota reservation for us.
     820             :                  */
     821       17991 :                 error = xfs_bunmapi(NULL, ip, data.br_startoff,
     822             :                                 data.br_blockcount, 0, 1, &done);
     823       17991 :                 if (error)
     824           0 :                         goto out_cancel;
     825       17991 :                 ASSERT(done);
     826             :         }
     827             : 
     828             :         /* Free the CoW orphan record. */
     829     1817635 :         xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount);
     830             : 
     831             :         /* Map the new blocks into the data fork. */
     832     1817635 :         xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, &del);
     833             : 
     834             :         /* Charge this new data fork mapping to the on-disk quota. */
     835     1817635 :         xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT,
     836     1817635 :                         (long)del.br_blockcount);
     837             : 
     838             :         /* Remove the mapping from the CoW fork. */
     839     1817635 :         xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
     840             : 
     841     1817635 :         error = xfs_trans_commit(tp);
     842     1817635 :         xfs_iunlock(ip, XFS_ILOCK_EXCL);
     843     1817635 :         if (error)
     844             :                 return error;
     845             : 
     846             :         /* Update the caller about how much progress we made. */
     847     1817631 :         *offset_fsb = del.br_startoff + del.br_blockcount;
     848     1817631 :         return 0;
     849             : 
     850       22245 : out_cancel:
     851       22245 :         xfs_trans_cancel(tp);
     852       22245 :         xfs_iunlock(ip, XFS_ILOCK_EXCL);
     853       22245 :         return error;
     854             : }
     855             : 
     856             : /*
     857             :  * Remap parts of a file's data fork after a successful CoW.
     858             :  */
     859             : int
     860     1393749 : xfs_reflink_end_cow(
     861             :         struct xfs_inode                *ip,
     862             :         xfs_off_t                       offset,
     863             :         xfs_off_t                       count)
     864             : {
     865     1393749 :         xfs_fileoff_t                   offset_fsb;
     866     1393749 :         xfs_fileoff_t                   end_fsb;
     867     1393749 :         int                             error = 0;
     868             : 
     869     1393749 :         trace_xfs_reflink_end_cow(ip, offset, count);
     870             : 
     871     1393749 :         offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
     872     1393749 :         end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
     873             : 
     874             :         /*
     875             :          * Walk forwards until we've remapped the I/O range.  The loop function
     876             :          * repeatedly cycles the ILOCK to allocate one transaction per remapped
     877             :          * extent.
     878             :          *
     879             :          * If we're being called by writeback then the pages will still
     880             :          * have PageWriteback set, which prevents races with reflink remapping
     881             :          * and truncate.  Reflink remapping prevents races with writeback by
     882             :          * taking the iolock and mmaplock before flushing the pages and
     883             :          * remapping, which means there won't be any further writeback or page
     884             :          * cache dirtying until the reflink completes.
     885             :          *
     886             :          * We should never have two threads issuing writeback for the same file
     887             :          * region.  There are also post-eof checks in the writeback
     888             :          * preparation code so that we don't bother writing out pages that are
     889             :          * about to be truncated.
     890             :          *
     891             :          * If we're being called as part of directio write completion, the dio
     892             :          * count is still elevated, which reflink and truncate will wait for.
     893             :          * Reflink remapping takes the iolock and mmaplock and waits for
     894             :          * pending dio to finish, which should prevent any directio until the
     895             :          * remap completes.  Multiple concurrent directio writes to the same
     896             :          * region are handled by end_cow processing only occurring for the
     897             :          * threads which succeed; the outcome of multiple overlapping direct
     898             :          * writes is not well defined anyway.
     899             :          *
     900             :          * It's possible that a buffered write and a direct write could collide
     901             :          * here (the buffered write stumbles in after the dio flushes and
     902             :          * invalidates the page cache and immediately queues writeback), but we
     903             :          * have never supported this 100%.  If either disk write succeeds the
     904             :          * blocks will be remapped.
     905             :          */
     906     3233776 :         while (end_fsb > offset_fsb && !error)
     907     1840027 :                 error = xfs_reflink_end_cow_extent(ip, &offset_fsb, end_fsb);
     908             : 
     909     1393749 :         if (error)
     910          52 :                 trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
     911     1393749 :         return error;
     912             : }
     913             : 
     914             : /*
     915             :  * Free all CoW staging blocks that are still referenced by the ondisk refcount
     916             :  * metadata.  The ondisk metadata does not track which inode created the
     917             :  * staging extent, so callers must ensure that there are no cached inodes with
     918             :  * live CoW staging extents.
     919             :  */
     920             : int
     921       11281 : xfs_reflink_recover_cow(
     922             :         struct xfs_mount        *mp)
     923             : {
     924       11281 :         struct xfs_perag        *pag;
     925       11281 :         xfs_agnumber_t          agno;
     926       11281 :         int                     error = 0;
     927             : 
     928       11281 :         if (!xfs_has_reflink(mp))
     929             :                 return 0;
     930             : 
     931       56355 :         for_each_perag(mp, agno, pag) {
     932       45082 :                 error = xfs_refcount_recover_cow_leftovers(mp, pag);
     933       45082 :                 if (error) {
     934           8 :                         xfs_perag_rele(pag);
     935           8 :                         break;
     936             :                 }
     937             :         }
     938             : 
     939             :         return error;
     940             : }
     941             : 
     942             : /*
     943             :  * Reflinking (Block) Ranges of Two Files Together
     944             :  *
     945             :  * First, ensure that the reflink flag is set on both inodes.  The flag is an
     946             :  * optimization to avoid unnecessary refcount btree lookups in the write path.
     947             :  *
     948             :  * Now we can iteratively remap the range of extents (and holes) in src to the
     949             :  * corresponding ranges in dest.  Let drange and srange denote the ranges of
     950             :  * logical blocks in dest and src touched by the reflink operation.
     951             :  *
     952             :  * While the length of drange is greater than zero,
     953             :  *    - Read src's bmbt at the start of srange ("imap")
     954             :  *    - If imap doesn't exist, make imap appear to start at the end of srange
     955             :  *      with zero length.
     956             :  *    - If imap starts before srange, advance imap to start at srange.
     957             :  *    - If imap goes beyond srange, truncate imap to end at the end of srange.
     958             :  *    - Punch (imap start - srange start + imap len) blocks from dest at
     959             :  *      offset (drange start).
     960             :  *    - If imap points to a real range of pblks,
     961             :  *         > Increase the refcount of the imap's pblks
     962             :  *         > Map imap's pblks into dest at the offset
     963             :  *           (drange start + imap start - srange start)
     964             :  *    - Advance drange and srange by (imap start - srange start + imap len)
     965             :  *
     966             :  * Finally, if the reflink made dest longer, update both the in-core and
     967             :  * on-disk file sizes.
     968             :  *
     969             :  * ASCII Art Demonstration:
     970             :  *
     971             :  * Let's say we want to reflink this source file:
     972             :  *
     973             :  * ----SSSSSSS-SSSSS----SSSSSS (src file)
     974             :  *   <-------------------->
     975             :  *
     976             :  * into this destination file:
     977             :  *
     978             :  * --DDDDDDDDDDDDDDDDDDD--DDD (dest file)
     979             :  *        <-------------------->
     980             :  * '-' means a hole, and 'S' and 'D' are written blocks in the src and dest.
     981             :  * Observe that the range has different logical offsets in either file.
     982             :  *
     983             :  * Consider that the first extent in the source file doesn't line up with our
     984             :  * reflink range.  Unmapping and remapping are separate operations, so we can
     985             :  * unmap more blocks from the destination file than we remap.
     986             :  *
     987             :  * ----SSSSSSS-SSSSS----SSSSSS
     988             :  *   <------->
     989             :  * --DDDDD---------DDDDD--DDD
     990             :  *        <------->
     991             :  *
     992             :  * Now remap the source extent into the destination file:
     993             :  *
     994             :  * ----SSSSSSS-SSSSS----SSSSSS
     995             :  *   <------->
     996             :  * --DDDDD--SSSSSSSDDDDD--DDD
     997             :  *        <------->
     998             :  *
     999             :  * Do likewise with the second hole and extent in our range.  Holes in the
    1000             :  * unmap range don't affect our operation.
    1001             :  *
    1002             :  * ----SSSSSSS-SSSSS----SSSSSS
    1003             :  *            <---->
    1004             :  * --DDDDD--SSSSSSS-SSSSS-DDD
    1005             :  *                 <---->
    1006             :  *
    1007             :  * Finally, unmap and remap part of the third extent.  This will increase the
    1008             :  * size of the destination file.
    1009             :  *
    1010             :  * ----SSSSSSS-SSSSS----SSSSSS
    1011             :  *                  <----->
    1012             :  * --DDDDD--SSSSSSS-SSSSS----SSS
    1013             :  *                       <----->
    1014             :  *
    1015             :  * Once we update the destination file's i_size, we're done.
    1016             :  */
    1017             : 
    1018             : /*
    1019             :  * Ensure the reflink bit is set in both inodes.
    1020             :  */
    1021             : STATIC int
    1022   105506003 : xfs_reflink_set_inode_flag(
    1023             :         struct xfs_inode        *src,
    1024             :         struct xfs_inode        *dest)
    1025             : {
    1026   105506003 :         struct xfs_mount        *mp = src->i_mount;
    1027   105506003 :         int                     error;
    1028   105506003 :         struct xfs_trans        *tp;
    1029             : 
    1030   105506003 :         if (xfs_is_reflink_inode(src) && xfs_is_reflink_inode(dest))
    1031             :                 return 0;
    1032             : 
    1033     2547108 :         error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
    1034     2547992 :         if (error)
    1035           1 :                 goto out_error;
    1036             : 
    1037             :         /* Lock both files against IO */
    1038     2547991 :         if (src->i_ino == dest->i_ino)
    1039       71410 :                 xfs_ilock(src, XFS_ILOCK_EXCL);
    1040             :         else
    1041     2476581 :                 xfs_lock_two_inodes(src, XFS_ILOCK_EXCL, dest, XFS_ILOCK_EXCL);
    1042             : 
    1043     2547994 :         if (!xfs_is_reflink_inode(src)) {
    1044      268331 :                 trace_xfs_reflink_set_inode_flag(src);
    1045      268331 :                 xfs_trans_ijoin(tp, src, XFS_ILOCK_EXCL);
    1046      268331 :                 src->i_diflags2 |= XFS_DIFLAG2_REFLINK;
    1047      268331 :                 xfs_trans_log_inode(tp, src, XFS_ILOG_CORE);
    1048      268332 :                 xfs_ifork_init_cow(src);
    1049             :         } else
    1050     2279663 :                 xfs_iunlock(src, XFS_ILOCK_EXCL);
    1051             : 
    1052     2547995 :         if (src->i_ino == dest->i_ino)
    1053       71410 :                 goto commit_flags;
    1054             : 
    1055     2476585 :         if (!xfs_is_reflink_inode(dest)) {
    1056     2324864 :                 trace_xfs_reflink_set_inode_flag(dest);
    1057     2324862 :                 xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);
    1058     2324862 :                 dest->i_diflags2 |= XFS_DIFLAG2_REFLINK;
    1059     2324862 :                 xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
    1060     2324864 :                 xfs_ifork_init_cow(dest);
    1061             :         } else
    1062      151721 :                 xfs_iunlock(dest, XFS_ILOCK_EXCL);
    1063             : 
    1064     2547994 : commit_flags:
    1065     2547994 :         error = xfs_trans_commit(tp);
    1066     2547993 :         if (error)
    1067           3 :                 goto out_error;
    1068             :         return error;
    1069             : 
    1070           4 : out_error:
    1071           4 :         trace_xfs_reflink_set_inode_flag_error(dest, error, _RET_IP_);
    1072           4 :         return error;
    1073             : }
    1074             : 
    1075             : /*
    1076             :  * Update destination inode size & cowextsize hint, if necessary.
                     :  *
                     :  * Returns 0 without allocating a transaction when newlen does not exceed
                     :  * the current in-core size of dest and cowextsize is zero.  Otherwise a
                     :  * tr_ichange transaction is allocated, dest is locked XFS_ILOCK_EXCL and
                     :  * joined to it, the in-core size and i_disk_size are raised to newlen if
                     :  * that is larger, a nonzero cowextsize is stored together with
                     :  * XFS_DIFLAG2_COWEXTSIZE, and the inode core is logged before the commit.
                     :  * remap_flags is not examined by this function.
                     :  *
                     :  * Returns 0 or the error from xfs_trans_alloc() or xfs_trans_commit();
                     :  * errors are also sent to the xfs_reflink_update_inode_size_error
                     :  * tracepoint.
    1077             :  */
    1078             : int
    1079   105018410 : xfs_reflink_update_dest(
    1080             :         struct xfs_inode        *dest,
    1081             :         xfs_off_t               newlen,
    1082             :         xfs_extlen_t            cowextsize,
    1083             :         unsigned int            remap_flags)
    1084             : {
    1085   105018410 :         struct xfs_mount        *mp = dest->i_mount;
    1086   105018410 :         struct xfs_trans        *tp;
    1087   105018410 :         int                     error;
    1088             : 
    1089   105018410 :         if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
    1090             :                 return 0;
    1091             : 
    1092     1719241 :         error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
    1093     1721194 :         if (error)
    1094           0 :                 goto out_error;
    1095             : 
    1096     1721194 :         xfs_ilock(dest, XFS_ILOCK_EXCL);
    1097     1721196 :         xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);
    1098             : 
    1099     1721198 :         if (newlen > i_size_read(VFS_I(dest))) {
    1100     1721188 :                 trace_xfs_reflink_update_inode_size(dest, newlen);
    1101     1721177 :                 i_size_write(VFS_I(dest), newlen);
    1102     1721177 :                 dest->i_disk_size = newlen;
    1103             :         }
    1104             : 
    1105     1721187 :         if (cowextsize) {
    1106           6 :                 dest->i_cowextsize = cowextsize;
    1107           6 :                 dest->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE;
    1108             :         }
    1109             : 
    1110     1721187 :         xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
    1111             : 
    1112     1721193 :         error = xfs_trans_commit(tp);
    1113     1721195 :         if (error)
    1114           0 :                 goto out_error;
    1115             :         return error;
    1116             : 
    1117           0 : out_error:
    1118           0 :         trace_xfs_reflink_update_inode_size_error(dest, error, _RET_IP_);
    1119           0 :         return error;
    1120             : }
    1121             : 
    1122             : /*
    1123             :  * Do we have enough reserve in this AG to handle a reflink?  The refcount
    1124             :  * btree already reserved all the space it needs, but the rmap btree can grow
    1125             :  * infinitely, so we won't allow more reflinks when the AG is down to the
    1126             :  * btree reserves.
                     :  *
                     :  * Returns 0 if the filesystem has no rmap btree, or if neither the RMAPBT
                     :  * nor the METADATA reservation of the AG is reported critical by
                     :  * xfs_ag_resv_critical(); returns -ENOSPC otherwise.  The perag reference
                     :  * taken here is dropped before returning.
    1127             :  */
    1128             : static int
    1129    50340234 : xfs_reflink_ag_has_free_space(
    1130             :         struct xfs_mount        *mp,
    1131             :         xfs_agnumber_t          agno)
    1132             : {
    1133    50340234 :         struct xfs_perag        *pag;
    1134    50340234 :         int                     error = 0;
    1135             : 
    1136    50340234 :         if (!xfs_has_rmapbt(mp))
    1137             :                 return 0;
    1138             : 
    1139    50340234 :         pag = xfs_perag_get(mp, agno);
    1140   100680448 :         if (xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) ||
    1141    50340219 :             xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA))
    1142             :                 error = -ENOSPC;
    1143    50340222 :         xfs_perag_put(pag);
    1144    50340222 :         return error;
    1145             : }
    1146             : 
    1147             : /*
    1148             :  * Remap the given extent into the file.  The dmap blockcount will be set to
    1149             :  * the number of blocks that were actually remapped.
                     :  *
                     :  * ip is the file being modified and dmap describes the mapping to install;
                     :  * dmap->br_startoff is used as the file offset within ip.  If the end of
                     :  * the remapped extent, capped at new_isize, lies beyond the current in-core
                     :  * size of ip, the in-core size and i_disk_size are raised to it.
                     :  *
                     :  * The transaction is cancelled and 0 is returned, leaving the file's
                     :  * mapping alone, when the existing mapping has the same start block and
                     :  * state as dmap, or when both mappings are unwritten.
                     :  *
                     :  * Returns 0 or an error; errors are also sent to the
                     :  * xfs_reflink_remap_extent_error tracepoint.
    1150             :  */
    1151             : STATIC int
    1152   165712160 : xfs_reflink_remap_extent(
    1153             :         struct xfs_inode        *ip,
    1154             :         struct xfs_bmbt_irec    *dmap,
    1155             :         xfs_off_t               new_isize)
    1156             : {
    1157   165712160 :         struct xfs_bmbt_irec    smap;
    1158   165712160 :         struct xfs_mount        *mp = ip->i_mount;
    1159   165712160 :         struct xfs_trans        *tp;
    1160   165712160 :         xfs_off_t               newlen;
    1161   165712160 :         int64_t                 qdelta = 0;
    1162   165712160 :         unsigned int            resblks;
    1163   165712160 :         bool                    quota_reserved = true;
    1164   165712160 :         bool                    smap_real;
    1165   165712160 :         bool                    dmap_written = xfs_bmap_is_written_extent(dmap);
    1166   165712160 :         int                     iext_delta = 0;
    1167   165712160 :         int                     nimaps;
    1168   165712160 :         int                     error;
    1169             : 
    1170             :         /*
    1171             :          * Start a rolling transaction to switch the mappings.
    1172             :          *
    1173             :          * Adding a written extent to the extent map can cause a bmbt split,
    1174             :          * and removing a mapped extent from the extent map can cause a bmbt
                     :          * split.
    1175             :          * The two operations cannot both cause a split since they operate on
    1176             :          * the same index in the bmap btree, so we only need a reservation for
    1177             :          * one bmbt split if either thing is happening.  However, we haven't
    1178             :          * locked the inode yet, so we reserve assuming this is the case.
    1179             :          *
    1180             :          * The first allocation call tries to reserve enough space to handle
    1181             :          * mapping dmap into a sparse part of the file plus the bmbt split.  We
    1182             :          * haven't locked the inode or read the existing mapping yet, so we do
    1183             :          * not know for sure that we need the space.  This should succeed most
    1184             :          * of the time.
    1185             :          *
    1186             :          * If the first attempt fails, try again but reserving only enough
    1187             :          * space to handle a bmbt split.  This is the hard minimum requirement,
    1188             :          * and we revisit quota reservations later when we know more about what
    1189             :          * we're remapping.
    1190             :          */
    1191   165712160 :         resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
    1192   165712160 :         error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
    1193   165712160 :                         resblks + dmap->br_blockcount, 0, false, &tp);
    1194   165714257 :         if (error == -EDQUOT || error == -ENOSPC) {
    1195     1994090 :                 quota_reserved = false;
    1196     1994090 :                 error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
    1197             :                                 resblks, 0, false, &tp);
    1198             :         }
    1199   165713998 :         if (error)
    1200      484523 :                 goto out;
    1201             : 
    1202             :         /*
    1203             :          * Read what's currently mapped in the destination file into smap.
    1204             :          * If smap isn't a hole, we will have to remove it before we can add
    1205             :          * dmap to the destination file.
    1206             :          */
    1207   165229475 :         nimaps = 1;
    1208   165229475 :         error = xfs_bmapi_read(ip, dmap->br_startoff, dmap->br_blockcount,
    1209             :                         &smap, &nimaps, 0);
    1210   165228267 :         if (error)
    1211          12 :                 goto out_cancel;
    1212   165228255 :         ASSERT(nimaps == 1 && smap.br_startoff == dmap->br_startoff);
    1213   165228255 :         smap_real = xfs_bmap_is_real_extent(&smap);
    1214             : 
    1215             :         /*
    1216             :          * We can only remap as many blocks as the smaller of the two extent
    1217             :          * maps, because we can only remap one extent at a time.
    1218             :          */
    1219   165228255 :         dmap->br_blockcount = min(dmap->br_blockcount, smap.br_blockcount);
    1220   165228255 :         ASSERT(dmap->br_blockcount == smap.br_blockcount);
    1221             : 
    1222   165228255 :         trace_xfs_reflink_remap_extent_dest(ip, &smap);
    1223             : 
    1224             :         /*
    1225             :          * Two extents mapped to the same physical block must not have
    1226             :          * different states; that's filesystem corruption.  Move on to the next
    1227             :          * extent if they're both holes or both the same physical extent.
    1228             :          */
    1229   165227852 :         if (dmap->br_startblock == smap.br_startblock) {
    1230   109760420 :                 if (dmap->br_state != smap.br_state) {
    1231           0 :                         xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
    1232           0 :                         error = -EFSCORRUPTED;
    1233             :                 }
    1234   109760420 :                 goto out_cancel;
    1235             :         }
    1236             : 
    1237             :         /* If both extents are unwritten, leave them alone. */
    1238    55467432 :         if (dmap->br_state == XFS_EXT_UNWRITTEN &&
    1239     2413582 :             smap.br_state == XFS_EXT_UNWRITTEN)
    1240       76884 :                 goto out_cancel;
    1241             : 
    1242             :         /* No reflinking if the AG of the dest mapping is low on space. */
    1243    55390548 :         if (dmap_written) {
    1244    50340235 :                 error = xfs_reflink_ag_has_free_space(mp,
    1245    50340235 :                                 XFS_FSB_TO_AGNO(mp, dmap->br_startblock));
    1246    50340218 :                 if (error)
    1247           7 :                         goto out_cancel;
    1248             :         }
    1249             : 
    1250             :         /*
    1251             :          * Increase quota reservation if we think the quota block counter for
    1252             :          * this file could increase.
    1253             :          *
    1254             :          * If we are mapping a written extent into the file, we need to have
    1255             :          * enough quota block count reservation to handle the blocks in that
    1256             :          * extent.  We log only the delta to the quota block counts, so if the
    1257             :          * extent we're unmapping also has blocks allocated to it, we don't
    1258             :          * need a quota reservation for the extent itself.
    1259             :          *
    1260             :          * Note that if we're replacing a delalloc reservation with a written
    1261             :          * extent, we have to take the full quota reservation because removing
    1262             :          * the delalloc reservation gives the block count back to the quota
    1263             :          * count.  This is suboptimal, but the VFS flushed the dest range
    1264             :          * before we started.  That should have removed all the delalloc
    1265             :          * reservations, but we code defensively.
    1266             :          *
    1267             :          * xfs_trans_alloc_inode above already tried to grab an even larger
    1268             :          * quota reservation, and kicked off a blockgc scan if it couldn't.
    1269             :          * If we can't get a potentially smaller quota reservation now, we're
    1270             :          * done.
    1271             :          */
    1272    55390524 :         if (!quota_reserved && !smap_real && dmap_written) {
    1273       16637 :                 error = xfs_trans_reserve_quota_nblks(tp, ip,
    1274       16637 :                                 dmap->br_blockcount, 0, false);
    1275       16637 :                 if (error)
    1276           0 :                         goto out_cancel;
    1277             :         }
    1278             : 
    1279    55390524 :         if (smap_real)
    1280     3040326 :                 ++iext_delta;
    1281             : 
    1282    55390524 :         if (dmap_written)
    1283    50340215 :                 ++iext_delta;
    1284             : 
    1285    55390524 :         error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, iext_delta);
    1286    55390469 :         if (error == -EFBIG)
    1287           6 :                 error = xfs_iext_count_upgrade(tp, ip, iext_delta);
    1288    55390469 :         if (error)
    1289           6 :                 goto out_cancel;
    1290             : 
    1291    55390463 :         if (smap_real) {
    1292             :                 /*
    1293             :                  * If the extent we're unmapping is backed by storage (written
    1294             :                  * or not), unmap the extent and drop its refcount.
    1295             :                  */
    1296     3040301 :                 xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &smap);
    1297     3040295 :                 xfs_refcount_decrease_extent(tp, &smap);
    1298     3040306 :                 qdelta -= smap.br_blockcount;
    1299    52350162 :         } else if (smap.br_startblock == DELAYSTARTBLOCK) {
    1300          88 :                 int             done;
    1301             : 
    1302             :                 /*
    1303             :                  * If the extent we're unmapping is a delalloc reservation,
    1304             :                  * we can use the regular bunmapi function to release the
    1305             :                  * incore state.  Dropping the delalloc reservation takes care
    1306             :                  * of the quota reservation for us.
    1307             :                  */
    1308          88 :                 error = xfs_bunmapi(NULL, ip, smap.br_startoff,
    1309             :                                 smap.br_blockcount, 0, 1, &done);
    1310          88 :                 if (error)
    1311           0 :                         goto out_cancel;
    1312          88 :                 ASSERT(done);
    1313             :         }
    1314             : 
    1315             :         /*
    1316             :          * If the extent we're sharing is backed by written storage, increase
    1317             :          * its refcount and map it into the file.
    1318             :          */
    1319    55390468 :         if (dmap_written) {
    1320    50340210 :                 xfs_refcount_increase_extent(tp, dmap);
    1321    50340207 :                 xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, dmap);
    1322    50340214 :                 qdelta += dmap->br_blockcount;
    1323             :         }
    1324             : 
    1325    55390472 :         xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, qdelta);
    1326             : 
    1327             :         /* Update dest isize if needed. */
    1328    55390498 :         newlen = XFS_FSB_TO_B(mp, dmap->br_startoff + dmap->br_blockcount);
    1329    55390498 :         newlen = min_t(xfs_off_t, newlen, new_isize);
    1330    55390498 :         if (newlen > i_size_read(VFS_I(ip))) {
    1331    49598409 :                 trace_xfs_reflink_update_inode_size(ip, newlen);
    1332    49598409 :                 i_size_write(VFS_I(ip), newlen);
    1333    49598409 :                 ip->i_disk_size = newlen;
    1334    49598409 :                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
    1335             :         }
    1336             : 
    1337             :         /* Commit everything and unlock. */
    1338    55390498 :         error = xfs_trans_commit(tp);
    1339    55390544 :         goto out_unlock;
    1340             : 
    1341   109837329 : out_cancel:
    1342   109837329 :         xfs_trans_cancel(tp);
    1343   165227473 : out_unlock:
    1344   165227473 :         xfs_iunlock(ip, XFS_ILOCK_EXCL);
    1345   165710653 : out:
    1346   165710653 :         if (error)
    1347      485319 :                 trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
    1348   165710652 :         return error;
    1349             : }
    1350             : 
    1351             : /*
                     :  * Remap a range of one file to the other.
                     :  *
                     :  * Starting at pos_in in src and pos_out in dest, read one source mapping
                     :  * at a time (under the src data-map lock) and hand it to
                     :  * xfs_reflink_remap_extent() until the block count derived from remap_len
                     :  * is exhausted, an error occurs, or a fatal signal is pending (-EINTR).
                     :  * A delalloc mapping found in the source range is reported as
                     :  * -EFSCORRUPTED.
                     :  *
                     :  * *remapped is always written, even on error: it is the byte size of the
                     :  * extents accounted in the loop, capped at remap_len.  An extent that was
                     :  * remapped immediately before a pending fatal signal was noticed is not
                     :  * included in that count.
                     :  *
                     :  * Returns 0 or the first error encountered.
                     :  */
    1352             : int
    1353   105505203 : xfs_reflink_remap_blocks(
    1354             :         struct xfs_inode        *src,
    1355             :         loff_t                  pos_in,
    1356             :         struct xfs_inode        *dest,
    1357             :         loff_t                  pos_out,
    1358             :         loff_t                  remap_len,
    1359             :         loff_t                  *remapped)
    1360             : {
    1361   105505203 :         struct xfs_bmbt_irec    imap;
    1362   105505203 :         struct xfs_mount        *mp = src->i_mount;
    1363   105505203 :         xfs_fileoff_t           srcoff = XFS_B_TO_FSBT(mp, pos_in);
    1364   105505203 :         xfs_fileoff_t           destoff = XFS_B_TO_FSBT(mp, pos_out);
    1365   105505203 :         xfs_filblks_t           len;
    1366   105505203 :         xfs_filblks_t           remapped_len = 0;
    1367   105505203 :         xfs_off_t               new_isize = pos_out + remap_len;
    1368   105505203 :         int                     nimaps;
    1369   105505203 :         int                     error = 0;
    1370             : 
    1371   105505203 :         len = min_t(xfs_filblks_t, XFS_B_TO_FSB(mp, remap_len),
    1372             :                         XFS_MAX_FILEOFF);
    1373             : 
    1374   105505203 :         trace_xfs_reflink_remap_blocks(src, srcoff, len, dest, destoff);
    1375             : 
    1376   270731735 :         while (len > 0) {
    1377   165712207 :                 unsigned int    lock_mode;
    1378             : 
    1379             :                 /* Read extent from the source file */
    1380   165712207 :                 nimaps = 1;
    1381   165712207 :                 lock_mode = xfs_ilock_data_map_shared(src);
    1382   165711878 :                 error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0);
    1383   165712409 :                 xfs_iunlock(src, lock_mode);
    1384   165713462 :                 if (error)
    1385             :                         break;
    1386             :                 /*
    1387             :                  * The caller supposedly flushed all dirty pages in the source
    1388             :                  * file range, which means that writeback should have allocated
    1389             :                  * or deleted all delalloc reservations in that range.  If we
    1390             :                  * find one, that's a good sign that something is seriously
    1391             :                  * wrong here.
    1392             :                  */
    1393   165713382 :                 ASSERT(nimaps == 1 && imap.br_startoff == srcoff);
    1394   165713382 :                 if (imap.br_startblock == DELAYSTARTBLOCK) {
    1395           0 :                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
    1396           0 :                         xfs_bmap_mark_sick(src, XFS_DATA_FORK);
    1397           0 :                         error = -EFSCORRUPTED;
    1398           0 :                         break;
    1399             :                 }
    1400             : 
    1401   165713382 :                 trace_xfs_reflink_remap_extent_src(src, &imap);
    1402             : 
    1403             :                 /* Remap into the destination file at the given offset. */
    1404   165713168 :                 imap.br_startoff = destoff;
    1405   165713168 :                 error = xfs_reflink_remap_extent(dest, &imap, new_isize);
    1406   165711802 :                 if (error)
    1407             :                         break;
    1408             : 
    1409   165226488 :                 if (fatal_signal_pending(current)) {
    1410             :                         error = -EINTR;
    1411             :                         break;
    1412             :                 }
    1413             : 
    1414             :                 /*
                     :                  * Advance both file offsets by the block count that
                     :                  * xfs_reflink_remap_extent() left in imap.
                     :                  */
    1415   165226532 :                 srcoff += imap.br_blockcount;
    1416   165226532 :                 destoff += imap.br_blockcount;
    1417   165226532 :                 len -= imap.br_blockcount;
    1418   165226532 :                 remapped_len += imap.br_blockcount;
    1419             :         }
    1420             : 
    1421   105505889 :         if (error)
    1422      485544 :                 trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_);
    1423   105505890 :         *remapped = min_t(loff_t, remap_len,
    1424             :                           XFS_FSB_TO_B(src->i_mount, remapped_len));
    1425   105505890 :         return error;
    1426             : }
    1427             : 
    1428             : /*
    1429             :  * If we're reflinking to a point past the destination file's EOF, we must
    1430             :  * zero any speculative post-EOF preallocations that sit between the old EOF
    1431             :  * and the destination file offset.
                     :  *
                     :  * Returns 0 without doing anything when pos is at or before the current
                     :  * in-core size of ip; otherwise returns the result of xfs_zero_range()
                     :  * called on the pos - isize bytes that start at the current size.
    1432             :  */
    1433             : static int
    1434   105505589 : xfs_reflink_zero_posteof(
    1435             :         struct xfs_inode        *ip,
    1436             :         loff_t                  pos)
    1437             : {
    1438   105505589 :         loff_t                  isize = i_size_read(VFS_I(ip));
    1439             : 
    1440   105505589 :         if (pos <= isize)
    1441             :                 return 0;
    1442             : 
    1443     1883762 :         trace_xfs_zero_eof(ip, isize, pos - isize);
    1444     1883761 :         return xfs_zero_range(ip, isize, pos - isize, NULL);
    1445             : }
    1446             : 
    1447             : /*
    1448             :  * Prepare two files for range cloning.  Upon a successful return both inodes
    1449             :  * will have the iolock and mmaplock held, the page cache of the out file will
    1450             :  * be truncated, and any leases on the out file will have been broken.  This
    1451             :  * function borrows heavily from xfs_file_aio_write_checks.
    1452             :  *
    1453             :  * The VFS allows partial EOF blocks to "match" for dedupe even though it hasn't
    1454             :  * checked that the bytes beyond EOF physically match. Hence we cannot use the
    1455             :  * EOF block in the source dedupe range because it's not a complete block match,
    1456             :  * hence can introduce a corruption into the file that has its block replaced.
    1457             :  *
    1458             :  * In similar fashion, the VFS file cloning also allows partial EOF blocks to be
    1459             :  * "block aligned" for the purposes of cloning entire files.  However, if the
    1460             :  * source file range includes the EOF block and it lands within the existing EOF
    1461             :  * of the destination file, then we can expose stale data from beyond the source
    1462             :  * file EOF in the destination file.
    1463             :  *
    1464             :  * XFS doesn't support partial block sharing, so in both cases we have to check
    1465             :  * these cases ourselves. For dedupe, we can simply round the length to dedupe
    1466             :  * down to the previous whole block and ignore the partial EOF block. While this
    1467             :  * means we can't dedupe the last block of a file, this is an acceptable
    1468             :  * tradeoff for simplicity of implementation.
    1469             :  *
    1470             :  * For cloning, we want to share the partial EOF block if it is also the new EOF
    1471             :  * block of the destination file. If the partial EOF block lies inside the
    1472             :  * existing destination EOF, then we have to abort the clone to avoid exposing
    1473             :  * stale data in the destination file. Hence we reject these clone attempts with
    1474             :  * -EINVAL in this case.
                     :  *
                     :  * Locking on return: the locks taken by xfs_ilock2_io_mmap() stay held only
                     :  * when this function returns 0 with a nonzero *len.  On any error, and also
                     :  * when the range-prep helper succeeds but leaves *len at zero (in which case
                     :  * 0 is returned), both inodes are unlocked via xfs_iunlock2_io_mmap() before
                     :  * returning, so callers must check *len as well as the return value.
    1475             :  */
    1476             : int
    1477   137240698 : xfs_reflink_remap_prep(
    1478             :         struct file             *file_in,
    1479             :         loff_t                  pos_in,
    1480             :         struct file             *file_out,
    1481             :         loff_t                  pos_out,
    1482             :         loff_t                  *len,
    1483             :         unsigned int            remap_flags)
    1484             : {
    1485   137240698 :         struct inode            *inode_in = file_inode(file_in);
    1486   137240698 :         struct xfs_inode        *src = XFS_I(inode_in);
    1487   137240698 :         struct inode            *inode_out = file_inode(file_out);
    1488   137240698 :         struct xfs_inode        *dest = XFS_I(inode_out);
    1489   137240698 :         int                     ret;
    1490             : 
    1491             :         /* Lock both files against IO */
    1492   137240698 :         ret = xfs_ilock2_io_mmap(src, dest);
    1493   137247934 :         if (ret)
    1494             :                 return ret;
    1495             : 
    1496             :         /* Check file eligibility and prepare for block sharing. */
    1497   137247934 :         ret = -EINVAL;
    1498             :         /* Don't reflink realtime inodes */
    1499   137247934 :         if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
    1500           0 :                 goto out_unlock;
    1501             : 
    1502             :         /* Don't share DAX file data with non-DAX file. */
    1503   137247934 :         if (IS_DAX(inode_in) != IS_DAX(inode_out))
    1504             :                 goto out_unlock;
    1505             : 
    1506   137247934 :         if (!IS_DAX(inode_in))
    1507   137247934 :                 ret = generic_remap_file_range_prep(file_in, pos_in, file_out,
    1508             :                                 pos_out, len, remap_flags);
    1509             :         else
    1510             :                 ret = dax_remap_file_range_prep(file_in, pos_in, file_out,
    1511             :                                 pos_out, len, remap_flags, &xfs_read_iomap_ops);
    1512   137248692 :         if (ret || *len == 0)
    1513    31740507 :                 goto out_unlock;
    1514             : 
    1515             :         /* Attach dquots to dest inode before changing block map */
    1516   105508185 :         ret = xfs_qm_dqattach(dest);
    1517   105506052 :         if (ret)
    1518           0 :                 goto out_unlock;
    1519             : 
    1520             :         /*
    1521             :          * Zero existing post-eof speculative preallocations in the destination
    1522             :          * file.
    1523             :          */
    1524   105506052 :         ret = xfs_reflink_zero_posteof(dest, pos_out);
    1525   105506064 :         if (ret)
    1526         180 :                 goto out_unlock;
    1527             : 
    1528             :         /* Set the reflink inode flag on both files. */
    1529   105505884 :         ret = xfs_reflink_set_inode_flag(src, dest);
    1530   105506041 :         if (ret)
    1531           4 :                 goto out_unlock;
    1532             : 
    1533             :         /*
    1534             :          * If pos_out > EOF, we may have dirtied blocks between EOF and
    1535             :          * pos_out. In that case, we need to extend the flush and unmap to cover
    1536             :          * from EOF to the end of the copy length.
    1537             :          */
    1538   211012074 :         if (pos_out > XFS_ISIZE(dest)) {
    1539     1882333 :                 loff_t  flen = *len + (pos_out - XFS_ISIZE(dest));
    1540     1882333 :                 ret = xfs_flush_unmap_range(dest, XFS_ISIZE(dest), flen);
    1541             :         } else {
    1542   103623704 :                 ret = xfs_flush_unmap_range(dest, pos_out, *len);
    1543             :         }
    1544   105505970 :         if (ret)
    1545        1017 :                 goto out_unlock;
    1546             : 
    1547             :         return 0;
    1548    31741708 : out_unlock:
    1549    31741708 :         xfs_iunlock2_io_mmap(src, dest);
    1550    31741708 :         return ret;
    1551             : }
    1552             : 
    1553             : /*
                     :  * Does this inode need the reflink flag?
                     :  *
                     :  * Loads the data fork extent list of ip and walks it from file offset 0.
                     :  * Extents with a null start block or a state other than XFS_EXT_NORM are
                     :  * skipped; every other extent is passed to xfs_reflink_find_shared() for
                     :  * its AG.  *has_shared is set to true, and the walk stops, at the first
                     :  * extent for which a shared block is reported (rbno != NULLAGBLOCK); it is
                     :  * left false if the walk completes without finding one.
                     :  *
                     :  * Returns 0 or the error from xfs_iread_extents() or
                     :  * xfs_reflink_find_shared().  *has_shared is not written at all if
                     :  * xfs_iread_extents() fails.
                     :  */
    1554             : int
    1555    15294329 : xfs_reflink_inode_has_shared_extents(
    1556             :         struct xfs_trans                *tp,
    1557             :         struct xfs_inode                *ip,
    1558             :         bool                            *has_shared)
    1559             : {
    1560    15294329 :         struct xfs_bmbt_irec            got;
    1561    15294329 :         struct xfs_mount                *mp = ip->i_mount;
    1562    15294329 :         struct xfs_ifork                *ifp;
    1563    15294329 :         struct xfs_iext_cursor          icur;
    1564    15294329 :         bool                            found;
    1565    15294329 :         int                             error;
    1566             : 
    1567    15294329 :         ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
    1568    15294329 :         error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
    1569    15294442 :         if (error)
    1570             :                 return error;
    1571             : 
    1572    15294355 :         *has_shared = false;
    1573    15294355 :         found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got);
    1574   274710998 :         while (found) {
    1575   259981498 :                 struct xfs_perag        *pag;
    1576   259981498 :                 xfs_agblock_t           agbno;
    1577   259981498 :                 xfs_extlen_t            aglen;
    1578   259981498 :                 xfs_agblock_t           rbno;
    1579   259981498 :                 xfs_extlen_t            rlen;
    1580             : 
    1581   259981498 :                 if (isnullstartblock(got.br_startblock) ||
    1582   259979090 :                     got.br_state != XFS_EXT_NORM)
    1583     5711013 :                         goto next;
    1584             : 
    1585   254270485 :                 pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, got.br_startblock));
    1586   254271507 :                 agbno = XFS_FSB_TO_AGBNO(mp, got.br_startblock);
    1587   254271507 :                 aglen = got.br_blockcount;
    1588   254271507 :                 error = xfs_reflink_find_shared(pag, tp, agbno, aglen,
    1589             :                                 &rbno, &rlen, false);
    1590   254272692 :                 xfs_perag_put(pag);
    1591   254273558 :                 if (error)
    1592      565014 :                         return error;
    1593             : 
    1594             :                 /* Is there still a shared block here? */
    1595   254273558 :                 if (rbno != NULLAGBLOCK) {
    1596      565014 :                         *has_shared = true;
    1597      565014 :                         return 0;
    1598             :                 }
    1599   253708544 : next:
    1600   259419557 :                 found = xfs_iext_next_extent(ifp, &icur, &got);
    1601             :         }
    1602             : 
    1603             :         return 0;
    1604             : }
    1605             : 
    1606             : /*
    1607             :  * Clear the inode reflink flag if there are no shared extents.
    1608             :  *
    1609             :  * The caller is responsible for joining the inode to the transaction passed in.
    1610             :  * The inode will be joined to the transaction that is returned to the caller.
    1611             :  */
    1612             : int
    1613        9280 : xfs_reflink_clear_inode_flag(
    1614             :         struct xfs_inode        *ip,
    1615             :         struct xfs_trans        **tpp)
    1616             : {
    1617        9280 :         bool                    needs_flag;
    1618        9280 :         int                     error = 0;
    1619             : 
    1620        9280 :         ASSERT(xfs_is_reflink_inode(ip));
    1621             : 
    1622        9280 :         error = xfs_reflink_inode_has_shared_extents(*tpp, ip, &needs_flag);
    1623        9279 :         if (error || needs_flag)
    1624             :                 return error;
    1625             : 
    1626             :         /*
    1627             :          * We didn't find any shared blocks so turn off the reflink flag.
    1628             :          * First, get rid of any leftover CoW mappings.
    1629             :          */
    1630        6552 :         error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, XFS_MAX_FILEOFF,
    1631             :                         true);
    1632        6552 :         if (error)
    1633             :                 return error;
    1634             : 
    1635             :         /* Clear the inode flag. */
    1636        6552 :         trace_xfs_reflink_unset_inode_flag(ip);
    1637        6552 :         ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
    1638        6552 :         xfs_inode_clear_cowblocks_tag(ip);
    1639        6552 :         xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
    1640             : 
    1641        6552 :         return error;
    1642             : }
    1643             : 
    1644             : /*
    1645             :  * Clear the inode reflink flag if there are no shared extents and the size
    1646             :  * hasn't changed.
    1647             :  */
    1648             : STATIC int
    1649          38 : xfs_reflink_try_clear_inode_flag(
    1650             :         struct xfs_inode        *ip)
    1651             : {
    1652          38 :         struct xfs_mount        *mp = ip->i_mount;
    1653          38 :         struct xfs_trans        *tp;
    1654          38 :         int                     error = 0;
    1655             : 
    1656             :         /* Start a rolling transaction to remove the mappings */
    1657          38 :         error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
    1658          38 :         if (error)
    1659             :                 return error;
    1660             : 
    1661          38 :         xfs_ilock(ip, XFS_ILOCK_EXCL);
    1662          38 :         xfs_trans_ijoin(tp, ip, 0);
    1663             : 
    1664          38 :         error = xfs_reflink_clear_inode_flag(ip, &tp);
    1665          38 :         if (error)
    1666           0 :                 goto cancel;
    1667             : 
    1668          38 :         error = xfs_trans_commit(tp);
    1669          38 :         if (error)
    1670           0 :                 goto out;
    1671             : 
    1672          38 :         xfs_iunlock(ip, XFS_ILOCK_EXCL);
    1673          38 :         return 0;
    1674             : cancel:
    1675           0 :         xfs_trans_cancel(tp);
    1676           0 : out:
    1677           0 :         xfs_iunlock(ip, XFS_ILOCK_EXCL);
    1678           0 :         return error;
    1679             : }
    1680             : 
    1681             : /*
    1682             :  * Pre-COW all shared blocks within a given byte range of a file and turn off
    1683             :  * the reflink flag if we unshare all of the file's blocks.
    1684             :  */
    1685             : int
    1686          66 : xfs_reflink_unshare(
    1687             :         struct xfs_inode        *ip,
    1688             :         xfs_off_t               offset,
    1689             :         xfs_off_t               len)
    1690             : {
    1691          66 :         struct inode            *inode = VFS_I(ip);
    1692          66 :         int                     error;
    1693             : 
    1694          66 :         if (!xfs_is_reflink_inode(ip))
    1695             :                 return 0;
    1696             : 
    1697          40 :         trace_xfs_reflink_unshare(ip, offset, len);
    1698             : 
    1699          40 :         inode_dio_wait(inode);
    1700             : 
    1701          40 :         if (IS_DAX(inode))
    1702             :                 error = dax_file_unshare(inode, offset, len,
    1703             :                                 &xfs_dax_write_iomap_ops);
    1704             :         else
    1705          40 :                 error = iomap_file_unshare(inode, offset, len,
    1706             :                                 &xfs_buffered_write_iomap_ops);
    1707          40 :         if (error)
    1708           0 :                 goto out;
    1709             : 
    1710          40 :         error = filemap_write_and_wait_range(inode->i_mapping, offset,
    1711          40 :                         offset + len - 1);
    1712          40 :         if (error)
    1713           2 :                 goto out;
    1714             : 
    1715             :         /* Turn off the reflink flag if possible. */
    1716          38 :         error = xfs_reflink_try_clear_inode_flag(ip);
    1717          38 :         if (error)
    1718           0 :                 goto out;
    1719             :         return 0;
    1720             : 
    1721           2 : out:
    1722           2 :         trace_xfs_reflink_unshare_error(ip, error, _RET_IP_);
    1723           2 :         return error;
    1724             : }

Generated by: LCOV version 1.14