LCOV - code coverage report
Current view: top level - fs/xfs - xfs_reflink.c (source / functions) Hit Total Coverage
Test: fstests of 6.5.0-rc4-xfsa @ Mon Jul 31 20:08:27 PDT 2023 Lines: 678 789 85.9 %
Date: 2023-07-31 20:08:27 Functions: 28 30 93.3 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0+
       2             : /*
       3             :  * Copyright (C) 2016 Oracle.  All Rights Reserved.
       4             :  * Author: Darrick J. Wong <darrick.wong@oracle.com>
       5             :  */
       6             : #include "xfs.h"
       7             : #include "xfs_fs.h"
       8             : #include "xfs_shared.h"
       9             : #include "xfs_format.h"
      10             : #include "xfs_log_format.h"
      11             : #include "xfs_trans_resv.h"
      12             : #include "xfs_mount.h"
      13             : #include "xfs_defer.h"
      14             : #include "xfs_inode.h"
      15             : #include "xfs_trans.h"
      16             : #include "xfs_bmap.h"
      17             : #include "xfs_bmap_util.h"
      18             : #include "xfs_trace.h"
      19             : #include "xfs_icache.h"
      20             : #include "xfs_btree.h"
      21             : #include "xfs_refcount_btree.h"
      22             : #include "xfs_refcount.h"
      23             : #include "xfs_bmap_btree.h"
      24             : #include "xfs_trans_space.h"
      25             : #include "xfs_bit.h"
      26             : #include "xfs_alloc.h"
      27             : #include "xfs_quota.h"
      28             : #include "xfs_reflink.h"
      29             : #include "xfs_iomap.h"
      30             : #include "xfs_ag.h"
      31             : #include "xfs_ag_resv.h"
      32             : #include "xfs_health.h"
      33             : #include "xfs_rtrefcount_btree.h"
      34             : #include "xfs_rtalloc.h"
      35             : #include "xfs_rtgroup.h"
      36             : #include "xfs_imeta.h"
      37             : #include "xfs_rtbitmap.h"
      38             : 
      39             : /*
      40             :  * Copy on Write of Shared Blocks
      41             :  *
      42             :  * XFS must preserve "the usual" file semantics even when two files share
      43             :  * the same physical blocks.  This means that a write to one file must not
      44             :  * alter the blocks in a different file; the way that we'll do that is
      45             :  * through the use of a copy-on-write mechanism.  At a high level, that
      46             :  * means that when we want to write to a shared block, we allocate a new
      47             :  * block, write the data to the new block, and if that succeeds we map the
      48             :  * new block into the file.
      49             :  *
      50             :  * XFS provides a "delayed allocation" mechanism that defers the allocation
      51             :  * of disk blocks to dirty-but-not-yet-mapped file blocks as long as
      52             :  * possible.  This reduces fragmentation by enabling the filesystem to ask
      53             :  * for bigger chunks less often, which is exactly what we want for CoW.
      54             :  *
      55             :  * The delalloc mechanism begins when the kernel wants to make a block
      56             :  * writable (write_begin or page_mkwrite).  If the offset is not mapped, we
      57             :  * create a delalloc mapping, which is a regular in-core extent, but without
      58             :  * a real startblock.  (For delalloc mappings, the startblock encodes both
      59             :  * a flag that this is a delalloc mapping, and a worst-case estimate of how
      60             :  * many blocks might be required to put the mapping into the BMBT.)  delalloc
      61             :  * mappings are a reservation against the free space in the filesystem;
      62             :  * adjacent mappings can also be combined into fewer larger mappings.
      63             :  *
      64             :  * As an optimization, the CoW extent size hint (cowextsz) creates
      65             :  * outsized aligned delalloc reservations in the hope of landing out of
      66             :  * order nearby CoW writes in a single extent on disk, thereby reducing
      67             :  * fragmentation and improving future performance.
      68             :  *
      69             :  * D: --RRRRRRSSSRRRRRRRR--- (data fork)
      70             :  * C: ------DDDDDDD--------- (CoW fork)
      71             :  *
      72             :  * When dirty pages are being written out (typically in writepage), the
      73             :  * delalloc reservations are converted into unwritten mappings by
      74             :  * allocating blocks and replacing the delalloc mapping with real ones.
      75             :  * A delalloc mapping can be replaced by several unwritten ones if the
      76             :  * free space is fragmented.
      77             :  *
      78             :  * D: --RRRRRRSSSRRRRRRRR---
      79             :  * C: ------UUUUUUU---------
      80             :  *
      81             :  * We want to adapt the delalloc mechanism for copy-on-write, since the
      82             :  * write paths are similar.  The first two steps (creating the reservation
      83             :  * and allocating the blocks) are exactly the same as delalloc except that
      84             :  * the mappings must be stored in a separate CoW fork because we do not want
      85             :  * to disturb the mapping in the data fork until we're sure that the write
      86             :  * succeeded.  IO completion in this case is the process of removing the old
      87             :  * mapping from the data fork and moving the new mapping from the CoW fork to
      88             :  * the data fork.  This will be discussed shortly.
      89             :  *
      90             :  * For now, unaligned directio writes will be bounced back to the page cache.
      91             :  * Block-aligned directio writes will use the same mechanism as buffered
      92             :  * writes.
      93             :  *
      94             :  * Just prior to submitting the actual disk write requests, we convert
      95             :  * the extents representing the range of the file actually being written
      96             :  * (as opposed to extra pieces created for the cowextsize hint) to real
      97             :  * extents.  This will become important in the next step:
      98             :  *
      99             :  * D: --RRRRRRSSSRRRRRRRR---
     100             :  * C: ------UUrrUUU---------
     101             :  *
     102             :  * CoW remapping must be done after the data block write completes,
     103             :  * because we don't want to destroy the old data fork map until we're sure
     104             :  * the new block has been written.  Since the new mappings are kept in a
     105             :  * separate fork, we can simply iterate these mappings to find the ones
     106             :  * that cover the file blocks that we just CoW'd.  For each extent, simply
     107             :  * unmap the corresponding range in the data fork, map the new range into
     108             :  * the data fork, and remove the extent from the CoW fork.  Because of
     109             :  * the presence of the cowextsize hint, however, we must be careful
     110             :  * only to remap the blocks that we've actually written out --  we must
     111             :  * never remap delalloc reservations nor CoW staging blocks that have
     112             :  * yet to be written.  This corresponds exactly to the real extents in
     113             :  * the CoW fork:
     114             :  *
     115             :  * D: --RRRRRRrrSRRRRRRRR---
     116             :  * C: ------UU--UUU---------
     117             :  *
     118             :  * Since the remapping operation can be applied to an arbitrary file
     119             :  * range, we record the need for the remap step as a flag in the ioend
     120             :  * instead of declaring a new IO type.  This is required for direct io
      121             :  * because we only have one ioend for the whole dio, and we have to be able to
     122             :  * remember the presence of unwritten blocks and CoW blocks with a single
     123             :  * ioend structure.  Better yet, the more ground we can cover with one
     124             :  * ioend, the better.
     125             :  */
     126             : 
     127             : /*
     128             :  * Given an AG extent, find the lowest-numbered run of shared blocks
     129             :  * within that range and return the range in fbno/flen.  If
     130             :  * find_end_of_shared is true, return the longest contiguous extent of
     131             :  * shared blocks.  If there are no shared extents, fbno and flen will
     132             :  * be set to NULLAGBLOCK and 0, respectively.
     133             :  */
     134             : static int
     135   794556659 : xfs_reflink_find_shared(
     136             :         struct xfs_perag        *pag,
     137             :         struct xfs_trans        *tp,
     138             :         xfs_agblock_t           agbno,
     139             :         xfs_extlen_t            aglen,
     140             :         xfs_agblock_t           *fbno,
     141             :         xfs_extlen_t            *flen,
     142             :         bool                    find_end_of_shared)
     143             : {
     144   794556659 :         struct xfs_buf          *agbp;
     145   794556659 :         struct xfs_btree_cur    *cur;
     146   794556659 :         int                     error;
     147             : 
     148   794556659 :         error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
     149   794559022 :         if (error)
     150             :                 return error;
     151             : 
     152   794559046 :         cur = xfs_refcountbt_init_cursor(pag->pag_mount, tp, agbp, pag);
     153             : 
     154   794556187 :         error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen,
     155             :                         find_end_of_shared);
     156             : 
     157   794554503 :         xfs_btree_del_cursor(cur, error);
     158             : 
     159   794560635 :         xfs_trans_brelse(tp, agbp);
     160   794560635 :         return error;
     161             : }
     162             : 
     163             : /*
     164             :  * Given an RT extent, find the lowest-numbered run of shared blocks
     165             :  * within that range and return the range in fbno/flen.  If
     166             :  * find_end_of_shared is true, return the longest contiguous extent of
     167             :  * shared blocks.  If there are no shared extents, fbno and flen will
     168             :  * be set to NULLRGBLOCK and 0, respectively.
     169             :  */
     170             : static int
     171    13464042 : xfs_reflink_find_rtshared(
     172             :         struct xfs_rtgroup      *rtg,
     173             :         struct xfs_trans        *tp,
     174             :         xfs_agblock_t           rtbno,
     175             :         xfs_extlen_t            rtlen,
     176             :         xfs_agblock_t           *fbno,
     177             :         xfs_extlen_t            *flen,
     178             :         bool                    find_end_of_shared)
     179             : {
     180    13464042 :         struct xfs_mount        *mp = rtg->rtg_mount;
     181    13464042 :         struct xfs_btree_cur    *cur;
     182    13464042 :         int                     error;
     183             : 
     184    13464042 :         BUILD_BUG_ON(NULLRGBLOCK != NULLAGBLOCK);
     185             : 
     186    13464042 :         xfs_rtgroup_lock(NULL, rtg, XFS_RTGLOCK_REFCOUNT);
     187    13464564 :         cur = xfs_rtrefcountbt_init_cursor(mp, tp, rtg, rtg->rtg_refcountip);
     188    13464118 :         error = xfs_refcount_find_shared(cur, rtbno, rtlen, fbno, flen,
     189             :                         find_end_of_shared);
     190    13464493 :         xfs_btree_del_cursor(cur, error);
     191    13464869 :         xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_REFCOUNT);
     192    13464739 :         return error;
     193             : }
     194             : 
     195             : /*
     196             :  * Trim the mapping to the next block where there's a change in the
     197             :  * shared/unshared status.  More specifically, this means that we
     198             :  * find the lowest-numbered extent of shared blocks that coincides with
     199             :  * the given block mapping.  If the shared extent overlaps the start of
     200             :  * the mapping, trim the mapping to the end of the shared extent.  If
      201             :  * the shared region begins inside the mapping, trim the mapping to the
     202             :  * start of the shared extent.  If there are no shared regions that
     203             :  * overlap, just return the original extent.
     204             :  */
     205             : int
     206    36507041 : xfs_reflink_trim_around_shared(
     207             :         struct xfs_inode        *ip,
     208             :         struct xfs_bmbt_irec    *irec,
     209             :         bool                    *shared)
     210             : {
     211    36507041 :         struct xfs_mount        *mp = ip->i_mount;
     212    36507041 :         xfs_agblock_t           orig_bno;
     213    36507041 :         xfs_agblock_t           fbno;
     214    36507041 :         xfs_extlen_t            flen;
     215    36507041 :         int                     error = 0;
     216             : 
     217             :         /* Holes, unwritten, and delalloc extents cannot be shared */
     218    36507041 :         if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) {
     219    19933716 :                 *shared = false;
     220    19933716 :                 return 0;
     221             :         }
     222             : 
     223    16573529 :         trace_xfs_reflink_trim_around_shared(ip, irec);
     224             : 
     225    22596663 :         if (XFS_IS_REALTIME_INODE(ip)) {
     226     6023139 :                 struct xfs_rtgroup      *rtg;
     227     6023139 :                 xfs_rgnumber_t          rgno;
     228             : 
     229     6023139 :                 orig_bno = xfs_rtb_to_rgbno(mp, irec->br_startblock, &rgno);
     230     6023136 :                 rtg = xfs_rtgroup_get(mp, rgno);
     231     6023126 :                 error = xfs_reflink_find_rtshared(rtg, NULL, orig_bno,
     232     6023126 :                                 irec->br_blockcount, &fbno, &flen, true);
     233     6023161 :                 xfs_rtgroup_put(rtg);
     234             :         } else {
     235    10550359 :                 struct xfs_perag        *pag;
     236             : 
     237    10550359 :                 pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp,
     238             :                                         irec->br_startblock));
     239    10550343 :                 orig_bno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock);
     240    10550343 :                 error = xfs_reflink_find_shared(pag, NULL, orig_bno,
     241    10550343 :                                 irec->br_blockcount, &fbno, &flen, true);
     242    10550397 :                 xfs_perag_put(pag);
     243             :         }
     244    16573565 :         if (error)
     245             :                 return error;
     246             : 
     247    16573470 :         *shared = false;
     248    16573470 :         if (fbno == NULLAGBLOCK) {
     249             :                 /* No shared blocks at all. */
     250             :                 return 0;
     251             :         }
     252             : 
     253     1934574 :         if (fbno == orig_bno) {
     254             :                 /*
     255             :                  * The start of this extent is shared.  Truncate the
     256             :                  * mapping at the end of the shared region so that a
     257             :                  * subsequent iteration starts at the start of the
     258             :                  * unshared region.
     259             :                  */
     260     1898568 :                 irec->br_blockcount = flen;
     261     1898568 :                 *shared = true;
     262     1898568 :                 return 0;
     263             :         }
     264             : 
     265             :         /*
     266             :          * There's a shared extent midway through this extent.
     267             :          * Truncate the mapping at the start of the shared
     268             :          * extent so that a subsequent iteration starts at the
     269             :          * start of the shared region.
     270             :          */
     271       36006 :         irec->br_blockcount = fbno - orig_bno;
     272       36006 :         return 0;
     273             : }
     274             : 
     275             : int
     276    27711787 : xfs_bmap_trim_cow(
     277             :         struct xfs_inode        *ip,
     278             :         struct xfs_bmbt_irec    *imap,
     279             :         bool                    *shared)
     280             : {
     281             :         /* We can't update any real extents in always COW mode. */
     282    27711787 :         if (xfs_is_always_cow_inode(ip) &&
     283           0 :             !isnullstartblock(imap->br_startblock)) {
     284           0 :                 *shared = true;
     285           0 :                 return 0;
     286             :         }
     287             : 
     288             :         /* Trim the mapping to the nearest shared extent boundary. */
     289    27711835 :         return xfs_reflink_trim_around_shared(ip, imap, shared);
     290             : }
     291             : 
     292             : static int
     293     2002971 : xfs_reflink_convert_cow_locked(
     294             :         struct xfs_inode        *ip,
     295             :         xfs_fileoff_t           offset_fsb,
     296             :         xfs_filblks_t           count_fsb)
     297             : {
     298     2002971 :         struct xfs_iext_cursor  icur;
     299     2002971 :         struct xfs_bmbt_irec    got;
     300     2002971 :         struct xfs_btree_cur    *dummy_cur = NULL;
     301     2002971 :         struct xfs_mount        *mp = ip->i_mount;
     302     2002971 :         int                     dummy_logflags;
     303     2002971 :         int                     error = 0;
     304             : 
     305             :         /*
     306             :          * We can only remap full rt extents, so make sure that we convert the
     307             :          * entire extent.  The caller must ensure that this is either a direct
     308             :          * write that's aligned to the rt extent size, or a buffered write for
     309             :          * which we've dirtied extra pages to make this work properly.
     310             :          */
     311     2002971 :         if (xfs_inode_needs_cow_around(ip)) {
     312           0 :                 xfs_fileoff_t   new_off;
     313             : 
     314           0 :                 new_off = xfs_rtb_rounddown_rtx(mp, offset_fsb);
     315           0 :                 count_fsb += offset_fsb - new_off;
     316           0 :                 offset_fsb = new_off;
     317             : 
     318           0 :                 count_fsb = xfs_rtb_roundup_rtx(mp, count_fsb);
     319             :         }
     320             : 
     321     2002944 :         if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
     322             :                 return 0;
     323             : 
     324     3764054 :         do {
     325     3764054 :                 if (got.br_startoff >= offset_fsb + count_fsb)
     326             :                         break;
     327     2004060 :                 if (got.br_state == XFS_EXT_NORM)
     328           6 :                         continue;
     329     2004054 :                 if (WARN_ON_ONCE(isnullstartblock(got.br_startblock)))
     330             :                         return -EIO;
     331             : 
     332     2004054 :                 xfs_trim_extent(&got, offset_fsb, count_fsb);
     333     2004046 :                 if (!got.br_blockcount)
     334           0 :                         continue;
     335             : 
     336     2004046 :                 got.br_state = XFS_EXT_NORM;
     337     2004046 :                 error = xfs_bmap_add_extent_unwritten_real(NULL, ip,
     338             :                                 XFS_COW_FORK, &icur, &dummy_cur, &got,
     339             :                                 &dummy_logflags);
     340     2003869 :                 if (error)
     341           0 :                         return error;
     342     2003875 :         } while (xfs_iext_next_extent(ip->i_cowfp, &icur, &got));
     343             : 
     344             :         return error;
     345             : }
     346             : 
     347             : /* Convert all of the unwritten CoW extents in a file's range to real ones. */
     348             : int
     349      611513 : xfs_reflink_convert_cow(
     350             :         struct xfs_inode        *ip,
     351             :         xfs_off_t               offset,
     352             :         xfs_off_t               count)
     353             : {
     354      611513 :         struct xfs_mount        *mp = ip->i_mount;
     355      611513 :         xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
     356      611513 :         xfs_fileoff_t           end_fsb = XFS_B_TO_FSB(mp, offset + count);
     357      611513 :         xfs_filblks_t           count_fsb = end_fsb - offset_fsb;
     358      611513 :         int                     error;
     359             : 
     360      611513 :         ASSERT(count != 0);
     361             : 
     362      611513 :         xfs_ilock(ip, XFS_ILOCK_EXCL);
     363      611476 :         error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
     364      611336 :         xfs_iunlock(ip, XFS_ILOCK_EXCL);
     365      611244 :         return error;
     366             : }
     367             : 
     368             : /*
     369             :  * Find the extent that maps the given range in the COW fork. Even if the extent
     370             :  * is not shared we might have a preallocation for it in the COW fork. If so we
      371             :  * use that rather than triggering a new allocation.
     372             :  */
     373             : static int
     374    22159553 : xfs_find_trim_cow_extent(
     375             :         struct xfs_inode        *ip,
     376             :         struct xfs_bmbt_irec    *imap,
     377             :         struct xfs_bmbt_irec    *cmap,
     378             :         bool                    *shared,
     379             :         bool                    *found)
     380             : {
     381    22159553 :         xfs_fileoff_t           offset_fsb = imap->br_startoff;
     382    22159553 :         xfs_filblks_t           count_fsb = imap->br_blockcount;
     383    22159553 :         struct xfs_iext_cursor  icur;
     384             : 
     385    22159553 :         *found = false;
     386             : 
     387             :         /*
     388             :          * If we don't find an overlapping extent, trim the range we need to
     389             :          * allocate to fit the hole we found.
     390             :          */
     391    22159553 :         if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, cmap))
     392    17968993 :                 cmap->br_startoff = offset_fsb + count_fsb;
     393    22159479 :         if (cmap->br_startoff > offset_fsb) {
     394    20541857 :                 xfs_trim_extent(imap, imap->br_startoff,
     395             :                                 cmap->br_startoff - imap->br_startoff);
     396    20541857 :                 return xfs_bmap_trim_cow(ip, imap, shared);
     397             :         }
     398             : 
     399     1617622 :         *shared = true;
     400     1617622 :         if (isnullstartblock(cmap->br_startblock)) {
     401         913 :                 xfs_trim_extent(imap, cmap->br_startoff, cmap->br_blockcount);
     402         913 :                 return 0;
     403             :         }
     404             : 
     405             :         /* real extent found - no need to allocate */
     406     1616709 :         xfs_trim_extent(cmap, offset_fsb, count_fsb);
     407     1616709 :         *found = true;
     408     1616709 :         return 0;
     409             : }
     410             : 
     411             : static int
     412     2156384 : xfs_reflink_convert_unwritten(
     413             :         struct xfs_inode        *ip,
     414             :         struct xfs_bmbt_irec    *imap,
     415             :         struct xfs_bmbt_irec    *cmap,
     416             :         bool                    convert_now)
     417             : {
     418     2156384 :         xfs_fileoff_t           offset_fsb = imap->br_startoff;
     419     2156384 :         xfs_filblks_t           count_fsb = imap->br_blockcount;
     420     2156384 :         int                     error;
     421             : 
     422             :         /*
      423             :          * cmap might be larger than imap due to the cowextsize hint.
     424             :          */
     425     2156384 :         xfs_trim_extent(cmap, offset_fsb, count_fsb);
     426             : 
     427             :         /*
     428             :          * COW fork extents are supposed to remain unwritten until we're ready
     429             :          * to initiate a disk write.  For direct I/O we are going to write the
     430             :          * data and need the conversion, but for buffered writes we're done.
     431             :          */
     432     2156384 :         if (!convert_now || cmap->br_state == XFS_EXT_NORM)
     433             :                 return 0;
     434             : 
     435     1391473 :         trace_xfs_reflink_convert_cow(ip, cmap);
     436             : 
     437     1391473 :         error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
     438     1391473 :         if (!error)
     439     1391473 :                 cmap->br_state = XFS_EXT_NORM;
     440             : 
     441             :         return error;
     442             : }
     443             : 
     444             : static int
     445      539271 : xfs_reflink_fill_cow_hole(
     446             :         struct xfs_inode        *ip,
     447             :         struct xfs_bmbt_irec    *imap,
     448             :         struct xfs_bmbt_irec    *cmap,
     449             :         bool                    *shared,
     450             :         uint                    *lockmode,
     451             :         bool                    convert_now)
     452             : {
     453      539271 :         struct xfs_mount        *mp = ip->i_mount;
     454      539271 :         struct xfs_trans        *tp;
     455      539271 :         xfs_filblks_t           resaligned;
     456      539271 :         unsigned int            dblocks = 0, rblocks = 0;
     457      539271 :         int                     nimaps;
     458      539271 :         int                     error;
     459      539271 :         bool                    found;
     460             : 
     461      539271 :         resaligned = xfs_aligned_fsb_count(imap->br_startoff,
     462             :                 imap->br_blockcount, xfs_get_cowextsz_hint(ip));
     463      539271 :         if (XFS_IS_REALTIME_INODE(ip)) {
     464      313361 :                 dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
     465      313361 :                 rblocks = resaligned;
     466             :         } else {
     467      225910 :                 dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
     468      225910 :                 rblocks = 0;
     469             :         }
     470             : 
     471      539271 :         xfs_iunlock(ip, *lockmode);
     472      539271 :         *lockmode = 0;
     473             : 
     474      539271 :         error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks,
     475             :                         rblocks, false, &tp);
     476      539271 :         if (error)
     477             :                 return error;
     478             : 
     479      539264 :         *lockmode = XFS_ILOCK_EXCL;
     480             : 
     481      539264 :         error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
     482      539264 :         if (error || !*shared)
     483           0 :                 goto out_trans_cancel;
     484             : 
     485      539264 :         if (found) {
     486           0 :                 xfs_trans_cancel(tp);
     487           0 :                 goto convert;
     488             :         }
     489             : 
     490             :         /* Allocate the entire reservation as unwritten blocks. */
     491      539264 :         nimaps = 1;
     492      539264 :         error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
     493             :                         XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, cmap,
     494             :                         &nimaps);
     495      539264 :         if (error)
     496          19 :                 goto out_trans_cancel;
     497             : 
     498      539245 :         xfs_inode_set_cowblocks_tag(ip);
     499      539245 :         error = xfs_trans_commit(tp);
     500      539245 :         if (error)
     501             :                 return error;
     502             : 
     503             :         /*
     504             :          * Allocation succeeded but the requested range was not even partially
     505             :          * satisfied?  Bail out!
     506             :          */
     507      539245 :         if (nimaps == 0)
     508             :                 return -ENOSPC;
     509             : 
     510      539245 : convert:
     511      539245 :         return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);
     512             : 
     513          19 : out_trans_cancel:
     514          19 :         xfs_trans_cancel(tp);
     515          19 :         return error;
     516             : }
     517             : 
     518             : static int
     519         431 : xfs_reflink_fill_delalloc(
     520             :         struct xfs_inode        *ip,
     521             :         struct xfs_bmbt_irec    *imap,
     522             :         struct xfs_bmbt_irec    *cmap,
     523             :         bool                    *shared,
     524             :         uint                    *lockmode,
     525             :         bool                    convert_now)
     526             : {
     527         431 :         struct xfs_mount        *mp = ip->i_mount;
     528         483 :         struct xfs_trans        *tp;
     529         483 :         int                     nimaps;
     530         483 :         int                     error;
     531         483 :         bool                    found;
     532             : 
     533         483 :         do {
     534         483 :                 xfs_iunlock(ip, *lockmode);
     535         483 :                 *lockmode = 0;
     536             : 
     537         483 :                 error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, 0, 0,
     538             :                                 false, &tp);
     539         483 :                 if (error)
     540           0 :                         return error;
     541             : 
     542         483 :                 *lockmode = XFS_ILOCK_EXCL;
     543             : 
     544         483 :                 error = xfs_find_trim_cow_extent(ip, imap, cmap, shared,
     545             :                                 &found);
     546         483 :                 if (error || !*shared)
     547           0 :                         goto out_trans_cancel;
     548             : 
     549         483 :                 if (found) {
     550           1 :                         xfs_trans_cancel(tp);
     551           1 :                         break;
     552             :                 }
     553             : 
     554         482 :                 ASSERT(isnullstartblock(cmap->br_startblock) ||
     555             :                        cmap->br_startblock == DELAYSTARTBLOCK);
     556             : 
     557             :                 /*
     558             :                  * Replace delalloc reservation with an unwritten extent.
     559             :                  */
     560         482 :                 nimaps = 1;
     561         482 :                 error = xfs_bmapi_write(tp, ip, cmap->br_startoff,
     562             :                                 cmap->br_blockcount,
     563             :                                 XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0,
     564             :                                 cmap, &nimaps);
     565         482 :                 if (error)
     566           0 :                         goto out_trans_cancel;
     567             : 
     568         482 :                 xfs_inode_set_cowblocks_tag(ip);
     569         482 :                 error = xfs_trans_commit(tp);
     570         482 :                 if (error)
     571           0 :                         return error;
     572             : 
     573             :                 /*
     574             :                  * Allocation succeeded but the requested range was not even
     575             :                  * partially satisfied?  Bail out!
     576             :                  */
     577         482 :                 if (nimaps == 0)
     578             :                         return -ENOSPC;
     579         482 :         } while (cmap->br_startoff + cmap->br_blockcount <= imap->br_startoff);
     580             : 
     581         431 :         return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);
     582             : 
     583           0 : out_trans_cancel:
     584           0 :         xfs_trans_cancel(tp);
     585           0 :         return error;
     586             : }
     587             : 
     588             : /* Allocate all CoW reservations covering a range of blocks in a file. */
     589             : int
     590    21619744 : xfs_reflink_allocate_cow(
     591             :         struct xfs_inode        *ip,
     592             :         struct xfs_bmbt_irec    *imap,
     593             :         struct xfs_bmbt_irec    *cmap,
     594             :         bool                    *shared,
     595             :         uint                    *lockmode,
     596             :         bool                    convert_now)
     597             : {
     598    21619744 :         int                     error;
     599    21619744 :         bool                    found;
     600             : 
     601    21619744 :         ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
     602    21619813 :         if (!ip->i_cowfp) {
     603           0 :                 ASSERT(!xfs_is_reflink_inode(ip));
     604           0 :                 xfs_ifork_init_cow(ip);
     605             :         }
     606             : 
     607    21619813 :         error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
     608    21619757 :         if (error || !*shared)
     609             :                 return error;
     610             : 
     611             :         /* CoW fork has a real extent */
     612     2156410 :         if (found)
     613     1616708 :                 return xfs_reflink_convert_unwritten(ip, imap, cmap,
     614             :                                 convert_now);
     615             : 
     616             :         /*
     617             :          * CoW fork does not have an extent and data extent is shared.
     618             :          * Allocate a real extent in the CoW fork.
     619             :          */
     620      539702 :         if (cmap->br_startoff > imap->br_startoff)
     621      539271 :                 return xfs_reflink_fill_cow_hole(ip, imap, cmap, shared,
     622             :                                 lockmode, convert_now);
     623             : 
     624             :         /*
     625             :          * CoW fork has a delalloc reservation. Replace it with a real extent.
     626             :          * There may or may not be a data fork mapping.
     627             :          */
     628         431 :         if (isnullstartblock(cmap->br_startblock) ||
     629             :             cmap->br_startblock == DELAYSTARTBLOCK)
     630         431 :                 return xfs_reflink_fill_delalloc(ip, imap, cmap, shared,
     631             :                                 lockmode, convert_now);
     632             : 
     633             :         /* Shouldn't get here. */
     634           0 :         ASSERT(0);
     635           0 :         return -EFSCORRUPTED;
     636             : }
     637             : 
     638             : /*
     639             :  * Cancel CoW reservations for some block range of an inode.
     640             :  *
     641             :  * If cancel_real is true this function cancels all COW fork extents in the
     642             :  * range; if cancel_real is false, written real extents are not cleared.
     643             :  *
     644             :  * Caller must have already joined the inode to the current transaction. The
     645             :  * inode will be joined to the transaction returned to the caller.
     646             :  */
     647             : int
     648    12296133 : xfs_reflink_cancel_cow_blocks(
     649             :         struct xfs_inode                *ip,
     650             :         struct xfs_trans                **tpp,
     651             :         xfs_fileoff_t                   offset_fsb,
     652             :         xfs_fileoff_t                   end_fsb,
     653             :         bool                            cancel_real)
     654             : {
     655    12296133 :         struct xfs_ifork                *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
     656    12296133 :         struct xfs_mount                *mp = ip->i_mount;
     657    12296133 :         struct xfs_bmbt_irec            got, del;
     658    12296133 :         struct xfs_iext_cursor          icur;
     659    12296133 :         bool                            isrt = XFS_IS_REALTIME_INODE(ip);
     660    12296133 :         int                             error = 0;
     661             : 
     662             :         /*
     663             :          * Shrink the range that we're cancelling if they don't align to the
     664             :          * realtime extent size, since we can only free full extents.
     665             :          */
     666    12296133 :         if (xfs_inode_needs_cow_around(ip)) {
     667           0 :                 offset_fsb = xfs_rtb_roundup_rtx(mp, offset_fsb);
     668           0 :                 end_fsb = xfs_rtb_rounddown_rtx(mp, end_fsb);
     669             :         }
     670             : 
     671    24593968 :         if (!xfs_inode_has_cow_data(ip))
     672             :                 return 0;
     673      607405 :         if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
     674             :                 return 0;
     675             : 
     676             :         /* Walk backwards until we're out of the I/O range... */
     677      854558 :         while (got.br_startoff + got.br_blockcount > offset_fsb) {
     678      522854 :                 del = got;
     679      522854 :                 xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb);
     680             : 
     681             :                 /* Extent delete may have bumped ext forward */
     682      522849 :                 if (!del.br_blockcount) {
     683        5888 :                         xfs_iext_prev(ifp, &icur);
     684        5888 :                         goto next_extent;
     685             :                 }
     686             : 
     687      516961 :                 trace_xfs_reflink_cancel_cow(ip, &del);
     688             : 
     689      516962 :                 if (isnullstartblock(del.br_startblock)) {
     690       20830 :                         error = xfs_bmap_del_extent_delay(ip, XFS_COW_FORK,
     691             :                                         &icur, &got, &del);
     692       20830 :                         if (error)
     693             :                                 break;
     694      496132 :                 } else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) {
     695      496132 :                         ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER);
     696             : 
     697             :                         /* Free the CoW orphan record. */
     698      496132 :                         xfs_refcount_free_cow_extent(*tpp, isrt,
     699      496132 :                                         del.br_startblock, del.br_blockcount);
     700             : 
     701      645311 :                         error = xfs_free_extent_later(*tpp, del.br_startblock,
     702             :                                         del.br_blockcount, NULL,
     703             :                                         XFS_AG_RESV_NONE,
     704             :                                         isrt ? XFS_FREE_EXTENT_REALTIME : 0);
     705      496127 :                         if (error)
     706             :                                 break;
     707             : 
     708             :                         /* Roll the transaction */
     709      496127 :                         error = xfs_defer_finish(tpp);
     710      496135 :                         if (error)
     711             :                                 break;
     712             : 
     713             :                         /* Remove the mapping from the CoW fork. */
     714      496119 :                         xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
     715             : 
     716             :                         /* Remove the quota reservation */
     717      496121 :                         error = xfs_quota_unreserve_blkres(ip,
     718      496121 :                                         del.br_blockcount);
     719      496120 :                         if (error)
     720             :                                 break;
     721             :                 } else {
     722             :                         /* Didn't do anything, push cursor back. */
     723           0 :                         xfs_iext_prev(ifp, &icur);
     724             :                 }
     725      522838 : next_extent:
     726      522838 :                 if (!xfs_iext_get_extent(ifp, &icur, &got))
     727             :                         break;
     728             :         }
     729             : 
     730             :         /* clear tag if cow fork is emptied */
     731      583273 :         if (!ifp->if_bytes)
     732      247282 :                 xfs_inode_clear_cowblocks_tag(ip);
     733             :         return error;
     734             : }
     735             : 
     736             : /*
     737             :  * Cancel CoW reservations for some byte range of an inode.
     738             :  *
     739             :  * If cancel_real is true this function cancels all COW fork extents in the
     740             :  * range; if cancel_real is false, written real extents are not cleared.
     741             :  */
     742             : int
     743      348564 : xfs_reflink_cancel_cow_range(
     744             :         struct xfs_inode        *ip,
     745             :         xfs_off_t               offset,
     746             :         xfs_off_t               count,
     747             :         bool                    cancel_real)
     748             : {
     749      348564 :         struct xfs_trans        *tp;
     750      348564 :         xfs_fileoff_t           offset_fsb;
     751      348564 :         xfs_fileoff_t           end_fsb;
     752      348564 :         int                     error;
     753             : 
     754      348564 :         trace_xfs_reflink_cancel_cow_range(ip, offset, count);
     755      348637 :         ASSERT(ip->i_cowfp);
     756             : 
     757      348637 :         offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
     758      348637 :         if (count == NULLFILEOFF)
     759             :                 end_fsb = NULLFILEOFF;
     760             :         else
     761       87009 :                 end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
     762             : 
     763             :         /* Start a rolling transaction to remove the mappings */
     764      348637 :         error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
     765             :                         0, 0, 0, &tp);
     766      348640 :         if (error)
     767           0 :                 goto out;
     768             : 
     769      348640 :         xfs_ilock(ip, XFS_ILOCK_EXCL);
     770      348639 :         xfs_trans_ijoin(tp, ip, 0);
     771             : 
     772             :         /* Scrape out the old CoW reservations */
     773      348639 :         error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb,
     774             :                         cancel_real);
     775      348634 :         if (error)
     776          16 :                 goto out_cancel;
     777             : 
     778      348618 :         error = xfs_trans_commit(tp);
     779             : 
     780      348624 :         xfs_iunlock(ip, XFS_ILOCK_EXCL);
     781      348624 :         return error;
     782             : 
     783             : out_cancel:
     784          16 :         xfs_trans_cancel(tp);
     785          16 :         xfs_iunlock(ip, XFS_ILOCK_EXCL);
     786          16 : out:
     787          16 :         trace_xfs_reflink_cancel_cow_range_error(ip, error, _RET_IP_);
     788          16 :         return error;
     789             : }
     790             : 
     791             : #ifdef CONFIG_XFS_QUOTA
     792             : /*
     793             :  * Update quota accounting for a remapping operation.  When we're remapping
     794             :  * something from the CoW fork to the data fork, we must update the quota
     795             :  * accounting for delayed allocations.  For remapping from the data fork to the
     796             :  * data fork, use regular block accounting.
     797             :  */
     798             : static inline void
     799    89195184 : xfs_reflink_update_quota(
     800             :         struct xfs_trans        *tp,
     801             :         struct xfs_inode        *ip,
     802             :         bool                    is_cow,
     803             :         int64_t                 blocks)
     804             : {
     805    89195184 :         unsigned int            qflag;
     806             : 
     807    89195184 :         if (XFS_IS_REALTIME_INODE(ip)) {
     808    25557497 :                 qflag = is_cow ? XFS_TRANS_DQ_DELRTBCOUNT :
     809             :                                  XFS_TRANS_DQ_RTBCOUNT;
     810             :         } else {
     811    63637687 :                 qflag = is_cow ? XFS_TRANS_DQ_DELBCOUNT :
     812             :                                  XFS_TRANS_DQ_BCOUNT;
     813             :         }
     814    89195184 :         xfs_trans_mod_dquot_byino(tp, ip, qflag, blocks);
     815    89194857 : }
     816             : #else
     817             : # define xfs_reflink_update_quota(tp, ip, is_cow, blocks)       ((void)0)
     818             : #endif
     819             : 
     820             : /*
     821             :  * Remap part of the CoW fork into the data fork.
     822             :  *
     823             :  * We aim to remap the range starting at @offset_fsb and ending at @end_fsb
     824             :  * into the data fork; this function will remap what it can (at the start of
     825             :  * the range) and update @offset_fsb appropriately.  Each remap gets its own
     826             :  * transaction because we can end up merging and splitting bmbt blocks for
     827             :  * every remap operation and we'd like to keep the block reservation
     828             :  * requirements as low as possible.
     829             :  */
     830             : STATIC int
     831     2984788 : xfs_reflink_end_cow_extent(
     832             :         struct xfs_inode        *ip,
     833             :         xfs_fileoff_t           *offset_fsb,
     834             :         xfs_fileoff_t           end_fsb)
     835             : {
     836     2984788 :         struct xfs_iext_cursor  icur;
     837     2984788 :         struct xfs_bmbt_irec    got, del, data;
     838     2984788 :         struct xfs_mount        *mp = ip->i_mount;
     839     2984788 :         struct xfs_trans        *tp;
     840     2984788 :         struct xfs_ifork        *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
     841     2984788 :         unsigned int            resblks;
     842     2984788 :         int                     nmaps;
     843     2984788 :         bool                    isrt = XFS_IS_REALTIME_INODE(ip);
     844     2984788 :         int                     error;
     845             : 
     846             :         /* No COW extents?  That's easy! */
     847     2984788 :         if (ifp->if_bytes == 0) {
     848         167 :                 *offset_fsb = end_fsb;
     849         167 :                 return 0;
     850             :         }
     851             : 
     852     2984621 :         resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
     853     2984621 :         error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
     854             :                         XFS_TRANS_RESERVE, &tp);
     855     2984621 :         if (error)
     856             :                 return error;
     857             : 
     858             :         /*
     859             :          * Lock the inode.  We have to ijoin without automatic unlock because
     860             :          * the lead transaction is the refcountbt record deletion; the data
     861             :          * fork update follows as a deferred log item.
     862             :          */
     863     2984621 :         xfs_ilock(ip, XFS_ILOCK_EXCL);
     864     2984621 :         xfs_trans_ijoin(tp, ip, 0);
     865             : 
     866     2984621 :         error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
     867             :                         XFS_IEXT_REFLINK_END_COW_CNT);
     868     2984621 :         if (error == -EFBIG)
     869           4 :                 error = xfs_iext_count_upgrade(tp, ip,
     870             :                                 XFS_IEXT_REFLINK_END_COW_CNT);
     871     2984621 :         if (error)
     872           4 :                 goto out_cancel;
     873             : 
     874             :         /*
     875             :          * In case of racing, overlapping AIO writes no COW extents might be
     876             :          * left by the time I/O completes for the loser of the race.  In that
     877             :          * case we are done.
     878             :          */
     879     2984617 :         if (!xfs_iext_lookup_extent(ip, ifp, *offset_fsb, &icur, &got) ||
     880     2984192 :             got.br_startoff >= end_fsb) {
     881       19025 :                 *offset_fsb = end_fsb;
     882       19025 :                 goto out_cancel;
     883             :         }
     884             : 
     885             :         /*
     886             :          * Only remap real extents that contain data.  With AIO, speculative
     887             :          * preallocations can leak into the range we are called upon, and we
     888             :          * need to skip them.  Preserve @got for the eventual CoW fork
     889             :          * deletion; from now on @del represents the mapping that we're
     890             :          * actually remapping.
     891             :          */
     892     2968324 :         while (!xfs_bmap_is_written_extent(&got)) {
     893        2966 :                 if (!xfs_iext_next_extent(ifp, &icur, &got) ||
     894        2965 :                     got.br_startoff >= end_fsb) {
     895         234 :                         *offset_fsb = end_fsb;
     896         234 :                         goto out_cancel;
     897             :                 }
     898             :         }
     899     2965358 :         del = got;
     900             : 
     901             :         /* Grab the corresponding mapping in the data fork. */
     902     2965358 :         nmaps = 1;
     903     2965358 :         error = xfs_bmapi_read(ip, del.br_startoff, del.br_blockcount, &data,
     904             :                         &nmaps, 0);
     905     2965358 :         if (error)
     906          16 :                 goto out_cancel;
     907             : 
     908             :         /* We can only remap the smaller of the two extent sizes. */
     909     2965342 :         data.br_blockcount = min(data.br_blockcount, del.br_blockcount);
     910     2965342 :         del.br_blockcount = data.br_blockcount;
     911             : 
     912     2965342 :         trace_xfs_reflink_cow_remap_from(ip, &del);
     913     2965342 :         trace_xfs_reflink_cow_remap_to(ip, &data);
     914             : 
     915     5558990 :         if (xfs_bmap_is_real_extent(&data)) {
     916             :                 /*
     917             :                  * If the extent we're remapping is backed by storage (written
     918             :                  * or not), unmap the extent and drop its refcount.
     919             :                  */
     920     2593648 :                 xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data);
     921     2593648 :                 xfs_refcount_decrease_extent(tp, isrt, &data);
     922     2593648 :                 xfs_reflink_update_quota(tp, ip, false, -data.br_blockcount);
     923      371694 :         } else if (data.br_startblock == DELAYSTARTBLOCK) {
     924       19879 :                 int             done;
     925             : 
     926             :                 /*
     927             :                  * If the extent we're remapping is a delalloc reservation,
     928             :                  * we can use the regular bunmapi function to release the
     929             :                  * incore state.  Dropping the delalloc reservation takes care
     930             :                  * of the quota reservation for us.
     931             :                  */
     932       19879 :                 error = xfs_bunmapi(NULL, ip, data.br_startoff,
     933             :                                 data.br_blockcount, 0, 1, &done);
     934       19879 :                 if (error)
     935           0 :                         goto out_cancel;
     936       19879 :                 ASSERT(done);
     937             :         }
     938             : 
     939             :         /* Free the CoW orphan record. */
     940     2965342 :         xfs_refcount_free_cow_extent(tp, isrt, del.br_startblock,
     941     2965342 :                         del.br_blockcount);
     942             : 
     943             :         /* Map the new blocks into the data fork. */
     944     2965342 :         xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, &del);
     945             : 
     946             :         /* Charge this new data fork mapping to the on-disk quota. */
     947     2965342 :         xfs_reflink_update_quota(tp, ip, true, del.br_blockcount);
     948             : 
     949             :         /* Remove the mapping from the CoW fork. */
     950     2965342 :         xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
     951             : 
     952     2965342 :         error = xfs_trans_commit(tp);
     953     2965342 :         xfs_iunlock(ip, XFS_ILOCK_EXCL);
     954     2965342 :         if (error)
     955             :                 return error;
     956             : 
     957             :         /* Update the caller about how much progress we made. */
     958     2965338 :         *offset_fsb = del.br_startoff + del.br_blockcount;
     959     2965338 :         return 0;
     960             : 
     961       19279 : out_cancel:
     962       19279 :         xfs_trans_cancel(tp);
     963       19279 :         xfs_iunlock(ip, XFS_ILOCK_EXCL);
     964       19279 :         return error;
     965             : }
     966             : 
     967             : /*
     968             :  * Remap parts of a file's data fork after a successful CoW.
     969             :  */
     970             : int
     971     1743149 : xfs_reflink_end_cow(
     972             :         struct xfs_inode                *ip,
     973             :         xfs_off_t                       offset,
     974             :         xfs_off_t                       count)
     975             : {
     976     1743149 :         struct xfs_mount                *mp = ip->i_mount;
     977     1743149 :         xfs_fileoff_t                   offset_fsb;
     978     1743149 :         xfs_fileoff_t                   end_fsb;
     979     1743149 :         int                             error = 0;
     980             : 
     981     1743149 :         trace_xfs_reflink_end_cow(ip, offset, count);
     982             : 
     983     1743149 :         offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
     984     1743149 :         end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
     985             : 
     986             :         /*
     987             :          * Make sure the end is aligned with a rt extent (if desired), since
     988             :          * the end of the range could be EOF.  The _convert_cow function should
     989             :          * have set us up to swap only full rt extents.
     990             :          */
     991     1743149 :         if (xfs_inode_needs_cow_around(ip)) {
     992           0 :                 offset_fsb = xfs_rtb_rounddown_rtx(mp, offset_fsb);
     993           0 :                 end_fsb = xfs_rtb_roundup_rtx(mp, end_fsb);
     994             :         }
     995             : 
     996             :         /*
     997             :          * Walk forwards until we've remapped the I/O range.  The loop function
     998             :          * repeatedly cycles the ILOCK to allocate one transaction per remapped
     999             :          * extent.
    1000             :          *
    1001             :          * If we're being called by writeback then the pages will still
    1002             :          * have PageWriteback set, which prevents races with reflink remapping
    1003             :          * and truncate.  Reflink remapping prevents races with writeback by
    1004             :          * taking the iolock and mmaplock before flushing the pages and
    1005             :          * remapping, which means there won't be any further writeback or page
    1006             :          * cache dirtying until the reflink completes.
    1007             :          *
    1008             :          * We should never have two threads issuing writeback for the same file
    1009             :          * region.  There are also post-eof checks in the writeback
    1010             :          * preparation code so that we don't bother writing out pages that are
    1011             :          * about to be truncated.
    1012             :          *
    1013             :          * If we're being called as part of directio write completion, the dio
    1014             :          * count is still elevated, which reflink and truncate will wait for.
    1015             :          * Reflink remapping takes the iolock and mmaplock and waits for
    1016             :          * pending dio to finish, which should prevent any directio until the
    1017             :          * remap completes.  Multiple concurrent directio writes to the same
    1018             :          * region are handled by end_cow processing only occurring for the
    1019             :          * threads which succeed; the outcome of multiple overlapping direct
    1020             :          * writes is not well defined anyway.
    1021             :          *
    1022             :          * It's possible that a buffered write and a direct write could collide
    1023             :          * here (the buffered write stumbles in after the dio flushes and
    1024             :          * invalidates the page cache and immediately queues writeback), but we
    1025             :          * have never supported this 100%.  If either disk write succeeds the
    1026             :          * blocks will be remapped.
    1027             :          */
    1028     4727937 :         while (end_fsb > offset_fsb && !error)
    1029     2984787 :                 error = xfs_reflink_end_cow_extent(ip, &offset_fsb, end_fsb);
    1030             : 
    1031     1743150 :         if (error)
    1032          24 :                 trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
    1033     1743150 :         return error;
    1034             : }
    1035             : 
    1036             : /*
    1037             :  * Free all CoW staging blocks that are still referenced by the ondisk refcount
    1038             :  * metadata.  The ondisk metadata does not track which inode created the
    1039             :  * staging extent, so callers must ensure that there are no cached inodes with
    1040             :  * live CoW staging extents.
                     :  *
                     :  * Returns 0 immediately if the filesystem does not have reflink enabled.
                     :  * Otherwise xfs_refcount_recover_cow_leftovers() is run on every AG, then
                     :  * xfs_refcount_recover_rtcow_leftovers() on every realtime group.  The
                     :  * walk stops at the first error; because that leaves the iterator early,
                     :  * the current group is released (xfs_perag_rele / xfs_rtgroup_rele) by
                     :  * hand before the error is returned.
    1041             :  */
    1042             : int
    1043       11367 : xfs_reflink_recover_cow(
    1044             :         struct xfs_mount        *mp)
    1045             : {
    1046       11367 :         struct xfs_perag        *pag;
    1047       11367 :         struct xfs_rtgroup      *rtg;
    1048       11367 :         xfs_agnumber_t          agno;
    1049       11367 :         xfs_rgnumber_t          rgno;
    1050       11367 :         int                     error = 0;
    1051             : 
    1052       11367 :         if (!xfs_has_reflink(mp))
    1053             :                 return 0;
    1054             : 
    1055       56785 :         for_each_perag(mp, agno, pag) {
    1056       45426 :                 error = xfs_refcount_recover_cow_leftovers(mp, pag);
    1057       45426 :                 if (error) {
    1058           8 :                         xfs_perag_rele(pag);
    1059           8 :                         return error;
    1060             :                 }
    1061             :         }
    1062             : 
    1063       11359 :         for_each_rtgroup(mp, rgno, rtg) {
    1064           0 :                 error = xfs_refcount_recover_rtcow_leftovers(mp, rtg);
    1065           0 :                 if (error) {
    1066           0 :                         xfs_rtgroup_rele(rtg);
    1067           0 :                         return error;
    1068             :                 }
    1069             :         }
    1070             : 
    1071             :         return 0;
    1072             : }
    1073             : 
    1074             : /*
    1075             :  * Reflinking (Block) Ranges of Two Files Together
    1076             :  *
    1077             :  * First, ensure that the reflink flag is set on both inodes.  The flag is an
    1078             :  * optimization to avoid unnecessary refcount btree lookups in the write path.
    1079             :  *
    1080             :  * Now we can iteratively remap the range of extents (and holes) in src to the
    1081             :  * corresponding ranges in dest.  Let drange and srange denote the ranges of
    1082             :  * logical blocks in dest and src touched by the reflink operation.
    1083             :  *
    1084             :  * While the length of drange is greater than zero,
    1085             :  *    - Read src's bmbt at the start of srange ("imap")
    1086             :  *    - If imap doesn't exist, make imap appear to start at the end of srange
    1087             :  *      with zero length.
    1088             :  *    - If imap starts before srange, advance imap to start at srange.
    1089             :  *    - If imap goes beyond srange, truncate imap to end at the end of srange.
    1090             :  *    - Punch (imap start - srange start + imap len) blocks from dest at
    1091             :  *      offset (drange start).
    1092             :  *    - If imap points to a real range of pblks,
    1093             :  *         > Increase the refcount of the imap's pblks
    1094             :  *         > Map imap's pblks into dest at the offset
    1095             :  *           (drange start + imap start - srange start)
    1096             :  *    - Advance drange and srange by (imap start - srange start + imap len)
    1097             :  *
    1098             :  * Finally, if the reflink made dest longer, update both the in-core and
    1099             :  * on-disk file sizes.
    1100             :  *
    1101             :  * ASCII Art Demonstration:
    1102             :  *
    1103             :  * Let's say we want to reflink this source file:
    1104             :  *
    1105             :  * ----SSSSSSS-SSSSS----SSSSSS (src file)
    1106             :  *   <-------------------->
    1107             :  *
    1108             :  * into this destination file:
    1109             :  *
    1110             :  * --DDDDDDDDDDDDDDDDDDD--DDD (dest file)
    1111             :  *        <-------------------->
    1112             :  * '-' means a hole, and 'S' and 'D' are written blocks in the src and dest.
    1113             :  * Observe that the range has different logical offsets in either file.
    1114             :  *
    1115             :  * Consider that the first extent in the source file doesn't line up with our
    1116             :  * reflink range.  Unmapping and remapping are separate operations, so we can
    1117             :  * unmap more blocks from the destination file than we remap.
    1118             :  *
    1119             :  * ----SSSSSSS-SSSSS----SSSSSS
    1120             :  *   <------->
    1121             :  * --DDDDD---------DDDDD--DDD
    1122             :  *        <------->
    1123             :  *
    1124             :  * Now remap the source extent into the destination file:
    1125             :  *
    1126             :  * ----SSSSSSS-SSSSS----SSSSSS
    1127             :  *   <------->
    1128             :  * --DDDDD--SSSSSSSDDDDD--DDD
    1129             :  *        <------->
    1130             :  *
    1131             :  * Do likewise with the second hole and extent in our range.  Holes in the
    1132             :  * unmap range don't affect our operation.
    1133             :  *
    1134             :  * ----SSSSSSS-SSSSS----SSSSSS
    1135             :  *            <---->
    1136             :  * --DDDDD--SSSSSSS-SSSSS-DDD
    1137             :  *                 <---->
    1138             :  *
    1139             :  * Finally, unmap and remap part of the third extent.  This will increase the
    1140             :  * size of the destination file.
    1141             :  *
    1142             :  * ----SSSSSSS-SSSSS----SSSSSS
    1143             :  *                  <----->
    1144             :  * --DDDDD--SSSSSSS-SSSSS----SSS
    1145             :  *                       <----->
    1146             :  *
    1147             :  * Once we update the destination file's i_size, we're done.
    1148             :  */
    1149             : 
    1150             : /*
    1151             :  * Ensure the reflink bit is set in both inodes.
                     :  *
                     :  * @src and @dest may be the same inode (compared by i_ino); in that case
                     :  * the inode is locked and examined only once.  If both inodes already
                     :  * carry the flag, 0 is returned without allocating a transaction.  That
                     :  * first test is made before ILOCK_EXCL is taken, so each inode is tested
                     :  * again once it is locked.
                     :  *
                     :  * An inode that still needs the flag is joined to the transaction with
                     :  * XFS_ILOCK_EXCL, gets XFS_DIFLAG2_REFLINK set in i_diflags2, has its
                     :  * core logged and xfs_ifork_init_cow() called on it.  An inode that
                     :  * already has the flag is not joined and is unlocked right away.
                     :  *
                     :  * Returns 0 on success, or the error from xfs_trans_alloc() or
                     :  * xfs_trans_commit().  Either failure is reported through the
                     :  * xfs_reflink_set_inode_flag_error tracepoint against @dest.
    1152             :  */
    1153             : STATIC int
    1154   197421863 : xfs_reflink_set_inode_flag(
    1155             :         struct xfs_inode        *src,
    1156             :         struct xfs_inode        *dest)
    1157             : {
    1158   197421863 :         struct xfs_mount        *mp = src->i_mount;
    1159   197421863 :         int                     error;
    1160   197421863 :         struct xfs_trans        *tp;
    1161             : 
    1162   197421863 :         if (xfs_is_reflink_inode(src) && xfs_is_reflink_inode(dest))
    1163             :                 return 0;
    1164             : 
    1165     3760210 :         error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
    1166     3761792 :         if (error)
    1167           2 :                 goto out_error;
    1168             : 
    1169             :         /* Lock both files against IO */
    1170     3761790 :         if (src->i_ino == dest->i_ino)
    1171       76209 :                 xfs_ilock(src, XFS_ILOCK_EXCL);
    1172             :         else
    1173     3685581 :                 xfs_lock_two_inodes(src, XFS_ILOCK_EXCL, dest, XFS_ILOCK_EXCL);
    1174             : 
    1175     3761785 :         if (!xfs_is_reflink_inode(src)) {
    1176      327289 :                 trace_xfs_reflink_set_inode_flag(src);
    1177      327289 :                 xfs_trans_ijoin(tp, src, XFS_ILOCK_EXCL);
    1178      327289 :                 src->i_diflags2 |= XFS_DIFLAG2_REFLINK;
    1179      327289 :                 xfs_trans_log_inode(tp, src, XFS_ILOG_CORE);
    1180      327289 :                 xfs_ifork_init_cow(src);
    1181             :         } else
    1182     3434496 :                 xfs_iunlock(src, XFS_ILOCK_EXCL);
    1183             : 
    1184     3761791 :         if (src->i_ino == dest->i_ino)
    1185       76209 :                 goto commit_flags;
    1186             : 
    1187     3685582 :         if (!xfs_is_reflink_inode(dest)) {
    1188     3484781 :                 trace_xfs_reflink_set_inode_flag(dest);
    1189     3484781 :                 xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);
    1190     3484777 :                 dest->i_diflags2 |= XFS_DIFLAG2_REFLINK;
    1191     3484777 :                 xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
    1192     3484781 :                 xfs_ifork_init_cow(dest);
    1193             :         } else
    1194      200801 :                 xfs_iunlock(dest, XFS_ILOCK_EXCL);
    1195             : 
    1196     3761791 : commit_flags:
    1197     3761791 :         error = xfs_trans_commit(tp);
    1198     3761789 :         if (error)
    1199           3 :                 goto out_error;
    1200             :         return error;
    1201             : 
    1202           5 : out_error:
    1203           5 :         trace_xfs_reflink_set_inode_flag_error(dest, error, _RET_IP_);
    1204           5 :         return error;
    1205             : }
    1206             : 
    1207             : /*
    1208             :  * Update destination inode size & cowextsize hint, if necessary.
                     :  *
                     :  * Nothing is done, and 0 is returned, when @newlen does not exceed the
                     :  * current in-core size and @cowextsize is zero.  Otherwise a tr_ichange
                     :  * transaction is allocated, @dest is locked ILOCK_EXCL and joined to it,
                     :  * and:
                     :  *   - if @newlen is still larger than i_size (rechecked under the lock),
                     :  *     both the VFS i_size and i_disk_size are set to @newlen;
                     :  *   - if @cowextsize is nonzero, it is stored in i_cowextsize and
                     :  *     XFS_DIFLAG2_COWEXTSIZE is set in i_diflags2.
                     :  * The inode core is then logged and the transaction committed.
                     :  *
                     :  * @remap_flags is accepted but not referenced by this function.
                     :  *
                     :  * Returns 0 on success, or the error from xfs_trans_alloc() or
                     :  * xfs_trans_commit(), which is also reported through the
                     :  * xfs_reflink_update_inode_size_error tracepoint.
    1209             :  */
    1210             : int
    1211   196825597 : xfs_reflink_update_dest(
    1212             :         struct xfs_inode        *dest,
    1213             :         xfs_off_t               newlen,
    1214             :         xfs_extlen_t            cowextsize,
    1215             :         unsigned int            remap_flags)
    1216             : {
    1217   196825597 :         struct xfs_mount        *mp = dest->i_mount;
    1218   196825597 :         struct xfs_trans        *tp;
    1219   196825597 :         int                     error;
    1220             : 
    1221   196825597 :         if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
    1222             :                 return 0;
    1223             : 
    1224     2794278 :         error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
    1225     2791886 :         if (error)
    1226           0 :                 goto out_error;
    1227             : 
    1228     2791886 :         xfs_ilock(dest, XFS_ILOCK_EXCL);
    1229     2791887 :         xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);
    1230             : 
    1231     2791882 :         if (newlen > i_size_read(VFS_I(dest))) {
    1232     2791876 :                 trace_xfs_reflink_update_inode_size(dest, newlen);
    1233     2791880 :                 i_size_write(VFS_I(dest), newlen);
    1234     2791880 :                 dest->i_disk_size = newlen;
    1235             :         }
    1236             : 
    1237     2791886 :         if (cowextsize) {
    1238           6 :                 dest->i_cowextsize = cowextsize;
    1239           6 :                 dest->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE;
    1240             :         }
    1241             : 
    1242     2791886 :         xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
    1243             : 
    1244     2791887 :         error = xfs_trans_commit(tp);
    1245     2791887 :         if (error)
    1246           0 :                 goto out_error;
    1247             :         return error;
    1248             : 
    1249           0 : out_error:
    1250           0 :         trace_xfs_reflink_update_inode_size_error(dest, error, _RET_IP_);
    1251           0 :         return error;
    1252             : }
    1253             : 
    1254             : /*
    1255             :  * Do we have enough reserve in this AG to handle a reflink?  The refcount
    1256             :  * btree already reserved all the space it needs, but the rmap btree can grow
    1257             :  * infinitely, so we won't allow more reflinks when the AG is down to the
    1258             :  * btree reserves.
                     :  *
                     :  * @fsb selects the group to check: the realtime group given by
                     :  * xfs_rtb_to_rgno() when @ip is a realtime inode, otherwise the AG given
                     :  * by XFS_FSB_TO_AGNO().
                     :  *
                     :  * Filesystems without an rmap btree always pass.  For a realtime inode
                     :  * the check fails if xfs_imeta_resv_critical() is true for either the
                     :  * group's rmap inode or its refcount inode.  For any other inode it fails
                     :  * if xfs_ag_resv_critical() is true for either XFS_AG_RESV_RMAPBT or
                     :  * XFS_AG_RESV_METADATA.  The group reference taken for the check is
                     :  * dropped before returning.
                     :  *
                     :  * Returns 0 if the reflink may proceed, -ENOSPC if not.
    1259             :  */
    1260             : static int
    1261    61781221 : xfs_reflink_ag_has_free_space(
    1262             :         struct xfs_mount        *mp,
    1263             :         struct xfs_inode        *ip,
    1264             :         xfs_fsblock_t           fsb)
    1265             : {
    1266    61781221 :         struct xfs_perag        *pag;
    1267    61781221 :         xfs_agnumber_t          agno;
    1268    61781221 :         int                     error = 0;
    1269             : 
    1270    61781221 :         if (!xfs_has_rmapbt(mp))
    1271             :                 return 0;
    1272    61781221 :         if (XFS_IS_REALTIME_INODE(ip)) {
    1273     7502544 :                 struct xfs_rtgroup      *rtg;
    1274     7502544 :                 xfs_rgnumber_t          rgno;
    1275             : 
    1276     7502544 :                 rgno = xfs_rtb_to_rgno(mp, fsb);
    1277     7502544 :                 rtg = xfs_rtgroup_get(mp, rgno);
    1278    15005012 :                 if (xfs_imeta_resv_critical(rtg->rtg_rmapip) ||
    1279     7502499 :                     xfs_imeta_resv_critical(rtg->rtg_refcountip))
    1280             :                         error = -ENOSPC;
    1281     7502504 :                 xfs_rtgroup_put(rtg);
    1282     7502504 :                 return error;
    1283             :         }
    1284             : 
    1285    54278682 :         agno = XFS_FSB_TO_AGNO(mp, fsb);
    1286    54278682 :         pag = xfs_perag_get(mp, agno);
    1287   108557338 :         if (xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) ||
    1288    54278660 :             xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA))
    1289             :                 error = -ENOSPC;
    1290    54278674 :         xfs_perag_put(pag);
    1291    54278674 :         return error;
    1292             : }
    1293             : 
    1294             : /*
    1295             :  * Remap the given extent into the file.  The dmap blockcount will be set to
    1296             :  * the number of blocks that were actually remapped.
                     :  *
                     :  * @ip is the destination inode.  @dmap is the mapping to install, with
                     :  * br_startoff already set to the destination offset.  "smap" below is
                     :  * the mapping that @ip currently has at that offset.  @new_isize is the
                     :  * upper bound for any i_size update made here.
                     :  *
                     :  * There is no xfs_ilock() call in this function: the unlock at
                     :  * out_unlock is expected to pair with the lock taken by
                     :  * xfs_trans_alloc_inode(), which is why a failed allocation jumps
                     :  * straight to "out" instead.
                     :  *
                     :  * The transaction is cancelled with nothing changed (apart from trimming
                     :  * dmap->br_blockcount) when both mappings have the same start block or
                     :  * are both unwritten.  Same start block but different state is treated
                     :  * as corruption: the data fork is marked sick and -EFSCORRUPTED is
                     :  * returned.
                     :  *
                     :  * Returns 0 or a negative errno; every error is also reported through
                     :  * the xfs_reflink_remap_extent_error tracepoint.
    1297             :  */
    1298             : STATIC int
    1299   287373995 : xfs_reflink_remap_extent(
    1300             :         struct xfs_inode        *ip,
    1301             :         struct xfs_bmbt_irec    *dmap,
    1302             :         xfs_off_t               new_isize)
    1303             : {
    1304   287373995 :         struct xfs_bmbt_irec    smap;
    1305   287373995 :         struct xfs_mount        *mp = ip->i_mount;
    1306   287373995 :         struct xfs_trans        *tp;
    1307   287373995 :         xfs_off_t               newlen;
    1308   287373995 :         int64_t                 qdelta = 0;
    1309   287373995 :         unsigned int            dblocks, rblocks, resblks;
    1310   287373995 :         bool                    quota_reserved = true;
    1311   287373995 :         bool                    smap_real;
    1312   287373995 :         bool                    dmap_written = xfs_bmap_is_written_extent(dmap);
    1313   287373995 :         bool                    isrt = XFS_IS_REALTIME_INODE(ip);
    1314   287373995 :         int                     iext_delta = 0;
    1315   287373995 :         int                     nimaps;
    1316   287373995 :         int                     error;
    1317             : 
    1318             :         /*
    1319             :          * Start a rolling transaction to switch the mappings.
    1320             :          *
    1321             :          * Adding a written extent to the extent map can cause a bmbt split,
    1322             :          * and removing a mapped extent from the map can also cause a split.
    1323             :          * The two operations cannot both cause a split since they operate on
    1324             :          * the same index in the bmap btree, so we only need a reservation for
    1325             :          * one bmbt split if either thing is happening.  However, we haven't
    1326             :          * locked the inode yet, so we reserve assuming this is the case.
    1327             :          *
    1328             :          * The first allocation call tries to reserve enough space to handle
    1329             :          * mapping dmap into a sparse part of the file plus the bmbt split.  We
    1330             :          * haven't locked the inode or read the existing mapping yet, so we do
    1331             :          * not know for sure that we need the space.  This should succeed most
    1332             :          * of the time.
    1333             :          *
    1334             :          * If the first attempt fails, try again but reserving only enough
    1335             :          * space to handle a bmbt split.  This is the hard minimum requirement,
    1336             :          * and we revisit quota reservations later when we know more about what
    1337             :          * we're remapping.
    1338             :          */
    1339   287373995 :         resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
    1340   287373995 :         if (XFS_IS_REALTIME_INODE(ip)) {
    1341   100457850 :                 dblocks = resblks;
    1342   100457850 :                 rblocks = dmap->br_blockcount;
    1343             :         } else {
    1344   186916300 :                 dblocks = resblks + dmap->br_blockcount;
    1345   186916300 :                 rblocks = 0;
    1346             :         }
    1347   287373995 :         error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
    1348             :                         dblocks, rblocks, false, &tp);
    1349   287379650 :         if (error == -EDQUOT || error == -ENOSPC) {
    1350     2374348 :                 quota_reserved = false;
    1351     2374348 :                 error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
    1352             :                                 resblks, 0, false, &tp);
    1353             :         }
    1354   287379320 :         if (error)
    1355      596854 :                 goto out;
    1356             : 
    1357             :         /*
    1358             :          * Read what's currently mapped in the destination file into smap.
    1359             :          * If smap isn't a hole, we will have to remove it before we can add
    1360             :          * dmap to the destination file.
    1361             :          */
    1362   286782466 :         nimaps = 1;
    1363   286782466 :         error = xfs_bmapi_read(ip, dmap->br_startoff, dmap->br_blockcount,
    1364             :                         &smap, &nimaps, 0);
    1365   286780343 :         if (error)
    1366           9 :                 goto out_cancel;
    1367   286780334 :         ASSERT(nimaps == 1 && smap.br_startoff == dmap->br_startoff);
    1368   286780334 :         smap_real = xfs_bmap_is_real_extent(&smap);
    1369             : 
    1370             :         /*
    1371             :          * We can only remap as many blocks as the smaller of the two extent
    1372             :          * maps, because we can only remap one extent at a time.
    1373             :          */
    1374   286780334 :         dmap->br_blockcount = min(dmap->br_blockcount, smap.br_blockcount);
    1375   286780334 :         ASSERT(dmap->br_blockcount == smap.br_blockcount);
    1376             : 
    1377   286780334 :         trace_xfs_reflink_remap_extent_dest(ip, &smap);
    1378             : 
    1379             :         /*
    1380             :          * Two extents mapped to the same physical block must not have
    1381             :          * different states; that's filesystem corruption.  Move on to the next
    1382             :          * extent if they're both holes or both the same physical extent.
    1383             :          */
    1384   286778070 :         if (dmap->br_startblock == smap.br_startblock) {
    1385   202840393 :                 if (dmap->br_state != smap.br_state) {
    1386           0 :                         xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
    1387           0 :                         error = -EFSCORRUPTED;
    1388             :                 }
    1389   202840393 :                 goto out_cancel;
    1390             :         }
    1391             : 
    1392             :         /* If both extents are unwritten, leave them alone. */
    1393    83937677 :         if (dmap->br_state == XFS_EXT_UNWRITTEN &&
    1394     7858778 :             smap.br_state == XFS_EXT_UNWRITTEN)
    1395      301248 :                 goto out_cancel;
    1396             : 
    1397             :         /* No reflinking if the AG of the dest mapping is low on space. */
    1398    83636429 :         if (dmap_written) {
    1399    61781223 :                 error = xfs_reflink_ag_has_free_space(mp, ip,
    1400             :                                 dmap->br_startblock);
    1401    61781196 :                 if (error)
    1402           6 :                         goto out_cancel;
    1403             :         }
    1404             : 
    1405             :         /*
    1406             :          * Increase quota reservation if we think the quota block counter for
    1407             :          * this file could increase.
    1408             :          *
    1409             :          * If we are mapping a written extent into the file, we need to have
    1410             :          * enough quota block count reservation to handle the blocks in that
    1411             :          * extent.  We log only the delta to the quota block counts, so if the
    1412             :          * extent we're unmapping also has blocks allocated to it, we don't
    1413             :          * need a quota reservation for the extent itself.
    1414             :          *
    1415             :          * Note that if we're replacing a delalloc reservation with a written
    1416             :          * extent, we have to take the full quota reservation because removing
    1417             :          * the delalloc reservation gives the block count back to the quota
    1418             :          * count.  This is suboptimal, but the VFS flushed the dest range
    1419             :          * before we started.  That should have removed all the delalloc
    1420             :          * reservations, but we code defensively.
    1421             :          *
    1422             :          * xfs_trans_alloc_inode above already tried to grab an even larger
    1423             :          * quota reservation, and kicked off a blockgc scan if it couldn't.
    1424             :          * If we can't get a potentially smaller quota reservation now, we're
    1425             :          * done.
    1426             :          */
    1427    83636396 :         if (!quota_reserved && !smap_real && dmap_written) {
    1428       17560 :                 if (XFS_IS_REALTIME_INODE(ip)) {
    1429           0 :                         dblocks = 0;
    1430           0 :                         rblocks = dmap->br_blockcount;
    1431             :                 } else {
    1432       17560 :                         dblocks = dmap->br_blockcount;
    1433       17560 :                         rblocks = 0;
    1434             :                 }
    1435       17560 :                 error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, rblocks,
    1436             :                                 false);
    1437       17560 :                 if (error)
    1438           0 :                         goto out_cancel;
    1439             :         }
    1440             : 
    1441    83636396 :         if (smap_real)
    1442    15551352 :                 ++iext_delta;
    1443             : 
    1444    83636396 :         if (dmap_written)
    1445    61781193 :                 ++iext_delta;
    1446             : 
    1447    83636396 :         error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, iext_delta);
    1448    83636241 :         if (error == -EFBIG)
    1449           6 :                 error = xfs_iext_count_upgrade(tp, ip, iext_delta);
    1450    83636241 :         if (error)
    1451           6 :                 goto out_cancel;
    1452             : 
    1453    83636235 :         if (smap_real) {
    1454             :                 /*
    1455             :                  * If the extent we're unmapping is backed by storage (written
    1456             :                  * or not), unmap the extent and drop its refcount.
    1457             :                  */
    1458    15551212 :                 xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &smap);
    1459    15550755 :                 xfs_refcount_decrease_extent(tp, isrt, &smap);
    1460    15550889 :                 qdelta -= smap.br_blockcount;
    1461    68085023 :         } else if (smap.br_startblock == DELAYSTARTBLOCK) {
    1462          79 :                 int             done;
    1463             : 
    1464             :                 /*
    1465             :                  * If the extent we're unmapping is a delalloc reservation,
    1466             :                  * we can use the regular bunmapi function to release the
    1467             :                  * incore state.  Dropping the delalloc reservation takes care
    1468             :                  * of the quota reservation for us.
    1469             :                  */
    1470          79 :                 error = xfs_bunmapi(NULL, ip, smap.br_startoff,
    1471             :                                 smap.br_blockcount, 0, 1, &done);
    1472          79 :                 if (error)
    1473           0 :                         goto out_cancel;
    1474          79 :                 ASSERT(done);
    1475             :         }
    1476             : 
    1477             :         /*
    1478             :          * If the extent we're sharing is backed by written storage, increase
    1479             :          * its refcount and map it into the file.
    1480             :          */
    1481    83635912 :         if (dmap_written) {
    1482    61781139 :                 xfs_refcount_increase_extent(tp, isrt, dmap);
    1483    61781180 :                 xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, dmap);
    1484    61781152 :                 qdelta += dmap->br_blockcount;
    1485             :         }
    1486             : 
    1487    83635925 :         xfs_reflink_update_quota(tp, ip, false, qdelta);
    1488             : 
    1489             :         /* Update dest isize if needed. */
    1490    83635694 :         newlen = XFS_FSB_TO_B(mp, dmap->br_startoff + dmap->br_blockcount);
    1491    83635694 :         newlen = min_t(xfs_off_t, newlen, new_isize);
    1492    83635694 :         if (newlen > i_size_read(VFS_I(ip))) {
    1493    53800692 :                 trace_xfs_reflink_update_inode_size(ip, newlen);
    1494    53800692 :                 i_size_write(VFS_I(ip), newlen);
    1495    53800692 :                 ip->i_disk_size = newlen;
    1496    53800692 :                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
    1497             :         }
    1498             : 
    1499             :         /* Commit everything and unlock. */
    1500    83635694 :         error = xfs_trans_commit(tp);
    1501    83636702 :         goto out_unlock;
    1502             : 
    1503   203141662 : out_cancel:
    1504   203141662 :         xfs_trans_cancel(tp);
    1505   286778787 : out_unlock:
    1506   286778787 :         xfs_iunlock(ip, XFS_ILOCK_EXCL);
    1507   287368429 : out:
    1508   287368429 :         if (error)
    1509      597670 :                 trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
    1510   287368438 :         return error;
    1511             : }
    1512             : 
    1513             : /*
                     :  * Remap a range of one file to the other.
                     :  *
                     :  * Shares @remap_len bytes of @src starting at byte @pos_in into @dest at
                     :  * byte @pos_out.  The offsets and the length are converted to filesystem
                     :  * block units (XFS_B_TO_FSBT for the offsets, XFS_B_TO_FSB for the
                     :  * length, which is also capped at XFS_MAX_FILEOFF).
                     :  *
                     :  * Each pass of the loop reads one mapping from @src under the data map
                     :  * lock, retargets it at the current destination offset and hands it to
                     :  * xfs_reflink_remap_extent(), which may shorten it.  All cursors then
                     :  * advance by the resulting br_blockcount.  The loop ends early on any
                     :  * error, with -EFSCORRUPTED if @src still has a delalloc mapping in the
                     :  * range, or with -EINTR if a fatal signal is pending.  The signal check
                     :  * runs before the cursors advance, so the extent remapped in that pass
                     :  * is not counted.
                     :  *
                     :  * *@remapped is always written, even on error: it is the number of bytes
                     :  * accounted as remapped so far, capped at @remap_len.
                     :  *
                     :  * Returns 0 or a negative errno; errors are also reported through the
                     :  * xfs_reflink_remap_blocks_error tracepoint against @dest.
                     :  */
    1514             : int
    1515   197409030 : xfs_reflink_remap_blocks(
    1516             :         struct xfs_inode        *src,
    1517             :         loff_t                  pos_in,
    1518             :         struct xfs_inode        *dest,
    1519             :         loff_t                  pos_out,
    1520             :         loff_t                  remap_len,
    1521             :         loff_t                  *remapped)
    1522             : {
    1523   197409030 :         struct xfs_bmbt_irec    imap;
    1524   197409030 :         struct xfs_mount        *mp = src->i_mount;
    1525   197409030 :         xfs_fileoff_t           srcoff = XFS_B_TO_FSBT(mp, pos_in);
    1526   197409030 :         xfs_fileoff_t           destoff = XFS_B_TO_FSBT(mp, pos_out);
    1527   197409030 :         xfs_filblks_t           len;
    1528   197409030 :         xfs_filblks_t           remapped_len = 0;
    1529   197409030 :         xfs_off_t               new_isize = pos_out + remap_len;
    1530   197409030 :         int                     nimaps;
    1531   197409030 :         int                     error = 0;
    1532             : 
    1533   197409030 :         len = min_t(xfs_filblks_t, XFS_B_TO_FSB(mp, remap_len),
    1534             :                         XFS_MAX_FILEOFF);
    1535             : 
    1536             :         /*
    1537             :          * Make sure the end is aligned with a rt extent (if desired), since
    1538             :          * the end of the range could be EOF.
    1539             :          */
    1540   197409030 :         if (xfs_inode_has_bigrtextents(dest))
    1541           0 :                 len = xfs_rtb_roundup_rtx(mp, len);
    1542             : 
    1543   197409030 :         trace_xfs_reflink_remap_blocks(src, srcoff, len, dest, destoff);
    1544             : 
    1545   484180453 :         while (len > 0) {
    1546   287358801 :                 unsigned int    lock_mode;
    1547             : 
    1548             :                 /* Read extent from the source file */
    1549   287358801 :                 nimaps = 1;
    1550   287358801 :                 lock_mode = xfs_ilock_data_map_shared(src);
    1551   287366027 :                 error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0);
    1552   287371356 :                 xfs_iunlock(src, lock_mode);
    1553   287375642 :                 if (error)
    1554             :                         break;
    1555             :                 /*
    1556             :                  * The caller supposedly flushed all dirty pages in the source
    1557             :                  * file range, which means that writeback should have allocated
    1558             :                  * or deleted all delalloc reservations in that range.  If we
    1559             :                  * find one, that's a good sign that something is seriously
    1560             :                  * wrong here.
    1561             :                  */
    1562   287375581 :                 ASSERT(nimaps == 1 && imap.br_startoff == srcoff);
    1563   287375581 :                 if (imap.br_startblock == DELAYSTARTBLOCK) {
    1564           0 :                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
    1565           0 :                         xfs_bmap_mark_sick(src, XFS_DATA_FORK);
    1566           0 :                         error = -EFSCORRUPTED;
    1567           0 :                         break;
    1568             :                 }
    1569             : 
    1570   287375581 :                 trace_xfs_reflink_remap_extent_src(src, &imap);
    1571             : 
    1572             :                 /* Remap into the destination file at the given offset. */
    1573   287376186 :                 imap.br_startoff = destoff;
    1574   287376186 :                 error = xfs_reflink_remap_extent(dest, &imap, new_isize);
    1575   287365658 :                 if (error)
    1576             :                         break;
    1577             : 
    1578   286767980 :                 if (fatal_signal_pending(current)) {
    1579             :                         error = -EINTR;
    1580             :                         break;
    1581             :                 }
    1582             : 
    1583             :                 /* Advance drange/srange */
    1584   286771423 :                 srcoff += imap.br_blockcount;
    1585   286771423 :                 destoff += imap.br_blockcount;
    1586   286771423 :                 len -= imap.br_blockcount;
    1587   286771423 :                 remapped_len += imap.br_blockcount;
    1588             :         }
    1589             : 
    1590   197424485 :         if (error)
    1591      597904 :                 trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_);
    1592   197424485 :         *remapped = min_t(loff_t, remap_len,
    1593             :                           XFS_FSB_TO_B(src->i_mount, remapped_len));
    1594   197424485 :         return error;
    1595             : }
    1596             : 
    1597             : /*
    1598             :  * If we're reflinking to a point past the destination file's EOF, we must
    1599             :  * zero any speculative post-EOF preallocations that sit between the old EOF
    1600             :  * and the destination file offset.
    1601             :  */
    1602             : static int
    1603   197424030 : xfs_reflink_zero_posteof(
    1604             :         struct xfs_inode        *ip,
    1605             :         loff_t                  pos)
    1606             : {
    1607   197424030 :         loff_t                  isize = i_size_read(VFS_I(ip));
    1608             : 
    1609   197424030 :         if (pos <= isize)
    1610             :                 return 0;
    1611             : 
    1612     3107520 :         trace_xfs_zero_eof(ip, isize, pos - isize);
    1613     3107518 :         return xfs_zero_range(ip, isize, pos - isize, NULL);
    1614             : }
    1615             : 
    1616             : #ifdef CONFIG_XFS_RT
    1617             : /* Adjust the length of the remap operation to end on a rt extent boundary. */
    1618             : STATIC int
    1619           0 : xfs_reflink_remap_adjust_rtlen(
    1620             :         struct xfs_inode        *src,
    1621             :         loff_t                  pos_in,
    1622             :         struct xfs_inode        *dest,
    1623             :         loff_t                  pos_out,
    1624             :         loff_t                  *len,
    1625             :         unsigned int            remap_flags)
    1626             : {
    1627           0 :         struct xfs_mount        *mp = src->i_mount;
    1628           0 :         uint32_t                mod;
    1629             : 
    1630           0 :         div_u64_rem(*len, XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize), &mod);
    1631             : 
    1632             :         /*
    1633             :          * We previously checked the rtextent alignment of both offsets, so we
    1634             :          * now have to check the alignment of the length.  The VFS remap prep
    1635             :          * function can change the length on us, so we can only make length
    1636             :          * adjustments after that.  If the length is aligned to an rtextent,
    1637             :          * we're trivially good to go.
    1638             :          *
    1639             :          * Otherwise, the length is not aligned to an rt extent.  If the source
    1640             :          * file's range ends at EOF, the VFS ensured that the dest file's range
    1641             :          * also ends at EOF.  The actual remap function will round the (byte)
    1642             :          * length up to the nearest rtextent unit, so we're ok here too.
    1643             :          */
    1644           0 :         if (mod == 0 || pos_in + *len == i_size_read(VFS_I(src)))
    1645             :                 return 0;
    1646             : 
    1647             :         /*
    1648             :          * Otherwise, the only thing we can do is round the request length down
    1649             :          * to an rt extent boundary.  If the caller doesn't allow that, we are
    1650             :          * finished.
    1651             :          */
    1652           0 :         if (!(remap_flags & REMAP_FILE_CAN_SHORTEN))
    1653             :                 return -EINVAL;
    1654             : 
    1655             :         /* Round the length down to the previous rt extent boundary. */
    1656           0 :         (*len) -= mod;
    1657           0 :         trace_xfs_reflink_remap_adjust_rtlen(src, pos_in, *len, dest, pos_out);
    1658           0 :         return 0;
    1659             : }
    1660             : #else
    1661             : # define xfs_reflink_remap_adjust_rtlen(...)            (0)
    1662             : #endif /* CONFIG_XFS_RT */
    1663             : 
    1664             : /*
    1665             :  * Check the alignment of a remap request when the allocation unit size isn't a
    1666             :  * power of two.  The VFS helpers use (fast) bitmask-based alignment checks,
    1667             :  * but here we have to use slow long division.
    1668             :  */
    1669             : static int
    1670           0 : xfs_reflink_remap_check_rtalign(
    1671             :         struct xfs_inode                *ip_in,
    1672             :         loff_t                          pos_in,
    1673             :         struct xfs_inode                *ip_out,
    1674             :         loff_t                          pos_out,
    1675             :         loff_t                          *req_len,
    1676             :         unsigned int                    remap_flags)
    1677             : {
    1678           0 :         struct xfs_mount                *mp = ip_in->i_mount;
    1679           0 :         uint32_t                        rextbytes;
    1680           0 :         loff_t                          in_size, out_size;
    1681           0 :         loff_t                          new_length, length = *req_len;
    1682           0 :         loff_t                          blen;
    1683             : 
    1684           0 :         rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
    1685           0 :         in_size = i_size_read(VFS_I(ip_in));
    1686           0 :         out_size = i_size_read(VFS_I(ip_out));
    1687             : 
    1688             :         /* The start of both ranges must be aligned to a rt extent. */
    1689           0 :         if (!isaligned_64(pos_in, rextbytes) ||
    1690           0 :             !isaligned_64(pos_out, rextbytes))
    1691             :                 return -EINVAL;
    1692             : 
    1693           0 :         if (length == 0)
    1694           0 :                 length = in_size - pos_in;
    1695             : 
    1696             :         /*
    1697             :          * If the user wanted us to remap up to the infile's EOF, round up
    1698             :          * to the next rt extent boundary for this check.
    1699             :          *
    1700             :          * Otherwise, reject the range length if it's not extent aligned.  We
    1701             :          * already confirmed the starting offsets' extent alignment.
    1702             :          */
    1703           0 :         if (pos_in + length == in_size)
    1704           0 :                 blen = roundup_64(in_size, rextbytes) - pos_in;
    1705             :         else
    1706           0 :                 blen = rounddown_64(length, rextbytes);
    1707             : 
    1708             :         /* Don't allow overlapped remappings within the same file. */
    1709           0 :         if (ip_in == ip_out &&
    1710           0 :             pos_out + blen > pos_in &&
    1711           0 :             pos_in + blen > pos_out)
    1712             :                 return -EINVAL;
    1713             : 
    1714             :         /*
    1715             :          * Ensure that we don't remap a partial EOF extent into the middle
    1716             :          * of another file.
    1717             :          */
    1718           0 :         if (isaligned_64(length, rextbytes))
    1719             :                 return 0;
    1720             : 
    1721           0 :         new_length = length;
    1722           0 :         if (pos_out + length < out_size)
    1723           0 :                 new_length = rounddown_64(new_length, rextbytes);
    1724             : 
    1725           0 :         if (new_length == length)
    1726             :                 return 0;
    1727             : 
    1728             :         /*
    1729             :          * Return the shortened request if the caller permits it.  If the
    1730             :          * request was shortened to zero rt extents, we know that the original
    1731             :          * arguments weren't valid in the first place.
    1732             :          */
    1733           0 :         if ((remap_flags & REMAP_FILE_CAN_SHORTEN) && new_length > 0) {
    1734           0 :                 *req_len = new_length;
    1735           0 :                 return 0;
    1736             :         }
    1737             : 
    1738           0 :         return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
    1739             : }
    1740             : 
    1741             : /*
    1742             :  * Prepare two files for range cloning.  Upon a successful return both inodes
    1743             :  * will have the iolock and mmaplock held, the page cache of the out file will
    1744             :  * be truncated, and any leases on the out file will have been broken.  This
    1745             :  * function borrows heavily from xfs_file_aio_write_checks.
    1746             :  *
    1747             :  * The VFS allows partial EOF blocks to "match" for dedupe even though it hasn't
    1748             :  * checked that the bytes beyond EOF physically match. Hence we cannot use the
    1749             :  * EOF block in the source dedupe range because it's not a complete block match,
    1750             :  * and hence can introduce a corruption into the file that has its block replaced.
    1751             :  *
    1752             :  * In similar fashion, the VFS file cloning also allows partial EOF blocks to be
    1753             :  * "block aligned" for the purposes of cloning entire files.  However, if the
    1754             :  * source file range includes the EOF block and it lands within the existing EOF
    1755             :  * of the destination file, then we can expose stale data from beyond the source
    1756             :  * file EOF in the destination file.
    1757             :  *
    1758             :  * XFS doesn't support partial block sharing, so in both cases we have to check
    1759             :  * these cases ourselves. For dedupe, we can simply round the length to dedupe
    1760             :  * down to the previous whole block and ignore the partial EOF block. While this
    1761             :  * means we can't dedupe the last block of a file, this is an acceptable
    1762             :  * tradeoff for simplicity of implementation.
    1763             :  *
    1764             :  * For cloning, we want to share the partial EOF block if it is also the new EOF
    1765             :  * block of the destination file. If the partial EOF block lies inside the
    1766             :  * existing destination EOF, then we have to abort the clone to avoid exposing
    1767             :  * stale data in the destination file. Hence we reject these clone attempts with
    1768             :  * -EINVAL in this case.
    1769             :  */
    1770             : int
    1771   272702656 : xfs_reflink_remap_prep(
    1772             :         struct file             *file_in,
    1773             :         loff_t                  pos_in,
    1774             :         struct file             *file_out,
    1775             :         loff_t                  pos_out,
    1776             :         loff_t                  *len,
    1777             :         unsigned int            remap_flags)
    1778             : {
    1779   272702656 :         struct inode            *inode_in = file_inode(file_in);
    1780   272702656 :         struct xfs_inode        *src = XFS_I(inode_in);
    1781   272702656 :         struct inode            *inode_out = file_inode(file_out);
    1782   272702656 :         struct xfs_inode        *dest = XFS_I(inode_out);
    1783   272702656 :         const struct iomap_ops  *dax_read_ops = NULL;
    1784   272702656 :         unsigned int            alloc_unit = xfs_inode_alloc_unitsize(dest);
    1785   272701536 :         int                     ret;
    1786             : 
    1787             :         /* Lock both files against IO */
    1788   272701536 :         ret = xfs_ilock2_io_mmap(src, dest);
    1789   272722455 :         if (ret)
    1790             :                 return ret;
    1791             : 
    1792             :         /* Check file eligibility and prepare for block sharing. */
    1793   272722455 :         ret = -EINVAL;
    1794             :         /* Can't reflink between data and rt volumes */
    1795   588288367 :         if (XFS_IS_REALTIME_INODE(src) != XFS_IS_REALTIME_INODE(dest))
    1796           0 :                 goto out_unlock;
    1797             : 
    1798             :         /* Don't share DAX file data with non-DAX file. */
    1799   272722455 :         if (IS_DAX(inode_in) != IS_DAX(inode_out))
    1800             :                 goto out_unlock;
    1801             : 
    1802             :         /* Check non-power of two alignment issues, if necessary. */
    1803   387660867 :         if (XFS_IS_REALTIME_INODE(dest) && !is_power_of_2(alloc_unit)) {
    1804           0 :                 ret = xfs_reflink_remap_check_rtalign(src, pos_in, dest,
    1805             :                                 pos_out, len, remap_flags);
    1806           0 :                 if (ret)
    1807           0 :                         goto out_unlock;
    1808             : 
    1809             :                 /* Do the VFS checks with the regular block alignment. */
    1810           0 :                 alloc_unit = src->i_mount->m_sb.sb_blocksize;
    1811             :         }
    1812             : 
    1813   272722455 :         if (IS_DAX(inode_in))
    1814             :                 dax_read_ops = &xfs_read_iomap_ops;
    1815             : 
    1816   272722455 :         ret = __generic_remap_file_range_prep(file_in, pos_in, file_out,
    1817             :                         pos_out, len, remap_flags, dax_read_ops, alloc_unit);
    1818   272720854 :         if (ret || *len == 0)
    1819    75293322 :                 goto out_unlock;
    1820             : 
    1821             :         /* Make sure the end is aligned with a rt extent. */
    1822   197427532 :         if (xfs_inode_has_bigrtextents(src)) {
    1823           0 :                 ret = xfs_reflink_remap_adjust_rtlen(src, pos_in, dest,
    1824             :                                 pos_out, len, remap_flags);
    1825           0 :                 if (ret || *len == 0)
    1826           0 :                         goto out_unlock;
    1827             :         }
    1828             : 
    1829             :         /* Attach dquots to dest inode before changing block map */
    1830   197427532 :         ret = xfs_qm_dqattach(dest);
    1831   197425805 :         if (ret)
    1832           0 :                 goto out_unlock;
    1833             : 
    1834             :         /*
    1835             :          * Zero existing post-eof speculative preallocations in the destination
    1836             :          * file.
    1837             :          */
    1838   197425805 :         ret = xfs_reflink_zero_posteof(dest, pos_out);
    1839   197425198 :         if (ret)
    1840         225 :                 goto out_unlock;
    1841             : 
    1842             :         /* Set flags and remap blocks. */
    1843   197424973 :         ret = xfs_reflink_set_inode_flag(src, dest);
    1844   197424085 :         if (ret)
    1845           5 :                 goto out_unlock;
    1846             : 
    1847             :         /*
    1848             :          * Now that we've marked both inodes for reflink, make sure that all
    1849             :          * possible rt extents in both files' ranges are either wholly written,
    1850             :          * wholly unwritten, or holes.  The bmap code requires that we align
    1851             :          * all unmap and remap requests to a rt extent boundary.  We've already
    1852             :          * flushed the page cache and finished directio for the range that's
    1853             :          * being remapped, so we can convert the extents directly.
    1854             :          */
    1855   197424080 :         if (xfs_inode_has_bigrtextents(src)) {
    1856           0 :                 ret = xfs_rtfile_convert_unwritten(src, pos_in, *len);
    1857           0 :                 if (ret)
    1858           0 :                         goto out_unlock;
    1859             :         }
    1860   197424080 :         if (xfs_inode_has_bigrtextents(dest)) {
    1861           0 :                 ret = xfs_rtfile_convert_unwritten(dest, pos_out, *len);
    1862           0 :                 if (ret)
    1863           0 :                         goto out_unlock;
    1864             :         }
    1865             : 
    1866             :         /*
    1867             :          * If pos_out > EOF, we may have dirtied blocks between EOF and
    1868             :          * pos_out. In that case, we need to extend the flush and unmap to cover
    1869             :          * from EOF to the end of the copy length.
    1870             :          */
    1871   394848160 :         if (pos_out > XFS_ISIZE(dest)) {
    1872     3105240 :                 loff_t  flen = *len + (pos_out - XFS_ISIZE(dest));
    1873     3105240 :                 ret = xfs_flush_unmap_range(dest, XFS_ISIZE(dest), flen);
    1874             :         } else {
    1875   194318840 :                 ret = xfs_flush_unmap_range(dest, pos_out, *len);
    1876             :         }
    1877   197418404 :         if (ret)
    1878         916 :                 goto out_unlock;
    1879             : 
    1880             :         return 0;
    1881    75294468 : out_unlock:
    1882    75294468 :         xfs_iunlock2_io_mmap(src, dest);
    1883    75294468 :         return ret;
    1884             : }
    1885             : 
    1886             : /* Does this inode need the reflink flag? */
    1887             : int
    1888    23128660 : xfs_reflink_inode_has_shared_extents(
    1889             :         struct xfs_trans                *tp,
    1890             :         struct xfs_inode                *ip,
    1891             :         bool                            *has_shared)
    1892             : {
    1893    23128660 :         struct xfs_bmbt_irec            got;
    1894    23128660 :         struct xfs_mount                *mp = ip->i_mount;
    1895    23128660 :         struct xfs_ifork                *ifp;
    1896    23128660 :         struct xfs_iext_cursor          icur;
    1897    23128660 :         bool                            found;
    1898    23128660 :         int                             error;
    1899             : 
    1900    23128660 :         ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
    1901    23128660 :         error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
    1902    23128967 :         if (error)
    1903             :                 return error;
    1904             : 
    1905    23128974 :         *has_shared = false;
    1906    23128974 :         found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got);
    1907   822691872 :         while (found) {
    1908   801131148 :                 xfs_agblock_t           rbno;
    1909   801131148 :                 xfs_extlen_t            rlen;
    1910             : 
    1911   801131148 :                 if (isnullstartblock(got.br_startblock) ||
    1912   801128710 :                     got.br_state != XFS_EXT_NORM)
    1913     9685066 :                         goto next;
    1914             : 
    1915   798887781 :                 if (XFS_IS_REALTIME_INODE(ip)) {
    1916     7440697 :                         struct xfs_rtgroup      *rtg;
    1917     7440697 :                         xfs_rgnumber_t          rgno;
    1918     7440697 :                         xfs_rgblock_t           rgbno;
    1919             : 
    1920     7440697 :                         rgbno = xfs_rtb_to_rgbno(mp, got.br_startblock, &rgno);
    1921     7440741 :                         rtg = xfs_rtgroup_get(mp, rgno);
    1922     7441045 :                         error = xfs_reflink_find_rtshared(rtg, tp, rgbno,
    1923     7441045 :                                         got.br_blockcount, &rbno, &rlen,
    1924             :                                         false);
    1925     7441501 :                         xfs_rtgroup_put(rtg);
    1926             :                 } else {
    1927   784005385 :                         struct xfs_perag        *pag;
    1928   784005385 :                         xfs_agblock_t           agbno;
    1929             : 
    1930   784005385 :                         pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp,
    1931             :                                                 got.br_startblock));
    1932   784006100 :                         agbno = XFS_FSB_TO_AGBNO(mp, got.br_startblock);
    1933   784006100 :                         error = xfs_reflink_find_shared(pag, tp, agbno,
    1934   784006100 :                                         got.br_blockcount, &rbno, &rlen,
    1935             :                                         false);
    1936   784009943 :                         xfs_perag_put(pag);
    1937             :                 }
    1938   791450100 :                 if (error)
    1939     1568235 :                         return error;
    1940             : 
    1941             :                 /* Is there still a shared block here? */
    1942   791450100 :                 if (rbno != NULLAGBLOCK) {
    1943     1568235 :                         *has_shared = true;
    1944     1568235 :                         return 0;
    1945             :                 }
    1946   789881865 : next:
    1947   799566931 :                 found = xfs_iext_next_extent(ifp, &icur, &got);
    1948             :         }
    1949             : 
    1950             :         return 0;
    1951             : }
    1952             : 
    1953             : /*
    1954             :  * Clear the inode reflink flag if there are no shared extents.
    1955             :  *
    1956             :  * The caller is responsible for joining the inode to the transaction passed in.
    1957             :  * The inode will be joined to the transaction that is returned to the caller.
    1958             :  */
    1959             : int
    1960       14390 : xfs_reflink_clear_inode_flag(
    1961             :         struct xfs_inode        *ip,
    1962             :         struct xfs_trans        **tpp)
    1963             : {
    1964       14390 :         bool                    needs_flag;
    1965       14390 :         int                     error = 0;
    1966             : 
    1967       14390 :         ASSERT(xfs_is_reflink_inode(ip));
    1968             : 
    1969       14390 :         error = xfs_reflink_inode_has_shared_extents(*tpp, ip, &needs_flag);
    1970       14390 :         if (error || needs_flag)
    1971             :                 return error;
    1972             : 
    1973             :         /*
    1974             :          * We didn't find any shared blocks so turn off the reflink flag.
    1975             :          * First, get rid of any leftover CoW mappings.
    1976             :          */
    1977        7473 :         error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, XFS_MAX_FILEOFF,
    1978             :                         true);
    1979        7473 :         if (error)
    1980             :                 return error;
    1981             : 
    1982             :         /* Clear the inode flag. */
    1983        7473 :         trace_xfs_reflink_unset_inode_flag(ip);
    1984        7473 :         ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
    1985        7473 :         xfs_inode_clear_cowblocks_tag(ip);
    1986        7473 :         xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
    1987             : 
    1988        7473 :         return error;
    1989             : }
    1990             : 
    1991             : /*
    1992             :  * Clear the inode reflink flag if there are no shared extents and the size
    1993             :  * hasn't changed.
    1994             :  */
    1995             : STATIC int
    1996          38 : xfs_reflink_try_clear_inode_flag(
    1997             :         struct xfs_inode        *ip)
    1998             : {
    1999          38 :         struct xfs_mount        *mp = ip->i_mount;
    2000          38 :         struct xfs_trans        *tp;
    2001          38 :         int                     error = 0;
    2002             : 
    2003             :         /* Start a rolling transaction to remove the mappings */
    2004          38 :         error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
    2005          38 :         if (error)
    2006             :                 return error;
    2007             : 
    2008          38 :         xfs_ilock(ip, XFS_ILOCK_EXCL);
    2009          38 :         xfs_trans_ijoin(tp, ip, 0);
    2010             : 
    2011          38 :         error = xfs_reflink_clear_inode_flag(ip, &tp);
    2012          38 :         if (error)
    2013           0 :                 goto cancel;
    2014             : 
    2015          38 :         error = xfs_trans_commit(tp);
    2016          38 :         if (error)
    2017           0 :                 goto out;
    2018             : 
    2019          38 :         xfs_iunlock(ip, XFS_ILOCK_EXCL);
    2020          38 :         return 0;
    2021             : cancel:
    2022           0 :         xfs_trans_cancel(tp);
    2023           0 : out:
    2024           0 :         xfs_iunlock(ip, XFS_ILOCK_EXCL);
    2025           0 :         return error;
    2026             : }
    2027             : 
    2028             : /*
    2029             :  * Pre-COW all shared blocks within a given byte range of a file and turn off
    2030             :  * the reflink flag if we unshare all of the file's blocks.
    2031             :  */
    2032             : int
    2033          66 : xfs_reflink_unshare(
    2034             :         struct xfs_inode        *ip,
    2035             :         xfs_off_t               offset,
    2036             :         xfs_off_t               len)
    2037             : {
    2038          66 :         struct inode            *inode = VFS_I(ip);
    2039          66 :         int                     error;
    2040             : 
    2041          66 :         if (!xfs_is_reflink_inode(ip))
    2042             :                 return 0;
    2043             : 
    2044          40 :         trace_xfs_reflink_unshare(ip, offset, len);
    2045             : 
    2046          40 :         inode_dio_wait(inode);
    2047             : 
    2048          40 :         if (IS_DAX(inode))
    2049             :                 error = dax_file_unshare(inode, offset, len,
    2050             :                                 &xfs_dax_write_iomap_ops);
    2051             :         else
    2052          40 :                 error = iomap_file_unshare(inode, offset, len,
    2053             :                                 &xfs_buffered_write_iomap_ops);
    2054          40 :         if (error)
    2055           0 :                 goto out;
    2056             : 
    2057          40 :         error = filemap_write_and_wait_range(inode->i_mapping, offset,
    2058          40 :                         offset + len - 1);
    2059          40 :         if (error)
    2060           2 :                 goto out;
    2061             : 
    2062             :         /* Turn off the reflink flag if possible. */
    2063          38 :         error = xfs_reflink_try_clear_inode_flag(ip);
    2064          38 :         if (error)
    2065           0 :                 goto out;
    2066             :         return 0;
    2067             : 
    2068           2 : out:
    2069           2 :         trace_xfs_reflink_unshare_error(ip, error, _RET_IP_);
    2070           2 :         return error;
    2071             : }

Generated by: LCOV version 1.14