LCOV - code coverage report
Current view: top level - fs/xfs - xfs_xchgrange.c (source / functions) Hit Total Coverage
Test: fstests of 6.5.0-rc4-xfsx @ Mon Jul 31 20:08:34 PDT 2023 Lines: 475 509 93.3 %
Date: 2023-07-31 20:08:34 Functions: 22 22 100.0 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-or-later
       2             : /*
       3             :  * Copyright (C) 2020-2023 Oracle.  All Rights Reserved.
       4             :  * Author: Darrick J. Wong <djwong@kernel.org>
       5             :  *
       6             :  * The xfs_swap_extent_* functions are:
       7             :  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
       8             :  * Copyright (c) 2012 Red Hat, Inc.
       9             :  * All Rights Reserved.
      10             :  */
      11             : #include "xfs.h"
      12             : #include "xfs_shared.h"
      13             : #include "xfs_format.h"
      14             : #include "xfs_log_format.h"
      15             : #include "xfs_trans_resv.h"
      16             : #include "xfs_mount.h"
      17             : #include "xfs_defer.h"
      18             : #include "xfs_inode.h"
      19             : #include "xfs_trans.h"
      20             : #include "xfs_quota.h"
      21             : #include "xfs_bmap_util.h"
      22             : #include "xfs_bmap_btree.h"
      23             : #include "xfs_reflink.h"
      24             : #include "xfs_trace.h"
      25             : #include "xfs_swapext.h"
      26             : #include "xfs_xchgrange.h"
      27             : #include "xfs_sb.h"
      28             : #include "xfs_icache.h"
      29             : #include "xfs_log.h"
      30             : #include "xfs_rtalloc.h"
      31             : #include "xfs_rtbitmap.h"
      32             : #include <linux/fsnotify.h>
      33             : 
      34             : /*
      35             :  * Generic code for exchanging ranges of two files via XFS_IOC_EXCHANGE_RANGE.
      36             :  * This part does not deal with XFS-specific data structures, and may some day
      37             :  * be ported to the VFS.
      38             :  *
      39             :  * The goal is to exchange fxr.length bytes starting at fxr.file1_offset in
      40             :  * file1 with the same number of bytes starting at fxr.file2_offset in file2.
      41             :  * Implementations must call xfs_exch_range_prep to prepare the two files
      42             :  * prior to taking locks; they must call xfs_exch_range_check_fresh once
      43             :  * the inode is locked to abort the call if file2 has changed; and they must
      44             :  * update the inode change and mod times of both files as part of the metadata
      45             :  * update.  The timestamp updates must be done atomically as part of the data
      46             :  * exchange operation to ensure correctness of the freshness check.
      47             :  */
      48             : 
      49             : /*
      50             :  * Check that file2's metadata agrees with the snapshot that we took for
      51             :  * the range exchange request.
      52             :  *
      53             :  * This should be called after the filesystem has locked /all/ inode metadata
      54             :  * against modification.
      55             :  */
      56             : STATIC int
      57     3723718 : xfs_exch_range_check_fresh(
      58             :         struct inode                    *inode2,
      59             :         const struct xfs_exch_range     *fxr)
      60             : {
      61             :         /* Check that file2 hasn't otherwise been modified. */
      62     3723718 :         if ((fxr->flags & XFS_EXCH_RANGE_FILE2_FRESH) &&
      63       64241 :             (fxr->file2_ino        != inode2->i_ino ||
      64       64241 :              fxr->file2_ctime      != inode2->i_ctime.tv_sec  ||
      65       64240 :              fxr->file2_ctime_nsec != inode2->i_ctime.tv_nsec ||
      66       64211 :              fxr->file2_mtime      != inode2->i_mtime.tv_sec  ||
      67       64211 :              fxr->file2_mtime_nsec != inode2->i_mtime.tv_nsec))
      68          30 :                 return -EBUSY;
      69             : 
      70             :         return 0;
      71             : }
      72             : 
      73             : /* Performs necessary checks before doing a range exchange. */
      74             : STATIC int
      75     1862059 : xfs_exch_range_checks(
      76             :         struct file             *file1,
      77             :         struct file             *file2,
      78             :         struct xfs_exch_range   *fxr,
      79             :         unsigned int            blocksize)
      80             : {
      81     1862059 :         struct inode            *inode1 = file1->f_mapping->host;
      82     1862059 :         struct inode            *inode2 = file2->f_mapping->host;
      83     1862059 :         uint64_t                blkmask = blocksize - 1;
      84     1862059 :         int64_t                 test_len;
      85     1862059 :         uint64_t                blen;
      86     1862059 :         loff_t                  size1, size2;
      87     1862059 :         int                     error;
      88             : 
      89             :         /* Don't touch certain kinds of inodes */
      90     1862059 :         if (IS_IMMUTABLE(inode1) || IS_IMMUTABLE(inode2))
      91             :                 return -EPERM;
      92     1862049 :         if (IS_SWAPFILE(inode1) || IS_SWAPFILE(inode2))
      93             :                 return -ETXTBSY;
      94             : 
      95     1862038 :         size1 = i_size_read(inode1);
      96     1862038 :         size2 = i_size_read(inode2);
      97             : 
      98             :         /* Ranges cannot start after EOF. */
      99     1862038 :         if (fxr->file1_offset > size1 || fxr->file2_offset > size2)
     100             :                 return -EINVAL;
     101             : 
     102             :         /*
     103             :          * If the caller asked for full files, check that the offset/length
     104             :          * values cover all of both files.
     105             :          */
     106     1862002 :         if ((fxr->flags & XFS_EXCH_RANGE_FULL_FILES) &&
     107       31724 :             (fxr->file1_offset != 0 || fxr->file2_offset != 0 ||
     108       31724 :              fxr->length != size1 || fxr->length != size2))
     109             :                 return -EDOM;
     110             : 
     111             :         /*
     112             :          * If the caller said to exchange to EOF, we set the length of the
     113             :          * request large enough to cover everything to the end of both files.
     114             :          */
     115     1861983 :         if (fxr->flags & XFS_EXCH_RANGE_TO_EOF)
     116         366 :                 fxr->length = max_t(int64_t, size1 - fxr->file1_offset,
     117             :                                              size2 - fxr->file2_offset);
     118             : 
     119             :         /* The start of both ranges must be aligned to an fs block. */
     120     1861983 :         if (!IS_ALIGNED(fxr->file1_offset, blocksize) ||
     121     1861974 :             !IS_ALIGNED(fxr->file2_offset, blocksize))
     122             :                 return -EINVAL;
     123             : 
     124             :         /* Ensure offsets don't wrap. */
     125     1861974 :         if (fxr->file1_offset + fxr->length < fxr->file1_offset ||
     126     1861974 :             fxr->file2_offset + fxr->length < fxr->file2_offset)
     127             :                 return -EINVAL;
     128             : 
     129             :         /*
     130             :          * We require both ranges to be within EOF, unless we're exchanging
     131             :          * to EOF.  We already checked above that both fxr->file1_offset and
     132             :          * fxr->file2_offset are within EOF.
     133             :          */
     134     1861974 :         if (!(fxr->flags & XFS_EXCH_RANGE_TO_EOF) &&
     135     1861608 :             (fxr->file1_offset + fxr->length > size1 ||
     136     1861590 :              fxr->file2_offset + fxr->length > size2))
     137             :                 return -EINVAL;
     138             : 
     139             :         /*
     140             :          * Make sure we don't hit any file size limits.  If we hit any size
     141             :          * limits such that test_len was adjusted, we abort the whole
     142             :          * operation.
     143             :          */
     144     1861947 :         test_len = fxr->length;
     145     1861947 :         error = generic_write_check_limits(file2, fxr->file2_offset, &test_len);
     146     1861946 :         if (error)
     147             :                 return error;
     148     1861946 :         error = generic_write_check_limits(file1, fxr->file1_offset, &test_len);
     149     1861946 :         if (error)
     150             :                 return error;
     151     1861946 :         if (test_len != fxr->length)
     152             :                 return -EINVAL;
     153             : 
     154             :         /*
     155             :          * If the user wanted us to exchange up to the infile's EOF, round up
     156             :          * to the next block boundary for this check.  Do the same for the
     157             :          * outfile.
     158             :          *
     159             :          * Otherwise, reject the range length if it's not block aligned.  We
     160             :          * already confirmed the starting offsets' block alignment.
     161             :          */
     162     1861935 :         if (fxr->file1_offset + fxr->length == size1)
     163       32617 :                 blen = ALIGN(size1, blocksize) - fxr->file1_offset;
     164     1829318 :         else if (fxr->file2_offset + fxr->length == size2)
     165       10572 :                 blen = ALIGN(size2, blocksize) - fxr->file2_offset;
     166     1818746 :         else if (!IS_ALIGNED(fxr->length, blocksize))
     167             :                 return -EINVAL;
     168             :         else
     169             :                 blen = fxr->length;
     170             : 
     171             :         /* Don't allow overlapped exchanges within the same file. */
     172     1861935 :         if (inode1 == inode2 &&
     173     1829722 :             fxr->file2_offset + blen > fxr->file1_offset &&
     174      809786 :             fxr->file1_offset + blen > fxr->file2_offset)
     175             :                 return -EINVAL;
     176             : 
     177             :         /* If we already failed the freshness check, we're done. */
     178     1861910 :         error = xfs_exch_range_check_fresh(inode2, fxr);
     179     1861910 :         if (error)
     180             :                 return error;
     181             : 
     182             :         /*
     183             :          * Ensure that we don't exchange a partial EOF block into the middle of
     184             :          * another file.
     185             :          */
     186     1861872 :         if ((fxr->length & blkmask) == 0)
     187             :                 return 0;
     188             : 
     189        4622 :         blen = fxr->length;
     190        4622 :         if (fxr->file2_offset + blen < size2)
     191           9 :                 blen &= ~blkmask;
     192             : 
     193        4622 :         if (fxr->file1_offset + blen < size1)
     194          18 :                 blen &= ~blkmask;
     195             : 
     196        4622 :         return blen == fxr->length ? 0 : -EINVAL;
     197             : }
     198             : 
     199             : /*
     200             :  * Check that the two inodes are eligible for range exchanges, the ranges make
     201             :  * sense, and then flush all dirty data.  Caller must ensure that the inodes
     202             :  * have been locked against any other modifications.
     203             :  */
     204             : int
     205     1862061 : xfs_exch_range_prep(
     206             :         struct file             *file1,
     207             :         struct file             *file2,
     208             :         struct xfs_exch_range   *fxr,
     209             :         unsigned int            blocksize)
     210             : {
     211     1862061 :         struct inode            *inode1 = file_inode(file1);
     212     1862061 :         struct inode            *inode2 = file_inode(file2);
     213     1862061 :         bool                    same_inode = (inode1 == inode2);
     214     1862061 :         int                     error;
     215             : 
     216             :         /* Check that we don't violate system file offset limits. */
     217     1862061 :         error = xfs_exch_range_checks(file1, file2, fxr, blocksize);
     218     1862047 :         if (error || fxr->length == 0)
     219             :                 return error;
     220             : 
     221             :         /* Wait for the completion of any pending IOs on both files */
     222     1861298 :         inode_dio_wait(inode1);
     223     1861298 :         if (!same_inode)
     224       32164 :                 inode_dio_wait(inode2);
     225             : 
     226     1861298 :         error = filemap_write_and_wait_range(inode1->i_mapping,
     227             :                         fxr->file1_offset,
     228     1861298 :                         fxr->file1_offset + fxr->length - 1);
     229     1861303 :         if (error)
     230             :                 return error;
     231             : 
     232     1861302 :         error = filemap_write_and_wait_range(inode2->i_mapping,
     233             :                         fxr->file2_offset,
     234     1861302 :                         fxr->file2_offset + fxr->length - 1);
     235     1861315 :         if (error)
     236             :                 return error;
     237             : 
     238             :         /*
     239             :          * If the files or inodes involved require synchronous writes, amend
     240             :          * the request to force the filesystem to flush all data and metadata
     241             :          * to disk after the operation completes.
     242             :          */
     243     1861315 :         if (((file1->f_flags | file2->f_flags) & (__O_SYNC | O_DSYNC)) ||
     244     1837203 :             IS_SYNC(inode1) || IS_SYNC(inode2))
     245       24113 :                 fxr->flags |= XFS_EXCH_RANGE_FSYNC;
     246             : 
     247             :         return 0;
     248             : }
     249             : 
     250             : /*
     251             :  * Finish a range exchange operation, if it was successful.  Caller must ensure
     252             :  * that the inodes are still locked against any other modifications.
     253             :  */
     254             : int
     255     1861683 : xfs_exch_range_finish(
     256             :         struct file             *file1,
     257             :         struct file             *file2)
     258             : {
     259     1861683 :         int                     error;
     260             : 
     261     1861683 :         error = file_remove_privs(file1);
     262     1861671 :         if (error)
     263             :                 return error;
     264     1861671 :         if (file_inode(file1) == file_inode(file2))
     265             :                 return 0;
     266             : 
     267       32076 :         return file_remove_privs(file2);
     268             : }
     269             : 
     270             : /* Decide if it's ok to remap the selected range of a given file. */
     271             : STATIC int
     272     3724135 : xfs_exch_range_verify_area(
     273             :         struct file             *file,
     274             :         loff_t                  pos,
     275             :         struct xfs_exch_range   *fxr)
     276             : {
     277     3724135 :         int64_t                 len = fxr->length;
     278             : 
     279     3724135 :         if (pos < 0)
     280             :                 return -EINVAL;
     281             : 
     282     3724135 :         if (fxr->flags & XFS_EXCH_RANGE_TO_EOF)
     283         732 :                 len = min_t(int64_t, len, i_size_read(file_inode(file)) - pos);
     284     3724135 :         return remap_verify_area(file, pos, len, true);
     285             : }
     286             : 
     287             : /* Prepare for and exchange parts of two files. */
     288             : static inline int
     289     4967102 : __xfs_exch_range(
     290             :         struct file             *file1,
     291             :         struct file             *file2,
     292             :         struct xfs_exch_range   *fxr)
     293             : {
     294     4967102 :         struct inode            *inode1 = file_inode(file1);
     295     4967102 :         struct inode            *inode2 = file_inode(file2);
     296     4967102 :         int                     ret;
     297             : 
     298     9934289 :         if ((fxr->flags & ~XFS_EXCH_RANGE_ALL_FLAGS) ||
     299     4967102 :             memchr_inv(&fxr->pad, 0, sizeof(fxr->pad)))
     300           0 :                 return -EINVAL;
     301             : 
     302     4967187 :         if ((fxr->flags & XFS_EXCH_RANGE_FULL_FILES) &&
     303             :             (fxr->flags & XFS_EXCH_RANGE_TO_EOF))
     304             :                 return -EINVAL;
     305             : 
     306             :         /*
     307             :          * The ioctl enforces that src and dest files are on the same mount.
     308             :          * However, they only need to be on the same file system.
     309             :          */
     310     4967187 :         if (inode1->i_sb != inode2->i_sb)
     311             :                 return -EXDEV;
     312             : 
     313             :         /* This only works for regular files. */
     314     4967187 :         if (S_ISDIR(inode1->i_mode) || S_ISDIR(inode2->i_mode))
     315             :                 return -EISDIR;
     316     4967177 :         if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
     317             :                 return -EINVAL;
     318             : 
     319     4967177 :         ret = generic_file_rw_checks(file1, file2);
     320     4967062 :         if (ret < 0)
     321             :                 return ret;
     322             : 
     323     4967052 :         ret = generic_file_rw_checks(file2, file1);
     324     4967056 :         if (ret < 0)
     325             :                 return ret;
     326             : 
     327     1862071 :         ret = xfs_exch_range_verify_area(file1, fxr->file1_offset, fxr);
     328     1862067 :         if (ret)
     329             :                 return ret;
     330             : 
     331     1862069 :         ret = xfs_exch_range_verify_area(file2, fxr->file2_offset, fxr);
     332     1862072 :         if (ret)
     333             :                 return ret;
     334             : 
     335     1862072 :         ret = xfs_file_xchg_range(file1, file2, fxr);
     336     1862069 :         if (ret)
     337             :                 return ret;
     338             : 
     339     1861683 :         fsnotify_modify(file1);
     340     1861672 :         if (file2 != file1)
     341       32092 :                 fsnotify_modify(file2);
     342             :         return 0;
     343             : }
     344             : 
     345             : /* Exchange parts of two files. */
     346             : int
     347     4967150 : xfs_exch_range(
     348             :         struct file             *file1,
     349             :         struct file             *file2,
     350             :         struct xfs_exch_range   *fxr)
     351             : {
     352     4967150 :         int                     error;
     353             : 
     354     4967150 :         file_start_write(file2);
     355     4967101 :         error = __xfs_exch_range(file1, file2, fxr);
     356     4967057 :         file_end_write(file2);
     357     4967095 :         return error;
     358             : }
     359             : 
     360             : /* XFS-specific parts of XFS_IOC_EXCHANGE_RANGE */
     361             : 
     362             : /*
     363             :  * Exchanging ranges as a file operation.  This is the binding between the
     364             :  * VFS-level concepts and the XFS-specific implementation.
     365             :  */
     366             : int
     367     1862069 : xfs_file_xchg_range(
     368             :         struct file             *file1,
     369             :         struct file             *file2,
     370             :         struct xfs_exch_range   *fxr)
     371             : {
     372     1862069 :         struct inode            *inode1 = file_inode(file1);
     373     1862069 :         struct inode            *inode2 = file_inode(file2);
     374     1862069 :         struct xfs_inode        *ip1 = XFS_I(inode1);
     375     1862069 :         struct xfs_inode        *ip2 = XFS_I(inode2);
     376     1862069 :         struct xfs_mount        *mp = ip1->i_mount;
     377     1862069 :         unsigned int            priv_flags = 0;
     378     1862069 :         bool                    use_logging = false;
     379     1862069 :         int                     error;
     380             : 
     381     3724138 :         if (xfs_is_shutdown(mp))
     382             :                 return -EIO;
     383             : 
     384             :         /* Update cmtime if the fd/inode don't forbid it. */
     385     1862069 :         if (likely(!(file1->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode1)))
     386     1862069 :                 priv_flags |= XFS_XCHG_RANGE_UPD_CMTIME1;
     387     1862069 :         if (likely(!(file2->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode2)))
     388     1832769 :                 priv_flags |= XFS_XCHG_RANGE_UPD_CMTIME2;
     389             : 
     390             :         /* Lock both files against IO */
     391     1862069 :         error = xfs_ilock2_io_mmap(ip1, ip2);
     392     1862077 :         if (error)
     393           0 :                 goto out_err;
     394             : 
     395             :         /* Prepare and then exchange file contents. */
     396     1862077 :         error = xfs_xchg_range_prep(file1, file2, fxr, priv_flags);
     397     1862070 :         if (error)
     398         210 :                 goto out_unlock;
     399             : 
     400             :         /* Get permission to use log-assisted file content swaps. */
     401     1861860 :         error = xfs_xchg_range_grab_log_assist(mp,
     402     1861860 :                         !(fxr->flags & XFS_EXCH_RANGE_NONATOMIC),
     403             :                         &use_logging);
     404     1861862 :         if (error)
     405          54 :                 goto out_unlock;
     406     1861808 :         if (use_logging)
     407     1830069 :                 priv_flags |= XFS_XCHG_RANGE_LOGGED;
     408             : 
     409     1861808 :         error = xfs_xchg_range(ip1, ip2, fxr, priv_flags);
     410     1861807 :         if (error)
     411         120 :                 goto out_drop_feat;
     412             : 
     413             :         /*
     414             :          * Finish the exchange by removing special file privileges like any
     415             :          * other file write would do.  This may involve turning on support for
     416             :          * logged xattrs if either file has security capabilities, which means
     417             :          * xfs_xchg_range_grab_log_assist runs before xfs_attr_grab_log_assist.
     418             :          */
     419     1861687 :         error = xfs_exch_range_finish(file1, file2);
     420     1861669 :         if (error)
     421           0 :                 goto out_drop_feat;
     422             : 
     423     1861669 : out_drop_feat:
     424     1861789 :         if (use_logging)
     425     1830050 :                 xfs_xchg_range_rele_log_assist(mp);
     426       31739 : out_unlock:
     427     1862070 :         xfs_iunlock2_io_mmap(ip1, ip2);
     428     1862072 : out_err:
     429     1862072 :         if (error)
     430         384 :                 trace_xfs_file_xchg_range_error(ip2, error, _RET_IP_);
     431             :         return error;
     432             : }
     433             : 
     434             : /* Lock (and optionally join) two inodes for a file range exchange. */
     435             : void
     436     8686402 : xfs_xchg_range_ilock(
     437             :         struct xfs_trans        *tp,
     438             :         struct xfs_inode        *ip1,
     439             :         struct xfs_inode        *ip2)
     440             : {
     441     8686402 :         if (ip1 != ip2)
     442     5027087 :                 xfs_lock_two_inodes(ip1, XFS_ILOCK_EXCL,
     443             :                                     ip2, XFS_ILOCK_EXCL);
     444             :         else
     445     3659315 :                 xfs_ilock(ip1, XFS_ILOCK_EXCL);
     446     8682289 :         if (tp) {
     447     6726874 :                 xfs_trans_ijoin(tp, ip1, 0);
     448     6733109 :                 if (ip2 != ip1)
     449     4903443 :                         xfs_trans_ijoin(tp, ip2, 0);
     450             :         }
     451             : 
     452     8689013 : }
     453             : 
     454             : /* Unlock two inodes after a file range exchange operation. */
     455             : void
     456     3817240 : xfs_xchg_range_iunlock(
     457             :         struct xfs_inode        *ip1,
     458             :         struct xfs_inode        *ip2)
     459             : {
     460     3817240 :         if (ip2 != ip1)
     461      157926 :                 xfs_iunlock(ip2, XFS_ILOCK_EXCL);
     462     3817241 :         xfs_iunlock(ip1, XFS_ILOCK_EXCL);
     463     3817241 : }
     464             : 
     465             : /*
     466             :  * Estimate the resource requirements to exchange file contents between the two
     467             :  * files.  The caller is required to hold the IOLOCK and the MMAPLOCK and to
     468             :  * have flushed both inodes' pagecache and active direct-ios.
     469             :  */
     470             : int
     471     1955380 : xfs_xchg_range_estimate(
     472             :         struct xfs_swapext_req  *req)
     473             : {
     474     1955380 :         int                     error;
     475             : 
     476     1955380 :         xfs_xchg_range_ilock(NULL, req->ip1, req->ip2);
     477     1955383 :         error = xfs_swapext_estimate(req);
     478     1955381 :         xfs_xchg_range_iunlock(req->ip1, req->ip2);
     479     1955377 :         return error;
     480             : }
     481             : 
     482             : /*
     483             :  * We need to check that the format of the data fork in the temporary inode is
     484             :  * valid for the target inode before doing the swap. This is not a problem with
     485             :  * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
     486             :  * data fork depending on the space the attribute fork is taking so we can get
     487             :  * invalid formats on the target inode.
     488             :  *
     489             :  * E.g. target has space for 7 extents in extent format, temp inode only has
     490             :  * space for 6.  If we defragment down to 7 extents, then the tmp format is a
     491             :  * btree, but when swapped it needs to be in extent format. Hence we can't just
     492             :  * blindly swap data forks on attr2 filesystems.
     493             :  *
     494             :  * Note that we check the swap in both directions so that we don't end up with
     495             :  * a corrupt temporary inode, either.
     496             :  *
     497             :  * Note that fixing the way xfs_fsr sets up the attribute fork in the source
     498             :  * inode will prevent this situation from occurring, so all we do here is
     499             :  * reject and log the attempt. Basically we are putting the responsibility on
     500             :  * userspace to get this right.
     501             :  */
     502             : STATIC int
     503        3465 : xfs_swap_extents_check_format(
     504             :         struct xfs_inode        *ip,    /* target inode */
     505             :         struct xfs_inode        *tip)   /* tmp inode */
     506             : {
     507        3465 :         struct xfs_ifork        *ifp = &ip->i_df;
     508        3465 :         struct xfs_ifork        *tifp = &tip->i_df;
     509             : 
     510             :         /* User/group/project quota ids must match if quotas are enforced. */
     511        3465 :         if (XFS_IS_QUOTA_ON(ip->i_mount) &&
     512        3464 :             (!uid_eq(VFS_I(ip)->i_uid, VFS_I(tip)->i_uid) ||
     513        3464 :              !gid_eq(VFS_I(ip)->i_gid, VFS_I(tip)->i_gid) ||
     514        3464 :              ip->i_projid != tip->i_projid))
     515             :                 return -EINVAL;
     516             : 
     517             :         /* Should never get a local format */
     518        3464 :         if (ifp->if_format == XFS_DINODE_FMT_LOCAL ||
     519        3464 :             tifp->if_format == XFS_DINODE_FMT_LOCAL)
     520             :                 return -EINVAL;
     521             : 
     522             :         /*
     523             :          * If the target inode has fewer extents than the temporary inode then
     524             :          * why did userspace call us?
     525             :          */
     526        3464 :         if (ifp->if_nextents < tifp->if_nextents)
     527             :                 return -EINVAL;
     528             : 
     529             :         /*
     530             :          * If we have to use the (expensive) rmap swap method, we can
     531             :          * handle any number of extents and any format.
     532             :          */
     533        3464 :         if (xfs_has_rmapbt(ip->i_mount))
     534             :                 return 0;
     535             : 
     536             :         /*
     537             :          * If the target inode is in extent form and the temp inode is in btree
     538             :          * form then we will end up with the target inode in the wrong format
     539             :          * as we already know there are fewer extents in the temp inode.
     540             :          */
     541        3464 :         if (ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
     542             :             tifp->if_format == XFS_DINODE_FMT_BTREE)
     543             :                 return -EINVAL;
     544             : 
     545             :         /* Check temp in extent form to max in target */
     546        3464 :         if (tifp->if_format == XFS_DINODE_FMT_EXTENTS &&
     547        3277 :             tifp->if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
     548             :                 return -EINVAL;
     549             : 
     550             :         /* Check target in extent form to max in temp */
     551        3464 :         if (ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
     552        2490 :             ifp->if_nextents > XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
     553             :                 return -EINVAL;
     554             : 
     555             :         /*
     556             :          * If we are in a btree format, check that the temp root block will fit
     557             :          * in the target and that it has enough extents to be in btree format
     558             :          * in the target.
     559             :          *
     560             :          * Note that we have to be careful to allow btree->extent conversions
     561             :          * (a common defrag case) which will occur when the temp inode is in
     562             :          * extent format...
     563             :          */
     564        3464 :         if (tifp->if_format == XFS_DINODE_FMT_BTREE) {
     565         187 :                 if (xfs_inode_has_attr_fork(ip) &&
     566           0 :                     xfs_bmap_bmdr_space(tifp->if_broot) > xfs_inode_fork_boff(ip))
     567             :                         return -EINVAL;
     568         187 :                 if (tifp->if_nextents <= XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
     569             :                         return -EINVAL;
     570             :         }
     571             : 
     572             :         /* Reciprocal target->temp btree format checks */
     573        3464 :         if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
     574         974 :                 if (xfs_inode_has_attr_fork(tip) &&
     575         783 :                     xfs_bmap_bmdr_space(ip->i_df.if_broot) > xfs_inode_fork_boff(tip))
     576             :                         return -EINVAL;
     577         974 :                 if (ifp->if_nextents <= XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
     578           0 :                         return -EINVAL;
     579             :         }
     580             : 
     581             :         return 0;
     582             : }
     583             : 
     584             : /*
     585             :  * Fix up the owners of the bmbt blocks to refer to the current inode. The
     586             :  * change owner scan attempts to order all modified buffers in the current
     587             :  * transaction. In the event of ordered buffer failure, the offending buffer is
     588             :  * physically logged as a fallback and the scan returns -EAGAIN. We must roll
     589             :  * the transaction in this case to replenish the fallback log reservation and
     590             :  * restart the scan. This process repeats until the scan completes.
     591             :  */
     592             : static int
     593        1159 : xfs_swap_change_owner(
     594             :         struct xfs_trans        **tpp,
     595             :         struct xfs_inode        *ip,
     596             :         struct xfs_inode        *tmpip)
     597             : {
     598        1159 :         int                     error;
     599        1159 :         struct xfs_trans        *tp = *tpp;
     600             : 
     601        4255 :         do {
     602        2707 :                 error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, ip->i_ino,
     603             :                                               NULL);
     604             :                 /* success or fatal error */
     605        2707 :                 if (error != -EAGAIN)
     606             :                         break;
     607             : 
     608        1548 :                 error = xfs_trans_roll(tpp);
     609        1548 :                 if (error)
     610             :                         break;
     611        1548 :                 tp = *tpp;
     612             : 
     613             :                 /*
     614             :                  * Redirty both inodes so they can relog and keep the log tail
     615             :                  * moving forward.
     616             :                  */
     617        1548 :                 xfs_trans_ijoin(tp, ip, 0);
     618        1548 :                 xfs_trans_ijoin(tp, tmpip, 0);
     619        1548 :                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
     620        1548 :                 xfs_trans_log_inode(tp, tmpip, XFS_ILOG_CORE);
     621             :         } while (true);
     622             : 
     623        1159 :         return error;
     624             : }
     625             : 
     626             : /* Swap the extents of two files by swapping data forks. */
     627             : STATIC int
     628        3463 : xfs_swap_extent_forks(
     629             :         struct xfs_trans        **tpp,
     630             :         struct xfs_swapext_req  *req)
     631             : {
     632        3463 :         struct xfs_inode        *ip = req->ip2;
     633        3463 :         struct xfs_inode        *tip = req->ip1;
     634        3463 :         xfs_filblks_t           aforkblks = 0;
     635        3463 :         xfs_filblks_t           taforkblks = 0;
     636        3463 :         xfs_extnum_t            junk;
     637        3463 :         uint64_t                tmp;
     638        3463 :         int                     src_log_flags = XFS_ILOG_CORE;
     639        3463 :         int                     target_log_flags = XFS_ILOG_CORE;
     640        3463 :         int                     error;
     641             : 
     642             :         /*
     643             :          * Count the number of extended attribute blocks
     644             :          */
     645        3463 :         if (xfs_inode_has_attr_fork(ip) && ip->i_af.if_nextents > 0 &&
     646         459 :             ip->i_af.if_format != XFS_DINODE_FMT_LOCAL) {
     647         459 :                 error = xfs_bmap_count_blocks(*tpp, ip, XFS_ATTR_FORK, &junk,
     648             :                                 &aforkblks);
     649         459 :                 if (error)
     650             :                         return error;
     651             :         }
     652        3463 :         if (xfs_inode_has_attr_fork(tip) && tip->i_af.if_nextents > 0 &&
     653           0 :             tip->i_af.if_format != XFS_DINODE_FMT_LOCAL) {
     654           0 :                 error = xfs_bmap_count_blocks(*tpp, tip, XFS_ATTR_FORK, &junk,
     655             :                                 &taforkblks);
     656           0 :                 if (error)
     657             :                         return error;
     658             :         }
     659             : 
     660             :         /*
     661             :          * Btree format (v3) inodes have the inode number stamped in the bmbt
     662             :          * block headers. We can't start changing the bmbt blocks until the
     663             :          * inode owner change is logged so recovery does the right thing in the
     664             :          * event of a crash. Set the owner change log flags now and leave the
     665             :          * bmbt scan as the last step.
     666             :          */
     667        3463 :         if (xfs_has_v3inodes(ip->i_mount)) {
     668        3463 :                 if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE)
     669         973 :                         target_log_flags |= XFS_ILOG_DOWNER;
     670        3463 :                 if (tip->i_df.if_format == XFS_DINODE_FMT_BTREE)
     671         186 :                         src_log_flags |= XFS_ILOG_DOWNER;
     672             :         }
     673             : 
     674             :         /*
     675             :          * Swap the data forks of the inodes
     676             :          */
     677        3463 :         swap(ip->i_df, tip->i_df);
     678             : 
     679             :         /*
     680             :          * Fix the on-disk inode values
     681             :          */
     682        3463 :         tmp = (uint64_t)ip->i_nblocks;
     683        3463 :         ip->i_nblocks = tip->i_nblocks - taforkblks + aforkblks;
     684        3463 :         tip->i_nblocks = tmp + taforkblks - aforkblks;
     685             : 
     686             :         /*
     687             :          * The extents in the source inode could still contain speculative
     688             :          * preallocation beyond EOF (e.g. the file is open but not modified
     689             :          * while defrag is in progress). In that case, we need to copy over the
     690             :          * number of delalloc blocks the data fork in the source inode is
     691             :          * tracking beyond EOF so that when the fork is truncated away when the
     692             :          * temporary inode is unlinked we don't underrun the i_delayed_blks
     693             :          * counter on that inode.
     694             :          */
     695        3463 :         ASSERT(tip->i_delayed_blks == 0);
     696        3463 :         tip->i_delayed_blks = ip->i_delayed_blks;
     697        3463 :         ip->i_delayed_blks = 0;
     698             : 
     699        3463 :         switch (ip->i_df.if_format) {
     700        3277 :         case XFS_DINODE_FMT_EXTENTS:
     701        3277 :                 src_log_flags |= XFS_ILOG_DEXT;
     702        3277 :                 break;
     703         186 :         case XFS_DINODE_FMT_BTREE:
     704         186 :                 ASSERT(!xfs_has_v3inodes(ip->i_mount) ||
     705             :                        (src_log_flags & XFS_ILOG_DOWNER));
     706         186 :                 src_log_flags |= XFS_ILOG_DBROOT;
     707         186 :                 break;
     708             :         }
     709             : 
     710        3463 :         switch (tip->i_df.if_format) {
     711        2490 :         case XFS_DINODE_FMT_EXTENTS:
     712        2490 :                 target_log_flags |= XFS_ILOG_DEXT;
     713        2490 :                 break;
     714         973 :         case XFS_DINODE_FMT_BTREE:
     715         973 :                 target_log_flags |= XFS_ILOG_DBROOT;
     716         973 :                 ASSERT(!xfs_has_v3inodes(ip->i_mount) ||
     717             :                        (target_log_flags & XFS_ILOG_DOWNER));
     718             :                 break;
     719             :         }
     720             : 
     721             :         /* Do we have to swap reflink flags? */
     722        3463 :         if ((ip->i_diflags2 & XFS_DIFLAG2_REFLINK) ^
     723        3463 :             (tip->i_diflags2 & XFS_DIFLAG2_REFLINK)) {
     724           0 :                 uint64_t        f;
     725             : 
     726           0 :                 f = ip->i_diflags2 & XFS_DIFLAG2_REFLINK;
     727           0 :                 ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
     728           0 :                 ip->i_diflags2 |= tip->i_diflags2 & XFS_DIFLAG2_REFLINK;
     729           0 :                 tip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
     730           0 :                 tip->i_diflags2 |= f & XFS_DIFLAG2_REFLINK;
     731             :         }
     732             : 
     733             :         /* Swap the cow forks. */
     734        3463 :         if (xfs_has_reflink(ip->i_mount)) {
     735           0 :                 ASSERT(!ip->i_cowfp ||
     736             :                        ip->i_cowfp->if_format == XFS_DINODE_FMT_EXTENTS);
     737           0 :                 ASSERT(!tip->i_cowfp ||
     738             :                        tip->i_cowfp->if_format == XFS_DINODE_FMT_EXTENTS);
     739             : 
     740           0 :                 swap(ip->i_cowfp, tip->i_cowfp);
     741             : 
     742           0 :                 if (ip->i_cowfp && ip->i_cowfp->if_bytes)
     743           0 :                         xfs_inode_set_cowblocks_tag(ip);
     744             :                 else
     745           0 :                         xfs_inode_clear_cowblocks_tag(ip);
     746           0 :                 if (tip->i_cowfp && tip->i_cowfp->if_bytes)
     747           0 :                         xfs_inode_set_cowblocks_tag(tip);
     748             :                 else
     749           0 :                         xfs_inode_clear_cowblocks_tag(tip);
     750             :         }
     751             : 
     752        3463 :         xfs_trans_log_inode(*tpp, ip,  src_log_flags);
     753        3463 :         xfs_trans_log_inode(*tpp, tip, target_log_flags);
     754             : 
     755             :         /*
     756             :          * The extent forks have been swapped, but crc=1,rmapbt=0 filesystems
     757             :          * have inode number owner values in the bmbt blocks that still refer to
     758             :          * the old inode. Scan each bmbt to fix up the owner values with the
     759             :          * inode number of the current inode.
     760             :          */
     761        3463 :         if (src_log_flags & XFS_ILOG_DOWNER) {
     762         186 :                 error = xfs_swap_change_owner(tpp, ip, tip);
     763         186 :                 if (error)
     764             :                         return error;
     765             :         }
     766        3463 :         if (target_log_flags & XFS_ILOG_DOWNER) {
     767         973 :                 error = xfs_swap_change_owner(tpp, tip, ip);
     768         973 :                 if (error)
     769           0 :                         return error;
     770             :         }
     771             : 
     772             :         return 0;
     773             : }
     774             : 
     775             : /*
     776             :  * There may be partially written rt extents lurking in the ranges to be
     777             :  * swapped.  According to the rules for realtime files with big rt extents, we
     778             :  * must guarantee that an outside observer (an IO thread, realistically) never
     779             :  * can see multiple physical rt extents mapped to the same logical file rt
     780             :  * extent.  The deferred bmap log intent items that we use under the hood
     781             :  * operate on single block mappings and not rt extents, which means we must
     782             :  * have a strategy to ensure that log recovery after a failure won't stop in
     783             :  * the middle of an rt extent.
     784             :  *
     785             :  * The preferred strategy is to use deferred extent swap log intent items to
     786             :  * track the status of the overall swap operation so that we can complete the
     787             :  * work during crash recovery.  If that isn't possible, we fall back to
     788             :  * requiring the selected mappings in both forks to be aligned to rt extent
     789             :  * boundaries.  As an aside, the old fork swap routine didn't have this
     790             :  * requirement, but at an extreme cost in flexibility (full files only, and no
     791             :  * support if rmapbt is enabled).
     792             :  */
     793             : static bool
     794     1861311 : xfs_xchg_range_need_rt_conversion(
     795             :         struct xfs_inode                *ip,
     796             :         unsigned int                    xchg_flags)
     797             : {
     798     1861311 :         struct xfs_mount                *mp = ip->i_mount;
     799             : 
     800             :         /*
     801             :          * Caller got permission to use logged swapext, so log recovery will
     802             :          * finish the swap and not leave us with partially swapped rt extents
     803             :          * exposed to userspace.
     804             :          */
     805     1861311 :         if (xchg_flags & XFS_XCHG_RANGE_LOGGED)
     806             :                 return false;
     807             : 
     808             :         /*
     809             :          * If we can't use log intent items at all, the only supported
     810             :          * operation is full fork swaps, so no conversions are needed.
     811             :          * The range requirements are enforced by the swapext code itself.
     812             :          */
     813     1864790 :         if (!xfs_swapext_supported(mp))
     814             :                 return false;
     815             : 
     816             :         /* Conversion is only needed for realtime files with big rt extents */
     817     1857832 :         return xfs_inode_has_bigrtextents(ip);
     818             : }
     819             : 
     820             : /*
     821             :  * Check the alignment of an exchange request when the allocation unit size
     822             :  * isn't a power of two.  The VFS helpers use (fast) bitmask-based alignment
     823             :  * checks, but here we have to use slow long division.
     824             :  */
     825             : static int
     826      127480 : xfs_xchg_range_check_rtalign(
     827             :         struct xfs_inode                *ip1,
     828             :         struct xfs_inode                *ip2,
     829             :         const struct xfs_exch_range     *fxr)
     830             : {
     831      127480 :         struct xfs_mount                *mp = ip1->i_mount;
     832      127480 :         uint32_t                        rextbytes;
     833      127480 :         uint64_t                        length = fxr->length;
     834      127480 :         uint64_t                        blen;
     835      127480 :         loff_t                          size1, size2;
     836             : 
     837      127480 :         rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
     838      127480 :         size1 = i_size_read(VFS_I(ip1));
     839      127480 :         size2 = i_size_read(VFS_I(ip2));
     840             : 
     841             :         /* The start of both ranges must be aligned to a rt extent. */
     842      254953 :         if (!isaligned_64(fxr->file1_offset, rextbytes) ||
     843      127474 :             !isaligned_64(fxr->file2_offset, rextbytes))
     844           8 :                 return -EINVAL;
     845             : 
     846             :         /*
     847             :          * If the caller asked for full files, check that the offset/length
     848             :          * values cover all of both files.
     849             :          */
     850      127471 :         if ((fxr->flags & XFS_EXCH_RANGE_FULL_FILES) &&
     851        4935 :             (fxr->file1_offset != 0 || fxr->file2_offset != 0 ||
     852        4935 :              fxr->length != size1 || fxr->length != size2))
     853             :                 return -EDOM;
     854             : 
     855      127469 :         if (fxr->flags & XFS_EXCH_RANGE_TO_EOF)
     856          29 :                 length = max_t(int64_t, size1 - fxr->file1_offset,
     857             :                                         size2 - fxr->file2_offset);
     858             : 
     859             :         /*
     860             :          * If the user wanted us to exchange up to the infile's EOF, round up
     861             :          * to the next rt extent boundary for this check.  Do the same for the
     862             :          * outfile.
     863             :          *
     864             :          * Otherwise, reject the range length if it's not rt extent aligned.
     865             :          * We already confirmed the starting offsets' rt extent block
     866             :          * alignment.
     867             :          */
     868      127469 :         if (fxr->file1_offset + length == size1)
     869        4999 :                 blen = roundup_64(size1, rextbytes) - fxr->file1_offset;
     870      122470 :         else if (fxr->file2_offset + length == size2)
     871        1085 :                 blen = roundup_64(size2, rextbytes) - fxr->file2_offset;
     872      121385 :         else if (!isaligned_64(length, rextbytes))
     873             :                 return -EINVAL;
     874             :         else
     875             :                 blen = length;
     876             : 
     877             :         /* Don't allow overlapped exchanges within the same file. */
     878      127468 :         if (ip1 == ip2 &&
     879      122506 :             fxr->file2_offset + blen > fxr->file1_offset &&
     880       59202 :             fxr->file1_offset + blen > fxr->file2_offset)
     881             :                 return -EINVAL;
     882             : 
     883             :         /*
     884             :          * Ensure that we don't exchange a partial EOF rt extent into the
     885             :          * middle of another file.
     886             :          */
     887      127467 :         if (isaligned_64(length, rextbytes))
     888             :                 return 0;
     889             : 
     890        4726 :         blen = length;
     891        4726 :         if (fxr->file2_offset + length < size2)
     892           0 :                 blen = rounddown_64(blen, rextbytes);
     893             : 
     894        4726 :         if (fxr->file1_offset + blen < size1)
     895           0 :                 blen = rounddown_64(blen, rextbytes);
     896             : 
     897        4726 :         return blen == length ? 0 : -EINVAL;
     898             : }
     899             : 
     900             : /* Prepare two files to have their data exchanged. */
     901             : int
     902     1862074 : xfs_xchg_range_prep(
     903             :         struct file             *file1,
     904             :         struct file             *file2,
     905             :         struct xfs_exch_range   *fxr,
     906             :         unsigned int            xchg_flags)
     907             : {
     908     1862074 :         struct xfs_inode        *ip1 = XFS_I(file_inode(file1));
     909     1862074 :         struct xfs_inode        *ip2 = XFS_I(file_inode(file2));
     910     1862074 :         unsigned int            alloc_unit = xfs_inode_alloc_unitsize(ip2);
     911     1862065 :         int                     error;
     912             : 
     913     1862065 :         trace_xfs_xchg_range_prep(ip1, fxr, ip2, 0);
     914             : 
     915             :         /* Verify both files are either real-time or non-realtime */
     916     4288982 :         if (XFS_IS_REALTIME_INODE(ip1) != XFS_IS_REALTIME_INODE(ip2))
     917             :                 return -EINVAL;
     918             : 
     919             :         /* Check non-power of two alignment issues, if necessary. */
     920     2510680 :         if (XFS_IS_REALTIME_INODE(ip2) && !is_power_of_2(alloc_unit)) {
     921      127480 :                 error = xfs_xchg_range_check_rtalign(ip1, ip2, fxr);
     922      127481 :                 if (error)
     923             :                         return error;
     924             : 
     925             :                 /* Do the VFS checks with the regular block alignment. */
     926      127467 :                 alloc_unit = ip1->i_mount->m_sb.sb_blocksize;
     927             :         }
     928             : 
     929     1862055 :         error = xfs_exch_range_prep(file1, file2, fxr, alloc_unit);
     930     1862065 :         if (error || fxr->length == 0)
     931             :                 return error;
     932             : 
     933             :         /* Attach dquots to both inodes before changing block maps. */
     934     1861314 :         error = xfs_qm_dqattach(ip2);
     935     1861309 :         if (error)
     936             :                 return error;
     937     1861309 :         error = xfs_qm_dqattach(ip1);
     938     1861304 :         if (error)
     939             :                 return error;
     940             : 
     941     1861303 :         trace_xfs_xchg_range_flush(ip1, fxr, ip2, 0);
     942             : 
     943             :         /* Flush the relevant ranges of both files. */
     944     1861305 :         error = xfs_flush_unmap_range(ip2, fxr->file2_offset, fxr->length);
     945     1861304 :         if (error)
     946             :                 return error;
     947     1861303 :         error = xfs_flush_unmap_range(ip1, fxr->file1_offset, fxr->length);
     948     1861309 :         if (error)
     949             :                 return error;
     950             : 
     951             :         /*
     952             :          * Cancel CoW fork preallocations for the ranges of both files.  The
     953             :          * prep function should have flushed all the dirty data, so the only
     954             :          * extents remaining should be speculative.
     955             :          */
     956     3722618 :         if (xfs_inode_has_cow_data(ip1)) {
     957      960691 :                 error = xfs_reflink_cancel_cow_range(ip1, fxr->file1_offset,
     958      960691 :                                 fxr->length, true);
     959      960691 :                 if (error)
     960             :                         return error;
     961             :         }
     962             : 
     963     3722618 :         if (xfs_inode_has_cow_data(ip2)) {
     964      953686 :                 error = xfs_reflink_cancel_cow_range(ip2, fxr->file2_offset,
     965      953686 :                                 fxr->length, true);
     966      953686 :                 if (error)
     967             :                         return error;
     968             :         }
     969             : 
     970             :         /* Convert unwritten sub-extent mappings if required. */
     971     1861309 :         if (xfs_xchg_range_need_rt_conversion(ip2, xchg_flags)) {
     972      186609 :                 error = xfs_rtfile_convert_unwritten(ip2, fxr->file2_offset,
     973             :                                 fxr->length);
     974      186610 :                 if (error)
     975             :                         return error;
     976             : 
     977      186610 :                 error = xfs_rtfile_convert_unwritten(ip1, fxr->file1_offset,
     978             :                                 fxr->length);
     979      186611 :                 if (error)
     980           0 :                         return error;
     981             :         }
     982             : 
     983             :         return 0;
     984             : }
     985             : 
     986             : #define QRETRY_IP1      (0x1)
     987             : #define QRETRY_IP2      (0x2)
     988             : 
     989             : /*
     990             :  * Obtain a quota reservation to make sure we don't hit EDQUOT.  We can skip
     991             :  * this if quota enforcement is disabled or if both inodes' dquots are the
     992             :  * same.  The qretry bitmask must be initialized to zero before the first
     993             :  * call to this function.
     994             :  */
     995             : STATIC int
     996     1861801 : xfs_xchg_range_reserve_quota(
     997             :         struct xfs_trans                *tp,
     998             :         const struct xfs_swapext_req    *req,
     999             :         unsigned int                    *qretry)
    1000             : {
    1001     1861801 :         int64_t                         ddelta, rdelta;
    1002     1861801 :         int                             ip1_error = 0;
    1003     1861801 :         int                             error;
    1004             : 
    1005             :         /*
    1006             :          * Don't bother with a quota reservation if we're not enforcing them
    1007             :          * or the two inodes have the same dquots.
    1008             :          */
    1009     1861801 :         if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 ||
    1010       29077 :             (req->ip1->i_udquot == req->ip2->i_udquot &&
    1011       29004 :              req->ip1->i_gdquot == req->ip2->i_gdquot &&
    1012       28977 :              req->ip1->i_pdquot == req->ip2->i_pdquot))
    1013             :                 return 0;
    1014             : 
    1015         100 :         *qretry = 0;
    1016             : 
    1017             :         /*
    1018             :          * For each file, compute the net gain in the number of regular blocks
    1019             :          * that will be mapped into that file and reserve that much quota.  The
    1020             :          * quota counts must be able to absorb at least that much space.
    1021             :          */
    1022         100 :         ddelta = req->ip2_bcount - req->ip1_bcount;
    1023         100 :         rdelta = req->ip2_rtbcount - req->ip1_rtbcount;
    1024         100 :         if (ddelta > 0 || rdelta > 0) {
    1025          42 :                 error = xfs_trans_reserve_quota_nblks(tp, req->ip1,
    1026             :                                 ddelta > 0 ? ddelta : 0,
    1027             :                                 rdelta > 0 ? rdelta : 0,
    1028             :                                 false);
    1029          42 :                 if (error == -EDQUOT || error == -ENOSPC) {
    1030             :                         /*
    1031             :                          * Save this error and see what happens if we try to
    1032             :                          * reserve quota for ip2.  Then report both.
    1033             :                          */
    1034          22 :                         *qretry |= QRETRY_IP1;
    1035          22 :                         ip1_error = error;
    1036          22 :                         error = 0;
    1037             :                 }
    1038          42 :                 if (error)
    1039             :                         return error;
    1040             :         }
    1041         100 :         if (ddelta < 0 || rdelta < 0) {
    1042           9 :                 error = xfs_trans_reserve_quota_nblks(tp, req->ip2,
    1043             :                                 ddelta < 0 ? -ddelta : 0,
    1044             :                                 rdelta < 0 ? -rdelta : 0,
    1045             :                                 false);
    1046           9 :                 if (error == -EDQUOT || error == -ENOSPC)
    1047           0 :                         *qretry |= QRETRY_IP2;
    1048           9 :                 if (error)
    1049             :                         return error;
    1050             :         }
    1051         100 :         if (ip1_error)
    1052             :                 return ip1_error;
    1053             : 
    1054             :         /*
    1055             :          * For each file, forcibly reserve the gross gain in mapped blocks so
    1056             :          * that we don't trip over any quota block reservation assertions.
    1057             :          * We must reserve the gross gain because the quota code subtracts from
    1058             :          * bcount the number of blocks that we unmap; it does not add that
    1059             :          * quantity back to the quota block reservation.
    1060             :          */
    1061          78 :         error = xfs_trans_reserve_quota_nblks(tp, req->ip1, req->ip1_bcount,
    1062          78 :                         req->ip1_rtbcount, true);
    1063          78 :         if (error)
    1064             :                 return error;
    1065             : 
    1066          78 :         return xfs_trans_reserve_quota_nblks(tp, req->ip2, req->ip2_bcount,
    1067          78 :                         req->ip2_rtbcount, true);
    1068             : }
    1069             : 
    1070             : /*
    1071             :  * Get permission to use log-assisted atomic exchange of file extents.
    1072             :  *
    1073             :  * Callers must hold the IOLOCK and MMAPLOCK of both files.  They must not be
    1074             :  * running any transactions or hold any ILOCKS.  If @use_logging is set after a
    1075             :  * successful return, callers must call xfs_xchg_range_rele_log_assist after
    1076             :  * the exchange is completed.
                     :  *
                     :  * @mp: mount whose log and superblock feature bits are consulted.
                     :  * @force: if true, the caller requires log-assisted swapping, so try to
                     :  *      add the log incompat feature when it is not already set.  If
                     :  *      false, only report whether the feature is already enabled.
                     :  * @use_logging: set to true when the caller may use log-assisted swapping
                     :  *      and still holds the log-incompat reference taken here; set to
                     :  *      false when that reference was dropped again before returning.
                     :  *
                     :  * Returns 0 when @use_logging was decided without error (it may be either
                     :  * value), -EOPNOTSUPP if @force is set but xfs_swapext_supported() fails,
                     :  * or the error returned by xfs_add_incompat_log_feature().  @use_logging
                     :  * is always false on a nonzero return.
    1077             :  */
    1078             : int
    1079     6818952 : xfs_xchg_range_grab_log_assist(
    1080             :         struct xfs_mount        *mp,
    1081             :         bool                    force,
    1082             :         bool                    *use_logging)
    1083             : {
    1084     6818952 :         int                     error = 0;
    1085             : 
    1086             :         /*
    1087             :          * Protect ourselves from an idle log clearing the atomic swapext
    1088             :          * log incompat feature bit.
    1089             :          */
    1090     6818952 :         xlog_use_incompat_feat(mp->m_log, XLOG_INCOMPAT_FEAT_SWAPEXT);
    1091     6814820 :         *use_logging = true;
    1092             : 
    1093             :         /*
    1094             :          * If log-assisted swapping is already enabled, the caller can use the
    1095             :          * log assisted swap functions with the log-incompat reference we got.
    1096             :          */
    1097    13629640 :         if (xfs_sb_version_haslogswapext(&mp->m_sb))
    1098             :                 return 0;
    1099             : 
    1100             :         /*
    1101             :          * If the caller doesn't /require/ log-assisted swapping, drop the
    1102             :          * log-incompat feature protection and exit.  The caller cannot use
    1103             :          * log assisted swapping.
                     :          * Note that error is still zero here, so this path returns
                     :          * success with @use_logging cleared.
    1104             :          */
    1105       57660 :         if (!force)
    1106       31739 :                 goto drop_incompat;
    1107             : 
    1108             :         /*
    1109             :          * Caller requires log-assisted swapping but the fs feature set isn't
    1110             :          * rich enough to support it.  Bail out.
    1111             :          */
    1112       26066 :         if (!xfs_swapext_supported(mp)) {
    1113         139 :                 error = -EOPNOTSUPP;
    1114         139 :                 goto drop_incompat;
    1115             :         }
    1116             : 
    1117       25782 :         error = xfs_add_incompat_log_feature(mp,
    1118             :                         XFS_SB_FEAT_INCOMPAT_LOG_SWAPEXT);
    1119       25793 :         if (error)
    1120           0 :                 goto drop_incompat;
    1121             : 
    1122       25793 :         xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SWAPEXT,
    1123             :  "EXPERIMENTAL atomic file range swap feature in use. Use at your own risk!");
    1124             : 
    1125             :         return 0;
    1126       31878 : drop_incompat:
    1127       31878 :         xlog_drop_incompat_feat(mp->m_log, XLOG_INCOMPAT_FEAT_SWAPEXT);
    1128       31878 :         *use_logging = false;
    1129       31878 :         return error;
    1130             : }
    1131             : 
    1132             : /*
                     :  * Release permission to use log-assisted extent swapping.
                     :  *
                     :  * Drops the XLOG_INCOMPAT_FEAT_SWAPEXT reference on mp->m_log.  Per the
                     :  * contract stated on xfs_xchg_range_grab_log_assist, call this after the
                     :  * exchange is completed if that function left @use_logging set.
                     :  */
    1133             : void
    1134     4953777 : xfs_xchg_range_rele_log_assist(
    1135             :         struct xfs_mount        *mp)
    1136             : {
    1137     6783827 :         xlog_drop_incompat_feat(mp->m_log, XLOG_INCOMPAT_FEAT_SWAPEXT);
    1138     1830067 : }
    1139             : 
    1140             : /*
                     :  * Decide if we can use the old data fork exchange code.
                     :  *
                     :  * Returns true only when @fxr requests a NONATOMIC exchange of FULL_FILES
                     :  * without TO_EOF, both file offsets are zero, and the request length
                     :  * equals the i_disk_size of both @ip1 and @ip2.  Any other combination
                     :  * returns false.
                     :  */
    1141             : static inline bool
    1142        3510 : xfs_xchg_use_forkswap(
    1143             :         const struct xfs_exch_range     *fxr,
    1144             :         struct xfs_inode                *ip1,
    1145             :         struct xfs_inode                *ip2)
    1146             : {
    1147        3510 :         if (!(fxr->flags & XFS_EXCH_RANGE_NONATOMIC))
    1148             :                 return false;
    1149        3510 :         if (!(fxr->flags & XFS_EXCH_RANGE_FULL_FILES))
    1150             :                 return false;
    1151        3465 :         if (fxr->flags & XFS_EXCH_RANGE_TO_EOF)
    1152             :                 return false;
    1153        3465 :         if (fxr->file1_offset != 0 || fxr->file2_offset != 0)
    1154             :                 return false;
    1155        3465 :         if (fxr->length != ip1->i_disk_size)
    1156             :                 return false;
    1157        3465 :         if (fxr->length != ip2->i_disk_size)
    1158           0 :                 return false;
    1159             :         return true;
    1160             : }
    1161             : 
    1162             : enum xchg_strategy {
    1163             :         SWAPEXT         = 1,    /* swap extents via xfs_swapext() */
    1164             :         FORKSWAP        = 2,    /* exchange forks via xfs_swap_extent_forks() */
    1165             : };
    1166             : 
    1167             : /*
                     :  * Exchange the contents of two files.
                     :  *
                     :  * Builds a data fork swapext request from the byte offsets and length in
                     :  * @fxr, allocates a tr_write transaction, locks both inodes, rechecks the
                     :  * freshness of @ip2 and the extent layout, reserves quota, and then
                     :  * exchanges the mappings with either xfs_swapext() or the legacy fork swap
                     :  * code.  Both inodes are unlocked again on every path taken after the
                     :  * transaction was allocated.
                     :  *
                     :  * @ip1, @ip2: the two inodes whose ranges are exchanged.
                     :  * @fxr: the exchange request (offsets, length, XFS_EXCH_RANGE_* flags).
                     :  *      With XFS_EXCH_RANGE_DRY_RUN set, the function stops after the
                     :  *      validation steps and cancels the transaction.
                     :  * @xchg_flags: XFS_XCHG_RANGE_LOGGED requests a logged swapext;
                     :  *      XFS_XCHG_RANGE_UPD_CMTIME1/2 select which inode gets its mtime
                     :  *      and ctime updated.
                     :  *
                     :  * Returns 0 on success or a negative errno; -EOPNOTSUPP when neither
                     :  * exchange strategy can be used for this request.
                     :  */
    1168             : int
    1169     1861811 : xfs_xchg_range(
    1170             :         struct xfs_inode                *ip1,
    1171             :         struct xfs_inode                *ip2,
    1172             :         const struct xfs_exch_range     *fxr,
    1173             :         unsigned int                    xchg_flags)
    1174             : {
    1175     1861811 :         struct xfs_mount                *mp = ip1->i_mount;
    1176     7447244 :         struct xfs_swapext_req          req = {
    1177             :                 .ip1                    = ip1,
    1178             :                 .ip2                    = ip2,
    1179             :                 .whichfork              = XFS_DATA_FORK,
    1180     1861811 :                 .startoff1              = XFS_B_TO_FSBT(mp, fxr->file1_offset),
    1181     1861811 :                 .startoff2              = XFS_B_TO_FSBT(mp, fxr->file2_offset),
    1182     1861811 :                 .blockcount             = XFS_B_TO_FSB(mp, fxr->length),
    1183             :         };
    1184     1861811 :         struct xfs_trans                *tp;
    1185     1861811 :         unsigned int                    qretry;
    1186     1861811 :         unsigned int                    flags = 0;
    1187     1861811 :         bool                            retried = false;
    1188     1861811 :         enum xchg_strategy              strategy;
    1189     1861811 :         int                             error;
    1190             : 
    1191     1861811 :         trace_xfs_xchg_range(ip1, fxr, ip2, xchg_flags);
    1192             : 
    1193     1861803 :         if (fxr->flags & XFS_EXCH_RANGE_TO_EOF)
    1194         335 :                 req.req_flags |= XFS_SWAP_REQ_SET_SIZES;
    1195     1861803 :         if (fxr->flags & XFS_EXCH_RANGE_FILE1_WRITTEN)
    1196          64 :                 req.req_flags |= XFS_SWAP_REQ_INO1_WRITTEN;
    1197     1861803 :         if (xchg_flags & XFS_XCHG_RANGE_LOGGED)
    1198     1830063 :                 req.req_flags |= XFS_SWAP_REQ_LOGGED;
    1199             : 
    1200             :         /*
    1201             :          * Round the request length up to the nearest fundamental unit of
    1202             :          * allocation.  The prep function already checked that the request
    1203             :          * offsets and length in @fxr are safe to round up.
    1204             :          */
    1205     1861803 :         if (XFS_IS_REALTIME_INODE(ip2))
    1206      648475 :                 req.blockcount = xfs_rtb_roundup_rtx(mp, req.blockcount);
    1207             : 
    1208     1861801 :         error = xfs_xchg_range_estimate(&req);
    1209     1861809 :         if (error)
    1210             :                 return error;
    1211             : 
    1212             :         /*
    1213             :          * We haven't decided which exchange strategy we want to use yet, but
    1214             :          * here we must choose if we want freed blocks during the swap to be
    1215             :          * added to the transaction block reservation (RES_FDBLKS) or freed
    1216             :          * into the global fdblocks.  The legacy fork swap mechanism doesn't
    1217             :          * free any blocks, so it doesn't require it.  It is also the only
    1218             :          * option that works for older filesystems.
    1219             :          *
    1220             :          * The bmap log intent items that were added with rmap and reflink can
    1221             :          * change the bmbt shape, so the intent-based swap strategies require
    1222             :          * us to set RES_FDBLKS.
    1223             :          */
    1224     1861799 :         if (xfs_has_lazysbcount(mp))
    1225     1861799 :                 flags |= XFS_TRANS_RES_FDBLKS;
    1226             : 
    1227     1861799 : retry:
    1228             :         /* Allocate the transaction, lock the inodes, and join them. */
    1229     1861810 :         error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, req.resblks, 0,
    1230             :                         flags, &tp);
    1231     1861807 :         if (error)
    1232           1 :                 return error;
    1233             : 
    1234     1861806 :         xfs_xchg_range_ilock(tp, ip1, ip2);
    1235             : 
    1236     1861813 :         trace_xfs_swap_extent_before(ip2, 0);
    1237     1861804 :         trace_xfs_swap_extent_before(ip1, 1);
    1238             : 
    1239     1861805 :         if (fxr->flags & XFS_EXCH_RANGE_FILE2_FRESH)
    1240       32091 :                 trace_xfs_xchg_range_freshness(ip2, fxr);
    1241             : 
    1242             :         /*
    1243             :          * Now that we've excluded all other inode metadata changes by taking
    1244             :          * the ILOCK, repeat the freshness check.
    1245             :          */
    1246     1861805 :         error = xfs_exch_range_check_fresh(VFS_I(ip2), fxr);
    1247     1861805 :         if (error)
    1248           0 :                 goto out_trans_cancel;
    1249             : 
    1250     1861805 :         error = xfs_swapext_check_extents(mp, &req);
    1251     1861799 :         if (error)
    1252           0 :                 goto out_trans_cancel;
    1253             : 
    1254             :         /*
    1255             :          * Reserve ourselves some quota if any of them are in enforcing mode.
    1256             :          * In theory we only need enough to satisfy the change in the number
    1257             :          * of blocks between the two ranges being remapped.
                     :          *
                     :          * If the reservation fails with EDQUOT or ENOSPC, cancel the
                     :          * transaction, unlock both inodes, run blockgc against whichever
                     :          * file(s) the helper flagged in qretry, and start over from the
                     :          * transaction allocation.  This retry happens at most once.
    1258             :          */
    1259     1861799 :         error = xfs_xchg_range_reserve_quota(tp, &req, &qretry);
    1260     1861785 :         if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
    1261          11 :                 xfs_trans_cancel(tp);
    1262          11 :                 xfs_xchg_range_iunlock(ip1, ip2);
    1263          11 :                 if (qretry & QRETRY_IP1)
    1264          11 :                         xfs_blockgc_free_quota(ip1, 0);
    1265          11 :                 if (qretry & QRETRY_IP2)
    1266           0 :                         xfs_blockgc_free_quota(ip2, 0);
    1267          11 :                 retried = true;
    1268          11 :                 goto retry;
    1269             :         }
    1270     1861774 :         if (error)
    1271          11 :                 goto out_trans_cancel;
    1272             : 
    1273     1893480 :         if ((xchg_flags & XFS_XCHG_RANGE_LOGGED) || xfs_swapext_supported(mp)) {
    1274             :                 /*
    1275             :                  * xfs_swapext() uses deferred bmap log intent items to swap
    1276             :                  * extents between file forks.  If the atomic log swap feature
    1277             :                  * is enabled, it will also use swapext log intent items to
    1278             :                  * restart the operation in case of failure.
    1279             :                  *
    1280             :                  * This means that we can use it if we previously obtained
    1281             :                  * permission from the log to use log-assisted atomic extent
    1282             :                  * swapping; or if the fs supports rmap or reflink and the
    1283             :                  * user said NONATOMIC.
    1284             :                  */
    1285             :                 strategy = SWAPEXT;
    1286        3510 :         } else if (xfs_xchg_use_forkswap(fxr, ip1, ip2)) {
    1287             :                 /*
    1288             :                  * Exchange the file contents by using the old bmap fork
    1289             :                  * exchange code, if we're a defrag tool doing a full file
    1290             :                  * swap.
    1291             :                  */
    1292        3465 :                 strategy = FORKSWAP;
    1293             : 
    1294        3465 :                 error = xfs_swap_extents_check_format(ip2, ip1);
    1295        3465 :                 if (error) {
    1296           1 :                         xfs_notice(mp,
    1297             :                 "%s: inode 0x%llx format is incompatible for exchanging.",
    1298             :                                         __func__, ip2->i_ino);
    1299           1 :                         goto out_trans_cancel;
    1300             :                 }
    1301             :         } else {
    1302             :                 /* We cannot exchange the file contents. */
    1303          45 :                 error = -EOPNOTSUPP;
    1304          45 :                 goto out_trans_cancel;
    1305             :         }
    1306             : 
    1307             :         /*
                     :          * If we got this far on a dry run, all parameters are ok.  error is
                     :          * zero here, so cancelling the transaction reports success.
                     :          */
    1308     1861717 :         if (fxr->flags & XFS_EXCH_RANGE_DRY_RUN)
    1309         480 :                 goto out_trans_cancel;
    1310             : 
    1311             :         /* Update the mtime and ctime of whichever files the caller selected. */
    1312     1861237 :         if (xchg_flags & XFS_XCHG_RANGE_UPD_CMTIME1)
    1313     1861247 :                 xfs_trans_ichgtime(tp, ip1,
    1314             :                                 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
    1315     1861237 :         if (xchg_flags & XFS_XCHG_RANGE_UPD_CMTIME2)
    1316     1831943 :                 xfs_trans_ichgtime(tp, ip2,
    1317             :                                 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
    1318             : 
    1319     1861242 :         if (strategy == SWAPEXT) {
    1320     1857779 :                 xfs_swapext(tp, &req);
    1321             :         } else {
    1322        3463 :                 error = xfs_swap_extent_forks(&tp, &req);
    1323        3463 :                 if (error)
    1324           0 :                         goto out_trans_cancel;
    1325             :         }
    1326             : 
    1327             :         /*
    1328             :          * Force the log to persist metadata updates if the caller or the
    1329             :          * administrator requires this.  The VFS prep function already flushed
    1330             :          * the relevant parts of the page cache.
    1331             :          */
    1332     1861256 :         if (xfs_has_wsync(mp) || (fxr->flags & XFS_EXCH_RANGE_FSYNC))
    1333       24468 :                 xfs_trans_set_sync(tp);
    1334             : 
    1335     1861256 :         error = xfs_trans_commit(tp);
    1336             : 
    1337     1861266 :         trace_xfs_swap_extent_after(ip2, 0);
    1338     1861263 :         trace_xfs_swap_extent_after(ip1, 1);
    1339             : 
    1340     1861260 :         if (error)
    1341          52 :                 goto out_unlock;
    1342             : 
    1343             :         /*
    1344             :          * If the caller wanted us to exchange the contents of two complete
    1345             :          * files of unequal length, exchange the incore sizes now.  This should
    1346             :          * be safe because we flushed both files' page caches, moved all the
    1347             :          * extents, and updated the ondisk sizes.
    1348             :          */
    1349     1861208 :         if (fxr->flags & XFS_EXCH_RANGE_TO_EOF) {
    1350         326 :                 loff_t  temp;
    1351             : 
    1352         326 :                 temp = i_size_read(VFS_I(ip2));
    1353         326 :                 i_size_write(VFS_I(ip2), i_size_read(VFS_I(ip1)));
    1354         326 :                 i_size_write(VFS_I(ip1), temp);
    1355             :         }
    1356             : 
    1357     1860882 : out_unlock:
    1358     1861797 :         xfs_xchg_range_iunlock(ip1, ip2);
    1359     1861797 :         return error;
    1360             : 
    1361         537 : out_trans_cancel:
    1362         537 :         xfs_trans_cancel(tp);
    1363         537 :         goto out_unlock;
    1364             : }

Generated by: LCOV version 1.14