LCOV - code coverage report
Current view: top level - fs/xfs - xfs_xchgrange.c (source / functions) Hit Total Coverage
Test: fstests of 6.5.0-rc3-acha @ Mon Jul 31 20:08:06 PDT 2023 Lines: 336 508 66.1 %
Date: 2023-07-31 20:08:07 Functions: 18 22 81.8 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-or-later
       2             : /*
       3             :  * Copyright (C) 2020-2023 Oracle.  All Rights Reserved.
       4             :  * Author: Darrick J. Wong <djwong@kernel.org>
       5             :  *
       6             :  * The xfs_swap_extent_* functions are:
       7             :  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
       8             :  * Copyright (c) 2012 Red Hat, Inc.
       9             :  * All Rights Reserved.
      10             :  */
      11             : #include "xfs.h"
      12             : #include "xfs_shared.h"
      13             : #include "xfs_format.h"
      14             : #include "xfs_log_format.h"
      15             : #include "xfs_trans_resv.h"
      16             : #include "xfs_mount.h"
      17             : #include "xfs_defer.h"
      18             : #include "xfs_inode.h"
      19             : #include "xfs_trans.h"
      20             : #include "xfs_quota.h"
      21             : #include "xfs_bmap_util.h"
      22             : #include "xfs_bmap_btree.h"
      23             : #include "xfs_reflink.h"
      24             : #include "xfs_trace.h"
      25             : #include "xfs_swapext.h"
      26             : #include "xfs_xchgrange.h"
      27             : #include "xfs_sb.h"
      28             : #include "xfs_icache.h"
      29             : #include "xfs_log.h"
      30             : #include "xfs_rtalloc.h"
      31             : #include <linux/fsnotify.h>
      32             : 
      33             : /*
      34             :  * Generic code for exchanging ranges of two files via XFS_IOC_EXCHANGE_RANGE.
      35             :  * This part does not deal with XFS-specific data structures, and may some day
      36             :  * be ported to the VFS.
      37             :  *
      38             :  * The goal is to exchange fxr.length bytes starting at fxr.file1_offset in
      39             :  * file1 with the same number of bytes starting at fxr.file2_offset in file2.
      40             :  * Implementations must call xfs_exch_range_prep to prepare the two files
      41             :  * prior to taking locks; they must call xfs_exch_range_check_fresh once
      42             :  * the inode is locked to abort the call if file2 has changed; and they must
      43             :  * update the inode change and mod times of both files as part of the metadata
      44             :  * update.  The timestamp updates must be done atomically as part of the data
      45             :  * exchange operation to ensure correctness of the freshness check.
      46             :  */
      47             : 
      48             : /*
      49             :  * Check that file2's metadata agrees with the snapshot that we took for
      50             :  * the range exchange request.
      51             :  *
      52             :  * This should be called after the filesystem has locked /all/ inode metadata
      53             :  * against modification.
      54             :  */
      55             : STATIC int
      56      353666 : xfs_exch_range_check_fresh(
      57             :         struct inode                    *inode2,
      58             :         const struct xfs_exch_range     *fxr)
      59             : {
      60             :         /* Check that file2 hasn't otherwise been modified. */
      61      353666 :         if ((fxr->flags & XFS_EXCH_RANGE_FILE2_FRESH) &&
      62       13096 :             (fxr->file2_ino        != inode2->i_ino ||
      63       13096 :              fxr->file2_ctime      != inode2->i_ctime.tv_sec  ||
      64       13096 :              fxr->file2_ctime_nsec != inode2->i_ctime.tv_nsec ||
      65       13090 :              fxr->file2_mtime      != inode2->i_mtime.tv_sec  ||
      66       13090 :              fxr->file2_mtime_nsec != inode2->i_mtime.tv_nsec))
      67           6 :                 return -EBUSY;
      68             : 
      69             :         return 0;
      70             : }
      71             : 
      72             : /* Performs necessary checks before doing a range exchange. */
      73             : STATIC int
      74      176871 : xfs_exch_range_checks(
      75             :         struct file             *file1,
      76             :         struct file             *file2,
      77             :         struct xfs_exch_range   *fxr,
      78             :         unsigned int            blocksize)
      79             : {
      80      176871 :         struct inode            *inode1 = file1->f_mapping->host;
      81      176871 :         struct inode            *inode2 = file2->f_mapping->host;
      82      176871 :         uint64_t                blkmask = blocksize - 1;
      83      176871 :         int64_t                 test_len;
      84      176871 :         uint64_t                blen;
      85      176871 :         loff_t                  size1, size2;
      86      176871 :         int                     error;
      87             : 
      88             :         /* Don't touch certain kinds of inodes */
      89      176871 :         if (IS_IMMUTABLE(inode1) || IS_IMMUTABLE(inode2))
      90             :                 return -EPERM;
      91      176869 :         if (IS_SWAPFILE(inode1) || IS_SWAPFILE(inode2))
      92             :                 return -ETXTBSY;
      93             : 
      94      176867 :         size1 = i_size_read(inode1);
      95      176867 :         size2 = i_size_read(inode2);
      96             : 
      97             :         /* Ranges cannot start after EOF. */
      98      176867 :         if (fxr->file1_offset > size1 || fxr->file2_offset > size2)
      99             :                 return -EINVAL;
     100             : 
     101             :         /*
     102             :          * If the caller asked for full files, check that the offset/length
     103             :          * values cover all of both files.
     104             :          */
     105      176859 :         if ((fxr->flags & XFS_EXCH_RANGE_FULL_FILES) &&
     106        6459 :             (fxr->file1_offset != 0 || fxr->file2_offset != 0 ||
     107        6459 :              fxr->length != size1 || fxr->length != size2))
     108             :                 return -EDOM;
     109             : 
     110             :         /*
     111             :          * If the caller said to exchange to EOF, we set the length of the
     112             :          * request large enough to cover everything to the end of both files.
     113             :          */
     114      176855 :         if (fxr->flags & XFS_EXCH_RANGE_TO_EOF)
     115          74 :                 fxr->length = max_t(int64_t, size1 - fxr->file1_offset,
     116             :                                              size2 - fxr->file2_offset);
     117             : 
     118             :         /* The start of both ranges must be aligned to an fs block. */
     119      176855 :         if (!IS_ALIGNED(fxr->file1_offset, blocksize) ||
     120      176853 :             !IS_ALIGNED(fxr->file2_offset, blocksize))
     121             :                 return -EINVAL;
     122             : 
     123             :         /* Ensure offsets don't wrap. */
     124      176853 :         if (fxr->file1_offset + fxr->length < fxr->file1_offset ||
     125      176853 :             fxr->file2_offset + fxr->length < fxr->file2_offset)
     126             :                 return -EINVAL;
     127             : 
     128             :         /*
     129             :          * We require both ranges to be within EOF, unless we're exchanging
     130             :          * to EOF.  The check earlier in this function already confirmed that
     131             :          * neither fxr->file1_offset nor fxr->file2_offset starts after EOF.
     132             :          */
     133      176853 :         if (!(fxr->flags & XFS_EXCH_RANGE_TO_EOF) &&
     134      176779 :             (fxr->file1_offset + fxr->length > size1 ||
     135      176775 :              fxr->file2_offset + fxr->length > size2))
     136             :                 return -EINVAL;
     137             : 
     138             :         /*
     139             :          * Make sure we don't hit any file size limits.  If we hit any size
     140             :          * limits such that test_len was adjusted, we abort the whole
     141             :          * operation.
     142             :          */
     143      176847 :         test_len = fxr->length;
     144      176847 :         error = generic_write_check_limits(file2, fxr->file2_offset, &test_len);
     145      176847 :         if (error)
     146             :                 return error;
     147      176847 :         error = generic_write_check_limits(file1, fxr->file1_offset, &test_len);
     148      176847 :         if (error)
     149             :                 return error;
     150      176847 :         if (test_len != fxr->length)
     151             :                 return -EINVAL;
     152             : 
     153             :         /*
     154             :          * If the range ends exactly at file1's EOF, round the length up to
     155             :          * the next block boundary for this check.  Failing that, do the same
     156             :          * if the range ends exactly at file2's EOF.
     157             :          *
     158             :          * Otherwise, reject the range length if it's not block aligned.  We
     159             :          * already confirmed the starting offsets' block alignment.
     160             :          */
     161      176845 :         if (fxr->file1_offset + fxr->length == size1)
     162        6631 :                 blen = ALIGN(size1, blocksize) - fxr->file1_offset;
     163      170214 :         else if (fxr->file2_offset + fxr->length == size2)
     164        7585 :                 blen = ALIGN(size2, blocksize) - fxr->file2_offset;
     165      162629 :         else if (!IS_ALIGNED(fxr->length, blocksize))
     166             :                 return -EINVAL;
     167             :         else
     168             :                 blen = fxr->length;
     169             : 
     170             :         /* Don't allow overlapped exchanges within the same file. */
     171      176845 :         if (inode1 == inode2 &&
     172      170282 :             fxr->file2_offset + blen > fxr->file1_offset &&
     173       95617 :             fxr->file1_offset + blen > fxr->file2_offset)
     174             :                 return -EINVAL;
     175             : 
     176             :         /* If we already failed the freshness check, we're done. */
     177      176839 :         error = xfs_exch_range_check_fresh(inode2, fxr);
     178      176839 :         if (error)
     179             :                 return error;
     180             : 
     181             :         /*
     182             :          * Ensure that we don't exchange a partial EOF block into the middle of
     183             :          * another file.
     184             :          */
     185      176833 :         if ((fxr->length & blkmask) == 0)
     186             :                 return 0;
     187             : 
     188        1605 :         blen = fxr->length;
     189        1605 :         if (fxr->file2_offset + blen < size2)
     190           2 :                 blen &= ~blkmask;
     191             : 
     192        1605 :         if (fxr->file1_offset + blen < size1)
     193           4 :                 blen &= ~blkmask;
     194             : 
     195        1605 :         return blen == fxr->length ? 0 : -EINVAL;
     196             : }
     197             : 
     198             : /*
     199             :  * Check that the two inodes are eligible for range exchanges, the ranges make
     200             :  * sense, and then flush all dirty data.  Caller must ensure that the inodes
     201             :  * have been locked against any other modifications.
     202             :  */
     203             : int
     204      176871 : xfs_exch_range_prep(
     205             :         struct file             *file1,
     206             :         struct file             *file2,
     207             :         struct xfs_exch_range   *fxr,
     208             :         unsigned int            blocksize)
     209             : {
     210      176871 :         struct inode            *inode1 = file_inode(file1);
     211      176871 :         struct inode            *inode2 = file_inode(file2);
     212      176871 :         bool                    same_inode = (inode1 == inode2);
     213      176871 :         int                     error;
     214             : 
     215             :         /* Check that we don't violate system file offset limits. */
     216      176871 :         error = xfs_exch_range_checks(file1, file2, fxr, blocksize);
     217      176871 :         if (error || fxr->length == 0)
     218             :                 return error;
     219             : 
     220             :         /* Wait for the completion of any pending IOs on both files */
     221      176713 :         inode_dio_wait(inode1);
     222      176713 :         if (!same_inode)
     223        6553 :                 inode_dio_wait(inode2);
     224             : 
     225      176713 :         error = filemap_write_and_wait_range(inode1->i_mapping,
     226             :                         fxr->file1_offset,
     227      176713 :                         fxr->file1_offset + fxr->length - 1);
     228      176713 :         if (error)
     229             :                 return error;
     230             : 
     231      176713 :         error = filemap_write_and_wait_range(inode2->i_mapping,
     232             :                         fxr->file2_offset,
     233      176713 :                         fxr->file2_offset + fxr->length - 1);
     234      176713 :         if (error)
     235             :                 return error;
     236             : 
     237             :         /*
     238             :          * If the files or inodes involved require synchronous writes, amend
     239             :          * the request to force the filesystem to flush all data and metadata
     240             :          * to disk after the operation completes.
     241             :          */
     242      176713 :         if (((file1->f_flags | file2->f_flags) & (__O_SYNC | O_DSYNC)) ||
     243      170819 :             IS_SYNC(inode1) || IS_SYNC(inode2))
     244        5894 :                 fxr->flags |= XFS_EXCH_RANGE_FSYNC;
     245             : 
     246             :         return 0;
     247             : }
     248             : 
     249             : /*
     250             :  * Finish a range exchange operation, if it was successful.  Caller must ensure
     251             :  * that the inodes are still locked against any other modifications.
     252             :  */
     253             : int
     254      176809 : xfs_exch_range_finish(
     255             :         struct file             *file1,
     256             :         struct file             *file2)
     257             : {
     258      176809 :         int                     error;
     259             : 
     260      176809 :         error = file_remove_privs(file1);
     261      176809 :         if (error)
     262             :                 return error;
     263      176809 :         if (file_inode(file1) == file_inode(file2))
     264             :                 return 0;
     265             : 
     266        6537 :         return file_remove_privs(file2);
     267             : }
     268             : 
     269             : /* Decide if it's ok to remap the selected range of a given file. */
     270             : STATIC int
     271      353742 : xfs_exch_range_verify_area(
     272             :         struct file             *file,
     273             :         loff_t                  pos,
     274             :         struct xfs_exch_range   *fxr)
     275             : {
     276      353742 :         int64_t                 len = fxr->length;
     277             : 
     278      353742 :         if (pos < 0)
     279             :                 return -EINVAL;
     280             : 
     281      353742 :         if (fxr->flags & XFS_EXCH_RANGE_TO_EOF)
     282         148 :                 len = min_t(int64_t, len, i_size_read(file_inode(file)) - pos);
     283      353742 :         return remap_verify_area(file, pos, len, true);
     284             : }
     285             : 
     286             : /* Prepare for and exchange parts of two files. */
     287             : static inline int
     288     1703894 : __xfs_exch_range(
     289             :         struct file             *file1,
     290             :         struct file             *file2,
     291             :         struct xfs_exch_range   *fxr)
     292             : {
     293     1703894 :         struct inode            *inode1 = file_inode(file1);
     294     1703894 :         struct inode            *inode2 = file_inode(file2);
     295     1703894 :         int                     ret;
     296             : 
     297     3407788 :         if ((fxr->flags & ~XFS_EXCH_RANGE_ALL_FLAGS) ||
     298     1703894 :             memchr_inv(&fxr->pad, 0, sizeof(fxr->pad)))
     299           0 :                 return -EINVAL;
     300             : 
     301     1703894 :         if ((fxr->flags & XFS_EXCH_RANGE_FULL_FILES) &&
     302             :             (fxr->flags & XFS_EXCH_RANGE_TO_EOF))
     303             :                 return -EINVAL;
     304             : 
     305             :         /*
     306             :          * The ioctl enforces that src and dest files are on the same mount.
     307             :          * However, they only need to be on the same file system.
     308             :          */
     309     1703894 :         if (inode1->i_sb != inode2->i_sb)
     310             :                 return -EXDEV;
     311             : 
     312             :         /* This only works for regular files. */
     313     1703894 :         if (S_ISDIR(inode1->i_mode) || S_ISDIR(inode2->i_mode))
     314             :                 return -EISDIR;
     315     1703892 :         if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
     316             :                 return -EINVAL;
     317             : 
     318     1703892 :         ret = generic_file_rw_checks(file1, file2);
     319     1703893 :         if (ret < 0)
     320             :                 return ret;
     321             : 
     322     1703891 :         ret = generic_file_rw_checks(file2, file1);
     323     1703890 :         if (ret < 0)
     324             :                 return ret;
     325             : 
     326      176871 :         ret = xfs_exch_range_verify_area(file1, fxr->file1_offset, fxr);
     327      176871 :         if (ret)
     328             :                 return ret;
     329             : 
     330      176871 :         ret = xfs_exch_range_verify_area(file2, fxr->file2_offset, fxr);
     331      176871 :         if (ret)
     332             :                 return ret;
     333             : 
     334      176871 :         ret = xfs_file_xchg_range(file1, file2, fxr);
     335      176871 :         if (ret)
     336             :                 return ret;
     337             : 
     338      176809 :         fsnotify_modify(file1);
     339      176809 :         if (file2 != file1)
     340        6541 :                 fsnotify_modify(file2);
     341             :         return 0;
     342             : }
     343             : 
     344             : /* Exchange parts of two files. */
     345             : int
     346     1703896 : xfs_exch_range(
     347             :         struct file             *file1,
     348             :         struct file             *file2,
     349             :         struct xfs_exch_range   *fxr)
     350             : {
     351     1703896 :         int                     error;
     352             : 
     353     1703896 :         file_start_write(file2);
     354     1703894 :         error = __xfs_exch_range(file1, file2, fxr);
     355     1703887 :         file_end_write(file2);
     356     1703892 :         return error;
     357             : }
     358             : 
     359             : /* XFS-specific parts of XFS_IOC_EXCHANGE_RANGE */
     360             : 
     361             : /*
     362             :  * Exchanging ranges as a file operation.  This is the binding between the
     363             :  * VFS-level concepts and the XFS-specific implementation.
     364             :  */
     365             : int
     366      176871 : xfs_file_xchg_range(
     367             :         struct file             *file1,
     368             :         struct file             *file2,
     369             :         struct xfs_exch_range   *fxr)
     370             : {
     371      176871 :         struct inode            *inode1 = file_inode(file1);
     372      176871 :         struct inode            *inode2 = file_inode(file2);
     373      176871 :         struct xfs_inode        *ip1 = XFS_I(inode1);
     374      176871 :         struct xfs_inode        *ip2 = XFS_I(inode2);
     375      176871 :         struct xfs_mount        *mp = ip1->i_mount;
     376      176871 :         unsigned int            priv_flags = 0;
     377      176871 :         bool                    use_logging = false;
     378      176871 :         int                     error;
     379             : 
     380      353742 :         if (xfs_is_shutdown(mp))
     381             :                 return -EIO;
     382             : 
     383             :         /* Update cmtime if the fd/inode don't forbid it. */
     384      176871 :         if (likely(!(file1->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode1)))
     385      176871 :                 priv_flags |= XFS_XCHG_RANGE_UPD_CMTIME1;
     386      176871 :         if (likely(!(file2->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode2)))
     387      170892 :                 priv_flags |= XFS_XCHG_RANGE_UPD_CMTIME2;
     388             : 
     389             :         /* Lock both files against IO */
     390      176871 :         error = xfs_ilock2_io_mmap(ip1, ip2);
     391      176871 :         if (error)
     392           0 :                 goto out_err;
     393             : 
     394             :         /* Prepare and then exchange file contents. */
     395      176871 :         error = xfs_xchg_range_prep(file1, file2, fxr, priv_flags);
     396      176871 :         if (error)
     397          42 :                 goto out_unlock;
     398             : 
     399             :         /* Get permission to use log-assisted file content swaps. */
     400      176829 :         error = xfs_xchg_range_grab_log_assist(mp,
     401      176829 :                         !(fxr->flags & XFS_EXCH_RANGE_NONATOMIC),
     402             :                         &use_logging);
     403      176829 :         if (error)
     404           2 :                 goto out_unlock;
     405      176827 :         if (use_logging)
     406      170364 :                 priv_flags |= XFS_XCHG_RANGE_LOGGED;
     407             : 
     408      176827 :         error = xfs_xchg_range(ip1, ip2, fxr, priv_flags);
     409      176827 :         if (error)
     410          18 :                 goto out_drop_feat;
     411             : 
     412             :         /*
     413             :          * Finish the exchange by removing special file privileges like any
     414             :          * other file write would do.  This may involve turning on support for
     415             :          * logged xattrs if either file has security capabilities, which means
     416             :          * xfs_xchg_range_grab_log_assist before xfs_attr_grab_log_assist.
     417             :          */
     418      176809 :         error = xfs_exch_range_finish(file1, file2);
     419      176809 :         if (error)
     420           0 :                 goto out_drop_feat;
     421             : 
     422      176809 : out_drop_feat:
     423      176827 :         if (use_logging)
     424      170364 :                 xfs_xchg_range_rele_log_assist(mp);
     425        6463 : out_unlock:
     426      176871 :         xfs_iunlock2_io_mmap(ip1, ip2);
     427      176871 : out_err:
     428      176871 :         if (error)
     429          62 :                 trace_xfs_file_xchg_range_error(ip2, error, _RET_IP_);
     430             :         return error;
     431             : }
     432             : 
     433             : /* Lock (and optionally join) two inodes for a file range exchange. */
     434             : void
     435      610633 : xfs_xchg_range_ilock(
     436             :         struct xfs_trans        *tp,
     437             :         struct xfs_inode        *ip1,
     438             :         struct xfs_inode        *ip2)
     439             : {
     440      610633 :         if (ip1 != ip2)
     441      270085 :                 xfs_lock_two_inodes(ip1, XFS_ILOCK_EXCL,
     442             :                                     ip2, XFS_ILOCK_EXCL);
     443             :         else
     444      340548 :                 xfs_ilock(ip1, XFS_ILOCK_EXCL);
     445      610646 :         if (tp) {
     446      390875 :                 xfs_trans_ijoin(tp, ip1, 0);
     447      390862 :                 if (ip2 != ip1)
     448      220588 :                         xfs_trans_ijoin(tp, ip2, 0);
     449             :         }
     450             : 
     451      610638 : }
     452             : 
     453             : /* Unlock two inodes after a file range exchange operation. */
     454             : void
     455      396604 : xfs_xchg_range_iunlock(
     456             :         struct xfs_inode        *ip1,
     457             :         struct xfs_inode        *ip2)
     458             : {
     459      396604 :         if (ip2 != ip1)
     460       56056 :                 xfs_iunlock(ip2, XFS_ILOCK_EXCL);
     461      396604 :         xfs_iunlock(ip1, XFS_ILOCK_EXCL);
     462      396604 : }
     463             : 
     464             : /*
     465             :  * Estimate the resource requirements to exchange file contents between the two
     466             :  * files.  The caller is required to hold the IOLOCK and the MMAPLOCK and to
     467             :  * have flushed both inodes' pagecache and active direct-ios.
     468             :  */
     469             : int
     470      219765 : xfs_xchg_range_estimate(
     471             :         struct xfs_swapext_req  *req)
     472             : {
     473      219765 :         int                     error;
     474             : 
     475      219765 :         xfs_xchg_range_ilock(NULL, req->ip1, req->ip2);
     476      219765 :         error = xfs_swapext_estimate(req);
     477      219765 :         xfs_xchg_range_iunlock(req->ip1, req->ip2);
     478      219765 :         return error;
     479             : }
     480             : 
     481             : /*
     482             :  * We need to check that the format of the data fork in the temporary inode is
     483             :  * valid for the target inode before doing the swap. This is not a problem with
     484             :  * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
     485             :  * data fork depending on the space the attribute fork is taking so we can get
     486             :  * invalid formats on the target inode.
     487             :  *
     488             :  * E.g. target has space for 7 extents in extent format, temp inode only has
     489             :  * space for 6.  If we defragment down to 7 extents, then the tmp format is a
     490             :  * btree, but when swapped it needs to be in extent format. Hence we can't just
     491             :  * blindly swap data forks on attr2 filesystems.
     492             :  *
     493             :  * Note that we check the swap in both directions so that we don't end up with
     494             :  * a corrupt temporary inode, either.
     495             :  *
     496             :  * Note that fixing the way xfs_fsr sets up the attribute fork in the source
     497             :  * inode will prevent this situation from occurring, so all we do here is
     498             :  * reject and log the attempt. Basically we are putting the responsibility on
     499             :  * userspace to get this right.
     500             :  */
     501             : STATIC int
     502           0 : xfs_swap_extents_check_format(
     503             :         struct xfs_inode        *ip,    /* target inode */
     504             :         struct xfs_inode        *tip)   /* tmp inode */
     505             : {
     506           0 :         struct xfs_ifork        *ifp = &ip->i_df;
     507           0 :         struct xfs_ifork        *tifp = &tip->i_df;
     508             : 
     509             :         /* User/group/project quota ids must match if quotas are enforced. */
     510           0 :         if (XFS_IS_QUOTA_ON(ip->i_mount) &&
     511           0 :             (!uid_eq(VFS_I(ip)->i_uid, VFS_I(tip)->i_uid) ||
     512           0 :              !gid_eq(VFS_I(ip)->i_gid, VFS_I(tip)->i_gid) ||
     513           0 :              ip->i_projid != tip->i_projid))
     514             :                 return -EINVAL;
     515             : 
     516             :         /* Should never get a local format */
     517           0 :         if (ifp->if_format == XFS_DINODE_FMT_LOCAL ||
     518           0 :             tifp->if_format == XFS_DINODE_FMT_LOCAL)
     519             :                 return -EINVAL;
     520             : 
     521             :         /*
     522             :          * If the target inode has fewer extents than the temporary inode,
     523             :          * then why did userspace call us?
     524             :          */
     525           0 :         if (ifp->if_nextents < tifp->if_nextents)
     526             :                 return -EINVAL;
     527             : 
     528             :         /*
     529             :          * If we have to use the (expensive) rmap swap method, we can
     530             :          * handle any number of extents and any format.
     531             :          */
     532           0 :         if (xfs_has_rmapbt(ip->i_mount))
     533             :                 return 0;
     534             : 
     535             :         /*
     536             :          * If the target inode is in extent form and the temp inode is in btree
     537             :          * form, then we will end up with the target inode in the wrong format
     538             :          * as we already know there are fewer extents in the temp inode.
     539             :          */
     540           0 :         if (ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
     541             :             tifp->if_format == XFS_DINODE_FMT_BTREE)
     542             :                 return -EINVAL;
     543             : 
     544             :         /* Check temp in extent form to max in target */
     545           0 :         if (tifp->if_format == XFS_DINODE_FMT_EXTENTS &&
     546           0 :             tifp->if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
     547             :                 return -EINVAL;
     548             : 
     549             :         /* Check target in extent form to max in temp */
     550           0 :         if (ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
     551           0 :             ifp->if_nextents > XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
     552             :                 return -EINVAL;
     553             : 
     554             :         /*
     555             :          * If we are in a btree format, check that the temp root block will fit
     556             :          * in the target and that it has enough extents to be in btree format
     557             :          * in the target.
     558             :          *
     559             :          * Note that we have to be careful to allow btree->extent conversions
     560             :          * (a common defrag case) which will occur when the temp inode is in
     561             :          * extent format...
     562             :          */
     563           0 :         if (tifp->if_format == XFS_DINODE_FMT_BTREE) {
     564           0 :                 if (xfs_inode_has_attr_fork(ip) &&
     565           0 :                     XFS_BMAP_BMDR_SPACE(tifp->if_broot) > xfs_inode_fork_boff(ip))
     566             :                         return -EINVAL;
     567           0 :                 if (tifp->if_nextents <= XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
     568             :                         return -EINVAL;
     569             :         }
     570             : 
     571             :         /* Reciprocal target->temp btree format checks */
     572           0 :         if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
     573           0 :                 if (xfs_inode_has_attr_fork(tip) &&
     574           0 :                     XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > xfs_inode_fork_boff(tip))
     575             :                         return -EINVAL;
     576           0 :                 if (ifp->if_nextents <= XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
     577           0 :                         return -EINVAL;
     578             :         }
     579             : 
     580             :         return 0;
     581             : }
     582             : 
     583             : /*
     584             :  * Fix up the owners of the bmbt blocks to refer to the current inode. The
     585             :  * change owner scan attempts to order all modified buffers in the current
     586             :  * transaction. In the event of ordered buffer failure, the offending buffer is
     587             :  * physically logged as a fallback and the scan returns -EAGAIN. We must roll
     588             :  * the transaction in this case to replenish the fallback log reservation and
     589             :  * restart the scan. This process repeats until the scan completes.
     590             :  */
     591             : static int
     592           0 : xfs_swap_change_owner(
     593             :         struct xfs_trans        **tpp,          /* in/out; handed to xfs_trans_roll() on -EAGAIN */
     594             :         struct xfs_inode        *ip,            /* inode whose data fork bmbt is re-owned to ip->i_ino */
     595             :         struct xfs_inode        *tmpip)         /* other inode; only rejoined and relogged after a roll */
     596             : {
     597           0 :         int                     error;
     598           0 :         struct xfs_trans        *tp = *tpp;
     599             : 
     600           0 :         do {
     601           0 :                 error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, ip->i_ino,
     602             :                                               NULL);
     603             :                 /* success or fatal error */
     604           0 :                 if (error != -EAGAIN)
     605             :                         break;
     606             : 
     607           0 :                 error = xfs_trans_roll(tpp);
     608           0 :                 if (error)
     609             :                         break;
     610           0 :                 tp = *tpp;      /* re-read: the roll was given tpp, so use whatever it left there */
     611             : 
     612             :                 /*
     613             :                  * Redirty both inodes so they can relog and keep the log tail
     614             :                  * moving forward.
     615             :                  */
     616           0 :                 xfs_trans_ijoin(tp, ip, 0);
     617           0 :                 xfs_trans_ijoin(tp, tmpip, 0);
     618           0 :                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
     619           0 :                 xfs_trans_log_inode(tp, tmpip, XFS_ILOG_CORE);
     620             :         } while (true);
     621             : 
     622           0 :         return error;   /* scan result (never -EAGAIN) or the xfs_trans_roll() failure */
     623             : }
     624             : 
     625             : /* Swap the extents of two files by swapping data forks; *tpp may be rolled. */
     626             : STATIC int
     627           0 : xfs_swap_extent_forks(
     628             :         struct xfs_trans        **tpp,          /* in/out; passed on to xfs_swap_change_owner() */
     629             :         struct xfs_swapext_req  *req)           /* supplies the two inodes (ip1, ip2) */
     630             : {
     631           0 :         struct xfs_inode        *ip = req->ip2;
     632           0 :         struct xfs_inode        *tip = req->ip1;
     633           0 :         xfs_filblks_t           aforkblks = 0;          /* block count of ip's attr fork */
     634           0 :         xfs_filblks_t           taforkblks = 0;         /* block count of tip's attr fork */
     635           0 :         xfs_extnum_t            junk;                   /* written by xfs_bmap_count_blocks(), never read */
     636           0 :         uint64_t                tmp;                    /* ip's i_nblocks from before the swap */
     637           0 :         int                     src_log_flags = XFS_ILOG_CORE;          /* logged against ip */
     638           0 :         int                     target_log_flags = XFS_ILOG_CORE;       /* logged against tip */
     639           0 :         int                     error;
     640             : 
     641             :         /*
     642             :          * Count the number of extended attribute blocks
     643             :          */
     644           0 :         if (xfs_inode_has_attr_fork(ip) && ip->i_af.if_nextents > 0 &&
     645           0 :             ip->i_af.if_format != XFS_DINODE_FMT_LOCAL) {
     646           0 :                 error = xfs_bmap_count_blocks(*tpp, ip, XFS_ATTR_FORK, &junk,
     647             :                                 &aforkblks);
     648           0 :                 if (error)
     649             :                         return error;
     650             :         }
     651           0 :         if (xfs_inode_has_attr_fork(tip) && tip->i_af.if_nextents > 0 &&
     652           0 :             tip->i_af.if_format != XFS_DINODE_FMT_LOCAL) {
     653           0 :                 error = xfs_bmap_count_blocks(*tpp, tip, XFS_ATTR_FORK, &junk,
     654             :                                 &taforkblks);
     655           0 :                 if (error)
     656             :                         return error;
     657             :         }
     658             : 
     659             :         /*
     660             :          * Btree format (v3) inodes have the inode number stamped in the bmbt
     661             :          * block headers. We can't start changing the bmbt blocks until the
     662             :          * inode owner change is logged so recovery does the right thing in the
     663             :          * event of a crash. Set the owner change log flags now and leave the
     664             :          * bmbt scan as the last step.
     665             :          */
     666           0 :         if (xfs_has_v3inodes(ip->i_mount)) {
     667           0 :                 if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE)
     668           0 :                         target_log_flags |= XFS_ILOG_DOWNER;    /* this fork is about to move to tip */
     669           0 :                 if (tip->i_df.if_format == XFS_DINODE_FMT_BTREE)
     670           0 :                         src_log_flags |= XFS_ILOG_DOWNER;       /* this fork is about to move to ip */
     671             :         }
     672             : 
     673             :         /*
     674             :          * Swap the data forks of the inodes
     675             :          */
     676           0 :         swap(ip->i_df, tip->i_df);
     677             : 
     678             :         /*
     679             :          * Fix the on-disk inode values
     680             :          */
     681           0 :         tmp = (uint64_t)ip->i_nblocks;
     682           0 :         ip->i_nblocks = tip->i_nblocks - taforkblks + aforkblks;
     683           0 :         tip->i_nblocks = tmp + taforkblks - aforkblks;
     684             : 
     685             :         /*
     686             :          * The extents in the source inode could still contain speculative
     687             :          * preallocation beyond EOF (e.g. the file is open but not modified
     688             :          * while defrag is in progress). In that case, we need to copy over the
     689             :          * number of delalloc blocks the data fork in the source inode is
     690             :          * tracking beyond EOF so that when the fork is truncated away when the
     691             :          * temporary inode is unlinked we don't underrun the i_delayed_blks
     692             :          * counter on that inode.
     693             :          */
     694           0 :         ASSERT(tip->i_delayed_blks == 0);
     695           0 :         tip->i_delayed_blks = ip->i_delayed_blks;
     696           0 :         ip->i_delayed_blks = 0;
     697             : 
     698           0 :         switch (ip->i_df.if_format) {   /* format of the fork ip received in the swap */
     699           0 :         case XFS_DINODE_FMT_EXTENTS:
     700           0 :                 src_log_flags |= XFS_ILOG_DEXT;
     701           0 :                 break;
     702           0 :         case XFS_DINODE_FMT_BTREE:
     703           0 :                 ASSERT(!xfs_has_v3inodes(ip->i_mount) ||
     704             :                        (src_log_flags & XFS_ILOG_DOWNER));
     705           0 :                 src_log_flags |= XFS_ILOG_DBROOT;
     706           0 :                 break;
     707             :         }
     708             : 
     709           0 :         switch (tip->i_df.if_format) {  /* format of the fork tip received in the swap */
     710           0 :         case XFS_DINODE_FMT_EXTENTS:
     711           0 :                 target_log_flags |= XFS_ILOG_DEXT;
     712           0 :                 break;
     713           0 :         case XFS_DINODE_FMT_BTREE:
     714           0 :                 target_log_flags |= XFS_ILOG_DBROOT;
     715           0 :                 ASSERT(!xfs_has_v3inodes(ip->i_mount) ||
     716             :                        (target_log_flags & XFS_ILOG_DOWNER));
     717             :                 break;
     718             :         }
     719             : 
     720             :         /* Do we have to swap reflink flags? */
     721           0 :         if ((ip->i_diflags2 & XFS_DIFLAG2_REFLINK) ^
     722           0 :             (tip->i_diflags2 & XFS_DIFLAG2_REFLINK)) {
     723           0 :                 uint64_t        f;      /* ip's reflink bit, saved before it is overwritten */
     724             : 
     725           0 :                 f = ip->i_diflags2 & XFS_DIFLAG2_REFLINK;
     726           0 :                 ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
     727           0 :                 ip->i_diflags2 |= tip->i_diflags2 & XFS_DIFLAG2_REFLINK;
     728           0 :                 tip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
     729           0 :                 tip->i_diflags2 |= f & XFS_DIFLAG2_REFLINK;
     730             :         }
     731             : 
     732             :         /* Swap the cow forks. */
     733           0 :         if (xfs_has_reflink(ip->i_mount)) {
     734           0 :                 ASSERT(!ip->i_cowfp ||
     735             :                        ip->i_cowfp->if_format == XFS_DINODE_FMT_EXTENTS);
     736           0 :                 ASSERT(!tip->i_cowfp ||
     737             :                        tip->i_cowfp->if_format == XFS_DINODE_FMT_EXTENTS);
     738             : 
     739           0 :                 swap(ip->i_cowfp, tip->i_cowfp);
     740             : 
     741           0 :                 if (ip->i_cowfp && ip->i_cowfp->if_bytes)
     742           0 :                         xfs_inode_set_cowblocks_tag(ip);
     743             :                 else
     744           0 :                         xfs_inode_clear_cowblocks_tag(ip);
     745           0 :                 if (tip->i_cowfp && tip->i_cowfp->if_bytes)
     746           0 :                         xfs_inode_set_cowblocks_tag(tip);
     747             :                 else
     748           0 :                         xfs_inode_clear_cowblocks_tag(tip);
     749             :         }
     750             : 
     751           0 :         xfs_trans_log_inode(*tpp, ip,  src_log_flags);
     752           0 :         xfs_trans_log_inode(*tpp, tip, target_log_flags);
     753             : 
     754             :         /*
     755             :          * The extent forks have been swapped, but crc=1,rmapbt=0 filesystems
     756             :          * have inode number owner values in the bmbt blocks that still refer to
     757             :          * the old inode. Scan each bmbt to fix up the owner values with the
     758             :          * inode number of the current inode.
     759             :          */
     760           0 :         if (src_log_flags & XFS_ILOG_DOWNER) {
     761           0 :                 error = xfs_swap_change_owner(tpp, ip, tip);
     762           0 :                 if (error)
     763             :                         return error;
     764             :         }
     765           0 :         if (target_log_flags & XFS_ILOG_DOWNER) {
     766           0 :                 error = xfs_swap_change_owner(tpp, tip, ip);
     767           0 :                 if (error)
     768           0 :                         return error;
     769             :         }
     770             : 
     771             :         return 0;
     772             : }
     773             : 
     774             : /*
     775             :  * There may be partially written rt extents lurking in the ranges to be
     776             :  * swapped.  According to the rules for realtime files with big rt extents, we
     777             :  * must guarantee that an outside observer (an IO thread, realistically) never
     778             :  * can see multiple physical rt extents mapped to the same logical file rt
     779             :  * extent.  The deferred bmap log intent items that we use under the hood
     780             :  * operate on single block mappings and not rt extents, which means we must
     781             :  * have a strategy to ensure that log recovery after a failure won't stop in
     782             :  * the middle of an rt extent.
     783             :  *
     784             :  * The preferred strategy is to use deferred extent swap log intent items to
     785             :  * track the status of the overall swap operation so that we can complete the
     786             :  * work during crash recovery.  If that isn't possible, we fall back to
     787             :  * requiring the selected mappings in both forks to be aligned to rt extent
     788             :  * boundaries.  As an aside, the old fork swap routine didn't have this
     789             :  * requirement, but at an extreme cost in flexibility (full files only, and no
     790             :  * support if rmapbt is enabled).
     791             :  */
     792             : static bool
     793      176713 : xfs_xchg_range_need_rt_conversion(
     794             :         struct xfs_inode                *ip,            /* inode tested for big rt extents */
     795             :         unsigned int                    xchg_flags)     /* only XFS_XCHG_RANGE_LOGGED is examined */
     796             : {
     797      176713 :         struct xfs_mount                *mp = ip->i_mount;
     798             : 
     799             :         /*
     800             :          * Caller got permission to use logged swapext, so log recovery will
     801             :          * finish the swap and not leave us with partially swapped rt extents
     802             :          * exposed to userspace.
     803             :          */
     804      176713 :         if (xchg_flags & XFS_XCHG_RANGE_LOGGED)
     805             :                 return false;
     806             : 
     807             :         /*
     808             :          * If we can't use log intent items at all, the only supported
     809             :          * operation is full fork swaps, so no conversions are needed.
     810             :          * The range requirements are enforced by the swapext code itself.
     811             :          */
     812      176713 :         if (!xfs_swapext_supported(mp))
     813             :                 return false;
     814             : 
     815             :         /* Conversion is only needed for realtime files with big rt extents */
     816      176713 :         return xfs_inode_has_bigrtextents(ip);
     817             : }
     818             : 
     819             : /*
     820             :  * Check the rt extent alignment of an exchange request when the allocation
     821             :  * unit size isn't a power of two.  The VFS helpers use (fast) bitmask checks,
     822             :  * but here we use slow long division.  Returns 0, -EINVAL, or -EDOM.
     823             :  */
     824             : static int
     825           0 : xfs_xchg_range_check_rtalign(
     826             :         struct xfs_inode                *ip1,
     827             :         struct xfs_inode                *ip2,
     828             :         const struct xfs_exch_range     *fxr)           /* offsets, length and flags of the request */
     829             : {
     830           0 :         struct xfs_mount                *mp = ip1->i_mount;
     831           0 :         uint32_t                        rextbytes;      /* sb_rextsize run through XFS_FSB_TO_B() */
     832           0 :         uint64_t                        length = fxr->length;   /* replaced below for XFS_EXCH_RANGE_TO_EOF */
     833           0 :         uint64_t                        blen;           /* length after rounding to rextbytes */
     834           0 :         loff_t                          size1, size2;   /* i_size of ip1 and ip2 */
     835             : 
     836           0 :         rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
     837           0 :         size1 = i_size_read(VFS_I(ip1));
     838           0 :         size2 = i_size_read(VFS_I(ip2));
     839             : 
     840             :         /* The start of both ranges must be aligned to a rt extent. */
     841           0 :         if (!isaligned_64(fxr->file1_offset, rextbytes) ||
     842           0 :             !isaligned_64(fxr->file2_offset, rextbytes))
     843             :                 return -EINVAL;
     844             : 
     845             :         /*
     846             :          * If the caller asked for full files, check that the offset/length
     847             :          * values cover all of both files.
     848             :          */
     849           0 :         if ((fxr->flags & XFS_EXCH_RANGE_FULL_FILES) &&
     850           0 :             (fxr->file1_offset != 0 || fxr->file2_offset != 0 ||
     851           0 :              fxr->length != size1 || fxr->length != size2))
     852             :                 return -EDOM;
     853             : 
     854           0 :         if (fxr->flags & XFS_EXCH_RANGE_TO_EOF)
     855           0 :                 length = max_t(int64_t, size1 - fxr->file1_offset,
     856             :                                         size2 - fxr->file2_offset);
     857             : 
     858             :         /*
     859             :          * If the range ends exactly at file1's EOF, round that EOF up to the
     860             :          * next rt extent boundary for this check.  Failing that, do the same
     861             :          * with file2's EOF.
     862             :          *
     863             :          * Otherwise, reject the range length if it's not rt extent aligned.
     864             :          * We already confirmed the starting offsets' rt extent block
     865             :          * alignment.
     866             :          */
     867           0 :         if (fxr->file1_offset + length == size1)
     868           0 :                 blen = roundup_64(size1, rextbytes) - fxr->file1_offset;
     869           0 :         else if (fxr->file2_offset + length == size2)
     870           0 :                 blen = roundup_64(size2, rextbytes) - fxr->file2_offset;
     871           0 :         else if (!isaligned_64(length, rextbytes))
     872             :                 return -EINVAL;
     873             :         else
     874             :                 blen = length;
     875             : 
     876             :         /* Don't allow overlapped exchanges within the same file. */
     877           0 :         if (ip1 == ip2 &&
     878           0 :             fxr->file2_offset + blen > fxr->file1_offset &&
     879           0 :             fxr->file1_offset + blen > fxr->file2_offset)
     880             :                 return -EINVAL;
     881             : 
     882             :         /*
     883             :          * Ensure that we don't exchange a partial EOF rt extent into the
     884             :          * middle of another file.
     885             :          */
     886           0 :         if (isaligned_64(length, rextbytes))
     887             :                 return 0;
     888             : 
     889           0 :         blen = length;  /* unaligned length: any rounding below makes blen differ from it */
     890           0 :         if (fxr->file2_offset + length < size2)
     891           0 :                 blen = rounddown_64(blen, rextbytes);
     892             : 
     893           0 :         if (fxr->file1_offset + blen < size1)
     894           0 :                 blen = rounddown_64(blen, rextbytes);
     895             : 
     896           0 :         return blen == length ? 0 : -EINVAL;
     897             : }
     898             : 
     899             : /* Prepare two files to have their data exchanged; returns 0 or an error code. */
     900             : int
     901      176871 : xfs_xchg_range_prep(
     902             :         struct file             *file1,
     903             :         struct file             *file2,
     904             :         struct xfs_exch_range   *fxr,           /* request: file offsets, length, flags */
     905             :         unsigned int            xchg_flags)     /* passed to xfs_xchg_range_need_rt_conversion() */
     906             : {
     907      176871 :         struct xfs_inode        *ip1 = XFS_I(file_inode(file1));
     908      176871 :         struct xfs_inode        *ip2 = XFS_I(file_inode(file2));
     909      176871 :         unsigned int            alloc_unit = xfs_inode_alloc_unitsize(ip2);     /* alignment given to xfs_exch_range_prep() */
     910      176871 :         int                     error;
     911             : 
     912      176871 :         trace_xfs_xchg_range_prep(ip1, fxr, ip2, 0);
     913             : 
     914             :         /* Verify both files are either real-time or non-realtime */
     915      530605 :         if (XFS_IS_REALTIME_INODE(ip1) != XFS_IS_REALTIME_INODE(ip2))
     916             :                 return -EINVAL;
     917             : 
     918             :         /* Check non-power of two alignment issues, if necessary. */
     919      176875 :         if (XFS_IS_REALTIME_INODE(ip2) && !is_power_of_2(alloc_unit)) {
     920           0 :                 error = xfs_xchg_range_check_rtalign(ip1, ip2, fxr);
     921           0 :                 if (error)
     922             :                         return error;
     923             : 
     924             :                 /* Do the VFS checks with the regular block alignment. */
     925           0 :                 alloc_unit = ip1->i_mount->m_sb.sb_blocksize;
     926             :         }
     927             : 
     928      176871 :         error = xfs_exch_range_prep(file1, file2, fxr, alloc_unit);
     929      176871 :         if (error || fxr->length == 0)  /* failure, or a zero-length request with nothing left to do */
     930             :                 return error;
     931             : 
     932             :         /* Attach dquots to both inodes before changing block maps. */
     933      176713 :         error = xfs_qm_dqattach(ip2);
     934      176713 :         if (error)
     935             :                 return error;
     936      176713 :         error = xfs_qm_dqattach(ip1);
     937      176713 :         if (error)
     938             :                 return error;
     939             : 
     940      176713 :         trace_xfs_xchg_range_flush(ip1, fxr, ip2, 0);
     941             : 
     942             :         /* Flush the relevant ranges of both files. */
     943      176713 :         error = xfs_flush_unmap_range(ip2, fxr->file2_offset, fxr->length);
     944      176713 :         if (error)
     945             :                 return error;
     946      176713 :         error = xfs_flush_unmap_range(ip1, fxr->file1_offset, fxr->length);
     947      176713 :         if (error)
     948             :                 return error;
     949             : 
     950             :         /*
     951             :          * Cancel CoW fork preallocations for the ranges of both files.  The
     952             :          * prep function should have flushed all the dirty data, so the only
     953             :          * extents remaining should be speculative.
     954             :          */
     955      353426 :         if (xfs_inode_has_cow_data(ip1)) {
     956       27232 :                 error = xfs_reflink_cancel_cow_range(ip1, fxr->file1_offset,
     957       27232 :                                 fxr->length, true);
     958       27232 :                 if (error)
     959             :                         return error;
     960             :         }
     961             : 
     962      353426 :         if (xfs_inode_has_cow_data(ip2)) {
     963       25926 :                 error = xfs_reflink_cancel_cow_range(ip2, fxr->file2_offset,
     964       25926 :                                 fxr->length, true);
     965       25926 :                 if (error)
     966             :                         return error;
     967             :         }
     968             : 
     969             :         /* Convert unwritten sub-extent mappings if required. */
     970      176713 :         if (xfs_xchg_range_need_rt_conversion(ip2, xchg_flags)) {
     971           0 :                 error = xfs_rtfile_convert_unwritten(ip2, fxr->file2_offset,
     972             :                                 fxr->length);
     973           0 :                 if (error)
     974             :                         return error;
     975             : 
     976           0 :                 error = xfs_rtfile_convert_unwritten(ip1, fxr->file1_offset,
     977             :                                 fxr->length);
     978           0 :                 if (error)
     979           0 :                         return error;
     980             :         }
     981             : 
     982             :         return 0;
     983             : }
     984             : 
     985             : #define QRETRY_IP1      (0x1)   /* set in *qretry: the ip1 reservation hit EDQUOT/ENOSPC */
     986             : #define QRETRY_IP2      (0x2)   /* set in *qretry: the ip2 reservation hit EDQUOT/ENOSPC */
     987             : 
     988             : /*
     989             :  * Obtain a quota reservation to make sure we don't hit EDQUOT.  We can skip
     990             :  * this if quota is not on, both inodes are the same, or their dquots match.
     991             :  * On EDQUOT/ENOSPC, QRETRY_IP1/QRETRY_IP2 in *qretry name the inode(s) that
     992             :  * failed.  The caller must zero *qretry before the first call.
     993             :  */
     994             : STATIC int
     995      176827 : xfs_xchg_range_reserve_quota(
     996             :         struct xfs_trans                *tp,
     997             :         const struct xfs_swapext_req    *req,           /* inodes plus per-inode block counts */
     998             :         unsigned int                    *qretry)        /* out: QRETRY_* bits; untouched on the early return */
     999             : {
    1000      176827 :         int64_t                         ddelta, rdelta; /* ip2 minus ip1 counts: bcount, rtbcount */
    1001      176827 :         int                             ip1_error = 0;  /* EDQUOT/ENOSPC from ip1, held until ip2 is tried */
    1002      176827 :         int                             error;
    1003             : 
    1004             :         /*
    1005             :          * Don't bother with a quota reservation if we're not enforcing them
    1006             :          * or the two inodes have the same dquots.
    1007             :          */
    1008      176827 :         if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 ||
    1009        6549 :             (req->ip1->i_udquot == req->ip2->i_udquot &&
    1010        6533 :              req->ip1->i_gdquot == req->ip2->i_gdquot &&
    1011        6527 :              req->ip1->i_pdquot == req->ip2->i_pdquot))
    1012             :                 return 0;
    1013             : 
    1014          22 :         *qretry = 0;
    1015             : 
    1016             :         /*
    1017             :          * For each file, compute the net gain in the number of regular blocks
    1018             :          * that will be mapped into that file and reserve that much quota.  The
    1019             :          * quota counts must be able to absorb at least that much space.
    1020             :          */
    1021          22 :         ddelta = req->ip2_bcount - req->ip1_bcount;
    1022          22 :         rdelta = req->ip2_rtbcount - req->ip1_rtbcount;
    1023          22 :         if (ddelta > 0 || rdelta > 0) {
    1024           8 :                 error = xfs_trans_reserve_quota_nblks(tp, req->ip1,
    1025             :                                 ddelta > 0 ? ddelta : 0,
    1026             :                                 rdelta > 0 ? rdelta : 0,
    1027             :                                 false);
    1028           8 :                 if (error == -EDQUOT || error == -ENOSPC) {
    1029             :                         /*
    1030             :                          * Save this error and see what happens if we try to
    1031             :                          * reserve quota for ip2.  Then report both.
    1032             :                          */
    1033           4 :                         *qretry |= QRETRY_IP1;
    1034           4 :                         ip1_error = error;
    1035           4 :                         error = 0;
    1036             :                 }
    1037           8 :                 if (error)
    1038             :                         return error;
    1039             :         }
    1040          22 :         if (ddelta < 0 || rdelta < 0) {
    1041           2 :                 error = xfs_trans_reserve_quota_nblks(tp, req->ip2,
    1042             :                                 ddelta < 0 ? -ddelta : 0,
    1043             :                                 rdelta < 0 ? -rdelta : 0,
    1044             :                                 false);
    1045           2 :                 if (error == -EDQUOT || error == -ENOSPC)
    1046           0 :                         *qretry |= QRETRY_IP2;
    1047           2 :                 if (error)
    1048             :                         return error;
    1049             :         }
    1050          22 :         if (ip1_error)  /* ip2 did not fail, so report the error saved from ip1 */
    1051             :                 return ip1_error;
    1052             : 
    1053             :         /*
    1054             :          * For each file, forcibly reserve the gross gain in mapped blocks so
    1055             :          * that we don't trip over any quota block reservation assertions.
    1056             :          * We must reserve the gross gain because the quota code subtracts from
    1057             :          * bcount the number of blocks that we unmap; it does not add that
    1058             :          * quantity back to the quota block reservation.
    1059             :          */
    1060          18 :         error = xfs_trans_reserve_quota_nblks(tp, req->ip1, req->ip1_bcount,
    1061          18 :                         req->ip1_rtbcount, true);
    1062          18 :         if (error)
    1063             :                 return error;
    1064             : 
    1065          18 :         return xfs_trans_reserve_quota_nblks(tp, req->ip2, req->ip2_bcount,
    1066          18 :                         req->ip2_rtbcount, true);
    1067             : }
    1068             : 
    1069             : /*
    1070             :  * Get permission to use log-assisted atomic exchange of file extents.
    1071             :  *
    1072             :  * Callers must hold the IOLOCK and MMAPLOCK of both files.  They must not be
    1073             :  * running any transactions or hold any ILOCKS.  If @use_logging is set after a
    1074             :  * successful return, callers must call xfs_xchg_range_rele_log_assist after
    1075             :  * the exchange is completed.
                     :  *
                     :  * @mp: mount whose log (mp->m_log) and superblock (mp->m_sb) are consulted.
                     :  * @force: when false, a filesystem that does not already have the log
                     :  *         swapext superblock feature is left alone and the function
                     :  *         returns 0 with *@use_logging cleared.  When true, the function
                     :  *         tries to add the feature via xfs_add_incompat_log_feature().
                     :  * @use_logging: output.  True on return means the XLOG_INCOMPAT_FEAT_SWAPEXT
                     :  *         reference taken here is still held; false means it was dropped.
                     :  *
                     :  * Returns 0 on success; -EOPNOTSUPP if @force is set but
                     :  * xfs_swapext_supported() says no; otherwise the error returned by
                     :  * xfs_add_incompat_log_feature().  Every nonzero return leaves
                     :  * *@use_logging false with the log incompat reference already dropped.
    1076             :  */
    1077             : int
    1078      391812 : xfs_xchg_range_grab_log_assist(
    1079             :         struct xfs_mount        *mp,
    1080             :         bool                    force,
    1081             :         bool                    *use_logging)
    1082             : {
    1083      391812 :         int                     error = 0;
    1084             : 
    1085             :         /*
    1086             :          * Protect ourselves from an idle log clearing the atomic swapext
    1087             :          * log incompat feature bit.
    1088             :          */
    1089      391812 :         xlog_use_incompat_feat(mp->m_log, XLOG_INCOMPAT_FEAT_SWAPEXT);
    1090      391822 :         *use_logging = true;
    1091             : 
    1092             :         /*
    1093             :          * If log-assisted swapping is already enabled, the caller can use the
    1094             :          * log assisted swap functions with the log-incompat reference we got.
    1095             :          */
    1096      783644 :         if (xfs_sb_version_haslogswapext(&mp->m_sb))
    1097             :                 return 0;
    1098             : 
    1099             :         /*
    1100             :          * If the caller doesn't /require/ log-assisted swapping, drop the
    1101             :          * log-incompat feature protection and exit.  The caller cannot use
    1102             :          * log assisted swapping.
                     :          *
                     :          * error is still 0 here, so this path returns success with
                     :          * *use_logging cleared.
    1103             :          */
    1104       14300 :         if (!force)
    1105        6463 :                 goto drop_incompat;
    1106             : 
    1107             :         /*
    1108             :          * Caller requires log-assisted swapping but the fs feature set isn't
    1109             :          * rich enough to support it.  Bail out.
    1110             :          */
    1111        8788 :         if (!xfs_swapext_supported(mp)) {
    1112         951 :                 error = -EOPNOTSUPP;
    1113         951 :                 goto drop_incompat;
    1114             :         }
    1115             : 
    1116        6886 :         error = xfs_add_incompat_log_feature(mp,
    1117             :                         XFS_SB_FEAT_INCOMPAT_LOG_SWAPEXT);
    1118        6886 :         if (error)
    1119           0 :                 goto drop_incompat;
    1120             : 
    1121        6886 :         xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SWAPEXT,
    1122             :  "EXPERIMENTAL atomic file range swap feature in use. Use at your own risk!");
    1123             : 
    1124             :         return 0;
    1125        7414 : drop_incompat:
    1126        7414 :         xlog_drop_incompat_feat(mp->m_log, XLOG_INCOMPAT_FEAT_SWAPEXT);
    1127        7414 :         *use_logging = false;
    1128        7414 :         return error;
    1129             : }
    1130             : 
    1131             : /*
                     :  * Release permission to use log-assisted extent swapping.
                     :  *
                     :  * Drops the XLOG_INCOMPAT_FEAT_SWAPEXT reference on @mp's log.  This is
                     :  * the release side of xfs_xchg_range_grab_log_assist, which takes the same
                     :  * reference with xlog_use_incompat_feat().
                     :  */
    1132             : void
    1133      214045 : xfs_xchg_range_rele_log_assist(
    1134             :         struct xfs_mount        *mp)
    1135             : {
    1136      384409 :         xlog_drop_incompat_feat(mp->m_log, XLOG_INCOMPAT_FEAT_SWAPEXT);
    1137      170364 : }
    1138             : 
    1139             : /*
                     :  * Decide if we can use the old data fork exchange code.
                     :  *
                     :  * Returns true only if every one of these holds for the request @fxr:
                     :  *  - XFS_EXCH_RANGE_NONATOMIC and XFS_EXCH_RANGE_FULL_FILES are both set;
                     :  *  - XFS_EXCH_RANGE_TO_EOF is not set;
                     :  *  - file1_offset and file2_offset are both zero;
                     :  *  - length equals the i_disk_size of both @ip1 and @ip2.
                     :  * Any other combination returns false.
                     :  */
    1140             : static inline bool
    1141           2 : xfs_xchg_use_forkswap(
    1142             :         const struct xfs_exch_range     *fxr,
    1143             :         struct xfs_inode                *ip1,
    1144             :         struct xfs_inode                *ip2)
    1145             : {
    1146           2 :         if (!(fxr->flags & XFS_EXCH_RANGE_NONATOMIC))
    1147             :                 return false;
    1148           2 :         if (!(fxr->flags & XFS_EXCH_RANGE_FULL_FILES))
    1149             :                 return false;
    1150           0 :         if (fxr->flags & XFS_EXCH_RANGE_TO_EOF)
    1151             :                 return false;
    1152           0 :         if (fxr->file1_offset != 0 || fxr->file2_offset != 0)
    1153             :                 return false;
    1154           0 :         if (fxr->length != ip1->i_disk_size)
    1155             :                 return false;
    1156           0 :         if (fxr->length != ip2->i_disk_size)
    1157           0 :                 return false;
    1158             :         return true;
    1159             : }
    1160             : 
    1161             : enum xchg_strategy {
    1162             :         SWAPEXT         = 1,    /* remap extents with xfs_swapext() */
    1163             :         FORKSWAP        = 2,    /* exchange forks with xfs_swap_extent_forks() */
    1164             : };
    1165             : 
    1166             : /*
                     :  * Exchange the contents of two files.
                     :  *
                     :  * @ip1, @ip2: the two inodes; the mount is taken from @ip1.
                     :  * @fxr: the exchange request.  file1_offset, file2_offset and length are
                     :  *       converted to filesystem block units here, and the flags tested
                     :  *       are XFS_EXCH_RANGE_TO_EOF, _FILE1_WRITTEN, _FILE2_FRESH, _DRY_RUN
                     :  *       and _FSYNC.
                     :  * @xchg_flags: XFS_XCHG_RANGE_LOGGED selects the logged swapext request
                     :  *       flag; XFS_XCHG_RANGE_UPD_CMTIME1 / _UPD_CMTIME2 ask for the mtime
                     :  *       and ctime of @ip1 / @ip2 to be updated.
                     :  *
                     :  * The function allocates its own transaction and calls
                     :  * xfs_xchg_range_ilock() on both inodes.  Once that has happened, every
                     :  * return path goes through xfs_xchg_range_iunlock(); the only returns
                     :  * without an unlock are the ones taken before any lock is held (estimate
                     :  * or transaction allocation failure).
                     :  *
                     :  * Returns 0 on success, which includes a dry run that passed every check
                     :  * (the transaction is cancelled instead of committed).  Returns
                     :  * -EOPNOTSUPP if neither the swapext nor the fork swap strategy can be
                     :  * used, or the error reported by whichever helper failed.
                     :  */
    1167             : int
    1168      176827 : xfs_xchg_range(
    1169             :         struct xfs_inode                *ip1,
    1170             :         struct xfs_inode                *ip2,
    1171             :         const struct xfs_exch_range     *fxr,
    1172             :         unsigned int                    xchg_flags)
    1173             : {
    1174      176827 :         struct xfs_mount                *mp = ip1->i_mount;
    1175      176827 :         struct xfs_swapext_req          req = {
    1176             :                 .ip1                    = ip1,
    1177             :                 .ip2                    = ip2,
    1178             :                 .whichfork              = XFS_DATA_FORK,
    1179      176827 :                 .startoff1              = XFS_B_TO_FSBT(mp, fxr->file1_offset),
    1180      176827 :                 .startoff2              = XFS_B_TO_FSBT(mp, fxr->file2_offset),
    1181      176827 :                 .blockcount             = XFS_B_TO_FSB(mp, fxr->length),
    1182             :         };
    1183      176827 :         struct xfs_trans                *tp;
    1184      176827 :         unsigned int                    qretry;
    1185      176827 :         unsigned int                    flags = 0;
    1186      176827 :         bool                            retried = false;
    1187      176827 :         enum xchg_strategy              strategy;
    1188      176827 :         int                             error;
    1189             : 
    1190      176827 :         trace_xfs_xchg_range(ip1, fxr, ip2, xchg_flags);
    1191             : 
    1192      176827 :         if (fxr->flags & XFS_EXCH_RANGE_TO_EOF)
    1193          68 :                 req.req_flags |= XFS_SWAP_REQ_SET_SIZES;
    1194      176827 :         if (fxr->flags & XFS_EXCH_RANGE_FILE1_WRITTEN)
    1195          12 :                 req.req_flags |= XFS_SWAP_REQ_INO1_WRITTEN;
    1196      176827 :         if (xchg_flags & XFS_XCHG_RANGE_LOGGED)
    1197      170364 :                 req.req_flags |= XFS_SWAP_REQ_LOGGED;
    1198             : 
    1199             :         /*
    1200             :          * Round the request length up to the nearest fundamental unit of
    1201             :          * allocation.  The prep function already checked that the request
    1202             :          * offsets and length in @fxr are safe to round up.
    1203             :          */
    1204      176827 :         if (XFS_IS_REALTIME_INODE(ip2))
    1205           2 :                 req.blockcount = roundup_64(req.blockcount,
    1206             :                                             mp->m_sb.sb_rextsize);
    1207             : 
    1208      176827 :         error = xfs_xchg_range_estimate(&req);
    1209      176827 :         if (error)
    1210             :                 return error;
    1211             : 
    1212             :         /*
    1213             :          * We haven't decided which exchange strategy we want to use yet, but
    1214             :          * here we must choose if we want freed blocks during the swap to be
    1215             :          * added to the transaction block reservation (RES_FDBLKS) or freed
    1216             :          * into the global fdblocks.  The legacy fork swap mechanism doesn't
    1217             :          * free any blocks, so it doesn't require it.  It is also the only
    1218             :          * option that works for older filesystems.
    1219             :          *
    1220             :          * The bmap log intent items that were added with rmap and reflink can
    1221             :          * change the bmbt shape, so the intent-based swap strategies require
    1222             :          * us to set RES_FDBLKS.
    1223             :          */
    1224      176825 :         if (xfs_has_lazysbcount(mp))
    1225      176825 :                 flags |= XFS_TRANS_RES_FDBLKS;
    1226             : 
    1227      176825 : retry:
    1228             :         /* Allocate the transaction, lock the inodes, and join them. */
    1229      176827 :         error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, req.resblks, 0,
    1230             :                         flags, &tp);
    1231      176827 :         if (error)
    1232           0 :                 return error;
    1233             : 
    1234      176827 :         xfs_xchg_range_ilock(tp, ip1, ip2);
    1235             : 
    1236      176827 :         trace_xfs_swap_extent_before(ip2, 0);
    1237      176827 :         trace_xfs_swap_extent_before(ip1, 1);
    1238             : 
    1239      176827 :         if (fxr->flags & XFS_EXCH_RANGE_FILE2_FRESH)
    1240        6543 :                 trace_xfs_xchg_range_freshness(ip2, fxr);
    1241             : 
    1242             :         /*
    1243             :          * Now that we've excluded all other inode metadata changes by taking
    1244             :          * the ILOCK, repeat the freshness check.
    1245             :          */
    1246      176827 :         error = xfs_exch_range_check_fresh(VFS_I(ip2), fxr);
    1247      176827 :         if (error)
    1248           0 :                 goto out_trans_cancel;
    1249             : 
    1250      176827 :         error = xfs_swapext_check_extents(mp, &req);
    1251      176827 :         if (error)
    1252           0 :                 goto out_trans_cancel;
    1253             : 
    1254             :         /*
    1255             :          * Reserve ourselves some quota if any of them are in enforcing mode.
    1256             :          * In theory we only need enough to satisfy the change in the number
    1257             :          * of blocks between the two ranges being remapped.
                     :          *
                     :          * If the reservation fails with EDQUOT or ENOSPC and we have not
                     :          * retried yet, cancel the transaction, unlock both inodes, call
                     :          * xfs_blockgc_free_quota() for each inode flagged in qretry, and
                     :          * start over from the retry label.  Only one retry is attempted;
                     :          * a second failure is returned to the caller.
                     :          *
                     :          * NOTE(review): qretry is not initialized in this function;
                     :          * presumably xfs_xchg_range_reserve_quota() zeroes it before
                     :          * setting QRETRY_IP1/QRETRY_IP2 -- confirm.
    1258             :          */
    1259      176827 :         error = xfs_xchg_range_reserve_quota(tp, &req, &qretry);
    1260      176827 :         if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
    1261           2 :                 xfs_trans_cancel(tp);
    1262           2 :                 xfs_xchg_range_iunlock(ip1, ip2);
    1263           2 :                 if (qretry & QRETRY_IP1)
    1264           2 :                         xfs_blockgc_free_quota(ip1, 0);
    1265           2 :                 if (qretry & QRETRY_IP2)
    1266           0 :                         xfs_blockgc_free_quota(ip2, 0);
    1267           2 :                 retried = true;
    1268           2 :                 goto retry;
    1269             :         }
    1270      176825 :         if (error)
    1271           2 :                 goto out_trans_cancel;
    1272             : 
    1273      183282 :         if ((xchg_flags & XFS_XCHG_RANGE_LOGGED) || xfs_swapext_supported(mp)) {
    1274             :                 /*
    1275             :                  * xfs_swapext() uses deferred bmap log intent items to swap
    1276             :                  * extents between file forks.  If the atomic log swap feature
    1277             :                  * is enabled, it will also use swapext log intent items to
    1278             :                  * restart the operation in case of failure.
    1279             :                  *
    1280             :                  * This means that we can use it if we previously obtained
    1281             :                  * permission from the log to use log-assisted atomic extent
    1282             :                  * swapping; or if the fs supports rmap or reflink and the
    1283             :                  * user said NONATOMIC.
    1284             :                  */
    1285             :                 strategy = SWAPEXT;
    1286           2 :         } else if (xfs_xchg_use_forkswap(fxr, ip1, ip2)) {
    1287             :                 /*
    1288             :                  * Exchange the file contents by using the old bmap fork
    1289             :                  * exchange code, if we're a defrag tool doing a full file
    1290             :                  * swap.
    1291             :                  */
    1292           0 :                 strategy = FORKSWAP;
    1293             : 
    1294           0 :                 error = xfs_swap_extents_check_format(ip2, ip1);
    1295           0 :                 if (error) {
    1296           0 :                         xfs_notice(mp,
    1297             :                 "%s: inode 0x%llx format is incompatible for exchanging.",
    1298             :                                         __func__, ip2->i_ino);
    1299           0 :                         goto out_trans_cancel;
    1300             :                 }
    1301             :         } else {
    1302             :                 /* We cannot exchange the file contents. */
    1303           2 :                 error = -EOPNOTSUPP;
    1304           2 :                 goto out_trans_cancel;
    1305             :         }
    1306             : 
    1307             :         /* If we got this far on a dry run, all parameters are ok. */
    1308      176821 :         if (fxr->flags & XFS_EXCH_RANGE_DRY_RUN)
    1309         114 :                 goto out_trans_cancel;
    1310             : 
    1311             :         /* Update the mtime and ctime of both files. */
    1312      176707 :         if (xchg_flags & XFS_XCHG_RANGE_UPD_CMTIME1)
    1313      176707 :                 xfs_trans_ichgtime(tp, ip1,
    1314             :                                 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
    1315      176707 :         if (xchg_flags & XFS_XCHG_RANGE_UPD_CMTIME2)
    1316      170728 :                 xfs_trans_ichgtime(tp, ip2,
    1317             :                                 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
    1318             : 
    1319      176707 :         if (strategy == SWAPEXT) {
    1320      176707 :                 xfs_swapext(tp, &req);
    1321             :         } else {
    1322           0 :                 error = xfs_swap_extent_forks(&tp, &req);
    1323           0 :                 if (error)
    1324           0 :                         goto out_trans_cancel;
    1325             :         }
    1326             : 
    1327             :         /*
    1328             :          * Force the log to persist metadata updates if the caller or the
    1329             :          * administrator requires this.  The VFS prep function already flushed
    1330             :          * the relevant parts of the page cache.
    1331             :          */
    1332      176707 :         if (xfs_has_wsync(mp) || (fxr->flags & XFS_EXCH_RANGE_FSYNC))
    1333        5966 :                 xfs_trans_set_sync(tp);
    1334             : 
    1335      176707 :         error = xfs_trans_commit(tp);
    1336             : 
    1337      176707 :         trace_xfs_swap_extent_after(ip2, 0);
    1338      176707 :         trace_xfs_swap_extent_after(ip1, 1);
    1339             : 
    1340      176707 :         if (error)
    1341          12 :                 goto out_unlock;
    1342             : 
    1343             :         /*
    1344             :          * If the caller wanted us to exchange the contents of two complete
    1345             :          * files of unequal length, exchange the incore sizes now.  This should
    1346             :          * be safe because we flushed both files' page caches, moved all the
    1347             :          * extents, and updated the ondisk sizes.
    1348             :          */
    1349      176695 :         if (fxr->flags & XFS_EXCH_RANGE_TO_EOF) {
    1350          66 :                 loff_t  temp;
    1351             : 
    1352          66 :                 temp = i_size_read(VFS_I(ip2));
    1353          66 :                 i_size_write(VFS_I(ip2), i_size_read(VFS_I(ip1)));
    1354          66 :                 i_size_write(VFS_I(ip1), temp);
    1355             :         }
    1356             : 
    1357      176629 : out_unlock:
    1358      176825 :         xfs_xchg_range_iunlock(ip1, ip2);
    1359      176825 :         return error;
    1360             : 
    1361         118 : out_trans_cancel:
    1362         118 :         xfs_trans_cancel(tp);
    1363         118 :         goto out_unlock;
    1364             : }

Generated by: LCOV version 1.14