LCOV - fstests of 6.5.0-rc3-achx @ Mon Jul 31 20:08:12 PDT 2023

LCOV - code coverage report

Current view:	top level - fs/xfs - xfs_file.c (source / functions)		Hit	Total	Coverage
Test:	fstests of 6.5.0-rc3-achx @ Mon Jul 31 20:08:12 PDT 2023	Lines:	476	543	87.7 %
Date:	2023-07-31 20:08:12	Functions:	31	35	88.6 %

          Line data    Source code

       1             : // SPDX-License-Identifier: GPL-2.0
       2             : /*
       3             :  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
       4             :  * All Rights Reserved.
       5             :  */
       6             : #include "xfs.h"
       7             : #include "xfs_fs.h"
       8             : #include "xfs_shared.h"
       9             : #include "xfs_format.h"
      10             : #include "xfs_log_format.h"
      11             : #include "xfs_trans_resv.h"
      12             : #include "xfs_mount.h"
      13             : #include "xfs_inode.h"
      14             : #include "xfs_trans.h"
      15             : #include "xfs_inode_item.h"
      16             : #include "xfs_bmap.h"
      17             : #include "xfs_bmap_util.h"
      18             : #include "xfs_dir2.h"
      19             : #include "xfs_dir2_priv.h"
      20             : #include "xfs_ioctl.h"
      21             : #include "xfs_trace.h"
      22             : #include "xfs_log.h"
      23             : #include "xfs_icache.h"
      24             : #include "xfs_pnfs.h"
      25             : #include "xfs_iomap.h"
      26             : #include "xfs_reflink.h"
      27             : #include "xfs_file.h"
      28             : 
      29             : #include <linux/dax.h>
      30             : #include <linux/falloc.h>
      31             : #include <linux/backing-dev.h>
      32             : #include <linux/mman.h>
      33             : #include <linux/fadvise.h>
      34             : #include <linux/mount.h>
      35             : 
      36             : static const struct vm_operations_struct xfs_file_vm_ops;
      37             : 
      38             : /*
      39             :  * Decide if the given file range is aligned to the size of the fundamental
      40             :  * allocation unit for the file.
      41             :  */
      42             : bool
      43     5371201 : xfs_is_falloc_aligned(
      44             :         struct xfs_inode        *ip,
      45             :         loff_t                  pos,
      46             :         long long int           len)
      47             : {
      48     5371201 :         unsigned int            alloc_unit = xfs_inode_alloc_unitsize(ip);
      49             : 
      50     7390093 :         if (XFS_IS_REALTIME_INODE(ip) && !is_power_of_2(alloc_unit))
      51      524809 :                 return isaligned_64(pos, alloc_unit) &&
      52      248597 :                        isaligned_64(len, alloc_unit);
      53             : 
      54     5103035 :         return !((pos | len) & (alloc_unit - 1));
      55             : }
      56             : 
      57             : /*
      58             :  * Fsync operations on directories are much simpler than on regular files,
      59             :  * as there is no file data to flush, and thus also no need for explicit
      60             :  * cache flush operations, and there are no non-transaction metadata updates
      61             :  * on directories either.
      62             :  */
      63             : STATIC int
      64      864170 : xfs_dir_fsync(
      65             :         struct file             *file,
      66             :         loff_t                  start,
      67             :         loff_t                  end,
      68             :         int                     datasync)
      69             : {
      70      864170 :         struct xfs_inode        *ip = XFS_I(file->f_mapping->host);
      71             : 
      72      864170 :         trace_xfs_dir_fsync(ip);
      73      864158 :         return xfs_log_force_inode(ip);
      74             : }
      75             : 
      76             : static xfs_csn_t
      77     4294265 : xfs_fsync_seq(
      78             :         struct xfs_inode        *ip,
      79             :         bool                    datasync)
      80             : {
      81     4294265 :         if (!xfs_ipincount(ip))
      82             :                 return 0;
      83     4293158 :         if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
      84             :                 return 0;
      85     4005966 :         return ip->i_itemp->ili_commit_seq;
      86             : }
      87             : 
      88             : /*
      89             :  * All metadata updates are logged, which means that we just have to flush the
      90             :  * log up to the latest LSN that touched the inode.
      91             :  *
      92             :  * If we have concurrent fsync/fdatasync() calls, we need them to all block on
      93             :  * the log force before we clear the ili_fsync_fields field. This ensures that
      94             :  * we don't get a racing sync operation that does not wait for the metadata to
      95             :  * hit the journal before returning.  If we race with clearing ili_fsync_fields,
      96             :  * then all that will happen is the log force will do nothing as the lsn will
      97             :  * already be on disk.  We can't race with setting ili_fsync_fields because that
      98             :  * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
      99             :  * shared until after the ili_fsync_fields is cleared.
     100             :  */
     101             : static  int
     102     4294298 : xfs_fsync_flush_log(
     103             :         struct xfs_inode        *ip,
     104             :         bool                    datasync,
     105             :         int                     *log_flushed)
     106             : {
     107     4294298 :         int                     error = 0;
     108     4294298 :         xfs_csn_t               seq;
     109             : 
     110     4294298 :         xfs_ilock(ip, XFS_ILOCK_SHARED);
     111     4294358 :         seq = xfs_fsync_seq(ip, datasync);
     112     4294060 :         if (seq) {
     113     4005983 :                 error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
     114             :                                           log_flushed);
     115             : 
     116     4005581 :                 spin_lock(&ip->i_itemp->ili_lock);
     117     4005966 :                 ip->i_itemp->ili_fsync_fields = 0;
     118     4005966 :                 spin_unlock(&ip->i_itemp->ili_lock);
     119             :         }
     120     4294049 :         xfs_iunlock(ip, XFS_ILOCK_SHARED);
     121     4292920 :         return error;
     122             : }
     123             : 
     124             : STATIC int
     125    20332276 : xfs_file_fsync(
     126             :         struct file             *file,
     127             :         loff_t                  start,
     128             :         loff_t                  end,
     129             :         int                     datasync)
     130             : {
     131    20332276 :         struct xfs_inode        *ip = XFS_I(file->f_mapping->host);
     132    20332276 :         struct xfs_mount        *mp = ip->i_mount;
     133    20332276 :         int                     error, err2;
     134    20332276 :         int                     log_flushed = 0;
     135             : 
     136    20332276 :         trace_xfs_file_fsync(ip);
     137             : 
     138    20331775 :         error = file_write_and_wait_range(file, start, end);
     139    20336659 :         if (error)
     140             :                 return error;
     141             : 
     142    40666880 :         if (xfs_is_shutdown(mp))
     143             :                 return -EIO;
     144             : 
     145    20331492 :         xfs_iflags_clear(ip, XFS_ITRUNCATED);
     146             : 
     147             :         /*
     148             :          * If we have an RT and/or log subvolume we need to make sure to flush
     149             :          * the write cache the device used for file data first.  This is to
     150             :          * ensure newly written file data make it to disk before logging the new
     151             :          * inode size in case of an extending write.
     152             :          */
     153    20331972 :         if (XFS_IS_REALTIME_INODE(ip))
     154     6390901 :                 error = xfs_buftarg_flush(mp->m_rtdev_targp);
     155    13941071 :         else if (mp->m_logdev_targp != mp->m_ddev_targp)
     156      542241 :                 error = xfs_buftarg_flush(mp->m_ddev_targp);
     157             : 
     158             :         /*
     159             :          * Any inode that has dirty modifications in the log is pinned.  The
     160             :          * racy check here for a pinned inode will not catch modifications
     161             :          * that happen concurrently to the fsync call, but fsync semantics
     162             :          * only require to sync previously completed I/O.
     163             :          */
     164    20329679 :         if (xfs_ipincount(ip)) {
     165     4294305 :                 err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
     166     4292870 :                 if (err2 && !error)
     167        1019 :                         error = err2;
     168             :         }
     169             : 
     170             :         /*
     171             :          * If we only have a single device, and the log force about was
     172             :          * a no-op we might have to flush the data device cache here.
     173             :          * This can only happen for fdatasync/O_DSYNC if we were overwriting
     174             :          * an already allocated file and thus do not have any metadata to
     175             :          * commit.
     176             :          */
     177    20328244 :         if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
     178    11869854 :             mp->m_logdev_targp == mp->m_ddev_targp) {
     179    11575534 :                 err2 = xfs_buftarg_flush(mp->m_ddev_targp);
     180    11566540 :                 if (err2 && !error)
     181         257 :                         error = err2;
     182             :         }
     183             : 
     184             :         return error;
     185             : }
     186             : 
     187             : static int
     188  1197571972 : xfs_ilock_iocb(
     189             :         struct kiocb            *iocb,
     190             :         unsigned int            lock_mode)
     191             : {
     192  1197571972 :         struct xfs_inode        *ip = XFS_I(file_inode(iocb->ki_filp));
     193             : 
     194  1197571972 :         if (iocb->ki_flags & IOCB_NOWAIT) {
     195           0 :                 if (!xfs_ilock_nowait(ip, lock_mode))
     196           0 :                         return -EAGAIN;
     197             :         } else {
     198  1197571972 :                 xfs_ilock(ip, lock_mode);
     199             :         }
     200             : 
     201             :         return 0;
     202             : }
     203             : 
     204             : STATIC ssize_t
     205   534592992 : xfs_file_dio_read(
     206             :         struct kiocb            *iocb,
     207             :         struct iov_iter         *to)
     208             : {
     209   534592992 :         struct xfs_inode        *ip = XFS_I(file_inode(iocb->ki_filp));
     210   534592992 :         ssize_t                 ret;
     211             : 
     212   534592992 :         trace_xfs_file_direct_read(iocb, to);
     213             : 
     214   534592313 :         if (!iov_iter_count(to))
     215             :                 return 0; /* skip atime */
     216             : 
     217   534566479 :         file_accessed(iocb->ki_filp);
     218             : 
     219   534565496 :         ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
     220   534565276 :         if (ret)
     221             :                 return ret;
     222   534565302 :         ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0);
     223   534567927 :         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
     224             : 
     225   534567927 :         return ret;
     226             : }
     227             : 
     228             : static noinline ssize_t
     229           0 : xfs_file_dax_read(
     230             :         struct kiocb            *iocb,
     231             :         struct iov_iter         *to)
     232             : {
     233           0 :         struct xfs_inode        *ip = XFS_I(iocb->ki_filp->f_mapping->host);
     234           0 :         ssize_t                 ret = 0;
     235             : 
     236           0 :         trace_xfs_file_dax_read(iocb, to);
     237             : 
     238           0 :         if (!iov_iter_count(to))
     239             :                 return 0; /* skip atime */
     240             : 
     241           0 :         ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
     242           0 :         if (ret)
     243             :                 return ret;
     244           0 :         ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
     245           0 :         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
     246             : 
     247           0 :         file_accessed(iocb->ki_filp);
     248           0 :         return ret;
     249             : }
     250             : 
     251             : STATIC ssize_t
     252   386888712 : xfs_file_buffered_read(
     253             :         struct kiocb            *iocb,
     254             :         struct iov_iter         *to)
     255             : {
     256   386888712 :         struct xfs_inode        *ip = XFS_I(file_inode(iocb->ki_filp));
     257   386888712 :         ssize_t                 ret;
     258             : 
     259   386888712 :         trace_xfs_file_buffered_read(iocb, to);
     260             : 
     261   386871109 :         ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
     262   385563502 :         if (ret)
     263             :                 return ret;
     264   385597650 :         ret = generic_file_read_iter(iocb, to);
     265   386164803 :         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
     266             : 
     267   386164803 :         return ret;
     268             : }
     269             : 
     270             : STATIC ssize_t
     271   922293176 : xfs_file_read_iter(
     272             :         struct kiocb            *iocb,
     273             :         struct iov_iter         *to)
     274             : {
     275   922293176 :         struct inode            *inode = file_inode(iocb->ki_filp);
     276   922293176 :         struct xfs_mount        *mp = XFS_I(inode)->i_mount;
     277   922293176 :         ssize_t                 ret = 0;
     278             : 
     279   922293176 :         XFS_STATS_INC(mp, xs_read_calls);
     280             : 
     281  1841069260 :         if (xfs_is_shutdown(mp))
     282             :                 return -EIO;
     283             : 
     284   920531270 :         if (IS_DAX(inode))
     285           0 :                 ret = xfs_file_dax_read(iocb, to);
     286   920531270 :         else if (iocb->ki_flags & IOCB_DIRECT)
     287   534592985 :                 ret = xfs_file_dio_read(iocb, to);
     288             :         else
     289   385938285 :                 ret = xfs_file_buffered_read(iocb, to);
     290             : 
     291   919595060 :         if (ret > 0)
     292   381242311 :                 XFS_STATS_ADD(mp, xs_read_bytes, ret);
     293             :         return ret;
     294             : }
     295             : 
     296             : STATIC ssize_t
     297    11929995 : xfs_file_splice_read(
     298             :         struct file             *in,
     299             :         loff_t                  *ppos,
     300             :         struct pipe_inode_info  *pipe,
     301             :         size_t                  len,
     302             :         unsigned int            flags)
     303             : {
     304    11929995 :         struct inode            *inode = file_inode(in);
     305    11929995 :         struct xfs_inode        *ip = XFS_I(inode);
     306    11929995 :         struct xfs_mount        *mp = ip->i_mount;
     307    11929995 :         ssize_t                 ret = 0;
     308             : 
     309    11929995 :         XFS_STATS_INC(mp, xs_read_calls);
     310             : 
     311    23860036 :         if (xfs_is_shutdown(mp))
     312             :                 return -EIO;
     313             : 
     314    11929965 :         trace_xfs_file_splice_read(ip, *ppos, len);
     315             : 
     316    11929887 :         xfs_ilock(ip, XFS_IOLOCK_SHARED);
     317    11929813 :         ret = filemap_splice_read(in, ppos, pipe, len, flags);
     318    11929920 :         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
     319    11929878 :         if (ret > 0)
     320    11929635 :                 XFS_STATS_ADD(mp, xs_read_bytes, ret);
     321             :         return ret;
     322             : }
     323             : 
     324             : /*
     325             :  * Common pre-write limit and setup checks.
     326             :  *
     327             :  * Called with the iolocked held either shared and exclusive according to
     328             :  * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
     329             :  * if called for a direct write beyond i_size.
     330             :  */
     331             : STATIC ssize_t
     332   268850806 : xfs_file_write_checks(
     333             :         struct kiocb            *iocb,
     334             :         struct iov_iter         *from,
     335             :         unsigned int            *iolock)
     336             : {
     337   268850806 :         struct file             *file = iocb->ki_filp;
     338   268850806 :         struct inode            *inode = file->f_mapping->host;
     339   268850806 :         struct xfs_inode        *ip = XFS_I(inode);
     340   268850806 :         ssize_t                 error = 0;
     341   268850806 :         size_t                  count = iov_iter_count(from);
     342   268850806 :         bool                    drained_dio = false;
     343   334138695 :         loff_t                  isize;
     344             : 
     345             : restart:
     346   334138695 :         error = generic_write_checks(iocb, from);
     347   334337203 :         if (error <= 0)
     348          30 :                 return error;
     349             : 
     350   334337173 :         if (iocb->ki_flags & IOCB_NOWAIT) {
     351           0 :                 error = break_layout(inode, false);
     352           0 :                 if (error == -EWOULDBLOCK)
     353             :                         error = -EAGAIN;
     354             :         } else {
     355   334337173 :                 error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
     356             :         }
     357             : 
     358   334803951 :         if (error)
     359           0 :                 return error;
     360             : 
     361             :         /*
     362             :          * For changing security info in file_remove_privs() we need i_rwsem
     363             :          * exclusively.
     364             :          */
     365   334803951 :         if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
     366       88494 :                 xfs_iunlock(ip, *iolock);
     367       88491 :                 *iolock = XFS_IOLOCK_EXCL;
     368       88491 :                 error = xfs_ilock_iocb(iocb, *iolock);
     369       88485 :                 if (error) {
     370           0 :                         *iolock = 0;
     371           0 :                         return error;
     372             :                 }
     373       88485 :                 goto restart;
     374             :         }
     375             : 
     376             :         /*
     377             :          * If the offset is beyond the size of the file, we need to zero any
     378             :          * blocks that fall between the existing EOF and the start of this
     379             :          * write.  If zeroing is needed and we are currently holding the iolock
     380             :          * shared, we need to update it to exclusive which implies having to
     381             :          * redo all checks before.
     382             :          *
     383             :          * We need to serialise against EOF updates that occur in IO completions
     384             :          * here. We want to make sure that nobody is changing the size while we
     385             :          * do this check until we have placed an IO barrier (i.e.  hold the
     386             :          * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.  The
     387             :          * spinlock effectively forms a memory barrier once we have the
     388             :          * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
     389             :          * hence be able to correctly determine if we need to run zeroing.
     390             :          *
     391             :          * We can do an unlocked check here safely as IO completion can only
     392             :          * extend EOF. Truncate is locked out at this point, so the EOF can
     393             :          * not move backwards, only forwards. Hence we only need to take the
     394             :          * slow path and spin locks when we are at or beyond the current EOF.
     395             :          */
     396   334715457 :         if (iocb->ki_pos <= i_size_read(inode))
     397   203803035 :                 goto out;
     398             : 
     399   130912422 :         spin_lock(&ip->i_flags_lock);
     400   131149675 :         isize = i_size_read(inode);
     401   131149675 :         if (iocb->ki_pos > isize) {
     402   131149675 :                 spin_unlock(&ip->i_flags_lock);
     403             : 
     404   131054414 :                 if (iocb->ki_flags & IOCB_NOWAIT)
     405             :                         return -EAGAIN;
     406             : 
     407   131054414 :                 if (!drained_dio) {
     408    65527467 :                         if (*iolock == XFS_IOLOCK_SHARED) {
     409      803139 :                                 xfs_iunlock(ip, *iolock);
     410      803061 :                                 *iolock = XFS_IOLOCK_EXCL;
     411      803061 :                                 xfs_ilock(ip, *iolock);
     412      802954 :                                 iov_iter_reexpand(from, count);
     413             :                         }
     414             :                         /*
     415             :                          * We now have an IO submission barrier in place, but
     416             :                          * AIO can do EOF updates during IO completion and hence
     417             :                          * we now need to wait for all of them to drain. Non-AIO
     418             :                          * DIO will have drained before we are given the
     419             :                          * XFS_IOLOCK_EXCL, and so for most cases this wait is a
     420             :                          * no-op.
     421             :                          */
     422    65527282 :                         inode_dio_wait(inode);
     423    65199404 :                         drained_dio = true;
     424    65199404 :                         goto restart;
     425             :                 }
     426             : 
     427    65526947 :                 trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
     428    65242387 :                 error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
     429    65041536 :                 if (error)
     430             :                         return error;
     431             :         } else
     432           0 :                 spin_unlock(&ip->i_flags_lock);
     433             : 
     434   268830794 : out:
     435   268830794 :         return kiocb_modified(iocb);
     436             : }
     437             : 
     438             : static int
     439    19065270 : xfs_dio_write_end_io(
     440             :         struct kiocb            *iocb,
     441             :         ssize_t                 size,
     442             :         int                     error,
     443             :         unsigned                flags)
     444             : {
     445    19065270 :         struct inode            *inode = file_inode(iocb->ki_filp);
     446    19065270 :         struct xfs_inode        *ip = XFS_I(inode);
     447    19065270 :         loff_t                  offset = iocb->ki_pos;
     448    19065270 :         unsigned int            nofs_flag;
     449             : 
     450    19065270 :         trace_xfs_end_io_direct_write(ip, offset, size);
     451             : 
     452    38130986 :         if (xfs_is_shutdown(ip->i_mount))
     453             :                 return -EIO;
     454             : 
     455    19064751 :         if (error)
     456             :                 return error;
     457    16795551 :         if (!size)
     458             :                 return 0;
     459             : 
     460             :         /*
     461             :          * Capture amount written on completion as we can't reliably account
     462             :          * for it on submission.
     463             :          */
     464    16795551 :         XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
     465             : 
     466             :         /*
     467             :          * We can allocate memory here while doing writeback on behalf of
     468             :          * memory reclaim.  To avoid memory allocation deadlocks set the
     469             :          * task-wide nofs context for the following operations.
     470             :          */
     471    16795667 :         nofs_flag = memalloc_nofs_save();
     472             : 
     473    16795667 :         if (flags & IOMAP_DIO_COW) {
     474     3486630 :                 error = xfs_reflink_end_cow(ip, offset, size);
     475     3486630 :                 if (error)
     476          15 :                         goto out;
     477             :         }
     478             : 
     479             :         /*
     480             :          * Unwritten conversion updates the in-core isize after extent
     481             :          * conversion but before updating the on-disk size. Updating isize any
     482             :          * earlier allows a racing dio read to find unwritten extents before
     483             :          * they are converted.
     484             :          */
     485    16795652 :         if (flags & IOMAP_DIO_UNWRITTEN) {
     486     8451978 :                 error = xfs_iomap_write_unwritten(ip, offset, size, true);
     487     8450896 :                 goto out;
     488             :         }
     489             : 
     490             :         /*
     491             :          * We need to update the in-core inode size here so that we don't end up
     492             :          * with the on-disk inode size being outside the in-core inode size. We
     493             :          * have no other method of updating EOF for AIO, so always do it here
     494             :          * if necessary.
     495             :          *
     496             :          * We need to lock the test/set EOF update as we can be racing with
     497             :          * other IO completions here to update the EOF. Failing to serialise
     498             :          * here can result in EOF moving backwards and Bad Things Happen when
     499             :          * that occurs.
     500             :          *
     501             :          * As IO completion only ever extends EOF, we can do an unlocked check
     502             :          * here to avoid taking the spinlock. If we land within the current EOF,
     503             :          * then we do not need to do an extending update at all, and we don't
     504             :          * need to take the lock to check this. If we race with an update moving
     505             :          * EOF, then we'll either still be beyond EOF and need to take the lock,
     506             :          * or we'll be within EOF and we don't need to take it at all.
     507             :          */
     508     8343674 :         if (offset + size <= i_size_read(inode))
     509     7727649 :                 goto out;
     510             : 
     511      616025 :         spin_lock(&ip->i_flags_lock);
     512      616025 :         if (offset + size > i_size_read(inode)) {
     513      616025 :                 i_size_write(inode, offset + size);
     514      616025 :                 spin_unlock(&ip->i_flags_lock);
     515      616025 :                 error = xfs_setfilesize(ip, offset, size);
     516             :         } else {
     517           0 :                 spin_unlock(&ip->i_flags_lock);
     518             :         }
     519             : 
     520    16794585 : out:
     521    16794585 :         memalloc_nofs_restore(nofs_flag);
     522    16794585 :         return error;
     523             : }
     524             : 
     525             : static const struct iomap_dio_ops xfs_dio_write_ops = {
     526             :         .end_io         = xfs_dio_write_end_io,
     527             : };
     528             : 
     529             : /*
     530             :  * Handle block aligned direct I/O writes
     531             :  */
     532             : static noinline ssize_t
     533    13144423 : xfs_file_dio_write_aligned(
     534             :         struct xfs_inode        *ip,
     535             :         struct kiocb            *iocb,
     536             :         struct iov_iter         *from)
     537             : {
     538    13144423 :         unsigned int            iolock = XFS_IOLOCK_SHARED;
     539    13144423 :         ssize_t                 ret;
     540             : 
     541    13144423 :         ret = xfs_ilock_iocb(iocb, iolock);
     542    13140952 :         if (ret)
     543             :                 return ret;
     544    13141122 :         ret = xfs_file_write_checks(iocb, from, &iolock);
     545    13141226 :         if (ret)
     546         623 :                 goto out_unlock;
     547             : 
     548             :         /*
     549             :          * We don't need to hold the IOLOCK exclusively across the IO, so demote
     550             :          * the iolock back to shared if we had to take the exclusive lock in
     551             :          * xfs_file_write_checks() for other reasons.
     552             :          */
     553    13140603 :         if (iolock == XFS_IOLOCK_EXCL) {
     554      869503 :                 xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
     555      869539 :                 iolock = XFS_IOLOCK_SHARED;
     556             :         }
     557    13140639 :         trace_xfs_file_direct_write(iocb, from);
     558    13137665 :         ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
     559             :                            &xfs_dio_write_ops, 0, NULL, 0);
     560    13148165 : out_unlock:
     561    13148165 :         if (iolock)
     562    13147984 :                 xfs_iunlock(ip, iolock);
     563             :         return ret;
     564             : }
     565             : 
     566             : /*
     567             :  * Handle block unaligned direct I/O writes
     568             :  *
     569             :  * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
     570             :  * them to be done in parallel with reads and other direct I/O writes.  However,
     571             :  * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
     572             :  * to do sub-block zeroing and that requires serialisation against other direct
     573             :  * I/O to the same block.  In this case we need to serialise the submission of
     574             :  * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
     575             :  * In the case where sub-block zeroing is not required, we can do concurrent
     576             :  * sub-block dios to the same block successfully.
     577             :  *
     578             :  * Optimistically submit the I/O using the shared lock first, but use the
     579             :  * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
     580             :  * if block allocation or partial block zeroing would be required.  In that case
     581             :  * we try again with the exclusive lock.
     582             :  */
     583             : static noinline ssize_t
     584    11620044 : xfs_file_dio_write_unaligned(
     585             :         struct xfs_inode        *ip,
     586             :         struct kiocb            *iocb,
     587             :         struct iov_iter         *from)
     588             : {
     589    11620044 :         size_t                  isize = i_size_read(VFS_I(ip));
     590    11620044 :         size_t                  count = iov_iter_count(from);
     591    11620044 :         unsigned int            iolock = XFS_IOLOCK_SHARED;
     592    11620044 :         unsigned int            flags = IOMAP_DIO_OVERWRITE_ONLY;
     593    11620044 :         ssize_t                 ret;
     594             : 
     595             :         /*
     596             :          * Extending writes need exclusivity because of the sub-block zeroing
     597             :          * that the DIO code always does for partial tail blocks beyond EOF, so
     598             :          * don't even bother trying the fast path in this case.
     599             :          */
     600    11620044 :         if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
     601     7153462 :                 if (iocb->ki_flags & IOCB_NOWAIT)
     602             :                         return -EAGAIN;
     603     7153462 : retry_exclusive:
     604     8415027 :                 iolock = XFS_IOLOCK_EXCL;
     605     8415027 :                 flags = IOMAP_DIO_FORCE_WAIT;
     606             :         }
     607             : 
     608    12881609 :         ret = xfs_ilock_iocb(iocb, iolock);
     609    12881400 :         if (ret)
     610             :                 return ret;
     611             : 
     612             :         /*
     613             :          * We can't properly handle unaligned direct I/O to reflink files yet,
     614             :          * as we can't unshare a partial block.
     615             :          */
     616    12881399 :         if (xfs_is_cow_inode(ip)) {
     617     6960397 :                 trace_xfs_reflink_bounce_dio_write(iocb, from);
     618     6960383 :                 ret = -ENOTBLK;
     619     6960383 :                 goto out_unlock;
     620             :         }
     621             : 
     622     5920931 :         ret = xfs_file_write_checks(iocb, from, &iolock);
     623     5920617 :         if (ret)
     624         103 :                 goto out_unlock;
     625             : 
     626             :         /*
     627             :          * If we are doing exclusive unaligned I/O, this must be the only I/O
     628             :          * in-flight.  Otherwise we risk data corruption due to unwritten extent
     629             :          * conversions from the AIO end_io handler.  Wait for all other I/O to
     630             :          * drain first.
     631             :          */
     632     5920514 :         if (flags & IOMAP_DIO_FORCE_WAIT)
     633     4390853 :                 inode_dio_wait(VFS_I(ip));
     634             : 
     635     5920386 :         trace_xfs_file_direct_write(iocb, from);
     636     5920396 :         ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
     637             :                            &xfs_dio_write_ops, flags, NULL, 0);
     638             : 
     639             :         /*
     640             :          * Retry unaligned I/O with exclusive blocking semantics if the DIO
     641             :          * layer rejected it for mapping or locking reasons. If we are doing
     642             :          * nonblocking user I/O, propagate the error.
     643             :          */
     644     5920692 :         if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
     645     1261579 :                 ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
     646     1261579 :                 xfs_iunlock(ip, iolock);
     647     1261565 :                 goto retry_exclusive;
     648             :         }
     649             : 
     650     4659113 : out_unlock:
     651    11619599 :         if (iolock)
     652    11619279 :                 xfs_iunlock(ip, iolock);
     653             :         return ret;
     654             : }
     655             : 
     656             : static ssize_t
     657    24765437 : xfs_file_dio_write(
     658             :         struct kiocb            *iocb,
     659             :         struct iov_iter         *from)
     660             : {
     661    24765437 :         struct xfs_inode        *ip = XFS_I(file_inode(iocb->ki_filp));
     662    24765437 :         struct xfs_buftarg      *target = xfs_inode_buftarg(ip);
     663    24765437 :         size_t                  count = iov_iter_count(from);
     664             : 
     665             :         /* direct I/O must be aligned to device logical sector size */
     666    24765437 :         if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
     667             :                 return -EINVAL;
     668    24765437 :         if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
     669    11620116 :                 return xfs_file_dio_write_unaligned(ip, iocb, from);
     670    13145321 :         return xfs_file_dio_write_aligned(ip, iocb, from);
     671             : }
     672             : 
     673             : static noinline ssize_t
     674           0 : xfs_file_dax_write(
     675             :         struct kiocb            *iocb,
     676             :         struct iov_iter         *from)
     677             : {
     678           0 :         struct inode            *inode = iocb->ki_filp->f_mapping->host;
     679           0 :         struct xfs_inode        *ip = XFS_I(inode);
     680           0 :         unsigned int            iolock = XFS_IOLOCK_EXCL;
     681           0 :         ssize_t                 ret, error = 0;
     682           0 :         loff_t                  pos;
     683             : 
     684           0 :         ret = xfs_ilock_iocb(iocb, iolock);
     685           0 :         if (ret)
     686             :                 return ret;
     687           0 :         ret = xfs_file_write_checks(iocb, from, &iolock);
     688           0 :         if (ret)
     689           0 :                 goto out;
     690             : 
     691           0 :         pos = iocb->ki_pos;
     692             : 
     693           0 :         trace_xfs_file_dax_write(iocb, from);
     694           0 :         ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
     695           0 :         if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
     696           0 :                 i_size_write(inode, iocb->ki_pos);
     697           0 :                 error = xfs_setfilesize(ip, pos, ret);
     698             :         }
     699           0 : out:
     700           0 :         if (iolock)
     701           0 :                 xfs_iunlock(ip, iolock);
     702           0 :         if (error)
     703             :                 return error;
     704             : 
     705           0 :         if (ret > 0) {
     706           0 :                 XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
     707             : 
     708             :                 /* Handle various SYNC-type writes */
     709           0 :                 ret = generic_write_sync(iocb, ret);
     710             :         }
     711             :         return ret;
     712             : }
     713             : 
     714             : STATIC ssize_t
     715   248879643 : xfs_file_buffered_write(
     716             :         struct kiocb            *iocb,
     717             :         struct iov_iter         *from)
     718             : {
     719   248879643 :         struct inode            *inode = iocb->ki_filp->f_mapping->host;
     720   248879643 :         struct xfs_inode        *ip = XFS_I(inode);
     721   248879643 :         ssize_t                 ret;
     722   248879643 :         bool                    cleared_space = false;
     723   250000334 :         unsigned int            iolock;
     724             : 
     725             : write_retry:
     726   250000334 :         iolock = XFS_IOLOCK_EXCL;
     727   250000334 :         ret = xfs_ilock_iocb(iocb, iolock);
     728   249577708 :         if (ret)
     729           0 :                 return ret;
     730             : 
     731   249577708 :         ret = xfs_file_write_checks(iocb, from, &iolock);
     732   249618491 :         if (ret)
     733       13727 :                 goto out;
     734             : 
     735   249604764 :         trace_xfs_file_buffered_write(iocb, from);
     736   249269792 :         ret = iomap_file_buffered_write(iocb, from,
     737             :                         &xfs_buffered_write_iomap_ops);
     738             : 
     739             :         /*
     740             :          * If we hit a space limit, try to free up some lingering preallocated
     741             :          * space before returning an error. In the case of ENOSPC, first try to
     742             :          * write back all dirty inodes to free up some of the excess reserved
     743             :          * metadata space. This reduces the chances that the eofblocks scan
     744             :          * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
     745             :          * also behaves as a filter to prevent too many eofblocks scans from
     746             :          * running at the same time.  Use a synchronous scan to increase the
     747             :          * effectiveness of the scan.
     748             :          */
     749   249922409 :         if (ret == -EDQUOT && !cleared_space) {
     750        1376 :                 xfs_iunlock(ip, iolock);
     751        1376 :                 xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC);
     752        1376 :                 cleared_space = true;
     753        1376 :                 goto write_retry;
     754   249921033 :         } else if (ret == -ENOSPC && !cleared_space) {
     755     1119263 :                 struct xfs_icwalk       icw = {0};
     756             : 
     757     1119263 :                 cleared_space = true;
     758     1119263 :                 xfs_flush_inodes(ip->i_mount);
     759             : 
     760     1119165 :                 xfs_iunlock(ip, iolock);
     761     1118855 :                 icw.icw_flags = XFS_ICWALK_FLAG_SYNC;
     762     1118855 :                 xfs_blockgc_free_space(ip->i_mount, &icw);
     763     1119315 :                 goto write_retry;
     764             :         }
     765             : 
     766   248801770 : out:
     767   248815497 :         if (iolock)
     768   248699812 :                 xfs_iunlock(ip, iolock);
     769             : 
     770   248915241 :         if (ret > 0) {
     771   247803481 :                 XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
     772             :                 /* Handle various SYNC-type writes */
     773   247864185 :                 ret = generic_write_sync(iocb, ret);
     774             :         }
     775             :         return ret;
     776             : }
     777             : 
     778             : STATIC ssize_t
     779   266839291 : xfs_file_write_iter(
     780             :         struct kiocb            *iocb,
     781             :         struct iov_iter         *from)
     782             : {
     783   266839291 :         struct inode            *inode = iocb->ki_filp->f_mapping->host;
     784   266839291 :         struct xfs_inode        *ip = XFS_I(inode);
     785   266839291 :         ssize_t                 ret;
     786   266839291 :         size_t                  ocount = iov_iter_count(from);
     787             : 
     788   266839291 :         XFS_STATS_INC(ip->i_mount, xs_write_calls);
     789             : 
     790   266781795 :         if (ocount == 0)
     791             :                 return 0;
     792             : 
     793   533563098 :         if (xfs_is_shutdown(ip->i_mount))
     794             :                 return -EIO;
     795             : 
     796   266770649 :         if (IS_DAX(inode))
     797           0 :                 return xfs_file_dax_write(iocb, from);
     798             : 
     799   266770649 :         if (iocb->ki_flags & IOCB_DIRECT) {
     800             :                 /*
     801             :                  * Allow a directio write to fall back to a buffered
     802             :                  * write *only* in the case that we're doing a reflink
     803             :                  * CoW.  In all other directio scenarios we do not
     804             :                  * allow an operation to fall back to buffered mode.
     805             :                  */
     806    24766493 :                 ret = xfs_file_dio_write(iocb, from);
     807    24762124 :                 if (ret != -ENOTBLK)
     808             :                         return ret;
     809             :         }
     810             : 
     811   248964877 :         return xfs_file_buffered_write(iocb, from);
     812             : }
     813             : 
     814             : /* Does this file, inode, or mount want synchronous writes? */
     815   287042623 : static inline bool xfs_file_sync_writes(struct file *filp)
     816             : {
     817   287042623 :         struct xfs_inode        *ip = XFS_I(file_inode(filp));
     818             : 
     819   287042623 :         if (xfs_has_wsync(ip->i_mount))
     820             :                 return true;
     821   287042599 :         if (filp->f_flags & (__O_SYNC | O_DSYNC))
     822             :                 return true;
     823   287012696 :         if (IS_SYNC(file_inode(filp)))
     824          13 :                 return true;
     825             : 
     826             :         return false;
     827             : }
     828             : 
     829             : #define XFS_FALLOC_FL_SUPPORTED                                         \
     830             :                 (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |           \
     831             :                  FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |      \
     832             :                  FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)
     833             : 
     834             : STATIC long
     835    57295415 : xfs_file_fallocate(
     836             :         struct file             *file,
     837             :         int                     mode,
     838             :         loff_t                  offset,
     839             :         loff_t                  len)
     840             : {
     841    57295415 :         struct inode            *inode = file_inode(file);
     842    57295415 :         struct xfs_inode        *ip = XFS_I(inode);
     843    57295415 :         long                    error;
     844    57295415 :         uint                    iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
     845    57295415 :         loff_t                  new_size = 0;
     846    57295415 :         bool                    do_file_insert = false;
     847             : 
     848    57295415 :         if (!S_ISREG(inode->i_mode))
     849             :                 return -EINVAL;
     850    57295415 :         if (mode & ~XFS_FALLOC_FL_SUPPORTED)
     851             :                 return -EOPNOTSUPP;
     852             : 
     853    57295415 :         xfs_ilock(ip, iolock);
     854    57295578 :         error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
     855    57295514 :         if (error)
     856           0 :                 goto out_unlock;
     857             : 
     858             :         /*
     859             :          * Must wait for all AIO to complete before we continue as AIO can
     860             :          * change the file size on completion without holding any locks we
     861             :          * currently hold. We must do this first because AIO can update both
     862             :          * the on disk and in memory inode sizes, and the operations that follow
     863             :          * require the in-memory size to be fully up-to-date.
     864             :          */
     865    57295514 :         inode_dio_wait(inode);
     866             : 
     867             :         /*
     868             :          * Now AIO and DIO has drained we flush and (if necessary) invalidate
     869             :          * the cached range over the first operation we are about to run.
     870             :          *
     871             :          * We care about zero and collapse here because they both run a hole
     872             :          * punch over the range first. Because that can zero data, and the range
     873             :          * of invalidation for the shift operations is much larger, we still do
     874             :          * the required flush for collapse in xfs_prepare_shift().
     875             :          *
     876             :          * Insert has the same range requirements as collapse, and we extend the
     877             :          * file first which can zero data. Hence insert has the same
     878             :          * flush/invalidate requirements as collapse and so they are both
     879             :          * handled at the right time by xfs_prepare_shift().
     880             :          */
     881    57294810 :         if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
     882             :                     FALLOC_FL_COLLAPSE_RANGE)) {
     883    45796554 :                 error = xfs_flush_unmap_range(ip, offset, len);
     884    45796466 :                 if (error)
     885         337 :                         goto out_unlock;
     886             :         }
     887             : 
     888    57294385 :         error = file_modified(file);
     889    57295436 :         if (error)
     890          13 :                 goto out_unlock;
     891             : 
     892    57295423 :         if (mode & FALLOC_FL_PUNCH_HOLE) {
     893    38901176 :                 error = xfs_free_file_space(ip, offset, len);
     894    38901334 :                 if (error)
     895       35669 :                         goto out_unlock;
     896    18394247 :         } else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
     897     3022246 :                 if (!xfs_is_falloc_aligned(ip, offset, len)) {
     898      397835 :                         error = -EINVAL;
     899      397835 :                         goto out_unlock;
     900             :                 }
     901             : 
     902             :                 /*
     903             :                  * There is no need to overlap collapse range with EOF,
     904             :                  * in which case it is effectively a truncate operation
     905             :                  */
     906     2624405 :                 if (offset + len >= i_size_read(inode)) {
     907      218380 :                         error = -EINVAL;
     908      218380 :                         goto out_unlock;
     909             :                 }
     910             : 
     911     2406025 :                 new_size = i_size_read(inode) - len;
     912             : 
     913     2406025 :                 error = xfs_collapse_file_space(ip, offset, len);
     914     2406029 :                 if (error)
     915        1850 :                         goto out_unlock;
     916    15372001 :         } else if (mode & FALLOC_FL_INSERT_RANGE) {
     917     2348969 :                 loff_t          isize = i_size_read(inode);
     918             : 
     919     2348969 :                 if (!xfs_is_falloc_aligned(ip, offset, len)) {
     920      382201 :                         error = -EINVAL;
     921      382201 :                         goto out_unlock;
     922             :                 }
     923             : 
     924             :                 /*
     925             :                  * New inode size must not exceed ->s_maxbytes, accounting for
     926             :                  * possible signed overflow.
     927             :                  */
     928     1966769 :                 if (inode->i_sb->s_maxbytes - isize < len) {
     929          10 :                         error = -EFBIG;
     930          10 :                         goto out_unlock;
     931             :                 }
     932     1966759 :                 new_size = isize + len;
     933             : 
     934             :                 /* Offset should be less than i_size */
     935     1966759 :                 if (offset >= isize) {
     936      170770 :                         error = -EINVAL;
     937      170770 :                         goto out_unlock;
     938             :                 }
     939             :                 do_file_insert = true;
     940             :         } else {
     941    13023032 :                 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
     942     7230985 :                     offset + len > i_size_read(inode)) {
     943     4893753 :                         new_size = offset + len;
     944     4893753 :                         error = inode_newsize_ok(inode, new_size);
     945     4893760 :                         if (error)
     946          10 :                                 goto out_unlock;
     947             :                 }
     948             : 
     949    13023029 :                 if (mode & FALLOC_FL_ZERO_RANGE) {
     950             :                         /*
     951             :                          * Punch a hole and prealloc the range.  We use a hole
     952             :                          * punch rather than unwritten extent conversion for two
     953             :                          * reasons:
     954             :                          *
     955             :                          *   1.) Hole punch handles partial block zeroing for us.
     956             :                          *   2.) If prealloc returns ENOSPC, the file range is
     957             :                          *       still zero-valued by virtue of the hole punch.
     958             :                          */
     959     3873130 :                         unsigned int blksize = i_blocksize(inode);
     960             : 
     961     3873127 :                         trace_xfs_zero_file_space(ip);
     962             : 
     963     3873118 :                         error = xfs_free_file_space(ip, offset, len);
     964     3873137 :                         if (error)
     965       13273 :                                 goto out_unlock;
     966             : 
     967     3859864 :                         len = round_up(offset + len, blksize) -
     968     3859864 :                               round_down(offset, blksize);
     969     3859864 :                         offset = round_down(offset, blksize);
     970     9149899 :                 } else if (mode & FALLOC_FL_UNSHARE_RANGE) {
     971         205 :                         error = xfs_reflink_unshare(ip, offset, len);
     972         205 :                         if (error)
     973           6 :                                 goto out_unlock;
     974             :                 } else {
     975             :                         /*
     976             :                          * If always_cow mode we can't use preallocations and
     977             :                          * thus should not create them.
     978             :                          */
     979     9149694 :                         if (xfs_is_always_cow_inode(ip)) {
     980      424145 :                                 error = -EOPNOTSUPP;
     981      424145 :                                 goto out_unlock;
     982             :                         }
     983             :                 }
     984             : 
     985    12585538 :                 if (!xfs_is_always_cow_inode(ip)) {
     986    12340201 :                         error = xfs_alloc_file_space(ip, offset, len);
     987    12340348 :                         if (error)
     988      278781 :                                 goto out_unlock;
     989             :                 }
     990             :         }
     991             : 
     992             :         /* Change file size if needed */
     993    55372739 :         if (new_size) {
     994     8764321 :                 struct iattr iattr;
     995             : 
     996     8764321 :                 iattr.ia_valid = ATTR_SIZE;
     997     8764321 :                 iattr.ia_size = new_size;
     998    17528641 :                 error = xfs_vn_setattr_size(file_mnt_idmap(file),
     999             :                                             file_dentry(file), &iattr);
    1000     8764311 :                 if (error)
    1001        1256 :                         goto out_unlock;
    1002             :         }
    1003             : 
    1004             :         /*
    1005             :          * Perform hole insertion now that the file size has been
    1006             :          * updated so that if we crash during the operation we don't
    1007             :          * leave shifted extents past EOF and hence losing access to
    1008             :          * the data that is contained within them.
    1009             :          */
    1010    55371473 :         if (do_file_insert) {
    1011     1795519 :                 error = xfs_insert_file_space(ip, offset, len);
    1012     1795520 :                 if (error)
    1013        2007 :                         goto out_unlock;
    1014             :         }
    1015             : 
    1016    55369467 :         if (xfs_file_sync_writes(file))
    1017       29892 :                 error = xfs_log_force_inode(ip);
    1018             : 
    1019    55339575 : out_unlock:
    1020    57296010 :         xfs_iunlock(ip, iolock);
    1021    57296010 :         return error;
    1022             : }
    1023             : 
    1024             : STATIC int
    1025    11750559 : xfs_file_fadvise(
    1026             :         struct file     *file,
    1027             :         loff_t          start,
    1028             :         loff_t          end,
    1029             :         int             advice)
    1030             : {
    1031    11750559 :         struct xfs_inode *ip = XFS_I(file_inode(file));
    1032    11750559 :         int ret;
    1033    11750559 :         int lockflags = 0;
    1034             : 
    1035             :         /*
    1036             :          * Operations creating pages in page cache need protection from hole
    1037             :          * punching and similar ops
    1038             :          */
    1039    11750559 :         if (advice == POSIX_FADV_WILLNEED) {
    1040           0 :                 lockflags = XFS_IOLOCK_SHARED;
    1041           0 :                 xfs_ilock(ip, lockflags);
    1042             :         }
    1043    11750559 :         ret = generic_fadvise(file, start, end, advice);
    1044    11742090 :         if (lockflags)
    1045           0 :                 xfs_iunlock(ip, lockflags);
    1046    11742090 :         return ret;
    1047             : }
    1048             : 
    1049             : STATIC loff_t
    1050   290961369 : xfs_file_remap_range(
    1051             :         struct file             *file_in,
    1052             :         loff_t                  pos_in,
    1053             :         struct file             *file_out,
    1054             :         loff_t                  pos_out,
    1055             :         loff_t                  len,
    1056             :         unsigned int            remap_flags)
    1057             : {
    1058   290961369 :         struct inode            *inode_in = file_inode(file_in);
    1059   290961369 :         struct xfs_inode        *src = XFS_I(inode_in);
    1060   290961369 :         struct inode            *inode_out = file_inode(file_out);
    1061   290961369 :         struct xfs_inode        *dest = XFS_I(inode_out);
    1062   290961369 :         struct xfs_mount        *mp = src->i_mount;
    1063   290961369 :         loff_t                  remapped = 0;
    1064   290961369 :         xfs_extlen_t            cowextsize;
    1065   290961369 :         int                     ret;
    1066             : 
    1067   290961369 :         if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
    1068             :                 return -EINVAL;
    1069             : 
    1070   290961369 :         if (!xfs_has_reflink(mp))
    1071             :                 return -EOPNOTSUPP;
    1072             : 
    1073   380505334 :         if (xfs_is_shutdown(mp))
    1074             :                 return -EIO;
    1075             : 
    1076             :         /* Prepare and then clone file data. */
    1077   190245782 :         ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
    1078             :                         &len, remap_flags);
    1079   190262254 :         if (ret || len == 0)
    1080    73549180 :                 return ret;
    1081             : 
    1082   116713074 :         trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
    1083             : 
    1084   116708541 :         ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
    1085             :                         &remapped);
    1086   116694388 :         if (ret)
    1087      837038 :                 goto out_unlock;
    1088             : 
    1089             :         /*
    1090             :          * Carry the cowextsize hint from src to dest if we're sharing the
    1091             :          * entire source file to the entire destination file, the source file
    1092             :          * has a cowextsize hint, and the destination file does not.
    1093             :          */
    1094   115857350 :         cowextsize = 0;
    1095   115857350 :         if (pos_in == 0 && len == i_size_read(inode_in) &&
    1096      103898 :             (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
    1097         140 :             pos_out == 0 && len >= i_size_read(inode_out) &&
    1098         134 :             !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE))
    1099          18 :                 cowextsize = src->i_cowextsize;
    1100             : 
    1101   115857350 :         ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
    1102             :                         remap_flags);
    1103   115840295 :         if (ret)
    1104           0 :                 goto out_unlock;
    1105             : 
    1106   115840295 :         if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
    1107         670 :                 xfs_log_force_inode(dest);
    1108   115841210 : out_unlock:
    1109   116678296 :         xfs_iunlock2_io_mmap(src, dest);
    1110   116704854 :         if (ret)
    1111      837053 :                 trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
    1112   116704796 :         return remapped > 0 ? remapped : ret;
    1113             : }
    1114             : 
    1115             : STATIC int
    1116   607836608 : xfs_file_open(
    1117             :         struct inode    *inode,
    1118             :         struct file     *file)
    1119             : {
    1120  1215673216 :         if (xfs_is_shutdown(XFS_M(inode->i_sb)))
    1121             :                 return -EIO;
    1122   607822907 :         file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC |
    1123             :                         FMODE_DIO_PARALLEL_WRITE | FMODE_CAN_ODIRECT;
    1124   607822907 :         return generic_file_open(inode, file);
    1125             : }
    1126             : 
    1127             : STATIC int
    1128    46111457 : xfs_dir_open(
    1129             :         struct inode    *inode,
    1130             :         struct file     *file)
    1131             : {
    1132    46111457 :         struct xfs_inode *ip = XFS_I(inode);
    1133    46111457 :         unsigned int    mode;
    1134    46111457 :         int             error;
    1135             : 
    1136    46111457 :         error = xfs_file_open(inode, file);
    1137    46047783 :         if (error)
    1138             :                 return error;
    1139             : 
    1140             :         /*
    1141             :          * If there are any blocks, read-ahead block 0 as we're almost
    1142             :          * certain to have the next operation be a read there.
    1143             :          */
    1144    46036743 :         mode = xfs_ilock_data_map_shared(ip);
    1145    46001325 :         if (ip->i_df.if_nextents > 0)
    1146    10540538 :                 error = xfs_dir3_data_readahead(ip, 0, 0);
    1147    46009844 :         xfs_iunlock(ip, mode);
    1148    46009844 :         return error;
    1149             : }
    1150             : 
    1151             : /*
    1152             :  * When we release the file, we don't want it to trim EOF blocks if it is a
    1153             :  * readonly context.  This avoids open/read/close workloads from removing
    1154             :  * EOF blocks that other writers depend upon to reduce fragmentation.
    1155             :  */
    1156             : STATIC int
    1157   561397852 : xfs_file_release(
    1158             :         struct inode    *inode,
    1159             :         struct file     *file)
    1160             : {
    1161   561397852 :         bool            free_eof_blocks = true;
    1162             : 
    1163   561397852 :         if ((file->f_mode & (FMODE_WRITE | FMODE_READ)) == FMODE_READ)
    1164   117116664 :                 free_eof_blocks = false;
    1165             : 
    1166   561397852 :         return xfs_release(XFS_I(inode), free_eof_blocks);
    1167             : }
    1168             : 
    1169             : STATIC int
    1170    96953043 : xfs_file_readdir(
    1171             :         struct file     *file,
    1172             :         struct dir_context *ctx)
    1173             : {
    1174    96953043 :         struct inode    *inode = file_inode(file);
    1175    96953043 :         xfs_inode_t     *ip = XFS_I(inode);
    1176    96953043 :         size_t          bufsize;
    1177             : 
    1178             :         /*
    1179             :          * The Linux API doesn't pass down the total size of the buffer
    1180             :          * we read into down to the filesystem.  With the filldir concept
    1181             :          * it's not needed for correct information, but the XFS dir2 leaf
    1182             :          * code wants an estimate of the buffer size to calculate it's
    1183             :          * readahead window and size the buffers used for mapping to
    1184             :          * physical blocks.
    1185             :          *
    1186             :          * Try to give it an estimate that's good enough, maybe at some
    1187             :          * point we can change the ->readdir prototype to include the
    1188             :          * buffer size.  For now we use the current glibc buffer size.
    1189             :          */
    1190    96953043 :         bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size);
    1191             : 
    1192    96953043 :         return xfs_readdir(NULL, ip, ctx, bufsize);
    1193             : }
    1194             : 
    1195             : STATIC loff_t
    1196    91736405 : xfs_file_llseek(
    1197             :         struct file     *file,
    1198             :         loff_t          offset,
    1199             :         int             whence)
    1200             : {
    1201    91736405 :         struct inode            *inode = file->f_mapping->host;
    1202             : 
    1203   183472810 :         if (xfs_is_shutdown(XFS_I(inode)->i_mount))
    1204             :                 return -EIO;
    1205             : 
    1206    91736400 :         switch (whence) {
    1207    91492229 :         default:
    1208    91492229 :                 return generic_file_llseek(file, offset, whence);
    1209        1482 :         case SEEK_HOLE:
    1210        1482 :                 offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
    1211        1482 :                 break;
    1212      242689 :         case SEEK_DATA:
    1213      242689 :                 offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
    1214      242689 :                 break;
    1215             :         }
    1216             : 
    1217      244171 :         if (offset < 0)
    1218             :                 return offset;
    1219      193956 :         return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
    1220             : }
    1221             : 
    1222             : #ifdef CONFIG_FS_DAX
    1223             : static inline vm_fault_t
    1224           0 : xfs_dax_fault(
    1225             :         struct vm_fault         *vmf,
    1226             :         enum page_entry_size    pe_size,
    1227             :         bool                    write_fault,
    1228             :         pfn_t                   *pfn)
    1229             : {
    1230           0 :         return dax_iomap_fault(vmf, pe_size, pfn, NULL,
    1231           0 :                         (write_fault && !vmf->cow_page) ?
    1232             :                                 &xfs_dax_write_iomap_ops :
    1233             :                                 &xfs_read_iomap_ops);
    1234             : }
    1235             : #else
    1236             : static inline vm_fault_t
    1237             : xfs_dax_fault(
    1238             :         struct vm_fault         *vmf,
    1239             :         enum page_entry_size    pe_size,
    1240             :         bool                    write_fault,
    1241             :         pfn_t                   *pfn)
    1242             : {
    1243             :         ASSERT(0);
    1244             :         return VM_FAULT_SIGBUS;
    1245             : }
    1246             : #endif
    1247             : 
    1248             : /*
    1249             :  * Locking for serialisation of IO during page faults. This results in a lock
    1250             :  * ordering of:
    1251             :  *
    1252             :  * mmap_lock (MM)
    1253             :  *   sb_start_pagefault(vfs, freeze)
    1254             :  *     invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
    1255             :  *       page_lock (MM)
    1256             :  *         i_lock (XFS - extent map serialisation)
    1257             :  */
    1258             : static vm_fault_t
    1259   196789862 : __xfs_filemap_fault(
    1260             :         struct vm_fault         *vmf,
    1261             :         enum page_entry_size    pe_size,
    1262             :         bool                    write_fault)
    1263             : {
    1264   196789862 :         struct inode            *inode = file_inode(vmf->vma->vm_file);
    1265   196789862 :         struct xfs_inode        *ip = XFS_I(inode);
    1266   196789862 :         vm_fault_t              ret;
    1267             : 
    1268   196789862 :         trace_xfs_filemap_fault(ip, pe_size, write_fault);
    1269             : 
    1270   196819350 :         if (write_fault) {
    1271    83828510 :                 sb_start_pagefault(inode->i_sb);
    1272    83673803 :                 file_update_time(vmf->vma->vm_file);
    1273             :         }
    1274             : 
    1275   196678961 :         if (IS_DAX(inode)) {
    1276           0 :                 pfn_t pfn;
    1277             : 
    1278           0 :                 xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
    1279           0 :                 ret = xfs_dax_fault(vmf, pe_size, write_fault, &pfn);
    1280           0 :                 if (ret & VM_FAULT_NEEDDSYNC)
    1281           0 :                         ret = dax_finish_sync_fault(vmf, pe_size, pfn);
    1282           0 :                 xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
    1283             :         } else {
    1284   196678961 :                 if (write_fault) {
    1285    83686865 :                         xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
    1286    83691715 :                         ret = iomap_page_mkwrite(vmf,
    1287             :                                         &xfs_page_mkwrite_iomap_ops);
    1288    83623620 :                         xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
    1289             :                 } else {
    1290   112992096 :                         ret = filemap_fault(vmf);
    1291             :                 }
    1292             :         }
    1293             : 
    1294   196807112 :         if (write_fault)
    1295    83759636 :                 sb_end_pagefault(inode->i_sb);
    1296   196766581 :         return ret;
    1297             : }
    1298             : 
    1299             : static inline bool
    1300             : xfs_is_write_fault(
    1301             :         struct vm_fault         *vmf)
    1302             : {
    1303           0 :         return (vmf->flags & FAULT_FLAG_WRITE) &&
    1304           0 :                (vmf->vma->vm_flags & VM_SHARED);
    1305             : }
    1306             : 
    1307             : static vm_fault_t
    1308   113045276 : xfs_filemap_fault(
    1309             :         struct vm_fault         *vmf)
    1310             : {
    1311             :         /* DAX can shortcut the normal fault path on write faults! */
    1312   113045276 :         return __xfs_filemap_fault(vmf, PE_SIZE_PTE,
    1313   113045276 :                         IS_DAX(file_inode(vmf->vma->vm_file)) &&
    1314             :                         xfs_is_write_fault(vmf));
    1315             : }
    1316             : 
    1317             : static vm_fault_t
    1318       16437 : xfs_filemap_huge_fault(
    1319             :         struct vm_fault         *vmf,
    1320             :         enum page_entry_size    pe_size)
    1321             : {
    1322       16437 :         if (!IS_DAX(file_inode(vmf->vma->vm_file)))
    1323             :                 return VM_FAULT_FALLBACK;
    1324             : 
    1325             :         /* DAX can shortcut the normal fault path on write faults! */
    1326           0 :         return __xfs_filemap_fault(vmf, pe_size,
    1327             :                         xfs_is_write_fault(vmf));
    1328             : }
    1329             : 
    1330             : static vm_fault_t
    1331    83870314 : xfs_filemap_page_mkwrite(
    1332             :         struct vm_fault         *vmf)
    1333             : {
    1334    83870314 :         return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
    1335             : }
    1336             : 
    1337             : /*
    1338             :  * pfn_mkwrite was originally intended to ensure we capture time stamp updates
    1339             :  * on write faults. In reality, it needs to serialise against truncate and
    1340             :  * prepare memory for writing so handle is as standard write fault.
    1341             :  */
    1342             : static vm_fault_t
    1343           0 : xfs_filemap_pfn_mkwrite(
    1344             :         struct vm_fault         *vmf)
    1345             : {
    1346             : 
    1347           0 :         return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
    1348             : }
    1349             : 
    1350             : static const struct vm_operations_struct xfs_file_vm_ops = {
    1351             :         .fault          = xfs_filemap_fault,
    1352             :         .huge_fault     = xfs_filemap_huge_fault,
    1353             :         .map_pages      = filemap_map_pages,
    1354             :         .page_mkwrite   = xfs_filemap_page_mkwrite,
    1355             :         .pfn_mkwrite    = xfs_filemap_pfn_mkwrite,
    1356             : };
    1357             : 
    1358             : STATIC int
    1359    14074434 : xfs_file_mmap(
    1360             :         struct file             *file,
    1361             :         struct vm_area_struct   *vma)
    1362             : {
    1363    14074434 :         struct inode            *inode = file_inode(file);
    1364    14074434 :         struct xfs_buftarg      *target = xfs_inode_buftarg(XFS_I(inode));
    1365             : 
    1366             :         /*
    1367             :          * We don't support synchronous mappings for non-DAX files and
    1368             :          * for DAX files if underneath dax_device is not synchronous.
    1369             :          */
    1370    14074434 :         if (!daxdev_mapping_supported(vma, target->bt_daxdev))
    1371             :                 return -EOPNOTSUPP;
    1372             : 
    1373    14074231 :         file_accessed(file);
    1374    14074427 :         vma->vm_ops = &xfs_file_vm_ops;
    1375    14074427 :         if (IS_DAX(inode))
    1376           0 :                 vm_flags_set(vma, VM_HUGEPAGE);
    1377             :         return 0;
    1378             : }
    1379             : 
    1380             : const struct file_operations xfs_file_operations = {
    1381             :         .llseek         = xfs_file_llseek,
    1382             :         .read_iter      = xfs_file_read_iter,
    1383             :         .write_iter     = xfs_file_write_iter,
    1384             :         .splice_read    = xfs_file_splice_read,
    1385             :         .splice_write   = iter_file_splice_write,
    1386             :         .iopoll         = iocb_bio_iopoll,
    1387             :         .unlocked_ioctl = xfs_file_ioctl,
    1388             : #ifdef CONFIG_COMPAT
    1389             :         .compat_ioctl   = xfs_file_compat_ioctl,
    1390             : #endif
    1391             :         .mmap           = xfs_file_mmap,
    1392             :         .mmap_supported_flags = MAP_SYNC,
    1393             :         .open           = xfs_file_open,
    1394             :         .release        = xfs_file_release,
    1395             :         .fsync          = xfs_file_fsync,
    1396             :         .get_unmapped_area = thp_get_unmapped_area,
    1397             :         .fallocate      = xfs_file_fallocate,
    1398             :         .fadvise        = xfs_file_fadvise,
    1399             :         .remap_file_range = xfs_file_remap_range,
    1400             : };
    1401             : 
    1402             : const struct file_operations xfs_dir_file_operations = {
    1403             :         .open           = xfs_dir_open,
    1404             :         .read           = generic_read_dir,
    1405             :         .iterate_shared = xfs_file_readdir,
    1406             :         .llseek         = generic_file_llseek,
    1407             :         .unlocked_ioctl = xfs_file_ioctl,
    1408             : #ifdef CONFIG_COMPAT
    1409             :         .compat_ioctl   = xfs_file_compat_ioctl,
    1410             : #endif
    1411             :         .fsync          = xfs_dir_fsync,
    1412             : };

Generated by: LCOV version 1.14