LCOV - code coverage report
Current view: top level - fs/xfs - xfs_file.c (source / functions) Hit Total Coverage
Test: fstests of 6.5.0-rc3-djwa @ Mon Jul 31 20:08:17 PDT 2023 Lines: 498 523 95.2 %
Date: 2023-07-31 20:08:17 Functions: 32 34 94.1 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0
       2             : /*
       3             :  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
       4             :  * All Rights Reserved.
       5             :  */
       6             : #include "xfs.h"
       7             : #include "xfs_fs.h"
       8             : #include "xfs_shared.h"
       9             : #include "xfs_format.h"
      10             : #include "xfs_log_format.h"
      11             : #include "xfs_trans_resv.h"
      12             : #include "xfs_mount.h"
      13             : #include "xfs_inode.h"
      14             : #include "xfs_trans.h"
      15             : #include "xfs_inode_item.h"
      16             : #include "xfs_bmap.h"
      17             : #include "xfs_bmap_util.h"
      18             : #include "xfs_dir2.h"
      19             : #include "xfs_dir2_priv.h"
      20             : #include "xfs_ioctl.h"
      21             : #include "xfs_trace.h"
      22             : #include "xfs_log.h"
      23             : #include "xfs_icache.h"
      24             : #include "xfs_pnfs.h"
      25             : #include "xfs_iomap.h"
      26             : #include "xfs_reflink.h"
      27             : 
      28             : #include <linux/dax.h>
      29             : #include <linux/falloc.h>
      30             : #include <linux/backing-dev.h>
      31             : #include <linux/mman.h>
      32             : #include <linux/fadvise.h>
      33             : #include <linux/mount.h>
      34             : 
      35             : static const struct vm_operations_struct xfs_file_vm_ops;
      36             : 
      37             : /*
      38             :  * Decide if the given file range is aligned to the size of the fundamental
      39             :  * allocation unit for the file.
      40             :  */
      41             : static bool
      42     1363624 : xfs_is_falloc_aligned(
      43             :         struct xfs_inode        *ip,
      44             :         loff_t                  pos,
      45             :         long long int           len)
      46             : {
      47     1363624 :         struct xfs_mount        *mp = ip->i_mount;
      48     1363624 :         uint64_t                mask;
      49             : 
      50     1363624 :         if (XFS_IS_REALTIME_INODE(ip)) {
      51      807650 :                 if (!is_power_of_2(mp->m_sb.sb_rextsize)) {
      52           8 :                         u64     rextbytes;
      53           8 :                         u32     mod;
      54             : 
      55           8 :                         rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
      56           8 :                         div_u64_rem(pos, rextbytes, &mod);
      57           8 :                         if (mod)
      58             :                                 return false;
      59           4 :                         div_u64_rem(len, rextbytes, &mod);
      60           4 :                         return mod == 0;
      61             :                 }
      62      403817 :                 mask = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize) - 1;
      63             :         } else {
      64      959799 :                 mask = mp->m_sb.sb_blocksize - 1;
      65             :         }
      66             : 
      67     1363616 :         return !((pos | len) & mask);
      68             : }
      69             : 
      70             : /*
      71             :  * Fsync operations on directories are much simpler than on regular files,
      72             :  * as there is no file data to flush, and thus also no need for explicit
      73             :  * cache flush operations, and there are no non-transaction metadata updates
      74             :  * on directories either.
      75             :  */
      76             : STATIC int
      77      296764 : xfs_dir_fsync(
      78             :         struct file             *file,
      79             :         loff_t                  start,
      80             :         loff_t                  end,
      81             :         int                     datasync)
      82             : {
      83      296764 :         struct xfs_inode        *ip = XFS_I(file->f_mapping->host);
      84             : 
      85      296764 :         trace_xfs_dir_fsync(ip);
      86      296766 :         return xfs_log_force_inode(ip);
      87             : }
      88             : 
      89             : static xfs_csn_t
      90     1596268 : xfs_fsync_seq(
      91             :         struct xfs_inode        *ip,
      92             :         bool                    datasync)
      93             : {
      94     1596268 :         if (!xfs_ipincount(ip))
      95             :                 return 0;
      96     1596029 :         if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
      97             :                 return 0;
      98     1527492 :         return ip->i_itemp->ili_commit_seq;
      99             : }
     100             : 
     101             : /*
     102             :  * All metadata updates are logged, which means that we just have to flush the
     103             :  * log up to the latest LSN that touched the inode.
     104             :  *
     105             :  * If we have concurrent fsync/fdatasync() calls, we need them to all block on
     106             :  * the log force before we clear the ili_fsync_fields field. This ensures that
     107             :  * we don't get a racing sync operation that does not wait for the metadata to
     108             :  * hit the journal before returning.  If we race with clearing ili_fsync_fields,
     109             :  * then all that will happen is the log force will do nothing as the lsn will
     110             :  * already be on disk.  We can't race with setting ili_fsync_fields because that
     111             :  * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
     112             :  * shared until after the ili_fsync_fields is cleared.
     113             :  */
     114             : static  int
     115     1596275 : xfs_fsync_flush_log(
     116             :         struct xfs_inode        *ip,
     117             :         bool                    datasync,
     118             :         int                     *log_flushed)
     119             : {
     120     1596275 :         int                     error = 0;
     121     1596275 :         xfs_csn_t               seq;
     122             : 
     123     1596275 :         xfs_ilock(ip, XFS_ILOCK_SHARED);
     124     1596275 :         seq = xfs_fsync_seq(ip, datasync);
     125     1596276 :         if (seq) {
     126     1527496 :                 error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
     127             :                                           log_flushed);
     128             : 
     129     1526985 :                 spin_lock(&ip->i_itemp->ili_lock);
     130     1527492 :                 ip->i_itemp->ili_fsync_fields = 0;
     131     1527492 :                 spin_unlock(&ip->i_itemp->ili_lock);
     132             :         }
     133     1596197 :         xfs_iunlock(ip, XFS_ILOCK_SHARED);
     134     1596214 :         return error;
     135             : }
     136             : 
     137             : STATIC int
     138    10780990 : xfs_file_fsync(
     139             :         struct file             *file,
     140             :         loff_t                  start,
     141             :         loff_t                  end,
     142             :         int                     datasync)
     143             : {
     144    10780990 :         struct xfs_inode        *ip = XFS_I(file->f_mapping->host);
     145    10780990 :         struct xfs_mount        *mp = ip->i_mount;
     146    10780990 :         int                     error, err2;
     147    10780990 :         int                     log_flushed = 0;
     148             : 
     149    10780990 :         trace_xfs_file_fsync(ip);
     150             : 
     151    10781247 :         error = file_write_and_wait_range(file, start, end);
     152    10781296 :         if (error)
     153             :                 return error;
     154             : 
     155    21559308 :         if (xfs_is_shutdown(mp))
     156             :                 return -EIO;
     157             : 
     158    10777542 :         xfs_iflags_clear(ip, XFS_ITRUNCATED);
     159             : 
     160             :         /*
     161             :          * If we have an RT and/or log subvolume we need to make sure to flush
     162             :          * the write cache the device used for file data first.  This is to
     163             :          * ensure newly written file data make it to disk before logging the new
     164             :          * inode size in case of an extending write.
     165             :          */
     166    10777297 :         if (XFS_IS_REALTIME_INODE(ip))
     167     4311372 :                 error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
     168     6465925 :         else if (mp->m_logdev_targp != mp->m_ddev_targp)
     169           0 :                 error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
     170             : 
     171             :         /*
     172             :          * Any inode that has dirty modifications in the log is pinned.  The
     173             :          * racy check here for a pinned inode will not catch modifications
     174             :          * that happen concurrently to the fsync call, but fsync semantics
     175             :          * only require to sync previously completed I/O.
     176             :          */
     177    10776527 :         if (xfs_ipincount(ip)) {
     178     1596278 :                 err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
     179     1596094 :                 if (err2 && !error)
     180         976 :                         error = err2;
     181             :         }
     182             : 
     183             :         /*
     184             :          * If we only have a single device, and the log force about was
     185             :          * a no-op we might have to flush the data device cache here.
     186             :          * This can only happen for fdatasync/O_DSYNC if we were overwriting
     187             :          * an already allocated file and thus do not have any metadata to
     188             :          * commit.
     189             :          */
     190    10776343 :         if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
     191     5458059 :             mp->m_logdev_targp == mp->m_ddev_targp) {
     192     5457924 :                 err2 = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
     193     5458061 :                 if (err2 && !error)
     194         210 :                         error = err2;
     195             :         }
     196             : 
     197             :         return error;
     198             : }
     199             : 
     200             : static int
     201   369832508 : xfs_ilock_iocb(
     202             :         struct kiocb            *iocb,
     203             :         unsigned int            lock_mode)
     204             : {
     205   369832508 :         struct xfs_inode        *ip = XFS_I(file_inode(iocb->ki_filp));
     206             : 
     207   369832508 :         if (iocb->ki_flags & IOCB_NOWAIT) {
     208           0 :                 if (!xfs_ilock_nowait(ip, lock_mode))
     209           0 :                         return -EAGAIN;
     210             :         } else {
     211   369832508 :                 xfs_ilock(ip, lock_mode);
     212             :         }
     213             : 
     214             :         return 0;
     215             : }
     216             : 
     217             : STATIC ssize_t
     218   254229897 : xfs_file_dio_read(
     219             :         struct kiocb            *iocb,
     220             :         struct iov_iter         *to)
     221             : {
     222   254229897 :         struct xfs_inode        *ip = XFS_I(file_inode(iocb->ki_filp));
     223   254229897 :         ssize_t                 ret;
     224             : 
     225   254229897 :         trace_xfs_file_direct_read(iocb, to);
     226             : 
     227   254229900 :         if (!iov_iter_count(to))
     228             :                 return 0; /* skip atime */
     229             : 
     230   254229850 :         file_accessed(iocb->ki_filp);
     231             : 
     232   254229902 :         ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
     233   254229929 :         if (ret)
     234             :                 return ret;
     235   254229929 :         ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0);
     236   254229907 :         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
     237             : 
     238   254229907 :         return ret;
     239             : }
     240             : 
     241             : static noinline ssize_t
     242             : xfs_file_dax_read(
     243             :         struct kiocb            *iocb,
     244             :         struct iov_iter         *to)
     245             : {
     246             :         struct xfs_inode        *ip = XFS_I(iocb->ki_filp->f_mapping->host);
     247             :         ssize_t                 ret = 0;
     248             : 
     249             :         trace_xfs_file_dax_read(iocb, to);
     250             : 
     251             :         if (!iov_iter_count(to))
     252             :                 return 0; /* skip atime */
     253             : 
     254             :         ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
     255             :         if (ret)
     256             :                 return ret;
     257             :         ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
     258             :         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
     259             : 
     260             :         file_accessed(iocb->ki_filp);
     261             :         return ret;
     262             : }
     263             : 
     264             : STATIC ssize_t
     265    56551291 : xfs_file_buffered_read(
     266             :         struct kiocb            *iocb,
     267             :         struct iov_iter         *to)
     268             : {
     269    56551291 :         struct xfs_inode        *ip = XFS_I(file_inode(iocb->ki_filp));
     270    56551291 :         ssize_t                 ret;
     271             : 
     272    56551291 :         trace_xfs_file_buffered_read(iocb, to);
     273             : 
     274    56552844 :         ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
     275    56553489 :         if (ret)
     276             :                 return ret;
     277    56553952 :         ret = generic_file_read_iter(iocb, to);
     278    56545118 :         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
     279             : 
     280    56545118 :         return ret;
     281             : }
     282             : 
     283             : STATIC ssize_t
     284   310785580 : xfs_file_read_iter(
     285             :         struct kiocb            *iocb,
     286             :         struct iov_iter         *to)
     287             : {
     288   310785580 :         struct inode            *inode = file_inode(iocb->ki_filp);
     289   310785580 :         struct xfs_mount        *mp = XFS_I(inode)->i_mount;
     290   310785580 :         ssize_t                 ret = 0;
     291             : 
     292   310785580 :         XFS_STATS_INC(mp, xs_read_calls);
     293             : 
     294   621568152 :         if (xfs_is_shutdown(mp))
     295             :                 return -EIO;
     296             : 
     297   310780450 :         if (IS_DAX(inode))
     298             :                 ret = xfs_file_dax_read(iocb, to);
     299   310780450 :         else if (iocb->ki_flags & IOCB_DIRECT)
     300   254229851 :                 ret = xfs_file_dio_read(iocb, to);
     301             :         else
     302    56550599 :                 ret = xfs_file_buffered_read(iocb, to);
     303             : 
     304   310781473 :         if (ret > 0)
     305    57921598 :                 XFS_STATS_ADD(mp, xs_read_bytes, ret);
     306             :         return ret;
     307             : }
     308             : 
     309             : STATIC ssize_t
     310     4118990 : xfs_file_splice_read(
     311             :         struct file             *in,
     312             :         loff_t                  *ppos,
     313             :         struct pipe_inode_info  *pipe,
     314             :         size_t                  len,
     315             :         unsigned int            flags)
     316             : {
     317     4118990 :         struct inode            *inode = file_inode(in);
     318     4118990 :         struct xfs_inode        *ip = XFS_I(inode);
     319     4118990 :         struct xfs_mount        *mp = ip->i_mount;
     320     4118990 :         ssize_t                 ret = 0;
     321             : 
     322     4118990 :         XFS_STATS_INC(mp, xs_read_calls);
     323             : 
     324     8238004 :         if (xfs_is_shutdown(mp))
     325             :                 return -EIO;
     326             : 
     327     4118995 :         trace_xfs_file_splice_read(ip, *ppos, len);
     328             : 
     329     4118997 :         xfs_ilock(ip, XFS_IOLOCK_SHARED);
     330     4118990 :         ret = filemap_splice_read(in, ppos, pipe, len, flags);
     331     4119005 :         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
     332     4119005 :         if (ret > 0)
     333     4118865 :                 XFS_STATS_ADD(mp, xs_read_bytes, ret);
     334             :         return ret;
     335             : }
     336             : 
     337             : /*
     338             :  * Common pre-write limit and setup checks.
     339             :  *
     340             :  * Called with the iolocked held either shared and exclusive according to
     341             :  * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
     342             :  * if called for a direct write beyond i_size.
     343             :  */
     344             : STATIC ssize_t
     345    56625354 : xfs_file_write_checks(
     346             :         struct kiocb            *iocb,
     347             :         struct iov_iter         *from,
     348             :         unsigned int            *iolock)
     349             : {
     350    56625354 :         struct file             *file = iocb->ki_filp;
     351    56625354 :         struct inode            *inode = file->f_mapping->host;
     352    56625354 :         struct xfs_inode        *ip = XFS_I(inode);
     353    56625354 :         ssize_t                 error = 0;
     354    56625354 :         size_t                  count = iov_iter_count(from);
     355    56625354 :         bool                    drained_dio = false;
     356    72238969 :         loff_t                  isize;
     357             : 
     358             : restart:
     359    72238969 :         error = generic_write_checks(iocb, from);
     360    72212530 :         if (error <= 0)
     361           6 :                 return error;
     362             : 
     363    72212524 :         if (iocb->ki_flags & IOCB_NOWAIT) {
     364           0 :                 error = break_layout(inode, false);
     365           0 :                 if (error == -EWOULDBLOCK)
     366             :                         error = -EAGAIN;
     367             :         } else {
     368    72212524 :                 error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
     369             :         }
     370             : 
     371    72236426 :         if (error)
     372           0 :                 return error;
     373             : 
     374             :         /*
     375             :          * For changing security info in file_remove_privs() we need i_rwsem
     376             :          * exclusively.
     377             :          */
     378    72236426 :         if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
     379       25691 :                 xfs_iunlock(ip, *iolock);
     380       25691 :                 *iolock = XFS_IOLOCK_EXCL;
     381       25691 :                 error = xfs_ilock_iocb(iocb, *iolock);
     382       25691 :                 if (error) {
     383           0 :                         *iolock = 0;
     384           0 :                         return error;
     385             :                 }
     386       25691 :                 goto restart;
     387             :         }
     388             : 
     389             :         /*
     390             :          * If the offset is beyond the size of the file, we need to zero any
     391             :          * blocks that fall between the existing EOF and the start of this
     392             :          * write.  If zeroing is needed and we are currently holding the iolock
     393             :          * shared, we need to update it to exclusive which implies having to
     394             :          * redo all checks before.
     395             :          *
     396             :          * We need to serialise against EOF updates that occur in IO completions
     397             :          * here. We want to make sure that nobody is changing the size while we
     398             :          * do this check until we have placed an IO barrier (i.e.  hold the
     399             :          * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.  The
     400             :          * spinlock effectively forms a memory barrier once we have the
     401             :          * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
     402             :          * hence be able to correctly determine if we need to run zeroing.
     403             :          *
     404             :          * We can do an unlocked check here safely as IO completion can only
     405             :          * extend EOF. Truncate is locked out at this point, so the EOF can
     406             :          * not move backwards, only forwards. Hence we only need to take the
     407             :          * slow path and spin locks when we are at or beyond the current EOF.
     408             :          */
     409    72210735 :         if (iocb->ki_pos <= i_size_read(inode))
     410    41054335 :                 goto out;
     411             : 
     412    31156400 :         spin_lock(&ip->i_flags_lock);
     413    31155154 :         isize = i_size_read(inode);
     414    31155154 :         if (iocb->ki_pos > isize) {
     415    31155154 :                 spin_unlock(&ip->i_flags_lock);
     416             : 
     417    31161308 :                 if (iocb->ki_flags & IOCB_NOWAIT)
     418             :                         return -EAGAIN;
     419             : 
     420    31161308 :                 if (!drained_dio) {
     421    15587975 :                         if (*iolock == XFS_IOLOCK_SHARED) {
     422      101835 :                                 xfs_iunlock(ip, *iolock);
     423      101835 :                                 *iolock = XFS_IOLOCK_EXCL;
     424      101835 :                                 xfs_ilock(ip, *iolock);
     425      101833 :                                 iov_iter_reexpand(from, count);
     426             :                         }
     427             :                         /*
     428             :                          * We now have an IO submission barrier in place, but
     429             :                          * AIO can do EOF updates during IO completion and hence
     430             :                          * we now need to wait for all of them to drain. Non-AIO
     431             :                          * DIO will have drained before we are given the
     432             :                          * XFS_IOLOCK_EXCL, and so for most cases this wait is a
     433             :                          * no-op.
     434             :                          */
     435    15587973 :                         inode_dio_wait(inode);
     436    15587924 :                         drained_dio = true;
     437    15587924 :                         goto restart;
     438             :                 }
     439             : 
     440    15573333 :                 trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
     441    15573743 :                 error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
     442    15569394 :                 if (error)
     443             :                         return error;
     444             :         } else
     445           0 :                 spin_unlock(&ip->i_flags_lock);
     446             : 
     447    56623114 : out:
     448    56623114 :         return kiocb_modified(iocb);
     449             : }
     450             : 
     451             : static int
     452     6287111 : xfs_dio_write_end_io(
     453             :         struct kiocb            *iocb,
     454             :         ssize_t                 size,
     455             :         int                     error,
     456             :         unsigned                flags)
     457             : {
     458     6287111 :         struct inode            *inode = file_inode(iocb->ki_filp);
     459     6287111 :         struct xfs_inode        *ip = XFS_I(inode);
     460     6287111 :         loff_t                  offset = iocb->ki_pos;
     461     6287111 :         unsigned int            nofs_flag;
     462             : 
     463     6287111 :         trace_xfs_end_io_direct_write(ip, offset, size);
     464             : 
     465    12574090 :         if (xfs_is_shutdown(ip->i_mount))
     466             :                 return -EIO;
     467             : 
     468     6286376 :         if (error)
     469             :                 return error;
     470     5494092 :         if (!size)
     471             :                 return 0;
     472             : 
     473             :         /*
     474             :          * Capture amount written on completion as we can't reliably account
     475             :          * for it on submission.
     476             :          */
     477     5494092 :         XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
     478             : 
     479             :         /*
     480             :          * We can allocate memory here while doing writeback on behalf of
     481             :          * memory reclaim.  To avoid memory allocation deadlocks set the
     482             :          * task-wide nofs context for the following operations.
     483             :          */
     484     5494075 :         nofs_flag = memalloc_nofs_save();
     485             : 
     486     5494075 :         if (flags & IOMAP_DIO_COW) {
     487     1103891 :                 error = xfs_reflink_end_cow(ip, offset, size);
     488     1103891 :                 if (error)
     489          29 :                         goto out;
     490             :         }
     491             : 
     492             :         /*
     493             :          * Unwritten conversion updates the in-core isize after extent
     494             :          * conversion but before updating the on-disk size. Updating isize any
     495             :          * earlier allows a racing dio read to find unwritten extents before
     496             :          * they are converted.
     497             :          */
     498     5494046 :         if (flags & IOMAP_DIO_UNWRITTEN) {
     499     2671488 :                 error = xfs_iomap_write_unwritten(ip, offset, size, true);
     500     2671592 :                 goto out;
     501             :         }
     502             : 
     503             :         /*
     504             :          * We need to update the in-core inode size here so that we don't end up
     505             :          * with the on-disk inode size being outside the in-core inode size. We
     506             :          * have no other method of updating EOF for AIO, so always do it here
     507             :          * if necessary.
     508             :          *
     509             :          * We need to lock the test/set EOF update as we can be racing with
     510             :          * other IO completions here to update the EOF. Failing to serialise
     511             :          * here can result in EOF moving backwards and Bad Things Happen when
     512             :          * that occurs.
     513             :          *
     514             :          * As IO completion only ever extends EOF, we can do an unlocked check
     515             :          * here to avoid taking the spinlock. If we land within the current EOF,
     516             :          * then we do not need to do an extending update at all, and we don't
     517             :          * need to take the lock to check this. If we race with an update moving
     518             :          * EOF, then we'll either still be beyond EOF and need to take the lock,
     519             :          * or we'll be within EOF and we don't need to take it at all.
     520             :          */
     521     2822558 :         if (offset + size <= i_size_read(inode))
     522     2612239 :                 goto out;
     523             : 
     524      210319 :         spin_lock(&ip->i_flags_lock);
     525      210319 :         if (offset + size > i_size_read(inode)) {
     526      210319 :                 i_size_write(inode, offset + size);
     527      210319 :                 spin_unlock(&ip->i_flags_lock);
     528      210319 :                 error = xfs_setfilesize(ip, offset, size);
     529             :         } else {
     530           0 :                 spin_unlock(&ip->i_flags_lock);
     531             :         }
     532             : 
     533     5494179 : out:
     534     5494179 :         memalloc_nofs_restore(nofs_flag);
     535     5494179 :         return error;
     536             : }
     537             : 
     538             : static const struct iomap_dio_ops xfs_dio_write_ops = {
     539             :         .end_io         = xfs_dio_write_end_io,
     540             : };
     541             : 
     542             : /*
     543             :  * Handle block aligned direct I/O writes
     544             :  */
     545             : static noinline ssize_t
     546     3445034 : xfs_file_dio_write_aligned(
     547             :         struct xfs_inode        *ip,
     548             :         struct kiocb            *iocb,
     549             :         struct iov_iter         *from)
     550             : {
     551     3445034 :         unsigned int            iolock = XFS_IOLOCK_SHARED;
     552     3445034 :         ssize_t                 ret;
     553             : 
     554     3445034 :         ret = xfs_ilock_iocb(iocb, iolock);
     555     3445069 :         if (ret)
     556             :                 return ret;
     557     3445065 :         ret = xfs_file_write_checks(iocb, from, &iolock);
     558     3445032 :         if (ret)
     559         575 :                 goto out_unlock;
     560             : 
     561             :         /*
     562             :          * We don't need to hold the IOLOCK exclusively across the IO, so demote
     563             :          * the iolock back to shared if we had to take the exclusive lock in
     564             :          * xfs_file_write_checks() for other reasons.
     565             :          */
     566     3444457 :         if (iolock == XFS_IOLOCK_EXCL) {
     567      104697 :                 xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
     568      104696 :                 iolock = XFS_IOLOCK_SHARED;
     569             :         }
     570     3444456 :         trace_xfs_file_direct_write(iocb, from);
     571     3444445 :         ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
     572             :                            &xfs_dio_write_ops, 0, NULL, 0);
     573     3445051 : out_unlock:
     574     3445051 :         if (iolock)
     575     3445072 :                 xfs_iunlock(ip, iolock);
     576             :         return ret;
     577             : }
     578             : 
     579             : /*
     580             :  * Handle block unaligned direct I/O writes
     581             :  *
     582             :  * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
     583             :  * them to be done in parallel with reads and other direct I/O writes.  However,
     584             :  * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
     585             :  * to do sub-block zeroing and that requires serialisation against other direct
     586             :  * I/O to the same block.  In this case we need to serialise the submission of
     587             :  * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
     588             :  * In the case where sub-block zeroing is not required, we can do concurrent
     589             :  * sub-block dios to the same block successfully.
     590             :  *
     591             :  * Optimistically submit the I/O using the shared lock first, but use the
     592             :  * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
     593             :  * if block allocation or partial block zeroing would be required.  In that case
     594             :  * we try again with the exclusive lock.
     595             :  */
     596             : static noinline ssize_t
     597     4589504 : xfs_file_dio_write_unaligned(
     598             :         struct xfs_inode        *ip,
     599             :         struct kiocb            *iocb,
     600             :         struct iov_iter         *from)
     601             : {
     602     4589504 :         size_t                  isize = i_size_read(VFS_I(ip));
     603     4589504 :         size_t                  count = iov_iter_count(from);
     604     4589504 :         unsigned int            iolock = XFS_IOLOCK_SHARED;
     605     4589504 :         unsigned int            flags = IOMAP_DIO_OVERWRITE_ONLY;
     606     4589504 :         ssize_t                 ret;
     607             : 
     608             :         /*
     609             :          * Extending writes need exclusivity because of the sub-block zeroing
     610             :          * that the DIO code always does for partial tail blocks beyond EOF, so
     611             :          * don't even bother trying the fast path in this case.
     612             :          */
     613     4589504 :         if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
     614     1743223 :                 if (iocb->ki_flags & IOCB_NOWAIT)
     615             :                         return -EAGAIN;
     616     1743223 : retry_exclusive:
     617     2413792 :                 iolock = XFS_IOLOCK_EXCL;
     618     2413792 :                 flags = IOMAP_DIO_FORCE_WAIT;
     619             :         }
     620             : 
     621     5260073 :         ret = xfs_ilock_iocb(iocb, iolock);
     622     5260068 :         if (ret)
     623             :                 return ret;
     624             : 
     625             :         /*
     626             :          * We can't properly handle unaligned direct I/O to reflink files yet,
     627             :          * as we can't unshare a partial block.
     628             :          */
     629     5260065 :         if (xfs_is_cow_inode(ip)) {
     630     2417249 :                 trace_xfs_reflink_bounce_dio_write(iocb, from);
     631     2417250 :                 ret = -ENOTBLK;
     632     2417250 :                 goto out_unlock;
     633             :         }
     634             : 
     635     2842814 :         ret = xfs_file_write_checks(iocb, from, &iolock);
     636     2842812 :         if (ret)
     637          66 :                 goto out_unlock;
     638             : 
     639             :         /*
     640             :          * If we are doing exclusive unaligned I/O, this must be the only I/O
     641             :          * in-flight.  Otherwise we risk data corruption due to unwritten extent
     642             :          * conversions from the AIO end_io handler.  Wait for all other I/O to
     643             :          * drain first.
     644             :          */
     645     2842746 :         if (flags & IOMAP_DIO_FORCE_WAIT)
     646     2066550 :                 inode_dio_wait(VFS_I(ip));
     647             : 
     648     2842746 :         trace_xfs_file_direct_write(iocb, from);
     649     2842746 :         ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
     650             :                            &xfs_dio_write_ops, flags, NULL, 0);
     651             : 
     652             :         /*
     653             :          * Retry unaligned I/O with exclusive blocking semantics if the DIO
     654             :          * layer rejected it for mapping or locking reasons. If we are doing
     655             :          * nonblocking user I/O, propagate the error.
     656             :          */
     657     2842746 :         if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
     658      670571 :                 ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
     659      670571 :                 xfs_iunlock(ip, iolock);
     660      670569 :                 goto retry_exclusive;
     661             :         }
     662             : 
     663     2172175 : out_unlock:
     664     4589491 :         if (iolock)
     665     4589491 :                 xfs_iunlock(ip, iolock);
     666             :         return ret;
     667             : }
     668             : 
     669             : static ssize_t
     670     8034534 : xfs_file_dio_write(
     671             :         struct kiocb            *iocb,
     672             :         struct iov_iter         *from)
     673             : {
     674     8034534 :         struct xfs_inode        *ip = XFS_I(file_inode(iocb->ki_filp));
     675     8034534 :         struct xfs_buftarg      *target = xfs_inode_buftarg(ip);
     676     8034534 :         size_t                  count = iov_iter_count(from);
     677             : 
     678             :         /* direct I/O must be aligned to device logical sector size */
     679     8034534 :         if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
     680             :                 return -EINVAL;
     681     8034534 :         if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
     682     4589487 :                 return xfs_file_dio_write_unaligned(ip, iocb, from);
     683     3445047 :         return xfs_file_dio_write_aligned(ip, iocb, from);
     684             : }
     685             : 
     686             : static noinline ssize_t
     687             : xfs_file_dax_write(
     688             :         struct kiocb            *iocb,
     689             :         struct iov_iter         *from)
     690             : {
     691             :         struct inode            *inode = iocb->ki_filp->f_mapping->host;
     692             :         struct xfs_inode        *ip = XFS_I(inode);
     693             :         unsigned int            iolock = XFS_IOLOCK_EXCL;
     694             :         ssize_t                 ret, error = 0;
     695             :         loff_t                  pos;
     696             : 
     697             :         ret = xfs_ilock_iocb(iocb, iolock);
     698             :         if (ret)
     699             :                 return ret;
     700             :         ret = xfs_file_write_checks(iocb, from, &iolock);
     701             :         if (ret)
     702             :                 goto out;
     703             : 
     704             :         pos = iocb->ki_pos;
     705             : 
     706             :         trace_xfs_file_dax_write(iocb, from);
     707             :         ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
     708             :         if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
     709             :                 i_size_write(inode, iocb->ki_pos);
     710             :                 error = xfs_setfilesize(ip, pos, ret);
     711             :         }
     712             : out:
     713             :         if (iolock)
     714             :                 xfs_iunlock(ip, iolock);
     715             :         if (error)
     716             :                 return error;
     717             : 
     718             :         if (ret > 0) {
     719             :                 XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
     720             : 
     721             :                 /* Handle various SYNC-type writes */
     722             :                 ret = generic_write_sync(iocb, ret);
     723             :         }
     724             :         return ret;
     725             : }
     726             : 
     727             : STATIC ssize_t
     728    49913564 : xfs_file_buffered_write(
     729             :         struct kiocb            *iocb,
     730             :         struct iov_iter         *from)
     731             : {
     732    49913564 :         struct inode            *inode = iocb->ki_filp->f_mapping->host;
     733    49913564 :         struct xfs_inode        *ip = XFS_I(inode);
     734    49913564 :         ssize_t                 ret;
     735    49913564 :         bool                    cleared_space = false;
     736    50320147 :         unsigned int            iolock;
     737             : 
     738             : write_retry:
     739    50320147 :         iolock = XFS_IOLOCK_EXCL;
     740    50320147 :         ret = xfs_ilock_iocb(iocb, iolock);
     741    50340183 :         if (ret)
     742           0 :                 return ret;
     743             : 
     744    50340183 :         ret = xfs_file_write_checks(iocb, from, &iolock);
     745    50332400 :         if (ret)
     746         649 :                 goto out;
     747             : 
     748    50331751 :         trace_xfs_file_buffered_write(iocb, from);
     749    50339154 :         ret = iomap_file_buffered_write(iocb, from,
     750             :                         &xfs_buffered_write_iomap_ops);
     751             : 
     752             :         /*
     753             :          * If we hit a space limit, try to free up some lingering preallocated
     754             :          * space before returning an error. In the case of ENOSPC, first try to
     755             :          * write back all dirty inodes to free up some of the excess reserved
     756             :          * metadata space. This reduces the chances that the eofblocks scan
     757             :          * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
     758             :          * also behaves as a filter to prevent too many eofblocks scans from
     759             :          * running at the same time.  Use a synchronous scan to increase the
     760             :          * effectiveness of the scan.
     761             :          */
     762    50329316 :         if (ret == -EDQUOT && !cleared_space) {
     763         333 :                 xfs_iunlock(ip, iolock);
     764         333 :                 xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC);
     765         333 :                 cleared_space = true;
     766         333 :                 goto write_retry;
     767    50328983 :         } else if (ret == -ENOSPC && !cleared_space) {
     768      406270 :                 struct xfs_icwalk       icw = {0};
     769             : 
     770      406270 :                 cleared_space = true;
     771      406270 :                 xfs_flush_inodes(ip->i_mount);
     772             : 
     773      406166 :                 xfs_iunlock(ip, iolock);
     774      406066 :                 icw.icw_flags = XFS_ICWALK_FLAG_SYNC;
     775      406066 :                 xfs_blockgc_free_space(ip->i_mount, &icw);
     776      406250 :                 goto write_retry;
     777             :         }
     778             : 
     779    49922713 : out:
     780    49923362 :         if (iolock)
     781    49933547 :                 xfs_iunlock(ip, iolock);
     782             : 
     783    49923458 :         if (ret > 0) {
     784    49562558 :                 XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
     785             :                 /* Handle various SYNC-type writes */
     786    49562448 :                 ret = generic_write_sync(iocb, ret);
     787             :         }
     788             :         return ret;
     789             : }
     790             : 
     791             : STATIC ssize_t
     792    55533279 : xfs_file_write_iter(
     793             :         struct kiocb            *iocb,
     794             :         struct iov_iter         *from)
     795             : {
     796    55533279 :         struct inode            *inode = iocb->ki_filp->f_mapping->host;
     797    55533279 :         struct xfs_inode        *ip = XFS_I(inode);
     798    55533279 :         ssize_t                 ret;
     799    55533279 :         size_t                  ocount = iov_iter_count(from);
     800             : 
     801    55533279 :         XFS_STATS_INC(ip->i_mount, xs_write_calls);
     802             : 
     803    55557287 :         if (ocount == 0)
     804             :                 return 0;
     805             : 
     806   111114510 :         if (xfs_is_shutdown(ip->i_mount))
     807             :                 return -EIO;
     808             : 
     809    55547740 :         if (IS_DAX(inode))
     810             :                 return xfs_file_dax_write(iocb, from);
     811             : 
     812    55547740 :         if (iocb->ki_flags & IOCB_DIRECT) {
     813             :                 /*
     814             :                  * Allow a directio write to fall back to a buffered
     815             :                  * write *only* in the case that we're doing a reflink
     816             :                  * CoW.  In all other directio scenarios we do not
     817             :                  * allow an operation to fall back to buffered mode.
     818             :                  */
     819     8034528 :                 ret = xfs_file_dio_write(iocb, from);
     820     8034489 :                 if (ret != -ENOTBLK)
     821             :                         return ret;
     822             :         }
     823             : 
     824    49930562 :         return xfs_file_buffered_write(iocb, from);
     825             : }
     826             : 
     827             : static void
     828             : xfs_wait_dax_page(
     829             :         struct inode            *inode)
     830             : {
     831             :         struct xfs_inode        *ip = XFS_I(inode);
     832             : 
     833             :         xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
     834             :         schedule();
     835             :         xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
     836             : }
     837             : 
     838             : int
     839    16480174 : xfs_break_dax_layouts(
     840             :         struct inode            *inode,
     841             :         bool                    *retry)
     842             : {
     843    16480174 :         struct page             *page;
     844             : 
     845    16480174 :         ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
     846             : 
     847    16480175 :         page = dax_layout_busy_page(inode->i_mapping);
     848    16480175 :         if (!page)
     849    16480175 :                 return 0;
     850             : 
     851             :         *retry = true;
     852             :         return ___wait_var_event(&page->_refcount,
     853             :                         atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
     854             :                         0, 0, xfs_wait_dax_page(inode));
     855             : }
     856             : 
     857             : int
     858    88706769 : xfs_break_layouts(
     859             :         struct inode            *inode,
     860             :         uint                    *iolock,
     861             :         enum layout_break_reason reason)
     862             : {
     863    88706769 :         bool                    retry;
     864    88706769 :         int                     error;
     865             : 
     866    88706769 :         ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
     867             : 
     868    88710683 :         do {
     869    88710683 :                 retry = false;
     870    88710683 :                 switch (reason) {
     871    16480167 :                 case BREAK_UNMAP:
     872    16480167 :                         error = xfs_break_dax_layouts(inode, &retry);
     873    16480173 :                         if (error || retry)
     874             :                                 break;
     875    88710680 :                         fallthrough;
     876             :                 case BREAK_WRITE:
     877    88710680 :                         error = xfs_break_leased_layouts(inode, iolock, &retry);
     878    88710680 :                         break;
     879             :                 default:
     880           0 :                         WARN_ON_ONCE(1);
     881           0 :                         error = -EINVAL;
     882             :                 }
     883    88712684 :         } while (error == 0 && retry);
     884             : 
     885    88711866 :         return error;
     886             : }
     887             : 
     888             : /* Does this file, inode, or mount want synchronous writes? */
     889   171321549 : static inline bool xfs_file_sync_writes(struct file *filp)
     890             : {
     891   171321549 :         struct xfs_inode        *ip = XFS_I(file_inode(filp));
     892             : 
     893   171321549 :         if (xfs_has_wsync(ip->i_mount))
     894             :                 return true;
     895   171321541 :         if (filp->f_flags & (__O_SYNC | O_DSYNC))
     896             :                 return true;
     897   171314409 :         if (IS_SYNC(file_inode(filp)))
     898           4 :                 return true;
     899             : 
     900             :         return false;
     901             : }
     902             : 
     903             : #define XFS_FALLOC_FL_SUPPORTED                                         \
     904             :                 (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |           \
     905             :                  FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |      \
     906             :                  FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)
     907             : 
     908             : STATIC long
     909    13378462 : xfs_file_fallocate(
     910             :         struct file             *file,
     911             :         int                     mode,
     912             :         loff_t                  offset,
     913             :         loff_t                  len)
     914             : {
     915    13378462 :         struct inode            *inode = file_inode(file);
     916    13378462 :         struct xfs_inode        *ip = XFS_I(inode);
     917    13378462 :         long                    error;
     918    13378462 :         uint                    iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
     919    13378462 :         loff_t                  new_size = 0;
     920    13378462 :         bool                    do_file_insert = false;
     921             : 
     922    13378462 :         if (!S_ISREG(inode->i_mode))
     923             :                 return -EINVAL;
     924    13378462 :         if (mode & ~XFS_FALLOC_FL_SUPPORTED)
     925             :                 return -EOPNOTSUPP;
     926             : 
     927    13378462 :         xfs_ilock(ip, iolock);
     928    13378479 :         error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
     929    13378485 :         if (error)
     930           0 :                 goto out_unlock;
     931             : 
     932             :         /*
     933             :          * Must wait for all AIO to complete before we continue as AIO can
     934             :          * change the file size on completion without holding any locks we
     935             :          * currently hold. We must do this first because AIO can update both
     936             :          * the on disk and in memory inode sizes, and the operations that follow
     937             :          * require the in-memory size to be fully up-to-date.
     938             :          */
     939    13378485 :         inode_dio_wait(inode);
     940             : 
     941             :         /*
     942             :          * Now AIO and DIO has drained we flush and (if necessary) invalidate
     943             :          * the cached range over the first operation we are about to run.
     944             :          *
     945             :          * We care about zero and collapse here because they both run a hole
     946             :          * punch over the range first. Because that can zero data, and the range
     947             :          * of invalidation for the shift operations is much larger, we still do
     948             :          * the required flush for collapse in xfs_prepare_shift().
     949             :          *
     950             :          * Insert has the same range requirements as collapse, and we extend the
     951             :          * file first which can zero data. Hence insert has the same
     952             :          * flush/invalidate requirements as collapse and so they are both
     953             :          * handled at the right time by xfs_prepare_shift().
     954             :          */
     955    13378453 :         if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
     956             :                     FALLOC_FL_COLLAPSE_RANGE)) {
     957     9702637 :                 error = xfs_flush_unmap_range(ip, offset, len);
     958     9702644 :                 if (error)
     959         147 :                         goto out_unlock;
     960             :         }
     961             : 
     962    13378313 :         error = file_modified(file);
     963    13378335 :         if (error)
     964           5 :                 goto out_unlock;
     965             : 
     966    13378330 :         if (mode & FALLOC_FL_PUNCH_HOLE) {
     967     7451151 :                 error = xfs_free_file_space(ip, offset, len);
     968     7451154 :                 if (error)
     969       14926 :                         goto out_unlock;
     970     5927179 :         } else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
     971      733649 :                 if (!xfs_is_falloc_aligned(ip, offset, len)) {
     972      205920 :                         error = -EINVAL;
     973      205920 :                         goto out_unlock;
     974             :                 }
     975             : 
     976             :                 /*
     977             :                  * There is no need to overlap collapse range with EOF,
     978             :                  * in which case it is effectively a truncate operation
     979             :                  */
     980      527729 :                 if (offset + len >= i_size_read(inode)) {
     981       87858 :                         error = -EINVAL;
     982       87858 :                         goto out_unlock;
     983             :                 }
     984             : 
     985      439871 :                 new_size = i_size_read(inode) - len;
     986             : 
     987      439871 :                 error = xfs_collapse_file_space(ip, offset, len);
     988      439871 :                 if (error)
     989        1849 :                         goto out_unlock;
     990     5193530 :         } else if (mode & FALLOC_FL_INSERT_RANGE) {
     991      629975 :                 loff_t          isize = i_size_read(inode);
     992             : 
     993      629975 :                 if (!xfs_is_falloc_aligned(ip, offset, len)) {
     994      206219 :                         error = -EINVAL;
     995      206219 :                         goto out_unlock;
     996             :                 }
     997             : 
     998             :                 /*
     999             :                  * New inode size must not exceed ->s_maxbytes, accounting for
    1000             :                  * possible signed overflow.
    1001             :                  */
    1002      423756 :                 if (inode->i_sb->s_maxbytes - isize < len) {
    1003           2 :                         error = -EFBIG;
    1004           2 :                         goto out_unlock;
    1005             :                 }
    1006      423754 :                 new_size = isize + len;
    1007             : 
    1008             :                 /* Offset should be less than i_size */
    1009      423754 :                 if (offset >= isize) {
    1010       71624 :                         error = -EINVAL;
    1011       71624 :                         goto out_unlock;
    1012             :                 }
    1013             :                 do_file_insert = true;
    1014             :         } else {
    1015     4563555 :                 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
    1016     2111455 :                     offset + len > i_size_read(inode)) {
    1017     1149292 :                         new_size = offset + len;
    1018     1149292 :                         error = inode_newsize_ok(inode, new_size);
    1019     1149292 :                         if (error)
    1020           2 :                                 goto out_unlock;
    1021             :                 }
    1022             : 
    1023     4563553 :                 if (mode & FALLOC_FL_ZERO_RANGE) {
    1024             :                         /*
    1025             :                          * Punch a hole and prealloc the range.  We use a hole
    1026             :                          * punch rather than unwritten extent conversion for two
    1027             :                          * reasons:
    1028             :                          *
    1029             :                          *   1.) Hole punch handles partial block zeroing for us.
    1030             :                          *   2.) If prealloc returns ENOSPC, the file range is
    1031             :                          *       still zero-valued by virtue of the hole punch.
    1032             :                          */
    1033     1517694 :                         unsigned int blksize = i_blocksize(inode);
    1034             : 
    1035     1517695 :                         trace_xfs_zero_file_space(ip);
    1036             : 
    1037     1517693 :                         error = xfs_free_file_space(ip, offset, len);
    1038     1517695 :                         if (error)
    1039        8547 :                                 goto out_unlock;
    1040             : 
    1041     1509148 :                         len = round_up(offset + len, blksize) -
    1042     1509148 :                               round_down(offset, blksize);
    1043     1509148 :                         offset = round_down(offset, blksize);
    1044     3045859 :                 } else if (mode & FALLOC_FL_UNSHARE_RANGE) {
    1045          66 :                         error = xfs_reflink_unshare(ip, offset, len);
    1046          66 :                         if (error)
    1047           2 :                                 goto out_unlock;
    1048             :                 } else {
    1049             :                         /*
    1050             :                          * If always_cow mode we can't use preallocations and
    1051             :                          * thus should not create them.
    1052             :                          */
    1053     3045793 :                         if (xfs_is_always_cow_inode(ip)) {
    1054           0 :                                 error = -EOPNOTSUPP;
    1055           0 :                                 goto out_unlock;
    1056             :                         }
    1057             :                 }
    1058             : 
    1059     4555008 :                 if (!xfs_is_always_cow_inode(ip)) {
    1060     4555011 :                         error = xfs_alloc_file_space(ip, offset, len);
    1061     4555008 :                         if (error)
    1062       88706 :                                 goto out_unlock;
    1063             :                 }
    1064             :         }
    1065             : 
    1066             :         /* Change file size if needed */
    1067    12692680 :         if (new_size) {
    1068     1932953 :                 struct iattr iattr;
    1069             : 
    1070     1932953 :                 iattr.ia_valid = ATTR_SIZE;
    1071     1932953 :                 iattr.ia_size = new_size;
    1072     3865906 :                 error = xfs_vn_setattr_size(file_mnt_idmap(file),
    1073             :                                             file_dentry(file), &iattr);
    1074     1932952 :                 if (error)
    1075         334 :                         goto out_unlock;
    1076             :         }
    1077             : 
    1078             :         /*
    1079             :          * Perform hole insertion now that the file size has been
    1080             :          * updated so that if we crash during the operation we don't
    1081             :          * leave shifted extents past EOF and hence losing access to
    1082             :          * the data that is contained within them.
    1083             :          */
    1084    12692345 :         if (do_file_insert) {
    1085      352028 :                 error = xfs_insert_file_space(ip, offset, len);
    1086      352028 :                 if (error)
    1087        1784 :                         goto out_unlock;
    1088             :         }
    1089             : 
    1090    12690561 :         if (xfs_file_sync_writes(file))
    1091        7128 :                 error = xfs_log_force_inode(ip);
    1092             : 
    1093    12683433 : out_unlock:
    1094    13378486 :         xfs_iunlock(ip, iolock);
    1095    13378486 :         return error;
    1096             : }
    1097             : 
    1098             : STATIC int
    1099     2117253 : xfs_file_fadvise(
    1100             :         struct file     *file,
    1101             :         loff_t          start,
    1102             :         loff_t          end,
    1103             :         int             advice)
    1104             : {
    1105     2117253 :         struct xfs_inode *ip = XFS_I(file_inode(file));
    1106     2117253 :         int ret;
    1107     2117253 :         int lockflags = 0;
    1108             : 
    1109             :         /*
    1110             :          * Operations creating pages in page cache need protection from hole
    1111             :          * punching and similar ops
    1112             :          */
    1113     2117253 :         if (advice == POSIX_FADV_WILLNEED) {
    1114           0 :                 lockflags = XFS_IOLOCK_SHARED;
    1115           0 :                 xfs_ilock(ip, lockflags);
    1116             :         }
    1117     2117253 :         ret = generic_fadvise(file, start, end, advice);
    1118     2117290 :         if (lockflags)
    1119           0 :                 xfs_iunlock(ip, lockflags);
    1120     2117290 :         return ret;
    1121             : }
    1122             : 
    1123             : STATIC loff_t
    1124   155167707 : xfs_file_remap_range(
    1125             :         struct file             *file_in,
    1126             :         loff_t                  pos_in,
    1127             :         struct file             *file_out,
    1128             :         loff_t                  pos_out,
    1129             :         loff_t                  len,
    1130             :         unsigned int            remap_flags)
    1131             : {
    1132   155167707 :         struct inode            *inode_in = file_inode(file_in);
    1133   155167707 :         struct xfs_inode        *src = XFS_I(inode_in);
    1134   155167707 :         struct inode            *inode_out = file_inode(file_out);
    1135   155167707 :         struct xfs_inode        *dest = XFS_I(inode_out);
    1136   155167707 :         struct xfs_mount        *mp = src->i_mount;
    1137   155167707 :         loff_t                  remapped = 0;
    1138   155167707 :         xfs_extlen_t            cowextsize;
    1139   155167707 :         int                     ret;
    1140             : 
    1141   155167707 :         if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
    1142             :                 return -EINVAL;
    1143             : 
    1144   155167707 :         if (!xfs_has_reflink(mp))
    1145             :                 return -EOPNOTSUPP;
    1146             : 
    1147   208052910 :         if (xfs_is_shutdown(mp))
    1148             :                 return -EIO;
    1149             : 
    1150             :         /* Prepare and then clone file data. */
    1151   104023893 :         ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
    1152             :                         &len, remap_flags);
    1153   104026798 :         if (ret || len == 0)
    1154    24121121 :                 return ret;
    1155             : 
    1156    79905677 :         trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
    1157             : 
    1158    79906231 :         ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
    1159             :                         &remapped);
    1160    79906480 :         if (ret)
    1161      590168 :                 goto out_unlock;
    1162             : 
    1163             :         /*
    1164             :          * Carry the cowextsize hint from src to dest if we're sharing the
    1165             :          * entire source file to the entire destination file, the source file
    1166             :          * has a cowextsize hint, and the destination file does not.
    1167             :          */
    1168    79316312 :         cowextsize = 0;
    1169    79316312 :         if (pos_in == 0 && len == i_size_read(inode_in) &&
    1170       41903 :             (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
    1171          48 :             pos_out == 0 && len >= i_size_read(inode_out) &&
    1172          46 :             !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE))
    1173           6 :                 cowextsize = src->i_cowextsize;
    1174             : 
    1175    79316312 :         ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
    1176             :                         remap_flags);
    1177    79316199 :         if (ret)
    1178           0 :                 goto out_unlock;
    1179             : 
    1180    79316199 :         if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
    1181         525 :                 xfs_log_force_inode(dest);
    1182    79315688 : out_unlock:
    1183    79905872 :         xfs_iunlock2_io_mmap(src, dest);
    1184    79906196 :         if (ret)
    1185      590120 :                 trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
    1186    79906252 :         return remapped > 0 ? remapped : ret;
    1187             : }
    1188             : 
    1189             : STATIC int
    1190   395136907 : xfs_file_open(
    1191             :         struct inode    *inode,
    1192             :         struct file     *file)
    1193             : {
    1194   790273814 :         if (xfs_is_shutdown(XFS_M(inode->i_sb)))
    1195             :                 return -EIO;
    1196   395123818 :         file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC |
    1197             :                         FMODE_DIO_PARALLEL_WRITE | FMODE_CAN_ODIRECT;
    1198   395123818 :         return generic_file_open(inode, file);
    1199             : }
    1200             : 
    1201             : STATIC int
    1202    73895682 : xfs_dir_open(
    1203             :         struct inode    *inode,
    1204             :         struct file     *file)
    1205             : {
    1206    73895682 :         struct xfs_inode *ip = XFS_I(inode);
    1207    73895682 :         unsigned int    mode;
    1208    73895682 :         int             error;
    1209             : 
    1210    73895682 :         error = xfs_file_open(inode, file);
    1211    73903849 :         if (error)
    1212             :                 return error;
    1213             : 
    1214             :         /*
    1215             :          * If there are any blocks, read-ahead block 0 as we're almost
    1216             :          * certain to have the next operation be a read there.
    1217             :          */
    1218    73902806 :         mode = xfs_ilock_data_map_shared(ip);
    1219    73900210 :         if (ip->i_df.if_nextents > 0)
    1220     5869471 :                 error = xfs_dir3_data_readahead(ip, 0, 0);
    1221    73900275 :         xfs_iunlock(ip, mode);
    1222    73900275 :         return error;
    1223             : }
    1224             : 
    1225             : STATIC int
    1226   321212867 : xfs_file_release(
    1227             :         struct inode    *inode,
    1228             :         struct file     *filp)
    1229             : {
    1230   321212867 :         return xfs_release(XFS_I(inode));
    1231             : }
    1232             : 
    1233             : STATIC int
    1234   143554438 : xfs_file_readdir(
    1235             :         struct file     *file,
    1236             :         struct dir_context *ctx)
    1237             : {
    1238   143554438 :         struct inode    *inode = file_inode(file);
    1239   143554438 :         xfs_inode_t     *ip = XFS_I(inode);
    1240   143554438 :         size_t          bufsize;
    1241             : 
    1242             :         /*
    1243             :          * The Linux API doesn't pass down the total size of the buffer
    1244             :          * we read into down to the filesystem.  With the filldir concept
    1245             :          * it's not needed for correct information, but the XFS dir2 leaf
    1246             :          * code wants an estimate of the buffer size to calculate it's
    1247             :          * readahead window and size the buffers used for mapping to
    1248             :          * physical blocks.
    1249             :          *
    1250             :          * Try to give it an estimate that's good enough, maybe at some
    1251             :          * point we can change the ->readdir prototype to include the
    1252             :          * buffer size.  For now we use the current glibc buffer size.
    1253             :          */
    1254   143554438 :         bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size);
    1255             : 
    1256   143554438 :         return xfs_readdir(NULL, ip, ctx, bufsize);
    1257             : }
    1258             : 
    1259             : STATIC loff_t
    1260    35014929 : xfs_file_llseek(
    1261             :         struct file     *file,
    1262             :         loff_t          offset,
    1263             :         int             whence)
    1264             : {
    1265    35014929 :         struct inode            *inode = file->f_mapping->host;
    1266             : 
    1267    70029858 :         if (xfs_is_shutdown(XFS_I(inode)->i_mount))
    1268             :                 return -EIO;
    1269             : 
    1270    35014926 :         switch (whence) {
    1271    34851959 :         default:
    1272    34851959 :                 return generic_file_llseek(file, offset, whence);
    1273         326 :         case SEEK_HOLE:
    1274         326 :                 offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
    1275         326 :                 break;
    1276      162641 :         case SEEK_DATA:
    1277      162641 :                 offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
    1278      162641 :                 break;
    1279             :         }
    1280             : 
    1281      162967 :         if (offset < 0)
    1282             :                 return offset;
    1283      125937 :         return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
    1284             : }
    1285             : 
    1286             : #ifdef CONFIG_FS_DAX
    1287             : static inline vm_fault_t
    1288             : xfs_dax_fault(
    1289             :         struct vm_fault         *vmf,
    1290             :         enum page_entry_size    pe_size,
    1291             :         bool                    write_fault,
    1292             :         pfn_t                   *pfn)
    1293             : {
    1294             :         return dax_iomap_fault(vmf, pe_size, pfn, NULL,
    1295             :                         (write_fault && !vmf->cow_page) ?
    1296             :                                 &xfs_dax_write_iomap_ops :
    1297             :                                 &xfs_read_iomap_ops);
    1298             : }
    1299             : #else
    1300             : static inline vm_fault_t
    1301             : xfs_dax_fault(
    1302             :         struct vm_fault         *vmf,
    1303             :         enum page_entry_size    pe_size,
    1304             :         bool                    write_fault,
    1305             :         pfn_t                   *pfn)
    1306             : {
    1307             :         ASSERT(0);
    1308             :         return VM_FAULT_SIGBUS;
    1309             : }
    1310             : #endif
    1311             : 
    1312             : /*
    1313             :  * Locking for serialisation of IO during page faults. This results in a lock
    1314             :  * ordering of:
    1315             :  *
    1316             :  * mmap_lock (MM)
    1317             :  *   sb_start_pagefault(vfs, freeze)
    1318             :  *     invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
    1319             :  *       page_lock (MM)
    1320             :  *         i_lock (XFS - extent map serialisation)
    1321             :  */
    1322             : static vm_fault_t
    1323    14109953 : __xfs_filemap_fault(
    1324             :         struct vm_fault         *vmf,
    1325             :         enum page_entry_size    pe_size,
    1326             :         bool                    write_fault)
    1327             : {
    1328    14109953 :         struct inode            *inode = file_inode(vmf->vma->vm_file);
    1329    14109953 :         struct xfs_inode        *ip = XFS_I(inode);
    1330    14109953 :         vm_fault_t              ret;
    1331             : 
    1332    14109953 :         trace_xfs_filemap_fault(ip, pe_size, write_fault);
    1333             : 
    1334    14111748 :         if (write_fault) {
    1335     3446534 :                 sb_start_pagefault(inode->i_sb);
    1336     3446583 :                 file_update_time(vmf->vma->vm_file);
    1337             :         }
    1338             : 
    1339    14111804 :         if (IS_DAX(inode)) {
    1340             :                 pfn_t pfn;
    1341             : 
    1342             :                 xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
    1343             :                 ret = xfs_dax_fault(vmf, pe_size, write_fault, &pfn);
    1344             :                 if (ret & VM_FAULT_NEEDDSYNC)
    1345             :                         ret = dax_finish_sync_fault(vmf, pe_size, pfn);
    1346             :                 xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
    1347             :         } else {
    1348    14111804 :                 if (write_fault) {
    1349     3446591 :                         xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
    1350     3446595 :                         ret = iomap_page_mkwrite(vmf,
    1351             :                                         &xfs_page_mkwrite_iomap_ops);
    1352     3446566 :                         xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
    1353             :                 } else {
    1354    10665213 :                         ret = filemap_fault(vmf);
    1355             :                 }
    1356             :         }
    1357             : 
    1358    14112534 :         if (write_fault)
    1359     3446593 :                 sb_end_pagefault(inode->i_sb);
    1360    14112528 :         return ret;
    1361             : }
    1362             : 
    1363             : static inline bool
    1364             : xfs_is_write_fault(
    1365             :         struct vm_fault         *vmf)
    1366             : {
    1367             :         return (vmf->flags & FAULT_FLAG_WRITE) &&
    1368             :                (vmf->vma->vm_flags & VM_SHARED);
    1369             : }
    1370             : 
    1371             : static vm_fault_t
    1372    10665364 : xfs_filemap_fault(
    1373             :         struct vm_fault         *vmf)
    1374             : {
    1375             :         /* DAX can shortcut the normal fault path on write faults! */
    1376    10665364 :         return __xfs_filemap_fault(vmf, PE_SIZE_PTE,
    1377             :                         IS_DAX(file_inode(vmf->vma->vm_file)) &&
    1378             :                         xfs_is_write_fault(vmf));
    1379             : }
    1380             : 
    1381             : static vm_fault_t
    1382           0 : xfs_filemap_huge_fault(
    1383             :         struct vm_fault         *vmf,
    1384             :         enum page_entry_size    pe_size)
    1385             : {
    1386           0 :         if (!IS_DAX(file_inode(vmf->vma->vm_file)))
    1387           0 :                 return VM_FAULT_FALLBACK;
    1388             : 
    1389             :         /* DAX can shortcut the normal fault path on write faults! */
    1390             :         return __xfs_filemap_fault(vmf, pe_size,
    1391             :                         xfs_is_write_fault(vmf));
    1392             : }
    1393             : 
    1394             : static vm_fault_t
    1395     3446484 : xfs_filemap_page_mkwrite(
    1396             :         struct vm_fault         *vmf)
    1397             : {
    1398     3446484 :         return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
    1399             : }
    1400             : 
    1401             : /*
    1402             :  * pfn_mkwrite was originally intended to ensure we capture time stamp updates
    1403             :  * on write faults. In reality, it needs to serialise against truncate and
    1404             :  * prepare memory for writing so handle is as standard write fault.
    1405             :  */
    1406             : static vm_fault_t
    1407           0 : xfs_filemap_pfn_mkwrite(
    1408             :         struct vm_fault         *vmf)
    1409             : {
    1410             : 
    1411           0 :         return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
    1412             : }
    1413             : 
    1414             : static const struct vm_operations_struct xfs_file_vm_ops = {
    1415             :         .fault          = xfs_filemap_fault,
    1416             :         .huge_fault     = xfs_filemap_huge_fault,
    1417             :         .map_pages      = filemap_map_pages,
    1418             :         .page_mkwrite   = xfs_filemap_page_mkwrite,
    1419             :         .pfn_mkwrite    = xfs_filemap_pfn_mkwrite,
    1420             : };
    1421             : 
    1422             : STATIC int
    1423     7551264 : xfs_file_mmap(
    1424             :         struct file             *file,
    1425             :         struct vm_area_struct   *vma)
    1426             : {
    1427     7551264 :         struct inode            *inode = file_inode(file);
    1428     7551264 :         struct xfs_buftarg      *target = xfs_inode_buftarg(XFS_I(inode));
    1429             : 
    1430             :         /*
    1431             :          * We don't support synchronous mappings for non-DAX files and
    1432             :          * for DAX files if underneath dax_device is not synchronous.
    1433             :          */
    1434     7551264 :         if (!daxdev_mapping_supported(vma, target->bt_daxdev))
    1435             :                 return -EOPNOTSUPP;
    1436             : 
    1437     7551265 :         file_accessed(file);
    1438     7551279 :         vma->vm_ops = &xfs_file_vm_ops;
    1439     7551279 :         if (IS_DAX(inode))
    1440             :                 vm_flags_set(vma, VM_HUGEPAGE);
    1441     7551279 :         return 0;
    1442             : }
    1443             : 
    1444             : const struct file_operations xfs_file_operations = {
    1445             :         .llseek         = xfs_file_llseek,
    1446             :         .read_iter      = xfs_file_read_iter,
    1447             :         .write_iter     = xfs_file_write_iter,
    1448             :         .splice_read    = xfs_file_splice_read,
    1449             :         .splice_write   = iter_file_splice_write,
    1450             :         .iopoll         = iocb_bio_iopoll,
    1451             :         .unlocked_ioctl = xfs_file_ioctl,
    1452             : #ifdef CONFIG_COMPAT
    1453             :         .compat_ioctl   = xfs_file_compat_ioctl,
    1454             : #endif
    1455             :         .mmap           = xfs_file_mmap,
    1456             :         .mmap_supported_flags = MAP_SYNC,
    1457             :         .open           = xfs_file_open,
    1458             :         .release        = xfs_file_release,
    1459             :         .fsync          = xfs_file_fsync,
    1460             :         .get_unmapped_area = thp_get_unmapped_area,
    1461             :         .fallocate      = xfs_file_fallocate,
    1462             :         .fadvise        = xfs_file_fadvise,
    1463             :         .remap_file_range = xfs_file_remap_range,
    1464             : };
    1465             : 
    1466             : const struct file_operations xfs_dir_file_operations = {
    1467             :         .open           = xfs_dir_open,
    1468             :         .read           = generic_read_dir,
    1469             :         .iterate_shared = xfs_file_readdir,
    1470             :         .llseek         = generic_file_llseek,
    1471             :         .unlocked_ioctl = xfs_file_ioctl,
    1472             : #ifdef CONFIG_COMPAT
    1473             :         .compat_ioctl   = xfs_file_compat_ioctl,
    1474             : #endif
    1475             :         .fsync          = xfs_dir_fsync,
    1476             : };

Generated by: LCOV version 1.14