LCOV - code coverage report
Current view: top level - fs/xfs - xfs_file.c (source / functions) Hit Total Coverage
Test: fstests of 6.5.0-rc3-acha @ Mon Jul 31 20:08:06 PDT 2023 Lines: 471 494 95.3 %
Date: 2023-07-31 20:08:07 Functions: 30 32 93.8 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0
       2             : /*
       3             :  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
       4             :  * All Rights Reserved.
       5             :  */
       6             : #include "xfs.h"
       7             : #include "xfs_fs.h"
       8             : #include "xfs_shared.h"
       9             : #include "xfs_format.h"
      10             : #include "xfs_log_format.h"
      11             : #include "xfs_trans_resv.h"
      12             : #include "xfs_mount.h"
      13             : #include "xfs_inode.h"
      14             : #include "xfs_trans.h"
      15             : #include "xfs_inode_item.h"
      16             : #include "xfs_bmap.h"
      17             : #include "xfs_bmap_util.h"
      18             : #include "xfs_dir2.h"
      19             : #include "xfs_dir2_priv.h"
      20             : #include "xfs_ioctl.h"
      21             : #include "xfs_trace.h"
      22             : #include "xfs_log.h"
      23             : #include "xfs_icache.h"
      24             : #include "xfs_pnfs.h"
      25             : #include "xfs_iomap.h"
      26             : #include "xfs_reflink.h"
      27             : #include "xfs_file.h"
      28             : 
      29             : #include <linux/dax.h>
      30             : #include <linux/falloc.h>
      31             : #include <linux/backing-dev.h>
      32             : #include <linux/mman.h>
      33             : #include <linux/fadvise.h>
      34             : #include <linux/mount.h>
      35             : 
      36             : static const struct vm_operations_struct xfs_file_vm_ops;
      37             : 
      38             : /*
      39             :  * Decide if the given file range is aligned to the size of the fundamental
      40             :  * allocation unit for the file.
      41             :  */
      42             : bool
      43     1545492 : xfs_is_falloc_aligned(
      44             :         struct xfs_inode        *ip,
      45             :         loff_t                  pos,
      46             :         long long int           len)
      47             : {
      48     1545492 :         unsigned int            alloc_unit = xfs_inode_alloc_unitsize(ip);
      49             : 
      50     2024502 :         if (XFS_IS_REALTIME_INODE(ip) && !is_power_of_2(alloc_unit))
      51           8 :                 return isaligned_64(pos, alloc_unit) &&
      52           4 :                        isaligned_64(len, alloc_unit);
      53             : 
      54     1545484 :         return !((pos | len) & (alloc_unit - 1));
      55             : }
      56             : 
      57             : /*
      58             :  * Fsync operations on directories are much simpler than on regular files,
      59             :  * as there is no file data to flush, and thus also no need for explicit
      60             :  * cache flush operations, and there are no non-transaction metadata updates
      61             :  * on directories either.
      62             :  */
      63             : STATIC int
      64      511825 : xfs_dir_fsync(
      65             :         struct file             *file,
      66             :         loff_t                  start,
      67             :         loff_t                  end,
      68             :         int                     datasync)
      69             : {
      70      511825 :         struct xfs_inode        *ip = XFS_I(file->f_mapping->host);
      71             : 
      72      511825 :         trace_xfs_dir_fsync(ip);
      73      511826 :         return xfs_log_force_inode(ip);
      74             : }
      75             : 
      76             : static xfs_csn_t
      77     1854237 : xfs_fsync_seq(
      78             :         struct xfs_inode        *ip,
      79             :         bool                    datasync)
      80             : {
      81     1854237 :         if (!xfs_ipincount(ip))
      82             :                 return 0;
      83     1854111 :         if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
      84             :                 return 0;
      85     1786971 :         return ip->i_itemp->ili_commit_seq;
      86             : }
      87             : 
      88             : /*
      89             :  * All metadata updates are logged, which means that we just have to flush the
      90             :  * log up to the latest LSN that touched the inode.
      91             :  *
      92             :  * If we have concurrent fsync/fdatasync() calls, we need them to all block on
      93             :  * the log force before we clear the ili_fsync_fields field. This ensures that
      94             :  * we don't get a racing sync operation that does not wait for the metadata to
      95             :  * hit the journal before returning.  If we race with clearing ili_fsync_fields,
      96             :  * then all that will happen is the log force will do nothing as the lsn will
      97             :  * already be on disk.  We can't race with setting ili_fsync_fields because that
      98             :  * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
      99             :  * shared until after the ili_fsync_fields is cleared.
     100             :  */
     101             : static  int
     102     1854265 : xfs_fsync_flush_log(
     103             :         struct xfs_inode        *ip,
     104             :         bool                    datasync,
     105             :         int                     *log_flushed)
     106             : {
     107     1854265 :         int                     error = 0;
     108     1854265 :         xfs_csn_t               seq;
     109             : 
     110     1854265 :         xfs_ilock(ip, XFS_ILOCK_SHARED);
     111     1854265 :         seq = xfs_fsync_seq(ip, datasync);
     112     1854258 :         if (seq) {
     113     1786991 :                 error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
     114             :                                           log_flushed);
     115             : 
     116     1785417 :                 spin_lock(&ip->i_itemp->ili_lock);
     117     1786953 :                 ip->i_itemp->ili_fsync_fields = 0;
     118     1786953 :                 spin_unlock(&ip->i_itemp->ili_lock);
     119             :         }
     120     1854093 :         xfs_iunlock(ip, XFS_ILOCK_SHARED);
     121     1853965 :         return error;
     122             : }
     123             : 
     124             : STATIC int
     125    16555924 : xfs_file_fsync(
     126             :         struct file             *file,
     127             :         loff_t                  start,
     128             :         loff_t                  end,
     129             :         int                     datasync)
     130             : {
     131    16555924 :         struct xfs_inode        *ip = XFS_I(file->f_mapping->host);
     132    16555924 :         struct xfs_mount        *mp = ip->i_mount;
     133    16555924 :         int                     error, err2;
     134    16555924 :         int                     log_flushed = 0;
     135             : 
     136    16555924 :         trace_xfs_file_fsync(ip);
     137             : 
     138    16555526 :         error = file_write_and_wait_range(file, start, end);
     139    16555676 :         if (error)
     140             :                 return error;
     141             : 
     142    33108314 :         if (xfs_is_shutdown(mp))
     143             :                 return -EIO;
     144             : 
     145    16552130 :         xfs_iflags_clear(ip, XFS_ITRUNCATED);
     146             : 
     147             :         /*
     148             :          * If we have an RT and/or log subvolume we need to make sure to flush
     149             :          * the write cache the device used for file data first.  This is to
     150             :          * ensure newly written file data make it to disk before logging the new
     151             :          * inode size in case of an extending write.
     152             :          */
     153    16552284 :         if (XFS_IS_REALTIME_INODE(ip))
     154     4835735 :                 error = xfs_buftarg_flush(mp->m_rtdev_targp);
     155    11716549 :         else if (mp->m_logdev_targp != mp->m_ddev_targp)
     156           0 :                 error = xfs_buftarg_flush(mp->m_ddev_targp);
     157             : 
     158             :         /*
     159             :          * Any inode that has dirty modifications in the log is pinned.  The
     160             :          * racy check here for a pinned inode will not catch modifications
     161             :          * that happen concurrently to the fsync call, but fsync semantics
     162             :          * only require to sync previously completed I/O.
     163             :          */
     164    16551322 :         if (xfs_ipincount(ip)) {
     165     1854251 :                 err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
     166     1853532 :                 if (err2 && !error)
     167        1048 :                         error = err2;
     168             :         }
     169             : 
     170             :         /*
     171             :          * If we only have a single device, and the log force about was
     172             :          * a no-op we might have to flush the data device cache here.
     173             :          * This can only happen for fdatasync/O_DSYNC if we were overwriting
     174             :          * an already allocated file and thus do not have any metadata to
     175             :          * commit.
     176             :          */
     177    16550603 :         if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
     178    10651207 :             mp->m_logdev_targp == mp->m_ddev_targp) {
     179    10651006 :                 err2 = xfs_buftarg_flush(mp->m_ddev_targp);
     180    10651104 :                 if (err2 && !error)
     181         244 :                         error = err2;
     182             :         }
     183             : 
     184             :         return error;
     185             : }
     186             : 
     187             : static int
     188   386595754 : xfs_ilock_iocb(
     189             :         struct kiocb            *iocb,
     190             :         unsigned int            lock_mode)
     191             : {
     192   386595754 :         struct xfs_inode        *ip = XFS_I(file_inode(iocb->ki_filp));
     193             : 
     194   386595754 :         if (iocb->ki_flags & IOCB_NOWAIT) {
     195           0 :                 if (!xfs_ilock_nowait(ip, lock_mode))
     196           0 :                         return -EAGAIN;
     197             :         } else {
     198   386595754 :                 xfs_ilock(ip, lock_mode);
     199             :         }
     200             : 
     201             :         return 0;
     202             : }
     203             : 
     204             : STATIC ssize_t
     205   264621062 : xfs_file_dio_read(
     206             :         struct kiocb            *iocb,
     207             :         struct iov_iter         *to)
     208             : {
     209   264621062 :         struct xfs_inode        *ip = XFS_I(file_inode(iocb->ki_filp));
     210   264621062 :         ssize_t                 ret;
     211             : 
     212   264621062 :         trace_xfs_file_direct_read(iocb, to);
     213             : 
     214   264621012 :         if (!iov_iter_count(to))
     215             :                 return 0; /* skip atime */
     216             : 
     217   264620986 :         file_accessed(iocb->ki_filp);
     218             : 
     219   264621082 :         ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
     220   264621085 :         if (ret)
     221             :                 return ret;
     222   264621087 :         ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0);
     223   264621089 :         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
     224             : 
     225   264621089 :         return ret;
     226             : }
     227             : 
     228             : static noinline ssize_t
     229             : xfs_file_dax_read(
     230             :         struct kiocb            *iocb,
     231             :         struct iov_iter         *to)
     232             : {
     233             :         struct xfs_inode        *ip = XFS_I(iocb->ki_filp->f_mapping->host);
     234             :         ssize_t                 ret = 0;
     235             : 
     236             :         trace_xfs_file_dax_read(iocb, to);
     237             : 
     238             :         if (!iov_iter_count(to))
     239             :                 return 0; /* skip atime */
     240             : 
     241             :         ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
     242             :         if (ret)
     243             :                 return ret;
     244             :         ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
     245             :         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
     246             : 
     247             :         file_accessed(iocb->ki_filp);
     248             :         return ret;
     249             : }
     250             : 
     251             : STATIC ssize_t
     252    58292486 : xfs_file_buffered_read(
     253             :         struct kiocb            *iocb,
     254             :         struct iov_iter         *to)
     255             : {
     256    58292486 :         struct xfs_inode        *ip = XFS_I(file_inode(iocb->ki_filp));
     257    58292486 :         ssize_t                 ret;
     258             : 
     259    58292486 :         trace_xfs_file_buffered_read(iocb, to);
     260             : 
     261    58324215 :         ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
     262    58329890 :         if (ret)
     263             :                 return ret;
     264    58329614 :         ret = generic_file_read_iter(iocb, to);
     265    58306421 :         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
     266             : 
     267    58306421 :         return ret;
     268             : }
     269             : 
     270             : STATIC ssize_t
     271   322947291 : xfs_file_read_iter(
     272             :         struct kiocb            *iocb,
     273             :         struct iov_iter         *to)
     274             : {
     275   322947291 :         struct inode            *inode = file_inode(iocb->ki_filp);
     276   322947291 :         struct xfs_mount        *mp = XFS_I(inode)->i_mount;
     277   322947291 :         ssize_t                 ret = 0;
     278             : 
     279   322947291 :         XFS_STATS_INC(mp, xs_read_calls);
     280             : 
     281   645894582 :         if (xfs_is_shutdown(mp))
     282             :                 return -EIO;
     283             : 
     284   322945149 :         if (IS_DAX(inode))
     285             :                 ret = xfs_file_dax_read(iocb, to);
     286   322945149 :         else if (iocb->ki_flags & IOCB_DIRECT)
     287   264620989 :                 ret = xfs_file_dio_read(iocb, to);
     288             :         else
     289    58324160 :                 ret = xfs_file_buffered_read(iocb, to);
     290             : 
     291   322948964 :         if (ret > 0)
     292    60513359 :                 XFS_STATS_ADD(mp, xs_read_bytes, ret);
     293             :         return ret;
     294             : }
     295             : 
     296             : STATIC ssize_t
     297     4917290 : xfs_file_splice_read(
     298             :         struct file             *in,
     299             :         loff_t                  *ppos,
     300             :         struct pipe_inode_info  *pipe,
     301             :         size_t                  len,
     302             :         unsigned int            flags)
     303             : {
     304     4917290 :         struct inode            *inode = file_inode(in);
     305     4917290 :         struct xfs_inode        *ip = XFS_I(inode);
     306     4917290 :         struct xfs_mount        *mp = ip->i_mount;
     307     4917290 :         ssize_t                 ret = 0;
     308             : 
     309     4917290 :         XFS_STATS_INC(mp, xs_read_calls);
     310             : 
     311     9834580 :         if (xfs_is_shutdown(mp))
     312             :                 return -EIO;
     313             : 
     314     4917284 :         trace_xfs_file_splice_read(ip, *ppos, len);
     315             : 
     316     4917283 :         xfs_ilock(ip, XFS_IOLOCK_SHARED);
     317     4917287 :         ret = filemap_splice_read(in, ppos, pipe, len, flags);
     318     4917321 :         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
     319     4917321 :         if (ret > 0)
     320     4917189 :                 XFS_STATS_ADD(mp, xs_read_bytes, ret);
     321             :         return ret;
     322             : }
     323             : 
     324             : /*
     325             :  * Common pre-write limit and setup checks.
     326             :  *
     327             :  * Called with the iolocked held either shared and exclusive according to
     328             :  * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
     329             :  * if called for a direct write beyond i_size.
     330             :  */
     331             : STATIC ssize_t
     332    60500263 : xfs_file_write_checks(
     333             :         struct kiocb            *iocb,
     334             :         struct iov_iter         *from,
     335             :         unsigned int            *iolock)
     336             : {
     337    60500263 :         struct file             *file = iocb->ki_filp;
     338    60500263 :         struct inode            *inode = file->f_mapping->host;
     339    60500263 :         struct xfs_inode        *ip = XFS_I(inode);
     340    60500263 :         ssize_t                 error = 0;
     341    60500263 :         size_t                  count = iov_iter_count(from);
     342    60500263 :         bool                    drained_dio = false;
     343    77572171 :         loff_t                  isize;
     344             : 
     345             : restart:
     346    77572171 :         error = generic_write_checks(iocb, from);
     347    77543357 :         if (error <= 0)
     348           6 :                 return error;
     349             : 
     350    77543351 :         if (iocb->ki_flags & IOCB_NOWAIT) {
     351           0 :                 error = break_layout(inode, false);
     352           0 :                 if (error == -EWOULDBLOCK)
     353             :                         error = -EAGAIN;
     354             :         } else {
     355    77543351 :                 error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
     356             :         }
     357             : 
     358    77548137 :         if (error)
     359           0 :                 return error;
     360             : 
     361             :         /*
     362             :          * For changing security info in file_remove_privs() we need i_rwsem
     363             :          * exclusively.
     364             :          */
     365    77548137 :         if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
     366      135636 :                 xfs_iunlock(ip, *iolock);
     367      135632 :                 *iolock = XFS_IOLOCK_EXCL;
     368      135632 :                 error = xfs_ilock_iocb(iocb, *iolock);
     369      135632 :                 if (error) {
     370           0 :                         *iolock = 0;
     371           0 :                         return error;
     372             :                 }
     373      135632 :                 goto restart;
     374             :         }
     375             : 
     376             :         /*
     377             :          * If the offset is beyond the size of the file, we need to zero any
     378             :          * blocks that fall between the existing EOF and the start of this
     379             :          * write.  If zeroing is needed and we are currently holding the iolock
     380             :          * shared, we need to update it to exclusive which implies having to
     381             :          * redo all checks before.
     382             :          *
     383             :          * We need to serialise against EOF updates that occur in IO completions
     384             :          * here. We want to make sure that nobody is changing the size while we
     385             :          * do this check until we have placed an IO barrier (i.e.  hold the
     386             :          * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.  The
     387             :          * spinlock effectively forms a memory barrier once we have the
     388             :          * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
     389             :          * hence be able to correctly determine if we need to run zeroing.
     390             :          *
     391             :          * We can do an unlocked check here safely as IO completion can only
     392             :          * extend EOF. Truncate is locked out at this point, so the EOF can
     393             :          * not move backwards, only forwards. Hence we only need to take the
     394             :          * slow path and spin locks when we are at or beyond the current EOF.
     395             :          */
     396    77412501 :         if (iocb->ki_pos <= i_size_read(inode))
     397    43580951 :                 goto out;
     398             : 
     399    33831550 :         spin_lock(&ip->i_flags_lock);
     400    33834417 :         isize = i_size_read(inode);
     401    33834417 :         if (iocb->ki_pos > isize) {
     402    33834417 :                 spin_unlock(&ip->i_flags_lock);
     403             : 
     404    33852889 :                 if (iocb->ki_flags & IOCB_NOWAIT)
     405             :                         return -EAGAIN;
     406             : 
     407    33852889 :                 if (!drained_dio) {
     408    16935798 :                         if (*iolock == XFS_IOLOCK_SHARED) {
     409      100605 :                                 xfs_iunlock(ip, *iolock);
     410      100599 :                                 *iolock = XFS_IOLOCK_EXCL;
     411      100599 :                                 xfs_ilock(ip, *iolock);
     412      100599 :                                 iov_iter_reexpand(from, count);
     413             :                         }
     414             :                         /*
     415             :                          * We now have an IO submission barrier in place, but
     416             :                          * AIO can do EOF updates during IO completion and hence
     417             :                          * we now need to wait for all of them to drain. Non-AIO
     418             :                          * DIO will have drained before we are given the
     419             :                          * XFS_IOLOCK_EXCL, and so for most cases this wait is a
     420             :                          * no-op.
     421             :                          */
     422    16935792 :                         inode_dio_wait(inode);
     423    16936276 :                         drained_dio = true;
     424    16936276 :                         goto restart;
     425             :                 }
     426             : 
     427    16917091 :                 trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
     428    16917631 :                 error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
     429    16916231 :                 if (error)
     430             :                         return error;
     431             :         } else
     432           0 :                 spin_unlock(&ip->i_flags_lock);
     433             : 
     434    60496540 : out:
     435    60496540 :         return kiocb_modified(iocb);
     436             : }
     437             : 
     438             : static int
     439     6832814 : xfs_dio_write_end_io(
     440             :         struct kiocb            *iocb,
     441             :         ssize_t                 size,
     442             :         int                     error,
     443             :         unsigned                flags)
     444             : {
     445     6832814 :         struct inode            *inode = file_inode(iocb->ki_filp);
     446     6832814 :         struct xfs_inode        *ip = XFS_I(inode);
     447     6832814 :         loff_t                  offset = iocb->ki_pos;
     448     6832814 :         unsigned int            nofs_flag;
     449             : 
     450     6832814 :         trace_xfs_end_io_direct_write(ip, offset, size);
     451             : 
     452    13664884 :         if (xfs_is_shutdown(ip->i_mount))
     453             :                 return -EIO;
     454             : 
     455     6831890 :         if (error)
     456             :                 return error;
     457     5934304 :         if (!size)
     458             :                 return 0;
     459             : 
     460             :         /*
     461             :          * Capture amount written on completion as we can't reliably account
     462             :          * for it on submission.
     463             :          */
     464     5934304 :         XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
     465             : 
     466             :         /*
     467             :          * We can allocate memory here while doing writeback on behalf of
     468             :          * memory reclaim.  To avoid memory allocation deadlocks set the
     469             :          * task-wide nofs context for the following operations.
     470             :          */
     471     5934304 :         nofs_flag = memalloc_nofs_save();
     472             : 
     473     5934304 :         if (flags & IOMAP_DIO_COW) {
     474     1137240 :                 error = xfs_reflink_end_cow(ip, offset, size);
     475     1137240 :                 if (error)
     476          44 :                         goto out;
     477             :         }
     478             : 
     479             :         /*
     480             :          * Unwritten conversion updates the in-core isize after extent
     481             :          * conversion but before updating the on-disk size. Updating isize any
     482             :          * earlier allows a racing dio read to find unwritten extents before
     483             :          * they are converted.
     484             :          */
     485     5934260 :         if (flags & IOMAP_DIO_UNWRITTEN) {
     486     3028286 :                 error = xfs_iomap_write_unwritten(ip, offset, size, true);
     487     3028476 :                 goto out;
     488             :         }
     489             : 
     490             :         /*
     491             :          * We need to update the in-core inode size here so that we don't end up
     492             :          * with the on-disk inode size being outside the in-core inode size. We
     493             :          * have no other method of updating EOF for AIO, so always do it here
     494             :          * if necessary.
     495             :          *
     496             :          * We need to lock the test/set EOF update as we can be racing with
     497             :          * other IO completions here to update the EOF. Failing to serialise
     498             :          * here can result in EOF moving backwards and Bad Things Happen when
     499             :          * that occurs.
     500             :          *
     501             :          * As IO completion only ever extends EOF, we can do an unlocked check
     502             :          * here to avoid taking the spinlock. If we land within the current EOF,
     503             :          * then we do not need to do an extending update at all, and we don't
     504             :          * need to take the lock to check this. If we race with an update moving
     505             :          * EOF, then we'll either still be beyond EOF and need to take the lock,
     506             :          * or we'll be within EOF and we don't need to take it at all.
     507             :          */
     508     2905974 :         if (offset + size <= i_size_read(inode))
     509     2739722 :                 goto out;
     510             : 
     511      166252 :         spin_lock(&ip->i_flags_lock);
     512      166252 :         if (offset + size > i_size_read(inode)) {
     513      166252 :                 i_size_write(inode, offset + size);
     514      166252 :                 spin_unlock(&ip->i_flags_lock);
     515      166252 :                 error = xfs_setfilesize(ip, offset, size);
     516             :         } else {
     517           0 :                 spin_unlock(&ip->i_flags_lock);
     518             :         }
     519             : 
     520     5934494 : out:
     521     5934494 :         memalloc_nofs_restore(nofs_flag);
     522     5934494 :         return error;
     523             : }
     524             : 
     525             : static const struct iomap_dio_ops xfs_dio_write_ops = {
     526             :         .end_io         = xfs_dio_write_end_io,
     527             : };
     528             : 
     529             : /*
     530             :  * Handle block aligned direct I/O writes
     531             :  */
     532             : static noinline ssize_t
     533     3620705 : xfs_file_dio_write_aligned(
     534             :         struct xfs_inode        *ip,
     535             :         struct kiocb            *iocb,
     536             :         struct iov_iter         *from)
     537             : {
     538     3620705 :         unsigned int            iolock = XFS_IOLOCK_SHARED;
     539     3620705 :         ssize_t                 ret;
     540             : 
     541     3620705 :         ret = xfs_ilock_iocb(iocb, iolock);
     542     3620735 :         if (ret)
     543             :                 return ret;
     544     3620696 :         ret = xfs_file_write_checks(iocb, from, &iolock);
     545     3620770 :         if (ret)
     546         642 :                 goto out_unlock;
     547             : 
     548             :         /*
     549             :          * We don't need to hold the IOLOCK exclusively across the IO, so demote
     550             :          * the iolock back to shared if we had to take the exclusive lock in
     551             :          * xfs_file_write_checks() for other reasons.
     552             :          */
     553     3620128 :         if (iolock == XFS_IOLOCK_EXCL) {
     554      134499 :                 xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
     555      134497 :                 iolock = XFS_IOLOCK_SHARED;
     556             :         }
     557     3620126 :         trace_xfs_file_direct_write(iocb, from);
     558     3620121 :         ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
     559             :                            &xfs_dio_write_ops, 0, NULL, 0);
     560     3620833 : out_unlock:
     561     3620833 :         if (iolock)
     562     3620833 :                 xfs_iunlock(ip, iolock);
     563             :         return ret;
     564             : }
     565             : 
     566             : /*
     567             :  * Handle block unaligned direct I/O writes
     568             :  *
     569             :  * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
     570             :  * them to be done in parallel with reads and other direct I/O writes.  However,
     571             :  * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
     572             :  * to do sub-block zeroing and that requires serialisation against other direct
     573             :  * I/O to the same block.  In this case we need to serialise the submission of
     574             :  * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
     575             :  * In the case where sub-block zeroing is not required, we can do concurrent
     576             :  * sub-block dios to the same block successfully.
     577             :  *
     578             :  * Optimistically submit the I/O using the shared lock first, but use the
     579             :  * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
     580             :  * if block allocation or partial block zeroing would be required.  In that case
     581             :  * we try again with the exclusive lock.
     582             :  */
     583             : static noinline ssize_t
     584     5490563 : xfs_file_dio_write_unaligned(
     585             :         struct xfs_inode        *ip,
     586             :         struct kiocb            *iocb,
     587             :         struct iov_iter         *from)
     588             : {
     589     5490563 :         size_t                  isize = i_size_read(VFS_I(ip));
     590     5490563 :         size_t                  count = iov_iter_count(from);
     591     5490563 :         unsigned int            iolock = XFS_IOLOCK_SHARED;
     592     5490563 :         unsigned int            flags = IOMAP_DIO_OVERWRITE_ONLY;
     593     5490563 :         ssize_t                 ret;
     594             : 
     595             :         /*
     596             :          * Extending writes need exclusivity because of the sub-block zeroing
     597             :          * that the DIO code always does for partial tail blocks beyond EOF, so
     598             :          * don't even bother trying the fast path in this case.
     599             :          */
     600     5490563 :         if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
     601     2019316 :                 if (iocb->ki_flags & IOCB_NOWAIT)
     602             :                         return -EAGAIN;
     603     2019316 : retry_exclusive:
     604     2779949 :                 iolock = XFS_IOLOCK_EXCL;
     605     2779949 :                 flags = IOMAP_DIO_FORCE_WAIT;
     606             :         }
     607             : 
     608     6251196 :         ret = xfs_ilock_iocb(iocb, iolock);
     609     6251158 :         if (ret)
     610             :                 return ret;
     611             : 
     612             :         /*
     613             :          * We can't properly handle unaligned direct I/O to reflink files yet,
     614             :          * as we can't unshare a partial block.
     615             :          */
     616     6251158 :         if (xfs_is_cow_inode(ip)) {
     617     3038297 :                 trace_xfs_reflink_bounce_dio_write(iocb, from);
     618     3038303 :                 ret = -ENOTBLK;
     619     3038303 :                 goto out_unlock;
     620             :         }
     621             : 
     622     3212861 :         ret = xfs_file_write_checks(iocb, from, &iolock);
     623     3212874 :         if (ret)
     624          58 :                 goto out_unlock;
     625             : 
     626             :         /*
     627             :          * If we are doing exclusive unaligned I/O, this must be the only I/O
     628             :          * in-flight.  Otherwise we risk data corruption due to unwritten extent
     629             :          * conversions from the AIO end_io handler.  Wait for all other I/O to
     630             :          * drain first.
     631             :          */
     632     3212816 :         if (flags & IOMAP_DIO_FORCE_WAIT)
     633     2338008 :                 inode_dio_wait(VFS_I(ip));
     634             : 
     635     3212815 :         trace_xfs_file_direct_write(iocb, from);
     636     3212799 :         ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
     637             :                            &xfs_dio_write_ops, flags, NULL, 0);
     638             : 
     639             :         /*
     640             :          * Retry unaligned I/O with exclusive blocking semantics if the DIO
     641             :          * layer rejected it for mapping or locking reasons. If we are doing
     642             :          * nonblocking user I/O, propagate the error.
     643             :          */
     644     3212751 :         if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
     645      760635 :                 ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
     646      760635 :                 xfs_iunlock(ip, iolock);
     647      760633 :                 goto retry_exclusive;
     648             :         }
     649             : 
     650     2452116 : out_unlock:
     651     5490477 :         if (iolock)
     652     5490482 :                 xfs_iunlock(ip, iolock);
     653             :         return ret;
     654             : }
     655             : 
     656             : static ssize_t
     657     9111186 : xfs_file_dio_write(
     658             :         struct kiocb            *iocb,
     659             :         struct iov_iter         *from)
     660             : {
     661     9111186 :         struct xfs_inode        *ip = XFS_I(file_inode(iocb->ki_filp));
     662     9111186 :         struct xfs_buftarg      *target = xfs_inode_buftarg(ip);
     663     9111186 :         size_t                  count = iov_iter_count(from);
     664             : 
     665             :         /* direct I/O must be aligned to device logical sector size */
     666     9111186 :         if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
     667             :                 return -EINVAL;
     668     9111186 :         if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
     669     5490536 :                 return xfs_file_dio_write_unaligned(ip, iocb, from);
     670     3620650 :         return xfs_file_dio_write_aligned(ip, iocb, from);
     671             : }
     672             : 
     673             : static noinline ssize_t
     674             : xfs_file_dax_write(
     675             :         struct kiocb            *iocb,
     676             :         struct iov_iter         *from)
     677             : {
     678             :         struct inode            *inode = iocb->ki_filp->f_mapping->host;
     679             :         struct xfs_inode        *ip = XFS_I(inode);
     680             :         unsigned int            iolock = XFS_IOLOCK_EXCL;
     681             :         ssize_t                 ret, error = 0;
     682             :         loff_t                  pos;
     683             : 
     684             :         ret = xfs_ilock_iocb(iocb, iolock);
     685             :         if (ret)
     686             :                 return ret;
     687             :         ret = xfs_file_write_checks(iocb, from, &iolock);
     688             :         if (ret)
     689             :                 goto out;
     690             : 
     691             :         pos = iocb->ki_pos;
     692             : 
     693             :         trace_xfs_file_dax_write(iocb, from);
     694             :         ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
     695             :         if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
     696             :                 i_size_write(inode, iocb->ki_pos);
     697             :                 error = xfs_setfilesize(ip, pos, ret);
     698             :         }
     699             : out:
     700             :         if (iolock)
     701             :                 xfs_iunlock(ip, iolock);
     702             :         if (error)
     703             :                 return error;
     704             : 
     705             :         if (ret > 0) {
     706             :                 XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
     707             : 
     708             :                 /* Handle various SYNC-type writes */
     709             :                 ret = generic_write_sync(iocb, ret);
     710             :         }
     711             :         return ret;
     712             : }
     713             : 
     714             : STATIC ssize_t
     715    53347754 : xfs_file_buffered_write(
     716             :         struct kiocb            *iocb,
     717             :         struct iov_iter         *from)
     718             : {
     719    53347754 :         struct inode            *inode = iocb->ki_filp->f_mapping->host;
     720    53347754 :         struct xfs_inode        *ip = XFS_I(inode);
     721    53347754 :         ssize_t                 ret;
     722    53347754 :         bool                    cleared_space = false;
     723    53653059 :         unsigned int            iolock;
     724             : 
     725             : write_retry:
     726    53653059 :         iolock = XFS_IOLOCK_EXCL;
     727    53653059 :         ret = xfs_ilock_iocb(iocb, iolock);
     728    53667681 :         if (ret)
     729           0 :                 return ret;
     730             : 
     731    53667681 :         ret = xfs_file_write_checks(iocb, from, &iolock);
     732    53666557 :         if (ret)
     733         692 :                 goto out;
     734             : 
     735    53665865 :         trace_xfs_file_buffered_write(iocb, from);
     736    53663723 :         ret = iomap_file_buffered_write(iocb, from,
     737             :                         &xfs_buffered_write_iomap_ops);
     738             : 
     739             :         /*
     740             :          * If we hit a space limit, try to free up some lingering preallocated
     741             :          * space before returning an error. In the case of ENOSPC, first try to
     742             :          * write back all dirty inodes to free up some of the excess reserved
     743             :          * metadata space. This reduces the chances that the eofblocks scan
     744             :          * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
     745             :          * also behaves as a filter to prevent too many eofblocks scans from
     746             :          * running at the same time.  Use a synchronous scan to increase the
     747             :          * effectiveness of the scan.
     748             :          */
     749    53607357 :         if (ret == -EDQUOT && !cleared_space) {
     750         212 :                 xfs_iunlock(ip, iolock);
     751         212 :                 xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC);
     752         212 :                 cleared_space = true;
     753         212 :                 goto write_retry;
     754    53607145 :         } else if (ret == -ENOSPC && !cleared_space) {
     755      305100 :                 struct xfs_icwalk       icw = {0};
     756             : 
     757      305100 :                 cleared_space = true;
     758      305100 :                 xfs_flush_inodes(ip->i_mount);
     759             : 
     760      304929 :                 xfs_iunlock(ip, iolock);
     761      304919 :                 icw.icw_flags = XFS_ICWALK_FLAG_SYNC;
     762      304919 :                 xfs_blockgc_free_space(ip->i_mount, &icw);
     763      305093 :                 goto write_retry;
     764             :         }
     765             : 
     766    53302045 : out:
     767    53302737 :         if (iolock)
     768    53362309 :                 xfs_iunlock(ip, iolock);
     769             : 
     770    53298104 :         if (ret > 0) {
     771    53086690 :                 XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
     772             :                 /* Handle various SYNC-type writes */
     773    53086690 :                 ret = generic_write_sync(iocb, ret);
     774             :         }
     775             :         return ret;
     776             : }
     777             : 
     778             : STATIC ssize_t
     779    59440418 : xfs_file_write_iter(
     780             :         struct kiocb            *iocb,
     781             :         struct iov_iter         *from)
     782             : {
     783    59440418 :         struct inode            *inode = iocb->ki_filp->f_mapping->host;
     784    59440418 :         struct xfs_inode        *ip = XFS_I(inode);
     785    59440418 :         ssize_t                 ret;
     786    59440418 :         size_t                  ocount = iov_iter_count(from);
     787             : 
     788    59440418 :         XFS_STATS_INC(ip->i_mount, xs_write_calls);
     789             : 
     790    59440418 :         if (ocount == 0)
     791             :                 return 0;
     792             : 
     793   118880808 :         if (xfs_is_shutdown(ip->i_mount))
     794             :                 return -EIO;
     795             : 
     796    59429646 :         if (IS_DAX(inode))
     797             :                 return xfs_file_dax_write(iocb, from);
     798             : 
     799    59429646 :         if (iocb->ki_flags & IOCB_DIRECT) {
     800             :                 /*
     801             :                  * Allow a directio write to fall back to a buffered
     802             :                  * write *only* in the case that we're doing a reflink
     803             :                  * CoW.  In all other directio scenarios we do not
     804             :                  * allow an operation to fall back to buffered mode.
     805             :                  */
     806     9111255 :                 ret = xfs_file_dio_write(iocb, from);
     807     9110890 :                 if (ret != -ENOTBLK)
     808             :                         return ret;
     809             :         }
     810             : 
     811    53356790 :         return xfs_file_buffered_write(iocb, from);
     812             : }
     813             : 
     814             : /* Does this file, inode, or mount want synchronous writes? */
     815   223734871 : static inline bool xfs_file_sync_writes(struct file *filp)
     816             : {
     817   223734871 :         struct xfs_inode        *ip = XFS_I(file_inode(filp));
     818             : 
     819   223734871 :         if (xfs_has_wsync(ip->i_mount))
     820             :                 return true;
     821   223734863 :         if (filp->f_flags & (__O_SYNC | O_DSYNC))
     822             :                 return true;
     823   223727729 :         if (IS_SYNC(file_inode(filp)))
     824           4 :                 return true;
     825             : 
     826             :         return false;
     827             : }
     828             : 
     829             : #define XFS_FALLOC_FL_SUPPORTED                                         \
     830             :                 (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |           \
     831             :                  FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |      \
     832             :                  FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)
     833             : 
     834             : STATIC long
     835    14496333 : xfs_file_fallocate(
     836             :         struct file             *file,
     837             :         int                     mode,
     838             :         loff_t                  offset,
     839             :         loff_t                  len)
     840             : {
     841    14496333 :         struct inode            *inode = file_inode(file);
     842    14496333 :         struct xfs_inode        *ip = XFS_I(inode);
     843    14496333 :         long                    error;
     844    14496333 :         uint                    iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
     845    14496333 :         loff_t                  new_size = 0;
     846    14496333 :         bool                    do_file_insert = false;
     847             : 
     848    14496333 :         if (!S_ISREG(inode->i_mode))
     849             :                 return -EINVAL;
     850    14496333 :         if (mode & ~XFS_FALLOC_FL_SUPPORTED)
     851             :                 return -EOPNOTSUPP;
     852             : 
     853    14496333 :         xfs_ilock(ip, iolock);
     854    14496507 :         error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
     855    14496473 :         if (error)
     856           0 :                 goto out_unlock;
     857             : 
     858             :         /*
     859             :          * Must wait for all AIO to complete before we continue as AIO can
     860             :          * change the file size on completion without holding any locks we
     861             :          * currently hold. We must do this first because AIO can update both
     862             :          * the on disk and in memory inode sizes, and the operations that follow
     863             :          * require the in-memory size to be fully up-to-date.
     864             :          */
     865    14496473 :         inode_dio_wait(inode);
     866             : 
     867             :         /*
     868             :          * Now AIO and DIO has drained we flush and (if necessary) invalidate
     869             :          * the cached range over the first operation we are about to run.
     870             :          *
     871             :          * We care about zero and collapse here because they both run a hole
     872             :          * punch over the range first. Because that can zero data, and the range
     873             :          * of invalidation for the shift operations is much larger, we still do
     874             :          * the required flush for collapse in xfs_prepare_shift().
     875             :          *
     876             :          * Insert has the same range requirements as collapse, and we extend the
     877             :          * file first which can zero data. Hence insert has the same
     878             :          * flush/invalidate requirements as collapse and so they are both
     879             :          * handled at the right time by xfs_prepare_shift().
     880             :          */
     881    14496235 :         if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
     882             :                     FALLOC_FL_COLLAPSE_RANGE)) {
     883    10384121 :                 error = xfs_flush_unmap_range(ip, offset, len);
     884    10384438 :                 if (error)
     885         135 :                         goto out_unlock;
     886             :         }
     887             : 
     888    14496417 :         error = file_modified(file);
     889    14496499 :         if (error)
     890           9 :                 goto out_unlock;
     891             : 
     892    14496490 :         if (mode & FALLOC_FL_PUNCH_HOLE) {
     893     7899185 :                 error = xfs_free_file_space(ip, offset, len);
     894     7899178 :                 if (error)
     895        8115 :                         goto out_unlock;
     896     6597305 :         } else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
     897      822717 :                 if (!xfs_is_falloc_aligned(ip, offset, len)) {
     898      255387 :                         error = -EINVAL;
     899      255387 :                         goto out_unlock;
     900             :                 }
     901             : 
     902             :                 /*
     903             :                  * There is no need to overlap collapse range with EOF,
     904             :                  * in which case it is effectively a truncate operation
     905             :                  */
     906      567330 :                 if (offset + len >= i_size_read(inode)) {
     907      108777 :                         error = -EINVAL;
     908      108777 :                         goto out_unlock;
     909             :                 }
     910             : 
     911      458553 :                 new_size = i_size_read(inode) - len;
     912             : 
     913      458553 :                 error = xfs_collapse_file_space(ip, offset, len);
     914      458553 :                 if (error)
     915        1107 :                         goto out_unlock;
     916     5774588 :         } else if (mode & FALLOC_FL_INSERT_RANGE) {
     917      722778 :                 loff_t          isize = i_size_read(inode);
     918             : 
     919      722778 :                 if (!xfs_is_falloc_aligned(ip, offset, len)) {
     920      254964 :                         error = -EINVAL;
     921      254964 :                         goto out_unlock;
     922             :                 }
     923             : 
     924             :                 /*
     925             :                  * New inode size must not exceed ->s_maxbytes, accounting for
     926             :                  * possible signed overflow.
     927             :                  */
     928      467814 :                 if (inode->i_sb->s_maxbytes - isize < len) {
     929           2 :                         error = -EFBIG;
     930           2 :                         goto out_unlock;
     931             :                 }
     932      467812 :                 new_size = isize + len;
     933             : 
     934             :                 /* Offset should be less than i_size */
     935      467812 :                 if (offset >= isize) {
     936       88509 :                         error = -EINVAL;
     937       88509 :                         goto out_unlock;
     938             :                 }
     939             :                 do_file_insert = true;
     940             :         } else {
     941     5051810 :                 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
     942     2258821 :                     offset + len > i_size_read(inode)) {
     943     1216066 :                         new_size = offset + len;
     944     1216066 :                         error = inode_newsize_ok(inode, new_size);
     945     1216064 :                         if (error)
     946           2 :                                 goto out_unlock;
     947             :                 }
     948             : 
     949     5051806 :                 if (mode & FALLOC_FL_ZERO_RANGE) {
     950             :                         /*
     951             :                          * Punch a hole and prealloc the range.  We use a hole
     952             :                          * punch rather than unwritten extent conversion for two
     953             :                          * reasons:
     954             :                          *
     955             :                          *   1.) Hole punch handles partial block zeroing for us.
     956             :                          *   2.) If prealloc returns ENOSPC, the file range is
     957             :                          *       still zero-valued by virtue of the hole punch.
     958             :                          */
     959     1662450 :                         unsigned int blksize = i_blocksize(inode);
     960             : 
     961     1662450 :                         trace_xfs_zero_file_space(ip);
     962             : 
     963     1662451 :                         error = xfs_free_file_space(ip, offset, len);
     964     1662448 :                         if (error)
     965        5305 :                                 goto out_unlock;
     966             : 
     967     1657143 :                         len = round_up(offset + len, blksize) -
     968     1657143 :                               round_down(offset, blksize);
     969     1657143 :                         offset = round_down(offset, blksize);
     970     3389356 :                 } else if (mode & FALLOC_FL_UNSHARE_RANGE) {
     971          66 :                         error = xfs_reflink_unshare(ip, offset, len);
     972          66 :                         if (error)
     973           2 :                                 goto out_unlock;
     974             :                 } else {
     975             :                         /*
     976             :                          * If always_cow mode we can't use preallocations and
     977             :                          * thus should not create them.
     978             :                          */
     979     3389290 :                         if (xfs_is_always_cow_inode(ip)) {
     980           0 :                                 error = -EOPNOTSUPP;
     981           0 :                                 goto out_unlock;
     982             :                         }
     983             :                 }
     984             : 
     985     5046497 :                 if (!xfs_is_always_cow_inode(ip)) {
     986     5046492 :                         error = xfs_alloc_file_space(ip, offset, len);
     987     5046507 :                         if (error)
     988       74798 :                                 goto out_unlock;
     989             :                 }
     990             :         }
     991             : 
     992             :         /* Change file size if needed */
     993    13699526 :         if (new_size) {
     994     2047399 :                 struct iattr iattr;
     995             : 
     996     2047399 :                 iattr.ia_valid = ATTR_SIZE;
     997     2047399 :                 iattr.ia_size = new_size;
     998     4094798 :                 error = xfs_vn_setattr_size(file_mnt_idmap(file),
     999             :                                             file_dentry(file), &iattr);
    1000     2047398 :                 if (error)
    1001         350 :                         goto out_unlock;
    1002             :         }
    1003             : 
    1004             :         /*
    1005             :          * Perform hole insertion now that the file size has been
    1006             :          * updated so that if we crash during the operation we don't
    1007             :          * leave shifted extents past EOF and hence losing access to
    1008             :          * the data that is contained within them.
    1009             :          */
    1010    13699175 :         if (do_file_insert) {
    1011      379186 :                 error = xfs_insert_file_space(ip, offset, len);
    1012      379186 :                 if (error)
    1013        1152 :                         goto out_unlock;
    1014             :         }
    1015             : 
    1016    13698023 :         if (xfs_file_sync_writes(file))
    1017        7130 :                 error = xfs_log_force_inode(ip);
    1018             : 
    1019    13690893 : out_unlock:
    1020    14496637 :         xfs_iunlock(ip, iolock);
    1021    14496637 :         return error;
    1022             : }
    1023             : 
    1024             : STATIC int
    1025     2118205 : xfs_file_fadvise(
    1026             :         struct file     *file,
    1027             :         loff_t          start,
    1028             :         loff_t          end,
    1029             :         int             advice)
    1030             : {
    1031     2118205 :         struct xfs_inode *ip = XFS_I(file_inode(file));
    1032     2118205 :         int ret;
    1033     2118205 :         int lockflags = 0;
    1034             : 
    1035             :         /*
    1036             :          * Operations creating pages in page cache need protection from hole
    1037             :          * punching and similar ops
    1038             :          */
    1039     2118205 :         if (advice == POSIX_FADV_WILLNEED) {
    1040           0 :                 lockflags = XFS_IOLOCK_SHARED;
    1041           0 :                 xfs_ilock(ip, lockflags);
    1042             :         }
    1043     2118205 :         ret = generic_fadvise(file, start, end, advice);
    1044     2119609 :         if (lockflags)
    1045           0 :                 xfs_iunlock(ip, lockflags);
    1046     2119609 :         return ret;
    1047             : }
    1048             : 
    1049             : STATIC loff_t
    1050   198376715 : xfs_file_remap_range(
    1051             :         struct file             *file_in,
    1052             :         loff_t                  pos_in,
    1053             :         struct file             *file_out,
    1054             :         loff_t                  pos_out,
    1055             :         loff_t                  len,
    1056             :         unsigned int            remap_flags)
    1057             : {
    1058   198376715 :         struct inode            *inode_in = file_inode(file_in);
    1059   198376715 :         struct xfs_inode        *src = XFS_I(inode_in);
    1060   198376715 :         struct inode            *inode_out = file_inode(file_out);
    1061   198376715 :         struct xfs_inode        *dest = XFS_I(inode_out);
    1062   198376715 :         struct xfs_mount        *mp = src->i_mount;
    1063   198376715 :         loff_t                  remapped = 0;
    1064   198376715 :         xfs_extlen_t            cowextsize;
    1065   198376715 :         int                     ret;
    1066             : 
    1067   198376715 :         if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
    1068             :                 return -EINVAL;
    1069             : 
    1070   198376715 :         if (!xfs_has_reflink(mp))
    1071             :                 return -EOPNOTSUPP;
    1072             : 
    1073   274486162 :         if (xfs_is_shutdown(mp))
    1074             :                 return -EIO;
    1075             : 
    1076             :         /* Prepare and then clone file data. */
    1077   137240703 :         ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
    1078             :                         &len, remap_flags);
    1079   137246518 :         if (ret || len == 0)
    1080    31741456 :                 return ret;
    1081             : 
    1082   105505062 :         trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
    1083             : 
    1084   105504766 :         ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
    1085             :                         &remapped);
    1086   105505245 :         if (ret)
    1087      485513 :                 goto out_unlock;
    1088             : 
    1089             :         /*
    1090             :          * Carry the cowextsize hint from src to dest if we're sharing the
    1091             :          * entire source file to the entire destination file, the source file
    1092             :          * has a cowextsize hint, and the destination file does not.
    1093             :          */
    1094   105019732 :         cowextsize = 0;
    1095   105019732 :         if (pos_in == 0 && len == i_size_read(inode_in) &&
    1096       43138 :             (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
    1097          48 :             pos_out == 0 && len >= i_size_read(inode_out) &&
    1098          46 :             !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE))
    1099           6 :                 cowextsize = src->i_cowextsize;
    1100             : 
    1101   105019732 :         ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
    1102             :                         remap_flags);
    1103   105019847 :         if (ret)
    1104           0 :                 goto out_unlock;
    1105             : 
    1106   105019847 :         if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
    1107         275 :                 xfs_log_force_inode(dest);
    1108   105019755 : out_unlock:
    1109   105505284 :         xfs_iunlock2_io_mmap(src, dest);
    1110   105505136 :         if (ret)
    1111      485546 :                 trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
    1112   105505135 :         return remapped > 0 ? remapped : ret;
    1113             : }
    1114             : 
    1115             : STATIC int
    1116   371641553 : xfs_file_open(
    1117             :         struct inode    *inode,
    1118             :         struct file     *file)
    1119             : {
    1120   743283106 :         if (xfs_is_shutdown(XFS_M(inode->i_sb)))
    1121             :                 return -EIO;
    1122   371627805 :         file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC |
    1123             :                         FMODE_DIO_PARALLEL_WRITE | FMODE_CAN_ODIRECT;
    1124   371627805 :         return generic_file_open(inode, file);
    1125             : }
    1126             : 
    1127             : STATIC int
    1128    28954998 : xfs_dir_open(
    1129             :         struct inode    *inode,
    1130             :         struct file     *file)
    1131             : {
    1132    28954998 :         struct xfs_inode *ip = XFS_I(inode);
    1133    28954998 :         unsigned int    mode;
    1134    28954998 :         int             error;
    1135             : 
    1136    28954998 :         error = xfs_file_open(inode, file);
    1137    28954824 :         if (error)
    1138             :                 return error;
    1139             : 
    1140             :         /*
    1141             :          * If there are any blocks, read-ahead block 0 as we're almost
    1142             :          * certain to have the next operation be a read there.
    1143             :          */
    1144    28953824 :         mode = xfs_ilock_data_map_shared(ip);
    1145    28954177 :         if (ip->i_df.if_nextents > 0)
    1146     7195713 :                 error = xfs_dir3_data_readahead(ip, 0, 0);
    1147    28954705 :         xfs_iunlock(ip, mode);
    1148    28954705 :         return error;
    1149             : }
    1150             : 
    1151             : /*
    1152             :  * When we release the file, we don't want it to trim EOF blocks if it is a
    1153             :  * readonly context.  This avoids open/read/close workloads from removing
    1154             :  * EOF blocks that other writers depend upon to reduce fragmentation.
    1155             :  */
    1156             : STATIC int
    1157   342655243 : xfs_file_release(
    1158             :         struct inode    *inode,
    1159             :         struct file     *file)
    1160             : {
    1161   342655243 :         bool            free_eof_blocks = true;
    1162             : 
    1163   342655243 :         if ((file->f_mode & (FMODE_WRITE | FMODE_READ)) == FMODE_READ)
    1164    53137343 :                 free_eof_blocks = false;
    1165             : 
    1166   342655243 :         return xfs_release(XFS_I(inode), free_eof_blocks);
    1167             : }
    1168             : 
    1169             : STATIC int
    1170    55395676 : xfs_file_readdir(
    1171             :         struct file     *file,
    1172             :         struct dir_context *ctx)
    1173             : {
    1174    55395676 :         struct inode    *inode = file_inode(file);
    1175    55395676 :         xfs_inode_t     *ip = XFS_I(inode);
    1176    55395676 :         size_t          bufsize;
    1177             : 
    1178             :         /*
    1179             :          * The Linux API doesn't pass down the total size of the buffer
    1180             :          * we read into down to the filesystem.  With the filldir concept
    1181             :          * it's not needed for correct information, but the XFS dir2 leaf
    1182             :          * code wants an estimate of the buffer size to calculate it's
    1183             :          * readahead window and size the buffers used for mapping to
    1184             :          * physical blocks.
    1185             :          *
    1186             :          * Try to give it an estimate that's good enough, maybe at some
    1187             :          * point we can change the ->readdir prototype to include the
    1188             :          * buffer size.  For now we use the current glibc buffer size.
    1189             :          */
    1190    55395676 :         bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size);
    1191             : 
    1192    55395676 :         return xfs_readdir(NULL, ip, ctx, bufsize);
    1193             : }
    1194             : 
    1195             : STATIC loff_t
    1196    37917342 : xfs_file_llseek(
    1197             :         struct file     *file,
    1198             :         loff_t          offset,
    1199             :         int             whence)
    1200             : {
    1201    37917342 :         struct inode            *inode = file->f_mapping->host;
    1202             : 
    1203    75834684 :         if (xfs_is_shutdown(XFS_I(inode)->i_mount))
    1204             :                 return -EIO;
    1205             : 
    1206    37917333 :         switch (whence) {
    1207    37698490 :         default:
    1208    37698490 :                 return generic_file_llseek(file, offset, whence);
    1209         326 :         case SEEK_HOLE:
    1210         326 :                 offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
    1211         326 :                 break;
    1212      218517 :         case SEEK_DATA:
    1213      218517 :                 offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
    1214      218517 :                 break;
    1215             :         }
    1216             : 
    1217      218843 :         if (offset < 0)
    1218             :                 return offset;
    1219      169896 :         return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
    1220             : }
    1221             : 
    1222             : #ifdef CONFIG_FS_DAX
    1223             : static inline vm_fault_t
    1224             : xfs_dax_fault(
    1225             :         struct vm_fault         *vmf,
    1226             :         enum page_entry_size    pe_size,
    1227             :         bool                    write_fault,
    1228             :         pfn_t                   *pfn)
    1229             : {
    1230             :         return dax_iomap_fault(vmf, pe_size, pfn, NULL,
    1231             :                         (write_fault && !vmf->cow_page) ?
    1232             :                                 &xfs_dax_write_iomap_ops :
    1233             :                                 &xfs_read_iomap_ops);
    1234             : }
    1235             : #else
    1236             : static inline vm_fault_t
    1237             : xfs_dax_fault(
    1238             :         struct vm_fault         *vmf,
    1239             :         enum page_entry_size    pe_size,
    1240             :         bool                    write_fault,
    1241             :         pfn_t                   *pfn)
    1242             : {
    1243             :         ASSERT(0);
    1244             :         return VM_FAULT_SIGBUS;
    1245             : }
    1246             : #endif
    1247             : 
    1248             : /*
    1249             :  * Locking for serialisation of IO during page faults. This results in a lock
    1250             :  * ordering of:
    1251             :  *
    1252             :  * mmap_lock (MM)
    1253             :  *   sb_start_pagefault(vfs, freeze)
    1254             :  *     invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
    1255             :  *       page_lock (MM)
    1256             :  *         i_lock (XFS - extent map serialisation)
    1257             :  */
    1258             : static vm_fault_t
    1259    14418830 : __xfs_filemap_fault(
    1260             :         struct vm_fault         *vmf,
    1261             :         enum page_entry_size    pe_size,
    1262             :         bool                    write_fault)
    1263             : {
    1264    14418830 :         struct inode            *inode = file_inode(vmf->vma->vm_file);
    1265    14418830 :         struct xfs_inode        *ip = XFS_I(inode);
    1266    14418830 :         vm_fault_t              ret;
    1267             : 
    1268    14418830 :         trace_xfs_filemap_fault(ip, pe_size, write_fault);
    1269             : 
    1270    14421056 :         if (write_fault) {
    1271     3579169 :                 sb_start_pagefault(inode->i_sb);
    1272     3579284 :                 file_update_time(vmf->vma->vm_file);
    1273             :         }
    1274             : 
    1275    14421072 :         if (IS_DAX(inode)) {
    1276             :                 pfn_t pfn;
    1277             : 
    1278             :                 xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
    1279             :                 ret = xfs_dax_fault(vmf, pe_size, write_fault, &pfn);
    1280             :                 if (ret & VM_FAULT_NEEDDSYNC)
    1281             :                         ret = dax_finish_sync_fault(vmf, pe_size, pfn);
    1282             :                 xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
    1283             :         } else {
    1284    14421072 :                 if (write_fault) {
    1285     3579280 :                         xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
    1286     3579312 :                         ret = iomap_page_mkwrite(vmf,
    1287             :                                         &xfs_page_mkwrite_iomap_ops);
    1288     3579309 :                         xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
    1289             :                 } else {
    1290    10841792 :                         ret = filemap_fault(vmf);
    1291             :                 }
    1292             :         }
    1293             : 
    1294    14421108 :         if (write_fault)
    1295     3579273 :                 sb_end_pagefault(inode->i_sb);
    1296    14421147 :         return ret;
    1297             : }
    1298             : 
    1299             : static inline bool
    1300             : xfs_is_write_fault(
    1301             :         struct vm_fault         *vmf)
    1302             : {
    1303             :         return (vmf->flags & FAULT_FLAG_WRITE) &&
    1304             :                (vmf->vma->vm_flags & VM_SHARED);
    1305             : }
    1306             : 
    1307             : static vm_fault_t
    1308    10841569 : xfs_filemap_fault(
    1309             :         struct vm_fault         *vmf)
    1310             : {
    1311             :         /* DAX can shortcut the normal fault path on write faults! */
    1312    10841569 :         return __xfs_filemap_fault(vmf, PE_SIZE_PTE,
    1313             :                         IS_DAX(file_inode(vmf->vma->vm_file)) &&
    1314             :                         xfs_is_write_fault(vmf));
    1315             : }
    1316             : 
    1317             : static vm_fault_t
    1318           0 : xfs_filemap_huge_fault(
    1319             :         struct vm_fault         *vmf,
    1320             :         enum page_entry_size    pe_size)
    1321             : {
    1322           0 :         if (!IS_DAX(file_inode(vmf->vma->vm_file)))
    1323           0 :                 return VM_FAULT_FALLBACK;
    1324             : 
    1325             :         /* DAX can shortcut the normal fault path on write faults! */
    1326             :         return __xfs_filemap_fault(vmf, pe_size,
    1327             :                         xfs_is_write_fault(vmf));
    1328             : }
    1329             : 
    1330             : static vm_fault_t
    1331     3578983 : xfs_filemap_page_mkwrite(
    1332             :         struct vm_fault         *vmf)
    1333             : {
    1334     3578983 :         return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
    1335             : }
    1336             : 
    1337             : /*
    1338             :  * pfn_mkwrite was originally intended to ensure we capture time stamp updates
    1339             :  * on write faults. In reality, it needs to serialise against truncate and
    1340             :  * prepare memory for writing so handle is as standard write fault.
    1341             :  */
    1342             : static vm_fault_t
    1343           0 : xfs_filemap_pfn_mkwrite(
    1344             :         struct vm_fault         *vmf)
    1345             : {
    1346             : 
    1347           0 :         return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
    1348             : }
    1349             : 
    1350             : static const struct vm_operations_struct xfs_file_vm_ops = {
    1351             :         .fault          = xfs_filemap_fault,
    1352             :         .huge_fault     = xfs_filemap_huge_fault,
    1353             :         .map_pages      = filemap_map_pages,
    1354             :         .page_mkwrite   = xfs_filemap_page_mkwrite,
    1355             :         .pfn_mkwrite    = xfs_filemap_pfn_mkwrite,
    1356             : };
    1357             : 
    1358             : STATIC int
    1359     7637390 : xfs_file_mmap(
    1360             :         struct file             *file,
    1361             :         struct vm_area_struct   *vma)
    1362             : {
    1363     7637390 :         struct inode            *inode = file_inode(file);
    1364     7637390 :         struct xfs_buftarg      *target = xfs_inode_buftarg(XFS_I(inode));
    1365             : 
    1366             :         /*
    1367             :          * We don't support synchronous mappings for non-DAX files and
    1368             :          * for DAX files if underneath dax_device is not synchronous.
    1369             :          */
    1370     7637390 :         if (!daxdev_mapping_supported(vma, target->bt_daxdev))
    1371             :                 return -EOPNOTSUPP;
    1372             : 
    1373     7637383 :         file_accessed(file);
    1374     7637420 :         vma->vm_ops = &xfs_file_vm_ops;
    1375     7637420 :         if (IS_DAX(inode))
    1376             :                 vm_flags_set(vma, VM_HUGEPAGE);
    1377     7637420 :         return 0;
    1378             : }
    1379             : 
    1380             : const struct file_operations xfs_file_operations = {
    1381             :         .llseek         = xfs_file_llseek,
    1382             :         .read_iter      = xfs_file_read_iter,
    1383             :         .write_iter     = xfs_file_write_iter,
    1384             :         .splice_read    = xfs_file_splice_read,
    1385             :         .splice_write   = iter_file_splice_write,
    1386             :         .iopoll         = iocb_bio_iopoll,
    1387             :         .unlocked_ioctl = xfs_file_ioctl,
    1388             : #ifdef CONFIG_COMPAT
    1389             :         .compat_ioctl   = xfs_file_compat_ioctl,
    1390             : #endif
    1391             :         .mmap           = xfs_file_mmap,
    1392             :         .mmap_supported_flags = MAP_SYNC,
    1393             :         .open           = xfs_file_open,
    1394             :         .release        = xfs_file_release,
    1395             :         .fsync          = xfs_file_fsync,
    1396             :         .get_unmapped_area = thp_get_unmapped_area,
    1397             :         .fallocate      = xfs_file_fallocate,
    1398             :         .fadvise        = xfs_file_fadvise,
    1399             :         .remap_file_range = xfs_file_remap_range,
    1400             : };
    1401             : 
    1402             : const struct file_operations xfs_dir_file_operations = {
    1403             :         .open           = xfs_dir_open,
    1404             :         .read           = generic_read_dir,
    1405             :         .iterate_shared = xfs_file_readdir,
    1406             :         .llseek         = generic_file_llseek,
    1407             :         .unlocked_ioctl = xfs_file_ioctl,
    1408             : #ifdef CONFIG_COMPAT
    1409             :         .compat_ioctl   = xfs_file_compat_ioctl,
    1410             : #endif
    1411             :         .fsync          = xfs_dir_fsync,
    1412             : };

Generated by: LCOV version 1.14