Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * Copyright (c) 2000-2005 Silicon Graphics, Inc.
4 : * All Rights Reserved.
5 : */
6 : #include "xfs.h"
7 : #include "xfs_fs.h"
8 : #include "xfs_shared.h"
9 : #include "xfs_format.h"
10 : #include "xfs_log_format.h"
11 : #include "xfs_trans_resv.h"
12 : #include "xfs_mount.h"
13 : #include "xfs_inode.h"
14 : #include "xfs_trans.h"
15 : #include "xfs_inode_item.h"
16 : #include "xfs_bmap.h"
17 : #include "xfs_bmap_util.h"
18 : #include "xfs_dir2.h"
19 : #include "xfs_dir2_priv.h"
20 : #include "xfs_ioctl.h"
21 : #include "xfs_trace.h"
22 : #include "xfs_log.h"
23 : #include "xfs_icache.h"
24 : #include "xfs_pnfs.h"
25 : #include "xfs_iomap.h"
26 : #include "xfs_reflink.h"
27 : #include "xfs_file.h"
28 :
29 : #include <linux/dax.h>
30 : #include <linux/falloc.h>
31 : #include <linux/backing-dev.h>
32 : #include <linux/mman.h>
33 : #include <linux/fadvise.h>
34 : #include <linux/mount.h>
35 :
36 : static const struct vm_operations_struct xfs_file_vm_ops;
37 :
38 : /*
39 : * Decide if the given file range is aligned to the size of the fundamental
40 : * allocation unit for the file.
41 : */
42 : bool
43 1545492 : xfs_is_falloc_aligned(
44 : struct xfs_inode *ip,
45 : loff_t pos,
46 : long long int len)
47 : {
48 1545492 : unsigned int alloc_unit = xfs_inode_alloc_unitsize(ip);
49 :
50 2024502 : if (XFS_IS_REALTIME_INODE(ip) && !is_power_of_2(alloc_unit))
51 8 : return isaligned_64(pos, alloc_unit) &&
52 4 : isaligned_64(len, alloc_unit);
53 :
54 1545484 : return !((pos | len) & (alloc_unit - 1));
55 : }
56 :
57 : /*
58 : * Fsync operations on directories are much simpler than on regular files,
59 : * as there is no file data to flush, and thus also no need for explicit
60 : * cache flush operations, and there are no non-transactional metadata updates
61 : * on directories either.
62 : */
63 : STATIC int
64 511825 : xfs_dir_fsync(
65 : struct file *file,
66 : loff_t start,
67 : loff_t end,
68 : int datasync)
69 : {
70 511825 : struct xfs_inode *ip = XFS_I(file->f_mapping->host);
71 :
72 511825 : trace_xfs_dir_fsync(ip);
73 511826 : return xfs_log_force_inode(ip);
74 : }
75 :
76 : static xfs_csn_t
77 1854237 : xfs_fsync_seq(
78 : struct xfs_inode *ip,
79 : bool datasync)
80 : {
81 1854237 : if (!xfs_ipincount(ip))
82 : return 0;
83 1854111 : if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
84 : return 0;
85 1786971 : return ip->i_itemp->ili_commit_seq;
86 : }
87 :
88 : /*
89 : * All metadata updates are logged, which means that we just have to flush the
90 : * log up to the latest LSN that touched the inode.
91 : *
92 : * If we have concurrent fsync/fdatasync() calls, we need them to all block on
93 : * the log force before we clear the ili_fsync_fields field. This ensures that
94 : * we don't get a racing sync operation that does not wait for the metadata to
95 : * hit the journal before returning. If we race with clearing ili_fsync_fields,
96 : * then all that will happen is the log force will do nothing as the LSN will
97 : * already be on disk. We can't race with setting ili_fsync_fields because that
98 : * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
99 : * shared until after the ili_fsync_fields is cleared.
100 : */
101 : static int
102 1854265 : xfs_fsync_flush_log(
103 : struct xfs_inode *ip,
104 : bool datasync,
105 : int *log_flushed)
106 : {
107 1854265 : int error = 0;
108 1854265 : xfs_csn_t seq;
109 :
110 1854265 : xfs_ilock(ip, XFS_ILOCK_SHARED);
111 1854265 : seq = xfs_fsync_seq(ip, datasync);
112 1854258 : if (seq) {
113 1786991 : error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
114 : log_flushed);
115 :
116 1785417 : spin_lock(&ip->i_itemp->ili_lock);
117 1786953 : ip->i_itemp->ili_fsync_fields = 0;
118 1786953 : spin_unlock(&ip->i_itemp->ili_lock);
119 : }
120 1854093 : xfs_iunlock(ip, XFS_ILOCK_SHARED);
121 1853965 : return error;
122 : }
123 :
124 : STATIC int
125 16555924 : xfs_file_fsync(
126 : struct file *file,
127 : loff_t start,
128 : loff_t end,
129 : int datasync)
130 : {
131 16555924 : struct xfs_inode *ip = XFS_I(file->f_mapping->host);
132 16555924 : struct xfs_mount *mp = ip->i_mount;
133 16555924 : int error, err2;
134 16555924 : int log_flushed = 0;
135 :
136 16555924 : trace_xfs_file_fsync(ip);
137 :
138 16555526 : error = file_write_and_wait_range(file, start, end);
139 16555676 : if (error)
140 : return error;
141 :
142 33108314 : if (xfs_is_shutdown(mp))
143 : return -EIO;
144 :
145 16552130 : xfs_iflags_clear(ip, XFS_ITRUNCATED);
146 :
147 : /*
148 : * If we have an RT and/or log subvolume, we need to make sure to flush
149 : * the write cache of the device used for file data first. This is to
150 : * ensure newly written file data makes it to disk before logging the new
151 : * inode size in case of an extending write.
152 : */
153 16552284 : if (XFS_IS_REALTIME_INODE(ip))
154 4835735 : error = xfs_buftarg_flush(mp->m_rtdev_targp);
155 11716549 : else if (mp->m_logdev_targp != mp->m_ddev_targp)
156 0 : error = xfs_buftarg_flush(mp->m_ddev_targp);
157 :
158 : /*
159 : * Any inode that has dirty modifications in the log is pinned. The
160 : * racy check here for a pinned inode will not catch modifications
161 : * that happen concurrently to the fsync call, but fsync semantics
162 : * only require to sync previously completed I/O.
163 : */
164 16551322 : if (xfs_ipincount(ip)) {
165 1854251 : err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
166 1853532 : if (err2 && !error)
167 1048 : error = err2;
168 : }
169 :
170 : * If we only have a single device, and the log force above was
171 : * a no-op, we might have to flush the data device cache here.
172 : * a no-op we might have to flush the data device cache here.
173 : * This can only happen for fdatasync/O_DSYNC if we were overwriting
174 : * an already allocated file and thus do not have any metadata to
175 : * commit.
176 : */
177 16550603 : if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
178 10651207 : mp->m_logdev_targp == mp->m_ddev_targp) {
179 10651006 : err2 = xfs_buftarg_flush(mp->m_ddev_targp);
180 10651104 : if (err2 && !error)
181 244 : error = err2;
182 : }
183 :
184 : return error;
185 : }
186 :
187 : static int
188 386595754 : xfs_ilock_iocb(
189 : struct kiocb *iocb,
190 : unsigned int lock_mode)
191 : {
192 386595754 : struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
193 :
194 386595754 : if (iocb->ki_flags & IOCB_NOWAIT) {
195 0 : if (!xfs_ilock_nowait(ip, lock_mode))
196 0 : return -EAGAIN;
197 : } else {
198 386595754 : xfs_ilock(ip, lock_mode);
199 : }
200 :
201 : return 0;
202 : }
203 :
204 : STATIC ssize_t
205 264621062 : xfs_file_dio_read(
206 : struct kiocb *iocb,
207 : struct iov_iter *to)
208 : {
209 264621062 : struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
210 264621062 : ssize_t ret;
211 :
212 264621062 : trace_xfs_file_direct_read(iocb, to);
213 :
214 264621012 : if (!iov_iter_count(to))
215 : return 0; /* skip atime */
216 :
217 264620986 : file_accessed(iocb->ki_filp);
218 :
219 264621082 : ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
220 264621085 : if (ret)
221 : return ret;
222 264621087 : ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0);
223 264621089 : xfs_iunlock(ip, XFS_IOLOCK_SHARED);
224 :
225 264621089 : return ret;
226 : }
227 :
228 : static noinline ssize_t
229 : xfs_file_dax_read(
230 : struct kiocb *iocb,
231 : struct iov_iter *to)
232 : {
233 : struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
234 : ssize_t ret = 0;
235 :
236 : trace_xfs_file_dax_read(iocb, to);
237 :
238 : if (!iov_iter_count(to))
239 : return 0; /* skip atime */
240 :
241 : ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
242 : if (ret)
243 : return ret;
244 : ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
245 : xfs_iunlock(ip, XFS_IOLOCK_SHARED);
246 :
247 : file_accessed(iocb->ki_filp);
248 : return ret;
249 : }
250 :
251 : STATIC ssize_t
252 58292486 : xfs_file_buffered_read(
253 : struct kiocb *iocb,
254 : struct iov_iter *to)
255 : {
256 58292486 : struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
257 58292486 : ssize_t ret;
258 :
259 58292486 : trace_xfs_file_buffered_read(iocb, to);
260 :
261 58324215 : ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
262 58329890 : if (ret)
263 : return ret;
264 58329614 : ret = generic_file_read_iter(iocb, to);
265 58306421 : xfs_iunlock(ip, XFS_IOLOCK_SHARED);
266 :
267 58306421 : return ret;
268 : }
269 :
270 : STATIC ssize_t
271 322947291 : xfs_file_read_iter(
272 : struct kiocb *iocb,
273 : struct iov_iter *to)
274 : {
275 322947291 : struct inode *inode = file_inode(iocb->ki_filp);
276 322947291 : struct xfs_mount *mp = XFS_I(inode)->i_mount;
277 322947291 : ssize_t ret = 0;
278 :
279 322947291 : XFS_STATS_INC(mp, xs_read_calls);
280 :
281 645894582 : if (xfs_is_shutdown(mp))
282 : return -EIO;
283 :
284 322945149 : if (IS_DAX(inode))
285 : ret = xfs_file_dax_read(iocb, to);
286 322945149 : else if (iocb->ki_flags & IOCB_DIRECT)
287 264620989 : ret = xfs_file_dio_read(iocb, to);
288 : else
289 58324160 : ret = xfs_file_buffered_read(iocb, to);
290 :
291 322948964 : if (ret > 0)
292 60513359 : XFS_STATS_ADD(mp, xs_read_bytes, ret);
293 : return ret;
294 : }
295 :
296 : STATIC ssize_t
297 4917290 : xfs_file_splice_read(
298 : struct file *in,
299 : loff_t *ppos,
300 : struct pipe_inode_info *pipe,
301 : size_t len,
302 : unsigned int flags)
303 : {
304 4917290 : struct inode *inode = file_inode(in);
305 4917290 : struct xfs_inode *ip = XFS_I(inode);
306 4917290 : struct xfs_mount *mp = ip->i_mount;
307 4917290 : ssize_t ret = 0;
308 :
309 4917290 : XFS_STATS_INC(mp, xs_read_calls);
310 :
311 9834580 : if (xfs_is_shutdown(mp))
312 : return -EIO;
313 :
314 4917284 : trace_xfs_file_splice_read(ip, *ppos, len);
315 :
316 4917283 : xfs_ilock(ip, XFS_IOLOCK_SHARED);
317 4917287 : ret = filemap_splice_read(in, ppos, pipe, len, flags);
318 4917321 : xfs_iunlock(ip, XFS_IOLOCK_SHARED);
319 4917321 : if (ret > 0)
320 4917189 : XFS_STATS_ADD(mp, xs_read_bytes, ret);
321 : return ret;
322 : }
323 :
324 : /*
325 : * Common pre-write limit and setup checks.
326 : *
327 : * Called with the iolock held either shared or exclusive according to
328 : * @iolock, and returns with it held. Might upgrade the iolock to exclusive
329 : * if called for a direct write beyond i_size.
330 : */
331 : STATIC ssize_t
332 60500263 : xfs_file_write_checks(
333 : struct kiocb *iocb,
334 : struct iov_iter *from,
335 : unsigned int *iolock)
336 : {
337 60500263 : struct file *file = iocb->ki_filp;
338 60500263 : struct inode *inode = file->f_mapping->host;
339 60500263 : struct xfs_inode *ip = XFS_I(inode);
340 60500263 : ssize_t error = 0;
341 60500263 : size_t count = iov_iter_count(from);
342 60500263 : bool drained_dio = false;
343 77572171 : loff_t isize;
344 :
345 : restart:
346 77572171 : error = generic_write_checks(iocb, from);
347 77543357 : if (error <= 0)
348 6 : return error;
349 :
350 77543351 : if (iocb->ki_flags & IOCB_NOWAIT) {
351 0 : error = break_layout(inode, false);
352 0 : if (error == -EWOULDBLOCK)
353 : error = -EAGAIN;
354 : } else {
355 77543351 : error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
356 : }
357 :
358 77548137 : if (error)
359 0 : return error;
360 :
361 : /*
362 : * For changing security info in file_remove_privs() we need i_rwsem
363 : * exclusively.
364 : */
365 77548137 : if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
366 135636 : xfs_iunlock(ip, *iolock);
367 135632 : *iolock = XFS_IOLOCK_EXCL;
368 135632 : error = xfs_ilock_iocb(iocb, *iolock);
369 135632 : if (error) {
370 0 : *iolock = 0;
371 0 : return error;
372 : }
373 135632 : goto restart;
374 : }
375 :
376 : /*
377 : * If the offset is beyond the size of the file, we need to zero any
378 : * blocks that fall between the existing EOF and the start of this
379 : * write. If zeroing is needed and we are currently holding the iolock
380 : * shared, we need to update it to exclusive, which implies having to
381 : * redo all of the earlier checks.
382 : *
383 : * We need to serialise against EOF updates that occur in IO completions
384 : * here. We want to make sure that nobody is changing the size while we
385 : * do this check until we have placed an IO barrier (i.e. hold the
386 : * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The
387 : * spinlock effectively forms a memory barrier once we have the
388 : * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
389 : * hence be able to correctly determine if we need to run zeroing.
390 : *
391 : * We can do an unlocked check here safely as IO completion can only
392 : * extend EOF. Truncate is locked out at this point, so the EOF can
393 : * not move backwards, only forwards. Hence we only need to take the
394 : * slow path and spin locks when we are at or beyond the current EOF.
395 : */
396 77412501 : if (iocb->ki_pos <= i_size_read(inode))
397 43580951 : goto out;
398 :
399 33831550 : spin_lock(&ip->i_flags_lock);
400 33834417 : isize = i_size_read(inode);
401 33834417 : if (iocb->ki_pos > isize) {
402 33834417 : spin_unlock(&ip->i_flags_lock);
403 :
404 33852889 : if (iocb->ki_flags & IOCB_NOWAIT)
405 : return -EAGAIN;
406 :
407 33852889 : if (!drained_dio) {
408 16935798 : if (*iolock == XFS_IOLOCK_SHARED) {
409 100605 : xfs_iunlock(ip, *iolock);
410 100599 : *iolock = XFS_IOLOCK_EXCL;
411 100599 : xfs_ilock(ip, *iolock);
412 100599 : iov_iter_reexpand(from, count);
413 : }
414 : /*
415 : * We now have an IO submission barrier in place, but
416 : * AIO can do EOF updates during IO completion and hence
417 : * we now need to wait for all of them to drain. Non-AIO
418 : * DIO will have drained before we are given the
419 : * XFS_IOLOCK_EXCL, and so for most cases this wait is a
420 : * no-op.
421 : */
422 16935792 : inode_dio_wait(inode);
423 16936276 : drained_dio = true;
424 16936276 : goto restart;
425 : }
426 :
427 16917091 : trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
428 16917631 : error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
429 16916231 : if (error)
430 : return error;
431 : } else
432 0 : spin_unlock(&ip->i_flags_lock);
433 :
434 60496540 : out:
435 60496540 : return kiocb_modified(iocb);
436 : }
437 :
438 : static int
439 6832814 : xfs_dio_write_end_io(
440 : struct kiocb *iocb,
441 : ssize_t size,
442 : int error,
443 : unsigned flags)
444 : {
445 6832814 : struct inode *inode = file_inode(iocb->ki_filp);
446 6832814 : struct xfs_inode *ip = XFS_I(inode);
447 6832814 : loff_t offset = iocb->ki_pos;
448 6832814 : unsigned int nofs_flag;
449 :
450 6832814 : trace_xfs_end_io_direct_write(ip, offset, size);
451 :
452 13664884 : if (xfs_is_shutdown(ip->i_mount))
453 : return -EIO;
454 :
455 6831890 : if (error)
456 : return error;
457 5934304 : if (!size)
458 : return 0;
459 :
460 : /*
461 : * Capture amount written on completion as we can't reliably account
462 : * for it on submission.
463 : */
464 5934304 : XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
465 :
466 : /*
467 : * We can allocate memory here while doing writeback on behalf of
468 : * memory reclaim. To avoid memory allocation deadlocks set the
469 : * task-wide nofs context for the following operations.
470 : */
471 5934304 : nofs_flag = memalloc_nofs_save();
472 :
473 5934304 : if (flags & IOMAP_DIO_COW) {
474 1137240 : error = xfs_reflink_end_cow(ip, offset, size);
475 1137240 : if (error)
476 44 : goto out;
477 : }
478 :
479 : /*
480 : * Unwritten conversion updates the in-core isize after extent
481 : * conversion but before updating the on-disk size. Updating isize any
482 : * earlier allows a racing dio read to find unwritten extents before
483 : * they are converted.
484 : */
485 5934260 : if (flags & IOMAP_DIO_UNWRITTEN) {
486 3028286 : error = xfs_iomap_write_unwritten(ip, offset, size, true);
487 3028476 : goto out;
488 : }
489 :
490 : /*
491 : * We need to update the in-core inode size here so that we don't end up
492 : * with the on-disk inode size being outside the in-core inode size. We
493 : * have no other method of updating EOF for AIO, so always do it here
494 : * if necessary.
495 : *
496 : * We need to lock the test/set EOF update as we can be racing with
497 : * other IO completions here to update the EOF. Failing to serialise
498 : * here can result in EOF moving backwards and Bad Things Happen when
499 : * that occurs.
500 : *
501 : * As IO completion only ever extends EOF, we can do an unlocked check
502 : * here to avoid taking the spinlock. If we land within the current EOF,
503 : * then we do not need to do an extending update at all, and we don't
504 : * need to take the lock to check this. If we race with an update moving
505 : * EOF, then we'll either still be beyond EOF and need to take the lock,
506 : * or we'll be within EOF and we don't need to take it at all.
507 : */
508 2905974 : if (offset + size <= i_size_read(inode))
509 2739722 : goto out;
510 :
511 166252 : spin_lock(&ip->i_flags_lock);
512 166252 : if (offset + size > i_size_read(inode)) {
513 166252 : i_size_write(inode, offset + size);
514 166252 : spin_unlock(&ip->i_flags_lock);
515 166252 : error = xfs_setfilesize(ip, offset, size);
516 : } else {
517 0 : spin_unlock(&ip->i_flags_lock);
518 : }
519 :
520 5934494 : out:
521 5934494 : memalloc_nofs_restore(nofs_flag);
522 5934494 : return error;
523 : }
524 :
525 : static const struct iomap_dio_ops xfs_dio_write_ops = {
526 : .end_io = xfs_dio_write_end_io,
527 : };
528 :
529 : /*
530 : * Handle block-aligned direct I/O writes
531 : */
532 : static noinline ssize_t
533 3620705 : xfs_file_dio_write_aligned(
534 : struct xfs_inode *ip,
535 : struct kiocb *iocb,
536 : struct iov_iter *from)
537 : {
538 3620705 : unsigned int iolock = XFS_IOLOCK_SHARED;
539 3620705 : ssize_t ret;
540 :
541 3620705 : ret = xfs_ilock_iocb(iocb, iolock);
542 3620735 : if (ret)
543 : return ret;
544 3620696 : ret = xfs_file_write_checks(iocb, from, &iolock);
545 3620770 : if (ret)
546 642 : goto out_unlock;
547 :
548 : /*
549 : * We don't need to hold the IOLOCK exclusively across the IO, so demote
550 : * the iolock back to shared if we had to take the exclusive lock in
551 : * xfs_file_write_checks() for other reasons.
552 : */
553 3620128 : if (iolock == XFS_IOLOCK_EXCL) {
554 134499 : xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
555 134497 : iolock = XFS_IOLOCK_SHARED;
556 : }
557 3620126 : trace_xfs_file_direct_write(iocb, from);
558 3620121 : ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
559 : &xfs_dio_write_ops, 0, NULL, 0);
560 3620833 : out_unlock:
561 3620833 : if (iolock)
562 3620833 : xfs_iunlock(ip, iolock);
563 : return ret;
564 : }
565 :
566 : /*
567 : * Handle block-unaligned direct I/O writes
568 : *
569 : * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
570 : * them to be done in parallel with reads and other direct I/O writes. However,
571 : * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
572 : * to do sub-block zeroing and that requires serialisation against other direct
573 : * I/O to the same block. In this case we need to serialise the submission of
574 : * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
575 : * In the case where sub-block zeroing is not required, we can do concurrent
576 : * sub-block dios to the same block successfully.
577 : *
578 : * Optimistically submit the I/O using the shared lock first, but use the
579 : * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
580 : * if block allocation or partial block zeroing would be required. In that case
581 : * we try again with the exclusive lock.
582 : */
583 : static noinline ssize_t
584 5490563 : xfs_file_dio_write_unaligned(
585 : struct xfs_inode *ip,
586 : struct kiocb *iocb,
587 : struct iov_iter *from)
588 : {
589 5490563 : size_t isize = i_size_read(VFS_I(ip));
590 5490563 : size_t count = iov_iter_count(from);
591 5490563 : unsigned int iolock = XFS_IOLOCK_SHARED;
592 5490563 : unsigned int flags = IOMAP_DIO_OVERWRITE_ONLY;
593 5490563 : ssize_t ret;
594 :
595 : /*
596 : * Extending writes need exclusivity because of the sub-block zeroing
597 : * that the DIO code always does for partial tail blocks beyond EOF, so
598 : * don't even bother trying the fast path in this case.
599 : */
600 5490563 : if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
601 2019316 : if (iocb->ki_flags & IOCB_NOWAIT)
602 : return -EAGAIN;
603 2019316 : retry_exclusive:
604 2779949 : iolock = XFS_IOLOCK_EXCL;
605 2779949 : flags = IOMAP_DIO_FORCE_WAIT;
606 : }
607 :
608 6251196 : ret = xfs_ilock_iocb(iocb, iolock);
609 6251158 : if (ret)
610 : return ret;
611 :
612 : /*
613 : * We can't properly handle unaligned direct I/O to reflink files yet,
614 : * as we can't unshare a partial block.
615 : */
616 6251158 : if (xfs_is_cow_inode(ip)) {
617 3038297 : trace_xfs_reflink_bounce_dio_write(iocb, from);
618 3038303 : ret = -ENOTBLK;
619 3038303 : goto out_unlock;
620 : }
621 :
622 3212861 : ret = xfs_file_write_checks(iocb, from, &iolock);
623 3212874 : if (ret)
624 58 : goto out_unlock;
625 :
626 : /*
627 : * If we are doing exclusive unaligned I/O, this must be the only I/O
628 : * in-flight. Otherwise we risk data corruption due to unwritten extent
629 : * conversions from the AIO end_io handler. Wait for all other I/O to
630 : * drain first.
631 : */
632 3212816 : if (flags & IOMAP_DIO_FORCE_WAIT)
633 2338008 : inode_dio_wait(VFS_I(ip));
634 :
635 3212815 : trace_xfs_file_direct_write(iocb, from);
636 3212799 : ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
637 : &xfs_dio_write_ops, flags, NULL, 0);
638 :
639 : /*
640 : * Retry unaligned I/O with exclusive blocking semantics if the DIO
641 : * layer rejected it for mapping or locking reasons. If we are doing
642 : * nonblocking user I/O, propagate the error.
643 : */
644 3212751 : if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
645 760635 : ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
646 760635 : xfs_iunlock(ip, iolock);
647 760633 : goto retry_exclusive;
648 : }
649 :
650 2452116 : out_unlock:
651 5490477 : if (iolock)
652 5490482 : xfs_iunlock(ip, iolock);
653 : return ret;
654 : }
655 :
656 : static ssize_t
657 9111186 : xfs_file_dio_write(
658 : struct kiocb *iocb,
659 : struct iov_iter *from)
660 : {
661 9111186 : struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
662 9111186 : struct xfs_buftarg *target = xfs_inode_buftarg(ip);
663 9111186 : size_t count = iov_iter_count(from);
664 :
665 : /* direct I/O must be aligned to device logical sector size */
666 9111186 : if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
667 : return -EINVAL;
668 9111186 : if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
669 5490536 : return xfs_file_dio_write_unaligned(ip, iocb, from);
670 3620650 : return xfs_file_dio_write_aligned(ip, iocb, from);
671 : }
672 :
673 : static noinline ssize_t
674 : xfs_file_dax_write(
675 : struct kiocb *iocb,
676 : struct iov_iter *from)
677 : {
678 : struct inode *inode = iocb->ki_filp->f_mapping->host;
679 : struct xfs_inode *ip = XFS_I(inode);
680 : unsigned int iolock = XFS_IOLOCK_EXCL;
681 : ssize_t ret, error = 0;
682 : loff_t pos;
683 :
684 : ret = xfs_ilock_iocb(iocb, iolock);
685 : if (ret)
686 : return ret;
687 : ret = xfs_file_write_checks(iocb, from, &iolock);
688 : if (ret)
689 : goto out;
690 :
691 : pos = iocb->ki_pos;
692 :
693 : trace_xfs_file_dax_write(iocb, from);
694 : ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
695 : if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
696 : i_size_write(inode, iocb->ki_pos);
697 : error = xfs_setfilesize(ip, pos, ret);
698 : }
699 : out:
700 : if (iolock)
701 : xfs_iunlock(ip, iolock);
702 : if (error)
703 : return error;
704 :
705 : if (ret > 0) {
706 : XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
707 :
708 : /* Handle various SYNC-type writes */
709 : ret = generic_write_sync(iocb, ret);
710 : }
711 : return ret;
712 : }
713 :
714 : STATIC ssize_t
715 53347754 : xfs_file_buffered_write(
716 : struct kiocb *iocb,
717 : struct iov_iter *from)
718 : {
719 53347754 : struct inode *inode = iocb->ki_filp->f_mapping->host;
720 53347754 : struct xfs_inode *ip = XFS_I(inode);
721 53347754 : ssize_t ret;
722 53347754 : bool cleared_space = false;
723 53653059 : unsigned int iolock;
724 :
725 : write_retry:
726 53653059 : iolock = XFS_IOLOCK_EXCL;
727 53653059 : ret = xfs_ilock_iocb(iocb, iolock);
728 53667681 : if (ret)
729 0 : return ret;
730 :
731 53667681 : ret = xfs_file_write_checks(iocb, from, &iolock);
732 53666557 : if (ret)
733 692 : goto out;
734 :
735 53665865 : trace_xfs_file_buffered_write(iocb, from);
736 53663723 : ret = iomap_file_buffered_write(iocb, from,
737 : &xfs_buffered_write_iomap_ops);
738 :
739 : /*
740 : * If we hit a space limit, try to free up some lingering preallocated
741 : * space before returning an error. In the case of ENOSPC, first try to
742 : * write back all dirty inodes to free up some of the excess reserved
743 : * metadata space. This reduces the chances that the eofblocks scan
744 : * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
745 : * also behaves as a filter to prevent too many eofblocks scans from
746 : * running at the same time. Use a synchronous scan to increase the
747 : * effectiveness of the scan.
748 : */
749 53607357 : if (ret == -EDQUOT && !cleared_space) {
750 212 : xfs_iunlock(ip, iolock);
751 212 : xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC);
752 212 : cleared_space = true;
753 212 : goto write_retry;
754 53607145 : } else if (ret == -ENOSPC && !cleared_space) {
755 305100 : struct xfs_icwalk icw = {0};
756 :
757 305100 : cleared_space = true;
758 305100 : xfs_flush_inodes(ip->i_mount);
759 :
760 304929 : xfs_iunlock(ip, iolock);
761 304919 : icw.icw_flags = XFS_ICWALK_FLAG_SYNC;
762 304919 : xfs_blockgc_free_space(ip->i_mount, &icw);
763 305093 : goto write_retry;
764 : }
765 :
766 53302045 : out:
767 53302737 : if (iolock)
768 53362309 : xfs_iunlock(ip, iolock);
769 :
770 53298104 : if (ret > 0) {
771 53086690 : XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
772 : /* Handle various SYNC-type writes */
773 53086690 : ret = generic_write_sync(iocb, ret);
774 : }
775 : return ret;
776 : }
777 :
778 : STATIC ssize_t
779 59440418 : xfs_file_write_iter(
780 : struct kiocb *iocb,
781 : struct iov_iter *from)
782 : {
783 59440418 : struct inode *inode = iocb->ki_filp->f_mapping->host;
784 59440418 : struct xfs_inode *ip = XFS_I(inode);
785 59440418 : ssize_t ret;
786 59440418 : size_t ocount = iov_iter_count(from);
787 :
788 59440418 : XFS_STATS_INC(ip->i_mount, xs_write_calls);
789 :
790 59440418 : if (ocount == 0)
791 : return 0;
792 :
793 118880808 : if (xfs_is_shutdown(ip->i_mount))
794 : return -EIO;
795 :
796 59429646 : if (IS_DAX(inode))
797 : return xfs_file_dax_write(iocb, from);
798 :
799 59429646 : if (iocb->ki_flags & IOCB_DIRECT) {
800 : /*
801 : * Allow a directio write to fall back to a buffered
802 : * write *only* in the case that we're doing a reflink
803 : * CoW. In all other directio scenarios we do not
804 : * allow an operation to fall back to buffered mode.
805 : */
806 9111255 : ret = xfs_file_dio_write(iocb, from);
807 9110890 : if (ret != -ENOTBLK)
808 : return ret;
809 : }
810 :
811 53356790 : return xfs_file_buffered_write(iocb, from);
812 : }
813 :
814 : /* Does this file, inode, or mount want synchronous writes? */
815 223734871 : static inline bool xfs_file_sync_writes(struct file *filp)
816 : {
817 223734871 : struct xfs_inode *ip = XFS_I(file_inode(filp));
818 :
819 223734871 : if (xfs_has_wsync(ip->i_mount))
820 : return true;
821 223734863 : if (filp->f_flags & (__O_SYNC | O_DSYNC))
822 : return true;
823 223727729 : if (IS_SYNC(file_inode(filp)))
824 4 : return true;
825 :
826 : return false;
827 : }
828 :
829 : #define XFS_FALLOC_FL_SUPPORTED \
830 : (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
831 : FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \
832 : FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)
833 :
834 : STATIC long
835 14496333 : xfs_file_fallocate(
836 : struct file *file,
837 : int mode,
838 : loff_t offset,
839 : loff_t len)
840 : {
841 14496333 : struct inode *inode = file_inode(file);
842 14496333 : struct xfs_inode *ip = XFS_I(inode);
843 14496333 : long error;
844 14496333 : uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
845 14496333 : loff_t new_size = 0;
846 14496333 : bool do_file_insert = false;
847 :
848 14496333 : if (!S_ISREG(inode->i_mode))
849 : return -EINVAL;
850 14496333 : if (mode & ~XFS_FALLOC_FL_SUPPORTED)
851 : return -EOPNOTSUPP;
852 :
853 14496333 : xfs_ilock(ip, iolock);
854 14496507 : error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
855 14496473 : if (error)
856 0 : goto out_unlock;
857 :
858 : /*
859 : * Must wait for all AIO to complete before we continue as AIO can
860 : * change the file size on completion without holding any locks we
861 : * currently hold. We must do this first because AIO can update both
862 : * the on disk and in memory inode sizes, and the operations that follow
863 : * the on-disk and in-memory inode sizes, and the operations that follow
864 : */
865 14496473 : inode_dio_wait(inode);
866 :
867 : /*
868 : * Now that AIO and DIO have drained, we flush and (if necessary) invalidate
869 : * the cached range over the first operation we are about to run.
870 : *
871 : * We care about zero and collapse here because they both run a hole
872 : * punch over the range first. Because that can zero data, and the range
873 : * of invalidation for the shift operations is much larger, we still do
874 : * the required flush for collapse in xfs_prepare_shift().
875 : *
876 : * Insert has the same range requirements as collapse, and we extend the
877 : * file first which can zero data. Hence insert has the same
878 : * flush/invalidate requirements as collapse and so they are both
879 : * handled at the right time by xfs_prepare_shift().
880 : */
881 14496235 : if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
882 : FALLOC_FL_COLLAPSE_RANGE)) {
883 10384121 : error = xfs_flush_unmap_range(ip, offset, len);
884 10384438 : if (error)
885 135 : goto out_unlock;
886 : }
887 :
888 14496417 : error = file_modified(file);
889 14496499 : if (error)
890 9 : goto out_unlock;
891 :
892 14496490 : if (mode & FALLOC_FL_PUNCH_HOLE) {
893 7899185 : error = xfs_free_file_space(ip, offset, len);
894 7899178 : if (error)
895 8115 : goto out_unlock;
896 6597305 : } else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
897 822717 : if (!xfs_is_falloc_aligned(ip, offset, len)) {
898 255387 : error = -EINVAL;
899 255387 : goto out_unlock;
900 : }
901 :
902 : /*
903 : * There is no need for a collapse range to overlap EOF;
904 : * in that case it is effectively a truncate operation
905 : */
906 567330 : if (offset + len >= i_size_read(inode)) {
907 108777 : error = -EINVAL;
908 108777 : goto out_unlock;
909 : }
910 :
911 458553 : new_size = i_size_read(inode) - len;
912 :
913 458553 : error = xfs_collapse_file_space(ip, offset, len);
914 458553 : if (error)
915 1107 : goto out_unlock;
916 5774588 : } else if (mode & FALLOC_FL_INSERT_RANGE) {
917 722778 : loff_t isize = i_size_read(inode);
918 :
919 722778 : if (!xfs_is_falloc_aligned(ip, offset, len)) {
920 254964 : error = -EINVAL;
921 254964 : goto out_unlock;
922 : }
923 :
924 : /*
925 : * New inode size must not exceed ->s_maxbytes, accounting for
926 : * possible signed overflow.
927 : */
928 467814 : if (inode->i_sb->s_maxbytes - isize < len) {
929 2 : error = -EFBIG;
930 2 : goto out_unlock;
931 : }
932 467812 : new_size = isize + len;
933 :
934 : /* Offset should be less than i_size */
935 467812 : if (offset >= isize) {
936 88509 : error = -EINVAL;
937 88509 : goto out_unlock;
938 : }
939 : do_file_insert = true;
940 : } else {
941 5051810 : if (!(mode & FALLOC_FL_KEEP_SIZE) &&
942 2258821 : offset + len > i_size_read(inode)) {
943 1216066 : new_size = offset + len;
944 1216066 : error = inode_newsize_ok(inode, new_size);
945 1216064 : if (error)
946 2 : goto out_unlock;
947 : }
948 :
949 5051806 : if (mode & FALLOC_FL_ZERO_RANGE) {
950 : /*
951 : * Punch a hole and prealloc the range. We use a hole
952 : * punch rather than unwritten extent conversion for two
953 : * reasons:
954 : *
955 : * 1.) Hole punch handles partial block zeroing for us.
956 : * 2.) If prealloc returns ENOSPC, the file range is
957 : * still zero-valued by virtue of the hole punch.
958 : */
959 1662450 : unsigned int blksize = i_blocksize(inode);
960 :
961 1662450 : trace_xfs_zero_file_space(ip);
962 :
963 1662451 : error = xfs_free_file_space(ip, offset, len);
964 1662448 : if (error)
965 5305 : goto out_unlock;
966 :
967 1657143 : len = round_up(offset + len, blksize) -
968 1657143 : round_down(offset, blksize);
969 1657143 : offset = round_down(offset, blksize);
970 3389356 : } else if (mode & FALLOC_FL_UNSHARE_RANGE) {
971 66 : error = xfs_reflink_unshare(ip, offset, len);
972 66 : if (error)
973 2 : goto out_unlock;
974 : } else {
975 : /*
976 : * In always_cow mode we can't use preallocations and
977 : * thus should not create them.
978 : */
979 3389290 : if (xfs_is_always_cow_inode(ip)) {
980 0 : error = -EOPNOTSUPP;
981 0 : goto out_unlock;
982 : }
983 : }
984 :
985 5046497 : if (!xfs_is_always_cow_inode(ip)) {
986 5046492 : error = xfs_alloc_file_space(ip, offset, len);
987 5046507 : if (error)
988 74798 : goto out_unlock;
989 : }
990 : }
991 :
992 : /* Change file size if needed */
993 13699526 : if (new_size) {
994 2047399 : struct iattr iattr;
995 :
996 2047399 : iattr.ia_valid = ATTR_SIZE;
997 2047399 : iattr.ia_size = new_size;
998 4094798 : error = xfs_vn_setattr_size(file_mnt_idmap(file),
999 : file_dentry(file), &iattr);
1000 2047398 : if (error)
1001 350 : goto out_unlock;
1002 : }
1003 :
1004 : /*
1005 : * Perform hole insertion now that the file size has been
1006 : * updated so that if we crash during the operation we don't
1007 : * leave shifted extents past EOF and hence lose access to
1008 : * the data that is contained within them.
1009 : */
1010 13699175 : if (do_file_insert) {
1011 379186 : error = xfs_insert_file_space(ip, offset, len);
1012 379186 : if (error)
1013 1152 : goto out_unlock;
1014 : }
1015 :
1016 13698023 : if (xfs_file_sync_writes(file))
1017 7130 : error = xfs_log_force_inode(ip);
1018 :
1019 13690893 : out_unlock:
1020 14496637 : xfs_iunlock(ip, iolock);
1021 14496637 : return error;
1022 : }
1023 :
1024 : STATIC int
1025 2118205 : xfs_file_fadvise(
1026 : struct file *file,
1027 : loff_t start,
1028 : loff_t end,
1029 : int advice)
1030 : {
1031 2118205 : struct xfs_inode *ip = XFS_I(file_inode(file));
1032 2118205 : int ret;
1033 2118205 : int lockflags = 0;
1034 :
1035 : /*
1036 : * Operations creating pages in page cache need protection from hole
1037 : * punching and similar ops
1038 : */
1039 2118205 : if (advice == POSIX_FADV_WILLNEED) {
1040 0 : lockflags = XFS_IOLOCK_SHARED;
1041 0 : xfs_ilock(ip, lockflags);
1042 : }
1043 2118205 : ret = generic_fadvise(file, start, end, advice);
1044 2119609 : if (lockflags)
1045 0 : xfs_iunlock(ip, lockflags);
1046 2119609 : return ret;
1047 : }
1048 :
1049 : STATIC loff_t
1050 198376715 : xfs_file_remap_range(
1051 : struct file *file_in,
1052 : loff_t pos_in,
1053 : struct file *file_out,
1054 : loff_t pos_out,
1055 : loff_t len,
1056 : unsigned int remap_flags)
1057 : {
1058 198376715 : struct inode *inode_in = file_inode(file_in);
1059 198376715 : struct xfs_inode *src = XFS_I(inode_in);
1060 198376715 : struct inode *inode_out = file_inode(file_out);
1061 198376715 : struct xfs_inode *dest = XFS_I(inode_out);
1062 198376715 : struct xfs_mount *mp = src->i_mount;
1063 198376715 : loff_t remapped = 0;
1064 198376715 : xfs_extlen_t cowextsize;
1065 198376715 : int ret;
1066 :
1067 198376715 : if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
1068 : return -EINVAL;
1069 :
1070 198376715 : if (!xfs_has_reflink(mp))
1071 : return -EOPNOTSUPP;
1072 :
1073 274486162 : if (xfs_is_shutdown(mp))
1074 : return -EIO;
1075 :
1076 : /* Prepare and then clone file data. */
1077 137240703 : ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
1078 : &len, remap_flags);
1079 137246518 : if (ret || len == 0)
1080 31741456 : return ret;
1081 :
1082 105505062 : trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
1083 :
1084 105504766 : ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
1085 : &remapped);
1086 105505245 : if (ret)
1087 485513 : goto out_unlock;
1088 :
1089 : /*
1090 : * Carry the cowextsize hint from src to dest if we're sharing the
1091 : * entire source file to the entire destination file, the source file
1092 : * has a cowextsize hint, and the destination file does not.
1093 : */
1094 105019732 : cowextsize = 0;
1095 105019732 : if (pos_in == 0 && len == i_size_read(inode_in) &&
1096 43138 : (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
1097 48 : pos_out == 0 && len >= i_size_read(inode_out) &&
1098 46 : !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE))
1099 6 : cowextsize = src->i_cowextsize;
1100 :
1101 105019732 : ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
1102 : remap_flags);
1103 105019847 : if (ret)
1104 0 : goto out_unlock;
1105 :
1106 105019847 : if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
1107 275 : xfs_log_force_inode(dest);
1108 105019755 : out_unlock:
1109 105505284 : xfs_iunlock2_io_mmap(src, dest);
1110 105505136 : if (ret)
1111 485546 : trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
1112 105505135 : return remapped > 0 ? remapped : ret;
1113 : }
1114 :
1115 : STATIC int
1116 371641553 : xfs_file_open(
1117 : struct inode *inode,
1118 : struct file *file)
1119 : {
1120 743283106 : if (xfs_is_shutdown(XFS_M(inode->i_sb)))
1121 : return -EIO;
1122 371627805 : file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC |
1123 : FMODE_DIO_PARALLEL_WRITE | FMODE_CAN_ODIRECT;
1124 371627805 : return generic_file_open(inode, file);
1125 : }
1126 :
1127 : STATIC int
1128 28954998 : xfs_dir_open(
1129 : struct inode *inode,
1130 : struct file *file)
1131 : {
1132 28954998 : struct xfs_inode *ip = XFS_I(inode);
1133 28954998 : unsigned int mode;
1134 28954998 : int error;
1135 :
1136 28954998 : error = xfs_file_open(inode, file);
1137 28954824 : if (error)
1138 : return error;
1139 :
1140 : /*
1141 : * If there are any blocks, read-ahead block 0 as we're almost
1142 : * certain to have the next operation be a read there.
1143 : */
1144 28953824 : mode = xfs_ilock_data_map_shared(ip);
1145 28954177 : if (ip->i_df.if_nextents > 0)
1146 7195713 : error = xfs_dir3_data_readahead(ip, 0, 0);
1147 28954705 : xfs_iunlock(ip, mode);
1148 28954705 : return error;
1149 : }
1150 :
1151 : /*
1152 : * When we release the file, we don't want it to trim EOF blocks if it is a
1153 : * read-only context. This prevents open/read/close workloads from removing
1154 : * EOF blocks that other writers depend upon to reduce fragmentation.
1155 : */
1156 : STATIC int
1157 342655243 : xfs_file_release(
1158 : struct inode *inode,
1159 : struct file *file)
1160 : {
1161 342655243 : bool free_eof_blocks = true;
1162 :
1163 342655243 : if ((file->f_mode & (FMODE_WRITE | FMODE_READ)) == FMODE_READ)
1164 53137343 : free_eof_blocks = false;
1165 :
1166 342655243 : return xfs_release(XFS_I(inode), free_eof_blocks);
1167 : }
1168 :
1169 : STATIC int
1170 55395676 : xfs_file_readdir(
1171 : struct file *file,
1172 : struct dir_context *ctx)
1173 : {
1174 55395676 : struct inode *inode = file_inode(file);
1175 55395676 : xfs_inode_t *ip = XFS_I(inode);
1176 55395676 : size_t bufsize;
1177 :
1178 : /*
1179 : * The Linux API doesn't pass the total size of the buffer
1180 : * we read into down to the filesystem. With the filldir concept
1181 : * it's not needed for correct information, but the XFS dir2 leaf
1182 : * code wants an estimate of the buffer size to calculate its
1183 : * readahead window and size the buffers used for mapping to
1184 : * physical blocks.
1185 : *
1186 : * Try to give it an estimate that's good enough; maybe at some
1187 : * point we can change the ->readdir prototype to include the
1188 : * buffer size. For now we use the current glibc buffer size.
1189 : */
1190 55395676 : bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size);
1191 :
1192 55395676 : return xfs_readdir(NULL, ip, ctx, bufsize);
1193 : }
1194 :
1195 : STATIC loff_t
1196 37917342 : xfs_file_llseek(
1197 : struct file *file,
1198 : loff_t offset,
1199 : int whence)
1200 : {
1201 37917342 : struct inode *inode = file->f_mapping->host;
1202 :
1203 75834684 : if (xfs_is_shutdown(XFS_I(inode)->i_mount))
1204 : return -EIO;
1205 :
1206 37917333 : switch (whence) {
1207 37698490 : default:
1208 37698490 : return generic_file_llseek(file, offset, whence);
1209 326 : case SEEK_HOLE:
1210 326 : offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
1211 326 : break;
1212 218517 : case SEEK_DATA:
1213 218517 : offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
1214 218517 : break;
1215 : }
1216 :
1217 218843 : if (offset < 0)
1218 : return offset;
1219 169896 : return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
1220 : }
1221 :
1222 : #ifdef CONFIG_FS_DAX
1223 : static inline vm_fault_t
1224 : xfs_dax_fault(
1225 : struct vm_fault *vmf,
1226 : enum page_entry_size pe_size,
1227 : bool write_fault,
1228 : pfn_t *pfn)
1229 : {
1230 : return dax_iomap_fault(vmf, pe_size, pfn, NULL,
1231 : (write_fault && !vmf->cow_page) ?
1232 : &xfs_dax_write_iomap_ops :
1233 : &xfs_read_iomap_ops);
1234 : }
1235 : #else
1236 : static inline vm_fault_t
1237 : xfs_dax_fault(
1238 : struct vm_fault *vmf,
1239 : enum page_entry_size pe_size,
1240 : bool write_fault,
1241 : pfn_t *pfn)
1242 : {
1243 : ASSERT(0);
1244 : return VM_FAULT_SIGBUS;
1245 : }
1246 : #endif
1247 :
1248 : /*
1249 : * Locking for serialisation of IO during page faults. This results in a lock
1250 : * ordering of:
1251 : *
1252 : * mmap_lock (MM)
1253 : * sb_start_pagefault(vfs, freeze)
1254 : * invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
1255 : * page_lock (MM)
1256 : * i_lock (XFS - extent map serialisation)
1257 : */
1258 : static vm_fault_t
1259 14418830 : __xfs_filemap_fault(
1260 : struct vm_fault *vmf,
1261 : enum page_entry_size pe_size,
1262 : bool write_fault)
1263 : {
1264 14418830 : struct inode *inode = file_inode(vmf->vma->vm_file);
1265 14418830 : struct xfs_inode *ip = XFS_I(inode);
1266 14418830 : vm_fault_t ret;
1267 :
1268 14418830 : trace_xfs_filemap_fault(ip, pe_size, write_fault);
1269 :
1270 14421056 : if (write_fault) {
1271 3579169 : sb_start_pagefault(inode->i_sb);
1272 3579284 : file_update_time(vmf->vma->vm_file);
1273 : }
1274 :
1275 14421072 : if (IS_DAX(inode)) {
1276 : pfn_t pfn;
1277 :
1278 : xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1279 : ret = xfs_dax_fault(vmf, pe_size, write_fault, &pfn);
1280 : if (ret & VM_FAULT_NEEDDSYNC)
1281 : ret = dax_finish_sync_fault(vmf, pe_size, pfn);
1282 : xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1283 : } else {
1284 14421072 : if (write_fault) {
1285 3579280 : xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1286 3579312 : ret = iomap_page_mkwrite(vmf,
1287 : &xfs_page_mkwrite_iomap_ops);
1288 3579309 : xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1289 : } else {
1290 10841792 : ret = filemap_fault(vmf);
1291 : }
1292 : }
1293 :
1294 14421108 : if (write_fault)
1295 3579273 : sb_end_pagefault(inode->i_sb);
1296 14421147 : return ret;
1297 : }
1298 :
1299 : static inline bool
1300 : xfs_is_write_fault(
1301 : struct vm_fault *vmf)
1302 : {
1303 : return (vmf->flags & FAULT_FLAG_WRITE) &&
1304 : (vmf->vma->vm_flags & VM_SHARED);
1305 : }
1306 :
1307 : static vm_fault_t
1308 10841569 : xfs_filemap_fault(
1309 : struct vm_fault *vmf)
1310 : {
1311 : /* DAX can shortcut the normal fault path on write faults! */
1312 10841569 : return __xfs_filemap_fault(vmf, PE_SIZE_PTE,
1313 : IS_DAX(file_inode(vmf->vma->vm_file)) &&
1314 : xfs_is_write_fault(vmf));
1315 : }
1316 :
1317 : static vm_fault_t
1318 0 : xfs_filemap_huge_fault(
1319 : struct vm_fault *vmf,
1320 : enum page_entry_size pe_size)
1321 : {
1322 0 : if (!IS_DAX(file_inode(vmf->vma->vm_file)))
1323 0 : return VM_FAULT_FALLBACK;
1324 :
1325 : /* DAX can shortcut the normal fault path on write faults! */
1326 : return __xfs_filemap_fault(vmf, pe_size,
1327 : xfs_is_write_fault(vmf));
1328 : }
1329 :
1330 : static vm_fault_t
1331 3578983 : xfs_filemap_page_mkwrite(
1332 : struct vm_fault *vmf)
1333 : {
1334 3578983 : return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
1335 : }
1336 :
1337 : /*
1338 : * pfn_mkwrite was originally intended to ensure we capture time stamp updates
1339 : * on write faults. In reality, it needs to serialise against truncate and
1340 : * prepare memory for writing, so handle it as a standard write fault.
1341 : */
1342 : static vm_fault_t
1343 0 : xfs_filemap_pfn_mkwrite(
1344 : struct vm_fault *vmf)
1345 : {
1346 :
1347 0 : return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
1348 : }
1349 :
1350 : static const struct vm_operations_struct xfs_file_vm_ops = {
1351 : .fault = xfs_filemap_fault,
1352 : .huge_fault = xfs_filemap_huge_fault,
1353 : .map_pages = filemap_map_pages,
1354 : .page_mkwrite = xfs_filemap_page_mkwrite,
1355 : .pfn_mkwrite = xfs_filemap_pfn_mkwrite,
1356 : };
1357 :
1358 : STATIC int
1359 7637390 : xfs_file_mmap(
1360 : struct file *file,
1361 : struct vm_area_struct *vma)
1362 : {
1363 7637390 : struct inode *inode = file_inode(file);
1364 7637390 : struct xfs_buftarg *target = xfs_inode_buftarg(XFS_I(inode));
1365 :
1366 : /*
1367 : * We don't support synchronous mappings for non-DAX files and
1368 : * for DAX files if the underlying dax_device is not synchronous.
1369 : */
1370 7637390 : if (!daxdev_mapping_supported(vma, target->bt_daxdev))
1371 : return -EOPNOTSUPP;
1372 :
1373 7637383 : file_accessed(file);
1374 7637420 : vma->vm_ops = &xfs_file_vm_ops;
1375 7637420 : if (IS_DAX(inode))
1376 : vm_flags_set(vma, VM_HUGEPAGE);
1377 7637420 : return 0;
1378 : }
1379 :
1380 : const struct file_operations xfs_file_operations = {
1381 : .llseek = xfs_file_llseek,
1382 : .read_iter = xfs_file_read_iter,
1383 : .write_iter = xfs_file_write_iter,
1384 : .splice_read = xfs_file_splice_read,
1385 : .splice_write = iter_file_splice_write,
1386 : .iopoll = iocb_bio_iopoll,
1387 : .unlocked_ioctl = xfs_file_ioctl,
1388 : #ifdef CONFIG_COMPAT
1389 : .compat_ioctl = xfs_file_compat_ioctl,
1390 : #endif
1391 : .mmap = xfs_file_mmap,
1392 : .mmap_supported_flags = MAP_SYNC,
1393 : .open = xfs_file_open,
1394 : .release = xfs_file_release,
1395 : .fsync = xfs_file_fsync,
1396 : .get_unmapped_area = thp_get_unmapped_area,
1397 : .fallocate = xfs_file_fallocate,
1398 : .fadvise = xfs_file_fadvise,
1399 : .remap_file_range = xfs_file_remap_range,
1400 : };
1401 :
1402 : const struct file_operations xfs_dir_file_operations = {
1403 : .open = xfs_dir_open,
1404 : .read = generic_read_dir,
1405 : .iterate_shared = xfs_file_readdir,
1406 : .llseek = generic_file_llseek,
1407 : .unlocked_ioctl = xfs_file_ioctl,
1408 : #ifdef CONFIG_COMPAT
1409 : .compat_ioctl = xfs_file_compat_ioctl,
1410 : #endif
1411 : .fsync = xfs_dir_fsync,
1412 : };