Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * Copyright (c) 2000-2005 Silicon Graphics, Inc.
4 : * All Rights Reserved.
5 : */
6 : #include "xfs.h"
7 : #include "xfs_fs.h"
8 : #include "xfs_shared.h"
9 : #include "xfs_format.h"
10 : #include "xfs_log_format.h"
11 : #include "xfs_trans_resv.h"
12 : #include "xfs_mount.h"
13 : #include "xfs_inode.h"
14 : #include "xfs_trans.h"
15 : #include "xfs_inode_item.h"
16 : #include "xfs_bmap.h"
17 : #include "xfs_bmap_util.h"
18 : #include "xfs_dir2.h"
19 : #include "xfs_dir2_priv.h"
20 : #include "xfs_ioctl.h"
21 : #include "xfs_trace.h"
22 : #include "xfs_log.h"
23 : #include "xfs_icache.h"
24 : #include "xfs_pnfs.h"
25 : #include "xfs_iomap.h"
26 : #include "xfs_reflink.h"
27 : #include "xfs_file.h"
28 :
29 : #include <linux/dax.h>
30 : #include <linux/falloc.h>
31 : #include <linux/backing-dev.h>
32 : #include <linux/mman.h>
33 : #include <linux/fadvise.h>
34 : #include <linux/mount.h>
35 :
36 : static const struct vm_operations_struct xfs_file_vm_ops;
37 :
38 : /*
39 : * Decide if the given file range is aligned to the size of the fundamental
40 : * allocation unit for the file.
41 : */
42 : bool
43 5371201 : xfs_is_falloc_aligned(
44 : struct xfs_inode *ip,
45 : loff_t pos,
46 : long long int len)
47 : {
48 5371201 : unsigned int alloc_unit = xfs_inode_alloc_unitsize(ip);
49 :
50 7390093 : if (XFS_IS_REALTIME_INODE(ip) && !is_power_of_2(alloc_unit))
51 524809 : return isaligned_64(pos, alloc_unit) &&
52 248597 : isaligned_64(len, alloc_unit);
53 :
54 5103035 : return !((pos | len) & (alloc_unit - 1));
55 : }
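
An aside on the two branches above: the mask test only works when the allocation unit is a power of two; otherwise a modulo test is needed. A minimal userspace sketch of the same distinction, with hypothetical names, not part of the kernel source:

#include <stdbool.h>
#include <stdint.h>

/* Illustrative only: power-of-two units allow the cheap mask test. */
static bool unit_is_power_of_2(uint64_t unit)
{
	return unit && !(unit & (unit - 1));
}

static bool range_is_aligned(uint64_t pos, uint64_t len, uint64_t unit)
{
	if (unit_is_power_of_2(unit))
		return !((pos | len) & (unit - 1));	/* mask test */
	return (pos % unit == 0) && (len % unit == 0);	/* generic modulo test */
}
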
56 :
57 : /*
58 : * Fsync operations on directories are much simpler than on regular files,
59 : * as there is no file data to flush and thus no need for explicit cache
60 : * flush operations; nor are there any non-transaction metadata updates
61 : * on directories.
62 : */
63 : STATIC int
64 864170 : xfs_dir_fsync(
65 : struct file *file,
66 : loff_t start,
67 : loff_t end,
68 : int datasync)
69 : {
70 864170 : struct xfs_inode *ip = XFS_I(file->f_mapping->host);
71 :
72 864170 : trace_xfs_dir_fsync(ip);
73 864158 : return xfs_log_force_inode(ip);
74 : }
75 :
76 : static xfs_csn_t
77 4294265 : xfs_fsync_seq(
78 : struct xfs_inode *ip,
79 : bool datasync)
80 : {
81 4294265 : if (!xfs_ipincount(ip))
82 : return 0;
83 4293158 : if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
84 : return 0;
85 4005966 : return ip->i_itemp->ili_commit_seq;
86 : }
87 :
88 : /*
89 : * All metadata updates are logged, which means that we just have to flush the
90 : * log up to the latest LSN that touched the inode.
91 : *
92 : * If we have concurrent fsync/fdatasync() calls, we need them to all block on
93 : * the log force before we clear the ili_fsync_fields field. This ensures that
94 : * we don't get a racing sync operation that does not wait for the metadata to
95 : * hit the journal before returning. If we race with clearing ili_fsync_fields,
96 : * then all that will happen is the log force will do nothing as the lsn will
97 : * already be on disk. We can't race with setting ili_fsync_fields because that
98 : * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
99 : * shared until after the ili_fsync_fields is cleared.
100 : */
101 : static int
102 4294298 : xfs_fsync_flush_log(
103 : struct xfs_inode *ip,
104 : bool datasync,
105 : int *log_flushed)
106 : {
107 4294298 : int error = 0;
108 4294298 : xfs_csn_t seq;
109 :
110 4294298 : xfs_ilock(ip, XFS_ILOCK_SHARED);
111 4294358 : seq = xfs_fsync_seq(ip, datasync);
112 4294060 : if (seq) {
113 4005983 : error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
114 : log_flushed);
115 :
116 4005581 : spin_lock(&ip->i_itemp->ili_lock);
117 4005966 : ip->i_itemp->ili_fsync_fields = 0;
118 4005966 : spin_unlock(&ip->i_itemp->ili_lock);
119 : }
120 4294049 : xfs_iunlock(ip, XFS_ILOCK_SHARED);
121 4292920 : return error;
122 : }
123 :
124 : STATIC int
125 20332276 : xfs_file_fsync(
126 : struct file *file,
127 : loff_t start,
128 : loff_t end,
129 : int datasync)
130 : {
131 20332276 : struct xfs_inode *ip = XFS_I(file->f_mapping->host);
132 20332276 : struct xfs_mount *mp = ip->i_mount;
133 20332276 : int error, err2;
134 20332276 : int log_flushed = 0;
135 :
136 20332276 : trace_xfs_file_fsync(ip);
137 :
138 20331775 : error = file_write_and_wait_range(file, start, end);
139 20336659 : if (error)
140 : return error;
141 :
142 40666880 : if (xfs_is_shutdown(mp))
143 : return -EIO;
144 :
145 20331492 : xfs_iflags_clear(ip, XFS_ITRUNCATED);
146 :
147 : /*
148 : * If we have an RT and/or log subvolume we need to make sure to flush
149 : * the write cache of the device used for file data first. This is to
150 : * ensure newly written file data makes it to disk before logging the new
151 : * inode size in case of an extending write.
152 : */
153 20331972 : if (XFS_IS_REALTIME_INODE(ip))
154 6390901 : error = xfs_buftarg_flush(mp->m_rtdev_targp);
155 13941071 : else if (mp->m_logdev_targp != mp->m_ddev_targp)
156 542241 : error = xfs_buftarg_flush(mp->m_ddev_targp);
157 :
158 : /*
159 : * Any inode that has dirty modifications in the log is pinned. The
160 : * racy check here for a pinned inode will not catch modifications
161 : * that happen concurrently to the fsync call, but fsync semantics
162 : * only require us to sync previously completed I/O.
163 : */
164 20329679 : if (xfs_ipincount(ip)) {
165 4294305 : err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
166 4292870 : if (err2 && !error)
167 1019 : error = err2;
168 : }
169 :
170 : /*
171 : * If we only have a single device, and the log force above was
172 : * a no-op, we might have to flush the data device cache here.
173 : * This can only happen for fdatasync/O_DSYNC if we were overwriting
174 : * an already allocated file and thus do not have any metadata to
175 : * commit.
176 : */
177 20328244 : if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
178 11869854 : mp->m_logdev_targp == mp->m_ddev_targp) {
179 11575534 : err2 = xfs_buftarg_flush(mp->m_ddev_targp);
180 11566540 : if (err2 && !error)
181 257 : error = err2;
182 : }
183 :
184 : return error;
185 : }
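
For context, the datasync handling above (see xfs_fsync_seq) is what backs the userspace distinction between fsync() and fdatasync(). A hedged illustration of how an application exercises the two paths; purely a sketch, not kernel code:

#include <stdio.h>
#include <unistd.h>

/*
 * Illustrative only: fdatasync() may skip metadata that is not needed to
 * read the data back (e.g. timestamps), which is why the kernel path above
 * ignores XFS_ILOG_TIMESTAMP-only changes for datasync requests.
 */
int flush_file(int fd, int data_only)
{
	int ret = data_only ? fdatasync(fd) : fsync(fd);

	if (ret < 0)
		perror(data_only ? "fdatasync" : "fsync");
	return ret;
}
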
186 :
187 : static int
188 1197571972 : xfs_ilock_iocb(
189 : struct kiocb *iocb,
190 : unsigned int lock_mode)
191 : {
192 1197571972 : struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
193 :
194 1197571972 : if (iocb->ki_flags & IOCB_NOWAIT) {
195 0 : if (!xfs_ilock_nowait(ip, lock_mode))
196 0 : return -EAGAIN;
197 : } else {
198 1197571972 : xfs_ilock(ip, lock_mode);
199 : }
200 :
201 : return 0;
202 : }
203 :
204 : STATIC ssize_t
205 534592992 : xfs_file_dio_read(
206 : struct kiocb *iocb,
207 : struct iov_iter *to)
208 : {
209 534592992 : struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
210 534592992 : ssize_t ret;
211 :
212 534592992 : trace_xfs_file_direct_read(iocb, to);
213 :
214 534592313 : if (!iov_iter_count(to))
215 : return 0; /* skip atime */
216 :
217 534566479 : file_accessed(iocb->ki_filp);
218 :
219 534565496 : ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
220 534565276 : if (ret)
221 : return ret;
222 534565302 : ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0);
223 534567927 : xfs_iunlock(ip, XFS_IOLOCK_SHARED);
224 :
225 534567927 : return ret;
226 : }
227 :
228 : static noinline ssize_t
229 0 : xfs_file_dax_read(
230 : struct kiocb *iocb,
231 : struct iov_iter *to)
232 : {
233 0 : struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
234 0 : ssize_t ret = 0;
235 :
236 0 : trace_xfs_file_dax_read(iocb, to);
237 :
238 0 : if (!iov_iter_count(to))
239 : return 0; /* skip atime */
240 :
241 0 : ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
242 0 : if (ret)
243 : return ret;
244 0 : ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
245 0 : xfs_iunlock(ip, XFS_IOLOCK_SHARED);
246 :
247 0 : file_accessed(iocb->ki_filp);
248 0 : return ret;
249 : }
250 :
251 : STATIC ssize_t
252 386888712 : xfs_file_buffered_read(
253 : struct kiocb *iocb,
254 : struct iov_iter *to)
255 : {
256 386888712 : struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
257 386888712 : ssize_t ret;
258 :
259 386888712 : trace_xfs_file_buffered_read(iocb, to);
260 :
261 386871109 : ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
262 385563502 : if (ret)
263 : return ret;
264 385597650 : ret = generic_file_read_iter(iocb, to);
265 386164803 : xfs_iunlock(ip, XFS_IOLOCK_SHARED);
266 :
267 386164803 : return ret;
268 : }
269 :
270 : STATIC ssize_t
271 922293176 : xfs_file_read_iter(
272 : struct kiocb *iocb,
273 : struct iov_iter *to)
274 : {
275 922293176 : struct inode *inode = file_inode(iocb->ki_filp);
276 922293176 : struct xfs_mount *mp = XFS_I(inode)->i_mount;
277 922293176 : ssize_t ret = 0;
278 :
279 922293176 : XFS_STATS_INC(mp, xs_read_calls);
280 :
281 1841069260 : if (xfs_is_shutdown(mp))
282 : return -EIO;
283 :
284 920531270 : if (IS_DAX(inode))
285 0 : ret = xfs_file_dax_read(iocb, to);
286 920531270 : else if (iocb->ki_flags & IOCB_DIRECT)
287 534592985 : ret = xfs_file_dio_read(iocb, to);
288 : else
289 385938285 : ret = xfs_file_buffered_read(iocb, to);
290 :
291 919595060 : if (ret > 0)
292 381242311 : XFS_STATS_ADD(mp, xs_read_bytes, ret);
293 : return ret;
294 : }
295 :
296 : STATIC ssize_t
297 11929995 : xfs_file_splice_read(
298 : struct file *in,
299 : loff_t *ppos,
300 : struct pipe_inode_info *pipe,
301 : size_t len,
302 : unsigned int flags)
303 : {
304 11929995 : struct inode *inode = file_inode(in);
305 11929995 : struct xfs_inode *ip = XFS_I(inode);
306 11929995 : struct xfs_mount *mp = ip->i_mount;
307 11929995 : ssize_t ret = 0;
308 :
309 11929995 : XFS_STATS_INC(mp, xs_read_calls);
310 :
311 23860036 : if (xfs_is_shutdown(mp))
312 : return -EIO;
313 :
314 11929965 : trace_xfs_file_splice_read(ip, *ppos, len);
315 :
316 11929887 : xfs_ilock(ip, XFS_IOLOCK_SHARED);
317 11929813 : ret = filemap_splice_read(in, ppos, pipe, len, flags);
318 11929920 : xfs_iunlock(ip, XFS_IOLOCK_SHARED);
319 11929878 : if (ret > 0)
320 11929635 : XFS_STATS_ADD(mp, xs_read_bytes, ret);
321 : return ret;
322 : }
323 :
324 : /*
325 : * Common pre-write limit and setup checks.
326 : *
327 : * Called with the iolock held either shared or exclusive according to
328 : * @iolock, and returns with it held. Might upgrade the iolock to exclusive
329 : * if called for a direct write beyond i_size.
330 : */
331 : STATIC ssize_t
332 268850806 : xfs_file_write_checks(
333 : struct kiocb *iocb,
334 : struct iov_iter *from,
335 : unsigned int *iolock)
336 : {
337 268850806 : struct file *file = iocb->ki_filp;
338 268850806 : struct inode *inode = file->f_mapping->host;
339 268850806 : struct xfs_inode *ip = XFS_I(inode);
340 268850806 : ssize_t error = 0;
341 268850806 : size_t count = iov_iter_count(from);
342 268850806 : bool drained_dio = false;
343 334138695 : loff_t isize;
344 :
345 : restart:
346 334138695 : error = generic_write_checks(iocb, from);
347 334337203 : if (error <= 0)
348 30 : return error;
349 :
350 334337173 : if (iocb->ki_flags & IOCB_NOWAIT) {
351 0 : error = break_layout(inode, false);
352 0 : if (error == -EWOULDBLOCK)
353 : error = -EAGAIN;
354 : } else {
355 334337173 : error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
356 : }
357 :
358 334803951 : if (error)
359 0 : return error;
360 :
361 : /*
362 : * For changing security info in file_remove_privs() we need i_rwsem
363 : * exclusively.
364 : */
365 334803951 : if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
366 88494 : xfs_iunlock(ip, *iolock);
367 88491 : *iolock = XFS_IOLOCK_EXCL;
368 88491 : error = xfs_ilock_iocb(iocb, *iolock);
369 88485 : if (error) {
370 0 : *iolock = 0;
371 0 : return error;
372 : }
373 88485 : goto restart;
374 : }
375 :
376 : /*
377 : * If the offset is beyond the size of the file, we need to zero any
378 : * blocks that fall between the existing EOF and the start of this
379 : * write. If zeroing is needed and we are currently holding the iolock
380 : * shared, we need to update it to exclusive which implies having to
381 : * redo all checks before.
382 : *
383 : * We need to serialise against EOF updates that occur in IO completions
384 : * here. We want to make sure that nobody is changing the size while we
385 : * do this check until we have placed an IO barrier (i.e. hold the
386 : * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The
387 : * spinlock effectively forms a memory barrier once we have the
388 : * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
389 : * hence be able to correctly determine if we need to run zeroing.
390 : *
391 : * We can do an unlocked check here safely as IO completion can only
392 : * extend EOF. Truncate is locked out at this point, so the EOF can
393 : * not move backwards, only forwards. Hence we only need to take the
394 : * slow path and spin locks when we are at or beyond the current EOF.
395 : */
396 334715457 : if (iocb->ki_pos <= i_size_read(inode))
397 203803035 : goto out;
398 :
399 130912422 : spin_lock(&ip->i_flags_lock);
400 131149675 : isize = i_size_read(inode);
401 131149675 : if (iocb->ki_pos > isize) {
402 131149675 : spin_unlock(&ip->i_flags_lock);
403 :
404 131054414 : if (iocb->ki_flags & IOCB_NOWAIT)
405 : return -EAGAIN;
406 :
407 131054414 : if (!drained_dio) {
408 65527467 : if (*iolock == XFS_IOLOCK_SHARED) {
409 803139 : xfs_iunlock(ip, *iolock);
410 803061 : *iolock = XFS_IOLOCK_EXCL;
411 803061 : xfs_ilock(ip, *iolock);
412 802954 : iov_iter_reexpand(from, count);
413 : }
414 : /*
415 : * We now have an IO submission barrier in place, but
416 : * AIO can do EOF updates during IO completion and hence
417 : * we now need to wait for all of them to drain. Non-AIO
418 : * DIO will have drained before we are given the
419 : * XFS_IOLOCK_EXCL, and so for most cases this wait is a
420 : * no-op.
421 : */
422 65527282 : inode_dio_wait(inode);
423 65199404 : drained_dio = true;
424 65199404 : goto restart;
425 : }
426 :
427 65526947 : trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
428 65242387 : error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
429 65041536 : if (error)
430 : return error;
431 : } else
432 0 : spin_unlock(&ip->i_flags_lock);
433 :
434 268830794 : out:
435 268830794 : return kiocb_modified(iocb);
436 : }
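
The EOF handling above relies on an unlocked check followed by a locked recheck, which is safe only because the size never moves backwards here. A minimal userspace sketch of that double-check pattern, assuming a size that only grows; all names are hypothetical:

#include <pthread.h>
#include <stdint.h>

static pthread_mutex_t size_lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t cur_size;	/* only ever extended, never shrunk */

/* Illustrative only: unlocked fast path, locked recheck for the slow path. */
static int beyond_eof(uint64_t pos)
{
	int ret;

	if (pos <= __atomic_load_n(&cur_size, __ATOMIC_RELAXED))
		return 0;			/* safe: size can only grow */

	pthread_mutex_lock(&size_lock);
	ret = pos > cur_size;			/* recheck under the lock */
	pthread_mutex_unlock(&size_lock);
	return ret;
}
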
437 :
438 : static int
439 19065270 : xfs_dio_write_end_io(
440 : struct kiocb *iocb,
441 : ssize_t size,
442 : int error,
443 : unsigned flags)
444 : {
445 19065270 : struct inode *inode = file_inode(iocb->ki_filp);
446 19065270 : struct xfs_inode *ip = XFS_I(inode);
447 19065270 : loff_t offset = iocb->ki_pos;
448 19065270 : unsigned int nofs_flag;
449 :
450 19065270 : trace_xfs_end_io_direct_write(ip, offset, size);
451 :
452 38130986 : if (xfs_is_shutdown(ip->i_mount))
453 : return -EIO;
454 :
455 19064751 : if (error)
456 : return error;
457 16795551 : if (!size)
458 : return 0;
459 :
460 : /*
461 : * Capture amount written on completion as we can't reliably account
462 : * for it on submission.
463 : */
464 16795551 : XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
465 :
466 : /*
467 : * We can allocate memory here while doing writeback on behalf of
468 : * memory reclaim. To avoid memory allocation deadlocks set the
469 : * task-wide nofs context for the following operations.
470 : */
471 16795667 : nofs_flag = memalloc_nofs_save();
472 :
473 16795667 : if (flags & IOMAP_DIO_COW) {
474 3486630 : error = xfs_reflink_end_cow(ip, offset, size);
475 3486630 : if (error)
476 15 : goto out;
477 : }
478 :
479 : /*
480 : * Unwritten conversion updates the in-core isize after extent
481 : * conversion but before updating the on-disk size. Updating isize any
482 : * earlier allows a racing dio read to find unwritten extents before
483 : * they are converted.
484 : */
485 16795652 : if (flags & IOMAP_DIO_UNWRITTEN) {
486 8451978 : error = xfs_iomap_write_unwritten(ip, offset, size, true);
487 8450896 : goto out;
488 : }
489 :
490 : /*
491 : * We need to update the in-core inode size here so that we don't end up
492 : * with the on-disk inode size being outside the in-core inode size. We
493 : * have no other method of updating EOF for AIO, so always do it here
494 : * if necessary.
495 : *
496 : * We need to lock the test/set EOF update as we can be racing with
497 : * other IO completions here to update the EOF. Failing to serialise
498 : * here can result in EOF moving backwards and Bad Things Happen when
499 : * that occurs.
500 : *
501 : * As IO completion only ever extends EOF, we can do an unlocked check
502 : * here to avoid taking the spinlock. If we land within the current EOF,
503 : * then we do not need to do an extending update at all, and we don't
504 : * need to take the lock to check this. If we race with an update moving
505 : * EOF, then we'll either still be beyond EOF and need to take the lock,
506 : * or we'll be within EOF and we don't need to take it at all.
507 : */
508 8343674 : if (offset + size <= i_size_read(inode))
509 7727649 : goto out;
510 :
511 616025 : spin_lock(&ip->i_flags_lock);
512 616025 : if (offset + size > i_size_read(inode)) {
513 616025 : i_size_write(inode, offset + size);
514 616025 : spin_unlock(&ip->i_flags_lock);
515 616025 : error = xfs_setfilesize(ip, offset, size);
516 : } else {
517 0 : spin_unlock(&ip->i_flags_lock);
518 : }
519 :
520 16794585 : out:
521 16794585 : memalloc_nofs_restore(nofs_flag);
522 16794585 : return error;
523 : }
524 :
525 : static const struct iomap_dio_ops xfs_dio_write_ops = {
526 : .end_io = xfs_dio_write_end_io,
527 : };
528 :
529 : /*
530 : * Handle block aligned direct I/O writes
531 : */
532 : static noinline ssize_t
533 13144423 : xfs_file_dio_write_aligned(
534 : struct xfs_inode *ip,
535 : struct kiocb *iocb,
536 : struct iov_iter *from)
537 : {
538 13144423 : unsigned int iolock = XFS_IOLOCK_SHARED;
539 13144423 : ssize_t ret;
540 :
541 13144423 : ret = xfs_ilock_iocb(iocb, iolock);
542 13140952 : if (ret)
543 : return ret;
544 13141122 : ret = xfs_file_write_checks(iocb, from, &iolock);
545 13141226 : if (ret)
546 623 : goto out_unlock;
547 :
548 : /*
549 : * We don't need to hold the IOLOCK exclusively across the IO, so demote
550 : * the iolock back to shared if we had to take the exclusive lock in
551 : * xfs_file_write_checks() for other reasons.
552 : */
553 13140603 : if (iolock == XFS_IOLOCK_EXCL) {
554 869503 : xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
555 869539 : iolock = XFS_IOLOCK_SHARED;
556 : }
557 13140639 : trace_xfs_file_direct_write(iocb, from);
558 13137665 : ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
559 : &xfs_dio_write_ops, 0, NULL, 0);
560 13148165 : out_unlock:
561 13148165 : if (iolock)
562 13147984 : xfs_iunlock(ip, iolock);
563 : return ret;
564 : }
565 :
566 : /*
567 : * Handle block unaligned direct I/O writes
568 : *
569 : * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
570 : * them to be done in parallel with reads and other direct I/O writes. However,
571 : * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
572 : * to do sub-block zeroing and that requires serialisation against other direct
573 : * I/O to the same block. In this case we need to serialise the submission of
574 : * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
575 : * In the case where sub-block zeroing is not required, we can do concurrent
576 : * sub-block dios to the same block successfully.
577 : *
578 : * Optimistically submit the I/O using the shared lock first, but use the
579 : * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
580 : * if block allocation or partial block zeroing would be required. In that case
581 : * we try again with the exclusive lock.
582 : */
583 : static noinline ssize_t
584 11620044 : xfs_file_dio_write_unaligned(
585 : struct xfs_inode *ip,
586 : struct kiocb *iocb,
587 : struct iov_iter *from)
588 : {
589 11620044 : size_t isize = i_size_read(VFS_I(ip));
590 11620044 : size_t count = iov_iter_count(from);
591 11620044 : unsigned int iolock = XFS_IOLOCK_SHARED;
592 11620044 : unsigned int flags = IOMAP_DIO_OVERWRITE_ONLY;
593 11620044 : ssize_t ret;
594 :
595 : /*
596 : * Extending writes need exclusivity because of the sub-block zeroing
597 : * that the DIO code always does for partial tail blocks beyond EOF, so
598 : * don't even bother trying the fast path in this case.
599 : */
600 11620044 : if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
601 7153462 : if (iocb->ki_flags & IOCB_NOWAIT)
602 : return -EAGAIN;
603 7153462 : retry_exclusive:
604 8415027 : iolock = XFS_IOLOCK_EXCL;
605 8415027 : flags = IOMAP_DIO_FORCE_WAIT;
606 : }
607 :
608 12881609 : ret = xfs_ilock_iocb(iocb, iolock);
609 12881400 : if (ret)
610 : return ret;
611 :
612 : /*
613 : * We can't properly handle unaligned direct I/O to reflink files yet,
614 : * as we can't unshare a partial block.
615 : */
616 12881399 : if (xfs_is_cow_inode(ip)) {
617 6960397 : trace_xfs_reflink_bounce_dio_write(iocb, from);
618 6960383 : ret = -ENOTBLK;
619 6960383 : goto out_unlock;
620 : }
621 :
622 5920931 : ret = xfs_file_write_checks(iocb, from, &iolock);
623 5920617 : if (ret)
624 103 : goto out_unlock;
625 :
626 : /*
627 : * If we are doing exclusive unaligned I/O, this must be the only I/O
628 : * in-flight. Otherwise we risk data corruption due to unwritten extent
629 : * conversions from the AIO end_io handler. Wait for all other I/O to
630 : * drain first.
631 : */
632 5920514 : if (flags & IOMAP_DIO_FORCE_WAIT)
633 4390853 : inode_dio_wait(VFS_I(ip));
634 :
635 5920386 : trace_xfs_file_direct_write(iocb, from);
636 5920396 : ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
637 : &xfs_dio_write_ops, flags, NULL, 0);
638 :
639 : /*
640 : * Retry unaligned I/O with exclusive blocking semantics if the DIO
641 : * layer rejected it for mapping or locking reasons. If we are doing
642 : * nonblocking user I/O, propagate the error.
643 : */
644 5920692 : if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
645 1261579 : ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
646 1261579 : xfs_iunlock(ip, iolock);
647 1261565 : goto retry_exclusive;
648 : }
649 :
650 4659113 : out_unlock:
651 11619599 : if (iolock)
652 11619279 : xfs_iunlock(ip, iolock);
653 : return ret;
654 : }
655 :
656 : static ssize_t
657 24765437 : xfs_file_dio_write(
658 : struct kiocb *iocb,
659 : struct iov_iter *from)
660 : {
661 24765437 : struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
662 24765437 : struct xfs_buftarg *target = xfs_inode_buftarg(ip);
663 24765437 : size_t count = iov_iter_count(from);
664 :
665 : /* direct I/O must be aligned to device logical sector size */
666 24765437 : if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
667 : return -EINVAL;
668 24765437 : if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
669 11620116 : return xfs_file_dio_write_unaligned(ip, iocb, from);
670 13145321 : return xfs_file_dio_write_aligned(ip, iocb, from);
671 : }
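
The sector-mask check above is the requirement a userspace O_DIRECT writer has to satisfy. A hedged sketch of an aligned direct write, assuming 4096 is a multiple of the device's logical sector size; the helper name is illustrative:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* Illustrative only: buffer, length and offset are all 4096-byte aligned. */
ssize_t dio_write_example(const char *path, const char *msg)
{
	int fd = open(path, O_WRONLY | O_CREAT | O_DIRECT, 0644);
	void *buf = NULL;
	ssize_t ret = -1;

	if (fd < 0)
		return -1;
	if (posix_memalign(&buf, 4096, 4096) == 0) {
		memset(buf, 0, 4096);
		strncpy(buf, msg, 4095);
		ret = write(fd, buf, 4096);
		free(buf);
	}
	close(fd);
	return ret;
}
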
672 :
673 : static noinline ssize_t
674 0 : xfs_file_dax_write(
675 : struct kiocb *iocb,
676 : struct iov_iter *from)
677 : {
678 0 : struct inode *inode = iocb->ki_filp->f_mapping->host;
679 0 : struct xfs_inode *ip = XFS_I(inode);
680 0 : unsigned int iolock = XFS_IOLOCK_EXCL;
681 0 : ssize_t ret, error = 0;
682 0 : loff_t pos;
683 :
684 0 : ret = xfs_ilock_iocb(iocb, iolock);
685 0 : if (ret)
686 : return ret;
687 0 : ret = xfs_file_write_checks(iocb, from, &iolock);
688 0 : if (ret)
689 0 : goto out;
690 :
691 0 : pos = iocb->ki_pos;
692 :
693 0 : trace_xfs_file_dax_write(iocb, from);
694 0 : ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
695 0 : if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
696 0 : i_size_write(inode, iocb->ki_pos);
697 0 : error = xfs_setfilesize(ip, pos, ret);
698 : }
699 0 : out:
700 0 : if (iolock)
701 0 : xfs_iunlock(ip, iolock);
702 0 : if (error)
703 : return error;
704 :
705 0 : if (ret > 0) {
706 0 : XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
707 :
708 : /* Handle various SYNC-type writes */
709 0 : ret = generic_write_sync(iocb, ret);
710 : }
711 : return ret;
712 : }
713 :
714 : STATIC ssize_t
715 248879643 : xfs_file_buffered_write(
716 : struct kiocb *iocb,
717 : struct iov_iter *from)
718 : {
719 248879643 : struct inode *inode = iocb->ki_filp->f_mapping->host;
720 248879643 : struct xfs_inode *ip = XFS_I(inode);
721 248879643 : ssize_t ret;
722 248879643 : bool cleared_space = false;
723 250000334 : unsigned int iolock;
724 :
725 : write_retry:
726 250000334 : iolock = XFS_IOLOCK_EXCL;
727 250000334 : ret = xfs_ilock_iocb(iocb, iolock);
728 249577708 : if (ret)
729 0 : return ret;
730 :
731 249577708 : ret = xfs_file_write_checks(iocb, from, &iolock);
732 249618491 : if (ret)
733 13727 : goto out;
734 :
735 249604764 : trace_xfs_file_buffered_write(iocb, from);
736 249269792 : ret = iomap_file_buffered_write(iocb, from,
737 : &xfs_buffered_write_iomap_ops);
738 :
739 : /*
740 : * If we hit a space limit, try to free up some lingering preallocated
741 : * space before returning an error. In the case of ENOSPC, first try to
742 : * write back all dirty inodes to free up some of the excess reserved
743 : * metadata space. This reduces the chances that the eofblocks scan
744 : * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
745 : * also behaves as a filter to prevent too many eofblocks scans from
746 : * running at the same time. Use a synchronous scan to increase the
747 : * effectiveness of the scan.
748 : */
749 249922409 : if (ret == -EDQUOT && !cleared_space) {
750 1376 : xfs_iunlock(ip, iolock);
751 1376 : xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC);
752 1376 : cleared_space = true;
753 1376 : goto write_retry;
754 249921033 : } else if (ret == -ENOSPC && !cleared_space) {
755 1119263 : struct xfs_icwalk icw = {0};
756 :
757 1119263 : cleared_space = true;
758 1119263 : xfs_flush_inodes(ip->i_mount);
759 :
760 1119165 : xfs_iunlock(ip, iolock);
761 1118855 : icw.icw_flags = XFS_ICWALK_FLAG_SYNC;
762 1118855 : xfs_blockgc_free_space(ip->i_mount, &icw);
763 1119315 : goto write_retry;
764 : }
765 :
766 248801770 : out:
767 248815497 : if (iolock)
768 248699812 : xfs_iunlock(ip, iolock);
769 :
770 248915241 : if (ret > 0) {
771 247803481 : XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
772 : /* Handle various SYNC-type writes */
773 247864185 : ret = generic_write_sync(iocb, ret);
774 : }
775 : return ret;
776 : }
777 :
778 : STATIC ssize_t
779 266839291 : xfs_file_write_iter(
780 : struct kiocb *iocb,
781 : struct iov_iter *from)
782 : {
783 266839291 : struct inode *inode = iocb->ki_filp->f_mapping->host;
784 266839291 : struct xfs_inode *ip = XFS_I(inode);
785 266839291 : ssize_t ret;
786 266839291 : size_t ocount = iov_iter_count(from);
787 :
788 266839291 : XFS_STATS_INC(ip->i_mount, xs_write_calls);
789 :
790 266781795 : if (ocount == 0)
791 : return 0;
792 :
793 533563098 : if (xfs_is_shutdown(ip->i_mount))
794 : return -EIO;
795 :
796 266770649 : if (IS_DAX(inode))
797 0 : return xfs_file_dax_write(iocb, from);
798 :
799 266770649 : if (iocb->ki_flags & IOCB_DIRECT) {
800 : /*
801 : * Allow a directio write to fall back to a buffered
802 : * write *only* in the case that we're doing a reflink
803 : * CoW. In all other directio scenarios we do not
804 : * allow an operation to fall back to buffered mode.
805 : */
806 24766493 : ret = xfs_file_dio_write(iocb, from);
807 24762124 : if (ret != -ENOTBLK)
808 : return ret;
809 : }
810 :
811 248964877 : return xfs_file_buffered_write(iocb, from);
812 : }
813 :
814 : /* Does this file, inode, or mount want synchronous writes? */
815 287042623 : static inline bool xfs_file_sync_writes(struct file *filp)
816 : {
817 287042623 : struct xfs_inode *ip = XFS_I(file_inode(filp));
818 :
819 287042623 : if (xfs_has_wsync(ip->i_mount))
820 : return true;
821 287042599 : if (filp->f_flags & (__O_SYNC | O_DSYNC))
822 : return true;
823 287012696 : if (IS_SYNC(file_inode(filp)))
824 13 : return true;
825 :
826 : return false;
827 : }
828 :
829 : #define XFS_FALLOC_FL_SUPPORTED \
830 : (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
831 : FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \
832 : FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)
833 :
834 : STATIC long
835 57295415 : xfs_file_fallocate(
836 : struct file *file,
837 : int mode,
838 : loff_t offset,
839 : loff_t len)
840 : {
841 57295415 : struct inode *inode = file_inode(file);
842 57295415 : struct xfs_inode *ip = XFS_I(inode);
843 57295415 : long error;
844 57295415 : uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
845 57295415 : loff_t new_size = 0;
846 57295415 : bool do_file_insert = false;
847 :
848 57295415 : if (!S_ISREG(inode->i_mode))
849 : return -EINVAL;
850 57295415 : if (mode & ~XFS_FALLOC_FL_SUPPORTED)
851 : return -EOPNOTSUPP;
852 :
853 57295415 : xfs_ilock(ip, iolock);
854 57295578 : error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
855 57295514 : if (error)
856 0 : goto out_unlock;
857 :
858 : /*
859 : * Must wait for all AIO to complete before we continue as AIO can
860 : * change the file size on completion without holding any locks we
861 : * currently hold. We must do this first because AIO can update both
862 : * the on disk and in memory inode sizes, and the operations that follow
863 : * require the in-memory size to be fully up-to-date.
864 : */
865 57295514 : inode_dio_wait(inode);
866 :
867 : /*
868 : * Now that AIO and DIO have drained, we flush and (if necessary) invalidate
869 : * the cached range over the first operation we are about to run.
870 : *
871 : * We care about zero and collapse here because they both run a hole
872 : * punch over the range first. Because that can zero data, and the range
873 : * of invalidation for the shift operations is much larger, we still do
874 : * the required flush for collapse in xfs_prepare_shift().
875 : *
876 : * Insert has the same range requirements as collapse, and we extend the
877 : * file first which can zero data. Hence insert has the same
878 : * flush/invalidate requirements as collapse and so they are both
879 : * handled at the right time by xfs_prepare_shift().
880 : */
881 57294810 : if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
882 : FALLOC_FL_COLLAPSE_RANGE)) {
883 45796554 : error = xfs_flush_unmap_range(ip, offset, len);
884 45796466 : if (error)
885 337 : goto out_unlock;
886 : }
887 :
888 57294385 : error = file_modified(file);
889 57295436 : if (error)
890 13 : goto out_unlock;
891 :
892 57295423 : if (mode & FALLOC_FL_PUNCH_HOLE) {
893 38901176 : error = xfs_free_file_space(ip, offset, len);
894 38901334 : if (error)
895 35669 : goto out_unlock;
896 18394247 : } else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
897 3022246 : if (!xfs_is_falloc_aligned(ip, offset, len)) {
898 397835 : error = -EINVAL;
899 397835 : goto out_unlock;
900 : }
901 :
902 : /*
903 : * The collapse range must not overlap EOF; a collapse that reaches
904 : * EOF would effectively be a truncate operation, so reject it.
905 : */
906 2624405 : if (offset + len >= i_size_read(inode)) {
907 218380 : error = -EINVAL;
908 218380 : goto out_unlock;
909 : }
910 :
911 2406025 : new_size = i_size_read(inode) - len;
912 :
913 2406025 : error = xfs_collapse_file_space(ip, offset, len);
914 2406029 : if (error)
915 1850 : goto out_unlock;
916 15372001 : } else if (mode & FALLOC_FL_INSERT_RANGE) {
917 2348969 : loff_t isize = i_size_read(inode);
918 :
919 2348969 : if (!xfs_is_falloc_aligned(ip, offset, len)) {
920 382201 : error = -EINVAL;
921 382201 : goto out_unlock;
922 : }
923 :
924 : /*
925 : * New inode size must not exceed ->s_maxbytes, accounting for
926 : * possible signed overflow.
927 : */
928 1966769 : if (inode->i_sb->s_maxbytes - isize < len) {
929 10 : error = -EFBIG;
930 10 : goto out_unlock;
931 : }
932 1966759 : new_size = isize + len;
933 :
934 : /* Offset should be less than i_size */
935 1966759 : if (offset >= isize) {
936 170770 : error = -EINVAL;
937 170770 : goto out_unlock;
938 : }
939 : do_file_insert = true;
940 : } else {
941 13023032 : if (!(mode & FALLOC_FL_KEEP_SIZE) &&
942 7230985 : offset + len > i_size_read(inode)) {
943 4893753 : new_size = offset + len;
944 4893753 : error = inode_newsize_ok(inode, new_size);
945 4893760 : if (error)
946 10 : goto out_unlock;
947 : }
948 :
949 13023029 : if (mode & FALLOC_FL_ZERO_RANGE) {
950 : /*
951 : * Punch a hole and prealloc the range. We use a hole
952 : * punch rather than unwritten extent conversion for two
953 : * reasons:
954 : *
955 : * 1.) Hole punch handles partial block zeroing for us.
956 : * 2.) If prealloc returns ENOSPC, the file range is
957 : * still zero-valued by virtue of the hole punch.
958 : */
959 3873130 : unsigned int blksize = i_blocksize(inode);
960 :
961 3873127 : trace_xfs_zero_file_space(ip);
962 :
963 3873118 : error = xfs_free_file_space(ip, offset, len);
964 3873137 : if (error)
965 13273 : goto out_unlock;
966 :
967 3859864 : len = round_up(offset + len, blksize) -
968 3859864 : round_down(offset, blksize);
969 3859864 : offset = round_down(offset, blksize);
970 9149899 : } else if (mode & FALLOC_FL_UNSHARE_RANGE) {
971 205 : error = xfs_reflink_unshare(ip, offset, len);
972 205 : if (error)
973 6 : goto out_unlock;
974 : } else {
975 : /*
976 : * If always_cow mode we can't use preallocations and
977 : * thus should not create them.
978 : */
979 9149694 : if (xfs_is_always_cow_inode(ip)) {
980 424145 : error = -EOPNOTSUPP;
981 424145 : goto out_unlock;
982 : }
983 : }
984 :
985 12585538 : if (!xfs_is_always_cow_inode(ip)) {
986 12340201 : error = xfs_alloc_file_space(ip, offset, len);
987 12340348 : if (error)
988 278781 : goto out_unlock;
989 : }
990 : }
991 :
992 : /* Change file size if needed */
993 55372739 : if (new_size) {
994 8764321 : struct iattr iattr;
995 :
996 8764321 : iattr.ia_valid = ATTR_SIZE;
997 8764321 : iattr.ia_size = new_size;
998 17528641 : error = xfs_vn_setattr_size(file_mnt_idmap(file),
999 : file_dentry(file), &iattr);
1000 8764311 : if (error)
1001 1256 : goto out_unlock;
1002 : }
1003 :
1004 : /*
1005 : * Perform hole insertion now that the file size has been
1006 : * updated so that if we crash during the operation we don't
1007 : * leave shifted extents past EOF and hence lose access to
1008 : * the data that is contained within them.
1009 : */
1010 55371473 : if (do_file_insert) {
1011 1795519 : error = xfs_insert_file_space(ip, offset, len);
1012 1795520 : if (error)
1013 2007 : goto out_unlock;
1014 : }
1015 :
1016 55369467 : if (xfs_file_sync_writes(file))
1017 29892 : error = xfs_log_force_inode(ip);
1018 :
1019 55339575 : out_unlock:
1020 57296010 : xfs_iunlock(ip, iolock);
1021 57296010 : return error;
1022 : }
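
The mode handling above maps directly onto the fallocate(2) flags a caller passes in. A hedged userspace sketch of the hole-punch case, which reaches the FALLOC_FL_PUNCH_HOLE branch; the offsets are arbitrary illustrative values:

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

/*
 * Illustrative only: punch a 1 MiB hole at offset 4 MiB. PUNCH_HOLE must be
 * combined with KEEP_SIZE, so the file size is left unchanged.
 */
int punch_hole_example(int fd)
{
	return fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			 4 * 1024 * 1024, 1 * 1024 * 1024);
}
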
1023 :
1024 : STATIC int
1025 11750559 : xfs_file_fadvise(
1026 : struct file *file,
1027 : loff_t start,
1028 : loff_t end,
1029 : int advice)
1030 : {
1031 11750559 : struct xfs_inode *ip = XFS_I(file_inode(file));
1032 11750559 : int ret;
1033 11750559 : int lockflags = 0;
1034 :
1035 : /*
1036 : * Operations creating pages in page cache need protection from hole
1037 : * punching and similar ops
1038 : */
1039 11750559 : if (advice == POSIX_FADV_WILLNEED) {
1040 0 : lockflags = XFS_IOLOCK_SHARED;
1041 0 : xfs_ilock(ip, lockflags);
1042 : }
1043 11750559 : ret = generic_fadvise(file, start, end, advice);
1044 11742090 : if (lockflags)
1045 0 : xfs_iunlock(ip, lockflags);
1046 11742090 : return ret;
1047 : }
1048 :
1049 : STATIC loff_t
1050 290961369 : xfs_file_remap_range(
1051 : struct file *file_in,
1052 : loff_t pos_in,
1053 : struct file *file_out,
1054 : loff_t pos_out,
1055 : loff_t len,
1056 : unsigned int remap_flags)
1057 : {
1058 290961369 : struct inode *inode_in = file_inode(file_in);
1059 290961369 : struct xfs_inode *src = XFS_I(inode_in);
1060 290961369 : struct inode *inode_out = file_inode(file_out);
1061 290961369 : struct xfs_inode *dest = XFS_I(inode_out);
1062 290961369 : struct xfs_mount *mp = src->i_mount;
1063 290961369 : loff_t remapped = 0;
1064 290961369 : xfs_extlen_t cowextsize;
1065 290961369 : int ret;
1066 :
1067 290961369 : if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
1068 : return -EINVAL;
1069 :
1070 290961369 : if (!xfs_has_reflink(mp))
1071 : return -EOPNOTSUPP;
1072 :
1073 380505334 : if (xfs_is_shutdown(mp))
1074 : return -EIO;
1075 :
1076 : /* Prepare and then clone file data. */
1077 190245782 : ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
1078 : &len, remap_flags);
1079 190262254 : if (ret || len == 0)
1080 73549180 : return ret;
1081 :
1082 116713074 : trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
1083 :
1084 116708541 : ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
1085 : &remapped);
1086 116694388 : if (ret)
1087 837038 : goto out_unlock;
1088 :
1089 : /*
1090 : * Carry the cowextsize hint from src to dest if we're sharing the
1091 : * entire source file to the entire destination file, the source file
1092 : * has a cowextsize hint, and the destination file does not.
1093 : */
1094 115857350 : cowextsize = 0;
1095 115857350 : if (pos_in == 0 && len == i_size_read(inode_in) &&
1096 103898 : (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
1097 140 : pos_out == 0 && len >= i_size_read(inode_out) &&
1098 134 : !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE))
1099 18 : cowextsize = src->i_cowextsize;
1100 :
1101 115857350 : ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
1102 : remap_flags);
1103 115840295 : if (ret)
1104 0 : goto out_unlock;
1105 :
1106 115840295 : if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
1107 670 : xfs_log_force_inode(dest);
1108 115841210 : out_unlock:
1109 116678296 : xfs_iunlock2_io_mmap(src, dest);
1110 116704854 : if (ret)
1111 837053 : trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
1112 116704796 : return remapped > 0 ? remapped : ret;
1113 : }
1114 :
1115 : STATIC int
1116 607836608 : xfs_file_open(
1117 : struct inode *inode,
1118 : struct file *file)
1119 : {
1120 1215673216 : if (xfs_is_shutdown(XFS_M(inode->i_sb)))
1121 : return -EIO;
1122 607822907 : file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC |
1123 : FMODE_DIO_PARALLEL_WRITE | FMODE_CAN_ODIRECT;
1124 607822907 : return generic_file_open(inode, file);
1125 : }
1126 :
1127 : STATIC int
1128 46111457 : xfs_dir_open(
1129 : struct inode *inode,
1130 : struct file *file)
1131 : {
1132 46111457 : struct xfs_inode *ip = XFS_I(inode);
1133 46111457 : unsigned int mode;
1134 46111457 : int error;
1135 :
1136 46111457 : error = xfs_file_open(inode, file);
1137 46047783 : if (error)
1138 : return error;
1139 :
1140 : /*
1141 : * If there are any blocks, read-ahead block 0 as we're almost
1142 : * certain to have the next operation be a read there.
1143 : */
1144 46036743 : mode = xfs_ilock_data_map_shared(ip);
1145 46001325 : if (ip->i_df.if_nextents > 0)
1146 10540538 : error = xfs_dir3_data_readahead(ip, 0, 0);
1147 46009844 : xfs_iunlock(ip, mode);
1148 46009844 : return error;
1149 : }
1150 :
1151 : /*
1152 : * When we release the file, we don't want it to trim EOF blocks if it is a
1153 : * readonly context. This prevents open/read/close workloads from removing
1154 : * EOF blocks that other writers depend upon to reduce fragmentation.
1155 : */
1156 : STATIC int
1157 561397852 : xfs_file_release(
1158 : struct inode *inode,
1159 : struct file *file)
1160 : {
1161 561397852 : bool free_eof_blocks = true;
1162 :
1163 561397852 : if ((file->f_mode & (FMODE_WRITE | FMODE_READ)) == FMODE_READ)
1164 117116664 : free_eof_blocks = false;
1165 :
1166 561397852 : return xfs_release(XFS_I(inode), free_eof_blocks);
1167 : }
1168 :
1169 : STATIC int
1170 96953043 : xfs_file_readdir(
1171 : struct file *file,
1172 : struct dir_context *ctx)
1173 : {
1174 96953043 : struct inode *inode = file_inode(file);
1175 96953043 : xfs_inode_t *ip = XFS_I(inode);
1176 96953043 : size_t bufsize;
1177 :
1178 : /*
1179 : * The Linux API doesn't pass the total size of the buffer
1180 : * we read into down to the filesystem. With the filldir concept
1181 : * it's not needed for correct information, but the XFS dir2 leaf
1182 : * code wants an estimate of the buffer size to calculate its
1183 : * readahead window and size the buffers used for mapping to
1184 : * physical blocks.
1185 : *
1186 : * Try to give it an estimate that's good enough, maybe at some
1187 : * point we can change the ->readdir prototype to include the
1188 : * buffer size. For now we use the current glibc buffer size.
1189 : */
1190 96953043 : bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size);
1191 :
1192 96953043 : return xfs_readdir(NULL, ip, ctx, bufsize);
1193 : }
1194 :
1195 : STATIC loff_t
1196 91736405 : xfs_file_llseek(
1197 : struct file *file,
1198 : loff_t offset,
1199 : int whence)
1200 : {
1201 91736405 : struct inode *inode = file->f_mapping->host;
1202 :
1203 183472810 : if (xfs_is_shutdown(XFS_I(inode)->i_mount))
1204 : return -EIO;
1205 :
1206 91736400 : switch (whence) {
1207 91492229 : default:
1208 91492229 : return generic_file_llseek(file, offset, whence);
1209 1482 : case SEEK_HOLE:
1210 1482 : offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
1211 1482 : break;
1212 242689 : case SEEK_DATA:
1213 242689 : offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
1214 242689 : break;
1215 : }
1216 :
1217 244171 : if (offset < 0)
1218 : return offset;
1219 193956 : return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
1220 : }
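
The SEEK_HOLE/SEEK_DATA cases above back the usual userspace idiom for walking the data extents of a sparse file. A hedged sketch, assuming glibc headers that expose SEEK_DATA; the function name is illustrative:

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>

/* Illustrative only: print every data range; lseek() fails with ENXIO at EOF. */
void dump_data_ranges(int fd)
{
	off_t pos = 0, hole;

	while ((pos = lseek(fd, pos, SEEK_DATA)) >= 0) {
		hole = lseek(fd, pos, SEEK_HOLE);
		if (hole < 0)
			break;
		printf("data: [%lld, %lld)\n", (long long)pos, (long long)hole);
		pos = hole;
	}
}
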
1221 :
1222 : #ifdef CONFIG_FS_DAX
1223 : static inline vm_fault_t
1224 0 : xfs_dax_fault(
1225 : struct vm_fault *vmf,
1226 : enum page_entry_size pe_size,
1227 : bool write_fault,
1228 : pfn_t *pfn)
1229 : {
1230 0 : return dax_iomap_fault(vmf, pe_size, pfn, NULL,
1231 0 : (write_fault && !vmf->cow_page) ?
1232 : &xfs_dax_write_iomap_ops :
1233 : &xfs_read_iomap_ops);
1234 : }
1235 : #else
1236 : static inline vm_fault_t
1237 : xfs_dax_fault(
1238 : struct vm_fault *vmf,
1239 : enum page_entry_size pe_size,
1240 : bool write_fault,
1241 : pfn_t *pfn)
1242 : {
1243 : ASSERT(0);
1244 : return VM_FAULT_SIGBUS;
1245 : }
1246 : #endif
1247 :
1248 : /*
1249 : * Locking for serialisation of IO during page faults. This results in a lock
1250 : * ordering of:
1251 : *
1252 : * mmap_lock (MM)
1253 : * sb_start_pagefault(vfs, freeze)
1254 : * invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
1255 : * page_lock (MM)
1256 : * i_lock (XFS - extent map serialisation)
1257 : */
1258 : static vm_fault_t
1259 196789862 : __xfs_filemap_fault(
1260 : struct vm_fault *vmf,
1261 : enum page_entry_size pe_size,
1262 : bool write_fault)
1263 : {
1264 196789862 : struct inode *inode = file_inode(vmf->vma->vm_file);
1265 196789862 : struct xfs_inode *ip = XFS_I(inode);
1266 196789862 : vm_fault_t ret;
1267 :
1268 196789862 : trace_xfs_filemap_fault(ip, pe_size, write_fault);
1269 :
1270 196819350 : if (write_fault) {
1271 83828510 : sb_start_pagefault(inode->i_sb);
1272 83673803 : file_update_time(vmf->vma->vm_file);
1273 : }
1274 :
1275 196678961 : if (IS_DAX(inode)) {
1276 0 : pfn_t pfn;
1277 :
1278 0 : xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1279 0 : ret = xfs_dax_fault(vmf, pe_size, write_fault, &pfn);
1280 0 : if (ret & VM_FAULT_NEEDDSYNC)
1281 0 : ret = dax_finish_sync_fault(vmf, pe_size, pfn);
1282 0 : xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1283 : } else {
1284 196678961 : if (write_fault) {
1285 83686865 : xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1286 83691715 : ret = iomap_page_mkwrite(vmf,
1287 : &xfs_page_mkwrite_iomap_ops);
1288 83623620 : xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1289 : } else {
1290 112992096 : ret = filemap_fault(vmf);
1291 : }
1292 : }
1293 :
1294 196807112 : if (write_fault)
1295 83759636 : sb_end_pagefault(inode->i_sb);
1296 196766581 : return ret;
1297 : }
1298 :
1299 : static inline bool
1300 : xfs_is_write_fault(
1301 : struct vm_fault *vmf)
1302 : {
1303 0 : return (vmf->flags & FAULT_FLAG_WRITE) &&
1304 0 : (vmf->vma->vm_flags & VM_SHARED);
1305 : }
1306 :
1307 : static vm_fault_t
1308 113045276 : xfs_filemap_fault(
1309 : struct vm_fault *vmf)
1310 : {
1311 : /* DAX can shortcut the normal fault path on write faults! */
1312 113045276 : return __xfs_filemap_fault(vmf, PE_SIZE_PTE,
1313 113045276 : IS_DAX(file_inode(vmf->vma->vm_file)) &&
1314 : xfs_is_write_fault(vmf));
1315 : }
1316 :
1317 : static vm_fault_t
1318 16437 : xfs_filemap_huge_fault(
1319 : struct vm_fault *vmf,
1320 : enum page_entry_size pe_size)
1321 : {
1322 16437 : if (!IS_DAX(file_inode(vmf->vma->vm_file)))
1323 : return VM_FAULT_FALLBACK;
1324 :
1325 : /* DAX can shortcut the normal fault path on write faults! */
1326 0 : return __xfs_filemap_fault(vmf, pe_size,
1327 : xfs_is_write_fault(vmf));
1328 : }
1329 :
1330 : static vm_fault_t
1331 83870314 : xfs_filemap_page_mkwrite(
1332 : struct vm_fault *vmf)
1333 : {
1334 83870314 : return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
1335 : }
1336 :
1337 : /*
1338 : * pfn_mkwrite was originally intended to ensure we capture time stamp updates
1339 : * on write faults. In reality, it needs to serialise against truncate and
1340 : * prepare memory for writing, so handle it as a standard write fault.
1341 : */
1342 : static vm_fault_t
1343 0 : xfs_filemap_pfn_mkwrite(
1344 : struct vm_fault *vmf)
1345 : {
1346 :
1347 0 : return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
1348 : }
1349 :
1350 : static const struct vm_operations_struct xfs_file_vm_ops = {
1351 : .fault = xfs_filemap_fault,
1352 : .huge_fault = xfs_filemap_huge_fault,
1353 : .map_pages = filemap_map_pages,
1354 : .page_mkwrite = xfs_filemap_page_mkwrite,
1355 : .pfn_mkwrite = xfs_filemap_pfn_mkwrite,
1356 : };
1357 :
1358 : STATIC int
1359 14074434 : xfs_file_mmap(
1360 : struct file *file,
1361 : struct vm_area_struct *vma)
1362 : {
1363 14074434 : struct inode *inode = file_inode(file);
1364 14074434 : struct xfs_buftarg *target = xfs_inode_buftarg(XFS_I(inode));
1365 :
1366 : /*
1367 : * We don't support synchronous mappings for non-DAX files and
1368 : * for DAX files if the underlying dax_device is not synchronous.
1369 : */
1370 14074434 : if (!daxdev_mapping_supported(vma, target->bt_daxdev))
1371 : return -EOPNOTSUPP;
1372 :
1373 14074231 : file_accessed(file);
1374 14074427 : vma->vm_ops = &xfs_file_vm_ops;
1375 14074427 : if (IS_DAX(inode))
1376 0 : vm_flags_set(vma, VM_HUGEPAGE);
1377 : return 0;
1378 : }
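
The daxdev_mapping_supported() check above is what a MAP_SYNC request from userspace runs into. A hedged sketch of how such a mapping is requested, assuming headers recent enough to expose MAP_SYNC (Linux 4.15+); the helper name is illustrative:

#define _GNU_SOURCE
#include <sys/mman.h>

/*
 * Illustrative only: MAP_SYNC is valid only together with
 * MAP_SHARED_VALIDATE; on a non-DAX file the check above makes mmap()
 * fail with EOPNOTSUPP.
 */
void *map_sync_example(int fd, size_t len)
{
	return mmap(NULL, len, PROT_READ | PROT_WRITE,
		    MAP_SHARED_VALIDATE | MAP_SYNC, fd, 0);
}
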
1379 :
1380 : const struct file_operations xfs_file_operations = {
1381 : .llseek = xfs_file_llseek,
1382 : .read_iter = xfs_file_read_iter,
1383 : .write_iter = xfs_file_write_iter,
1384 : .splice_read = xfs_file_splice_read,
1385 : .splice_write = iter_file_splice_write,
1386 : .iopoll = iocb_bio_iopoll,
1387 : .unlocked_ioctl = xfs_file_ioctl,
1388 : #ifdef CONFIG_COMPAT
1389 : .compat_ioctl = xfs_file_compat_ioctl,
1390 : #endif
1391 : .mmap = xfs_file_mmap,
1392 : .mmap_supported_flags = MAP_SYNC,
1393 : .open = xfs_file_open,
1394 : .release = xfs_file_release,
1395 : .fsync = xfs_file_fsync,
1396 : .get_unmapped_area = thp_get_unmapped_area,
1397 : .fallocate = xfs_file_fallocate,
1398 : .fadvise = xfs_file_fadvise,
1399 : .remap_file_range = xfs_file_remap_range,
1400 : };
1401 :
1402 : const struct file_operations xfs_dir_file_operations = {
1403 : .open = xfs_dir_open,
1404 : .read = generic_read_dir,
1405 : .iterate_shared = xfs_file_readdir,
1406 : .llseek = generic_file_llseek,
1407 : .unlocked_ioctl = xfs_file_ioctl,
1408 : #ifdef CONFIG_COMPAT
1409 : .compat_ioctl = xfs_file_compat_ioctl,
1410 : #endif
1411 : .fsync = xfs_dir_fsync,
1412 : };