Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * Copyright (c) 2000-2005 Silicon Graphics, Inc.
4 : * All Rights Reserved.
5 : */
6 : #include "xfs.h"
7 : #include "xfs_fs.h"
8 : #include "xfs_shared.h"
9 : #include "xfs_format.h"
10 : #include "xfs_log_format.h"
11 : #include "xfs_trans_resv.h"
12 : #include "xfs_mount.h"
13 : #include "xfs_inode.h"
14 : #include "xfs_trans.h"
15 : #include "xfs_inode_item.h"
16 : #include "xfs_bmap.h"
17 : #include "xfs_bmap_util.h"
18 : #include "xfs_dir2.h"
19 : #include "xfs_dir2_priv.h"
20 : #include "xfs_ioctl.h"
21 : #include "xfs_trace.h"
22 : #include "xfs_log.h"
23 : #include "xfs_icache.h"
24 : #include "xfs_pnfs.h"
25 : #include "xfs_iomap.h"
26 : #include "xfs_reflink.h"
27 :
28 : #include <linux/dax.h>
29 : #include <linux/falloc.h>
30 : #include <linux/backing-dev.h>
31 : #include <linux/mman.h>
32 : #include <linux/fadvise.h>
33 : #include <linux/mount.h>
34 :
35 : static const struct vm_operations_struct xfs_file_vm_ops;
36 :
37 : /*
38 : * Decide if the given file range is aligned to the size of the fundamental
39 : * allocation unit for the file.
40 : */
41 : static bool
42 1363624 : xfs_is_falloc_aligned(
43 : struct xfs_inode *ip,
44 : loff_t pos,
45 : long long int len)
46 : {
47 1363624 : struct xfs_mount *mp = ip->i_mount;
48 1363624 : uint64_t mask;
49 :
50 1363624 : if (XFS_IS_REALTIME_INODE(ip)) {
51 807650 : if (!is_power_of_2(mp->m_sb.sb_rextsize)) {
52 8 : u64 rextbytes;
53 8 : u32 mod;
54 :
55 8 : rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
56 8 : div_u64_rem(pos, rextbytes, &mod);
57 8 : if (mod)
58 : return false;
59 4 : div_u64_rem(len, rextbytes, &mod);
60 4 : return mod == 0;
61 : }
62 403817 : mask = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize) - 1;
63 : } else {
64 959799 : mask = mp->m_sb.sb_blocksize - 1;
65 : }
66 :
67 1363616 : return !((pos | len) & mask);
68 : }
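/*
 * Editorial sketch (not in the original file): a worked example of the mask
 * test above, assuming a power-of-two allocation unit of 4096 bytes so that
 * mask = 4095. Both the offset and the length must be multiples of the unit
 * for the range to count as aligned.
 */
#if 0
static bool example_is_falloc_aligned_4k(loff_t pos, long long int len)
{
	const uint64_t mask = 4096 - 1;

	/* pos = 8192, len = 4096: (8192 | 4096) & 4095 == 0    -> aligned   */
	/* pos = 8192, len = 6144: (8192 | 6144) & 4095 == 2048 -> unaligned */
	return !((pos | len) & mask);
}
#endif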
69 :
70 : /*
71 : * Fsync operations on directories are much simpler than on regular files,
72 : * as there is no file data to flush, and thus also no need for explicit
73 : * cache flush operations, and there are no non-transaction metadata updates
74 : * on directories either.
75 : */
76 : STATIC int
77 296764 : xfs_dir_fsync(
78 : struct file *file,
79 : loff_t start,
80 : loff_t end,
81 : int datasync)
82 : {
83 296764 : struct xfs_inode *ip = XFS_I(file->f_mapping->host);
84 :
85 296764 : trace_xfs_dir_fsync(ip);
86 296766 : return xfs_log_force_inode(ip);
87 : }
88 :
89 : static xfs_csn_t
90 1596268 : xfs_fsync_seq(
91 : struct xfs_inode *ip,
92 : bool datasync)
93 : {
94 1596268 : if (!xfs_ipincount(ip))
95 : return 0;
96 1596029 : if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
97 : return 0;
98 1527492 : return ip->i_itemp->ili_commit_seq;
99 : }
100 :
101 : /*
102 : * All metadata updates are logged, which means that we just have to flush the
103 : * log up to the latest LSN that touched the inode.
104 : *
105 : * If we have concurrent fsync/fdatasync() calls, we need them to all block on
106 : * the log force before we clear the ili_fsync_fields field. This ensures that
107 : * we don't get a racing sync operation that does not wait for the metadata to
108 : * hit the journal before returning. If we race with clearing ili_fsync_fields,
109 : * then all that will happen is the log force will do nothing as the lsn will
110 : * already be on disk. We can't race with setting ili_fsync_fields because that
111 : * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
112 : * shared until after the ili_fsync_fields is cleared.
113 : */
114 : static int
115 1596275 : xfs_fsync_flush_log(
116 : struct xfs_inode *ip,
117 : bool datasync,
118 : int *log_flushed)
119 : {
120 1596275 : int error = 0;
121 1596275 : xfs_csn_t seq;
122 :
123 1596275 : xfs_ilock(ip, XFS_ILOCK_SHARED);
124 1596275 : seq = xfs_fsync_seq(ip, datasync);
125 1596276 : if (seq) {
126 1527496 : error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
127 : log_flushed);
128 :
129 1526985 : spin_lock(&ip->i_itemp->ili_lock);
130 1527492 : ip->i_itemp->ili_fsync_fields = 0;
131 1527492 : spin_unlock(&ip->i_itemp->ili_lock);
132 : }
133 1596197 : xfs_iunlock(ip, XFS_ILOCK_SHARED);
134 1596214 : return error;
135 : }
136 :
137 : STATIC int
138 10780990 : xfs_file_fsync(
139 : struct file *file,
140 : loff_t start,
141 : loff_t end,
142 : int datasync)
143 : {
144 10780990 : struct xfs_inode *ip = XFS_I(file->f_mapping->host);
145 10780990 : struct xfs_mount *mp = ip->i_mount;
146 10780990 : int error, err2;
147 10780990 : int log_flushed = 0;
148 :
149 10780990 : trace_xfs_file_fsync(ip);
150 :
151 10781247 : error = file_write_and_wait_range(file, start, end);
152 10781296 : if (error)
153 : return error;
154 :
155 21559308 : if (xfs_is_shutdown(mp))
156 : return -EIO;
157 :
158 10777542 : xfs_iflags_clear(ip, XFS_ITRUNCATED);
159 :
160 : /*
161 : * If we have an RT and/or log subvolume we need to make sure to flush
162 : * the write cache the device used for file data first. This is to
163 : * ensure newly written file data make it to disk before logging the new
164 : * inode size in case of an extending write.
165 : */
166 10777297 : if (XFS_IS_REALTIME_INODE(ip))
167 4311372 : error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
168 6465925 : else if (mp->m_logdev_targp != mp->m_ddev_targp)
169 0 : error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
170 :
171 : /*
172 : * Any inode that has dirty modifications in the log is pinned. The
173 : * racy check here for a pinned inode will not catch modifications
174 : * that happen concurrently to the fsync call, but fsync semantics
175 : * only require syncing previously completed I/O.
176 : */
177 10776527 : if (xfs_ipincount(ip)) {
178 1596278 : err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
179 1596094 : if (err2 && !error)
180 976 : error = err2;
181 : }
182 :
183 : /*
184 : * If we only have a single device, and the log force above was
185 : * a no-op we might have to flush the data device cache here.
186 : * This can only happen for fdatasync/O_DSYNC if we were overwriting
187 : * an already allocated file and thus do not have any metadata to
188 : * commit.
189 : */
190 10776343 : if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
191 5458059 : mp->m_logdev_targp == mp->m_ddev_targp) {
192 5457924 : err2 = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
193 5458061 : if (err2 && !error)
194 210 : error = err2;
195 : }
196 :
197 : return error;
198 : }
199 :
200 : static int
201 369832508 : xfs_ilock_iocb(
202 : struct kiocb *iocb,
203 : unsigned int lock_mode)
204 : {
205 369832508 : struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
206 :
207 369832508 : if (iocb->ki_flags & IOCB_NOWAIT) {
208 0 : if (!xfs_ilock_nowait(ip, lock_mode))
209 0 : return -EAGAIN;
210 : } else {
211 369832508 : xfs_ilock(ip, lock_mode);
212 : }
213 :
214 : return 0;
215 : }
216 :
217 : STATIC ssize_t
218 254229897 : xfs_file_dio_read(
219 : struct kiocb *iocb,
220 : struct iov_iter *to)
221 : {
222 254229897 : struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
223 254229897 : ssize_t ret;
224 :
225 254229897 : trace_xfs_file_direct_read(iocb, to);
226 :
227 254229900 : if (!iov_iter_count(to))
228 : return 0; /* skip atime */
229 :
230 254229850 : file_accessed(iocb->ki_filp);
231 :
232 254229902 : ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
233 254229929 : if (ret)
234 : return ret;
235 254229929 : ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0);
236 254229907 : xfs_iunlock(ip, XFS_IOLOCK_SHARED);
237 :
238 254229907 : return ret;
239 : }
240 :
241 : static noinline ssize_t
242 : xfs_file_dax_read(
243 : struct kiocb *iocb,
244 : struct iov_iter *to)
245 : {
246 : struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
247 : ssize_t ret = 0;
248 :
249 : trace_xfs_file_dax_read(iocb, to);
250 :
251 : if (!iov_iter_count(to))
252 : return 0; /* skip atime */
253 :
254 : ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
255 : if (ret)
256 : return ret;
257 : ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
258 : xfs_iunlock(ip, XFS_IOLOCK_SHARED);
259 :
260 : file_accessed(iocb->ki_filp);
261 : return ret;
262 : }
263 :
264 : STATIC ssize_t
265 56551291 : xfs_file_buffered_read(
266 : struct kiocb *iocb,
267 : struct iov_iter *to)
268 : {
269 56551291 : struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
270 56551291 : ssize_t ret;
271 :
272 56551291 : trace_xfs_file_buffered_read(iocb, to);
273 :
274 56552844 : ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
275 56553489 : if (ret)
276 : return ret;
277 56553952 : ret = generic_file_read_iter(iocb, to);
278 56545118 : xfs_iunlock(ip, XFS_IOLOCK_SHARED);
279 :
280 56545118 : return ret;
281 : }
282 :
283 : STATIC ssize_t
284 310785580 : xfs_file_read_iter(
285 : struct kiocb *iocb,
286 : struct iov_iter *to)
287 : {
288 310785580 : struct inode *inode = file_inode(iocb->ki_filp);
289 310785580 : struct xfs_mount *mp = XFS_I(inode)->i_mount;
290 310785580 : ssize_t ret = 0;
291 :
292 310785580 : XFS_STATS_INC(mp, xs_read_calls);
293 :
294 621568152 : if (xfs_is_shutdown(mp))
295 : return -EIO;
296 :
297 310780450 : if (IS_DAX(inode))
298 : ret = xfs_file_dax_read(iocb, to);
299 310780450 : else if (iocb->ki_flags & IOCB_DIRECT)
300 254229851 : ret = xfs_file_dio_read(iocb, to);
301 : else
302 56550599 : ret = xfs_file_buffered_read(iocb, to);
303 :
304 310781473 : if (ret > 0)
305 57921598 : XFS_STATS_ADD(mp, xs_read_bytes, ret);
306 : return ret;
307 : }
308 :
309 : STATIC ssize_t
310 4118990 : xfs_file_splice_read(
311 : struct file *in,
312 : loff_t *ppos,
313 : struct pipe_inode_info *pipe,
314 : size_t len,
315 : unsigned int flags)
316 : {
317 4118990 : struct inode *inode = file_inode(in);
318 4118990 : struct xfs_inode *ip = XFS_I(inode);
319 4118990 : struct xfs_mount *mp = ip->i_mount;
320 4118990 : ssize_t ret = 0;
321 :
322 4118990 : XFS_STATS_INC(mp, xs_read_calls);
323 :
324 8238004 : if (xfs_is_shutdown(mp))
325 : return -EIO;
326 :
327 4118995 : trace_xfs_file_splice_read(ip, *ppos, len);
328 :
329 4118997 : xfs_ilock(ip, XFS_IOLOCK_SHARED);
330 4118990 : ret = filemap_splice_read(in, ppos, pipe, len, flags);
331 4119005 : xfs_iunlock(ip, XFS_IOLOCK_SHARED);
332 4119005 : if (ret > 0)
333 4118865 : XFS_STATS_ADD(mp, xs_read_bytes, ret);
334 : return ret;
335 : }
336 :
337 : /*
338 : * Common pre-write limit and setup checks.
339 : *
340 : * Called with the iolock held either shared or exclusive according to
341 : * @iolock, and returns with it held. Might upgrade the iolock to exclusive
342 : * if called for a direct write beyond i_size.
343 : */
344 : STATIC ssize_t
345 56625354 : xfs_file_write_checks(
346 : struct kiocb *iocb,
347 : struct iov_iter *from,
348 : unsigned int *iolock)
349 : {
350 56625354 : struct file *file = iocb->ki_filp;
351 56625354 : struct inode *inode = file->f_mapping->host;
352 56625354 : struct xfs_inode *ip = XFS_I(inode);
353 56625354 : ssize_t error = 0;
354 56625354 : size_t count = iov_iter_count(from);
355 56625354 : bool drained_dio = false;
356 72238969 : loff_t isize;
357 :
358 : restart:
359 72238969 : error = generic_write_checks(iocb, from);
360 72212530 : if (error <= 0)
361 6 : return error;
362 :
363 72212524 : if (iocb->ki_flags & IOCB_NOWAIT) {
364 0 : error = break_layout(inode, false);
365 0 : if (error == -EWOULDBLOCK)
366 : error = -EAGAIN;
367 : } else {
368 72212524 : error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
369 : }
370 :
371 72236426 : if (error)
372 0 : return error;
373 :
374 : /*
375 : * For changing security info in file_remove_privs() we need i_rwsem
376 : * exclusively.
377 : */
378 72236426 : if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
379 25691 : xfs_iunlock(ip, *iolock);
380 25691 : *iolock = XFS_IOLOCK_EXCL;
381 25691 : error = xfs_ilock_iocb(iocb, *iolock);
382 25691 : if (error) {
383 0 : *iolock = 0;
384 0 : return error;
385 : }
386 25691 : goto restart;
387 : }
388 :
389 : /*
390 : * If the offset is beyond the size of the file, we need to zero any
391 : * blocks that fall between the existing EOF and the start of this
392 : * write. If zeroing is needed and we are currently holding the iolock
393 : * shared, we need to update it to exclusive which implies having to
394 : * redo all checks before.
395 : *
396 : * We need to serialise against EOF updates that occur in IO completions
397 : * here. We want to make sure that nobody is changing the size while we
398 : * do this check until we have placed an IO barrier (i.e. hold the
399 : * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The
400 : * spinlock effectively forms a memory barrier once we have the
401 : * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
402 : * hence be able to correctly determine if we need to run zeroing.
403 : *
404 : * We can do an unlocked check here safely as IO completion can only
405 : * extend EOF. Truncate is locked out at this point, so the EOF can
406 : * not move backwards, only forwards. Hence we only need to take the
407 : * slow path and spin locks when we are at or beyond the current EOF.
408 : */
409 72210735 : if (iocb->ki_pos <= i_size_read(inode))
410 41054335 : goto out;
411 :
412 31156400 : spin_lock(&ip->i_flags_lock);
413 31155154 : isize = i_size_read(inode);
414 31155154 : if (iocb->ki_pos > isize) {
415 31155154 : spin_unlock(&ip->i_flags_lock);
416 :
417 31161308 : if (iocb->ki_flags & IOCB_NOWAIT)
418 : return -EAGAIN;
419 :
420 31161308 : if (!drained_dio) {
421 15587975 : if (*iolock == XFS_IOLOCK_SHARED) {
422 101835 : xfs_iunlock(ip, *iolock);
423 101835 : *iolock = XFS_IOLOCK_EXCL;
424 101835 : xfs_ilock(ip, *iolock);
425 101833 : iov_iter_reexpand(from, count);
426 : }
427 : /*
428 : * We now have an IO submission barrier in place, but
429 : * AIO can do EOF updates during IO completion and hence
430 : * we now need to wait for all of them to drain. Non-AIO
431 : * DIO will have drained before we are given the
432 : * XFS_IOLOCK_EXCL, and so for most cases this wait is a
433 : * no-op.
434 : */
435 15587973 : inode_dio_wait(inode);
436 15587924 : drained_dio = true;
437 15587924 : goto restart;
438 : }
439 :
440 15573333 : trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
441 15573743 : error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
442 15569394 : if (error)
443 : return error;
444 : } else
445 0 : spin_unlock(&ip->i_flags_lock);
446 :
447 56623114 : out:
448 56623114 : return kiocb_modified(iocb);
449 : }
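/*
 * Editorial note (not in the original file): a concrete instance of the
 * EOF-zeroing case above. With i_size = 10000 and a write starting at
 * iocb->ki_pos = 16384, xfs_zero_range(ip, 10000, 6384, NULL) is called so
 * that the bytes in [10000, 16384) read back as zeroes rather than stale
 * block contents once the file is extended.
 */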
450 :
451 : static int
452 6287111 : xfs_dio_write_end_io(
453 : struct kiocb *iocb,
454 : ssize_t size,
455 : int error,
456 : unsigned flags)
457 : {
458 6287111 : struct inode *inode = file_inode(iocb->ki_filp);
459 6287111 : struct xfs_inode *ip = XFS_I(inode);
460 6287111 : loff_t offset = iocb->ki_pos;
461 6287111 : unsigned int nofs_flag;
462 :
463 6287111 : trace_xfs_end_io_direct_write(ip, offset, size);
464 :
465 12574090 : if (xfs_is_shutdown(ip->i_mount))
466 : return -EIO;
467 :
468 6286376 : if (error)
469 : return error;
470 5494092 : if (!size)
471 : return 0;
472 :
473 : /*
474 : * Capture amount written on completion as we can't reliably account
475 : * for it on submission.
476 : */
477 5494092 : XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
478 :
479 : /*
480 : * We can allocate memory here while doing writeback on behalf of
481 : * memory reclaim. To avoid memory allocation deadlocks set the
482 : * task-wide nofs context for the following operations.
483 : */
484 5494075 : nofs_flag = memalloc_nofs_save();
485 :
486 5494075 : if (flags & IOMAP_DIO_COW) {
487 1103891 : error = xfs_reflink_end_cow(ip, offset, size);
488 1103891 : if (error)
489 29 : goto out;
490 : }
491 :
492 : /*
493 : * Unwritten conversion updates the in-core isize after extent
494 : * conversion but before updating the on-disk size. Updating isize any
495 : * earlier allows a racing dio read to find unwritten extents before
496 : * they are converted.
497 : */
498 5494046 : if (flags & IOMAP_DIO_UNWRITTEN) {
499 2671488 : error = xfs_iomap_write_unwritten(ip, offset, size, true);
500 2671592 : goto out;
501 : }
502 :
503 : /*
504 : * We need to update the in-core inode size here so that we don't end up
505 : * with the on-disk inode size being outside the in-core inode size. We
506 : * have no other method of updating EOF for AIO, so always do it here
507 : * if necessary.
508 : *
509 : * We need to lock the test/set EOF update as we can be racing with
510 : * other IO completions here to update the EOF. Failing to serialise
511 : * here can result in EOF moving backwards and Bad Things Happen when
512 : * that occurs.
513 : *
514 : * As IO completion only ever extends EOF, we can do an unlocked check
515 : * here to avoid taking the spinlock. If we land within the current EOF,
516 : * then we do not need to do an extending update at all, and we don't
517 : * need to take the lock to check this. If we race with an update moving
518 : * EOF, then we'll either still be beyond EOF and need to take the lock,
519 : * or we'll be within EOF and we don't need to take it at all.
520 : */
521 2822558 : if (offset + size <= i_size_read(inode))
522 2612239 : goto out;
523 :
524 210319 : spin_lock(&ip->i_flags_lock);
525 210319 : if (offset + size > i_size_read(inode)) {
526 210319 : i_size_write(inode, offset + size);
527 210319 : spin_unlock(&ip->i_flags_lock);
528 210319 : error = xfs_setfilesize(ip, offset, size);
529 : } else {
530 0 : spin_unlock(&ip->i_flags_lock);
531 : }
532 :
533 5494179 : out:
534 5494179 : memalloc_nofs_restore(nofs_flag);
535 5494179 : return error;
536 : }
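/*
 * Editorial note (not in the original file): the unlocked EOF check above
 * with concrete numbers. For a completion with offset = 4096 and size = 4096
 * against i_size = 16384, offset + size (8192) lies within EOF, so no locked
 * size update is needed; had i_size been 6000, the i_flags_lock would be
 * taken and i_size extended to 8192 before calling xfs_setfilesize().
 */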
537 :
538 : static const struct iomap_dio_ops xfs_dio_write_ops = {
539 : .end_io = xfs_dio_write_end_io,
540 : };
541 :
542 : /*
543 : * Handle block aligned direct I/O writes
544 : */
545 : static noinline ssize_t
546 3445034 : xfs_file_dio_write_aligned(
547 : struct xfs_inode *ip,
548 : struct kiocb *iocb,
549 : struct iov_iter *from)
550 : {
551 3445034 : unsigned int iolock = XFS_IOLOCK_SHARED;
552 3445034 : ssize_t ret;
553 :
554 3445034 : ret = xfs_ilock_iocb(iocb, iolock);
555 3445069 : if (ret)
556 : return ret;
557 3445065 : ret = xfs_file_write_checks(iocb, from, &iolock);
558 3445032 : if (ret)
559 575 : goto out_unlock;
560 :
561 : /*
562 : * We don't need to hold the IOLOCK exclusively across the IO, so demote
563 : * the iolock back to shared if we had to take the exclusive lock in
564 : * xfs_file_write_checks() for other reasons.
565 : */
566 3444457 : if (iolock == XFS_IOLOCK_EXCL) {
567 104697 : xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
568 104696 : iolock = XFS_IOLOCK_SHARED;
569 : }
570 3444456 : trace_xfs_file_direct_write(iocb, from);
571 3444445 : ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
572 : &xfs_dio_write_ops, 0, NULL, 0);
573 3445051 : out_unlock:
574 3445051 : if (iolock)
575 3445072 : xfs_iunlock(ip, iolock);
576 : return ret;
577 : }
578 :
579 : /*
580 : * Handle block unaligned direct I/O writes
581 : *
582 : * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
583 : * them to be done in parallel with reads and other direct I/O writes. However,
584 : * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
585 : * to do sub-block zeroing and that requires serialisation against other direct
586 : * I/O to the same block. In this case we need to serialise the submission of
587 : * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
588 : * In the case where sub-block zeroing is not required, we can do concurrent
589 : * sub-block dios to the same block successfully.
590 : *
591 : * Optimistically submit the I/O using the shared lock first, but use the
592 : * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
593 : * if block allocation or partial block zeroing would be required. In that case
594 : * we try again with the exclusive lock.
595 : */
596 : static noinline ssize_t
597 4589504 : xfs_file_dio_write_unaligned(
598 : struct xfs_inode *ip,
599 : struct kiocb *iocb,
600 : struct iov_iter *from)
601 : {
602 4589504 : size_t isize = i_size_read(VFS_I(ip));
603 4589504 : size_t count = iov_iter_count(from);
604 4589504 : unsigned int iolock = XFS_IOLOCK_SHARED;
605 4589504 : unsigned int flags = IOMAP_DIO_OVERWRITE_ONLY;
606 4589504 : ssize_t ret;
607 :
608 : /*
609 : * Extending writes need exclusivity because of the sub-block zeroing
610 : * that the DIO code always does for partial tail blocks beyond EOF, so
611 : * don't even bother trying the fast path in this case.
612 : */
613 4589504 : if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
614 1743223 : if (iocb->ki_flags & IOCB_NOWAIT)
615 : return -EAGAIN;
616 1743223 : retry_exclusive:
617 2413792 : iolock = XFS_IOLOCK_EXCL;
618 2413792 : flags = IOMAP_DIO_FORCE_WAIT;
619 : }
620 :
621 5260073 : ret = xfs_ilock_iocb(iocb, iolock);
622 5260068 : if (ret)
623 : return ret;
624 :
625 : /*
626 : * We can't properly handle unaligned direct I/O to reflink files yet,
627 : * as we can't unshare a partial block.
628 : */
629 5260065 : if (xfs_is_cow_inode(ip)) {
630 2417249 : trace_xfs_reflink_bounce_dio_write(iocb, from);
631 2417250 : ret = -ENOTBLK;
632 2417250 : goto out_unlock;
633 : }
634 :
635 2842814 : ret = xfs_file_write_checks(iocb, from, &iolock);
636 2842812 : if (ret)
637 66 : goto out_unlock;
638 :
639 : /*
640 : * If we are doing exclusive unaligned I/O, this must be the only I/O
641 : * in-flight. Otherwise we risk data corruption due to unwritten extent
642 : * conversions from the AIO end_io handler. Wait for all other I/O to
643 : * drain first.
644 : */
645 2842746 : if (flags & IOMAP_DIO_FORCE_WAIT)
646 2066550 : inode_dio_wait(VFS_I(ip));
647 :
648 2842746 : trace_xfs_file_direct_write(iocb, from);
649 2842746 : ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
650 : &xfs_dio_write_ops, flags, NULL, 0);
651 :
652 : /*
653 : * Retry unaligned I/O with exclusive blocking semantics if the DIO
654 : * layer rejected it for mapping or locking reasons. If we are doing
655 : * nonblocking user I/O, propagate the error.
656 : */
657 2842746 : if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
658 670571 : ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
659 670571 : xfs_iunlock(ip, iolock);
660 670569 : goto retry_exclusive;
661 : }
662 :
663 2172175 : out_unlock:
664 4589491 : if (iolock)
665 4589491 : xfs_iunlock(ip, iolock);
666 : return ret;
667 : }
668 :
669 : static ssize_t
670 8034534 : xfs_file_dio_write(
671 : struct kiocb *iocb,
672 : struct iov_iter *from)
673 : {
674 8034534 : struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
675 8034534 : struct xfs_buftarg *target = xfs_inode_buftarg(ip);
676 8034534 : size_t count = iov_iter_count(from);
677 :
678 : /* direct I/O must be aligned to device logical sector size */
679 8034534 : if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
680 : return -EINVAL;
681 8034534 : if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
682 4589487 : return xfs_file_dio_write_unaligned(ip, iocb, from);
683 3445047 : return xfs_file_dio_write_aligned(ip, iocb, from);
684 : }
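/*
 * Editorial note (not in the original file): how the dispatch above behaves
 * for a filesystem with 4096-byte blocks (m_blockmask = 0xfff) on a device
 * with 512-byte logical sectors (bt_logical_sectormask = 0x1ff):
 *
 *   pos = 4096, count = 8192  -> sector- and block-aligned, aligned path
 *   pos =  512, count = 1024  -> sector-aligned but sub-block, unaligned path
 *   pos =  100, count =  300  -> not sector-aligned, -EINVAL
 */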
685 :
686 : static noinline ssize_t
687 : xfs_file_dax_write(
688 : struct kiocb *iocb,
689 : struct iov_iter *from)
690 : {
691 : struct inode *inode = iocb->ki_filp->f_mapping->host;
692 : struct xfs_inode *ip = XFS_I(inode);
693 : unsigned int iolock = XFS_IOLOCK_EXCL;
694 : ssize_t ret, error = 0;
695 : loff_t pos;
696 :
697 : ret = xfs_ilock_iocb(iocb, iolock);
698 : if (ret)
699 : return ret;
700 : ret = xfs_file_write_checks(iocb, from, &iolock);
701 : if (ret)
702 : goto out;
703 :
704 : pos = iocb->ki_pos;
705 :
706 : trace_xfs_file_dax_write(iocb, from);
707 : ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
708 : if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
709 : i_size_write(inode, iocb->ki_pos);
710 : error = xfs_setfilesize(ip, pos, ret);
711 : }
712 : out:
713 : if (iolock)
714 : xfs_iunlock(ip, iolock);
715 : if (error)
716 : return error;
717 :
718 : if (ret > 0) {
719 : XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
720 :
721 : /* Handle various SYNC-type writes */
722 : ret = generic_write_sync(iocb, ret);
723 : }
724 : return ret;
725 : }
726 :
727 : STATIC ssize_t
728 49913564 : xfs_file_buffered_write(
729 : struct kiocb *iocb,
730 : struct iov_iter *from)
731 : {
732 49913564 : struct inode *inode = iocb->ki_filp->f_mapping->host;
733 49913564 : struct xfs_inode *ip = XFS_I(inode);
734 49913564 : ssize_t ret;
735 49913564 : bool cleared_space = false;
736 50320147 : unsigned int iolock;
737 :
738 : write_retry:
739 50320147 : iolock = XFS_IOLOCK_EXCL;
740 50320147 : ret = xfs_ilock_iocb(iocb, iolock);
741 50340183 : if (ret)
742 0 : return ret;
743 :
744 50340183 : ret = xfs_file_write_checks(iocb, from, &iolock);
745 50332400 : if (ret)
746 649 : goto out;
747 :
748 50331751 : trace_xfs_file_buffered_write(iocb, from);
749 50339154 : ret = iomap_file_buffered_write(iocb, from,
750 : &xfs_buffered_write_iomap_ops);
751 :
752 : /*
753 : * If we hit a space limit, try to free up some lingering preallocated
754 : * space before returning an error. In the case of ENOSPC, first try to
755 : * write back all dirty inodes to free up some of the excess reserved
756 : * metadata space. This reduces the chances that the eofblocks scan
757 : * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
758 : * also behaves as a filter to prevent too many eofblocks scans from
759 : * running at the same time. Use a synchronous scan to increase the
760 : * effectiveness of the scan.
761 : */
762 50329316 : if (ret == -EDQUOT && !cleared_space) {
763 333 : xfs_iunlock(ip, iolock);
764 333 : xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC);
765 333 : cleared_space = true;
766 333 : goto write_retry;
767 50328983 : } else if (ret == -ENOSPC && !cleared_space) {
768 406270 : struct xfs_icwalk icw = {0};
769 :
770 406270 : cleared_space = true;
771 406270 : xfs_flush_inodes(ip->i_mount);
772 :
773 406166 : xfs_iunlock(ip, iolock);
774 406066 : icw.icw_flags = XFS_ICWALK_FLAG_SYNC;
775 406066 : xfs_blockgc_free_space(ip->i_mount, &icw);
776 406250 : goto write_retry;
777 : }
778 :
779 49922713 : out:
780 49923362 : if (iolock)
781 49933547 : xfs_iunlock(ip, iolock);
782 :
783 49923458 : if (ret > 0) {
784 49562558 : XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
785 : /* Handle various SYNC-type writes */
786 49562448 : ret = generic_write_sync(iocb, ret);
787 : }
788 : return ret;
789 : }
790 :
791 : STATIC ssize_t
792 55533279 : xfs_file_write_iter(
793 : struct kiocb *iocb,
794 : struct iov_iter *from)
795 : {
796 55533279 : struct inode *inode = iocb->ki_filp->f_mapping->host;
797 55533279 : struct xfs_inode *ip = XFS_I(inode);
798 55533279 : ssize_t ret;
799 55533279 : size_t ocount = iov_iter_count(from);
800 :
801 55533279 : XFS_STATS_INC(ip->i_mount, xs_write_calls);
802 :
803 55557287 : if (ocount == 0)
804 : return 0;
805 :
806 111114510 : if (xfs_is_shutdown(ip->i_mount))
807 : return -EIO;
808 :
809 55547740 : if (IS_DAX(inode))
810 : return xfs_file_dax_write(iocb, from);
811 :
812 55547740 : if (iocb->ki_flags & IOCB_DIRECT) {
813 : /*
814 : * Allow a directio write to fall back to a buffered
815 : * write *only* in the case that we're doing a reflink
816 : * CoW. In all other directio scenarios we do not
817 : * allow an operation to fall back to buffered mode.
818 : */
819 8034528 : ret = xfs_file_dio_write(iocb, from);
820 8034489 : if (ret != -ENOTBLK)
821 : return ret;
822 : }
823 :
824 49930562 : return xfs_file_buffered_write(iocb, from);
825 : }
826 :
827 : static void
828 : xfs_wait_dax_page(
829 : struct inode *inode)
830 : {
831 : struct xfs_inode *ip = XFS_I(inode);
832 :
833 : xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
834 : schedule();
835 : xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
836 : }
837 :
838 : int
839 16480174 : xfs_break_dax_layouts(
840 : struct inode *inode,
841 : bool *retry)
842 : {
843 16480174 : struct page *page;
844 :
845 16480174 : ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
846 :
847 16480175 : page = dax_layout_busy_page(inode->i_mapping);
848 16480175 : if (!page)
849 16480175 : return 0;
850 :
851 : *retry = true;
852 : return ___wait_var_event(&page->_refcount,
853 : atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
854 : 0, 0, xfs_wait_dax_page(inode));
855 : }
856 :
857 : int
858 88706769 : xfs_break_layouts(
859 : struct inode *inode,
860 : uint *iolock,
861 : enum layout_break_reason reason)
862 : {
863 88706769 : bool retry;
864 88706769 : int error;
865 :
866 88706769 : ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
867 :
868 88710683 : do {
869 88710683 : retry = false;
870 88710683 : switch (reason) {
871 16480167 : case BREAK_UNMAP:
872 16480167 : error = xfs_break_dax_layouts(inode, &retry);
873 16480173 : if (error || retry)
874 : break;
875 88710680 : fallthrough;
876 : case BREAK_WRITE:
877 88710680 : error = xfs_break_leased_layouts(inode, iolock, &retry);
878 88710680 : break;
879 : default:
880 0 : WARN_ON_ONCE(1);
881 0 : error = -EINVAL;
882 : }
883 88712684 : } while (error == 0 && retry);
884 :
885 88711866 : return error;
886 : }
887 :
888 : /* Does this file, inode, or mount want synchronous writes? */
889 171321549 : static inline bool xfs_file_sync_writes(struct file *filp)
890 : {
891 171321549 : struct xfs_inode *ip = XFS_I(file_inode(filp));
892 :
893 171321549 : if (xfs_has_wsync(ip->i_mount))
894 : return true;
895 171321541 : if (filp->f_flags & (__O_SYNC | O_DSYNC))
896 : return true;
897 171314409 : if (IS_SYNC(file_inode(filp)))
898 4 : return true;
899 :
900 : return false;
901 : }
902 :
903 : #define XFS_FALLOC_FL_SUPPORTED \
904 : (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
905 : FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \
906 : FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)
907 :
908 : STATIC long
909 13378462 : xfs_file_fallocate(
910 : struct file *file,
911 : int mode,
912 : loff_t offset,
913 : loff_t len)
914 : {
915 13378462 : struct inode *inode = file_inode(file);
916 13378462 : struct xfs_inode *ip = XFS_I(inode);
917 13378462 : long error;
918 13378462 : uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
919 13378462 : loff_t new_size = 0;
920 13378462 : bool do_file_insert = false;
921 :
922 13378462 : if (!S_ISREG(inode->i_mode))
923 : return -EINVAL;
924 13378462 : if (mode & ~XFS_FALLOC_FL_SUPPORTED)
925 : return -EOPNOTSUPP;
926 :
927 13378462 : xfs_ilock(ip, iolock);
928 13378479 : error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
929 13378485 : if (error)
930 0 : goto out_unlock;
931 :
932 : /*
933 : * Must wait for all AIO to complete before we continue as AIO can
934 : * change the file size on completion without holding any locks we
935 : * currently hold. We must do this first because AIO can update both
936 : * the on disk and in memory inode sizes, and the operations that follow
937 : * require the in-memory size to be fully up-to-date.
938 : */
939 13378485 : inode_dio_wait(inode);
940 :
941 : /*
942 : * Now AIO and DIO has drained we flush and (if necessary) invalidate
943 : * the cached range over the first operation we are about to run.
944 : *
945 : * We care about zero and collapse here because they both run a hole
946 : * punch over the range first. Because that can zero data, and the range
947 : * of invalidation for the shift operations is much larger, we still do
948 : * the required flush for collapse in xfs_prepare_shift().
949 : *
950 : * Insert has the same range requirements as collapse, and we extend the
951 : * file first which can zero data. Hence insert has the same
952 : * flush/invalidate requirements as collapse and so they are both
953 : * handled at the right time by xfs_prepare_shift().
954 : */
955 13378453 : if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
956 : FALLOC_FL_COLLAPSE_RANGE)) {
957 9702637 : error = xfs_flush_unmap_range(ip, offset, len);
958 9702644 : if (error)
959 147 : goto out_unlock;
960 : }
961 :
962 13378313 : error = file_modified(file);
963 13378335 : if (error)
964 5 : goto out_unlock;
965 :
966 13378330 : if (mode & FALLOC_FL_PUNCH_HOLE) {
967 7451151 : error = xfs_free_file_space(ip, offset, len);
968 7451154 : if (error)
969 14926 : goto out_unlock;
970 5927179 : } else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
971 733649 : if (!xfs_is_falloc_aligned(ip, offset, len)) {
972 205920 : error = -EINVAL;
973 205920 : goto out_unlock;
974 : }
975 :
976 : /*
977 : * There is no need to overlap collapse range with EOF,
978 : * in which case it is effectively a truncate operation
979 : */
980 527729 : if (offset + len >= i_size_read(inode)) {
981 87858 : error = -EINVAL;
982 87858 : goto out_unlock;
983 : }
984 :
985 439871 : new_size = i_size_read(inode) - len;
986 :
987 439871 : error = xfs_collapse_file_space(ip, offset, len);
988 439871 : if (error)
989 1849 : goto out_unlock;
990 5193530 : } else if (mode & FALLOC_FL_INSERT_RANGE) {
991 629975 : loff_t isize = i_size_read(inode);
992 :
993 629975 : if (!xfs_is_falloc_aligned(ip, offset, len)) {
994 206219 : error = -EINVAL;
995 206219 : goto out_unlock;
996 : }
997 :
998 : /*
999 : * New inode size must not exceed ->s_maxbytes, accounting for
1000 : * possible signed overflow.
1001 : */
1002 423756 : if (inode->i_sb->s_maxbytes - isize < len) {
1003 2 : error = -EFBIG;
1004 2 : goto out_unlock;
1005 : }
1006 423754 : new_size = isize + len;
1007 :
1008 : /* Offset should be less than i_size */
1009 423754 : if (offset >= isize) {
1010 71624 : error = -EINVAL;
1011 71624 : goto out_unlock;
1012 : }
1013 : do_file_insert = true;
1014 : } else {
1015 4563555 : if (!(mode & FALLOC_FL_KEEP_SIZE) &&
1016 2111455 : offset + len > i_size_read(inode)) {
1017 1149292 : new_size = offset + len;
1018 1149292 : error = inode_newsize_ok(inode, new_size);
1019 1149292 : if (error)
1020 2 : goto out_unlock;
1021 : }
1022 :
1023 4563553 : if (mode & FALLOC_FL_ZERO_RANGE) {
1024 : /*
1025 : * Punch a hole and prealloc the range. We use a hole
1026 : * punch rather than unwritten extent conversion for two
1027 : * reasons:
1028 : *
1029 : * 1.) Hole punch handles partial block zeroing for us.
1030 : * 2.) If prealloc returns ENOSPC, the file range is
1031 : * still zero-valued by virtue of the hole punch.
1032 : */
1033 1517694 : unsigned int blksize = i_blocksize(inode);
1034 :
1035 1517695 : trace_xfs_zero_file_space(ip);
1036 :
1037 1517693 : error = xfs_free_file_space(ip, offset, len);
1038 1517695 : if (error)
1039 8547 : goto out_unlock;
1040 :
1041 1509148 : len = round_up(offset + len, blksize) -
1042 1509148 : round_down(offset, blksize);
1043 1509148 : offset = round_down(offset, blksize);
1044 3045859 : } else if (mode & FALLOC_FL_UNSHARE_RANGE) {
1045 66 : error = xfs_reflink_unshare(ip, offset, len);
1046 66 : if (error)
1047 2 : goto out_unlock;
1048 : } else {
1049 : /*
1050 : * If always_cow mode we can't use preallocations and
1051 : * thus should not create them.
1052 : */
1053 3045793 : if (xfs_is_always_cow_inode(ip)) {
1054 0 : error = -EOPNOTSUPP;
1055 0 : goto out_unlock;
1056 : }
1057 : }
1058 :
1059 4555008 : if (!xfs_is_always_cow_inode(ip)) {
1060 4555011 : error = xfs_alloc_file_space(ip, offset, len);
1061 4555008 : if (error)
1062 88706 : goto out_unlock;
1063 : }
1064 : }
1065 :
1066 : /* Change file size if needed */
1067 12692680 : if (new_size) {
1068 1932953 : struct iattr iattr;
1069 :
1070 1932953 : iattr.ia_valid = ATTR_SIZE;
1071 1932953 : iattr.ia_size = new_size;
1072 3865906 : error = xfs_vn_setattr_size(file_mnt_idmap(file),
1073 : file_dentry(file), &iattr);
1074 1932952 : if (error)
1075 334 : goto out_unlock;
1076 : }
1077 :
1078 : /*
1079 : * Perform hole insertion now that the file size has been
1080 : * updated so that if we crash during the operation we don't
1081 : * leave shifted extents past EOF and hence losing access to
1082 : * the data that is contained within them.
1083 : */
1084 12692345 : if (do_file_insert) {
1085 352028 : error = xfs_insert_file_space(ip, offset, len);
1086 352028 : if (error)
1087 1784 : goto out_unlock;
1088 : }
1089 :
1090 12690561 : if (xfs_file_sync_writes(file))
1091 7128 : error = xfs_log_force_inode(ip);
1092 :
1093 12683433 : out_unlock:
1094 13378486 : xfs_iunlock(ip, iolock);
1095 13378486 : return error;
1096 : }
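/*
 * Editorial sketch (not in the original file): the block rounding done for
 * FALLOC_FL_ZERO_RANGE above, with an assumed 4096-byte block size.
 */
#if 0
	/* offset = 1000, len = 5000, blksize = 4096 */
	len = round_up(offset + len, blksize) -   /* round_up(6000, 4096)   = 8192 */
	      round_down(offset, blksize);        /* round_down(1000, 4096) = 0    */
	offset = round_down(offset, blksize);
	/* preallocation then covers bytes [0, 8192), i.e. whole blocks 0-1 */
#endif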
1097 :
1098 : STATIC int
1099 2117253 : xfs_file_fadvise(
1100 : struct file *file,
1101 : loff_t start,
1102 : loff_t end,
1103 : int advice)
1104 : {
1105 2117253 : struct xfs_inode *ip = XFS_I(file_inode(file));
1106 2117253 : int ret;
1107 2117253 : int lockflags = 0;
1108 :
1109 : /*
1110 : * Operations creating pages in page cache need protection from hole
1111 : * punching and similar ops
1112 : */
1113 2117253 : if (advice == POSIX_FADV_WILLNEED) {
1114 0 : lockflags = XFS_IOLOCK_SHARED;
1115 0 : xfs_ilock(ip, lockflags);
1116 : }
1117 2117253 : ret = generic_fadvise(file, start, end, advice);
1118 2117290 : if (lockflags)
1119 0 : xfs_iunlock(ip, lockflags);
1120 2117290 : return ret;
1121 : }
1122 :
1123 : STATIC loff_t
1124 155167707 : xfs_file_remap_range(
1125 : struct file *file_in,
1126 : loff_t pos_in,
1127 : struct file *file_out,
1128 : loff_t pos_out,
1129 : loff_t len,
1130 : unsigned int remap_flags)
1131 : {
1132 155167707 : struct inode *inode_in = file_inode(file_in);
1133 155167707 : struct xfs_inode *src = XFS_I(inode_in);
1134 155167707 : struct inode *inode_out = file_inode(file_out);
1135 155167707 : struct xfs_inode *dest = XFS_I(inode_out);
1136 155167707 : struct xfs_mount *mp = src->i_mount;
1137 155167707 : loff_t remapped = 0;
1138 155167707 : xfs_extlen_t cowextsize;
1139 155167707 : int ret;
1140 :
1141 155167707 : if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
1142 : return -EINVAL;
1143 :
1144 155167707 : if (!xfs_has_reflink(mp))
1145 : return -EOPNOTSUPP;
1146 :
1147 208052910 : if (xfs_is_shutdown(mp))
1148 : return -EIO;
1149 :
1150 : /* Prepare and then clone file data. */
1151 104023893 : ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
1152 : &len, remap_flags);
1153 104026798 : if (ret || len == 0)
1154 24121121 : return ret;
1155 :
1156 79905677 : trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
1157 :
1158 79906231 : ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
1159 : &remapped);
1160 79906480 : if (ret)
1161 590168 : goto out_unlock;
1162 :
1163 : /*
1164 : * Carry the cowextsize hint from src to dest if we're sharing the
1165 : * entire source file to the entire destination file, the source file
1166 : * has a cowextsize hint, and the destination file does not.
1167 : */
1168 79316312 : cowextsize = 0;
1169 79316312 : if (pos_in == 0 && len == i_size_read(inode_in) &&
1170 41903 : (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
1171 48 : pos_out == 0 && len >= i_size_read(inode_out) &&
1172 46 : !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE))
1173 6 : cowextsize = src->i_cowextsize;
1174 :
1175 79316312 : ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
1176 : remap_flags);
1177 79316199 : if (ret)
1178 0 : goto out_unlock;
1179 :
1180 79316199 : if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
1181 525 : xfs_log_force_inode(dest);
1182 79315688 : out_unlock:
1183 79905872 : xfs_iunlock2_io_mmap(src, dest);
1184 79906196 : if (ret)
1185 590120 : trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
1186 79906252 : return remapped > 0 ? remapped : ret;
1187 : }
1188 :
1189 : STATIC int
1190 395136907 : xfs_file_open(
1191 : struct inode *inode,
1192 : struct file *file)
1193 : {
1194 790273814 : if (xfs_is_shutdown(XFS_M(inode->i_sb)))
1195 : return -EIO;
1196 395123818 : file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC |
1197 : FMODE_DIO_PARALLEL_WRITE | FMODE_CAN_ODIRECT;
1198 395123818 : return generic_file_open(inode, file);
1199 : }
1200 :
1201 : STATIC int
1202 73895682 : xfs_dir_open(
1203 : struct inode *inode,
1204 : struct file *file)
1205 : {
1206 73895682 : struct xfs_inode *ip = XFS_I(inode);
1207 73895682 : unsigned int mode;
1208 73895682 : int error;
1209 :
1210 73895682 : error = xfs_file_open(inode, file);
1211 73903849 : if (error)
1212 : return error;
1213 :
1214 : /*
1215 : * If there are any blocks, read-ahead block 0 as we're almost
1216 : * certain to have the next operation be a read there.
1217 : */
1218 73902806 : mode = xfs_ilock_data_map_shared(ip);
1219 73900210 : if (ip->i_df.if_nextents > 0)
1220 5869471 : error = xfs_dir3_data_readahead(ip, 0, 0);
1221 73900275 : xfs_iunlock(ip, mode);
1222 73900275 : return error;
1223 : }
1224 :
1225 : STATIC int
1226 321212867 : xfs_file_release(
1227 : struct inode *inode,
1228 : struct file *filp)
1229 : {
1230 321212867 : return xfs_release(XFS_I(inode));
1231 : }
1232 :
1233 : STATIC int
1234 143554438 : xfs_file_readdir(
1235 : struct file *file,
1236 : struct dir_context *ctx)
1237 : {
1238 143554438 : struct inode *inode = file_inode(file);
1239 143554438 : xfs_inode_t *ip = XFS_I(inode);
1240 143554438 : size_t bufsize;
1241 :
1242 : /*
1243 : * The Linux API doesn't pass the total size of the buffer
1244 : * we read into down to the filesystem. With the filldir concept
1245 : * it's not needed for correct information, but the XFS dir2 leaf
1246 : * code wants an estimate of the buffer size to calculate its
1247 : * readahead window and size the buffers used for mapping to
1248 : * physical blocks.
1249 : *
1250 : * Try to give it an estimate that's good enough, maybe at some
1251 : * point we can change the ->readdir prototype to include the
1252 : * buffer size. For now we use the current glibc buffer size.
1253 : */
1254 143554438 : bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size);
1255 :
1256 143554438 : return xfs_readdir(NULL, ip, ctx, bufsize);
1257 : }
1258 :
1259 : STATIC loff_t
1260 35014929 : xfs_file_llseek(
1261 : struct file *file,
1262 : loff_t offset,
1263 : int whence)
1264 : {
1265 35014929 : struct inode *inode = file->f_mapping->host;
1266 :
1267 70029858 : if (xfs_is_shutdown(XFS_I(inode)->i_mount))
1268 : return -EIO;
1269 :
1270 35014926 : switch (whence) {
1271 34851959 : default:
1272 34851959 : return generic_file_llseek(file, offset, whence);
1273 326 : case SEEK_HOLE:
1274 326 : offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
1275 326 : break;
1276 162641 : case SEEK_DATA:
1277 162641 : offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
1278 162641 : break;
1279 : }
1280 :
1281 162967 : if (offset < 0)
1282 : return offset;
1283 125937 : return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
1284 : }
1285 :
1286 : #ifdef CONFIG_FS_DAX
1287 : static inline vm_fault_t
1288 : xfs_dax_fault(
1289 : struct vm_fault *vmf,
1290 : enum page_entry_size pe_size,
1291 : bool write_fault,
1292 : pfn_t *pfn)
1293 : {
1294 : return dax_iomap_fault(vmf, pe_size, pfn, NULL,
1295 : (write_fault && !vmf->cow_page) ?
1296 : &xfs_dax_write_iomap_ops :
1297 : &xfs_read_iomap_ops);
1298 : }
1299 : #else
1300 : static inline vm_fault_t
1301 : xfs_dax_fault(
1302 : struct vm_fault *vmf,
1303 : enum page_entry_size pe_size,
1304 : bool write_fault,
1305 : pfn_t *pfn)
1306 : {
1307 : ASSERT(0);
1308 : return VM_FAULT_SIGBUS;
1309 : }
1310 : #endif
1311 :
1312 : /*
1313 : * Locking for serialisation of IO during page faults. This results in a lock
1314 : * ordering of:
1315 : *
1316 : * mmap_lock (MM)
1317 : * sb_start_pagefault(vfs, freeze)
1318 : * invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
1319 : * page_lock (MM)
1320 : * i_lock (XFS - extent map serialisation)
1321 : */
1322 : static vm_fault_t
1323 14109953 : __xfs_filemap_fault(
1324 : struct vm_fault *vmf,
1325 : enum page_entry_size pe_size,
1326 : bool write_fault)
1327 : {
1328 14109953 : struct inode *inode = file_inode(vmf->vma->vm_file);
1329 14109953 : struct xfs_inode *ip = XFS_I(inode);
1330 14109953 : vm_fault_t ret;
1331 :
1332 14109953 : trace_xfs_filemap_fault(ip, pe_size, write_fault);
1333 :
1334 14111748 : if (write_fault) {
1335 3446534 : sb_start_pagefault(inode->i_sb);
1336 3446583 : file_update_time(vmf->vma->vm_file);
1337 : }
1338 :
1339 14111804 : if (IS_DAX(inode)) {
1340 : pfn_t pfn;
1341 :
1342 : xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1343 : ret = xfs_dax_fault(vmf, pe_size, write_fault, &pfn);
1344 : if (ret & VM_FAULT_NEEDDSYNC)
1345 : ret = dax_finish_sync_fault(vmf, pe_size, pfn);
1346 : xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1347 : } else {
1348 14111804 : if (write_fault) {
1349 3446591 : xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1350 3446595 : ret = iomap_page_mkwrite(vmf,
1351 : &xfs_page_mkwrite_iomap_ops);
1352 3446566 : xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1353 : } else {
1354 10665213 : ret = filemap_fault(vmf);
1355 : }
1356 : }
1357 :
1358 14112534 : if (write_fault)
1359 3446593 : sb_end_pagefault(inode->i_sb);
1360 14112528 : return ret;
1361 : }
1362 :
1363 : static inline bool
1364 : xfs_is_write_fault(
1365 : struct vm_fault *vmf)
1366 : {
1367 : return (vmf->flags & FAULT_FLAG_WRITE) &&
1368 : (vmf->vma->vm_flags & VM_SHARED);
1369 : }
1370 :
1371 : static vm_fault_t
1372 10665364 : xfs_filemap_fault(
1373 : struct vm_fault *vmf)
1374 : {
1375 : /* DAX can shortcut the normal fault path on write faults! */
1376 10665364 : return __xfs_filemap_fault(vmf, PE_SIZE_PTE,
1377 : IS_DAX(file_inode(vmf->vma->vm_file)) &&
1378 : xfs_is_write_fault(vmf));
1379 : }
1380 :
1381 : static vm_fault_t
1382 0 : xfs_filemap_huge_fault(
1383 : struct vm_fault *vmf,
1384 : enum page_entry_size pe_size)
1385 : {
1386 0 : if (!IS_DAX(file_inode(vmf->vma->vm_file)))
1387 0 : return VM_FAULT_FALLBACK;
1388 :
1389 : /* DAX can shortcut the normal fault path on write faults! */
1390 : return __xfs_filemap_fault(vmf, pe_size,
1391 : xfs_is_write_fault(vmf));
1392 : }
1393 :
1394 : static vm_fault_t
1395 3446484 : xfs_filemap_page_mkwrite(
1396 : struct vm_fault *vmf)
1397 : {
1398 3446484 : return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
1399 : }
1400 :
1401 : /*
1402 : * pfn_mkwrite was originally intended to ensure we capture time stamp updates
1403 : * on write faults. In reality, it needs to serialise against truncate and
1404 : * prepare memory for writing so handle it as a standard write fault.
1405 : */
1406 : static vm_fault_t
1407 0 : xfs_filemap_pfn_mkwrite(
1408 : struct vm_fault *vmf)
1409 : {
1410 :
1411 0 : return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
1412 : }
1413 :
1414 : static const struct vm_operations_struct xfs_file_vm_ops = {
1415 : .fault = xfs_filemap_fault,
1416 : .huge_fault = xfs_filemap_huge_fault,
1417 : .map_pages = filemap_map_pages,
1418 : .page_mkwrite = xfs_filemap_page_mkwrite,
1419 : .pfn_mkwrite = xfs_filemap_pfn_mkwrite,
1420 : };
1421 :
1422 : STATIC int
1423 7551264 : xfs_file_mmap(
1424 : struct file *file,
1425 : struct vm_area_struct *vma)
1426 : {
1427 7551264 : struct inode *inode = file_inode(file);
1428 7551264 : struct xfs_buftarg *target = xfs_inode_buftarg(XFS_I(inode));
1429 :
1430 : /*
1431 : * We don't support synchronous mappings for non-DAX files and
1432 : * for DAX files if the underlying dax_device is not synchronous.
1433 : */
1434 7551264 : if (!daxdev_mapping_supported(vma, target->bt_daxdev))
1435 : return -EOPNOTSUPP;
1436 :
1437 7551265 : file_accessed(file);
1438 7551279 : vma->vm_ops = &xfs_file_vm_ops;
1439 7551279 : if (IS_DAX(inode))
1440 : vm_flags_set(vma, VM_HUGEPAGE);
1441 7551279 : return 0;
1442 : }
1443 :
1444 : const struct file_operations xfs_file_operations = {
1445 : .llseek = xfs_file_llseek,
1446 : .read_iter = xfs_file_read_iter,
1447 : .write_iter = xfs_file_write_iter,
1448 : .splice_read = xfs_file_splice_read,
1449 : .splice_write = iter_file_splice_write,
1450 : .iopoll = iocb_bio_iopoll,
1451 : .unlocked_ioctl = xfs_file_ioctl,
1452 : #ifdef CONFIG_COMPAT
1453 : .compat_ioctl = xfs_file_compat_ioctl,
1454 : #endif
1455 : .mmap = xfs_file_mmap,
1456 : .mmap_supported_flags = MAP_SYNC,
1457 : .open = xfs_file_open,
1458 : .release = xfs_file_release,
1459 : .fsync = xfs_file_fsync,
1460 : .get_unmapped_area = thp_get_unmapped_area,
1461 : .fallocate = xfs_file_fallocate,
1462 : .fadvise = xfs_file_fadvise,
1463 : .remap_file_range = xfs_file_remap_range,
1464 : };
1465 :
1466 : const struct file_operations xfs_dir_file_operations = {
1467 : .open = xfs_dir_open,
1468 : .read = generic_read_dir,
1469 : .iterate_shared = xfs_file_readdir,
1470 : .llseek = generic_file_llseek,
1471 : .unlocked_ioctl = xfs_file_ioctl,
1472 : #ifdef CONFIG_COMPAT
1473 : .compat_ioctl = xfs_file_compat_ioctl,
1474 : #endif
1475 : .fsync = xfs_dir_fsync,
1476 : };