1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * Copyright (c) 2000-2005 Silicon Graphics, Inc.
4 : * All Rights Reserved.
5 : */
6 : #include "xfs.h"
7 : #include "xfs_fs.h"
8 : #include "xfs_shared.h"
9 : #include "xfs_format.h"
10 : #include "xfs_log_format.h"
11 : #include "xfs_trans_resv.h"
12 : #include "xfs_mount.h"
13 : #include "xfs_inode.h"
14 : #include "xfs_trans.h"
15 : #include "xfs_inode_item.h"
16 : #include "xfs_bmap.h"
17 : #include "xfs_bmap_util.h"
18 : #include "xfs_dir2.h"
19 : #include "xfs_dir2_priv.h"
20 : #include "xfs_ioctl.h"
21 : #include "xfs_trace.h"
22 : #include "xfs_log.h"
23 : #include "xfs_icache.h"
24 : #include "xfs_pnfs.h"
25 : #include "xfs_iomap.h"
26 : #include "xfs_reflink.h"
27 :
28 : #include <linux/dax.h>
29 : #include <linux/falloc.h>
30 : #include <linux/backing-dev.h>
31 : #include <linux/mman.h>
32 : #include <linux/fadvise.h>
33 : #include <linux/mount.h>
34 :
35 : static const struct vm_operations_struct xfs_file_vm_ops;
36 :
37 : /*
38 : * Decide if the given file range is aligned to the size of the fundamental
39 : * allocation unit for the file.
40 : */
41 : static bool
42 4939099 : xfs_is_falloc_aligned(
43 : struct xfs_inode *ip,
44 : loff_t pos,
45 : long long int len)
46 : {
47 4939099 : struct xfs_mount *mp = ip->i_mount;
48 4939099 : uint64_t mask;
49 :
50 4939099 : if (XFS_IS_REALTIME_INODE(ip)) {
51 3773610 : if (!is_power_of_2(mp->m_sb.sb_rextsize)) {
52 265827 : u64 rextbytes;
53 265827 : u32 mod;
54 :
55 265827 : rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
56 265827 : div_u64_rem(pos, rextbytes, &mod);
57 265827 : if (mod)
58 : return false;
59 246562 : div_u64_rem(len, rextbytes, &mod);
60 246562 : return mod == 0;
61 : }
62 1620978 : mask = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize) - 1;
63 : } else {
64 3052294 : mask = mp->m_sb.sb_blocksize - 1;
65 : }
66 :
67 4673272 : return !((pos | len) & mask);
68 : }
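
/*
 * Worked example for the power-of-two path above (illustrative, assuming
 * a 4096 byte allocation unit, i.e. mask = 0xfff):
 *
 *	pos = 8192, len = 4096:	(0x2000 | 0x1000) & 0xfff == 0	-> aligned
 *	pos = 8192, len = 1000:	(0x2000 | 0x03e8) & 0xfff != 0	-> not aligned
 */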
69 :
70 : /*
71 : * Fsync operations on directories are much simpler than on regular files,
72 : * as there is no file data to flush, and thus also no need for explicit
73 : * cache flush operations; nor are there any non-transaction metadata
74 : * updates on directories.
75 : */
76 : STATIC int
77 499229 : xfs_dir_fsync(
78 : struct file *file,
79 : loff_t start,
80 : loff_t end,
81 : int datasync)
82 : {
83 499229 : struct xfs_inode *ip = XFS_I(file->f_mapping->host);
84 :
85 499229 : trace_xfs_dir_fsync(ip);
86 499218 : return xfs_log_force_inode(ip);
87 : }
88 :
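/*
 * Return the log commit sequence that an fsync needs to force, or 0 if
 * there is nothing to do: the inode is not pinned, or only timestamp
 * updates are pending and the caller asked for fdatasync semantics.
 */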
89 : static xfs_csn_t
90 3863870 : xfs_fsync_seq(
91 : struct xfs_inode *ip,
92 : bool datasync)
93 : {
94 3863870 : if (!xfs_ipincount(ip))
95 : return 0;
96 3862657 : if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
97 : return 0;
98 3575166 : return ip->i_itemp->ili_commit_seq;
99 : }
100 :
101 : /*
102 : * All metadata updates are logged, which means that we just have to flush the
103 : * log up to the latest LSN that touched the inode.
104 : *
105 : * If we have concurrent fsync/fdatasync() calls, we need them to all block on
106 : * the log force before we clear the ili_fsync_fields field. This ensures that
107 : * we don't get a racing sync operation that does not wait for the metadata to
108 : * hit the journal before returning. If we race with clearing ili_fsync_fields,
109 : * then all that will happen is the log force will do nothing as the lsn will
110 : * already be on disk. We can't race with setting ili_fsync_fields because that
111 : * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
112 : * shared until after ili_fsync_fields is cleared.
113 : */
114 : static int
115 3863967 : xfs_fsync_flush_log(
116 : struct xfs_inode *ip,
117 : bool datasync,
118 : int *log_flushed)
119 : {
120 3863967 : int error = 0;
121 3863967 : xfs_csn_t seq;
122 :
123 3863967 : xfs_ilock(ip, XFS_ILOCK_SHARED);
124 3863871 : seq = xfs_fsync_seq(ip, datasync);
125 3863613 : if (seq) {
126 3575195 : error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
127 : log_flushed);
128 :
129 3574938 : spin_lock(&ip->i_itemp->ili_lock);
130 3575222 : ip->i_itemp->ili_fsync_fields = 0;
131 3575222 : spin_unlock(&ip->i_itemp->ili_lock);
132 : }
133 3863676 : xfs_iunlock(ip, XFS_ILOCK_SHARED);
134 3863110 : return error;
135 : }
136 :
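/*
 * Flush everything fsync()/fdatasync() needs for this inode: write back
 * and wait on dirty page cache, flush the relevant device write caches,
 * and force the log up to the last modification if the inode is pinned.
 */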
137 : STATIC int
138 14441839 : xfs_file_fsync(
139 : struct file *file,
140 : loff_t start,
141 : loff_t end,
142 : int datasync)
143 : {
144 14441839 : struct xfs_inode *ip = XFS_I(file->f_mapping->host);
145 14441839 : struct xfs_mount *mp = ip->i_mount;
146 14441839 : int error, err2;
147 14441839 : int log_flushed = 0;
148 :
149 14441839 : trace_xfs_file_fsync(ip);
150 :
151 14440848 : error = file_write_and_wait_range(file, start, end);
152 14443036 : if (error)
153 : return error;
154 :
155 28879358 : if (xfs_is_shutdown(mp))
156 : return -EIO;
157 :
158 14437469 : xfs_iflags_clear(ip, XFS_ITRUNCATED);
159 :
160 : /*
161 : * If we have an RT and/or log subvolume we need to make sure to flush
162 : * the write cache of the device used for file data first. This is to
163 : * ensure newly written file data makes it to disk before logging the new
164 : * inode size in case of an extending write.
165 : */
166 14439596 : if (XFS_IS_REALTIME_INODE(ip))
167 6250565 : error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
168 8189031 : else if (mp->m_logdev_targp != mp->m_ddev_targp)
169 414584 : error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
170 :
171 : /*
172 : * Any inode that has dirty modifications in the log is pinned. The
173 : * racy check here for a pinned inode will not catch modifications
174 : * that happen concurrently to the fsync call, but fsync semantics
175 : * only require syncing previously completed I/O.
176 : */
177 14432814 : if (xfs_ipincount(ip)) {
178 3863873 : err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
179 3863077 : if (err2 && !error)
180 1108 : error = err2;
181 : }
182 :
183 : /*
184 : * If we only have a single device, and the log force above was
185 : * a no-op, we might have to flush the data device cache here.
186 : * This can only happen for fdatasync/O_DSYNC if we were overwriting
187 : * an already allocated file and thus do not have any metadata to
188 : * commit.
189 : */
190 14432018 : if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
191 6330955 : mp->m_logdev_targp == mp->m_ddev_targp) {
192 6154491 : err2 = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
193 6149372 : if (err2 && !error)
194 317 : error = err2;
195 : }
196 :
197 : return error;
198 : }
199 :
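/*
 * Take the inode lock for an iocb: trylock and return -EAGAIN for
 * IOCB_NOWAIT callers, block until the lock is available otherwise.
 */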
200 : static int
201 1170515961 : xfs_ilock_iocb(
202 : struct kiocb *iocb,
203 : unsigned int lock_mode)
204 : {
205 1170515961 : struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
206 :
207 1170515961 : if (iocb->ki_flags & IOCB_NOWAIT) {
208 0 : if (!xfs_ilock_nowait(ip, lock_mode))
209 0 : return -EAGAIN;
210 : } else {
211 1170515961 : xfs_ilock(ip, lock_mode);
212 : }
213 :
214 : return 0;
215 : }
216 :
217 : STATIC ssize_t
218 524209125 : xfs_file_dio_read(
219 : struct kiocb *iocb,
220 : struct iov_iter *to)
221 : {
222 524209125 : struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
223 524209125 : ssize_t ret;
224 :
225 524209125 : trace_xfs_file_direct_read(iocb, to);
226 :
227 524208867 : if (!iov_iter_count(to))
228 : return 0; /* skip atime */
229 :
230 524185244 : file_accessed(iocb->ki_filp);
231 :
232 524183861 : ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
233 524183781 : if (ret)
234 : return ret;
235 524183791 : ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0);
236 524186286 : xfs_iunlock(ip, XFS_IOLOCK_SHARED);
237 :
238 524186286 : return ret;
239 : }
240 :
241 : static noinline ssize_t
242 0 : xfs_file_dax_read(
243 : struct kiocb *iocb,
244 : struct iov_iter *to)
245 : {
246 0 : struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
247 0 : ssize_t ret = 0;
248 :
249 0 : trace_xfs_file_dax_read(iocb, to);
250 :
251 0 : if (!iov_iter_count(to))
252 : return 0; /* skip atime */
253 :
254 0 : ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
255 0 : if (ret)
256 : return ret;
257 0 : ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
258 0 : xfs_iunlock(ip, XFS_IOLOCK_SHARED);
259 :
260 0 : file_accessed(iocb->ki_filp);
261 0 : return ret;
262 : }
263 :
264 : STATIC ssize_t
265 379687963 : xfs_file_buffered_read(
266 : struct kiocb *iocb,
267 : struct iov_iter *to)
268 : {
269 379687963 : struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
270 379687963 : ssize_t ret;
271 :
272 379687963 : trace_xfs_file_buffered_read(iocb, to);
273 :
274 379708021 : ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
275 378724458 : if (ret)
276 : return ret;
277 378755985 : ret = generic_file_read_iter(iocb, to);
278 379954919 : xfs_iunlock(ip, XFS_IOLOCK_SHARED);
279 :
280 379954919 : return ret;
281 : }
282 :
283 : STATIC ssize_t
284 905563243 : xfs_file_read_iter(
285 : struct kiocb *iocb,
286 : struct iov_iter *to)
287 : {
288 905563243 : struct inode *inode = file_inode(iocb->ki_filp);
289 905563243 : struct xfs_mount *mp = XFS_I(inode)->i_mount;
290 905563243 : ssize_t ret = 0;
291 :
292 905563243 : XFS_STATS_INC(mp, xs_read_calls);
293 :
294 1807066148 : if (xfs_is_shutdown(mp))
295 : return -EIO;
296 :
297 903530249 : if (IS_DAX(inode))
298 0 : ret = xfs_file_dax_read(iocb, to);
299 903530249 : else if (iocb->ki_flags & IOCB_DIRECT)
300 524209173 : ret = xfs_file_dio_read(iocb, to);
301 : else
302 379321076 : ret = xfs_file_buffered_read(iocb, to);
303 :
304 902911849 : if (ret > 0)
305 373870074 : XFS_STATS_ADD(mp, xs_read_bytes, ret);
306 : return ret;
307 : }
308 :
309 : STATIC ssize_t
310 9923454 : xfs_file_splice_read(
311 : struct file *in,
312 : loff_t *ppos,
313 : struct pipe_inode_info *pipe,
314 : size_t len,
315 : unsigned int flags)
316 : {
317 9923454 : struct inode *inode = file_inode(in);
318 9923454 : struct xfs_inode *ip = XFS_I(inode);
319 9923454 : struct xfs_mount *mp = ip->i_mount;
320 9923454 : ssize_t ret = 0;
321 :
322 9923454 : XFS_STATS_INC(mp, xs_read_calls);
323 :
324 19846882 : if (xfs_is_shutdown(mp))
325 : return -EIO;
326 :
327 9923410 : trace_xfs_file_splice_read(ip, *ppos, len);
328 :
329 9923338 : xfs_ilock(ip, XFS_IOLOCK_SHARED);
330 9923298 : ret = filemap_splice_read(in, ppos, pipe, len, flags);
331 9923378 : xfs_iunlock(ip, XFS_IOLOCK_SHARED);
332 9923387 : if (ret > 0)
333 9923172 : XFS_STATS_ADD(mp, xs_read_bytes, ret);
334 : return ret;
335 : }
336 :
337 : /*
338 : * Common pre-write limit and setup checks.
339 : *
340 : * Called with the iolock held either shared or exclusive according to
341 : * @iolock, and returns with it held. Might upgrade the iolock to exclusive
342 : * if called for a direct write beyond i_size.
343 : */
344 : STATIC ssize_t
345 260642712 : xfs_file_write_checks(
346 : struct kiocb *iocb,
347 : struct iov_iter *from,
348 : unsigned int *iolock)
349 : {
350 260642712 : struct file *file = iocb->ki_filp;
351 260642712 : struct inode *inode = file->f_mapping->host;
352 260642712 : struct xfs_inode *ip = XFS_I(inode);
353 260642712 : ssize_t error = 0;
354 260642712 : size_t count = iov_iter_count(from);
355 260642712 : bool drained_dio = false;
356 327539145 : loff_t isize;
357 :
358 : restart:
359 327539145 : error = generic_write_checks(iocb, from);
360 327702182 : if (error <= 0)
361 30 : return error;
362 :
363 327702152 : if (iocb->ki_flags & IOCB_NOWAIT) {
364 0 : error = break_layout(inode, false);
365 0 : if (error == -EWOULDBLOCK)
366 : error = -EAGAIN;
367 : } else {
368 327702152 : error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
369 : }
370 :
371 328430801 : if (error)
372 0 : return error;
373 :
374 : /*
375 : * For changing security info in file_remove_privs() we need i_rwsem
376 : * exclusively.
377 : */
378 328430801 : if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
379 1461 : xfs_iunlock(ip, *iolock);
380 1461 : *iolock = XFS_IOLOCK_EXCL;
381 1461 : error = xfs_ilock_iocb(iocb, *iolock);
382 1461 : if (error) {
383 0 : *iolock = 0;
384 0 : return error;
385 : }
386 1461 : goto restart;
387 : }
388 :
389 : /*
390 : * If the offset is beyond the size of the file, we need to zero any
391 : * blocks that fall between the existing EOF and the start of this
392 : * write. If zeroing is needed and we are currently holding the iolock
393 : * shared, we need to upgrade it to exclusive, which implies having to
394 : * redo all the checks above.
395 : *
396 : * We need to serialise against EOF updates that occur in IO completions
397 : * here. We want to make sure that nobody is changing the size while we
398 : * do this check until we have placed an IO barrier (i.e. hold the
399 : * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The
400 : * spinlock effectively forms a memory barrier once we have the
401 : * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
402 : * hence be able to correctly determine if we need to run zeroing.
403 : *
404 : * We can do an unlocked check here safely as IO completion can only
405 : * extend EOF. Truncate is locked out at this point, so the EOF can
406 : * not move backwards, only forwards. Hence we only need to take the
407 : * slow path and spin locks when we are at or beyond the current EOF.
408 : */
409 328429340 : if (iocb->ki_pos <= i_size_read(inode))
410 193606103 : goto out;
411 :
412 134823237 : spin_lock(&ip->i_flags_lock);
413 135090601 : isize = i_size_read(inode);
414 135090601 : if (iocb->ki_pos > isize) {
415 135090601 : spin_unlock(&ip->i_flags_lock);
416 :
417 134853758 : if (iocb->ki_flags & IOCB_NOWAIT)
418 : return -EAGAIN;
419 :
420 134853758 : if (!drained_dio) {
421 67372943 : if (*iolock == XFS_IOLOCK_SHARED) {
422 5074539 : xfs_iunlock(ip, *iolock);
423 4977010 : *iolock = XFS_IOLOCK_EXCL;
424 4977010 : xfs_ilock(ip, *iolock);
425 5005263 : iov_iter_reexpand(from, count);
426 : }
427 : /*
428 : * We now have an IO submission barrier in place, but
429 : * AIO can do EOF updates during IO completion and hence
430 : * we now need to wait for all of them to drain. Non-AIO
431 : * DIO will have drained before we are given the
432 : * XFS_IOLOCK_EXCL, and so for most cases this wait is a
433 : * no-op.
434 : */
435 67303667 : inode_dio_wait(inode);
436 66894972 : drained_dio = true;
437 66894972 : goto restart;
438 : }
439 :
440 67480815 : trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
441 67264534 : error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
442 66623627 : if (error)
443 : return error;
444 : } else
445 0 : spin_unlock(&ip->i_flags_lock);
446 :
447 260215823 : out:
448 260215823 : return kiocb_modified(iocb);
449 : }
450 :
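/*
 * Direct write completion handler: account the bytes written, finish
 * any COW remapping or unwritten extent conversion, and move the
 * in-core and on-disk inode sizes forward for extending writes.
 */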
451 : static int
452 23229437 : xfs_dio_write_end_io(
453 : struct kiocb *iocb,
454 : ssize_t size,
455 : int error,
456 : unsigned flags)
457 : {
458 23229437 : struct inode *inode = file_inode(iocb->ki_filp);
459 23229437 : struct xfs_inode *ip = XFS_I(inode);
460 23229437 : loff_t offset = iocb->ki_pos;
461 23229437 : unsigned int nofs_flag;
462 :
463 23229437 : trace_xfs_end_io_direct_write(ip, offset, size);
464 :
465 46394446 : if (xfs_is_shutdown(ip->i_mount))
466 : return -EIO;
467 :
468 23196355 : if (error)
469 : return error;
470 16488391 : if (!size)
471 : return 0;
472 :
473 : /*
474 : * Capture amount written on completion as we can't reliably account
475 : * for it on submission.
476 : */
477 16488391 : XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
478 :
479 : /*
480 : * We can allocate memory here while doing writeback on behalf of
481 : * memory reclaim. To avoid memory allocation deadlocks set the
482 : * task-wide nofs context for the following operations.
483 : */
484 16488537 : nofs_flag = memalloc_nofs_save();
485 :
486 16488537 : if (flags & IOMAP_DIO_COW) {
487 3488587 : error = xfs_reflink_end_cow(ip, offset, size);
488 3488582 : if (error)
489 21 : goto out;
490 : }
491 :
492 : /*
493 : * Unwritten conversion updates the in-core isize after extent
494 : * conversion but before updating the on-disk size. Updating isize any
495 : * earlier allows a racing dio read to find unwritten extents before
496 : * they are converted.
497 : */
498 16488511 : if (flags & IOMAP_DIO_UNWRITTEN) {
499 7946931 : error = xfs_iomap_write_unwritten(ip, offset, size, true);
500 7945834 : goto out;
501 : }
502 :
503 : /*
504 : * We need to update the in-core inode size here so that we don't end up
505 : * with the on-disk inode size being outside the in-core inode size. We
506 : * have no other method of updating EOF for AIO, so always do it here
507 : * if necessary.
508 : *
509 : * We need to lock the test/set EOF update as we can be racing with
510 : * other IO completions here to update the EOF. Failing to serialise
511 : * here can result in EOF moving backwards and Bad Things Happen when
512 : * that occurs.
513 : *
514 : * As IO completion only ever extends EOF, we can do an unlocked check
515 : * here to avoid taking the spinlock. If we land within the current EOF,
516 : * then we do not need to do an extending update at all, and we don't
517 : * need to take the lock to check this. If we race with an update moving
518 : * EOF, then we'll either still be beyond EOF and need to take the lock,
519 : * or we'll be within EOF and we don't need to take it at all.
520 : */
521 8541580 : if (offset + size <= i_size_read(inode))
522 7678672 : goto out;
523 :
524 862908 : spin_lock(&ip->i_flags_lock);
525 862908 : if (offset + size > i_size_read(inode)) {
526 862908 : i_size_write(inode, offset + size);
527 862908 : spin_unlock(&ip->i_flags_lock);
528 862908 : error = xfs_setfilesize(ip, offset, size);
529 : } else {
530 0 : spin_unlock(&ip->i_flags_lock);
531 : }
532 :
533 16487435 : out:
534 16487435 : memalloc_nofs_restore(nofs_flag);
535 16487435 : return error;
536 : }
537 :
538 : static const struct iomap_dio_ops xfs_dio_write_ops = {
539 : .end_io = xfs_dio_write_end_io,
540 : };
541 :
542 : /*
543 : * Handle block-aligned direct I/O writes
544 : */
545 : static noinline ssize_t
546 17637732 : xfs_file_dio_write_aligned(
547 : struct xfs_inode *ip,
548 : struct kiocb *iocb,
549 : struct iov_iter *from)
550 : {
551 17637732 : unsigned int iolock = XFS_IOLOCK_SHARED;
552 17637732 : ssize_t ret;
553 :
554 17637732 : ret = xfs_ilock_iocb(iocb, iolock);
555 17603221 : if (ret)
556 : return ret;
557 17605777 : ret = xfs_file_write_checks(iocb, from, &iolock);
558 17595783 : if (ret)
559 700 : goto out_unlock;
560 :
561 : /*
562 : * We don't need to hold the IOLOCK exclusively across the IO, so demote
563 : * the iolock back to shared if we had to take the exclusive lock in
564 : * xfs_file_write_checks() for other reasons.
565 : */
566 17595083 : if (iolock == XFS_IOLOCK_EXCL) {
567 5021264 : xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
568 4962614 : iolock = XFS_IOLOCK_SHARED;
569 : }
570 17536433 : trace_xfs_file_direct_write(iocb, from);
571 17449041 : ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
572 : &xfs_dio_write_ops, 0, NULL, 0);
573 17657725 : out_unlock:
574 17657725 : if (iolock)
575 17655130 : xfs_iunlock(ip, iolock);
576 : return ret;
577 : }
578 :
579 : /*
580 : * Handle block-unaligned direct I/O writes
581 : *
582 : * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
583 : * them to be done in parallel with reads and other direct I/O writes. However,
584 : * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
585 : * to do sub-block zeroing and that requires serialisation against other direct
586 : * I/O to the same block. In this case we need to serialise the submission of
587 : * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
588 : * In the case where sub-block zeroing is not required, we can do concurrent
589 : * sub-block dios to the same block successfully.
590 : *
591 : * Optimistically submit the I/O using the shared lock first, but use the
592 : * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
593 : * if block allocation or partial block zeroing would be required. In that case
594 : * we try again with the exclusive lock.
595 : */
596 : static noinline ssize_t
597 10458095 : xfs_file_dio_write_unaligned(
598 : struct xfs_inode *ip,
599 : struct kiocb *iocb,
600 : struct iov_iter *from)
601 : {
602 10458095 : size_t isize = i_size_read(VFS_I(ip));
603 10458095 : size_t count = iov_iter_count(from);
604 10458095 : unsigned int iolock = XFS_IOLOCK_SHARED;
605 10458095 : unsigned int flags = IOMAP_DIO_OVERWRITE_ONLY;
606 10458095 : ssize_t ret;
607 :
608 : /*
609 : * Extending writes need exclusivity because of the sub-block zeroing
610 : * that the DIO code always does for partial tail blocks beyond EOF, so
611 : * don't even bother trying the fast path in this case.
612 : */
613 10458095 : if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
614 6762999 : if (iocb->ki_flags & IOCB_NOWAIT)
615 : return -EAGAIN;
616 6762999 : retry_exclusive:
617 7886243 : iolock = XFS_IOLOCK_EXCL;
618 7886243 : flags = IOMAP_DIO_FORCE_WAIT;
619 : }
620 :
621 11581339 : ret = xfs_ilock_iocb(iocb, iolock);
622 11581223 : if (ret)
623 : return ret;
624 :
625 : /*
626 : * We can't properly handle unaligned direct I/O to reflink files yet,
627 : * as we can't unshare a partial block.
628 : */
629 11581208 : if (xfs_is_cow_inode(ip)) {
630 5985651 : trace_xfs_reflink_bounce_dio_write(iocb, from);
631 5985652 : ret = -ENOTBLK;
632 5985652 : goto out_unlock;
633 : }
634 :
635 5595548 : ret = xfs_file_write_checks(iocb, from, &iolock);
636 5595252 : if (ret)
637 124 : goto out_unlock;
638 :
639 : /*
640 : * If we are doing exclusive unaligned I/O, this must be the only I/O
641 : * in-flight. Otherwise we risk data corruption due to unwritten extent
642 : * conversions from the AIO end_io handler. Wait for all other I/O to
643 : * drain first.
644 : */
645 5595128 : if (flags & IOMAP_DIO_FORCE_WAIT)
646 4209946 : inode_dio_wait(VFS_I(ip));
647 :
648 5595029 : trace_xfs_file_direct_write(iocb, from);
649 5594906 : ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
650 : &xfs_dio_write_ops, flags, NULL, 0);
651 :
652 : /*
653 : * Retry unaligned I/O with exclusive blocking semantics if the DIO
654 : * layer rejected it for mapping or locking reasons. If we are doing
655 : * nonblocking user I/O, propagate the error.
656 : */
657 5595281 : if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
658 1123267 : ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
659 1123267 : xfs_iunlock(ip, iolock);
660 1123244 : goto retry_exclusive;
661 : }
662 :
663 4472014 : out_unlock:
664 10457790 : if (iolock)
665 10457481 : xfs_iunlock(ip, iolock);
666 : return ret;
667 : }
668 :
669 : static ssize_t
670 28093716 : xfs_file_dio_write(
671 : struct kiocb *iocb,
672 : struct iov_iter *from)
673 : {
674 28093716 : struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
675 28093716 : struct xfs_buftarg *target = xfs_inode_buftarg(ip);
676 28093716 : size_t count = iov_iter_count(from);
677 :
678 : /* direct I/O must be aligned to device logical sector size */
679 28093716 : if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
680 : return -EINVAL;
681 28093716 : if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
682 10458245 : return xfs_file_dio_write_unaligned(ip, iocb, from);
683 17635471 : return xfs_file_dio_write_aligned(ip, iocb, from);
684 : }
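
/*
 * Illustrative routing for the checks above, assuming a 4096 byte block
 * size on a device with 512 byte logical sectors:
 *
 *	pos 4096, count 8192	-> xfs_file_dio_write_aligned()
 *	pos 512,  count 512	-> xfs_file_dio_write_unaligned()
 *	pos 100,  count 512	-> -EINVAL (not sector aligned)
 */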
685 :
686 : static noinline ssize_t
687 0 : xfs_file_dax_write(
688 : struct kiocb *iocb,
689 : struct iov_iter *from)
690 : {
691 0 : struct inode *inode = iocb->ki_filp->f_mapping->host;
692 0 : struct xfs_inode *ip = XFS_I(inode);
693 0 : unsigned int iolock = XFS_IOLOCK_EXCL;
694 0 : ssize_t ret, error = 0;
695 0 : loff_t pos;
696 :
697 0 : ret = xfs_ilock_iocb(iocb, iolock);
698 0 : if (ret)
699 : return ret;
700 0 : ret = xfs_file_write_checks(iocb, from, &iolock);
701 0 : if (ret)
702 0 : goto out;
703 :
704 0 : pos = iocb->ki_pos;
705 :
706 0 : trace_xfs_file_dax_write(iocb, from);
707 0 : ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
708 0 : if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
709 0 : i_size_write(inode, iocb->ki_pos);
710 0 : error = xfs_setfilesize(ip, pos, ret);
711 : }
712 0 : out:
713 0 : if (iolock)
714 0 : xfs_iunlock(ip, iolock);
715 0 : if (error)
716 : return error;
717 :
718 0 : if (ret > 0) {
719 0 : XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
720 :
721 : /* Handle various SYNC-type writes */
722 0 : ret = generic_write_sync(iocb, ret);
723 : }
724 : return ret;
725 : }
726 :
727 : STATIC ssize_t
728 236289084 : xfs_file_buffered_write(
729 : struct kiocb *iocb,
730 : struct iov_iter *from)
731 : {
732 236289084 : struct inode *inode = iocb->ki_filp->f_mapping->host;
733 236289084 : struct xfs_inode *ip = XFS_I(inode);
734 236289084 : ssize_t ret;
735 236289084 : bool cleared_space = false;
736 237428582 : unsigned int iolock;
737 :
738 : write_retry:
739 237428582 : iolock = XFS_IOLOCK_EXCL;
740 237428582 : ret = xfs_ilock_iocb(iocb, iolock);
741 237255780 : if (ret)
742 0 : return ret;
743 :
744 237255780 : ret = xfs_file_write_checks(iocb, from, &iolock);
745 237172079 : if (ret)
746 13887 : goto out;
747 :
748 237158192 : trace_xfs_file_buffered_write(iocb, from);
749 236822191 : ret = iomap_file_buffered_write(iocb, from,
750 : &xfs_buffered_write_iomap_ops);
751 :
752 : /*
753 : * If we hit a space limit, try to free up some lingering preallocated
754 : * space before returning an error. In the case of ENOSPC, first try to
755 : * write back all dirty inodes to free up some of the excess reserved
756 : * metadata space. This reduces the chances that the eofblocks scan
757 : * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
758 : * also behaves as a filter to prevent too many eofblocks scans from
759 : * running at the same time. Use a synchronous scan to increase the
760 : * effectiveness of the scan.
761 : */
762 237295558 : if (ret == -EDQUOT && !cleared_space) {
763 2826 : xfs_iunlock(ip, iolock);
764 2826 : xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC);
765 2825 : cleared_space = true;
766 2825 : goto write_retry;
767 237292732 : } else if (ret == -ENOSPC && !cleared_space) {
768 1136138 : struct xfs_icwalk icw = {0};
769 :
770 1136138 : cleared_space = true;
771 1136138 : xfs_flush_inodes(ip->i_mount);
772 :
773 1136355 : xfs_iunlock(ip, iolock);
774 1136073 : icw.icw_flags = XFS_ICWALK_FLAG_SYNC;
775 1136073 : xfs_blockgc_free_space(ip->i_mount, &icw);
776 1136673 : goto write_retry;
777 : }
778 :
779 236156594 : out:
780 236170481 : if (iolock)
781 236017395 : xfs_iunlock(ip, iolock);
782 :
783 236405752 : if (ret > 0) {
784 235218549 : XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
785 : /* Handle various SYNC-type writes */
786 235280718 : ret = generic_write_sync(iocb, ret);
787 : }
788 : return ret;
789 : }
790 :
791 : STATIC ssize_t
792 258521131 : xfs_file_write_iter(
793 : struct kiocb *iocb,
794 : struct iov_iter *from)
795 : {
796 258521131 : struct inode *inode = iocb->ki_filp->f_mapping->host;
797 258521131 : struct xfs_inode *ip = XFS_I(inode);
798 258521131 : ssize_t ret;
799 258521131 : size_t ocount = iov_iter_count(from);
800 :
801 258521131 : XFS_STATS_INC(ip->i_mount, xs_write_calls);
802 :
803 258550739 : if (ocount == 0)
804 : return 0;
805 :
806 517101036 : if (xfs_is_shutdown(ip->i_mount))
807 : return -EIO;
808 :
809 258537443 : if (IS_DAX(inode))
810 0 : return xfs_file_dax_write(iocb, from);
811 :
812 258537443 : if (iocb->ki_flags & IOCB_DIRECT) {
813 : /*
814 : * Allow a directio write to fall back to a buffered
815 : * write *only* in the case that we're doing a reflink
816 : * CoW. In all other directio scenarios we do not
817 : * allow an operation to fall back to buffered mode.
818 : */
819 28095888 : ret = xfs_file_dio_write(iocb, from);
820 28037223 : if (ret != -ENOTBLK)
821 : return ret;
822 : }
823 :
824 236427575 : return xfs_file_buffered_write(iocb, from);
825 : }
826 :
827 : static void
828 0 : xfs_wait_dax_page(
829 : struct inode *inode)
830 : {
831 0 : struct xfs_inode *ip = XFS_I(inode);
832 :
833 0 : xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
834 0 : schedule();
835 0 : xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
836 0 : }
837 :
838 : int
839 61393942 : xfs_break_dax_layouts(
840 : struct inode *inode,
841 : bool *retry)
842 : {
843 61393942 : struct page *page;
844 :
845 61393942 : ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
846 :
847 61393433 : page = dax_layout_busy_page(inode->i_mapping);
848 61392528 : if (!page)
849 : return 0;
850 :
851 0 : *retry = true;
852 0 : return ___wait_var_event(&page->_refcount,
853 : atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
854 : 0, 0, xfs_wait_dax_page(inode));
855 : }
856 :
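/*
 * Break any layouts that would conflict with the operation described by
 * @reason: busy DAX pages and pNFS layout leases for BREAK_UNMAP, just
 * the leases for BREAK_WRITE. Loops until no more retries are needed.
 */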
857 : int
858 389083378 : xfs_break_layouts(
859 : struct inode *inode,
860 : uint *iolock,
861 : enum layout_break_reason reason)
862 : {
863 389083378 : bool retry;
864 389083378 : int error;
865 :
866 389083378 : ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
867 :
868 388943912 : do {
869 388943912 : retry = false;
870 388943912 : switch (reason) {
871 61393903 : case BREAK_UNMAP:
872 61393903 : error = xfs_break_dax_layouts(inode, &retry);
873 61392687 : if (error || retry)
874 : break;
875 388942752 : fallthrough;
876 : case BREAK_WRITE:
877 388942752 : error = xfs_break_leased_layouts(inode, iolock, &retry);
878 388942752 : break;
879 : default:
880 0 : WARN_ON_ONCE(1);
881 0 : error = -EINVAL;
882 : }
883 389873778 : } while (error == 0 && retry);
884 :
885 389861460 : return error;
886 : }
887 :
888 : /* Does this file, inode, or mount want synchronous writes? */
889 222656640 : static inline bool xfs_file_sync_writes(struct file *filp)
890 : {
891 222656640 : struct xfs_inode *ip = XFS_I(file_inode(filp));
892 :
893 222656640 : if (xfs_has_wsync(ip->i_mount))
894 : return true;
895 222656616 : if (filp->f_flags & (__O_SYNC | O_DSYNC))
896 : return true;
897 222626715 : if (IS_SYNC(file_inode(filp)))
898 13 : return true;
899 :
900 : return false;
901 : }
902 :
903 : #define XFS_FALLOC_FL_SUPPORTED \
904 : (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
905 : FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \
906 : FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)
907 :
908 : STATIC long
909 54201349 : xfs_file_fallocate(
910 : struct file *file,
911 : int mode,
912 : loff_t offset,
913 : loff_t len)
914 : {
915 54201349 : struct inode *inode = file_inode(file);
916 54201349 : struct xfs_inode *ip = XFS_I(inode);
917 54201349 : long error;
918 54201349 : uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
919 54201349 : loff_t new_size = 0;
920 54201349 : bool do_file_insert = false;
921 :
922 54201349 : if (!S_ISREG(inode->i_mode))
923 : return -EINVAL;
924 54201349 : if (mode & ~XFS_FALLOC_FL_SUPPORTED)
925 : return -EOPNOTSUPP;
926 :
927 54201349 : xfs_ilock(ip, iolock);
928 54201731 : error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
929 54201831 : if (error)
930 0 : goto out_unlock;
931 :
932 : /*
933 : * Must wait for all AIO to complete before we continue as AIO can
934 : * change the file size on completion without holding any locks we
935 : * currently hold. We must do this first because AIO can update both
936 : * the on disk and in memory inode sizes, and the operations that follow
937 : * require the in-memory size to be fully up-to-date.
938 : */
939 54201831 : inode_dio_wait(inode);
940 :
941 : /*
942 : * Now that AIO and DIO have drained, we flush and (if necessary) invalidate
943 : * the cached range over the first operation we are about to run.
944 : *
945 : * We care about zero and collapse here because they both run a hole
946 : * punch over the range first. Because that can zero data, and the range
947 : * of invalidation for the shift operations is much larger, we still do
948 : * the required flush for collapse in xfs_prepare_shift().
949 : *
950 : * Insert has the same range requirements as collapse, and we extend the
951 : * file first which can zero data. Hence insert has the same
952 : * flush/invalidate requirements as collapse and so they are both
953 : * handled at the right time by xfs_prepare_shift().
954 : */
955 54201063 : if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
956 : FALLOC_FL_COLLAPSE_RANGE)) {
957 43544651 : error = xfs_flush_unmap_range(ip, offset, len);
958 43544523 : if (error)
959 340 : goto out_unlock;
960 : }
961 :
962 54200595 : error = file_modified(file);
963 54201535 : if (error)
964 12 : goto out_unlock;
965 :
966 54201523 : if (mode & FALLOC_FL_PUNCH_HOLE) {
967 37245654 : error = xfs_free_file_space(ip, offset, len);
968 37245808 : if (error)
969 48691 : goto out_unlock;
970 16955869 : } else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
971 2796012 : if (!xfs_is_falloc_aligned(ip, offset, len)) {
972 326171 : error = -EINVAL;
973 326171 : goto out_unlock;
974 : }
975 :
976 : /*
977 : * The collapse range must not overlap EOF; a collapse reaching
978 : * or crossing EOF would effectively be a truncate operation
979 : */
980 2469839 : if (offset + len >= i_size_read(inode)) {
981 178795 : error = -EINVAL;
982 178795 : goto out_unlock;
983 : }
984 :
985 2291044 : new_size = i_size_read(inode) - len;
986 :
987 2291044 : error = xfs_collapse_file_space(ip, offset, len);
988 2291051 : if (error)
989 2058 : goto out_unlock;
990 14159857 : } else if (mode & FALLOC_FL_INSERT_RANGE) {
991 2143097 : loff_t isize = i_size_read(inode);
992 :
993 2143097 : if (!xfs_is_falloc_aligned(ip, offset, len)) {
994 310490 : error = -EINVAL;
995 310490 : goto out_unlock;
996 : }
997 :
998 : /*
999 : * New inode size must not exceed ->s_maxbytes, accounting for
1000 : * possible signed overflow.
1001 : */
1002 1832605 : if (inode->i_sb->s_maxbytes - isize < len) {
1003 10 : error = -EFBIG;
1004 10 : goto out_unlock;
1005 : }
1006 1832595 : new_size = isize + len;
1007 :
1008 : /* Offset should be less than i_size */
1009 1832595 : if (offset >= isize) {
1010 140032 : error = -EINVAL;
1011 140032 : goto out_unlock;
1012 : }
1013 : do_file_insert = true;
1014 : } else {
1015 12016760 : if (!(mode & FALLOC_FL_KEEP_SIZE) &&
1016 6871564 : offset + len > i_size_read(inode)) {
1017 4719109 : new_size = offset + len;
1018 4719109 : error = inode_newsize_ok(inode, new_size);
1019 4719111 : if (error)
1020 10 : goto out_unlock;
1021 : }
1022 :
1023 12016752 : if (mode & FALLOC_FL_ZERO_RANGE) {
1024 : /*
1025 : * Punch a hole and prealloc the range. We use a hole
1026 : * punch rather than unwritten extent conversion for two
1027 : * reasons:
1028 : *
1029 : * 1.) Hole punch handles partial block zeroing for us.
1030 : * 2.) If prealloc returns ENOSPC, the file range is
1031 : * still zero-valued by virtue of the hole punch.
1032 : */
1033 3502925 : unsigned int blksize = i_blocksize(inode);
1034 :
1035 3502926 : trace_xfs_zero_file_space(ip);
1036 :
1037 3502912 : error = xfs_free_file_space(ip, offset, len);
1038 3502924 : if (error)
1039 14945 : goto out_unlock;
1040 :
1041 3487979 : len = round_up(offset + len, blksize) -
1042 3487979 : round_down(offset, blksize);
1043 3487979 : offset = round_down(offset, blksize);
1044 8513827 : } else if (mode & FALLOC_FL_UNSHARE_RANGE) {
1045 205 : error = xfs_reflink_unshare(ip, offset, len);
1046 205 : if (error)
1047 6 : goto out_unlock;
1048 : } else {
1049 : /*
1050 : * In always_cow mode we can't use preallocations and
1051 : * thus should not create them.
1052 : */
1053 8513622 : if (xfs_is_always_cow_inode(ip)) {
1054 407637 : error = -EOPNOTSUPP;
1055 407637 : goto out_unlock;
1056 : }
1057 : }
1058 :
1059 11594143 : if (!xfs_is_always_cow_inode(ip)) {
1060 11357263 : error = xfs_alloc_file_space(ip, offset, len);
1061 11357381 : if (error)
1062 291989 : goto out_unlock;
1063 : }
1064 : }
1065 :
1066 : /* Change file size if needed */
1067 52480917 : if (new_size) {
1068 8365441 : struct iattr iattr;
1069 :
1070 8365441 : iattr.ia_valid = ATTR_SIZE;
1071 8365441 : iattr.ia_size = new_size;
1072 16730882 : error = xfs_vn_setattr_size(file_mnt_idmap(file),
1073 : file_dentry(file), &iattr);
1074 8365428 : if (error)
1075 1231 : goto out_unlock;
1076 : }
1077 :
1078 : /*
1079 : * Perform hole insertion now that the file size has been
1080 : * updated so that if we crash during the operation we don't
1081 : * leave shifted extents past EOF and hence lose access to
1082 : * the data that is contained within them.
1083 : */
1084 52479673 : if (do_file_insert) {
1085 1692133 : error = xfs_insert_file_space(ip, offset, len);
1086 1692137 : if (error)
1087 2367 : goto out_unlock;
1088 : }
1089 :
1090 52477310 : if (xfs_file_sync_writes(file))
1091 29890 : error = xfs_log_force_inode(ip);
1092 :
1093 52447420 : out_unlock:
1094 54202094 : xfs_iunlock(ip, iolock);
1095 54202094 : return error;
1096 : }
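
/*
 * Userspace view of the collapse-range checks above (sketch, assuming a
 * filesystem with 4096 byte allocation units and a file large enough
 * that the range ends before EOF):
 *
 *	fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 4096, 8192);	// OK
 *	fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 512, 4096);	// -EINVAL
 *
 * A range that reaches or crosses EOF also fails with -EINVAL.
 */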
1097 :
1098 : STATIC int
1099 11728153 : xfs_file_fadvise(
1100 : struct file *file,
1101 : loff_t start,
1102 : loff_t end,
1103 : int advice)
1104 : {
1105 11728153 : struct xfs_inode *ip = XFS_I(file_inode(file));
1106 11728153 : int ret;
1107 11728153 : int lockflags = 0;
1108 :
1109 : /*
1110 : * Operations creating pages in page cache need protection from hole
1111 : * punching and similar ops
1112 : */
1113 11728153 : if (advice == POSIX_FADV_WILLNEED) {
1114 0 : lockflags = XFS_IOLOCK_SHARED;
1115 0 : xfs_ilock(ip, lockflags);
1116 : }
1117 11728153 : ret = generic_fadvise(file, start, end, advice);
1118 11708156 : if (lockflags)
1119 0 : xfs_iunlock(ip, lockflags);
1120 11708156 : return ret;
1121 : }
1122 :
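/*
 * Remap (reflink) a range of blocks from one file to another, carrying
 * the CoW extent size hint across when the entire source file is shared
 * into the entire destination file.
 */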
1123 : STATIC loff_t
1124 228198733 : xfs_file_remap_range(
1125 : struct file *file_in,
1126 : loff_t pos_in,
1127 : struct file *file_out,
1128 : loff_t pos_out,
1129 : loff_t len,
1130 : unsigned int remap_flags)
1131 : {
1132 228198733 : struct inode *inode_in = file_inode(file_in);
1133 228198733 : struct xfs_inode *src = XFS_I(inode_in);
1134 228198733 : struct inode *inode_out = file_inode(file_out);
1135 228198733 : struct xfs_inode *dest = XFS_I(inode_out);
1136 228198733 : struct xfs_mount *mp = src->i_mount;
1137 228198733 : loff_t remapped = 0;
1138 228198733 : xfs_extlen_t cowextsize;
1139 228198733 : int ret;
1140 :
1141 228198733 : if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
1142 : return -EINVAL;
1143 :
1144 228198733 : if (!xfs_has_reflink(mp))
1145 : return -EOPNOTSUPP;
1146 :
1147 280292276 : if (xfs_is_shutdown(mp))
1148 : return -EIO;
1149 :
1150 : /* Prepare and then clone file data. */
1151 140137567 : ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
1152 : &len, remap_flags);
1153 140147126 : if (ret || len == 0)
1154 54082124 : return ret;
1155 :
1156 86065002 : trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
1157 :
1158 86062817 : ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
1159 : &remapped);
1160 86053976 : if (ret)
1161 945383 : goto out_unlock;
1162 :
1163 : /*
1164 : * Carry the cowextsize hint from src to dest if we're sharing the
1165 : * entire source file to the entire destination file, the source file
1166 : * has a cowextsize hint, and the destination file does not.
1167 : */
1168 85108593 : cowextsize = 0;
1169 85108593 : if (pos_in == 0 && len == i_size_read(inode_in) &&
1170 116460 : (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
1171 140 : pos_out == 0 && len >= i_size_read(inode_out) &&
1172 134 : !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE))
1173 18 : cowextsize = src->i_cowextsize;
1174 :
1175 85108593 : ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
1176 : remap_flags);
1177 85091269 : if (ret)
1178 1 : goto out_unlock;
1179 :
1180 85091268 : if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
1181 725 : xfs_log_force_inode(dest);
1182 85090706 : out_unlock:
1183 86036138 : xfs_iunlock2_io_mmap(src, dest);
1184 86057827 : if (ret)
1185 945362 : trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
1186 86057675 : return remapped > 0 ? remapped : ret;
1187 : }
1188 :
1189 : STATIC int
1190 597213522 : xfs_file_open(
1191 : struct inode *inode,
1192 : struct file *file)
1193 : {
1194 1194427044 : if (xfs_is_shutdown(XFS_M(inode->i_sb)))
1195 : return -EIO;
1196 597199407 : file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC |
1197 : FMODE_DIO_PARALLEL_WRITE | FMODE_CAN_ODIRECT;
1198 597199407 : return generic_file_open(inode, file);
1199 : }
1200 :
1201 : STATIC int
1202 86962988 : xfs_dir_open(
1203 : struct inode *inode,
1204 : struct file *file)
1205 : {
1206 86962988 : struct xfs_inode *ip = XFS_I(inode);
1207 86962988 : unsigned int mode;
1208 86962988 : int error;
1209 :
1210 86962988 : error = xfs_file_open(inode, file);
1211 86483712 : if (error)
1212 : return error;
1213 :
1214 : /*
1215 : * If there are any blocks, read ahead block 0 as we're almost
1216 : * certain to have the next operation be a read there.
1217 : */
1218 86510973 : mode = xfs_ilock_data_map_shared(ip);
1219 86154952 : if (ip->i_df.if_nextents > 0)
1220 7569450 : error = xfs_dir3_data_readahead(ip, 0, 0);
1221 86155579 : xfs_iunlock(ip, mode);
1222 86155579 : return error;
1223 : }
1224 :
1225 : STATIC int
1226 510043990 : xfs_file_release(
1227 : struct inode *inode,
1228 : struct file *filp)
1229 : {
1230 510043990 : return xfs_release(XFS_I(inode));
1231 : }
1232 :
1233 : STATIC int
1234 167399125 : xfs_file_readdir(
1235 : struct file *file,
1236 : struct dir_context *ctx)
1237 : {
1238 167399125 : struct inode *inode = file_inode(file);
1239 167399125 : xfs_inode_t *ip = XFS_I(inode);
1240 167399125 : size_t bufsize;
1241 :
1242 : /*
1243 : * The Linux API doesn't pass the total size of the buffer
1244 : * we read into down to the filesystem. With the filldir concept
1245 : * it's not needed for correct information, but the XFS dir2 leaf
1246 : * code wants an estimate of the buffer size to calculate its
1247 : * readahead window and size the buffers used for mapping to
1248 : * physical blocks.
1249 : *
1250 : * Try to give it an estimate that's good enough, maybe at some
1251 : * point we can change the ->readdir prototype to include the
1252 : * buffer size. For now we use the current glibc buffer size.
1253 : */
1254 167399125 : bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size);
1255 :
1256 167399125 : return xfs_readdir(NULL, ip, ctx, bufsize);
1257 : }
1258 :
1259 : STATIC loff_t
1260 83141950 : xfs_file_llseek(
1261 : struct file *file,
1262 : loff_t offset,
1263 : int whence)
1264 : {
1265 83141950 : struct inode *inode = file->f_mapping->host;
1266 :
1267 166283900 : if (xfs_is_shutdown(XFS_I(inode)->i_mount))
1268 : return -EIO;
1269 :
1270 83141942 : switch (whence) {
1271 82944505 : default:
1272 82944505 : return generic_file_llseek(file, offset, whence);
1273 1482 : case SEEK_HOLE:
1274 1482 : offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
1275 1482 : break;
1276 195955 : case SEEK_DATA:
1277 195955 : offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
1278 195955 : break;
1279 : }
1280 :
1281 197437 : if (offset < 0)
1282 : return offset;
1283 154312 : return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
1284 : }
1285 :
1286 : #ifdef CONFIG_FS_DAX
1287 : static inline vm_fault_t
1288 0 : xfs_dax_fault(
1289 : struct vm_fault *vmf,
1290 : enum page_entry_size pe_size,
1291 : bool write_fault,
1292 : pfn_t *pfn)
1293 : {
1294 0 : return dax_iomap_fault(vmf, pe_size, pfn, NULL,
1295 0 : (write_fault && !vmf->cow_page) ?
1296 : &xfs_dax_write_iomap_ops :
1297 : &xfs_read_iomap_ops);
1298 : }
1299 : #else
1300 : static inline vm_fault_t
1301 : xfs_dax_fault(
1302 : struct vm_fault *vmf,
1303 : enum page_entry_size pe_size,
1304 : bool write_fault,
1305 : pfn_t *pfn)
1306 : {
1307 : ASSERT(0);
1308 : return VM_FAULT_SIGBUS;
1309 : }
1310 : #endif
1311 :
1312 : /*
1313 : * Locking for serialisation of IO during page faults. This results in a lock
1314 : * ordering of:
1315 : *
1316 : * mmap_lock (MM)
1317 : * sb_start_pagefault(vfs, freeze)
1318 : * invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
1319 : * page_lock (MM)
1320 : * i_lock (XFS - extent map serialisation)
1321 : */
1322 : static vm_fault_t
1323 181896922 : __xfs_filemap_fault(
1324 : struct vm_fault *vmf,
1325 : enum page_entry_size pe_size,
1326 : bool write_fault)
1327 : {
1328 181896922 : struct inode *inode = file_inode(vmf->vma->vm_file);
1329 181896922 : struct xfs_inode *ip = XFS_I(inode);
1330 181896922 : vm_fault_t ret;
1331 :
1332 181896922 : trace_xfs_filemap_fault(ip, pe_size, write_fault);
1333 :
1334 181890837 : if (write_fault) {
1335 78443786 : sb_start_pagefault(inode->i_sb);
1336 78349763 : file_update_time(vmf->vma->vm_file);
1337 : }
1338 :
1339 181933539 : if (IS_DAX(inode)) {
1340 0 : pfn_t pfn;
1341 :
1342 0 : xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1343 0 : ret = xfs_dax_fault(vmf, pe_size, write_fault, &pfn);
1344 0 : if (ret & VM_FAULT_NEEDDSYNC)
1345 0 : ret = dax_finish_sync_fault(vmf, pe_size, pfn);
1346 0 : xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1347 : } else {
1348 181933539 : if (write_fault) {
1349 78468031 : xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1350 78367835 : ret = iomap_page_mkwrite(vmf,
1351 : &xfs_page_mkwrite_iomap_ops);
1352 78396468 : xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1353 : } else {
1354 103465508 : ret = filemap_fault(vmf);
1355 : }
1356 : }
1357 :
1358 181915311 : if (write_fault)
1359 78509544 : sb_end_pagefault(inode->i_sb);
1360 181864266 : return ret;
1361 : }
1362 :
1363 : static inline bool
1364 : xfs_is_write_fault(
1365 : struct vm_fault *vmf)
1366 : {
1367 0 : return (vmf->flags & FAULT_FLAG_WRITE) &&
1368 0 : (vmf->vma->vm_flags & VM_SHARED);
1369 : }
1370 :
1371 : static vm_fault_t
1372 103522390 : xfs_filemap_fault(
1373 : struct vm_fault *vmf)
1374 : {
1375 : /* DAX can shortcut the normal fault path on write faults! */
1376 103522390 : return __xfs_filemap_fault(vmf, PE_SIZE_PTE,
1377 103522390 : IS_DAX(file_inode(vmf->vma->vm_file)) &&
1378 : xfs_is_write_fault(vmf));
1379 : }
1380 :
1381 : static vm_fault_t
1382 16423 : xfs_filemap_huge_fault(
1383 : struct vm_fault *vmf,
1384 : enum page_entry_size pe_size)
1385 : {
1386 16423 : if (!IS_DAX(file_inode(vmf->vma->vm_file)))
1387 : return VM_FAULT_FALLBACK;
1388 :
1389 : /* DAX can shortcut the normal fault path on write faults! */
1390 0 : return __xfs_filemap_fault(vmf, pe_size,
1391 : xfs_is_write_fault(vmf));
1392 : }
1393 :
1394 : static vm_fault_t
1395 78494868 : xfs_filemap_page_mkwrite(
1396 : struct vm_fault *vmf)
1397 : {
1398 78494868 : return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
1399 : }
1400 :
1401 : /*
1402 : * pfn_mkwrite was originally intended to ensure we capture time stamp updates
1403 : * on write faults. In reality, it needs to serialise against truncate and
1404 : * prepare memory for writing so handle is as standard write fault.
1405 : */
1406 : static vm_fault_t
1407 0 : xfs_filemap_pfn_mkwrite(
1408 : struct vm_fault *vmf)
1409 : {
1410 :
1411 0 : return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
1412 : }
1413 :
1414 : static const struct vm_operations_struct xfs_file_vm_ops = {
1415 : .fault = xfs_filemap_fault,
1416 : .huge_fault = xfs_filemap_huge_fault,
1417 : .map_pages = filemap_map_pages,
1418 : .page_mkwrite = xfs_filemap_page_mkwrite,
1419 : .pfn_mkwrite = xfs_filemap_pfn_mkwrite,
1420 : };
1421 :
1422 : STATIC int
1423 11760057 : xfs_file_mmap(
1424 : struct file *file,
1425 : struct vm_area_struct *vma)
1426 : {
1427 11760057 : struct inode *inode = file_inode(file);
1428 11760057 : struct xfs_buftarg *target = xfs_inode_buftarg(XFS_I(inode));
1429 :
1430 : /*
1431 : * We don't support synchronous mappings for non-DAX files and
1432 : * for DAX files if the underlying dax_device is not synchronous.
1433 : */
1434 11760057 : if (!daxdev_mapping_supported(vma, target->bt_daxdev))
1435 : return -EOPNOTSUPP;
1436 :
1437 11759956 : file_accessed(file);
1438 11760077 : vma->vm_ops = &xfs_file_vm_ops;
1439 11760077 : if (IS_DAX(inode))
1440 0 : vm_flags_set(vma, VM_HUGEPAGE);
1441 : return 0;
1442 : }
1443 :
1444 : const struct file_operations xfs_file_operations = {
1445 : .llseek = xfs_file_llseek,
1446 : .read_iter = xfs_file_read_iter,
1447 : .write_iter = xfs_file_write_iter,
1448 : .splice_read = xfs_file_splice_read,
1449 : .splice_write = iter_file_splice_write,
1450 : .iopoll = iocb_bio_iopoll,
1451 : .unlocked_ioctl = xfs_file_ioctl,
1452 : #ifdef CONFIG_COMPAT
1453 : .compat_ioctl = xfs_file_compat_ioctl,
1454 : #endif
1455 : .mmap = xfs_file_mmap,
1456 : .mmap_supported_flags = MAP_SYNC,
1457 : .open = xfs_file_open,
1458 : .release = xfs_file_release,
1459 : .fsync = xfs_file_fsync,
1460 : .get_unmapped_area = thp_get_unmapped_area,
1461 : .fallocate = xfs_file_fallocate,
1462 : .fadvise = xfs_file_fadvise,
1463 : .remap_file_range = xfs_file_remap_range,
1464 : };
1465 :
1466 : const struct file_operations xfs_dir_file_operations = {
1467 : .open = xfs_dir_open,
1468 : .read = generic_read_dir,
1469 : .iterate_shared = xfs_file_readdir,
1470 : .llseek = generic_file_llseek,
1471 : .unlocked_ioctl = xfs_file_ioctl,
1472 : #ifdef CONFIG_COMPAT
1473 : .compat_ioctl = xfs_file_compat_ioctl,
1474 : #endif
1475 : .fsync = xfs_dir_fsync,
1476 : };