Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * Copyright (c) 2000-2005 Silicon Graphics, Inc.
4 : * All Rights Reserved.
5 : */
6 : #include "xfs.h"
7 : #include "xfs_fs.h"
8 : #include "xfs_shared.h"
9 : #include "xfs_format.h"
10 : #include "xfs_log_format.h"
11 : #include "xfs_trans_resv.h"
12 : #include "xfs_mount.h"
13 : #include "xfs_inode.h"
14 : #include "xfs_trans.h"
15 : #include "xfs_inode_item.h"
16 : #include "xfs_bmap.h"
17 : #include "xfs_bmap_util.h"
18 : #include "xfs_dir2.h"
19 : #include "xfs_dir2_priv.h"
20 : #include "xfs_ioctl.h"
21 : #include "xfs_trace.h"
22 : #include "xfs_log.h"
23 : #include "xfs_icache.h"
24 : #include "xfs_pnfs.h"
25 : #include "xfs_iomap.h"
26 : #include "xfs_reflink.h"
27 : #include "xfs_file.h"
28 :
29 : #include <linux/dax.h>
30 : #include <linux/falloc.h>
31 : #include <linux/backing-dev.h>
32 : #include <linux/mman.h>
33 : #include <linux/fadvise.h>
34 : #include <linux/mount.h>
35 : #include <linux/buffer_head.h> /* for block_page_mkwrite_return */
36 :
37 : static const struct vm_operations_struct xfs_file_vm_ops;
38 :
39 : /*
40 : * Decide if the given file range is aligned to the size of the fundamental
41 : * allocation unit for the file.
42 : */
43 : bool
44 1813513 : xfs_is_falloc_aligned(
45 : struct xfs_inode *ip,
46 : loff_t pos,
47 : long long int len)
48 : {
49 1813513 : unsigned int alloc_unit = xfs_inode_alloc_unitsize(ip);
50 :
51 2432538 : if (XFS_IS_REALTIME_INODE(ip) && !is_power_of_2(alloc_unit))
52 8 : return isaligned_64(pos, alloc_unit) &&
53 4 : isaligned_64(len, alloc_unit);
54 :
55 1813507 : return !((pos | len) & (alloc_unit - 1));
56 : }
57 :
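A note on the fast path above: for a power-of-2 allocation unit, OR-ing pos and len and masking with (alloc_unit - 1) tests both values in a single operation. A standalone sketch of both branches, in plain userspace C rather than kernel code (helper name is hypothetical):

	#include <stdbool.h>
	#include <stdint.h>

	static bool is_falloc_aligned(uint64_t pos, uint64_t len, uint32_t unit)
	{
		if (unit & (unit - 1))	/* not a power of two: divide */
			return pos % unit == 0 && len % unit == 0;
		/* power of two: one mask test covers both values */
		return ((pos | len) & (unit - 1)) == 0;
	}
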
58 : /*
59 : * Fsync operations on directories are much simpler than on regular files,
60 : * as there is no file data to flush and thus no need for explicit cache
61 : * flush operations; nor are there any non-transaction metadata updates
62 : * on directories.
63 : */
64 : STATIC int
65 615427 : xfs_dir_fsync(
66 : struct file *file,
67 : loff_t start,
68 : loff_t end,
69 : int datasync)
70 : {
71 615427 : struct xfs_inode *ip = XFS_I(file->f_mapping->host);
72 :
73 615427 : trace_xfs_dir_fsync(ip);
74 615426 : return xfs_log_force_inode(ip);
75 : }
76 :
77 : static xfs_csn_t
78 2538341 : xfs_fsync_seq(
79 : struct xfs_inode *ip,
80 : bool datasync)
81 : {
82 2538341 : if (!xfs_ipincount(ip))
83 : return 0;
84 2538241 : if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
85 : return 0;
86 2470226 : return ip->i_itemp->ili_commit_seq;
87 : }
88 :
89 : /*
90 : * All metadata updates are logged, which means that we just have to flush the
91 : * log up to the latest LSN that touched the inode.
92 : *
93 : * If we have concurrent fsync/fdatasync() calls, we need them to all block on
94 : * the log force before we clear the ili_fsync_fields field. This ensures that
95 : * we don't get a racing sync operation that does not wait for the metadata to
96 : * hit the journal before returning. If we race with clearing ili_fsync_fields,
97 : * then all that will happen is the log force will do nothing as the lsn will
98 : * already be on disk. We can't race with setting ili_fsync_fields because that
99 : * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
100 : * shared until after ili_fsync_fields is cleared.
101 : */
102 : static int
103 2538371 : xfs_fsync_flush_log(
104 : struct xfs_inode *ip,
105 : bool datasync,
106 : int *log_flushed)
107 : {
108 2538371 : int error = 0;
109 2538371 : xfs_csn_t seq;
110 :
111 2538371 : xfs_ilock(ip, XFS_ILOCK_SHARED);
112 2538366 : seq = xfs_fsync_seq(ip, datasync);
113 2538362 : if (seq) {
114 2470246 : error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
115 : log_flushed);
116 :
117 2469049 : spin_lock(&ip->i_itemp->ili_lock);
118 2470208 : ip->i_itemp->ili_fsync_fields = 0;
119 2470208 : spin_unlock(&ip->i_itemp->ili_lock);
120 : }
121 2538214 : xfs_iunlock(ip, XFS_ILOCK_SHARED);
122 2538172 : return error;
123 : }
124 :
125 : STATIC int
126 32138353 : xfs_file_fsync(
127 : struct file *file,
128 : loff_t start,
129 : loff_t end,
130 : int datasync)
131 : {
132 32138353 : struct xfs_inode *ip = XFS_I(file->f_mapping->host);
133 32138353 : struct xfs_mount *mp = ip->i_mount;
134 32138353 : int error, err2;
135 32138353 : int log_flushed = 0;
136 :
137 32138353 : trace_xfs_file_fsync(ip);
138 :
139 32138630 : error = file_write_and_wait_range(file, start, end);
140 32139489 : if (error)
141 : return error;
142 :
143 64275728 : if (xfs_is_shutdown(mp))
144 : return -EIO;
145 :
146 32135771 : xfs_iflags_clear(ip, XFS_ITRUNCATED);
147 :
148 : /*
149 : * If we have an RT and/or log subvolume we need to make sure to flush
150 : * the write cache of the device used for file data first. This is to
151 : * ensure newly written file data makes it to disk before logging the new
152 : * inode size in case of an extending write.
153 : */
154 32135254 : if (XFS_IS_REALTIME_INODE(ip))
155 13030179 : error = xfs_buftarg_flush(mp->m_rtdev_targp);
156 19105075 : else if (mp->m_logdev_targp != mp->m_ddev_targp)
157 0 : error = xfs_buftarg_flush(mp->m_ddev_targp);
158 :
159 : /*
160 : * Any inode that has dirty modifications in the log is pinned. The
161 : * racy check here for a pinned inode will not catch modifications
162 : * that happen concurrently to the fsync call, but fsync semantics
163 : * only require to sync previously completed I/O.
164 : */
165 32134053 : if (xfs_ipincount(ip)) {
166 2538373 : err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
167 2537867 : if (err2 && !error)
168 975 : error = err2;
169 : }
170 :
171 : /*
172 : * If we only have a single device, and the log force above was
173 : * a no-op we might have to flush the data device cache here.
174 : * This can only happen for fdatasync/O_DSYNC if we were overwriting
175 : * an already allocated file and thus do not have any metadata to
176 : * commit.
177 : */
178 32133547 : if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
179 17753166 : mp->m_logdev_targp == mp->m_ddev_targp) {
180 17753000 : err2 = xfs_buftarg_flush(mp->m_ddev_targp);
181 17752934 : if (err2 && !error)
182 283 : error = err2;
183 : }
184 :
185 : return error;
186 : }
187 :
188 : static int
189 219221358 : xfs_ilock_iocb(
190 : struct kiocb *iocb,
191 : unsigned int lock_mode)
192 : {
193 219221358 : struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
194 :
195 219221358 : if (iocb->ki_flags & IOCB_NOWAIT) {
196 0 : if (!xfs_ilock_nowait(ip, lock_mode))
197 0 : return -EAGAIN;
198 : } else {
199 219221358 : xfs_ilock(ip, lock_mode);
200 : }
201 :
202 : return 0;
203 : }
204 :
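xfs_ilock_iocb() is the pattern every I/O path below uses to honour IOCB_NOWAIT: try the lock and return -EAGAIN rather than sleeping. A userspace analogue using POSIX mutexes, as a sketch only (names are hypothetical):

	#include <errno.h>
	#include <pthread.h>
	#include <stdbool.h>

	static int lock_maybe_nowait(pthread_mutex_t *m, bool nowait)
	{
		if (nowait) {
			/* trylock failure maps to -EAGAIN for the caller */
			if (pthread_mutex_trylock(m))
				return -EAGAIN;
			return 0;
		}
		pthread_mutex_lock(m);
		return 0;
	}
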
205 : STATIC ssize_t
206 82967708 : xfs_file_dio_read(
207 : struct kiocb *iocb,
208 : struct iov_iter *to)
209 : {
210 82967708 : struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
211 82967708 : ssize_t ret;
212 :
213 82967708 : trace_xfs_file_direct_read(iocb, to);
214 :
215 82967710 : if (!iov_iter_count(to))
216 : return 0; /* skip atime */
217 :
218 82967703 : file_accessed(iocb->ki_filp);
219 :
220 82967745 : ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
221 82967744 : if (ret)
222 : return ret;
223 82967735 : ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0);
224 82967737 : xfs_iunlock(ip, XFS_IOLOCK_SHARED);
225 :
226 82967737 : return ret;
227 : }
228 :
229 : static noinline ssize_t
230 : xfs_file_dax_read(
231 : struct kiocb *iocb,
232 : struct iov_iter *to)
233 : {
234 : struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
235 : ssize_t ret = 0;
236 :
237 : trace_xfs_file_dax_read(iocb, to);
238 :
239 : if (!iov_iter_count(to))
240 : return 0; /* skip atime */
241 :
242 : ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
243 : if (ret)
244 : return ret;
245 : ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
246 : xfs_iunlock(ip, XFS_IOLOCK_SHARED);
247 :
248 : file_accessed(iocb->ki_filp);
249 : return ret;
250 : }
251 :
252 : STATIC ssize_t
253 65519523 : xfs_file_buffered_read(
254 : struct kiocb *iocb,
255 : struct iov_iter *to)
256 : {
257 65519523 : struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
258 65519523 : ssize_t ret;
259 :
260 65519523 : trace_xfs_file_buffered_read(iocb, to);
261 :
262 65531540 : ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
263 65519416 : if (ret)
264 : return ret;
265 65533149 : ret = generic_file_read_iter(iocb, to);
266 65530536 : xfs_iunlock(ip, XFS_IOLOCK_SHARED);
267 :
268 65530536 : return ret;
269 : }
270 :
271 : STATIC ssize_t
272 148493266 : xfs_file_read_iter(
273 : struct kiocb *iocb,
274 : struct iov_iter *to)
275 : {
276 148493266 : struct inode *inode = file_inode(iocb->ki_filp);
277 148493266 : struct xfs_mount *mp = XFS_I(inode)->i_mount;
278 148493266 : ssize_t ret = 0;
279 :
280 148493266 : XFS_STATS_INC(mp, xs_read_calls);
281 :
282 296986532 : if (xfs_is_shutdown(mp))
283 : return -EIO;
284 :
285 148490463 : if (IS_DAX(inode))
286 : ret = xfs_file_dax_read(iocb, to);
287 148490463 : else if (iocb->ki_flags & IOCB_DIRECT)
288 82967680 : ret = xfs_file_dio_read(iocb, to);
289 : else
290 65522783 : ret = xfs_file_buffered_read(iocb, to);
291 :
292 148500250 : if (ret > 0)
293 68985305 : XFS_STATS_ADD(mp, xs_read_bytes, ret);
294 : return ret;
295 : }
296 :
297 : STATIC ssize_t
298 6345641 : xfs_file_splice_read(
299 : struct file *in,
300 : loff_t *ppos,
301 : struct pipe_inode_info *pipe,
302 : size_t len,
303 : unsigned int flags)
304 : {
305 6345641 : struct inode *inode = file_inode(in);
306 6345641 : struct xfs_inode *ip = XFS_I(inode);
307 6345641 : struct xfs_mount *mp = ip->i_mount;
308 6345641 : ssize_t ret = 0;
309 :
310 6345641 : XFS_STATS_INC(mp, xs_read_calls);
311 :
312 12691282 : if (xfs_is_shutdown(mp))
313 : return -EIO;
314 :
315 6345636 : trace_xfs_file_splice_read(ip, *ppos, len);
316 :
317 6345639 : xfs_ilock(ip, XFS_IOLOCK_SHARED);
318 6345637 : ret = filemap_splice_read(in, ppos, pipe, len, flags);
319 6345661 : xfs_iunlock(ip, XFS_IOLOCK_SHARED);
320 6345661 : if (ret > 0)
321 6345528 : XFS_STATS_ADD(mp, xs_read_bytes, ret);
322 : return ret;
323 : }
324 :
325 : /*
326 : * Decide if this file write requires COWing-around at either end of the write
327 : * range. This is only required if the file allocation unit is larger than
328 : * 1FSB and the write range is not aligned with the allocation unit.
329 : */
330 : static bool
331 128286184 : xfs_file_write_needs_cow_around(
332 : struct xfs_inode *ip,
333 : loff_t pos,
334 : long long int count)
335 : {
336 : /*
337 : * No COWing required if this inode doesn't do COW.
338 : *
339 : * If the allocation unit is 1FSB, we do not need to COW around the
340 : * edges of the operation range. This applies to all files on the data
341 : * device and rt files that have an extent size of 1FSB.
342 : */
343 128286184 : if (!xfs_inode_needs_cow_around(ip))
344 : return false;
345 :
346 : /*
347 : * Otherwise, check that the operation is aligned to the rt extent
348 : * size. Any unaligned operation /must/ be COWed around since the
349 : * regular reflink code only handles extending writes up to fsblock
350 : * boundaries.
351 : */
352 0 : return !xfs_is_falloc_aligned(ip, pos, count);
353 : }
354 :
355 : /* Do we need to COW-around at this offset to handle a truncate up or down? */
356 : bool
357 6209813 : xfs_truncate_needs_cow_around(
358 : struct xfs_inode *ip,
359 : loff_t pos)
360 : {
361 6209813 : return xfs_file_write_needs_cow_around(ip, pos, 0);
362 : }
363 :
364 : /* Does this file write require COWing around? */
365 : static inline bool
366 : xfs_iocb_needs_cow_around(
367 : struct xfs_inode *ip,
368 : const struct kiocb *iocb,
369 : const struct iov_iter *from)
370 : {
371 4174674 : return xfs_file_write_needs_cow_around(ip, iocb->ki_pos,
372 : iov_iter_count(from));
373 : }
374 :
375 : /* Unshare the allocation unit mapped to the given file position. */
376 : inline int
377 0 : xfs_file_unshare_at(
378 : struct xfs_inode *ip,
379 : loff_t pos)
380 : {
381 0 : loff_t isize = i_size_read(VFS_I(ip));
382 0 : unsigned int extsize, len;
383 0 : uint32_t mod;
384 :
385 0 : len = extsize = xfs_inode_alloc_unitsize(ip);
386 :
387 : /* Open-coded rounddown_64 so that we can skip out if aligned */
388 0 : div_u64_rem(pos, extsize, &mod);
389 0 : if (mod == 0)
390 : return 0;
391 0 : pos -= mod;
392 :
393 : /* Do not extend the file. */
394 0 : if (pos >= isize)
395 : return 0;
396 0 : if (pos + len > isize)
397 0 : len = isize - pos;
398 :
399 0 : trace_xfs_file_cow_around(ip, pos, len);
400 :
401 0 : if (IS_DAX(VFS_I(ip)))
402 : return dax_file_unshare(VFS_I(ip), pos, len,
403 : &xfs_dax_write_iomap_ops);
404 0 : return iomap_file_unshare(VFS_I(ip), pos, len,
405 : &xfs_buffered_write_iomap_ops);
406 : }
407 :
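The rounddown in xfs_file_unshare_at() is open-coded because the allocation unit need not be a power of two, so a real division is required, and the result is clamped so the unshare never extends the file. The arithmetic reduced to a standalone sketch (hypothetical helper, plain C):

	#include <stdint.h>

	/* Returns nonzero and fills *start/*len if an unshare is needed. */
	static int unshare_extent(uint64_t pos, uint32_t unit, uint64_t isize,
				  uint64_t *start, uint64_t *len)
	{
		uint64_t mod = pos % unit;

		if (mod == 0)		/* already aligned: nothing to do */
			return 0;
		pos -= mod;		/* round down to the unit boundary */
		if (pos >= isize)	/* do not extend the file */
			return 0;
		*start = pos;
		*len = (pos + unit > isize) ? isize - pos : unit;
		return 1;
	}
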
408 : /*
409 : * Dirty the pages on either side of a write request as needed to satisfy
410 : * alignment requirements if we're going to perform a copy-on-write.
411 : *
412 : * This is only needed for realtime files when the rt extent size is larger
413 : * than 1 fs block, because we don't allow a logical rt extent in a file to map
414 : * to multiple physical rt extents. In other words, we can only map and unmap
415 : * full rt extents. Note that page cache doesn't exist above EOF, so be
416 : * careful to stay below EOF.
417 : */
418 : static int
419 0 : xfs_file_cow_around(
420 : struct xfs_inode *ip,
421 : loff_t pos,
422 : long long int count)
423 : {
424 0 : int error;
425 :
426 : /* Unshare at the start of the extent. */
427 0 : error = xfs_file_unshare_at(ip, pos);
428 0 : if (error)
429 : return error;
430 :
431 : /* Unshare at the end. */
432 0 : return xfs_file_unshare_at(ip, pos + count);
433 : }
434 :
435 : /*
436 : * Common pre-write limit and setup checks.
437 : *
438 : * Called with the iolock held either shared or exclusive according to
439 : * @iolock, and returns with it held. Might upgrade the iolock to exclusive
440 : * if called for a direct write beyond i_size.
441 : */
442 : STATIC ssize_t
443 66052703 : xfs_file_write_checks(
444 : struct kiocb *iocb,
445 : struct iov_iter *from,
446 : unsigned int *iolock)
447 : {
448 66052703 : struct file *file = iocb->ki_filp;
449 66052703 : struct inode *inode = file->f_mapping->host;
450 66052703 : struct xfs_inode *ip = XFS_I(inode);
451 66052703 : ssize_t error = 0;
452 66052703 : size_t count = iov_iter_count(from);
453 66052703 : bool drained_dio = false;
454 85162436 : loff_t isize;
455 :
456 : restart:
457 85162436 : error = generic_write_checks(iocb, from);
458 85149389 : if (error <= 0)
459 6 : return error;
460 :
461 85149383 : if (iocb->ki_flags & IOCB_NOWAIT) {
462 0 : error = break_layout(inode, false);
463 0 : if (error == -EWOULDBLOCK)
464 : error = -EAGAIN;
465 : } else {
466 85149383 : error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
467 : }
468 :
469 85156521 : if (error)
470 0 : return error;
471 :
472 : /*
473 : * For changing security info in file_remove_privs() we need i_rwsem
474 : * exclusively. We also need it to COW around the range being written.
475 : */
476 85156521 : if (*iolock == XFS_IOLOCK_SHARED &&
477 8536747 : (!IS_NOSEC(inode) || xfs_iocb_needs_cow_around(ip, iocb, from))) {
478 187432 : xfs_iunlock(ip, *iolock);
479 187431 : *iolock = XFS_IOLOCK_EXCL;
480 187431 : error = xfs_ilock_iocb(iocb, *iolock);
481 187430 : if (error) {
482 0 : *iolock = 0;
483 0 : return error;
484 : }
485 187430 : goto restart;
486 : }
487 :
488 : /*
489 : * The write is not aligned to the file's allocation unit. If either
490 : * of the allocation units at the start or end of the write range is
491 : * shared, unshare them through the page cache.
492 : */
493 84969056 : if (xfs_iocb_needs_cow_around(ip, iocb, from)) {
494 0 : ASSERT(*iolock == XFS_IOLOCK_EXCL);
495 :
496 0 : inode_dio_wait(VFS_I(ip));
497 0 : drained_dio = true;
498 :
499 0 : error = xfs_file_cow_around(ip, iocb->ki_pos, count);
500 0 : if (error)
501 0 : return error;
502 : }
503 :
504 : /*
505 : * If the offset is beyond the size of the file, we need to zero any
506 : * blocks that fall between the existing EOF and the start of this
507 : * write. If zeroing is needed and we are currently holding the iolock
508 : * shared, we need to update it to exclusive which implies having to
509 : * redo all the checks done before.
510 : *
511 : * We need to serialise against EOF updates that occur in IO completions
512 : * here. We want to make sure that nobody is changing the size while we
513 : * do this check until we have placed an IO barrier (i.e. hold the
514 : * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The
515 : * spinlock effectively forms a memory barrier once we have the
516 : * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
517 : * hence be able to correctly determine if we need to run zeroing.
518 : *
519 : * We can do an unlocked check here safely as IO completion can only
520 : * extend EOF. Truncate is locked out at this point, so the EOF can
521 : * not move backwards, only forwards. Hence we only need to take the
522 : * slow path and spin locks when we are at or beyond the current EOF.
523 : */
524 84963809 : if (iocb->ki_pos <= i_size_read(inode))
525 47140600 : goto out;
526 :
527 37823209 : spin_lock(&ip->i_flags_lock);
528 37833517 : isize = i_size_read(inode);
529 37833517 : if (iocb->ki_pos > isize) {
530 37833517 : spin_unlock(&ip->i_flags_lock);
531 :
532 37839267 : if (iocb->ki_flags & IOCB_NOWAIT)
533 : return -EAGAIN;
534 :
535 37839267 : if (!drained_dio) {
536 18928783 : if (*iolock == XFS_IOLOCK_SHARED) {
537 100403 : xfs_iunlock(ip, *iolock);
538 100395 : *iolock = XFS_IOLOCK_EXCL;
539 100395 : xfs_ilock(ip, *iolock);
540 100400 : iov_iter_reexpand(from, count);
541 : }
542 : /*
543 : * We now have an IO submission barrier in place, but
544 : * AIO can do EOF updates during IO completion and hence
545 : * we now need to wait for all of them to drain. Non-AIO
546 : * DIO will have drained before we are given the
547 : * XFS_IOLOCK_EXCL, and so for most cases this wait is a
548 : * no-op.
549 : */
550 18928780 : inode_dio_wait(inode);
551 18922303 : drained_dio = true;
552 18922303 : goto restart;
553 : }
554 :
555 : /*
556 : * If we're starting the write past EOF, COW the allocation
557 : * unit containing the current EOF before we start zeroing the
558 : * range between EOF and the start of the write.
559 : */
560 18910484 : if (xfs_truncate_needs_cow_around(ip, isize)) {
561 0 : error = xfs_file_unshare_at(ip, isize);
562 0 : if (error)
563 : return error;
564 : }
565 :
566 18912862 : trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
567 18912727 : error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
568 18909880 : if (error)
569 : return error;
570 : } else
571 0 : spin_unlock(&ip->i_flags_lock);
572 :
573 66049829 : out:
574 66049829 : return kiocb_modified(iocb);
575 : }
576 :
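The EOF-zeroing logic above leans on the invariant that i_size only grows while the iolock is held, so an unlocked comparison can rule out the slow path, and only the beyond-EOF path takes the spinlock and rechecks. The shape of that check-then-recheck pattern in plain C (pthread spinlock standing in for i_flags_lock; the unlocked read is a sketch, not strictly race-free C):

	#include <pthread.h>
	#include <stdbool.h>
	#include <stdint.h>

	static bool write_starts_past_eof(pthread_spinlock_t *lock,
					  const uint64_t *isize, uint64_t pos)
	{
		bool past;

		if (pos <= *isize)	/* unlocked fast path: EOF only grows */
			return false;
		pthread_spin_lock(lock);
		past = pos > *isize;	/* recheck under the lock */
		pthread_spin_unlock(lock);
		return past;
	}
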
577 : static int
578 6769038 : xfs_dio_write_end_io(
579 : struct kiocb *iocb,
580 : ssize_t size,
581 : int error,
582 : unsigned flags)
583 : {
584 6769038 : struct inode *inode = file_inode(iocb->ki_filp);
585 6769038 : struct xfs_inode *ip = XFS_I(inode);
586 6769038 : loff_t offset = iocb->ki_pos;
587 6769038 : unsigned int nofs_flag;
588 :
589 6769038 : trace_xfs_end_io_direct_write(ip, offset, size);
590 :
591 13537904 : if (xfs_is_shutdown(ip->i_mount))
592 : return -EIO;
593 :
594 6768263 : if (error)
595 : return error;
596 6219394 : if (!size)
597 : return 0;
598 :
599 : /*
600 : * Capture amount written on completion as we can't reliably account
601 : * for it on submission.
602 : */
603 6219394 : XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
604 :
605 : /*
606 : * We can allocate memory here while doing writeback on behalf of
607 : * memory reclaim. To avoid memory allocation deadlocks, set the
608 : * task-wide nofs context for the following operations.
609 : */
610 6219394 : nofs_flag = memalloc_nofs_save();
611 :
612 6219394 : if (flags & IOMAP_DIO_COW) {
613 1132002 : error = xfs_reflink_end_cow(ip, offset, size);
614 1132003 : if (error)
615 15 : goto out;
616 : }
617 :
618 : /*
619 : * Unwritten conversion updates the in-core isize after extent
620 : * conversion but before updating the on-disk size. Updating isize any
621 : * earlier allows a racing dio read to find unwritten extents before
622 : * they are converted.
623 : */
624 6219380 : if (flags & IOMAP_DIO_UNWRITTEN) {
625 3301173 : error = xfs_iomap_write_unwritten(ip, offset, size, true);
626 3301222 : goto out;
627 : }
628 :
629 : /*
630 : * We need to update the in-core inode size here so that we don't end up
631 : * with the on-disk inode size being outside the in-core inode size. We
632 : * have no other method of updating EOF for AIO, so always do it here
633 : * if necessary.
634 : *
635 : * We need to lock the test/set EOF update as we can be racing with
636 : * other IO completions here to update the EOF. Failing to serialise
637 : * here can result in EOF moving backwards and Bad Things Happen when
638 : * that occurs.
639 : *
640 : * As IO completion only ever extends EOF, we can do an unlocked check
641 : * here to avoid taking the spinlock. If we land within the current EOF,
642 : * then we do not need to do an extending update at all, and we don't
643 : * need to take the lock to check this. If we race with an update moving
644 : * EOF, then we'll either still be beyond EOF and need to take the lock,
645 : * or we'll be within EOF and we don't need to take it at all.
646 : */
647 2918207 : if (offset + size <= i_size_read(inode))
648 2752012 : goto out;
649 :
650 166195 : spin_lock(&ip->i_flags_lock);
651 166195 : if (offset + size > i_size_read(inode)) {
652 166195 : i_size_write(inode, offset + size);
653 166195 : spin_unlock(&ip->i_flags_lock);
654 166195 : error = xfs_setfilesize(ip, offset, size);
655 : } else {
656 0 : spin_unlock(&ip->i_flags_lock);
657 : }
658 :
659 6219444 : out:
660 6219444 : memalloc_nofs_restore(nofs_flag);
661 6219444 : return error;
662 : }
663 :
664 : static const struct iomap_dio_ops xfs_dio_write_ops = {
665 : .end_io = xfs_dio_write_end_io,
666 : };
667 :
668 : /*
669 : * Handle block-aligned direct I/O writes
670 : */
671 : static noinline ssize_t
672 3739988 : xfs_file_dio_write_aligned(
673 : struct xfs_inode *ip,
674 : struct kiocb *iocb,
675 : struct iov_iter *from)
676 : {
677 3739988 : unsigned int iolock = XFS_IOLOCK_SHARED;
678 3739988 : ssize_t ret;
679 :
680 : /*
681 : * If the range to write is not aligned to an allocation unit, we will
682 : * have to COW the allocation units on both ends of the write. Because
683 : * this runs through the page cache, it requires IOLOCK_EXCL. This
684 : * predicate performs an unlocked access of the rt and reflink inode
685 : * state.
686 : */
687 3739988 : if (xfs_iocb_needs_cow_around(ip, iocb, from))
688 0 : iolock = XFS_IOLOCK_EXCL;
689 :
690 3739980 : ret = xfs_ilock_iocb(iocb, iolock);
691 3740089 : if (ret)
692 : return ret;
693 3740087 : ret = xfs_file_write_checks(iocb, from, &iolock);
694 3740036 : if (ret)
695 631 : goto out_unlock;
696 :
697 : /*
698 : * We don't need to hold the IOLOCK exclusively across the IO, so demote
699 : * the iolock back to shared if we had to take the exclusive lock in
700 : * xfs_file_write_checks() for other reasons.
701 : */
702 3739405 : if (iolock == XFS_IOLOCK_EXCL) {
703 140954 : xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
704 140953 : iolock = XFS_IOLOCK_SHARED;
705 : }
706 3739404 : trace_xfs_file_direct_write(iocb, from);
707 3739395 : ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
708 : &xfs_dio_write_ops, 0, NULL, 0);
709 3740108 : out_unlock:
710 3740108 : if (iolock)
711 3740112 : xfs_iunlock(ip, iolock);
712 : return ret;
713 : }
714 :
715 : /*
716 : * Handle block-unaligned direct I/O writes
717 : *
718 : * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
719 : * them to be done in parallel with reads and other direct I/O writes. However,
720 : * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
721 : * to do sub-block zeroing and that requires serialisation against other direct
722 : * I/O to the same block. In this case we need to serialise the submission of
723 : * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
724 : * In the case where sub-block zeroing is not required, we can do concurrent
725 : * sub-block dios to the same block successfully.
726 : *
727 : * Optimistically submit the I/O using the shared lock first, but use the
728 : * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
729 : * if block allocation or partial block zeroing would be required. In that case
730 : * we try again with the exclusive lock.
731 : */
732 : static noinline ssize_t
733 7024080 : xfs_file_dio_write_unaligned(
734 : struct xfs_inode *ip,
735 : struct kiocb *iocb,
736 : struct iov_iter *from)
737 : {
738 7024080 : size_t isize = i_size_read(VFS_I(ip));
739 7024080 : size_t count = iov_iter_count(from);
740 7024080 : unsigned int iolock = XFS_IOLOCK_SHARED;
741 7024080 : unsigned int flags = IOMAP_DIO_OVERWRITE_ONLY;
742 7024080 : ssize_t ret;
743 :
744 : /*
745 : * Extending writes need exclusivity because of the sub-block zeroing
746 : * that the DIO code always does for partial tail blocks beyond EOF, so
747 : * don't even bother trying the fast path in this case.
748 : */
749 7024080 : if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
750 2650425 : if (iocb->ki_flags & IOCB_NOWAIT)
751 : return -EAGAIN;
752 2650425 : retry_exclusive:
753 3171972 : iolock = XFS_IOLOCK_EXCL;
754 3171972 : flags = IOMAP_DIO_FORCE_WAIT;
755 : }
756 :
757 7545627 : ret = xfs_ilock_iocb(iocb, iolock);
758 7545624 : if (ret)
759 : return ret;
760 :
761 : /*
762 : * We can't properly handle unaligned direct I/O to reflink files yet,
763 : * as we can't unshare a partial block.
764 : */
765 7545626 : if (xfs_is_cow_inode(ip)) {
766 4515916 : trace_xfs_reflink_bounce_dio_write(iocb, from);
767 4515916 : ret = -ENOTBLK;
768 4515916 : goto out_unlock;
769 : }
770 :
771 3029700 : ret = xfs_file_write_checks(iocb, from, &iolock);
772 3029716 : if (ret)
773 77 : goto out_unlock;
774 :
775 : /*
776 : * If we are doing exclusive unaligned I/O, this must be the only I/O
777 : * in-flight. Otherwise we risk data corruption due to unwritten extent
778 : * conversions from the AIO end_io handler. Wait for all other I/O to
779 : * drain first.
780 : */
781 3029639 : if (flags & IOMAP_DIO_FORCE_WAIT)
782 2407613 : inode_dio_wait(VFS_I(ip));
783 :
784 3029631 : trace_xfs_file_direct_write(iocb, from);
785 3029624 : ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
786 : &xfs_dio_write_ops, flags, NULL, 0);
787 :
788 : /*
789 : * Retry unaligned I/O with exclusive blocking semantics if the DIO
790 : * layer rejected it for mapping or locking reasons. If we are doing
791 : * nonblocking user I/O, propagate the error.
792 : */
793 3029690 : if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
794 521549 : ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
795 521549 : xfs_iunlock(ip, iolock);
796 521547 : goto retry_exclusive;
797 : }
798 :
799 2508141 : out_unlock:
800 7024134 : if (iolock)
801 7024117 : xfs_iunlock(ip, iolock);
802 : return ret;
803 : }
804 :
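The retry logic above is try-shared-first: submit with IOMAP_DIO_OVERWRITE_ONLY under the shared lock, and if the lower layers return -EAGAIN and the caller did not ask for nonblocking I/O, redo the submission under the exclusive lock. Reduced to its control-flow skeleton (hypothetical callback, sketch only):

	#include <errno.h>
	#include <stdbool.h>

	/* submit() stands in for the locked iomap_dio_rw() call above. */
	static long submit_unaligned(long (*submit)(bool exclusive), bool nowait)
	{
		long ret = submit(false);	/* optimistic shared-lock pass */

		if (ret == -EAGAIN && !nowait)
			ret = submit(true);	/* exclusive, blocking retry */
		return ret;
	}
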
805 : static ssize_t
806 10763985 : xfs_file_dio_write(
807 : struct kiocb *iocb,
808 : struct iov_iter *from)
809 : {
810 10763985 : struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
811 10763985 : struct xfs_buftarg *target = xfs_inode_buftarg(ip);
812 10763985 : size_t count = iov_iter_count(from);
813 :
814 : /* direct I/O must be aligned to device logical sector size */
815 10763985 : if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
816 : return -EINVAL;
817 10763985 : if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
818 7024061 : return xfs_file_dio_write_unaligned(ip, iocb, from);
819 3739924 : return xfs_file_dio_write_aligned(ip, iocb, from);
820 : }
821 :
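The sector-mask test above is what rejects misaligned O_DIRECT writes with -EINVAL before any I/O is attempted. From userspace that means direct I/O needs aligned buffers, offsets and lengths; a minimal example ("file.dat" and the 4096-byte alignment are assumptions, the real requirement is the device's logical sector size):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdlib.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		void *buf;
		int fd = open("file.dat", O_WRONLY | O_CREAT | O_DIRECT, 0644);

		if (fd < 0 || posix_memalign(&buf, 4096, 4096))
			return 1;
		memset(buf, 0, 4096);
		pwrite(fd, buf, 4096, 0);	/* aligned: accepted */
		pwrite(fd, buf, 100, 4096);	/* sub-sector length: -EINVAL */
		free(buf);
		return close(fd);
	}
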
822 : static noinline ssize_t
823 : xfs_file_dax_write(
824 : struct kiocb *iocb,
825 : struct iov_iter *from)
826 : {
827 : struct inode *inode = iocb->ki_filp->f_mapping->host;
828 : struct xfs_inode *ip = XFS_I(inode);
829 : unsigned int iolock = XFS_IOLOCK_EXCL;
830 : ssize_t ret, error = 0;
831 : loff_t pos;
832 :
833 : ret = xfs_ilock_iocb(iocb, iolock);
834 : if (ret)
835 : return ret;
836 : ret = xfs_file_write_checks(iocb, from, &iolock);
837 : if (ret)
838 : goto out;
839 :
840 : pos = iocb->ki_pos;
841 :
842 : trace_xfs_file_dax_write(iocb, from);
843 : ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
844 : if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
845 : i_size_write(inode, iocb->ki_pos);
846 : error = xfs_setfilesize(ip, pos, ret);
847 : }
848 : out:
849 : if (iolock)
850 : xfs_iunlock(ip, iolock);
851 : if (error)
852 : return error;
853 :
854 : if (ret > 0) {
855 : XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
856 :
857 : /* Handle various SYNC-type writes */
858 : ret = generic_write_sync(iocb, ret);
859 : }
860 : return ret;
861 : }
862 :
863 : STATIC ssize_t
864 58854271 : xfs_file_buffered_write(
865 : struct kiocb *iocb,
866 : struct iov_iter *from)
867 : {
868 58854271 : struct inode *inode = iocb->ki_filp->f_mapping->host;
869 58854271 : struct xfs_inode *ip = XFS_I(inode);
870 58854271 : ssize_t ret;
871 58854271 : bool cleared_space = false;
872 59284213 : unsigned int iolock;
873 :
874 : write_retry:
875 59284213 : iolock = XFS_IOLOCK_EXCL;
876 59284213 : ret = xfs_ilock_iocb(iocb, iolock);
877 59284482 : if (ret)
878 0 : return ret;
879 :
880 59284482 : ret = xfs_file_write_checks(iocb, from, &iolock);
881 59270324 : if (ret)
882 669 : goto out;
883 :
884 59269655 : trace_xfs_file_buffered_write(iocb, from);
885 59282568 : ret = iomap_file_buffered_write(iocb, from,
886 : &xfs_buffered_write_iomap_ops);
887 :
888 : /*
889 : * If we hit a space limit, try to free up some lingering preallocated
890 : * space before returning an error. In the case of ENOSPC, first try to
891 : * write back all dirty inodes to free up some of the excess reserved
892 : * metadata space. This reduces the chances that the eofblocks scan
893 : * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
894 : * also behaves as a filter to prevent too many eofblocks scans from
895 : * running at the same time. Use a synchronous scan to increase the
896 : * effectiveness of the scan.
897 : */
898 59279652 : if (ret == -EDQUOT && !cleared_space) {
899 232 : xfs_iunlock(ip, iolock);
900 232 : xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC);
901 232 : cleared_space = true;
902 232 : goto write_retry;
903 59279420 : } else if (ret == -ENOSPC && !cleared_space) {
904 429539 : struct xfs_icwalk icw = {0};
905 :
906 429539 : cleared_space = true;
907 429539 : xfs_flush_inodes(ip->i_mount);
908 :
909 429580 : xfs_iunlock(ip, iolock);
910 429505 : icw.icw_flags = XFS_ICWALK_FLAG_SYNC;
911 429505 : xfs_blockgc_free_space(ip->i_mount, &icw);
912 429710 : goto write_retry;
913 : }
914 :
915 58849881 : out:
916 58850550 : if (iolock)
917 58856607 : xfs_iunlock(ip, iolock);
918 :
919 58848367 : if (ret > 0) {
920 58477592 : XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
921 : /* Handle various SYNC-type writes */
922 58477592 : ret = generic_write_sync(iocb, ret);
923 : }
924 : return ret;
925 : }
926 :
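The write_retry loop above is a retry-exactly-once policy: on -EDQUOT or -ENOSPC, drop the lock, free lingering preallocated space, and resubmit; cleared_space guarantees termination. The skeleton of that policy (hypothetical callbacks, sketch only):

	#include <errno.h>
	#include <stdbool.h>

	static long write_with_reclaim(long (*do_write)(void),
				       void (*reclaim)(void))
	{
		bool cleared_space = false;
		long ret;

		for (;;) {
			ret = do_write();
			if ((ret != -ENOSPC && ret != -EDQUOT) || cleared_space)
				return ret;
			reclaim();		/* flush inodes, trim prealloc */
			cleared_space = true;	/* retry at most once */
		}
	}
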
927 : STATIC ssize_t
928 65104148 : xfs_file_write_iter(
929 : struct kiocb *iocb,
930 : struct iov_iter *from)
931 : {
932 65104148 : struct inode *inode = iocb->ki_filp->f_mapping->host;
933 65104148 : struct xfs_inode *ip = XFS_I(inode);
934 65104148 : ssize_t ret;
935 65104148 : size_t ocount = iov_iter_count(from);
936 :
937 65104148 : XFS_STATS_INC(ip->i_mount, xs_write_calls);
938 :
939 65104148 : if (ocount == 0)
940 : return 0;
941 :
942 130208152 : if (xfs_is_shutdown(ip->i_mount))
943 : return -EIO;
944 :
945 65094347 : if (IS_DAX(inode))
946 : return xfs_file_dax_write(iocb, from);
947 :
948 65094347 : if (iocb->ki_flags & IOCB_DIRECT) {
949 : /*
950 : * Allow a directio write to fall back to a buffered
951 : * write *only* in the case that we're doing a reflink
952 : * CoW. In all other directio scenarios we do not
953 : * allow an operation to fall back to buffered mode.
954 : */
955 10764045 : ret = xfs_file_dio_write(iocb, from);
956 10764133 : if (ret != -ENOTBLK)
957 : return ret;
958 : }
959 :
960 58846315 : return xfs_file_buffered_write(iocb, from);
961 : }
962 :
963 : /* Does this file, inode, or mount want synchronous writes? */
964 408708277 : static inline bool xfs_file_sync_writes(struct file *filp)
965 : {
966 408708277 : struct xfs_inode *ip = XFS_I(file_inode(filp));
967 :
968 408708277 : if (xfs_has_wsync(ip->i_mount))
969 : return true;
970 408708269 : if (filp->f_flags & (__O_SYNC | O_DSYNC))
971 : return true;
972 408701135 : if (IS_SYNC(file_inode(filp)))
973 4 : return true;
974 :
975 : return false;
976 : }
977 :
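xfs_file_sync_writes() combines mount options, open flags and inode state to decide whether fallocate and remap must end with a log force. The userspace-visible half of that test can be probed with standard fcntl(2) (hypothetical helper):

	#include <fcntl.h>
	#include <stdbool.h>

	/* Does this fd request synchronous writes via its open flags? */
	static bool fd_wants_sync_writes(int fd)
	{
		int fl = fcntl(fd, F_GETFL);

		return fl >= 0 && (fl & (O_SYNC | O_DSYNC));
	}
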
978 : #define XFS_FALLOC_FL_SUPPORTED \
979 : (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
980 : FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \
981 : FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE | \
982 : FALLOC_FL_MAP_FREE_SPACE)
983 :
984 : STATIC long
985 16125042 : xfs_file_fallocate(
986 : struct file *file,
987 : int mode,
988 : loff_t offset,
989 : loff_t len)
990 : {
991 16125042 : struct inode *inode = file_inode(file);
992 16125042 : struct xfs_inode *ip = XFS_I(inode);
993 16125042 : long error;
994 16125042 : uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
995 16125042 : loff_t new_size = 0;
996 16125042 : bool do_file_insert = false;
997 :
998 16125042 : if (!S_ISREG(inode->i_mode))
999 : return -EINVAL;
1000 16125042 : if (mode & ~XFS_FALLOC_FL_SUPPORTED)
1001 : return -EOPNOTSUPP;
1002 :
1003 16125042 : xfs_ilock(ip, iolock);
1004 16125108 : error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
1005 16125120 : if (error)
1006 0 : goto out_unlock;
1007 :
1008 : /*
1009 : * Must wait for all AIO to complete before we continue as AIO can
1010 : * change the file size on completion without holding any locks we
1011 : * currently hold. We must do this first because AIO can update both
1012 : * the on disk and in memory inode sizes, and the operations that follow
1013 : * require the in-memory size to be fully up-to-date.
1014 : */
1015 16125120 : inode_dio_wait(inode);
1016 :
1017 : /*
1018 : * Now that AIO and DIO have drained, we flush and (if necessary) invalidate
1019 : * the cached range over the first operation we are about to run.
1020 : *
1021 : * We care about zero and collapse here because they both run a hole
1022 : * punch over the range first. Because that can zero data, and the range
1023 : * of invalidation for the shift operations is much larger, we still do
1024 : * the required flush for collapse in xfs_prepare_shift().
1025 : *
1026 : * Insert has the same range requirements as collapse, and we extend the
1027 : * file first which can zero data. Hence insert has the same
1028 : * flush/invalidate requirements as collapse and so they are both
1029 : * handled at the right time by xfs_prepare_shift().
1030 : */
1031 16125059 : if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
1032 : FALLOC_FL_COLLAPSE_RANGE)) {
1033 11250602 : error = xfs_flush_unmap_range(ip, offset, len);
1034 11250674 : if (error)
1035 178 : goto out_unlock;
1036 : }
1037 :
1038 16124953 : error = file_modified(file);
1039 16124997 : if (error)
1040 4 : goto out_unlock;
1041 :
1042 16124993 : if (mode & FALLOC_FL_PUNCH_HOLE) {
1043 : /* Unshare around the region to punch, if needed. */
1044 8321726 : if (xfs_file_write_needs_cow_around(ip, offset, len)) {
1045 0 : error = xfs_file_cow_around(ip, offset, len);
1046 0 : if (error)
1047 0 : goto out_unlock;
1048 : }
1049 :
1050 8321721 : error = xfs_free_file_space(ip, offset, len);
1051 8321720 : if (error)
1052 14190 : goto out_unlock;
1053 7803267 : } else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
1054 955014 : if (!xfs_is_falloc_aligned(ip, offset, len)) {
1055 334242 : error = -EINVAL;
1056 334242 : goto out_unlock;
1057 : }
1058 :
1059 : /*
1060 : * There is no need for a collapse range to overlap EOF;
1061 : * in that case it is effectively a truncate operation
1062 : */
1063 620771 : if (offset + len >= i_size_read(inode)) {
1064 146235 : error = -EINVAL;
1065 146235 : goto out_unlock;
1066 : }
1067 :
1068 474536 : new_size = i_size_read(inode) - len;
1069 :
1070 474536 : error = xfs_collapse_file_space(ip, offset, len);
1071 474536 : if (error)
1072 1809 : goto out_unlock;
1073 6848253 : } else if (mode & FALLOC_FL_INSERT_RANGE) {
1074 858502 : loff_t isize = i_size_read(inode);
1075 :
1076 858502 : if (!xfs_is_falloc_aligned(ip, offset, len)) {
1077 334578 : error = -EINVAL;
1078 334578 : goto out_unlock;
1079 : }
1080 :
1081 : /*
1082 : * New inode size must not exceed ->s_maxbytes, accounting for
1083 : * possible signed overflow.
1084 : */
1085 523924 : if (inode->i_sb->s_maxbytes - isize < len) {
1086 2 : error = -EFBIG;
1087 2 : goto out_unlock;
1088 : }
1089 523922 : new_size = isize + len;
1090 :
1091 : /* Offset should be less than i_size */
1092 523922 : if (offset >= isize) {
1093 119480 : error = -EINVAL;
1094 119480 : goto out_unlock;
1095 : }
1096 : do_file_insert = true;
1097 5989751 : } else if (mode & FALLOC_FL_MAP_FREE_SPACE) {
1098 204 : struct xfs_mount *mp = ip->i_mount;
1099 204 : xfs_off_t device_size;
1100 :
1101 204 : if (!capable(CAP_SYS_ADMIN)) {
1102 0 : error = -EPERM;
1103 0 : goto out_unlock;
1104 : }
1105 :
1106 204 : if (XFS_IS_REALTIME_INODE(ip))
1107 0 : device_size = XFS_FSB_TO_B(mp, mp->m_sb.sb_rblocks);
1108 : else
1109 204 : device_size = XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks);
1110 :
1111 : /*
1112 : * Bail out now if we aren't allowed to make the file size the
1113 : * same length as the device.
1114 : */
1115 204 : if (device_size > i_size_read(inode)) {
1116 4 : new_size = device_size;
1117 4 : error = inode_newsize_ok(inode, new_size);
1118 4 : if (error)
1119 0 : goto out_unlock;
1120 : }
1121 :
1122 204 : if (XFS_IS_REALTIME_INODE(ip))
1123 0 : error = xfs_map_free_rt_space(ip, offset, len);
1124 : else
1125 204 : error = xfs_map_free_space(ip, offset, len);
1126 204 : if (error) {
1127 0 : if (error == -ECANCELED)
1128 0 : error = 0;
1129 0 : goto out_unlock;
1130 : }
1131 : } else {
1132 5989547 : if (!(mode & FALLOC_FL_KEEP_SIZE) &&
1133 2574464 : offset + len > i_size_read(inode)) {
1134 1370327 : new_size = offset + len;
1135 1370327 : error = inode_newsize_ok(inode, new_size);
1136 1370327 : if (error)
1137 2 : goto out_unlock;
1138 : }
1139 :
1140 5989545 : if (mode & FALLOC_FL_ZERO_RANGE) {
1141 : /*
1142 : * Punch a hole and prealloc the range. We use a hole
1143 : * punch rather than unwritten extent conversion for two
1144 : * reasons:
1145 : *
1146 : * 1.) Hole punch handles partial block zeroing for us.
1147 : * 2.) If prealloc returns ENOSPC, the file range is
1148 : * still zero-valued by virtue of the hole punch.
1149 : */
1150 1973777 : unsigned int blksize = i_blocksize(inode);
1151 :
1152 1973777 : trace_xfs_zero_file_space(ip, offset, len);
1153 :
1154 : /* Unshare around the region to zero, if needed. */
1155 1973777 : if (xfs_file_write_needs_cow_around(ip, offset, len)) {
1156 0 : error = xfs_file_cow_around(ip, offset, len);
1157 0 : if (error)
1158 0 : goto out_unlock;
1159 : }
1160 :
1161 1973776 : error = xfs_free_file_space(ip, offset, len);
1162 1973778 : if (error)
1163 8367 : goto out_unlock;
1164 :
1165 1965411 : len = round_up(offset + len, blksize) -
1166 1965411 : round_down(offset, blksize);
1167 1965411 : offset = round_down(offset, blksize);
1168 4015768 : } else if (mode & FALLOC_FL_UNSHARE_RANGE) {
1169 : /*
1170 : * Enlarge the unshare region to align to a full
1171 : * allocation unit.
1172 : */
1173 66 : if (xfs_inode_needs_cow_around(ip)) {
1174 0 : loff_t isize = i_size_read(VFS_I(ip));
1175 0 : unsigned int rextsize;
1176 0 : uint32_t mod;
1177 :
1178 0 : rextsize = xfs_inode_alloc_unitsize(ip);
1179 0 : div_u64_rem(offset, rextsize, &mod);
1180 0 : offset -= mod;
1181 0 : len += mod;
1182 :
1183 0 : div_u64_rem(offset + len, rextsize, &mod);
1184 0 : if (mod)
1185 0 : len += rextsize - mod;
1186 0 : if (offset + len > isize)
1187 0 : len = isize - offset;
1188 : }
1189 66 : error = xfs_reflink_unshare(ip, offset, len);
1190 66 : if (error)
1191 2 : goto out_unlock;
1192 : } else {
1193 : /*
1194 : * In always_cow mode we can't use preallocations and
1195 : * thus should not create them.
1196 : */
1197 4015702 : if (xfs_is_always_cow_inode(ip)) {
1198 0 : error = -EOPNOTSUPP;
1199 0 : goto out_unlock;
1200 : }
1201 : }
1202 :
1203 5981179 : if (!xfs_is_always_cow_inode(ip)) {
1204 5981167 : error = xfs_alloc_file_space(ip, offset, len);
1205 5981180 : if (error)
1206 99939 : goto out_unlock;
1207 : }
1208 : }
1209 :
1210 : /* Change file size if needed */
1211 15066144 : if (new_size) {
1212 2239869 : struct iattr iattr;
1213 :
1214 2239869 : iattr.ia_valid = ATTR_SIZE;
1215 2239869 : iattr.ia_size = new_size;
1216 4479739 : error = xfs_vn_setattr_size(file_mnt_idmap(file),
1217 : file_dentry(file), &iattr);
1218 2239869 : if (error)
1219 328 : goto out_unlock;
1220 : }
1221 :
1222 : /*
1223 : * Perform hole insertion now that the file size has been
1224 : * updated so that if we crash during the operation we don't
1225 : * leave shifted extents past EOF and hence lose access to
1226 : * the data contained within them.
1227 : */
1228 15065816 : if (do_file_insert) {
1229 404342 : error = xfs_insert_file_space(ip, offset, len);
1230 404342 : if (error)
1231 1779 : goto out_unlock;
1232 : }
1233 :
1234 15064037 : if (xfs_file_sync_writes(file))
1235 7130 : error = xfs_log_force_inode(ip);
1236 :
1237 15056907 : out_unlock:
1238 16125172 : xfs_iunlock(ip, iolock);
1239 16125172 : return error;
1240 : }
1241 :
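The mode dispatch above is driven by the fallocate(2) flags listed in XFS_FALLOC_FL_SUPPORTED. A userspace sketch of two supported operations (the fd is a placeholder; note that PUNCH_HOLE must be combined with KEEP_SIZE, which the VFS enforces):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <sys/types.h>
	#include <linux/falloc.h>

	static int punch_hole(int fd, off_t off, off_t len)
	{
		return fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
				 off, len);
	}

	static int zero_range(int fd, off_t off, off_t len)
	{
		return fallocate(fd, FALLOC_FL_ZERO_RANGE, off, len);
	}
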
1242 : STATIC int
1243 2120886 : xfs_file_fadvise(
1244 : struct file *file,
1245 : loff_t start,
1246 : loff_t end,
1247 : int advice)
1248 : {
1249 2120886 : struct xfs_inode *ip = XFS_I(file_inode(file));
1250 2120886 : int ret;
1251 2120886 : int lockflags = 0;
1252 :
1253 : /*
1254 : * Operations creating pages in page cache need protection from hole
1255 : * punching and similar ops
1256 : */
1257 2120886 : if (advice == POSIX_FADV_WILLNEED) {
1258 0 : lockflags = XFS_IOLOCK_SHARED;
1259 0 : xfs_ilock(ip, lockflags);
1260 : }
1261 2120886 : ret = generic_fadvise(file, start, end, advice);
1262 2120945 : if (lockflags)
1263 0 : xfs_iunlock(ip, lockflags);
1264 2120945 : return ret;
1265 : }
1266 :
1267 : STATIC loff_t
1268 272727396 : xfs_file_remap_range(
1269 : struct file *file_in,
1270 : loff_t pos_in,
1271 : struct file *file_out,
1272 : loff_t pos_out,
1273 : loff_t len,
1274 : unsigned int remap_flags)
1275 : {
1276 272727396 : struct inode *inode_in = file_inode(file_in);
1277 272727396 : struct xfs_inode *src = XFS_I(inode_in);
1278 272727396 : struct inode *inode_out = file_inode(file_out);
1279 272727396 : struct xfs_inode *dest = XFS_I(inode_out);
1280 272727396 : struct xfs_mount *mp = src->i_mount;
1281 272727396 : loff_t remapped = 0;
1282 272727396 : xfs_extlen_t cowextsize;
1283 272727396 : int ret;
1284 :
1285 272727396 : if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
1286 : return -EINVAL;
1287 :
1288 272727396 : if (!xfs_has_reflink(mp))
1289 : return -EOPNOTSUPP;
1290 :
1291 545419526 : if (xfs_is_shutdown(mp))
1292 : return -EIO;
1293 :
1294 : /* Prepare and then clone file data. */
1295 272707663 : ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
1296 : &len, remap_flags);
1297 272712711 : if (ret || len == 0)
1298 75293247 : return ret;
1299 :
1300 197419464 : trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
1301 :
1302 197417069 : ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
1303 : &remapped);
1304 197422847 : if (ret)
1305 597900 : goto out_unlock;
1306 :
1307 : /*
1308 : * Carry the cowextsize hint from src to dest if we're sharing the
1309 : * entire source file to the entire destination file, the source file
1310 : * has a cowextsize hint, and the destination file does not.
1311 : */
1312 196824947 : cowextsize = 0;
1313 196824947 : if (pos_in == 0 && len == i_size_read(inode_in) &&
1314 45816 : (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
1315 48 : pos_out == 0 && len >= i_size_read(inode_out) &&
1316 46 : !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE))
1317 6 : cowextsize = src->i_cowextsize;
1318 :
1319 196824947 : ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
1320 : remap_flags);
1321 196824584 : if (ret)
1322 0 : goto out_unlock;
1323 :
1324 196824584 : if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
1325 942 : xfs_log_force_inode(dest);
1326 196823724 : out_unlock:
1327 197421640 : xfs_iunlock2_io_mmap(src, dest);
1328 197419236 : if (ret)
1329 597904 : trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
1330 197419235 : return remapped > 0 ? remapped : ret;
1331 : }
1332 :
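->remap_file_range above is reached from userspace through the clone ioctls and copy_file_range(2). A minimal whole-file reflink clone (fds are placeholders):

	#include <sys/ioctl.h>
	#include <linux/fs.h>		/* FICLONE, FICLONERANGE */

	/* Share all of src_fd's extents with dest_fd (reflink copy). */
	static int clone_file(int dest_fd, int src_fd)
	{
		return ioctl(dest_fd, FICLONE, src_fd);
	}
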
1333 : STATIC int
1334 484048933 : xfs_file_open(
1335 : struct inode *inode,
1336 : struct file *file)
1337 : {
1338 968097866 : if (xfs_is_shutdown(XFS_M(inode->i_sb)))
1339 : return -EIO;
1340 484034572 : file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC |
1341 : FMODE_DIO_PARALLEL_WRITE | FMODE_CAN_ODIRECT;
1342 484034572 : return generic_file_open(inode, file);
1343 : }
1344 :
1345 : STATIC int
1346 26767218 : xfs_dir_open(
1347 : struct inode *inode,
1348 : struct file *file)
1349 : {
1350 26767218 : struct xfs_inode *ip = XFS_I(inode);
1351 26767218 : unsigned int mode;
1352 26767218 : int error;
1353 :
1354 26767218 : error = xfs_file_open(inode, file);
1355 26767471 : if (error)
1356 : return error;
1357 :
1358 : /*
1359 : * If there are any blocks, read-ahead block 0 as we're almost
1360 : * certain to have the next operation be a read there.
1361 : */
1362 26766430 : mode = xfs_ilock_data_map_shared(ip);
1363 26766549 : if (ip->i_df.if_nextents > 0)
1364 6481797 : error = xfs_dir3_data_readahead(ip, 0, 0);
1365 26766502 : xfs_iunlock(ip, mode);
1366 26766502 : return error;
1367 : }
1368 :
1369 : /*
1370 : * When we release the file, we don't want it to trim EOF blocks if it is a
1371 : * readonly context. This prevents open/read/close workloads from removing
1372 : * EOF blocks that other writers depend upon to reduce fragmentation.
1373 : */
1374 : STATIC int
1375 457253893 : xfs_file_release(
1376 : struct inode *inode,
1377 : struct file *file)
1378 : {
1379 457253893 : bool free_eof_blocks = true;
1380 :
1381 457253893 : if ((file->f_mode & (FMODE_WRITE | FMODE_READ)) == FMODE_READ)
1382 57658639 : free_eof_blocks = false;
1383 :
1384 457253893 : return xfs_release(XFS_I(inode), free_eof_blocks);
1385 : }
1386 :
1387 : STATIC int
1388 49460947 : xfs_file_readdir(
1389 : struct file *file,
1390 : struct dir_context *ctx)
1391 : {
1392 49460947 : struct inode *inode = file_inode(file);
1393 49460947 : xfs_inode_t *ip = XFS_I(inode);
1394 49460947 : size_t bufsize;
1395 :
1396 : /*
1397 : * The Linux API doesn't pass the total size of the buffer
1398 : * we read into down to the filesystem. With the filldir concept
1399 : * it's not needed for correct information, but the XFS dir2 leaf
1400 : * code wants an estimate of the buffer size to calculate its
1401 : * readahead window and size the buffers used for mapping to
1402 : * physical blocks.
1403 : *
1404 : * Try to give it an estimate that's good enough, maybe at some
1405 : * point we can change the ->readdir prototype to include the
1406 : * buffer size. For now we use the current glibc buffer size.
1407 : */
1408 49460947 : bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size);
1409 :
1410 49460947 : return xfs_readdir(NULL, ip, ctx, bufsize);
1411 : }
1412 :
1413 : STATIC loff_t
1414 44634637 : xfs_file_llseek(
1415 : struct file *file,
1416 : loff_t offset,
1417 : int whence)
1418 : {
1419 44634637 : struct inode *inode = file->f_mapping->host;
1420 :
1421 89269274 : if (xfs_is_shutdown(XFS_I(inode)->i_mount))
1422 : return -EIO;
1423 :
1424 44634626 : switch (whence) {
1425 44402137 : default:
1426 44402137 : return generic_file_llseek(file, offset, whence);
1427 326 : case SEEK_HOLE:
1428 326 : offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
1429 326 : break;
1430 232163 : case SEEK_DATA:
1431 232163 : offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
1432 232163 : break;
1433 : }
1434 :
1435 232489 : if (offset < 0)
1436 : return offset;
1437 178526 : return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
1438 : }
1439 :
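The SEEK_HOLE/SEEK_DATA cases above back lseek(2)'s extent-probing interface. A userspace walk over the data ranges of a sparse fd (sketch; the loop ends when SEEK_DATA fails with ENXIO past the last data):

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <unistd.h>

	static void dump_data_ranges(int fd)
	{
		off_t data = 0, hole;

		while ((data = lseek(fd, data, SEEK_DATA)) >= 0) {
			hole = lseek(fd, data, SEEK_HOLE);
			printf("data: [%lld, %lld)\n",
			       (long long)data, (long long)hole);
			data = hole;
		}
	}
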
1440 : #ifdef CONFIG_FS_DAX
1441 : static inline vm_fault_t
1442 : xfs_dax_fault(
1443 : struct vm_fault *vmf,
1444 : enum page_entry_size pe_size,
1445 : bool write_fault,
1446 : pfn_t *pfn)
1447 : {
1448 : return dax_iomap_fault(vmf, pe_size, pfn, NULL,
1449 : (write_fault && !vmf->cow_page) ?
1450 : &xfs_dax_write_iomap_ops :
1451 : &xfs_read_iomap_ops);
1452 : }
1453 : #else
1454 : static inline vm_fault_t
1455 : xfs_dax_fault(
1456 : struct vm_fault *vmf,
1457 : enum page_entry_size pe_size,
1458 : bool write_fault,
1459 : pfn_t *pfn)
1460 : {
1461 : ASSERT(0);
1462 : return VM_FAULT_SIGBUS;
1463 : }
1464 : #endif
1465 :
1466 : static int
1467 4031509 : xfs_filemap_fault_around(
1468 : struct vm_fault *vmf,
1469 : struct inode *inode)
1470 : {
1471 4031509 : struct xfs_inode *ip = XFS_I(inode);
1472 4031509 : struct folio *folio = page_folio(vmf->page);
1473 4031509 : loff_t pos;
1474 4031509 : ssize_t len;
1475 :
1476 4031509 : if (!xfs_inode_needs_cow_around(ip))
1477 : return 0;
1478 :
1479 0 : folio_lock(folio);
1480 0 : len = folio_mkwrite_check_truncate(folio, inode);
1481 0 : if (len < 0) {
1482 0 : folio_unlock(folio);
1483 0 : return len;
1484 : }
1485 0 : pos = folio_pos(folio);
1486 0 : folio_unlock(folio);
1487 :
1488 0 : if (!xfs_file_write_needs_cow_around(ip, pos, len))
1489 : return 0;
1490 :
1491 0 : return xfs_file_cow_around(XFS_I(inode), pos, len);
1492 : }
1493 :
1494 : /*
1495 : * Locking for serialisation of IO during page faults. This results in a lock
1496 : * ordering of:
1497 : *
1498 : * mmap_lock (MM)
1499 : * sb_start_pagefault(vfs, freeze)
1500 : * invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
1501 : * page_lock (MM)
1502 : * i_lock (XFS - extent map serialisation)
1503 : */
1504 : static vm_fault_t
1505 16328634 : __xfs_filemap_fault(
1506 : struct vm_fault *vmf,
1507 : enum page_entry_size pe_size,
1508 : bool write_fault)
1509 : {
1510 16328634 : struct inode *inode = file_inode(vmf->vma->vm_file);
1511 16328634 : struct xfs_inode *ip = XFS_I(inode);
1512 16328634 : vm_fault_t ret;
1513 :
1514 16328634 : trace_xfs_filemap_fault(ip, pe_size, write_fault);
1515 :
1516 16330132 : if (write_fault) {
1517 4031446 : sb_start_pagefault(inode->i_sb);
1518 4031496 : file_update_time(vmf->vma->vm_file);
1519 : }
1520 :
1521 16330183 : if (IS_DAX(inode)) {
1522 : pfn_t pfn;
1523 :
1524 : xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1525 : ret = xfs_dax_fault(vmf, pe_size, write_fault, &pfn);
1526 : if (ret & VM_FAULT_NEEDDSYNC)
1527 : ret = dax_finish_sync_fault(vmf, pe_size, pfn);
1528 : xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1529 : } else {
1530 16330183 : if (write_fault) {
1531 4031497 : int error;
1532 :
1533 4031497 : xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1534 :
1535 : /*
1536 : * Unshare all the blocks in this rt extent surrounding
1537 : * this page.
1538 : */
1539 4031500 : error = xfs_filemap_fault_around(vmf, inode);
1540 4031502 : if (error) {
1541 0 : xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1542 0 : ret = block_page_mkwrite_return(error);
1543 0 : goto out;
1544 : }
1545 :
1546 4031502 : ret = iomap_page_mkwrite(vmf,
1547 : &xfs_page_mkwrite_iomap_ops);
1548 4031483 : xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1549 : } else {
1550 12298686 : ret = filemap_fault(vmf);
1551 : }
1552 : }
1553 :
1554 16330762 : out:
1555 16330762 : if (write_fault)
1556 4031489 : sb_end_pagefault(inode->i_sb);
1557 16330752 : return ret;
1558 : }
1559 :
1560 : static inline bool
1561 : xfs_is_write_fault(
1562 : struct vm_fault *vmf)
1563 : {
1564 : return (vmf->flags & FAULT_FLAG_WRITE) &&
1565 : (vmf->vma->vm_flags & VM_SHARED);
1566 : }
1567 :
1568 : static vm_fault_t
1569 12298758 : xfs_filemap_fault(
1570 : struct vm_fault *vmf)
1571 : {
1572 : /* DAX can shortcut the normal fault path on write faults! */
1573 12298758 : return __xfs_filemap_fault(vmf, PE_SIZE_PTE,
1574 : IS_DAX(file_inode(vmf->vma->vm_file)) &&
1575 : xfs_is_write_fault(vmf));
1576 : }
1577 :
1578 : static vm_fault_t
1579 0 : xfs_filemap_huge_fault(
1580 : struct vm_fault *vmf,
1581 : enum page_entry_size pe_size)
1582 : {
1583 0 : if (!IS_DAX(file_inode(vmf->vma->vm_file)))
1584 0 : return VM_FAULT_FALLBACK;
1585 :
1586 : /* DAX can shortcut the normal fault path on write faults! */
1587 : return __xfs_filemap_fault(vmf, pe_size,
1588 : xfs_is_write_fault(vmf));
1589 : }
1590 :
1591 : static vm_fault_t
1592 4031412 : xfs_filemap_page_mkwrite(
1593 : struct vm_fault *vmf)
1594 : {
1595 4031412 : return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
1596 : }
1597 :
1598 : /*
1599 : * pfn_mkwrite was originally intended to ensure we capture time stamp updates
1600 : * on write faults. In reality, it needs to serialise against truncate and
1601 : * prepare memory for writing, so handle it as a standard write fault.
1602 : */
1603 : static vm_fault_t
1604 0 : xfs_filemap_pfn_mkwrite(
1605 : struct vm_fault *vmf)
1606 : {
1607 :
1608 0 : return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
1609 : }
1610 :
1611 : static const struct vm_operations_struct xfs_file_vm_ops = {
1612 : .fault = xfs_filemap_fault,
1613 : .huge_fault = xfs_filemap_huge_fault,
1614 : .map_pages = filemap_map_pages,
1615 : .page_mkwrite = xfs_filemap_page_mkwrite,
1616 : .pfn_mkwrite = xfs_filemap_pfn_mkwrite,
1617 : };
1618 :
1619 : STATIC int
1620 8862963 : xfs_file_mmap(
1621 : struct file *file,
1622 : struct vm_area_struct *vma)
1623 : {
1624 8862963 : struct inode *inode = file_inode(file);
1625 8862963 : struct xfs_buftarg *target = xfs_inode_buftarg(XFS_I(inode));
1626 :
1627 : /*
1628 : * We don't support synchronous mappings for non-DAX files and
1629 : * for DAX files if the underlying dax_device is not synchronous.
1630 : */
1631 8862963 : if (!daxdev_mapping_supported(vma, target->bt_daxdev))
1632 : return -EOPNOTSUPP;
1633 :
1634 8862972 : file_accessed(file);
1635 8862983 : vma->vm_ops = &xfs_file_vm_ops;
1636 8862983 : if (IS_DAX(inode))
1637 : vm_flags_set(vma, VM_HUGEPAGE);
1638 8862983 : return 0;
1639 : }
1640 :
1641 : const struct file_operations xfs_file_operations = {
1642 : .llseek = xfs_file_llseek,
1643 : .read_iter = xfs_file_read_iter,
1644 : .write_iter = xfs_file_write_iter,
1645 : .splice_read = xfs_file_splice_read,
1646 : .splice_write = iter_file_splice_write,
1647 : .iopoll = iocb_bio_iopoll,
1648 : .unlocked_ioctl = xfs_file_ioctl,
1649 : #ifdef CONFIG_COMPAT
1650 : .compat_ioctl = xfs_file_compat_ioctl,
1651 : #endif
1652 : .mmap = xfs_file_mmap,
1653 : .mmap_supported_flags = MAP_SYNC,
1654 : .open = xfs_file_open,
1655 : .release = xfs_file_release,
1656 : .fsync = xfs_file_fsync,
1657 : .get_unmapped_area = thp_get_unmapped_area,
1658 : .fallocate = xfs_file_fallocate,
1659 : .fadvise = xfs_file_fadvise,
1660 : .remap_file_range = xfs_file_remap_range,
1661 : };
1662 :
1663 : const struct file_operations xfs_dir_file_operations = {
1664 : .open = xfs_dir_open,
1665 : .read = generic_read_dir,
1666 : .iterate_shared = xfs_file_readdir,
1667 : .llseek = generic_file_llseek,
1668 : .unlocked_ioctl = xfs_file_ioctl,
1669 : #ifdef CONFIG_COMPAT
1670 : .compat_ioctl = xfs_file_compat_ioctl,
1671 : #endif
1672 : .fsync = xfs_dir_fsync,
1673 : };
|