Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0
2 : /*
3 : * Copyright (c) 2000-2005 Silicon Graphics, Inc.
4 : * All Rights Reserved.
5 : */
6 : #include "xfs.h"
7 : #include "xfs_fs.h"
8 : #include "xfs_shared.h"
9 : #include "xfs_format.h"
10 : #include "xfs_log_format.h"
11 : #include "xfs_trans_resv.h"
12 : #include "xfs_mount.h"
13 : #include "xfs_inode.h"
14 : #include "xfs_trans.h"
15 : #include "xfs_inode_item.h"
16 : #include "xfs_bmap.h"
17 : #include "xfs_bmap_util.h"
18 : #include "xfs_dir2.h"
19 : #include "xfs_dir2_priv.h"
20 : #include "xfs_ioctl.h"
21 : #include "xfs_trace.h"
22 : #include "xfs_log.h"
23 : #include "xfs_icache.h"
24 : #include "xfs_pnfs.h"
25 : #include "xfs_iomap.h"
26 : #include "xfs_reflink.h"
27 : #include "xfs_file.h"
28 :
29 : #include <linux/dax.h>
30 : #include <linux/falloc.h>
31 : #include <linux/backing-dev.h>
32 : #include <linux/mman.h>
33 : #include <linux/fadvise.h>
34 : #include <linux/mount.h>
35 : #include <linux/buffer_head.h> /* for block_page_mkwrite_return */
36 :
37 : static const struct vm_operations_struct xfs_file_vm_ops;
38 :
39 : /*
40 : * Decide if the given file range is aligned to the size of the fundamental
41 : * allocation unit for the file.
42 : */
43 : bool
44 21208366 : xfs_is_falloc_aligned(
45 : struct xfs_inode *ip,
46 : loff_t pos,
47 : long long int len)
48 : {
49 21208366 : unsigned int alloc_unit = xfs_inode_alloc_unitsize(ip);
50 :
51 39017984 : if (XFS_IS_REALTIME_INODE(ip) && !is_power_of_2(alloc_unit))
52 8652853 : return isaligned_64(pos, alloc_unit) &&
53 1112342 : isaligned_64(len, alloc_unit);
54 :
55 14235620 : return !((pos | len) & (alloc_unit - 1));
56 : }
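A minimal sketch of the same alignment decision, kept next to the function it mirrors; the helper name and the example sizes are hypothetical, and only helpers already used above (is_power_of_2(), isaligned_64()) are assumed:

/* Illustrative only, not part of the driver code. */
static bool
example_is_falloc_aligned(
	loff_t			pos,
	long long int		len,
	unsigned int		unit)
{
	/* power-of-two unit, e.g. a 4096-byte fsblock: cheap mask test */
	if (is_power_of_2(unit))
		return !((pos | len) & (unit - 1));

	/* non-power-of-2 rt extent size, e.g. 3 fsblocks: needs a division */
	return isaligned_64(pos, unit) && isaligned_64(len, unit);
}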
57 :
58 : /*
59 : * Fsync operations on directories are much simpler than on regular files,
60 : * Fsync operations on directories are much simpler than on regular files,
61 : * as there is no file data to flush and thus no need for explicit cache
62 : * flush operations. Nor are there any non-transaction metadata updates on
63 : * directories either.
64 : STATIC int
65 1033495 : xfs_dir_fsync(
66 : struct file *file,
67 : loff_t start,
68 : loff_t end,
69 : int datasync)
70 : {
71 1033495 : struct xfs_inode *ip = XFS_I(file->f_mapping->host);
72 :
73 1033495 : trace_xfs_dir_fsync(ip);
74 1033483 : return xfs_log_force_inode(ip);
75 : }
76 :
77 : static xfs_csn_t
78 5036636 : xfs_fsync_seq(
79 : struct xfs_inode *ip,
80 : bool datasync)
81 : {
82 5036636 : if (!xfs_ipincount(ip))
83 : return 0;
84 5035579 : if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
85 : return 0;
86 4750040 : return ip->i_itemp->ili_commit_seq;
87 : }
88 :
89 : /*
90 : * All metadata updates are logged, which means that we just have to flush the
91 : * log up to the latest LSN that touched the inode.
92 : *
93 : * If we have concurrent fsync/fdatasync() calls, we need them to all block on
94 : * the log force before we clear the ili_fsync_fields field. This ensures that
95 : * we don't get a racing sync operation that does not wait for the metadata to
96 : * hit the journal before returning. If we race with clearing ili_fsync_fields,
97 : * then all that will happen is the log force will do nothing as the lsn will
98 : * already be on disk. We can't race with setting ili_fsync_fields because that
99 : * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
100 : * shared until after the ili_fsync_fields field is cleared.
101 : */
102 : static int
103 5036622 : xfs_fsync_flush_log(
104 : struct xfs_inode *ip,
105 : bool datasync,
106 : int *log_flushed)
107 : {
108 5036622 : int error = 0;
109 5036622 : xfs_csn_t seq;
110 :
111 5036622 : xfs_ilock(ip, XFS_ILOCK_SHARED);
112 5036654 : seq = xfs_fsync_seq(ip, datasync);
113 5036302 : if (seq) {
114 4750053 : error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
115 : log_flushed);
116 :
117 4749372 : spin_lock(&ip->i_itemp->ili_lock);
118 4749861 : ip->i_itemp->ili_fsync_fields = 0;
119 4749861 : spin_unlock(&ip->i_itemp->ili_lock);
120 : }
121 5036225 : xfs_iunlock(ip, XFS_ILOCK_SHARED);
122 5034579 : return error;
123 : }
124 :
125 : STATIC int
126 33192607 : xfs_file_fsync(
127 : struct file *file,
128 : loff_t start,
129 : loff_t end,
130 : int datasync)
131 : {
132 33192607 : struct xfs_inode *ip = XFS_I(file->f_mapping->host);
133 33192607 : struct xfs_mount *mp = ip->i_mount;
134 33192607 : int error, err2;
135 33192607 : int log_flushed = 0;
136 :
137 33192607 : trace_xfs_file_fsync(ip);
138 :
139 33191315 : error = file_write_and_wait_range(file, start, end);
140 33200476 : if (error)
141 : return error;
142 :
143 66394772 : if (xfs_is_shutdown(mp))
144 : return -EIO;
145 :
146 33195504 : xfs_iflags_clear(ip, XFS_ITRUNCATED);
147 :
148 : /*
149 : * If we have an RT and/or log subvolume, we need to make sure to flush
150 : * the write cache of the device used for file data first. This is to
151 : * ensure newly written file data makes it to disk before logging the new
152 : * inode size in case of an extending write.
153 : */
154 33193156 : if (XFS_IS_REALTIME_INODE(ip))
155 14903303 : error = xfs_buftarg_flush(mp->m_rtdev_targp);
156 18289853 : else if (mp->m_logdev_targp != mp->m_ddev_targp)
157 550987 : error = xfs_buftarg_flush(mp->m_ddev_targp);
158 :
159 : /*
160 : * Any inode that has dirty modifications in the log is pinned. The
161 : * racy check here for a pinned inode will not catch modifications
162 : * that happen concurrently with the fsync call, but fsync semantics
163 : * only require syncing previously completed I/O.
164 : */
165 33184470 : if (xfs_ipincount(ip)) {
166 5036618 : err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
167 5034505 : if (err2 && !error)
168 1052 : error = err2;
169 : }
170 :
171 : /*
172 : * If we only have a single device, and the log force above was
173 : * a no-op, we might have to flush the data device cache here.
174 : * This can only happen for fdatasync/O_DSYNC if we were overwriting
175 : * an already allocated file and thus do not have any metadata to
176 : * commit.
177 : */
178 33182357 : if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
179 16111580 : mp->m_logdev_targp == mp->m_ddev_targp) {
180 15808925 : err2 = xfs_buftarg_flush(mp->m_ddev_targp);
181 15794266 : if (err2 && !error)
182 262 : error = err2;
183 : }
184 :
185 : return error;
186 : }
187 :
188 : static int
189 1057054673 : xfs_ilock_iocb(
190 : struct kiocb *iocb,
191 : unsigned int lock_mode)
192 : {
193 1057054673 : struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
194 :
195 1057054673 : if (iocb->ki_flags & IOCB_NOWAIT) {
196 0 : if (!xfs_ilock_nowait(ip, lock_mode))
197 0 : return -EAGAIN;
198 : } else {
199 1057054673 : xfs_ilock(ip, lock_mode);
200 : }
201 :
202 : return 0;
203 : }
204 :
205 : STATIC ssize_t
206 346694982 : xfs_file_dio_read(
207 : struct kiocb *iocb,
208 : struct iov_iter *to)
209 : {
210 346694982 : struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
211 346694982 : ssize_t ret;
212 :
213 346694982 : trace_xfs_file_direct_read(iocb, to);
214 :
215 346693940 : if (!iov_iter_count(to))
216 : return 0; /* skip atime */
217 :
218 346669051 : file_accessed(iocb->ki_filp);
219 :
220 346667830 : ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
221 346667932 : if (ret)
222 : return ret;
223 346667934 : ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0);
224 346671315 : xfs_iunlock(ip, XFS_IOLOCK_SHARED);
225 :
226 346671315 : return ret;
227 : }
228 :
229 : static noinline ssize_t
230 0 : xfs_file_dax_read(
231 : struct kiocb *iocb,
232 : struct iov_iter *to)
233 : {
234 0 : struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
235 0 : ssize_t ret = 0;
236 :
237 0 : trace_xfs_file_dax_read(iocb, to);
238 :
239 0 : if (!iov_iter_count(to))
240 : return 0; /* skip atime */
241 :
242 0 : ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
243 0 : if (ret)
244 : return ret;
245 0 : ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
246 0 : xfs_iunlock(ip, XFS_IOLOCK_SHARED);
247 :
248 0 : file_accessed(iocb->ki_filp);
249 0 : return ret;
250 : }
251 :
252 : STATIC ssize_t
253 405572902 : xfs_file_buffered_read(
254 : struct kiocb *iocb,
255 : struct iov_iter *to)
256 : {
257 405572902 : struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
258 405572902 : ssize_t ret;
259 :
260 405572902 : trace_xfs_file_buffered_read(iocb, to);
261 :
262 405594300 : ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
263 404517890 : if (ret)
264 : return ret;
265 404552981 : ret = generic_file_read_iter(iocb, to);
266 405192589 : xfs_iunlock(ip, XFS_IOLOCK_SHARED);
267 :
268 405192589 : return ret;
269 : }
270 :
271 : STATIC ssize_t
272 753164330 : xfs_file_read_iter(
273 : struct kiocb *iocb,
274 : struct iov_iter *to)
275 : {
276 753164330 : struct inode *inode = file_inode(iocb->ki_filp);
277 753164330 : struct xfs_mount *mp = XFS_I(inode)->i_mount;
278 753164330 : ssize_t ret = 0;
279 :
280 753164330 : XFS_STATS_INC(mp, xs_read_calls);
281 :
282 1503314558 : if (xfs_is_shutdown(mp))
283 : return -EIO;
284 :
285 751654682 : if (IS_DAX(inode))
286 0 : ret = xfs_file_dax_read(iocb, to);
287 751654682 : else if (iocb->ki_flags & IOCB_DIRECT)
288 346695000 : ret = xfs_file_dio_read(iocb, to);
289 : else
290 404959682 : ret = xfs_file_buffered_read(iocb, to);
291 :
292 750930624 : if (ret > 0)
293 402116093 : XFS_STATS_ADD(mp, xs_read_bytes, ret);
294 : return ret;
295 : }
296 :
297 : STATIC ssize_t
298 15220907 : xfs_file_splice_read(
299 : struct file *in,
300 : loff_t *ppos,
301 : struct pipe_inode_info *pipe,
302 : size_t len,
303 : unsigned int flags)
304 : {
305 15220907 : struct inode *inode = file_inode(in);
306 15220907 : struct xfs_inode *ip = XFS_I(inode);
307 15220907 : struct xfs_mount *mp = ip->i_mount;
308 15220907 : ssize_t ret = 0;
309 :
310 15220907 : XFS_STATS_INC(mp, xs_read_calls);
311 :
312 30441724 : if (xfs_is_shutdown(mp))
313 : return -EIO;
314 :
315 15220836 : trace_xfs_file_splice_read(ip, *ppos, len);
316 :
317 15220742 : xfs_ilock(ip, XFS_IOLOCK_SHARED);
318 15220723 : ret = filemap_splice_read(in, ppos, pipe, len, flags);
319 15220828 : xfs_iunlock(ip, XFS_IOLOCK_SHARED);
320 15220818 : if (ret > 0)
321 15220642 : XFS_STATS_ADD(mp, xs_read_bytes, ret);
322 : return ret;
323 : }
324 :
325 : /*
326 : * Decide if this file write requires COWing-around at either end of the write
327 : * range. This is only required if the file allocation unit is larger than
328 : * 1FSB and the write range is not aligned with the allocation unit.
329 : */
330 : static bool
331 530935221 : xfs_file_write_needs_cow_around(
332 : struct xfs_inode *ip,
333 : loff_t pos,
334 : long long int count)
335 : {
336 : /*
337 : * No COWing required if this inode doesn't do COW.
338 : *
339 : * If the allocation unit is 1FSB, we do not need to COW around the
340 : * edges of the operation range. This applies to all files on the data
341 : * device and rt files that have an extent size of 1FSB.
342 : */
343 530935221 : if (!xfs_inode_needs_cow_around(ip))
344 : return false;
345 :
346 : /*
347 : * Otherwise, check that the operation is aligned to the rt extent
348 : * size. Any unaligned operation /must/ be COWed around since the
349 : * regular reflink code only handles extending writes up to fsblock
350 : * boundaries.
351 : */
352 15355928 : return !xfs_is_falloc_aligned(ip, pos, count);
353 : }
354 :
355 : /* Do we need to COW-around at this offset to handle a truncate up or down? */
356 : bool
357 17598875 : xfs_truncate_needs_cow_around(
358 : struct xfs_inode *ip,
359 : loff_t pos)
360 : {
361 17598875 : return xfs_file_write_needs_cow_around(ip, pos, 0);
362 : }
363 :
364 : /* Does this file write require COWing around? */
365 : static inline bool
366 : xfs_iocb_needs_cow_around(
367 : struct xfs_inode *ip,
368 : const struct kiocb *iocb,
369 : const struct iov_iter *from)
370 : {
371 16300030 : return xfs_file_write_needs_cow_around(ip, iocb->ki_pos,
372 : iov_iter_count(from));
373 : }
374 :
375 : /* Unshare the allocation unit mapped to the given file position. */
376 : inline int
377 25335153 : xfs_file_unshare_at(
378 : struct xfs_inode *ip,
379 : loff_t pos)
380 : {
381 25335153 : loff_t isize = i_size_read(VFS_I(ip));
382 25335153 : unsigned int extsize, len;
383 25335153 : uint32_t mod;
384 :
385 25335153 : len = extsize = xfs_inode_alloc_unitsize(ip);
386 :
387 : /* Open-coded rounddown_64 so that we can skip out if aligned */
388 25334953 : div_u64_rem(pos, extsize, &mod);
389 25335244 : if (mod == 0)
390 : return 0;
391 23363300 : pos -= mod;
392 :
393 : /* Do not extend the file. */
394 23363300 : if (pos >= isize)
395 : return 0;
396 21892941 : if (pos + len > isize)
397 2311086 : len = isize - pos;
398 :
399 21892941 : trace_xfs_file_cow_around(ip, pos, len);
400 :
401 21892982 : if (IS_DAX(VFS_I(ip)))
402 0 : return dax_file_unshare(VFS_I(ip), pos, len,
403 : &xfs_dax_write_iomap_ops);
404 21892982 : return iomap_file_unshare(VFS_I(ip), pos, len,
405 : &xfs_buffered_write_iomap_ops);
406 : }
407 :
408 : /*
409 : * Dirty the pages on either side of a write request as needed to satisfy
410 : * alignment requirements if we're going to perform a copy-write.
411 : * alignment requirements if we're going to perform a copy-on-write.
412 : * This is only needed for realtime files when the rt extent size is larger
413 : * than 1 fs block, because we don't allow a logical rt extent in a file to map
414 : * to multiple physical rt extents. In other words, we can only map and unmap
415 : * full rt extents. Note that page cache doesn't exist above EOF, so be
416 : * careful to stay below EOF.
417 : */
418 : static int
419 12054326 : xfs_file_cow_around(
420 : struct xfs_inode *ip,
421 : loff_t pos,
422 : long long int count)
423 : {
424 12054326 : int error;
425 :
426 : /* Unshare at the start of the extent. */
427 12054326 : error = xfs_file_unshare_at(ip, pos);
428 12054667 : if (error)
429 : return error;
430 :
431 : /* Unshare at the end. */
432 11909447 : return xfs_file_unshare_at(ip, pos + count);
433 : }
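A rough sketch of where the two xfs_file_unshare_at() calls above land, assuming a hypothetical helper; the real code also clamps against i_size and skips offsets that are already aligned:

/* Illustrative only: with extsize = 16k, a write of [20k, 52k) unshares
 * the allocation units starting at 16k and 48k. */
static void
example_cow_around_bounds(
	loff_t			pos,
	long long int		count,
	unsigned int		extsize,
	loff_t			*lo,
	loff_t			*hi)
{
	uint32_t		mod;

	div_u64_rem(pos, extsize, &mod);
	*lo = pos - mod;

	div_u64_rem(pos + count, extsize, &mod);
	*hi = (pos + count) - mod;
}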
434 :
435 : /*
436 : * Common pre-write limit and setup checks.
437 : *
438 : * Called with the iolock held either shared or exclusive according to
439 : * @iolock, and returns with it held. Might upgrade the iolock to exclusive
440 : * if called for a direct write beyond i_size.
441 : */
442 : STATIC ssize_t
443 295520402 : xfs_file_write_checks(
444 : struct kiocb *iocb,
445 : struct iov_iter *from,
446 : unsigned int *iolock)
447 : {
448 295520402 : struct file *file = iocb->ki_filp;
449 295520402 : struct inode *inode = file->f_mapping->host;
450 295520402 : struct xfs_inode *ip = XFS_I(inode);
451 295520402 : ssize_t error = 0;
452 295520402 : size_t count = iov_iter_count(from);
453 295520402 : bool drained_dio = false;
454 363223509 : loff_t isize;
455 :
456 : restart:
457 363223509 : error = generic_write_checks(iocb, from);
458 363735164 : if (error <= 0)
459 30 : return error;
460 :
461 363735134 : if (iocb->ki_flags & IOCB_NOWAIT) {
462 0 : error = break_layout(inode, false);
463 0 : if (error == -EWOULDBLOCK)
464 : error = -EAGAIN;
465 : } else {
466 363735134 : error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
467 : }
468 :
469 364695266 : if (error)
470 0 : return error;
471 :
472 : /*
473 : * For changing security info in file_remove_privs() we need i_rwsem
474 : * exclusively. We also need it to COW around the range being written.
475 : */
476 364695266 : if (*iolock == XFS_IOLOCK_SHARED &&
477 32684435 : (!IS_NOSEC(inode) || xfs_iocb_needs_cow_around(ip, iocb, from))) {
478 86120 : xfs_iunlock(ip, *iolock);
479 86090 : *iolock = XFS_IOLOCK_EXCL;
480 86090 : error = xfs_ilock_iocb(iocb, *iolock);
481 86087 : if (error) {
482 0 : *iolock = 0;
483 0 : return error;
484 : }
485 86087 : goto restart;
486 : }
487 :
488 : /*
489 : * The write is not aligned to the file's allocation unit. If either
490 : * of the allocation units at the start or end of the write range is
491 : * shared, unshare them through the page cache.
492 : */
493 364607402 : if (xfs_iocb_needs_cow_around(ip, iocb, from)) {
494 4968033 : ASSERT(*iolock == XFS_IOLOCK_EXCL);
495 :
496 4968033 : inode_dio_wait(VFS_I(ip));
497 4967981 : drained_dio = true;
498 :
499 4967981 : error = xfs_file_cow_around(ip, iocb->ki_pos, count);
500 4968371 : if (error)
501 129724 : return error;
502 : }
503 :
504 : /*
505 : * If the offset is beyond the size of the file, we need to zero any
506 : * blocks that fall between the existing EOF and the start of this
507 : * write. If zeroing is needed and we are currently holding the iolock
508 : * shared, we need to upgrade it to exclusive, which implies having to
509 : * redo all the checks done so far.
510 : *
511 : * We need to serialise against EOF updates that occur in IO completions
512 : * here. We want to make sure that nobody is changing the size while we
513 : * do this check until we have placed an IO barrier (i.e. hold the
514 : * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The
515 : * spinlock effectively forms a memory barrier once we have the
516 : * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
517 : * hence be able to correctly determine if we need to run zeroing.
518 : *
519 : * We can do an unlocked check here safely as IO completion can only
520 : * extend EOF. Truncate is locked out at this point, so the EOF can
521 : * not move backwards, only forwards. Hence we only need to take the
522 : * slow path and spin locks when we are at or beyond the current EOF.
523 : */
524 363935179 : if (iocb->ki_pos <= i_size_read(inode))
525 227164750 : goto out;
526 :
527 136770429 : spin_lock(&ip->i_flags_lock);
528 137237500 : isize = i_size_read(inode);
529 137237500 : if (iocb->ki_pos > isize) {
530 137237500 : spin_unlock(&ip->i_flags_lock);
531 :
532 137177119 : if (iocb->ki_flags & IOCB_NOWAIT)
533 : return -EAGAIN;
534 :
535 137177119 : if (!drained_dio) {
536 68270193 : if (*iolock == XFS_IOLOCK_SHARED) {
537 772451 : xfs_iunlock(ip, *iolock);
538 772226 : *iolock = XFS_IOLOCK_EXCL;
539 772226 : xfs_ilock(ip, *iolock);
540 771994 : iov_iter_reexpand(from, count);
541 : }
542 : /*
543 : * We now have an IO submission barrier in place, but
544 : * AIO can do EOF updates during IO completion and hence
545 : * we now need to wait for all of them to drain. Non-AIO
546 : * DIO will have drained before we are given the
547 : * XFS_IOLOCK_EXCL, and so for most cases this wait is a
548 : * no-op.
549 : */
550 68269736 : inode_dio_wait(inode);
551 67617020 : drained_dio = true;
552 67617020 : goto restart;
553 : }
554 :
555 : /*
556 : * If we're starting the write past EOF, COW the allocation
557 : * unit containing the current EOF before we start zeroing the
558 : * range between EOF and the start of the write.
559 : */
560 68906926 : if (xfs_truncate_needs_cow_around(ip, isize)) {
561 435930 : error = xfs_file_unshare_at(ip, isize);
562 435935 : if (error)
563 : return error;
564 : }
565 :
566 68616504 : trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
567 68113703 : error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
568 68141147 : if (error)
569 : return error;
570 : } else
571 0 : spin_unlock(&ip->i_flags_lock);
572 :
573 295290145 : out:
574 295290145 : return kiocb_modified(iocb);
575 : }
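The EOF handling above relies on an unlocked fast-path check followed by a recheck under ip->i_flags_lock, since racing I/O completions can only move EOF forwards. A compressed sketch of just that pattern, using a hypothetical helper name and none of the lock-upgrade/restart handling:

/* Illustrative only: does a write at @pos start beyond the current EOF? */
static bool
example_write_extends_eof(
	struct xfs_inode	*ip,
	loff_t			pos)
{
	struct inode		*inode = VFS_I(ip);
	bool			beyond;

	/* Unlocked check is safe because EOF can only grow under us. */
	if (pos <= i_size_read(inode))
		return false;

	/* Recheck under the spinlock before committing to zeroing. */
	spin_lock(&ip->i_flags_lock);
	beyond = pos > i_size_read(inode);
	spin_unlock(&ip->i_flags_lock);
	return beyond;
}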
576 :
577 : static int
578 22543114 : xfs_dio_write_end_io(
579 : struct kiocb *iocb,
580 : ssize_t size,
581 : int error,
582 : unsigned flags)
583 : {
584 22543114 : struct inode *inode = file_inode(iocb->ki_filp);
585 22543114 : struct xfs_inode *ip = XFS_I(inode);
586 22543114 : loff_t offset = iocb->ki_pos;
587 22543114 : unsigned int nofs_flag;
588 :
589 22543114 : trace_xfs_end_io_direct_write(ip, offset, size);
590 :
591 45085698 : if (xfs_is_shutdown(ip->i_mount))
592 : return -EIO;
593 :
594 22542137 : if (error)
595 : return error;
596 20503197 : if (!size)
597 : return 0;
598 :
599 : /*
600 : * Capture amount written on completion as we can't reliably account
601 : * for it on submission.
602 : */
603 20503197 : XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
604 :
605 : /*
606 : * We can allocate memory here while doing writeback on behalf of
607 : * memory reclaim. To avoid memory allocation deadlocks set the
608 : * task-wide nofs context for the following operations.
609 : */
610 20503314 : nofs_flag = memalloc_nofs_save();
611 :
612 20503314 : if (flags & IOMAP_DIO_COW) {
613 4952403 : error = xfs_reflink_end_cow(ip, offset, size);
614 4952402 : if (error)
615 4 : goto out;
616 : }
617 :
618 : /*
619 : * Unwritten conversion updates the in-core isize after extent
620 : * conversion but before updating the on-disk size. Updating isize any
621 : * earlier allows a racing dio read to find unwritten extents before
622 : * they are converted.
623 : */
624 20503309 : if (flags & IOMAP_DIO_UNWRITTEN) {
625 8945512 : error = xfs_iomap_write_unwritten(ip, offset, size, true);
626 8945565 : goto out;
627 : }
628 :
629 : /*
630 : * We need to update the in-core inode size here so that we don't end up
631 : * with the on-disk inode size being outside the in-core inode size. We
632 : * have no other method of updating EOF for AIO, so always do it here
633 : * if necessary.
634 : *
635 : * We need to lock the test/set EOF update as we can be racing with
636 : * other IO completions here to update the EOF. Failing to serialise
637 : * here can result in EOF moving backwards and Bad Things Happen when
638 : * that occurs.
639 : *
640 : * As IO completion only ever extends EOF, we can do an unlocked check
641 : * here to avoid taking the spinlock. If we land within the current EOF,
642 : * then we do not need to do an extending update at all, and we don't
643 : * need to take the lock to check this. If we race with an update moving
644 : * EOF, then we'll either still be beyond EOF and need to take the lock,
645 : * or we'll be within EOF and we don't need to take it at all.
646 : */
647 11557797 : if (offset + size <= i_size_read(inode))
648 10941422 : goto out;
649 :
650 616375 : spin_lock(&ip->i_flags_lock);
651 616375 : if (offset + size > i_size_read(inode)) {
652 616375 : i_size_write(inode, offset + size);
653 616375 : spin_unlock(&ip->i_flags_lock);
654 616375 : error = xfs_setfilesize(ip, offset, size);
655 : } else {
656 0 : spin_unlock(&ip->i_flags_lock);
657 : }
658 :
659 20503366 : out:
660 20503366 : memalloc_nofs_restore(nofs_flag);
661 20503366 : return error;
662 : }
663 :
664 : static const struct iomap_dio_ops xfs_dio_write_ops = {
665 : .end_io = xfs_dio_write_end_io,
666 : };
667 :
668 : /*
669 : * Handle block-aligned direct I/O writes
670 : */
671 : static noinline ssize_t
672 16644072 : xfs_file_dio_write_aligned(
673 : struct xfs_inode *ip,
674 : struct kiocb *iocb,
675 : struct iov_iter *from)
676 : {
677 16644072 : unsigned int iolock = XFS_IOLOCK_SHARED;
678 16644072 : ssize_t ret;
679 :
680 : /*
681 : * If the range to write is not aligned to an allocation unit, we will
682 : * have to COW the allocation units on both ends of the write. Because
683 : * this runs through the page cache, it requires IOLOCK_EXCL. This
684 : * predicate performs an unlocked access of the rt and reflink inode
685 : * state.
686 : */
687 16644072 : if (xfs_iocb_needs_cow_around(ip, iocb, from))
688 1454474 : iolock = XFS_IOLOCK_EXCL;
689 :
690 16642072 : ret = xfs_ilock_iocb(iocb, iolock);
691 16639585 : if (ret)
692 : return ret;
693 16642170 : ret = xfs_file_write_checks(iocb, from, &iolock);
694 16640330 : if (ret)
695 1072 : goto out_unlock;
696 :
697 : /*
698 : * We don't need to hold the IOLOCK exclusively across the IO, so demote
699 : * the iolock back to shared if we had to take the exclusive lock in
700 : * xfs_file_write_checks() for other reasons.
701 : */
702 16639258 : if (iolock == XFS_IOLOCK_EXCL) {
703 2289955 : xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
704 2289921 : iolock = XFS_IOLOCK_SHARED;
705 : }
706 16639224 : trace_xfs_file_direct_write(iocb, from);
707 16638375 : ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
708 : &xfs_dio_write_ops, 0, NULL, 0);
709 16646898 : out_unlock:
710 16646898 : if (iolock)
711 16646694 : xfs_iunlock(ip, iolock);
712 : return ret;
713 : }
714 :
715 : /*
716 : * Handle block unaligned direct I/O writes
717 : * Handle block-unaligned direct I/O writes
718 : * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
719 : * them to be done in parallel with reads and other direct I/O writes. However,
720 : * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
721 : * to do sub-block zeroing and that requires serialisation against other direct
722 : * I/O to the same block. In this case we need to serialise the submission of
723 : * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
724 : * In the case where sub-block zeroing is not required, we can do concurrent
725 : * sub-block dios to the same block successfully.
726 : *
727 : * Optimistically submit the I/O using the shared lock first, but use the
728 : * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
729 : * if block allocation or partial block zeroing would be required. In that case
730 : * we try again with the exclusive lock.
731 : */
732 : static noinline ssize_t
733 13543975 : xfs_file_dio_write_unaligned(
734 : struct xfs_inode *ip,
735 : struct kiocb *iocb,
736 : struct iov_iter *from)
737 : {
738 13543975 : size_t isize = i_size_read(VFS_I(ip));
739 13543975 : size_t count = iov_iter_count(from);
740 13543975 : unsigned int iolock = XFS_IOLOCK_SHARED;
741 13543975 : unsigned int flags = IOMAP_DIO_OVERWRITE_ONLY;
742 13543975 : ssize_t ret;
743 :
744 : /*
745 : * Extending writes need exclusivity because of the sub-block zeroing
746 : * that the DIO code always does for partial tail blocks beyond EOF, so
747 : * don't even bother trying the fast path in this case.
748 : */
749 13543975 : if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
750 7812642 : if (iocb->ki_flags & IOCB_NOWAIT)
751 : return -EAGAIN;
752 7812642 : retry_exclusive:
753 8751582 : iolock = XFS_IOLOCK_EXCL;
754 8751582 : flags = IOMAP_DIO_FORCE_WAIT;
755 : }
756 :
757 14482915 : ret = xfs_ilock_iocb(iocb, iolock);
758 14482792 : if (ret)
759 : return ret;
760 :
761 : /*
762 : * We can't properly handle unaligned direct I/O to reflink files yet,
763 : * as we can't unshare a partial block.
764 : */
765 14482760 : if (xfs_is_cow_inode(ip)) {
766 8583954 : trace_xfs_reflink_bounce_dio_write(iocb, from);
767 8583918 : ret = -ENOTBLK;
768 8583918 : goto out_unlock;
769 : }
770 :
771 5898795 : ret = xfs_file_write_checks(iocb, from, &iolock);
772 5898589 : if (ret)
773 87 : goto out_unlock;
774 :
775 : /*
776 : * If we are doing exclusive unaligned I/O, this must be the only I/O
777 : * in-flight. Otherwise we risk data corruption due to unwritten extent
778 : * conversions from the AIO end_io handler. Wait for all other I/O to
779 : * drain first.
780 : */
781 5898502 : if (flags & IOMAP_DIO_FORCE_WAIT)
782 4703212 : inode_dio_wait(VFS_I(ip));
783 :
784 5898463 : trace_xfs_file_direct_write(iocb, from);
785 5898120 : ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
786 : &xfs_dio_write_ops, flags, NULL, 0);
787 :
788 : /*
789 : * Retry unaligned I/O with exclusive blocking semantics if the DIO
790 : * layer rejected it for mapping or locking reasons. If we are doing
791 : * nonblocking user I/O, propagate the error.
792 : */
793 5899006 : if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
794 938958 : ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
795 938958 : xfs_iunlock(ip, iolock);
796 938940 : goto retry_exclusive;
797 : }
798 :
799 4960048 : out_unlock:
800 13544053 : if (iolock)
801 13543976 : xfs_iunlock(ip, iolock);
802 : return ret;
803 : }
804 :
805 : static ssize_t
806 30189213 : xfs_file_dio_write(
807 : struct kiocb *iocb,
808 : struct iov_iter *from)
809 : {
810 30189213 : struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
811 30189213 : struct xfs_buftarg *target = xfs_inode_buftarg(ip);
812 30189213 : size_t count = iov_iter_count(from);
813 :
814 : /* direct I/O must be aligned to device logical sector size */
815 30189213 : if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
816 : return -EINVAL;
817 30189213 : if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
818 13544030 : return xfs_file_dio_write_unaligned(ip, iocb, from);
819 16645183 : return xfs_file_dio_write_aligned(ip, iocb, from);
820 : }
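Seen from userspace, the sector-mask check above is what makes misaligned O_DIRECT requests fail with EINVAL, while sector-aligned but block-unaligned requests take the slower unaligned path. A hedged userspace sketch; the function name is hypothetical and 4096 is simply assumed to satisfy both the sector and the block alignment (the caller is assumed to pass an @offset that is also a multiple of 4096):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* Illustrative only: one block-aligned write on an fd opened with O_DIRECT. */
static ssize_t example_dio_write(int fd, off_t offset)
{
	void *buf;
	ssize_t ret;

	/* O_DIRECT wants aligned memory, offset and length. */
	if (posix_memalign(&buf, 4096, 4096))
		return -1;
	memset(buf, 0xab, 4096);
	ret = pwrite(fd, buf, 4096, offset);
	free(buf);
	return ret;
}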
821 :
822 : static noinline ssize_t
823 0 : xfs_file_dax_write(
824 : struct kiocb *iocb,
825 : struct iov_iter *from)
826 : {
827 0 : struct inode *inode = iocb->ki_filp->f_mapping->host;
828 0 : struct xfs_inode *ip = XFS_I(inode);
829 0 : unsigned int iolock = XFS_IOLOCK_EXCL;
830 0 : ssize_t ret, error = 0;
831 0 : loff_t pos;
832 :
833 0 : ret = xfs_ilock_iocb(iocb, iolock);
834 0 : if (ret)
835 : return ret;
836 0 : ret = xfs_file_write_checks(iocb, from, &iolock);
837 0 : if (ret)
838 0 : goto out;
839 :
840 0 : pos = iocb->ki_pos;
841 :
842 0 : trace_xfs_file_dax_write(iocb, from);
843 0 : ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
844 0 : if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
845 0 : i_size_write(inode, iocb->ki_pos);
846 0 : error = xfs_setfilesize(ip, pos, ret);
847 : }
848 0 : out:
849 0 : if (iolock)
850 0 : xfs_iunlock(ip, iolock);
851 0 : if (error)
852 : return error;
853 :
854 0 : if (ret > 0) {
855 0 : XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
856 :
857 : /* Handle various SYNC-type writes */
858 0 : ret = generic_write_sync(iocb, ret);
859 : }
860 : return ret;
861 : }
862 :
863 : STATIC ssize_t
864 271853253 : xfs_file_buffered_write(
865 : struct kiocb *iocb,
866 : struct iov_iter *from)
867 : {
868 271853253 : struct inode *inode = iocb->ki_filp->f_mapping->host;
869 271853253 : struct xfs_inode *ip = XFS_I(inode);
870 271853253 : ssize_t ret;
871 271853253 : bool cleared_space = false;
872 273507686 : unsigned int iolock;
873 :
874 : write_retry:
875 273507686 : iolock = XFS_IOLOCK_EXCL;
876 273507686 : ret = xfs_ilock_iocb(iocb, iolock);
877 272747057 : if (ret)
878 0 : return ret;
879 :
880 272747057 : ret = xfs_file_write_checks(iocb, from, &iolock);
881 272996851 : if (ret)
882 154714 : goto out;
883 :
884 272842137 : trace_xfs_file_buffered_write(iocb, from);
885 272278840 : ret = iomap_file_buffered_write(iocb, from,
886 : &xfs_buffered_write_iomap_ops);
887 :
888 : /*
889 : * If we hit a space limit, try to free up some lingering preallocated
890 : * space before returning an error. In the case of ENOSPC, first try to
891 : * write back all dirty inodes to free up some of the excess reserved
892 : * metadata space. This reduces the chances that the eofblocks scan
893 : * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
894 : * also behaves as a filter to prevent too many eofblocks scans from
895 : * running at the same time. Use a synchronous scan to increase the
896 : * effectiveness of the scan.
897 : */
898 272986706 : if (ret == -EDQUOT && !cleared_space) {
899 1826 : xfs_iunlock(ip, iolock);
900 1826 : xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC);
901 1825 : cleared_space = true;
902 1825 : goto write_retry;
903 272984880 : } else if (ret == -ENOSPC && !cleared_space) {
904 1652405 : struct xfs_icwalk icw = {0};
905 :
906 1652405 : cleared_space = true;
907 1652405 : xfs_flush_inodes(ip->i_mount);
908 :
909 1652293 : xfs_iunlock(ip, iolock);
910 1651973 : icw.icw_flags = XFS_ICWALK_FLAG_SYNC;
911 1651973 : xfs_blockgc_free_space(ip->i_mount, &icw);
912 1652608 : goto write_retry;
913 : }
914 :
915 271332475 : out:
916 271487189 : if (iolock)
917 271461178 : xfs_iunlock(ip, iolock);
918 :
919 271697451 : if (ret > 0) {
920 270003014 : XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
921 : /* Handle various SYNC-type writes */
922 270134419 : ret = generic_write_sync(iocb, ret);
923 : }
924 : return ret;
925 : }
926 :
927 : STATIC ssize_t
928 293595208 : xfs_file_write_iter(
929 : struct kiocb *iocb,
930 : struct iov_iter *from)
931 : {
932 293595208 : struct inode *inode = iocb->ki_filp->f_mapping->host;
933 293595208 : struct xfs_inode *ip = XFS_I(inode);
934 293595208 : ssize_t ret;
935 293595208 : size_t ocount = iov_iter_count(from);
936 :
937 293595208 : XFS_STATS_INC(ip->i_mount, xs_write_calls);
938 :
939 293530055 : if (ocount == 0)
940 : return 0;
941 :
942 587059380 : if (xfs_is_shutdown(ip->i_mount))
943 : return -EIO;
944 :
945 293519542 : if (IS_DAX(inode))
946 0 : return xfs_file_dax_write(iocb, from);
947 :
948 293519542 : if (iocb->ki_flags & IOCB_DIRECT) {
949 : /*
950 : * Allow a directio write to fall back to a buffered
951 : * write *only* in the case that we're doing a reflink
952 : * CoW. In all other directio scenarios we do not
953 : * allow an operation to fall back to buffered mode.
954 : */
955 30189694 : ret = xfs_file_dio_write(iocb, from);
956 30186966 : if (ret != -ENOTBLK)
957 : return ret;
958 : }
959 :
960 271914071 : return xfs_file_buffered_write(iocb, from);
961 : }
962 :
963 : /* Does this file, inode, or mount want synchronous writes? */
964 538793731 : static inline bool xfs_file_sync_writes(struct file *filp)
965 : {
966 538793731 : struct xfs_inode *ip = XFS_I(file_inode(filp));
967 :
968 538793731 : if (xfs_has_wsync(ip->i_mount))
969 : return true;
970 538793691 : if (filp->f_flags & (__O_SYNC | O_DSYNC))
971 : return true;
972 538763750 : if (IS_SYNC(file_inode(filp)))
973 21 : return true;
974 :
975 : return false;
976 : }
977 :
978 : #define XFS_FALLOC_FL_SUPPORTED \
979 : (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
980 : FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \
981 : FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE | \
982 : FALLOC_FL_MAP_FREE_SPACE)
983 :
984 : STATIC long
985 57407164 : xfs_file_fallocate(
986 : struct file *file,
987 : int mode,
988 : loff_t offset,
989 : loff_t len)
990 : {
991 57407164 : struct inode *inode = file_inode(file);
992 57407164 : struct xfs_inode *ip = XFS_I(inode);
993 57407164 : long error;
994 57407164 : uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
995 57407164 : loff_t new_size = 0;
996 57407164 : bool do_file_insert = false;
997 :
998 57407164 : if (!S_ISREG(inode->i_mode))
999 : return -EINVAL;
1000 57407164 : if (mode & ~XFS_FALLOC_FL_SUPPORTED)
1001 : return -EOPNOTSUPP;
1002 :
1003 57407164 : xfs_ilock(ip, iolock);
1004 57407877 : error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
1005 57407824 : if (error)
1006 0 : goto out_unlock;
1007 :
1008 : /*
1009 : * Must wait for all AIO to complete before we continue as AIO can
1010 : * change the file size on completion without holding any locks we
1011 : * currently hold. We must do this first because AIO can update both
1012 : * the on disk and in memory inode sizes, and the operations that follow
1013 : * require the in-memory size to be fully up-to-date.
1014 : */
1015 57407824 : inode_dio_wait(inode);
1016 :
1017 : /*
1018 : * Now that AIO and DIO have drained, we flush and (if necessary) invalidate
1019 : * the cached range over the first operation we are about to run.
1020 : *
1021 : * We care about zero and collapse here because they both run a hole
1022 : * punch over the range first. Because that can zero data, and the range
1023 : * of invalidation for the shift operations is much larger, we still do
1024 : * the required flush for collapse in xfs_prepare_shift().
1025 : *
1026 : * Insert has the same range requirements as collapse, and we extend the
1027 : * file first which can zero data. Hence insert has the same
1028 : * flush/invalidate requirements as collapse and so they are both
1029 : * handled at the right time by xfs_prepare_shift().
1030 : */
1031 57406918 : if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
1032 : FALLOC_FL_COLLAPSE_RANGE)) {
1033 44676380 : error = xfs_flush_unmap_range(ip, offset, len);
1034 44676166 : if (error)
1035 281 : goto out_unlock;
1036 : }
1037 :
1038 57406423 : error = file_modified(file);
1039 57407572 : if (error)
1040 6 : goto out_unlock;
1041 :
1042 57407566 : if (mode & FALLOC_FL_PUNCH_HOLE) {
1043 : /* Unshare around the region to punch, if needed. */
1044 37044007 : if (xfs_file_write_needs_cow_around(ip, offset, len)) {
1045 480281 : error = xfs_file_cow_around(ip, offset, len);
1046 480343 : if (error)
1047 8088 : goto out_unlock;
1048 : }
1049 :
1050 37035542 : error = xfs_free_file_space(ip, offset, len);
1051 37036074 : if (error)
1052 54597 : goto out_unlock;
1053 20363559 : } else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
1054 3264929 : if (!xfs_is_falloc_aligned(ip, offset, len)) {
1055 517675 : error = -EINVAL;
1056 517675 : goto out_unlock;
1057 : }
1058 :
1059 : /*
1060 : * There is no need for a collapse range to overlap EOF;
1061 : * in that case it is effectively a truncate operation
1062 : */
1063 2747252 : if (offset + len >= i_size_read(inode)) {
1064 285657 : error = -EINVAL;
1065 285657 : goto out_unlock;
1066 : }
1067 :
1068 2461595 : new_size = i_size_read(inode) - len;
1069 :
1070 2461595 : error = xfs_collapse_file_space(ip, offset, len);
1071 2461610 : if (error)
1072 2818 : goto out_unlock;
1073 17098630 : } else if (mode & FALLOC_FL_INSERT_RANGE) {
1074 2587517 : loff_t isize = i_size_read(inode);
1075 :
1076 2587517 : if (!xfs_is_falloc_aligned(ip, offset, len)) {
1077 500837 : error = -EINVAL;
1078 500837 : goto out_unlock;
1079 : }
1080 :
1081 : /*
1082 : * New inode size must not exceed ->s_maxbytes, accounting for
1083 : * possible signed overflow.
1084 : */
1085 2086674 : if (inode->i_sb->s_maxbytes - isize < len) {
1086 10 : error = -EFBIG;
1087 10 : goto out_unlock;
1088 : }
1089 2086664 : new_size = isize + len;
1090 :
1091 : /* Offset should be less than i_size */
1092 2086664 : if (offset >= isize) {
1093 225363 : error = -EINVAL;
1094 225363 : goto out_unlock;
1095 : }
1096 : do_file_insert = true;
1097 14511113 : } else if (mode & FALLOC_FL_MAP_FREE_SPACE) {
1098 1122 : struct xfs_mount *mp = ip->i_mount;
1099 1122 : xfs_off_t device_size;
1100 :
1101 1122 : if (!capable(CAP_SYS_ADMIN)) {
1102 0 : error = -EPERM;
1103 0 : goto out_unlock;
1104 : }
1105 :
1106 1122 : if (XFS_IS_REALTIME_INODE(ip))
1107 510 : device_size = XFS_FSB_TO_B(mp, mp->m_sb.sb_rblocks);
1108 : else
1109 612 : device_size = XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks);
1110 :
1111 : /*
1112 : * Bail out now if we aren't allowed to make the file size the
1113 : * same length as the device.
1114 : */
1115 1122 : if (device_size > i_size_read(inode)) {
1116 22 : new_size = device_size;
1117 22 : error = inode_newsize_ok(inode, new_size);
1118 22 : if (error)
1119 0 : goto out_unlock;
1120 : }
1121 :
1122 1122 : if (XFS_IS_REALTIME_INODE(ip))
1123 510 : error = xfs_map_free_rt_space(ip, offset, len);
1124 : else
1125 612 : error = xfs_map_free_space(ip, offset, len);
1126 1122 : if (error) {
1127 0 : if (error == -ECANCELED)
1128 0 : error = 0;
1129 0 : goto out_unlock;
1130 : }
1131 : } else {
1132 14509991 : if (!(mode & FALLOC_FL_KEEP_SIZE) &&
1133 7736253 : offset + len > i_size_read(inode)) {
1134 5155961 : new_size = offset + len;
1135 5155961 : error = inode_newsize_ok(inode, new_size);
1136 5155961 : if (error)
1137 10 : goto out_unlock;
1138 : }
1139 :
1140 14509981 : if (mode & FALLOC_FL_ZERO_RANGE) {
1141 : /*
1142 : * Punch a hole and prealloc the range. We use a hole
1143 : * punch rather than unwritten extent conversion for two
1144 : * reasons:
1145 : *
1146 : * 1.) Hole punch handles partial block zeroing for us.
1147 : * 2.) If prealloc returns ENOSPC, the file range is
1148 : * still zero-valued by virtue of the hole punch.
1149 : */
1150 4367436 : unsigned int blksize = i_blocksize(inode);
1151 :
1152 4367414 : trace_xfs_zero_file_space(ip, offset, len);
1153 :
1154 : /* Unshare around the region to zero, if needed. */
1155 4367415 : if (xfs_file_write_needs_cow_around(ip, offset, len)) {
1156 395606 : error = xfs_file_cow_around(ip, offset, len);
1157 395607 : if (error)
1158 5317 : goto out_unlock;
1159 : }
1160 :
1161 4362106 : error = xfs_free_file_space(ip, offset, len);
1162 4362132 : if (error)
1163 20872 : goto out_unlock;
1164 :
1165 4341260 : len = round_up(offset + len, blksize) -
1166 4341260 : round_down(offset, blksize);
1167 4341260 : offset = round_down(offset, blksize);
1168 10142545 : } else if (mode & FALLOC_FL_UNSHARE_RANGE) {
1169 : /*
1170 : * Enlarge the unshare region to align to a full
1171 : * allocation unit.
1172 : */
1173 315 : if (xfs_inode_needs_cow_around(ip)) {
1174 40 : loff_t isize = i_size_read(VFS_I(ip));
1175 40 : unsigned int rextsize;
1176 40 : uint32_t mod;
1177 :
1178 40 : rextsize = xfs_inode_alloc_unitsize(ip);
1179 40 : div_u64_rem(offset, rextsize, &mod);
1180 40 : offset -= mod;
1181 40 : len += mod;
1182 :
1183 40 : div_u64_rem(offset + len, rextsize, &mod);
1184 40 : if (mod)
1185 25 : len += rextsize - mod;
1186 40 : if (offset + len > isize)
1187 8 : len = isize - offset;
1188 : }
1189 315 : error = xfs_reflink_unshare(ip, offset, len);
1190 315 : if (error)
1191 8 : goto out_unlock;
1192 : } else {
1193 : /*
1194 : * In always_cow mode we can't use preallocations and
1195 : * thus should not create them.
1196 : */
1197 10142230 : if (xfs_is_always_cow_inode(ip)) {
1198 448052 : error = -EOPNOTSUPP;
1199 448052 : goto out_unlock;
1200 : }
1201 : }
1202 :
1203 14035676 : if (!xfs_is_always_cow_inode(ip)) {
1204 13779593 : error = xfs_alloc_file_space(ip, offset, len);
1205 13779770 : if (error)
1206 430666 : goto out_unlock;
1207 : }
1208 : }
1209 :
1210 : /* Change file size if needed */
1211 54907870 : if (new_size) {
1212 9107278 : struct iattr iattr;
1213 :
1214 9107278 : iattr.ia_valid = ATTR_SIZE;
1215 9107278 : iattr.ia_size = new_size;
1216 18214560 : error = xfs_vn_setattr_size(file_mnt_idmap(file),
1217 : file_dentry(file), &iattr);
1218 9107273 : if (error)
1219 1356 : goto out_unlock;
1220 : }
1221 :
1222 : /*
1223 : * Perform hole insertion now that the file size has been
1224 : * updated so that if we crash during the operation we don't
1225 : * leave shifted extents past EOF and hence losing access to
1226 : * the data that is contained within them.
1227 : */
1228 54906509 : if (do_file_insert) {
1229 1860817 : error = xfs_insert_file_space(ip, offset, len);
1230 1860819 : if (error)
1231 3255 : goto out_unlock;
1232 : }
1233 :
1234 54903256 : if (xfs_file_sync_writes(file))
1235 29922 : error = xfs_log_force_inode(ip);
1236 :
1237 54873334 : out_unlock:
1238 57408124 : xfs_iunlock(ip, iolock);
1239 57408124 : return error;
1240 : }
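For reference, the FALLOC_FL_PUNCH_HOLE path above is normally driven from userspace as in the hedged sketch below; fallocate(2) requires FALLOC_FL_KEEP_SIZE together with FALLOC_FL_PUNCH_HOLE, and the wrapper name is hypothetical:

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>

/* Illustrative only: punch a hole without changing the file size. */
static int example_punch_hole(int fd, off_t offset, off_t len)
{
	return fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			 offset, len);
}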
1241 :
1242 : STATIC int
1243 11769975 : xfs_file_fadvise(
1244 : struct file *file,
1245 : loff_t start,
1246 : loff_t end,
1247 : int advice)
1248 : {
1249 11769975 : struct xfs_inode *ip = XFS_I(file_inode(file));
1250 11769975 : int ret;
1251 11769975 : int lockflags = 0;
1252 :
1253 : /*
1254 : * Operations creating pages in page cache need protection from hole
1255 : * punching and similar ops
1256 : */
1257 11769975 : if (advice == POSIX_FADV_WILLNEED) {
1258 0 : lockflags = XFS_IOLOCK_SHARED;
1259 0 : xfs_ilock(ip, lockflags);
1260 : }
1261 11769975 : ret = generic_fadvise(file, start, end, advice);
1262 11758079 : if (lockflags)
1263 0 : xfs_iunlock(ip, lockflags);
1264 11758079 : return ret;
1265 : }
1266 :
1267 : STATIC loff_t
1268 398542861 : xfs_file_remap_range(
1269 : struct file *file_in,
1270 : loff_t pos_in,
1271 : struct file *file_out,
1272 : loff_t pos_out,
1273 : loff_t len,
1274 : unsigned int remap_flags)
1275 : {
1276 398542861 : struct inode *inode_in = file_inode(file_in);
1277 398542861 : struct xfs_inode *src = XFS_I(inode_in);
1278 398542861 : struct inode *inode_out = file_inode(file_out);
1279 398542861 : struct xfs_inode *dest = XFS_I(inode_out);
1280 398542861 : struct xfs_mount *mp = src->i_mount;
1281 398542861 : loff_t remapped = 0;
1282 398542861 : xfs_extlen_t cowextsize;
1283 398542861 : int ret;
1284 :
1285 398542861 : if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
1286 : return -EINVAL;
1287 :
1288 398542861 : if (!xfs_has_reflink(mp))
1289 : return -EOPNOTSUPP;
1290 :
1291 777039224 : if (xfs_is_shutdown(mp))
1292 : return -EIO;
1293 :
1294 : /* Prepare and then clone file data. */
1295 388511897 : ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
1296 : &len, remap_flags);
1297 388562817 : if (ret || len == 0)
1298 145155528 : return ret;
1299 :
1300 243407289 : trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
1301 :
1302 243400417 : ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
1303 : &remapped);
1304 243377598 : if (ret)
1305 1378045 : goto out_unlock;
1306 :
1307 : /*
1308 : * Carry the cowextsize hint from src to dest if we're sharing the
1309 : * entire source file to the entire destination file, the source file
1310 : * has a cowextsize hint, and the destination file does not.
1311 : */
1312 241999553 : cowextsize = 0;
1313 241999553 : if (pos_in == 0 && len == i_size_read(inode_in) &&
1314 215833 : (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
1315 212 : pos_out == 0 && len >= i_size_read(inode_out) &&
1316 203 : !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE))
1317 27 : cowextsize = src->i_cowextsize;
1318 :
1319 241999553 : ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
1320 : remap_flags);
1321 241952138 : if (ret)
1322 0 : goto out_unlock;
1323 :
1324 241952138 : if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
1325 1098 : xfs_log_force_inode(dest);
1326 241953214 : out_unlock:
1327 243331339 : xfs_iunlock2_io_mmap(src, dest);
1328 243381616 : if (ret)
1329 1378022 : trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
1330 243381490 : return remapped > 0 ? remapped : ret;
1331 : }
1332 :
1333 : STATIC int
1334 755387812 : xfs_file_open(
1335 : struct inode *inode,
1336 : struct file *file)
1337 : {
1338 1510775624 : if (xfs_is_shutdown(XFS_M(inode->i_sb)))
1339 : return -EIO;
1340 755374667 : file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC |
1341 : FMODE_DIO_PARALLEL_WRITE | FMODE_CAN_ODIRECT;
1342 755374667 : return generic_file_open(inode, file);
1343 : }
1344 :
1345 : STATIC int
1346 44864334 : xfs_dir_open(
1347 : struct inode *inode,
1348 : struct file *file)
1349 : {
1350 44864334 : struct xfs_inode *ip = XFS_I(inode);
1351 44864334 : unsigned int mode;
1352 44864334 : int error;
1353 :
1354 44864334 : error = xfs_file_open(inode, file);
1355 44790015 : if (error)
1356 : return error;
1357 :
1358 : /*
1359 : * If there are any blocks, read-ahead block 0 as we're almost
1360 : * certain to have the next operation be a read there.
1361 : */
1362 44794659 : mode = xfs_ilock_data_map_shared(ip);
1363 44777865 : if (ip->i_df.if_nextents > 0)
1364 9913703 : error = xfs_dir3_data_readahead(ip, 0, 0);
1365 44787701 : xfs_iunlock(ip, mode);
1366 44787701 : return error;
1367 : }
1368 :
1369 : /*
1370 : * When we release the file, we don't want it to trim EOF blocks if it is a
1371 : * readonly context. This prevents open/read/close workloads from removing
1372 : * EOF blocks that other writers depend upon to reduce fragmentation.
1373 : */
1374 : STATIC int
1375 710475869 : xfs_file_release(
1376 : struct inode *inode,
1377 : struct file *file)
1378 : {
1379 710475869 : bool free_eof_blocks = true;
1380 :
1381 710475869 : if ((file->f_mode & (FMODE_WRITE | FMODE_READ)) == FMODE_READ)
1382 118660453 : free_eof_blocks = false;
1383 :
1384 710475869 : return xfs_release(XFS_I(inode), free_eof_blocks);
1385 : }
1386 :
1387 : STATIC int
1388 90770073 : xfs_file_readdir(
1389 : struct file *file,
1390 : struct dir_context *ctx)
1391 : {
1392 90770073 : struct inode *inode = file_inode(file);
1393 90770073 : xfs_inode_t *ip = XFS_I(inode);
1394 90770073 : size_t bufsize;
1395 :
1396 : /*
1397 : * The Linux API doesn't pass the total size of the buffer we
1398 : * read into down to the filesystem. With the filldir concept
1399 : * it's not needed for correctness, but the XFS dir2 leaf
1400 : * code wants an estimate of the buffer size to calculate its
1401 : * readahead window and size the buffers used for mapping to
1402 : * physical blocks.
1403 : *
1404 : * Try to give it an estimate that's good enough; maybe at some
1405 : * point we can change the ->readdir prototype to include the
1406 : * buffer size. For now we use the current glibc buffer size.
1407 : */
1408 90770073 : bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size);
1409 :
1410 90770073 : return xfs_readdir(NULL, ip, ctx, bufsize);
1411 : }
1412 :
1413 : STATIC loff_t
1414 102045447 : xfs_file_llseek(
1415 : struct file *file,
1416 : loff_t offset,
1417 : int whence)
1418 : {
1419 102045447 : struct inode *inode = file->f_mapping->host;
1420 :
1421 204090894 : if (xfs_is_shutdown(XFS_I(inode)->i_mount))
1422 : return -EIO;
1423 :
1424 102045442 : switch (whence) {
1425 101704266 : default:
1426 101704266 : return generic_file_llseek(file, offset, whence);
1427 1581 : case SEEK_HOLE:
1428 1581 : offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
1429 1581 : break;
1430 339595 : case SEEK_DATA:
1431 339595 : offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
1432 339595 : break;
1433 : }
1434 :
1435 341176 : if (offset < 0)
1436 : return offset;
1437 262849 : return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
1438 : }
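The SEEK_HOLE/SEEK_DATA cases above are reached through plain lseek(2); a hedged userspace sketch (hypothetical helper) that walks the data regions of a sparse file:

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>

/* Illustrative only: print each data region of a sparse file. */
static void example_walk_data(int fd)
{
	off_t data = 0, hole;

	while ((data = lseek(fd, data, SEEK_DATA)) >= 0) {
		/* There is always a virtual hole at EOF, so this succeeds. */
		hole = lseek(fd, data, SEEK_HOLE);
		if (hole < 0)
			break;
		printf("data: [%lld, %lld)\n", (long long)data,
		       (long long)hole);
		data = hole;
	}
}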
1439 :
1440 : #ifdef CONFIG_FS_DAX
1441 : static inline vm_fault_t
1442 0 : xfs_dax_fault(
1443 : struct vm_fault *vmf,
1444 : enum page_entry_size pe_size,
1445 : bool write_fault,
1446 : pfn_t *pfn)
1447 : {
1448 0 : return dax_iomap_fault(vmf, pe_size, pfn, NULL,
1449 0 : (write_fault && !vmf->cow_page) ?
1450 : &xfs_dax_write_iomap_ops :
1451 : &xfs_read_iomap_ops);
1452 : }
1453 : #else
1454 : static inline vm_fault_t
1455 : xfs_dax_fault(
1456 : struct vm_fault *vmf,
1457 : enum page_entry_size pe_size,
1458 : bool write_fault,
1459 : pfn_t *pfn)
1460 : {
1461 : ASSERT(0);
1462 : return VM_FAULT_SIGBUS;
1463 : }
1464 : #endif
1465 :
1466 : static int
1467 92725784 : xfs_filemap_fault_around(
1468 : struct vm_fault *vmf,
1469 : struct inode *inode)
1470 : {
1471 92725784 : struct xfs_inode *ip = XFS_I(inode);
1472 92725784 : struct folio *folio = page_folio(vmf->page);
1473 92702471 : loff_t pos;
1474 92702471 : ssize_t len;
1475 :
1476 92702471 : if (!xfs_inode_needs_cow_around(ip))
1477 : return 0;
1478 :
1479 6222705 : folio_lock(folio);
1480 6222708 : len = folio_mkwrite_check_truncate(folio, inode);
1481 6222706 : if (len < 0) {
1482 6 : folio_unlock(folio);
1483 6 : return len;
1484 : }
1485 6222700 : pos = folio_pos(folio);
1486 6222700 : folio_unlock(folio);
1487 :
1488 6222707 : if (!xfs_file_write_needs_cow_around(ip, pos, len))
1489 : return 0;
1490 :
1491 6210429 : return xfs_file_cow_around(XFS_I(inode), pos, len);
1492 : }
1493 :
1494 : /*
1495 : * Locking for serialisation of IO during page faults. This results in a lock
1496 : * ordering of:
1497 : *
1498 : * mmap_lock (MM)
1499 : * sb_start_pagefault(vfs, freeze)
1500 : * invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
1501 : * page_lock (MM)
1502 : * i_lock (XFS - extent map serialisation)
1503 : */
1504 : static vm_fault_t
1505 221800070 : __xfs_filemap_fault(
1506 : struct vm_fault *vmf,
1507 : enum page_entry_size pe_size,
1508 : bool write_fault)
1509 : {
1510 221800070 : struct inode *inode = file_inode(vmf->vma->vm_file);
1511 221800070 : struct xfs_inode *ip = XFS_I(inode);
1512 221800070 : vm_fault_t ret;
1513 :
1514 221800070 : trace_xfs_filemap_fault(ip, pe_size, write_fault);
1515 :
1516 221826474 : if (write_fault) {
1517 92972933 : sb_start_pagefault(inode->i_sb);
1518 92809907 : file_update_time(vmf->vma->vm_file);
1519 : }
1520 :
1521 221717413 : if (IS_DAX(inode)) {
1522 0 : pfn_t pfn;
1523 :
1524 0 : xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1525 0 : ret = xfs_dax_fault(vmf, pe_size, write_fault, &pfn);
1526 0 : if (ret & VM_FAULT_NEEDDSYNC)
1527 0 : ret = dax_finish_sync_fault(vmf, pe_size, pfn);
1528 0 : xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1529 : } else {
1530 221717413 : if (write_fault) {
1531 92862578 : int error;
1532 :
1533 92862578 : xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1534 :
1535 : /*
1536 : * Unshare all the blocks in this rt extent surrounding
1537 : * this page.
1538 : */
1539 92832903 : error = xfs_filemap_fault_around(vmf, inode);
1540 92775737 : if (error) {
1541 6071 : xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1542 6071 : ret = block_page_mkwrite_return(error);
1543 6071 : goto out;
1544 : }
1545 :
1546 92769666 : ret = iomap_page_mkwrite(vmf,
1547 : &xfs_page_mkwrite_iomap_ops);
1548 92729701 : xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1549 : } else {
1550 128854835 : ret = filemap_fault(vmf);
1551 : }
1552 : }
1553 :
1554 221808207 : out:
1555 221808207 : if (write_fault)
1556 92893971 : sb_end_pagefault(inode->i_sb);
1557 221723162 : return ret;
1558 : }
1559 :
1560 : static inline bool
1561 : xfs_is_write_fault(
1562 : struct vm_fault *vmf)
1563 : {
1564 0 : return (vmf->flags & FAULT_FLAG_WRITE) &&
1565 0 : (vmf->vma->vm_flags & VM_SHARED);
1566 : }
1567 :
1568 : static vm_fault_t
1569 128916407 : xfs_filemap_fault(
1570 : struct vm_fault *vmf)
1571 : {
1572 : /* DAX can shortcut the normal fault path on write faults! */
1573 128916407 : return __xfs_filemap_fault(vmf, PE_SIZE_PTE,
1574 128916407 : IS_DAX(file_inode(vmf->vma->vm_file)) &&
1575 : xfs_is_write_fault(vmf));
1576 : }
1577 :
1578 : static vm_fault_t
1579 16431 : xfs_filemap_huge_fault(
1580 : struct vm_fault *vmf,
1581 : enum page_entry_size pe_size)
1582 : {
1583 16431 : if (!IS_DAX(file_inode(vmf->vma->vm_file)))
1584 : return VM_FAULT_FALLBACK;
1585 :
1586 : /* DAX can shortcut the normal fault path on write faults! */
1587 0 : return __xfs_filemap_fault(vmf, pe_size,
1588 : xfs_is_write_fault(vmf));
1589 : }
1590 :
1591 : static vm_fault_t
1592 93010711 : xfs_filemap_page_mkwrite(
1593 : struct vm_fault *vmf)
1594 : {
1595 93010711 : return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
1596 : }
1597 :
1598 : /*
1599 : * pfn_mkwrite was originally intended to ensure we capture time stamp updates
1600 : * on write faults. In reality, it needs to serialise against truncate and
1601 : * prepare memory for writing, so handle it as a standard write fault.
1602 : */
1603 : static vm_fault_t
1604 0 : xfs_filemap_pfn_mkwrite(
1605 : struct vm_fault *vmf)
1606 : {
1607 :
1608 0 : return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
1609 : }
1610 :
1611 : static const struct vm_operations_struct xfs_file_vm_ops = {
1612 : .fault = xfs_filemap_fault,
1613 : .huge_fault = xfs_filemap_huge_fault,
1614 : .map_pages = filemap_map_pages,
1615 : .page_mkwrite = xfs_filemap_page_mkwrite,
1616 : .pfn_mkwrite = xfs_filemap_pfn_mkwrite,
1617 : };
1618 :
1619 : STATIC int
1620 15647236 : xfs_file_mmap(
1621 : struct file *file,
1622 : struct vm_area_struct *vma)
1623 : {
1624 15647236 : struct inode *inode = file_inode(file);
1625 15647236 : struct xfs_buftarg *target = xfs_inode_buftarg(XFS_I(inode));
1626 :
1627 : /*
1628 : * We don't support synchronous mappings for non-DAX files and
1629 : * for DAX files if the underlying dax_device is not synchronous.
1630 : */
1631 15647236 : if (!daxdev_mapping_supported(vma, target->bt_daxdev))
1632 : return -EOPNOTSUPP;
1633 :
1634 15646988 : file_accessed(file);
1635 15647187 : vma->vm_ops = &xfs_file_vm_ops;
1636 15647187 : if (IS_DAX(inode))
1637 0 : vm_flags_set(vma, VM_HUGEPAGE);
1638 : return 0;
1639 : }
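Because MAP_SYNC is only honoured when daxdev_mapping_supported() above succeeds, a userspace caller has to be prepared for EOPNOTSUPP on non-DAX files. A hedged sketch with a hypothetical helper; MAP_SYNC must be paired with MAP_SHARED_VALIDATE, and on older toolchains the flag may need <linux/mman.h>:

#include <sys/mman.h>

/* Illustrative only: ask for a synchronous DAX mapping, falling back to a
 * regular shared mapping if the file or device does not support it. */
static void *example_map_sync(int fd, size_t len)
{
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_SHARED_VALIDATE | MAP_SYNC, fd, 0);

	if (p == MAP_FAILED)
		p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
			 fd, 0);
	return p;
}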
1640 :
1641 : const struct file_operations xfs_file_operations = {
1642 : .llseek = xfs_file_llseek,
1643 : .read_iter = xfs_file_read_iter,
1644 : .write_iter = xfs_file_write_iter,
1645 : .splice_read = xfs_file_splice_read,
1646 : .splice_write = iter_file_splice_write,
1647 : .iopoll = iocb_bio_iopoll,
1648 : .unlocked_ioctl = xfs_file_ioctl,
1649 : #ifdef CONFIG_COMPAT
1650 : .compat_ioctl = xfs_file_compat_ioctl,
1651 : #endif
1652 : .mmap = xfs_file_mmap,
1653 : .mmap_supported_flags = MAP_SYNC,
1654 : .open = xfs_file_open,
1655 : .release = xfs_file_release,
1656 : .fsync = xfs_file_fsync,
1657 : .get_unmapped_area = thp_get_unmapped_area,
1658 : .fallocate = xfs_file_fallocate,
1659 : .fadvise = xfs_file_fadvise,
1660 : .remap_file_range = xfs_file_remap_range,
1661 : };
1662 :
1663 : const struct file_operations xfs_dir_file_operations = {
1664 : .open = xfs_dir_open,
1665 : .read = generic_read_dir,
1666 : .iterate_shared = xfs_file_readdir,
1667 : .llseek = generic_file_llseek,
1668 : .unlocked_ioctl = xfs_file_ioctl,
1669 : #ifdef CONFIG_COMPAT
1670 : .compat_ioctl = xfs_file_compat_ioctl,
1671 : #endif
1672 : .fsync = xfs_dir_fsync,
1673 : };