Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-or-later
2 : /*
3 : * Copyright (C) 2020-2023 Oracle. All Rights Reserved.
4 : * Author: Darrick J. Wong <djwong@kernel.org>
5 : *
6 : * The xfs_swap_extent_* functions are:
7 : * Copyright (c) 2000-2006 Silicon Graphics, Inc.
8 : * Copyright (c) 2012 Red Hat, Inc.
9 : * All Rights Reserved.
10 : */
11 : #include "xfs.h"
12 : #include "xfs_shared.h"
13 : #include "xfs_format.h"
14 : #include "xfs_log_format.h"
15 : #include "xfs_trans_resv.h"
16 : #include "xfs_mount.h"
17 : #include "xfs_defer.h"
18 : #include "xfs_inode.h"
19 : #include "xfs_trans.h"
20 : #include "xfs_quota.h"
21 : #include "xfs_bmap_util.h"
22 : #include "xfs_bmap_btree.h"
23 : #include "xfs_reflink.h"
24 : #include "xfs_trace.h"
25 : #include "xfs_swapext.h"
26 : #include "xfs_xchgrange.h"
27 : #include "xfs_sb.h"
28 : #include "xfs_icache.h"
29 : #include "xfs_log.h"
30 : #include "xfs_rtalloc.h"
31 : #include <linux/fsnotify.h>
32 :
33 : /*
34 : * Generic code for exchanging ranges of two files via XFS_IOC_EXCHANGE_RANGE.
35 : * This part does not deal with XFS-specific data structures, and may some day
36 : * be ported to the VFS.
37 : *
38 : * The goal is to exchange fxr.length bytes starting at fxr.file1_offset in
39 : * file1 with the same number of bytes starting at fxr.file2_offset in file2.
40 : * Implementations must call xfs_exch_range_prep to prepare the two files
41 : * prior to taking locks; they must call xfs_exch_range_check_fresh once
42 : * the inode is locked to abort the call if file2 has changed; and they must
43 : * update the inode change and mod times of both files as part of the metadata
44 : * update. The timestamp updates must be done atomically as part of the data
45 : * exchange operation to ensure correctness of the freshness check.
46 : */
47 :
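/*
 * Illustrative sketch (not compiled, and not the XFS binding itself, which
 * is xfs_file_xchg_range() later in this file) of the calling convention
 * described above, using only generic VFS inode locking helpers. The
 * function name and the stub error value are placeholders.
 */
#if 0
static int example_xchg_range(struct file *file1, struct file *file2,
		struct xfs_exch_range *fxr)
{
	struct inode *inode1 = file_inode(file1);
	struct inode *inode2 = file_inode(file2);
	int error;

	/* Validate the request and flush dirty data before taking locks. */
	error = xfs_exch_range_prep(file1, file2, fxr, i_blocksize(inode2));
	if (error || fxr->length == 0)
		return error;

	/* Lock out other metadata updates, then repeat the freshness check. */
	lock_two_nondirectories(inode1, inode2);
	error = xfs_exch_range_check_fresh(inode2, fxr);
	if (!error) {
		/*
		 * Exchange the mappings and update both inodes' change and
		 * modification times in the same atomic metadata update.
		 */
		error = -EOPNOTSUPP;	/* filesystem-specific work goes here */
	}
	unlock_two_nondirectories(inode1, inode2);
	return error;
}
#endif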
48 : /*
49 : * Check that both files' metadata agree with the snapshot that we took for
50 : * the range exchange request.
51 : *
52 : * This should be called after the filesystem has locked /all/ inode metadata
53 : * against modification.
54 : */
55 : STATIC int
56 353666 : xfs_exch_range_check_fresh(
57 : struct inode *inode2,
58 : const struct xfs_exch_range *fxr)
59 : {
60 : /* Check that file2 hasn't otherwise been modified. */
61 353666 : if ((fxr->flags & XFS_EXCH_RANGE_FILE2_FRESH) &&
62 13096 : (fxr->file2_ino != inode2->i_ino ||
63 13096 : fxr->file2_ctime != inode2->i_ctime.tv_sec ||
64 13096 : fxr->file2_ctime_nsec != inode2->i_ctime.tv_nsec ||
65 13090 : fxr->file2_mtime != inode2->i_mtime.tv_sec ||
66 13090 : fxr->file2_mtime_nsec != inode2->i_mtime.tv_nsec))
67 6 : return -EBUSY;
68 :
69 : return 0;
70 : }
71 :
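/*
 * For illustration only: the XFS_EXCH_RANGE_FILE2_FRESH snapshot that the
 * check above compares against would be captured from file2 before any
 * locks are taken, along these lines (a sketch; the helper name is made up):
 */
#if 0
static void example_snapshot_file2(struct inode *inode2,
		struct xfs_exch_range *fxr)
{
	fxr->file2_ino = inode2->i_ino;
	fxr->file2_ctime = inode2->i_ctime.tv_sec;
	fxr->file2_ctime_nsec = inode2->i_ctime.tv_nsec;
	fxr->file2_mtime = inode2->i_mtime.tv_sec;
	fxr->file2_mtime_nsec = inode2->i_mtime.tv_nsec;
	fxr->flags |= XFS_EXCH_RANGE_FILE2_FRESH;
}
#endif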
72 : /* Performs necessary checks before doing a range exchange. */
73 : STATIC int
74 176871 : xfs_exch_range_checks(
75 : struct file *file1,
76 : struct file *file2,
77 : struct xfs_exch_range *fxr,
78 : unsigned int blocksize)
79 : {
80 176871 : struct inode *inode1 = file1->f_mapping->host;
81 176871 : struct inode *inode2 = file2->f_mapping->host;
82 176871 : uint64_t blkmask = blocksize - 1;
83 176871 : int64_t test_len;
84 176871 : uint64_t blen;
85 176871 : loff_t size1, size2;
86 176871 : int error;
87 :
88 : /* Don't touch certain kinds of inodes */
89 176871 : if (IS_IMMUTABLE(inode1) || IS_IMMUTABLE(inode2))
90 : return -EPERM;
91 176869 : if (IS_SWAPFILE(inode1) || IS_SWAPFILE(inode2))
92 : return -ETXTBSY;
93 :
94 176867 : size1 = i_size_read(inode1);
95 176867 : size2 = i_size_read(inode2);
96 :
97 : /* Ranges cannot start after EOF. */
98 176867 : if (fxr->file1_offset > size1 || fxr->file2_offset > size2)
99 : return -EINVAL;
100 :
101 : /*
102 : * If the caller asked for full files, check that the offset/length
103 : * values cover all of both files.
104 : */
105 176859 : if ((fxr->flags & XFS_EXCH_RANGE_FULL_FILES) &&
106 6459 : (fxr->file1_offset != 0 || fxr->file2_offset != 0 ||
107 6459 : fxr->length != size1 || fxr->length != size2))
108 : return -EDOM;
109 :
110 : /*
111 : * If the caller said to exchange to EOF, we set the length of the
112 : * request large enough to cover everything to the end of both files.
113 : */
114 176855 : if (fxr->flags & XFS_EXCH_RANGE_TO_EOF)
115 74 : fxr->length = max_t(int64_t, size1 - fxr->file1_offset,
116 : size2 - fxr->file2_offset);
117 :
118 : /* The start of both ranges must be aligned to an fs block. */
119 176855 : if (!IS_ALIGNED(fxr->file1_offset, blocksize) ||
120 176853 : !IS_ALIGNED(fxr->file2_offset, blocksize))
121 : return -EINVAL;
122 :
123 : /* Ensure offsets don't wrap. */
124 176853 : if (fxr->file1_offset + fxr->length < fxr->file1_offset ||
125 176853 : fxr->file2_offset + fxr->length < fxr->file2_offset)
126 : return -EINVAL;
127 :
128 : /*
129 : * We require both ranges to be within EOF, unless we're exchanging
130 : * to EOF. We already checked above that both fxr->file1_offset
131 : * and fxr->file2_offset are within EOF.
132 : */
133 176853 : if (!(fxr->flags & XFS_EXCH_RANGE_TO_EOF) &&
134 176779 : (fxr->file1_offset + fxr->length > size1 ||
135 176775 : fxr->file2_offset + fxr->length > size2))
136 : return -EINVAL;
137 :
138 : /*
139 : * Make sure we don't hit any file size limits. If we hit any size
140 : * limits such that test_len was adjusted, we abort the whole
141 : * operation.
142 : */
143 176847 : test_len = fxr->length;
144 176847 : error = generic_write_check_limits(file2, fxr->file2_offset, &test_len);
145 176847 : if (error)
146 : return error;
147 176847 : error = generic_write_check_limits(file1, fxr->file1_offset, &test_len);
148 176847 : if (error)
149 : return error;
150 176847 : if (test_len != fxr->length)
151 : return -EINVAL;
152 :
153 : /*
154 : * If the user wanted us to exchange up to the infile's EOF, round up
155 : * to the next block boundary for this check. Do the same for the
156 : * outfile.
157 : *
158 : * Otherwise, reject the range length if it's not block aligned. We
159 : * already confirmed the starting offsets' block alignment.
160 : */
161 176845 : if (fxr->file1_offset + fxr->length == size1)
162 6631 : blen = ALIGN(size1, blocksize) - fxr->file1_offset;
163 170214 : else if (fxr->file2_offset + fxr->length == size2)
164 7585 : blen = ALIGN(size2, blocksize) - fxr->file2_offset;
165 162629 : else if (!IS_ALIGNED(fxr->length, blocksize))
166 : return -EINVAL;
167 : else
168 : blen = fxr->length;
169 :
170 : /* Don't allow overlapped exchanges within the same file. */
171 176845 : if (inode1 == inode2 &&
172 170282 : fxr->file2_offset + blen > fxr->file1_offset &&
173 95617 : fxr->file1_offset + blen > fxr->file2_offset)
174 : return -EINVAL;
175 :
176 : /* If we already failed the freshness check, we're done. */
177 176839 : error = xfs_exch_range_check_fresh(inode2, fxr);
178 176839 : if (error)
179 : return error;
180 :
181 : /*
182 : * Ensure that we don't exchange a partial EOF block into the middle of
183 : * another file.
184 : */
185 176833 : if ((fxr->length & blkmask) == 0)
186 : return 0;
187 :
188 1605 : blen = fxr->length;
189 1605 : if (fxr->file2_offset + blen < size2)
190 2 : blen &= ~blkmask;
191 :
192 1605 : if (fxr->file1_offset + blen < size1)
193 4 : blen &= ~blkmask;
194 :
195 1605 : return blen == fxr->length ? 0 : -EINVAL;
196 : }
197 :
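/*
 * Worked example of the checks above, with illustrative numbers and a
 * 4096-byte block size: block-aligned offsets with length == 6000 are
 * accepted only if those 6000 bytes run exactly to EOF in both files. If
 * the length reaches EOF in neither file, the IS_ALIGNED() length check
 * rejects it; if it reaches EOF in only one file, the final partial-EOF
 * check rounds blen down to 4096 and returns -EINVAL, since a partial EOF
 * block must not be exchanged into the middle of the other file.
 */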
198 : /*
199 : * Check that the two inodes are eligible for range exchanges, the ranges make
200 : * sense, and then flush all dirty data. Caller must ensure that the inodes
201 : * have been locked against any other modifications.
202 : */
203 : int
204 176871 : xfs_exch_range_prep(
205 : struct file *file1,
206 : struct file *file2,
207 : struct xfs_exch_range *fxr,
208 : unsigned int blocksize)
209 : {
210 176871 : struct inode *inode1 = file_inode(file1);
211 176871 : struct inode *inode2 = file_inode(file2);
212 176871 : bool same_inode = (inode1 == inode2);
213 176871 : int error;
214 :
215 : /* Check that we don't violate system file offset limits. */
216 176871 : error = xfs_exch_range_checks(file1, file2, fxr, blocksize);
217 176871 : if (error || fxr->length == 0)
218 : return error;
219 :
220 : /* Wait for the completion of any pending IOs on both files */
221 176713 : inode_dio_wait(inode1);
222 176713 : if (!same_inode)
223 6553 : inode_dio_wait(inode2);
224 :
225 176713 : error = filemap_write_and_wait_range(inode1->i_mapping,
226 : fxr->file1_offset,
227 176713 : fxr->file1_offset + fxr->length - 1);
228 176713 : if (error)
229 : return error;
230 :
231 176713 : error = filemap_write_and_wait_range(inode2->i_mapping,
232 : fxr->file2_offset,
233 176713 : fxr->file2_offset + fxr->length - 1);
234 176713 : if (error)
235 : return error;
236 :
237 : /*
238 : * If the files or inodes involved require synchronous writes, amend
239 : * the request to force the filesystem to flush all data and metadata
240 : * to disk after the operation completes.
241 : */
242 176713 : if (((file1->f_flags | file2->f_flags) & (__O_SYNC | O_DSYNC)) ||
243 170819 : IS_SYNC(inode1) || IS_SYNC(inode2))
244 5894 : fxr->flags |= XFS_EXCH_RANGE_FSYNC;
245 :
246 : return 0;
247 : }
248 :
249 : /*
250 : * Finish a range exchange operation, if it was successful. Caller must ensure
251 : * that the inodes are still locked against any other modifications.
252 : */
253 : int
254 176809 : xfs_exch_range_finish(
255 : struct file *file1,
256 : struct file *file2)
257 : {
258 176809 : int error;
259 :
260 176809 : error = file_remove_privs(file1);
261 176809 : if (error)
262 : return error;
263 176809 : if (file_inode(file1) == file_inode(file2))
264 : return 0;
265 :
266 6537 : return file_remove_privs(file2);
267 : }
268 :
269 : /* Decide if it's ok to remap the selected range of a given file. */
270 : STATIC int
271 353742 : xfs_exch_range_verify_area(
272 : struct file *file,
273 : loff_t pos,
274 : struct xfs_exch_range *fxr)
275 : {
276 353742 : int64_t len = fxr->length;
277 :
278 353742 : if (pos < 0)
279 : return -EINVAL;
280 :
281 353742 : if (fxr->flags & XFS_EXCH_RANGE_TO_EOF)
282 148 : len = min_t(int64_t, len, i_size_read(file_inode(file)) - pos);
283 353742 : return remap_verify_area(file, pos, len, true);
284 : }
285 :
286 : /* Prepare for and exchange parts of two files. */
287 : static inline int
288 1703894 : __xfs_exch_range(
289 : struct file *file1,
290 : struct file *file2,
291 : struct xfs_exch_range *fxr)
292 : {
293 1703894 : struct inode *inode1 = file_inode(file1);
294 1703894 : struct inode *inode2 = file_inode(file2);
295 1703894 : int ret;
296 :
297 3407788 : if ((fxr->flags & ~XFS_EXCH_RANGE_ALL_FLAGS) ||
298 1703894 : memchr_inv(&fxr->pad, 0, sizeof(fxr->pad)))
299 0 : return -EINVAL;
300 :
301 1703894 : if ((fxr->flags & XFS_EXCH_RANGE_FULL_FILES) &&
302 : (fxr->flags & XFS_EXCH_RANGE_TO_EOF))
303 : return -EINVAL;
304 :
305 : /*
306 : * The ioctl enforces that src and dest files are on the same mount.
307 : * However, they only need to be on the same file system.
308 : */
309 1703894 : if (inode1->i_sb != inode2->i_sb)
310 : return -EXDEV;
311 :
312 : /* This only works for regular files. */
313 1703894 : if (S_ISDIR(inode1->i_mode) || S_ISDIR(inode2->i_mode))
314 : return -EISDIR;
315 1703892 : if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
316 : return -EINVAL;
317 :
318 1703892 : ret = generic_file_rw_checks(file1, file2);
319 1703893 : if (ret < 0)
320 : return ret;
321 :
322 1703891 : ret = generic_file_rw_checks(file2, file1);
323 1703890 : if (ret < 0)
324 : return ret;
325 :
326 176871 : ret = xfs_exch_range_verify_area(file1, fxr->file1_offset, fxr);
327 176871 : if (ret)
328 : return ret;
329 :
330 176871 : ret = xfs_exch_range_verify_area(file2, fxr->file2_offset, fxr);
331 176871 : if (ret)
332 : return ret;
333 :
334 176871 : ret = xfs_file_xchg_range(file1, file2, fxr);
335 176871 : if (ret)
336 : return ret;
337 :
338 176809 : fsnotify_modify(file1);
339 176809 : if (file2 != file1)
340 6541 : fsnotify_modify(file2);
341 : return 0;
342 : }
343 :
344 : /* Exchange parts of two files. */
345 : int
346 1703896 : xfs_exch_range(
347 : struct file *file1,
348 : struct file *file2,
349 : struct xfs_exch_range *fxr)
350 : {
351 1703896 : int error;
352 :
353 1703896 : file_start_write(file2);
354 1703894 : error = __xfs_exch_range(file1, file2, fxr);
355 1703887 : file_end_write(file2);
356 1703892 : return error;
357 : }
358 :
359 : /* XFS-specific parts of XFS_IOC_EXCHANGE_RANGE */
360 :
361 : /*
362 : * Exchanging ranges as a file operation. This is the binding between the
363 : * VFS-level concepts and the XFS-specific implementation.
364 : */
365 : int
366 176871 : xfs_file_xchg_range(
367 : struct file *file1,
368 : struct file *file2,
369 : struct xfs_exch_range *fxr)
370 : {
371 176871 : struct inode *inode1 = file_inode(file1);
372 176871 : struct inode *inode2 = file_inode(file2);
373 176871 : struct xfs_inode *ip1 = XFS_I(inode1);
374 176871 : struct xfs_inode *ip2 = XFS_I(inode2);
375 176871 : struct xfs_mount *mp = ip1->i_mount;
376 176871 : unsigned int priv_flags = 0;
377 176871 : bool use_logging = false;
378 176871 : int error;
379 :
380 353742 : if (xfs_is_shutdown(mp))
381 : return -EIO;
382 :
383 : /* Update cmtime if the fd/inode don't forbid it. */
384 176871 : if (likely(!(file1->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode1)))
385 176871 : priv_flags |= XFS_XCHG_RANGE_UPD_CMTIME1;
386 176871 : if (likely(!(file2->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode2)))
387 170892 : priv_flags |= XFS_XCHG_RANGE_UPD_CMTIME2;
388 :
389 : /* Lock both files against IO */
390 176871 : error = xfs_ilock2_io_mmap(ip1, ip2);
391 176871 : if (error)
392 0 : goto out_err;
393 :
394 : /* Prepare and then exchange file contents. */
395 176871 : error = xfs_xchg_range_prep(file1, file2, fxr, priv_flags);
396 176871 : if (error)
397 42 : goto out_unlock;
398 :
399 : /* Get permission to use log-assisted file content swaps. */
400 176829 : error = xfs_xchg_range_grab_log_assist(mp,
401 176829 : !(fxr->flags & XFS_EXCH_RANGE_NONATOMIC),
402 : &use_logging);
403 176829 : if (error)
404 2 : goto out_unlock;
405 176827 : if (use_logging)
406 170364 : priv_flags |= XFS_XCHG_RANGE_LOGGED;
407 :
408 176827 : error = xfs_xchg_range(ip1, ip2, fxr, priv_flags);
409 176827 : if (error)
410 18 : goto out_drop_feat;
411 :
412 : /*
413 : * Finish the exchange by removing special file privileges like any
414 : * other file write would do. This may involve turning on support for
415 : * logged xattrs if either file has security capabilities. This is why
416 : * xfs_xchg_range_grab_log_assist must be called before xfs_attr_grab_log_assist.
417 : */
418 176809 : error = xfs_exch_range_finish(file1, file2);
419 176809 : if (error)
420 0 : goto out_drop_feat;
421 :
422 176809 : out_drop_feat:
423 176827 : if (use_logging)
424 170364 : xfs_xchg_range_rele_log_assist(mp);
425 6463 : out_unlock:
426 176871 : xfs_iunlock2_io_mmap(ip1, ip2);
427 176871 : out_err:
428 176871 : if (error)
429 62 : trace_xfs_file_xchg_range_error(ip2, error, _RET_IP_);
430 : return error;
431 : }
432 :
433 : /* Lock (and optionally join) two inodes for a file range exchange. */
434 : void
435 610633 : xfs_xchg_range_ilock(
436 : struct xfs_trans *tp,
437 : struct xfs_inode *ip1,
438 : struct xfs_inode *ip2)
439 : {
440 610633 : if (ip1 != ip2)
441 270085 : xfs_lock_two_inodes(ip1, XFS_ILOCK_EXCL,
442 : ip2, XFS_ILOCK_EXCL);
443 : else
444 340548 : xfs_ilock(ip1, XFS_ILOCK_EXCL);
445 610646 : if (tp) {
446 390875 : xfs_trans_ijoin(tp, ip1, 0);
447 390862 : if (ip2 != ip1)
448 220588 : xfs_trans_ijoin(tp, ip2, 0);
449 : }
450 :
451 610638 : }
452 :
453 : /* Unlock two inodes after a file range exchange operation. */
454 : void
455 396604 : xfs_xchg_range_iunlock(
456 : struct xfs_inode *ip1,
457 : struct xfs_inode *ip2)
458 : {
459 396604 : if (ip2 != ip1)
460 56056 : xfs_iunlock(ip2, XFS_ILOCK_EXCL);
461 396604 : xfs_iunlock(ip1, XFS_ILOCK_EXCL);
462 396604 : }
463 :
464 : /*
465 : * Estimate the resource requirements to exchange file contents between the two
466 : * files. The caller is required to hold the IOLOCK and the MMAPLOCK and to
467 : * have flushed both inodes' pagecache and active direct-ios.
468 : */
469 : int
470 219765 : xfs_xchg_range_estimate(
471 : struct xfs_swapext_req *req)
472 : {
473 219765 : int error;
474 :
475 219765 : xfs_xchg_range_ilock(NULL, req->ip1, req->ip2);
476 219765 : error = xfs_swapext_estimate(req);
477 219765 : xfs_xchg_range_iunlock(req->ip1, req->ip2);
478 219765 : return error;
479 : }
480 :
481 : /*
482 : * We need to check that the format of the data fork in the temporary inode is
483 : * valid for the target inode before doing the swap. This is not a problem with
484 : * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
485 : * data fork depending on the space the attribute fork is taking so we can get
486 : * invalid formats on the target inode.
487 : *
488 : * E.g. target has space for 7 extents in extent format, temp inode only has
489 : * space for 6. If we defragment down to 7 extents, then the tmp format is a
490 : * btree, but when swapped it needs to be in extent format. Hence we can't just
491 : * blindly swap data forks on attr2 filesystems.
492 : *
493 : * Note that we check the swap in both directions so that we don't end up with
494 : * a corrupt temporary inode, either.
495 : *
496 : * Note that fixing the way xfs_fsr sets up the attribute fork in the source
497 : * inode will prevent this situation from occurring, so all we do here is
498 : * reject and log the attempt. Basically, we are putting the responsibility on
499 : * userspace to get this right.
500 : */
501 : STATIC int
502 0 : xfs_swap_extents_check_format(
503 : struct xfs_inode *ip, /* target inode */
504 : struct xfs_inode *tip) /* tmp inode */
505 : {
506 0 : struct xfs_ifork *ifp = &ip->i_df;
507 0 : struct xfs_ifork *tifp = &tip->i_df;
508 :
509 : /* User/group/project quota ids must match if quotas are enforced. */
510 0 : if (XFS_IS_QUOTA_ON(ip->i_mount) &&
511 0 : (!uid_eq(VFS_I(ip)->i_uid, VFS_I(tip)->i_uid) ||
512 0 : !gid_eq(VFS_I(ip)->i_gid, VFS_I(tip)->i_gid) ||
513 0 : ip->i_projid != tip->i_projid))
514 : return -EINVAL;
515 :
516 : /* Should never get a local format */
517 0 : if (ifp->if_format == XFS_DINODE_FMT_LOCAL ||
518 0 : tifp->if_format == XFS_DINODE_FMT_LOCAL)
519 : return -EINVAL;
520 :
521 : /*
522 : * If the target inode has fewer extents than the temporary inode, then
523 : * why did userspace call us?
524 : */
525 0 : if (ifp->if_nextents < tifp->if_nextents)
526 : return -EINVAL;
527 :
528 : /*
529 : * If we have to use the (expensive) rmap swap method, we can
530 : * handle any number of extents and any format.
531 : */
532 0 : if (xfs_has_rmapbt(ip->i_mount))
533 : return 0;
534 :
535 : /*
536 : * If the target inode is in extent form and the temp inode is in btree
537 : * form, then we will end up with the target inode in the wrong format,
538 : * as we already know there are fewer extents in the temp inode.
539 : */
540 0 : if (ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
541 : tifp->if_format == XFS_DINODE_FMT_BTREE)
542 : return -EINVAL;
543 :
544 : /* Check temp in extent form to max in target */
545 0 : if (tifp->if_format == XFS_DINODE_FMT_EXTENTS &&
546 0 : tifp->if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
547 : return -EINVAL;
548 :
549 : /* Check target in extent form to max in temp */
550 0 : if (ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
551 0 : ifp->if_nextents > XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
552 : return -EINVAL;
553 :
554 : /*
555 : * If we are in a btree format, check that the temp root block will fit
556 : * in the target and that it has enough extents to be in btree format
557 : * in the target.
558 : *
559 : * Note that we have to be careful to allow btree->extent conversions
560 : * (a common defrag case) which will occur when the temp inode is in
561 : * extent format...
562 : */
563 0 : if (tifp->if_format == XFS_DINODE_FMT_BTREE) {
564 0 : if (xfs_inode_has_attr_fork(ip) &&
565 0 : XFS_BMAP_BMDR_SPACE(tifp->if_broot) > xfs_inode_fork_boff(ip))
566 : return -EINVAL;
567 0 : if (tifp->if_nextents <= XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
568 : return -EINVAL;
569 : }
570 :
571 : /* Reciprocal target->temp btree format checks */
572 0 : if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
573 0 : if (xfs_inode_has_attr_fork(tip) &&
574 0 : XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > xfs_inode_fork_boff(tip))
575 : return -EINVAL;
576 0 : if (ifp->if_nextents <= XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
577 0 : return -EINVAL;
578 : }
579 :
580 : return 0;
581 : }
582 :
583 : /*
584 : * Fix up the owners of the bmbt blocks to refer to the current inode. The
585 : * change owner scan attempts to order all modified buffers in the current
586 : * transaction. In the event of ordered buffer failure, the offending buffer is
587 : * physically logged as a fallback and the scan returns -EAGAIN. We must roll
588 : * the transaction in this case to replenish the fallback log reservation and
589 : * restart the scan. This process repeats until the scan completes.
590 : */
591 : static int
592 0 : xfs_swap_change_owner(
593 : struct xfs_trans **tpp,
594 : struct xfs_inode *ip,
595 : struct xfs_inode *tmpip)
596 : {
597 0 : int error;
598 0 : struct xfs_trans *tp = *tpp;
599 :
600 0 : do {
601 0 : error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, ip->i_ino,
602 : NULL);
603 : /* success or fatal error */
604 0 : if (error != -EAGAIN)
605 : break;
606 :
607 0 : error = xfs_trans_roll(tpp);
608 0 : if (error)
609 : break;
610 0 : tp = *tpp;
611 :
612 : /*
613 : * Redirty both inodes so they can relog and keep the log tail
614 : * moving forward.
615 : */
616 0 : xfs_trans_ijoin(tp, ip, 0);
617 0 : xfs_trans_ijoin(tp, tmpip, 0);
618 0 : xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
619 0 : xfs_trans_log_inode(tp, tmpip, XFS_ILOG_CORE);
620 : } while (true);
621 :
622 0 : return error;
623 : }
624 :
625 : /* Swap the extents of two files by swapping data forks. */
626 : STATIC int
627 0 : xfs_swap_extent_forks(
628 : struct xfs_trans **tpp,
629 : struct xfs_swapext_req *req)
630 : {
631 0 : struct xfs_inode *ip = req->ip2;
632 0 : struct xfs_inode *tip = req->ip1;
633 0 : xfs_filblks_t aforkblks = 0;
634 0 : xfs_filblks_t taforkblks = 0;
635 0 : xfs_extnum_t junk;
636 0 : uint64_t tmp;
637 0 : int src_log_flags = XFS_ILOG_CORE;
638 0 : int target_log_flags = XFS_ILOG_CORE;
639 0 : int error;
640 :
641 : /*
642 : * Count the number of extended attribute blocks
643 : */
644 0 : if (xfs_inode_has_attr_fork(ip) && ip->i_af.if_nextents > 0 &&
645 0 : ip->i_af.if_format != XFS_DINODE_FMT_LOCAL) {
646 0 : error = xfs_bmap_count_blocks(*tpp, ip, XFS_ATTR_FORK, &junk,
647 : &aforkblks);
648 0 : if (error)
649 : return error;
650 : }
651 0 : if (xfs_inode_has_attr_fork(tip) && tip->i_af.if_nextents > 0 &&
652 0 : tip->i_af.if_format != XFS_DINODE_FMT_LOCAL) {
653 0 : error = xfs_bmap_count_blocks(*tpp, tip, XFS_ATTR_FORK, &junk,
654 : &taforkblks);
655 0 : if (error)
656 : return error;
657 : }
658 :
659 : /*
660 : * Btree format (v3) inodes have the inode number stamped in the bmbt
661 : * block headers. We can't start changing the bmbt blocks until the
662 : * inode owner change is logged so recovery does the right thing in the
663 : * event of a crash. Set the owner change log flags now and leave the
664 : * bmbt scan as the last step.
665 : */
666 0 : if (xfs_has_v3inodes(ip->i_mount)) {
667 0 : if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE)
668 0 : target_log_flags |= XFS_ILOG_DOWNER;
669 0 : if (tip->i_df.if_format == XFS_DINODE_FMT_BTREE)
670 0 : src_log_flags |= XFS_ILOG_DOWNER;
671 : }
672 :
673 : /*
674 : * Swap the data forks of the inodes
675 : */
676 0 : swap(ip->i_df, tip->i_df);
677 :
678 : /*
679 : * Fix the on-disk inode values
680 : */
681 0 : tmp = (uint64_t)ip->i_nblocks;
682 0 : ip->i_nblocks = tip->i_nblocks - taforkblks + aforkblks;
683 0 : tip->i_nblocks = tmp + taforkblks - aforkblks;
684 :
685 : /*
686 : * The extents in the source inode could still contain speculative
687 : * preallocation beyond EOF (e.g. the file is open but not modified
688 : * while defrag is in progress). In that case, we need to copy over the
689 : * number of delalloc blocks the data fork in the source inode is
690 : * tracking beyond EOF so that when the fork is truncated away when the
691 : * temporary inode is unlinked we don't underrun the i_delayed_blks
692 : * counter on that inode.
693 : */
694 0 : ASSERT(tip->i_delayed_blks == 0);
695 0 : tip->i_delayed_blks = ip->i_delayed_blks;
696 0 : ip->i_delayed_blks = 0;
697 :
698 0 : switch (ip->i_df.if_format) {
699 0 : case XFS_DINODE_FMT_EXTENTS:
700 0 : src_log_flags |= XFS_ILOG_DEXT;
701 0 : break;
702 0 : case XFS_DINODE_FMT_BTREE:
703 0 : ASSERT(!xfs_has_v3inodes(ip->i_mount) ||
704 : (src_log_flags & XFS_ILOG_DOWNER));
705 0 : src_log_flags |= XFS_ILOG_DBROOT;
706 0 : break;
707 : }
708 :
709 0 : switch (tip->i_df.if_format) {
710 0 : case XFS_DINODE_FMT_EXTENTS:
711 0 : target_log_flags |= XFS_ILOG_DEXT;
712 0 : break;
713 0 : case XFS_DINODE_FMT_BTREE:
714 0 : target_log_flags |= XFS_ILOG_DBROOT;
715 0 : ASSERT(!xfs_has_v3inodes(ip->i_mount) ||
716 : (target_log_flags & XFS_ILOG_DOWNER));
717 : break;
718 : }
719 :
720 : /* Do we have to swap reflink flags? */
721 0 : if ((ip->i_diflags2 & XFS_DIFLAG2_REFLINK) ^
722 0 : (tip->i_diflags2 & XFS_DIFLAG2_REFLINK)) {
723 0 : uint64_t f;
724 :
725 0 : f = ip->i_diflags2 & XFS_DIFLAG2_REFLINK;
726 0 : ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
727 0 : ip->i_diflags2 |= tip->i_diflags2 & XFS_DIFLAG2_REFLINK;
728 0 : tip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
729 0 : tip->i_diflags2 |= f & XFS_DIFLAG2_REFLINK;
730 : }
731 :
732 : /* Swap the cow forks. */
733 0 : if (xfs_has_reflink(ip->i_mount)) {
734 0 : ASSERT(!ip->i_cowfp ||
735 : ip->i_cowfp->if_format == XFS_DINODE_FMT_EXTENTS);
736 0 : ASSERT(!tip->i_cowfp ||
737 : tip->i_cowfp->if_format == XFS_DINODE_FMT_EXTENTS);
738 :
739 0 : swap(ip->i_cowfp, tip->i_cowfp);
740 :
741 0 : if (ip->i_cowfp && ip->i_cowfp->if_bytes)
742 0 : xfs_inode_set_cowblocks_tag(ip);
743 : else
744 0 : xfs_inode_clear_cowblocks_tag(ip);
745 0 : if (tip->i_cowfp && tip->i_cowfp->if_bytes)
746 0 : xfs_inode_set_cowblocks_tag(tip);
747 : else
748 0 : xfs_inode_clear_cowblocks_tag(tip);
749 : }
750 :
751 0 : xfs_trans_log_inode(*tpp, ip, src_log_flags);
752 0 : xfs_trans_log_inode(*tpp, tip, target_log_flags);
753 :
754 : /*
755 : * The extent forks have been swapped, but crc=1,rmapbt=0 filesystems
756 : * have inode number owner values in the bmbt blocks that still refer to
757 : * the old inode. Scan each bmbt to fix up the owner values with the
758 : * inode number of the current inode.
759 : */
760 0 : if (src_log_flags & XFS_ILOG_DOWNER) {
761 0 : error = xfs_swap_change_owner(tpp, ip, tip);
762 0 : if (error)
763 : return error;
764 : }
765 0 : if (target_log_flags & XFS_ILOG_DOWNER) {
766 0 : error = xfs_swap_change_owner(tpp, tip, ip);
767 0 : if (error)
768 0 : return error;
769 : }
770 :
771 : return 0;
772 : }
773 :
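/*
 * Worked example for the i_nblocks arithmetic above, with illustrative
 * numbers: if ip holds 100 blocks of which 10 are attr fork blocks
 * (aforkblks) and tip holds 40 blocks of which 5 are attr fork blocks
 * (taforkblks), then after the swap ip holds 40 - 5 + 10 = 45 blocks and
 * tip holds 100 + 5 - 10 = 95. The data fork block counts (90 and 35)
 * trade places while each inode keeps its own attr fork blocks.
 */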
774 : /*
775 : * There may be partially written rt extents lurking in the ranges to be
776 : * swapped. According to the rules for realtime files with big rt extents, we
777 : * must guarantee that an outside observer (an IO thread, realistically) never
778 : * can see multiple physical rt extents mapped to the same logical file rt
779 : * extent. The deferred bmap log intent items that we use under the hood
780 : * operate on single block mappings and not rt extents, which means we must
781 : * have a strategy to ensure that log recovery after a failure won't stop in
782 : * the middle of an rt extent.
783 : *
784 : * The preferred strategy is to use deferred extent swap log intent items to
785 : * track the status of the overall swap operation so that we can complete the
786 : * work during crash recovery. If that isn't possible, we fall back to
787 : * requiring the selected mappings in both forks to be aligned to rt extent
788 : * boundaries. As an aside, the old fork swap routine didn't have this
789 : * requirement, but at an extreme cost in flexibility (full files only, and no
790 : * support if rmapbt is enabled).
791 : */
792 : static bool
793 176713 : xfs_xchg_range_need_rt_conversion(
794 : struct xfs_inode *ip,
795 : unsigned int xchg_flags)
796 : {
797 176713 : struct xfs_mount *mp = ip->i_mount;
798 :
799 : /*
800 : * Caller got permission to use logged swapext, so log recovery will
801 : * finish the swap and not leave us with partially swapped rt extents
802 : * exposed to userspace.
803 : */
804 176713 : if (xchg_flags & XFS_XCHG_RANGE_LOGGED)
805 : return false;
806 :
807 : /*
808 : * If we can't use log intent items at all, the only supported
809 : * operation is full fork swaps, so no conversions are needed.
810 : * The range requirements are enforced by the swapext code itself.
811 : */
812 176713 : if (!xfs_swapext_supported(mp))
813 : return false;
814 :
815 : /* Conversion is only needed for realtime files with big rt extents */
816 176713 : return xfs_inode_has_bigrtextents(ip);
817 : }
818 :
819 : /*
820 : * Check the alignment of an exchange request when the allocation unit size
821 : * isn't a power of two. The VFS helpers use (fast) bitmask-based alignment
822 : * checks, but here we have to use slow long division.
823 : */
824 : static int
825 0 : xfs_xchg_range_check_rtalign(
826 : struct xfs_inode *ip1,
827 : struct xfs_inode *ip2,
828 : const struct xfs_exch_range *fxr)
829 : {
830 0 : struct xfs_mount *mp = ip1->i_mount;
831 0 : uint32_t rextbytes;
832 0 : uint64_t length = fxr->length;
833 0 : uint64_t blen;
834 0 : loff_t size1, size2;
835 :
836 0 : rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
837 0 : size1 = i_size_read(VFS_I(ip1));
838 0 : size2 = i_size_read(VFS_I(ip2));
839 :
840 : /* The start of both ranges must be aligned to a rt extent. */
841 0 : if (!isaligned_64(fxr->file1_offset, rextbytes) ||
842 0 : !isaligned_64(fxr->file2_offset, rextbytes))
843 : return -EINVAL;
844 :
845 : /*
846 : * If the caller asked for full files, check that the offset/length
847 : * values cover all of both files.
848 : */
849 0 : if ((fxr->flags & XFS_EXCH_RANGE_FULL_FILES) &&
850 0 : (fxr->file1_offset != 0 || fxr->file2_offset != 0 ||
851 0 : fxr->length != size1 || fxr->length != size2))
852 : return -EDOM;
853 :
854 0 : if (fxr->flags & XFS_EXCH_RANGE_TO_EOF)
855 0 : length = max_t(int64_t, size1 - fxr->file1_offset,
856 : size2 - fxr->file2_offset);
857 :
858 : /*
859 : * If the user wanted us to exchange up to the infile's EOF, round up
860 : * to the next rt extent boundary for this check. Do the same for the
861 : * outfile.
862 : *
863 : * Otherwise, reject the range length if it's not rt extent aligned.
864 : * We already confirmed the starting offsets' rt extent block
865 : * alignment.
866 : */
867 0 : if (fxr->file1_offset + length == size1)
868 0 : blen = roundup_64(size1, rextbytes) - fxr->file1_offset;
869 0 : else if (fxr->file2_offset + length == size2)
870 0 : blen = roundup_64(size2, rextbytes) - fxr->file2_offset;
871 0 : else if (!isaligned_64(length, rextbytes))
872 : return -EINVAL;
873 : else
874 : blen = length;
875 :
876 : /* Don't allow overlapped exchanges within the same file. */
877 0 : if (ip1 == ip2 &&
878 0 : fxr->file2_offset + blen > fxr->file1_offset &&
879 0 : fxr->file1_offset + blen > fxr->file2_offset)
880 : return -EINVAL;
881 :
882 : /*
883 : * Ensure that we don't exchange a partial EOF rt extent into the
884 : * middle of another file.
885 : */
886 0 : if (isaligned_64(length, rextbytes))
887 : return 0;
888 :
889 0 : blen = length;
890 0 : if (fxr->file2_offset + length < size2)
891 0 : blen = rounddown_64(blen, rextbytes);
892 :
893 0 : if (fxr->file1_offset + blen < size1)
894 0 : blen = rounddown_64(blen, rextbytes);
895 :
896 0 : return blen == length ? 0 : -EINVAL;
897 : }
898 :
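/*
 * Worked example for the division-based checks above, with illustrative
 * numbers: a realtime extent size of 3 blocks on a 4096-byte block
 * filesystem gives rextbytes == 12288, which is not a power of two. An
 * offset of 12288 is exactly aligned (12288 % 12288 == 0), yet a bitmask
 * test such as (12288 & (12288 - 1)) evaluates to 8192 and would wrongly
 * flag it as misaligned, which is why isaligned_64(), roundup_64() and
 * rounddown_64() are used here instead of the mask-based VFS helpers.
 */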
899 : /* Prepare two files to have their data exchanged. */
900 : int
901 176871 : xfs_xchg_range_prep(
902 : struct file *file1,
903 : struct file *file2,
904 : struct xfs_exch_range *fxr,
905 : unsigned int xchg_flags)
906 : {
907 176871 : struct xfs_inode *ip1 = XFS_I(file_inode(file1));
908 176871 : struct xfs_inode *ip2 = XFS_I(file_inode(file2));
909 176871 : unsigned int alloc_unit = xfs_inode_alloc_unitsize(ip2);
910 176871 : int error;
911 :
912 176871 : trace_xfs_xchg_range_prep(ip1, fxr, ip2, 0);
913 :
914 : /* Verify both files are either real-time or non-realtime */
915 530605 : if (XFS_IS_REALTIME_INODE(ip1) != XFS_IS_REALTIME_INODE(ip2))
916 : return -EINVAL;
917 :
918 : /* Check non-power of two alignment issues, if necessary. */
919 176875 : if (XFS_IS_REALTIME_INODE(ip2) && !is_power_of_2(alloc_unit)) {
920 0 : error = xfs_xchg_range_check_rtalign(ip1, ip2, fxr);
921 0 : if (error)
922 : return error;
923 :
924 : /* Do the VFS checks with the regular block alignment. */
925 0 : alloc_unit = ip1->i_mount->m_sb.sb_blocksize;
926 : }
927 :
928 176871 : error = xfs_exch_range_prep(file1, file2, fxr, alloc_unit);
929 176871 : if (error || fxr->length == 0)
930 : return error;
931 :
932 : /* Attach dquots to both inodes before changing block maps. */
933 176713 : error = xfs_qm_dqattach(ip2);
934 176713 : if (error)
935 : return error;
936 176713 : error = xfs_qm_dqattach(ip1);
937 176713 : if (error)
938 : return error;
939 :
940 176713 : trace_xfs_xchg_range_flush(ip1, fxr, ip2, 0);
941 :
942 : /* Flush the relevant ranges of both files. */
943 176713 : error = xfs_flush_unmap_range(ip2, fxr->file2_offset, fxr->length);
944 176713 : if (error)
945 : return error;
946 176713 : error = xfs_flush_unmap_range(ip1, fxr->file1_offset, fxr->length);
947 176713 : if (error)
948 : return error;
949 :
950 : /*
951 : * Cancel CoW fork preallocations for the ranges of both files. The
952 : * prep function should have flushed all the dirty data, so the only
953 : * extents remaining should be speculative.
954 : */
955 353426 : if (xfs_inode_has_cow_data(ip1)) {
956 27232 : error = xfs_reflink_cancel_cow_range(ip1, fxr->file1_offset,
957 27232 : fxr->length, true);
958 27232 : if (error)
959 : return error;
960 : }
961 :
962 353426 : if (xfs_inode_has_cow_data(ip2)) {
963 25926 : error = xfs_reflink_cancel_cow_range(ip2, fxr->file2_offset,
964 25926 : fxr->length, true);
965 25926 : if (error)
966 : return error;
967 : }
968 :
969 : /* Convert unwritten sub-extent mappings if required. */
970 176713 : if (xfs_xchg_range_need_rt_conversion(ip2, xchg_flags)) {
971 0 : error = xfs_rtfile_convert_unwritten(ip2, fxr->file2_offset,
972 : fxr->length);
973 0 : if (error)
974 : return error;
975 :
976 0 : error = xfs_rtfile_convert_unwritten(ip1, fxr->file1_offset,
977 : fxr->length);
978 0 : if (error)
979 0 : return error;
980 : }
981 :
982 : return 0;
983 : }
984 :
985 : #define QRETRY_IP1 (0x1)
986 : #define QRETRY_IP2 (0x2)
987 :
988 : /*
989 : * Obtain a quota reservation to make sure we don't hit EDQUOT. We can skip
990 : * this if quota enforcement is disabled or if both inodes' dquots are the
991 : * same. The qretry variable must be initialized to zero before the first
992 : * call to this function.
993 : */
994 : STATIC int
995 176827 : xfs_xchg_range_reserve_quota(
996 : struct xfs_trans *tp,
997 : const struct xfs_swapext_req *req,
998 : unsigned int *qretry)
999 : {
1000 176827 : int64_t ddelta, rdelta;
1001 176827 : int ip1_error = 0;
1002 176827 : int error;
1003 :
1004 : /*
1005 : * Don't bother with a quota reservation if we're not enforcing them
1006 : * or the two inodes have the same dquots.
1007 : */
1008 176827 : if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 ||
1009 6549 : (req->ip1->i_udquot == req->ip2->i_udquot &&
1010 6533 : req->ip1->i_gdquot == req->ip2->i_gdquot &&
1011 6527 : req->ip1->i_pdquot == req->ip2->i_pdquot))
1012 : return 0;
1013 :
1014 22 : *qretry = 0;
1015 :
1016 : /*
1017 : * For each file, compute the net gain in the number of regular blocks
1018 : * that will be mapped into that file and reserve that much quota. The
1019 : * quota counts must be able to absorb at least that much space.
1020 : */
1021 22 : ddelta = req->ip2_bcount - req->ip1_bcount;
1022 22 : rdelta = req->ip2_rtbcount - req->ip1_rtbcount;
1023 22 : if (ddelta > 0 || rdelta > 0) {
1024 8 : error = xfs_trans_reserve_quota_nblks(tp, req->ip1,
1025 : ddelta > 0 ? ddelta : 0,
1026 : rdelta > 0 ? rdelta : 0,
1027 : false);
1028 8 : if (error == -EDQUOT || error == -ENOSPC) {
1029 : /*
1030 : * Save this error and see what happens if we try to
1031 : * reserve quota for ip2. Then report both.
1032 : */
1033 4 : *qretry |= QRETRY_IP1;
1034 4 : ip1_error = error;
1035 4 : error = 0;
1036 : }
1037 8 : if (error)
1038 : return error;
1039 : }
1040 22 : if (ddelta < 0 || rdelta < 0) {
1041 2 : error = xfs_trans_reserve_quota_nblks(tp, req->ip2,
1042 : ddelta < 0 ? -ddelta : 0,
1043 : rdelta < 0 ? -rdelta : 0,
1044 : false);
1045 2 : if (error == -EDQUOT || error == -ENOSPC)
1046 0 : *qretry |= QRETRY_IP2;
1047 2 : if (error)
1048 : return error;
1049 : }
1050 22 : if (ip1_error)
1051 : return ip1_error;
1052 :
1053 : /*
1054 : * For each file, forcibly reserve the gross gain in mapped blocks so
1055 : * that we don't trip over any quota block reservation assertions.
1056 : * We must reserve the gross gain because the quota code subtracts from
1057 : * bcount the number of blocks that we unmap; it does not add that
1058 : * quantity back to the quota block reservation.
1059 : */
1060 18 : error = xfs_trans_reserve_quota_nblks(tp, req->ip1, req->ip1_bcount,
1061 18 : req->ip1_rtbcount, true);
1062 18 : if (error)
1063 : return error;
1064 :
1065 18 : return xfs_trans_reserve_quota_nblks(tp, req->ip2, req->ip2_bcount,
1066 18 : req->ip2_rtbcount, true);
1067 : }
1068 :
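/*
 * Worked example for the net-delta reservation above, with illustrative
 * numbers: if the range leaving file1 maps 40 blocks (ip1_bcount) and the
 * range leaving file2 maps 100 blocks (ip2_bcount), then ddelta == 60 and
 * file1's dquots must be able to absorb 60 additional blocks; a negative
 * delta would instead be charged against file2's dquots.
 */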
1069 : /*
1070 : * Get permission to use log-assisted atomic exchange of file extents.
1071 : *
1072 : * Callers must hold the IOLOCK and MMAPLOCK of both files. They must not be
1073 : * running any transactions or hold any ILOCKS. If @use_logging is set after a
1074 : * successful return, callers must call xfs_xchg_range_rele_log_assist after
1075 : * the exchange is completed.
1076 : */
1077 : int
1078 391812 : xfs_xchg_range_grab_log_assist(
1079 : struct xfs_mount *mp,
1080 : bool force,
1081 : bool *use_logging)
1082 : {
1083 391812 : int error = 0;
1084 :
1085 : /*
1086 : * Protect ourselves from an idle log clearing the atomic swapext
1087 : * log incompat feature bit.
1088 : */
1089 391812 : xlog_use_incompat_feat(mp->m_log, XLOG_INCOMPAT_FEAT_SWAPEXT);
1090 391822 : *use_logging = true;
1091 :
1092 : /*
1093 : * If log-assisted swapping is already enabled, the caller can use the
1094 : * log assisted swap functions with the log-incompat reference we got.
1095 : */
1096 783644 : if (xfs_sb_version_haslogswapext(&mp->m_sb))
1097 : return 0;
1098 :
1099 : /*
1100 : * If the caller doesn't /require/ log-assisted swapping, drop the
1101 : * log-incompat feature protection and exit. The caller cannot use
1102 : * log assisted swapping.
1103 : */
1104 14300 : if (!force)
1105 6463 : goto drop_incompat;
1106 :
1107 : /*
1108 : * Caller requires log-assisted swapping but the fs feature set isn't
1109 : * rich enough to support it. Bail out.
1110 : */
1111 8788 : if (!xfs_swapext_supported(mp)) {
1112 951 : error = -EOPNOTSUPP;
1113 951 : goto drop_incompat;
1114 : }
1115 :
1116 6886 : error = xfs_add_incompat_log_feature(mp,
1117 : XFS_SB_FEAT_INCOMPAT_LOG_SWAPEXT);
1118 6886 : if (error)
1119 0 : goto drop_incompat;
1120 :
1121 6886 : xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SWAPEXT,
1122 : "EXPERIMENTAL atomic file range swap feature in use. Use at your own risk!");
1123 :
1124 : return 0;
1125 7414 : drop_incompat:
1126 7414 : xlog_drop_incompat_feat(mp->m_log, XLOG_INCOMPAT_FEAT_SWAPEXT);
1127 7414 : *use_logging = false;
1128 7414 : return error;
1129 : }
1130 :
1131 : /* Release permission to use log-assisted extent swapping. */
1132 : void
1133 214045 : xfs_xchg_range_rele_log_assist(
1134 : struct xfs_mount *mp)
1135 : {
1136 384409 : xlog_drop_incompat_feat(mp->m_log, XLOG_INCOMPAT_FEAT_SWAPEXT);
1137 170364 : }
1138 :
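/*
 * Illustrative pairing of the two helpers above (error handling trimmed),
 * mirroring the use in xfs_file_xchg_range() earlier in this file; assumes
 * mp, fxr, and error are in scope in the caller:
 */
#if 0
	bool use_logging = false;

	error = xfs_xchg_range_grab_log_assist(mp,
			!(fxr->flags & XFS_EXCH_RANGE_NONATOMIC), &use_logging);
	if (error)
		return error;
	/* ... perform the exchange under the log-incompat reference ... */
	if (use_logging)
		xfs_xchg_range_rele_log_assist(mp);
#endif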
1139 : /* Decide if we can use the old data fork exchange code. */
1140 : static inline bool
1141 2 : xfs_xchg_use_forkswap(
1142 : const struct xfs_exch_range *fxr,
1143 : struct xfs_inode *ip1,
1144 : struct xfs_inode *ip2)
1145 : {
1146 2 : if (!(fxr->flags & XFS_EXCH_RANGE_NONATOMIC))
1147 : return false;
1148 2 : if (!(fxr->flags & XFS_EXCH_RANGE_FULL_FILES))
1149 : return false;
1150 0 : if (fxr->flags & XFS_EXCH_RANGE_TO_EOF)
1151 : return false;
1152 0 : if (fxr->file1_offset != 0 || fxr->file2_offset != 0)
1153 : return false;
1154 0 : if (fxr->length != ip1->i_disk_size)
1155 : return false;
1156 0 : if (fxr->length != ip2->i_disk_size)
1157 0 : return false;
1158 : return true;
1159 : }
1160 :
1161 : enum xchg_strategy {
1162 : SWAPEXT = 1, /* xfs_swapext() */
1163 : FORKSWAP = 2, /* exchange forks */
1164 : };
1165 :
1166 : /* Exchange the contents of two files. */
1167 : int
1168 176827 : xfs_xchg_range(
1169 : struct xfs_inode *ip1,
1170 : struct xfs_inode *ip2,
1171 : const struct xfs_exch_range *fxr,
1172 : unsigned int xchg_flags)
1173 : {
1174 176827 : struct xfs_mount *mp = ip1->i_mount;
1175 176827 : struct xfs_swapext_req req = {
1176 : .ip1 = ip1,
1177 : .ip2 = ip2,
1178 : .whichfork = XFS_DATA_FORK,
1179 176827 : .startoff1 = XFS_B_TO_FSBT(mp, fxr->file1_offset),
1180 176827 : .startoff2 = XFS_B_TO_FSBT(mp, fxr->file2_offset),
1181 176827 : .blockcount = XFS_B_TO_FSB(mp, fxr->length),
1182 : };
1183 176827 : struct xfs_trans *tp;
1184 176827 : unsigned int qretry;
1185 176827 : unsigned int flags = 0;
1186 176827 : bool retried = false;
1187 176827 : enum xchg_strategy strategy;
1188 176827 : int error;
1189 :
1190 176827 : trace_xfs_xchg_range(ip1, fxr, ip2, xchg_flags);
1191 :
1192 176827 : if (fxr->flags & XFS_EXCH_RANGE_TO_EOF)
1193 68 : req.req_flags |= XFS_SWAP_REQ_SET_SIZES;
1194 176827 : if (fxr->flags & XFS_EXCH_RANGE_FILE1_WRITTEN)
1195 12 : req.req_flags |= XFS_SWAP_REQ_INO1_WRITTEN;
1196 176827 : if (xchg_flags & XFS_XCHG_RANGE_LOGGED)
1197 170364 : req.req_flags |= XFS_SWAP_REQ_LOGGED;
1198 :
1199 : /*
1200 : * Round the request length up to the nearest fundamental unit of
1201 : * allocation. The prep function already checked that the request
1202 : * offsets and length in @fxr are safe to round up.
1203 : */
1204 176827 : if (XFS_IS_REALTIME_INODE(ip2))
1205 2 : req.blockcount = roundup_64(req.blockcount,
1206 : mp->m_sb.sb_rextsize);
1207 :
1208 176827 : error = xfs_xchg_range_estimate(&req);
1209 176827 : if (error)
1210 : return error;
1211 :
1212 : /*
1213 : * We haven't decided which exchange strategy we want to use yet, but
1214 : * here we must choose if we want freed blocks during the swap to be
1215 : * added to the transaction block reservation (RES_FDBLKS) or freed
1216 : * into the global fdblocks. The legacy fork swap mechanism doesn't
1217 : * free any blocks, so it doesn't require it. It is also the only
1218 : * option that works for older filesystems.
1219 : *
1220 : * The bmap log intent items that were added with rmap and reflink can
1221 : * change the bmbt shape, so the intent-based swap strategies require
1222 : * us to set RES_FDBLKS.
1223 : */
1224 176825 : if (xfs_has_lazysbcount(mp))
1225 176825 : flags |= XFS_TRANS_RES_FDBLKS;
1226 :
1227 176825 : retry:
1228 : /* Allocate the transaction, lock the inodes, and join them. */
1229 176827 : error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, req.resblks, 0,
1230 : flags, &tp);
1231 176827 : if (error)
1232 0 : return error;
1233 :
1234 176827 : xfs_xchg_range_ilock(tp, ip1, ip2);
1235 :
1236 176827 : trace_xfs_swap_extent_before(ip2, 0);
1237 176827 : trace_xfs_swap_extent_before(ip1, 1);
1238 :
1239 176827 : if (fxr->flags & XFS_EXCH_RANGE_FILE2_FRESH)
1240 6543 : trace_xfs_xchg_range_freshness(ip2, fxr);
1241 :
1242 : /*
1243 : * Now that we've excluded all other inode metadata changes by taking
1244 : * the ILOCK, repeat the freshness check.
1245 : */
1246 176827 : error = xfs_exch_range_check_fresh(VFS_I(ip2), fxr);
1247 176827 : if (error)
1248 0 : goto out_trans_cancel;
1249 :
1250 176827 : error = xfs_swapext_check_extents(mp, &req);
1251 176827 : if (error)
1252 0 : goto out_trans_cancel;
1253 :
1254 : /*
1255 : * Reserve ourselves some quota if any of them are in enforcing mode.
1256 : * In theory we only need enough to satisfy the change in the number
1257 : * of blocks between the two ranges being remapped.
1258 : */
1259 176827 : error = xfs_xchg_range_reserve_quota(tp, &req, &qretry);
1260 176827 : if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
1261 2 : xfs_trans_cancel(tp);
1262 2 : xfs_xchg_range_iunlock(ip1, ip2);
1263 2 : if (qretry & QRETRY_IP1)
1264 2 : xfs_blockgc_free_quota(ip1, 0);
1265 2 : if (qretry & QRETRY_IP2)
1266 0 : xfs_blockgc_free_quota(ip2, 0);
1267 2 : retried = true;
1268 2 : goto retry;
1269 : }
1270 176825 : if (error)
1271 2 : goto out_trans_cancel;
1272 :
1273 183282 : if ((xchg_flags & XFS_XCHG_RANGE_LOGGED) || xfs_swapext_supported(mp)) {
1274 : /*
1275 : * xfs_swapext() uses deferred bmap log intent items to swap
1276 : * extents between file forks. If the atomic log swap feature
1277 : * is enabled, it will also use swapext log intent items to
1278 : * restart the operation in case of failure.
1279 : *
1280 : * This means that we can use it if we previously obtained
1281 : * permission from the log to use log-assisted atomic extent
1282 : * swapping; or if the fs supports rmap or reflink and the
1283 : * user said NONATOMIC.
1284 : */
1285 : strategy = SWAPEXT;
1286 2 : } else if (xfs_xchg_use_forkswap(fxr, ip1, ip2)) {
1287 : /*
1288 : * Exchange the file contents by using the old bmap fork
1289 : * exchange code, if we're a defrag tool doing a full file
1290 : * swap.
1291 : */
1292 0 : strategy = FORKSWAP;
1293 :
1294 0 : error = xfs_swap_extents_check_format(ip2, ip1);
1295 0 : if (error) {
1296 0 : xfs_notice(mp,
1297 : "%s: inode 0x%llx format is incompatible for exchanging.",
1298 : __func__, ip2->i_ino);
1299 0 : goto out_trans_cancel;
1300 : }
1301 : } else {
1302 : /* We cannot exchange the file contents. */
1303 2 : error = -EOPNOTSUPP;
1304 2 : goto out_trans_cancel;
1305 : }
1306 :
1307 : /* If we got this far on a dry run, all parameters are ok. */
1308 176821 : if (fxr->flags & XFS_EXCH_RANGE_DRY_RUN)
1309 114 : goto out_trans_cancel;
1310 :
1311 : /* Update the mtime and ctime of both files. */
1312 176707 : if (xchg_flags & XFS_XCHG_RANGE_UPD_CMTIME1)
1313 176707 : xfs_trans_ichgtime(tp, ip1,
1314 : XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1315 176707 : if (xchg_flags & XFS_XCHG_RANGE_UPD_CMTIME2)
1316 170728 : xfs_trans_ichgtime(tp, ip2,
1317 : XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1318 :
1319 176707 : if (strategy == SWAPEXT) {
1320 176707 : xfs_swapext(tp, &req);
1321 : } else {
1322 0 : error = xfs_swap_extent_forks(&tp, &req);
1323 0 : if (error)
1324 0 : goto out_trans_cancel;
1325 : }
1326 :
1327 : /*
1328 : * Force the log to persist metadata updates if the caller or the
1329 : * administrator requires this. The VFS prep function already flushed
1330 : * the relevant parts of the page cache.
1331 : */
1332 176707 : if (xfs_has_wsync(mp) || (fxr->flags & XFS_EXCH_RANGE_FSYNC))
1333 5966 : xfs_trans_set_sync(tp);
1334 :
1335 176707 : error = xfs_trans_commit(tp);
1336 :
1337 176707 : trace_xfs_swap_extent_after(ip2, 0);
1338 176707 : trace_xfs_swap_extent_after(ip1, 1);
1339 :
1340 176707 : if (error)
1341 12 : goto out_unlock;
1342 :
1343 : /*
1344 : * If the caller wanted us to exchange the contents of two complete
1345 : * files of unequal length, exchange the incore sizes now. This should
1346 : * be safe because we flushed both files' page caches, moved all the
1347 : * extents, and updated the ondisk sizes.
1348 : */
1349 176695 : if (fxr->flags & XFS_EXCH_RANGE_TO_EOF) {
1350 66 : loff_t temp;
1351 :
1352 66 : temp = i_size_read(VFS_I(ip2));
1353 66 : i_size_write(VFS_I(ip2), i_size_read(VFS_I(ip1)));
1354 66 : i_size_write(VFS_I(ip1), temp);
1355 : }
1356 :
1357 176629 : out_unlock:
1358 176825 : xfs_xchg_range_iunlock(ip1, ip2);
1359 176825 : return error;
1360 :
1361 118 : out_trans_cancel:
1362 118 : xfs_trans_cancel(tp);
1363 118 : goto out_unlock;
1364 : }
|