1 : // SPDX-License-Identifier: GPL-2.0-or-later
2 : /*
3 : * Copyright (C) 2020-2023 Oracle. All Rights Reserved.
4 : * Author: Darrick J. Wong <djwong@kernel.org>
5 : *
6 : * The xfs_swap_extent_* functions are:
7 : * Copyright (c) 2000-2006 Silicon Graphics, Inc.
8 : * Copyright (c) 2012 Red Hat, Inc.
9 : * All Rights Reserved.
10 : */
11 : #include "xfs.h"
12 : #include "xfs_shared.h"
13 : #include "xfs_format.h"
14 : #include "xfs_log_format.h"
15 : #include "xfs_trans_resv.h"
16 : #include "xfs_mount.h"
17 : #include "xfs_defer.h"
18 : #include "xfs_inode.h"
19 : #include "xfs_trans.h"
20 : #include "xfs_quota.h"
21 : #include "xfs_bmap_util.h"
22 : #include "xfs_bmap_btree.h"
23 : #include "xfs_reflink.h"
24 : #include "xfs_trace.h"
25 : #include "xfs_swapext.h"
26 : #include "xfs_xchgrange.h"
27 : #include "xfs_sb.h"
28 : #include "xfs_icache.h"
29 : #include "xfs_log.h"
30 : #include "xfs_rtalloc.h"
31 : #include <linux/fsnotify.h>
32 :
33 : /*
34 : * Generic code for exchanging ranges of two files via XFS_IOC_EXCHANGE_RANGE.
35 : * This part does not deal with XFS-specific data structures, and may some day
36 : * be ported to the VFS.
37 : *
38 : * The goal is to exchange fxr.length bytes starting at fxr.file1_offset in
39 : * file1 with the same number of bytes starting at fxr.file2_offset in file2.
40 : * Implementations must call xfs_exch_range_prep to prepare the two files
41 : * prior to taking locks; they must call xfs_exch_range_check_fresh once
42 : * the inode is locked to abort the call if file2 has changed; and they must
43 : * update the inode change and mod times of both files as part of the metadata
44 : * update. The timestamp updates must be done atomically as part of the data
45 : * exchange operation to ensure correctness of the freshness check.
46 : */
47 :
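To make the contract above concrete, here is a hedged userspace sketch of a whole-file exchange via XFS_IOC_EXCHANGE_RANGE. The flag, offset, and length fields are the ones this file reads from struct xfs_exch_range; the file1_fd field name, the xfs/xfs_fs.h header, and the convention of issuing the ioctl on file2's descriptor are assumptions about the uapi plumbing, which lives outside this file.

/* Hypothetical userspace sketch; see the assumptions noted above. */
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <xfs/xfs_fs.h>         /* assumed to declare the ioctl and struct */

static int exchange_whole_files(int fd1, int fd2)
{
        struct xfs_exch_range   fxr = { 0 };
        struct stat             st1, st2;

        if (fstat(fd1, &st1) || fstat(fd2, &st2))
                return -1;
        if (st1.st_size != st2.st_size)
                return -1;      /* FULL_FILES requires equal sizes */

        fxr.file1_fd = fd1;     /* assumed field name */
        fxr.file1_offset = 0;
        fxr.file2_offset = 0;
        fxr.length = st1.st_size;
        fxr.flags = XFS_EXCH_RANGE_FULL_FILES;

        /* Assumed convention: the ioctl is issued against file2. */
        return ioctl(fd2, XFS_IOC_EXCHANGE_RANGE, &fxr);
}

A failed call sets errno to the codes produced below, e.g. EDOM when a FULL_FILES request does not cover both files exactly, or EINVAL for misaligned offsets.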
48 : /*
49 : * Check that both files' metadata agree with the snapshot that we took for
50 : * the range exchange request.
51 : *
52 : * This should be called after the filesystem has locked /all/ inode metadata
53 : * against modification.
54 : */
55 : STATIC int
56 2485193 : xfs_exch_range_check_fresh(
57 : struct inode *inode2,
58 : const struct xfs_exch_range *fxr)
59 : {
60 : /* Check that file2 hasn't otherwise been modified. */
61 2485193 : if ((fxr->flags & XFS_EXCH_RANGE_FILE2_FRESH) &&
62 63416 : (fxr->file2_ino != inode2->i_ino ||
63 63416 : fxr->file2_ctime != inode2->i_ctime.tv_sec ||
64 63415 : fxr->file2_ctime_nsec != inode2->i_ctime.tv_nsec ||
65 63398 : fxr->file2_mtime != inode2->i_mtime.tv_sec ||
66 63398 : fxr->file2_mtime_nsec != inode2->i_mtime.tv_nsec))
67 18 : return -EBUSY;
68 :
69 : return 0;
70 : }
71 :
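The five file2_* fields compared above are a snapshot that the caller takes before the exchange starts. Below is a minimal sketch of capturing that snapshot with fstat(), assuming userspace fills the fields itself (the uapi plumbing is not part of this file) and that struct xfs_exch_range and the flag come from xfs/xfs_fs.h.

#include <sys/stat.h>
#include <xfs/xfs_fs.h>         /* assumed header for the struct and flag */

/* Hypothetical helper: record file2's identity and timestamps. */
static int snapshot_file2_freshness(int fd2, struct xfs_exch_range *fxr)
{
        struct stat     st;

        if (fstat(fd2, &st))
                return -1;

        /* These are exactly the fields xfs_exch_range_check_fresh() compares. */
        fxr->file2_ino = st.st_ino;
        fxr->file2_ctime = st.st_ctim.tv_sec;
        fxr->file2_ctime_nsec = st.st_ctim.tv_nsec;
        fxr->file2_mtime = st.st_mtim.tv_sec;
        fxr->file2_mtime_nsec = st.st_mtim.tv_nsec;
        fxr->flags |= XFS_EXCH_RANGE_FILE2_FRESH;
        return 0;
}

If file2 is modified between this snapshot and the locked re-check, the exchange fails with -EBUSY instead of clobbering the new contents.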
72 : /* Performs necessary checks before doing a range exchange. */
73 : STATIC int
74 1242836 : xfs_exch_range_checks(
75 : struct file *file1,
76 : struct file *file2,
77 : struct xfs_exch_range *fxr,
78 : unsigned int blocksize)
79 : {
80 1242836 : struct inode *inode1 = file1->f_mapping->host;
81 1242836 : struct inode *inode2 = file2->f_mapping->host;
82 1242836 : uint64_t blkmask = blocksize - 1;
83 1242836 : int64_t test_len;
84 1242836 : uint64_t blen;
85 1242836 : loff_t size1, size2;
86 1242836 : int error;
87 :
88 : /* Don't touch certain kinds of inodes */
89 1242836 : if (IS_IMMUTABLE(inode1) || IS_IMMUTABLE(inode2))
90 : return -EPERM;
91 1242830 : if (IS_SWAPFILE(inode1) || IS_SWAPFILE(inode2))
92 : return -ETXTBSY;
93 :
94 1242819 : size1 = i_size_read(inode1);
95 1242819 : size2 = i_size_read(inode2);
96 :
97 : /* Ranges cannot start after EOF. */
98 1242819 : if (fxr->file1_offset > size1 || fxr->file2_offset > size2)
99 : return -EINVAL;
100 :
101 : /*
102 : * If the caller asked for full files, check that the offset/length
103 : * values cover all of both files.
104 : */
105 1242795 : if ((fxr->flags & XFS_EXCH_RANGE_FULL_FILES) &&
106 31488 : (fxr->file1_offset != 0 || fxr->file2_offset != 0 ||
107 31488 : fxr->length != size1 || fxr->length != size2))
108 : return -EDOM;
109 :
110 : /*
111 : * If the caller said to exchange to EOF, we set the length of the
112 : * request large enough to cover everything to the end of both files.
113 : */
114 1242780 : if (fxr->flags & XFS_EXCH_RANGE_TO_EOF)
115 225 : fxr->length = max_t(int64_t, size1 - fxr->file1_offset,
116 : size2 - fxr->file2_offset);
117 :
118 : /* The start of both ranges must be aligned to an fs block. */
119 1242780 : if (!IS_ALIGNED(fxr->file1_offset, blocksize) ||
120 1242774 : !IS_ALIGNED(fxr->file2_offset, blocksize))
121 : return -EINVAL;
122 :
123 : /* Ensure offsets don't wrap. */
124 1242774 : if (fxr->file1_offset + fxr->length < fxr->file1_offset ||
125 1242774 : fxr->file2_offset + fxr->length < fxr->file2_offset)
126 : return -EINVAL;
127 :
128 : /*
129 : * We require both ranges to be within EOF, unless we're exchanging
130 : * to EOF. xfs_xchg_range_prep already checked that both
131 : * fxr->file1_offset and fxr->file2_offset are within EOF.
132 : */
133 1242774 : if (!(fxr->flags & XFS_EXCH_RANGE_TO_EOF) &&
134 1242549 : (fxr->file1_offset + fxr->length > size1 ||
135 1242537 : fxr->file2_offset + fxr->length > size2))
136 : return -EINVAL;
137 :
138 : /*
139 : * Make sure we don't hit any file size limits. If we hit any size
140 : * limits such that test_len was adjusted, we abort the whole
141 : * operation.
142 : */
143 1242756 : test_len = fxr->length;
144 1242756 : error = generic_write_check_limits(file2, fxr->file2_offset, &test_len);
145 1242761 : if (error)
146 : return error;
147 1242760 : error = generic_write_check_limits(file1, fxr->file1_offset, &test_len);
148 1242759 : if (error)
149 : return error;
150 1242759 : if (test_len != fxr->length)
151 : return -EINVAL;
152 :
153 : /*
154 : * If the user wanted us to exchange up to the infile's EOF, round up
155 : * to the next block boundary for this check. Do the same for the
156 : * outfile.
157 : *
158 : * Otherwise, reject the range length if it's not block aligned. We
159 : * already confirmed the starting offsets' block alignment.
160 : */
161 1242748 : if (fxr->file1_offset + fxr->length == size1)
162 32346 : blen = ALIGN(size1, blocksize) - fxr->file1_offset;
163 1210402 : else if (fxr->file2_offset + fxr->length == size2)
164 6556 : blen = ALIGN(size2, blocksize) - fxr->file2_offset;
165 1203846 : else if (!IS_ALIGNED(fxr->length, blocksize))
166 : return -EINVAL;
167 : else
168 : blen = fxr->length;
169 :
170 : /* Don't allow overlapped exchanges within the same file. */
171 1242748 : if (inode1 == inode2 &&
172 1210957 : fxr->file2_offset + blen > fxr->file1_offset &&
173 527257 : fxr->file1_offset + blen > fxr->file2_offset)
174 : return -EINVAL;
175 :
176 : /* If we already failed the freshness check, we're done. */
177 1242732 : error = xfs_exch_range_check_fresh(inode2, fxr);
178 1242732 : if (error)
179 : return error;
180 :
181 : /*
182 : * Ensure that we don't exchange a partial EOF block into the middle of
183 : * another file.
184 : */
185 1242712 : if ((fxr->length & blkmask) == 0)
186 : return 0;
187 :
188 4550 : blen = fxr->length;
189 4550 : if (fxr->file2_offset + blen < size2)
190 6 : blen &= ~blkmask;
191 :
192 4550 : if (fxr->file1_offset + blen < size1)
193 12 : blen &= ~blkmask;
194 :
195 4550 : return blen == fxr->length ? 0 : -EINVAL;
196 : }
197 :
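The last three checks above encode a rule that is easy to miss: a block-unaligned request length is tolerated only when the ragged tail is the EOF block of both files. Here is a standalone re-expression of just that rule, assuming a power-of-two block size as the helper above does.

#include <stdbool.h>
#include <stdint.h>

/* Model of the partial-EOF-block rule; not kernel code. */
static bool tail_block_ok(uint64_t off1, uint64_t off2, uint64_t length,
                          uint64_t size1, uint64_t size2, uint64_t blocksize)
{
        uint64_t        blkmask = blocksize - 1;
        uint64_t        blen = length;

        if ((length & blkmask) == 0)
                return true;            /* fully block aligned: always fine */

        if (off2 + blen < size2)        /* tail would land inside file2 */
                blen &= ~blkmask;
        if (off1 + blen < size1)        /* tail would land inside file1 */
                blen &= ~blkmask;

        /* Any trimming means the tail was not at EOF, so reject. */
        return blen == length;
}

For example, with 4096-byte blocks a 6000-byte exchange passes only if offset + 6000 reaches EOF in both files; otherwise the partial block would be spliced into the middle of the other file.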
198 : /*
199 : * Check that the two inodes are eligible for range exchanges, the ranges make
200 : * sense, and then flush all dirty data. Caller must ensure that the inodes
201 : * have been locked against any other modifications.
202 : */
203 : int
204 1242843 : xfs_exch_range_prep(
205 : struct file *file1,
206 : struct file *file2,
207 : struct xfs_exch_range *fxr,
208 : unsigned int blocksize)
209 : {
210 1242843 : struct inode *inode1 = file_inode(file1);
211 1242843 : struct inode *inode2 = file_inode(file2);
212 1242843 : bool same_inode = (inode1 == inode2);
213 1242843 : int error;
214 :
215 : /* Check that we don't violate system file offset limits. */
216 1242843 : error = xfs_exch_range_checks(file1, file2, fxr, blocksize);
217 1242837 : if (error || fxr->length == 0)
218 : return error;
219 :
220 : /* Wait for the completion of any pending IOs on both files */
221 1242035 : inode_dio_wait(inode1);
222 1242032 : if (!same_inode)
223 31761 : inode_dio_wait(inode2);
224 :
225 1242032 : error = filemap_write_and_wait_range(inode1->i_mapping,
226 : fxr->file1_offset,
227 1242032 : fxr->file1_offset + fxr->length - 1);
228 1242033 : if (error)
229 : return error;
230 :
231 1242033 : error = filemap_write_and_wait_range(inode2->i_mapping,
232 : fxr->file2_offset,
233 1242033 : fxr->file2_offset + fxr->length - 1);
234 1242043 : if (error)
235 : return error;
236 :
237 : /*
238 : * If the files or inodes involved require synchronous writes, amend
239 : * the request to force the filesystem to flush all data and metadata
240 : * to disk after the operation completes.
241 : */
242 1242043 : if (((file1->f_flags | file2->f_flags) & (__O_SYNC | O_DSYNC)) ||
243 1218028 : IS_SYNC(inode1) || IS_SYNC(inode2))
244 24017 : fxr->flags |= XFS_EXCH_RANGE_FSYNC;
245 :
246 : return 0;
247 : }
248 :
249 : /*
250 : * Finish a range exchange operation, if it was successful. Caller must ensure
251 : * that the inodes are still locked against any other modifications.
252 : */
253 : int
254 1242164 : xfs_exch_range_finish(
255 : struct file *file1,
256 : struct file *file2)
257 : {
258 1242164 : int error;
259 :
260 1242164 : error = file_remove_privs(file1);
261 1242159 : if (error)
262 : return error;
263 1242159 : if (file_inode(file1) == file_inode(file2))
264 : return 0;
265 :
266 31596 : return file_remove_privs(file2);
267 : }
268 :
269 : /* Decide if it's ok to remap the selected range of a given file. */
270 : STATIC int
271 2485682 : xfs_exch_range_verify_area(
272 : struct file *file,
273 : loff_t pos,
274 : struct xfs_exch_range *fxr)
275 : {
276 2485682 : int64_t len = fxr->length;
277 :
278 2485682 : if (pos < 0)
279 : return -EINVAL;
280 :
281 2485682 : if (fxr->flags & XFS_EXCH_RANGE_TO_EOF)
282 450 : len = min_t(int64_t, len, i_size_read(file_inode(file)) - pos);
283 2485682 : return remap_verify_area(file, pos, len, true);
284 : }
285 :
286 : /* Prepare for and exchange parts of two files. */
287 : static inline int
288 3531266 : __xfs_exch_range(
289 : struct file *file1,
290 : struct file *file2,
291 : struct xfs_exch_range *fxr)
292 : {
293 3531266 : struct inode *inode1 = file_inode(file1);
294 3531266 : struct inode *inode2 = file_inode(file2);
295 3531266 : int ret;
296 :
297 7062578 : if ((fxr->flags & ~XFS_EXCH_RANGE_ALL_FLAGS) ||
298 3531266 : memchr_inv(&fxr->pad, 0, sizeof(fxr->pad)))
299 0 : return -EINVAL;
300 :
301 3531312 : if ((fxr->flags & XFS_EXCH_RANGE_FULL_FILES) &&
302 : (fxr->flags & XFS_EXCH_RANGE_TO_EOF))
303 : return -EINVAL;
304 :
305 : /*
306 : * The ioctl enforces that src and dest files are on the same mount.
307 : * However, they only need to be on the same file system.
308 : */
309 3531312 : if (inode1->i_sb != inode2->i_sb)
310 : return -EXDEV;
311 :
312 : /* This only works for regular files. */
313 3531312 : if (S_ISDIR(inode1->i_mode) || S_ISDIR(inode2->i_mode))
314 : return -EISDIR;
315 3531306 : if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
316 : return -EINVAL;
317 :
318 3531306 : ret = generic_file_rw_checks(file1, file2);
319 3531226 : if (ret < 0)
320 : return ret;
321 :
322 3531220 : ret = generic_file_rw_checks(file2, file1);
323 3531263 : if (ret < 0)
324 : return ret;
325 :
326 1242844 : ret = xfs_exch_range_verify_area(file1, fxr->file1_offset, fxr);
327 1242838 : if (ret)
328 : return ret;
329 :
330 1242838 : ret = xfs_exch_range_verify_area(file2, fxr->file2_offset, fxr);
331 1242841 : if (ret)
332 : return ret;
333 :
334 1242841 : ret = xfs_file_xchg_range(file1, file2, fxr);
335 1242843 : if (ret)
336 : return ret;
337 :
338 1242164 : fsnotify_modify(file1);
339 1242160 : if (file2 != file1)
340 31606 : fsnotify_modify(file2);
341 : return 0;
342 : }
343 :
344 : /* Exchange parts of two files. */
345 : int
346 3531295 : xfs_exch_range(
347 : struct file *file1,
348 : struct file *file2,
349 : struct xfs_exch_range *fxr)
350 : {
351 3531295 : int error;
352 :
353 3531295 : file_start_write(file2);
354 3531262 : error = __xfs_exch_range(file1, file2, fxr);
355 3531236 : file_end_write(file2);
356 3531267 : return error;
357 : }
358 :
359 : /* XFS-specific parts of XFS_IOC_EXCHANGE_RANGE */
360 :
361 : /*
362 : * Exchanging ranges as a file operation. This is the binding between the
363 : * VFS-level concepts and the XFS-specific implementation.
364 : */
365 : int
366 1242842 : xfs_file_xchg_range(
367 : struct file *file1,
368 : struct file *file2,
369 : struct xfs_exch_range *fxr)
370 : {
371 1242842 : struct inode *inode1 = file_inode(file1);
372 1242842 : struct inode *inode2 = file_inode(file2);
373 1242842 : struct xfs_inode *ip1 = XFS_I(inode1);
374 1242842 : struct xfs_inode *ip2 = XFS_I(inode2);
375 1242842 : struct xfs_mount *mp = ip1->i_mount;
376 1242842 : unsigned int priv_flags = 0;
377 1242842 : bool use_logging = false;
378 1242842 : int error;
379 :
380 2485684 : if (xfs_is_shutdown(mp))
381 : return -EIO;
382 :
383 : /* Update cmtime if the fd/inode don't forbid it. */
384 1242842 : if (likely(!(file1->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode1)))
385 1242842 : priv_flags |= XFS_XCHG_RANGE_UPD_CMTIME1;
386 1242842 : if (likely(!(file2->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode2)))
387 1213736 : priv_flags |= XFS_XCHG_RANGE_UPD_CMTIME2;
388 :
389 : /* Lock both files against IO */
390 1242842 : error = xfs_ilock2_io_mmap(ip1, ip2);
391 1242843 : if (error)
392 0 : goto out_err;
393 :
394 : /* Prepare and then exchange file contents. */
395 1242843 : error = xfs_xchg_range_prep(file1, file2, fxr, priv_flags);
396 1242841 : if (error)
397 139 : goto out_unlock;
398 :
399 : /* Get permission to use log-assisted file content swaps. */
400 1242702 : error = xfs_xchg_range_grab_log_assist(mp,
401 1242702 : !(fxr->flags & XFS_EXCH_RANGE_NONATOMIC),
402 : &use_logging);
403 1242706 : if (error)
404 242 : goto out_unlock;
405 1242464 : if (use_logging)
406 1210845 : priv_flags |= XFS_XCHG_RANGE_LOGGED;
407 :
408 1242464 : error = xfs_xchg_range(ip1, ip2, fxr, priv_flags);
409 1242464 : if (error)
410 297 : goto out_drop_feat;
411 :
412 : /*
413 : * Finish the exchange by removing special file privileges like any
414 : * other file write would do. This may involve turning on support for
415 : * logged xattrs if either file has security capabilities, which is why
416 : * xfs_xchg_range_grab_log_assist must run before xfs_attr_grab_log_assist.
417 : */
418 1242167 : error = xfs_exch_range_finish(file1, file2);
419 1242158 : if (error)
420 0 : goto out_drop_feat;
421 :
422 1242158 : out_drop_feat:
423 1242455 : if (use_logging)
424 1210836 : xfs_xchg_range_rele_log_assist(mp);
425 31619 : out_unlock:
426 1242842 : xfs_iunlock2_io_mmap(ip1, ip2);
427 1242843 : out_err:
428 1242843 : if (error)
429 678 : trace_xfs_file_xchg_range_error(ip2, error, _RET_IP_);
430 : return error;
431 : }
432 :
433 : /* Lock (and optionally join) two inodes for a file range exchange. */
434 : void
435 7119074 : xfs_xchg_range_ilock(
436 : struct xfs_trans *tp,
437 : struct xfs_inode *ip1,
438 : struct xfs_inode *ip2)
439 : {
440 7119074 : if (ip1 != ip2)
441 4697566 : xfs_lock_two_inodes(ip1, XFS_ILOCK_EXCL,
442 : ip2, XFS_ILOCK_EXCL);
443 : else
444 2421508 : xfs_ilock(ip1, XFS_ILOCK_EXCL);
445 7117684 : if (tp) {
446 5815540 : xfs_trans_ijoin(tp, ip1, 0);
447 5817149 : if (ip2 != ip1)
448 4606391 : xfs_trans_ijoin(tp, ip2, 0);
449 : }
450 :
451 7119663 : }
452 :
453 : /* Unlock two inodes after a file range exchange operation. */
454 : void
455 2544625 : xfs_xchg_range_iunlock(
456 : struct xfs_inode *ip1,
457 : struct xfs_inode *ip2)
458 : {
459 2544625 : if (ip2 != ip1)
460 123110 : xfs_iunlock(ip2, XFS_ILOCK_EXCL);
461 2544626 : xfs_iunlock(ip1, XFS_ILOCK_EXCL);
462 2544620 : }
463 :
464 : /*
465 : * Estimate the resource requirements to exchange file contents between the two
466 : * files. The caller is required to hold the IOLOCK and the MMAPLOCK and to
467 : * have flushed both inodes' pagecache and active direct-ios.
468 : */
469 : int
470 1302126 : xfs_xchg_range_estimate(
471 : struct xfs_swapext_req *req)
472 : {
473 1302126 : int error;
474 :
475 1302126 : xfs_xchg_range_ilock(NULL, req->ip1, req->ip2);
476 1302128 : error = xfs_swapext_estimate(req);
477 1302127 : xfs_xchg_range_iunlock(req->ip1, req->ip2);
478 1302122 : return error;
479 : }
480 :
481 : /*
482 : * We need to check that the format of the data fork in the temporary inode is
483 : * valid for the target inode before doing the swap. This is not a problem with
484 : * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
485 : * data fork depending on the space the attribute fork is taking so we can get
486 : * invalid formats on the target inode.
487 : *
488 : * E.g. target has space for 7 extents in extent format, temp inode only has
489 : * space for 6. If we defragment down to 7 extents, then the tmp format is a
490 : * btree, but when swapped it needs to be in extent format. Hence we can't just
491 : * blindly swap data forks on attr2 filesystems.
492 : *
493 : * Note that we check the swap in both directions so that we don't end up with
494 : * a corrupt temporary inode, either.
495 : *
496 : * Note that fixing the way xfs_fsr sets up the attribute fork in the source
497 : * inode will prevent this situation from occurring, so all we do here is
498 : * reject and log the attempt. Basically, we are putting the responsibility on
499 : * userspace to get this right.
500 : */
501 : STATIC int
502 15580 : xfs_swap_extents_check_format(
503 : struct xfs_inode *ip, /* target inode */
504 : struct xfs_inode *tip) /* tmp inode */
505 : {
506 15580 : struct xfs_ifork *ifp = &ip->i_df;
507 15580 : struct xfs_ifork *tifp = &tip->i_df;
508 :
509 : /* User/group/project quota ids must match if quotas are enforced. */
510 15580 : if (XFS_IS_QUOTA_ON(ip->i_mount) &&
511 0 : (!uid_eq(VFS_I(ip)->i_uid, VFS_I(tip)->i_uid) ||
512 0 : !gid_eq(VFS_I(ip)->i_gid, VFS_I(tip)->i_gid) ||
513 0 : ip->i_projid != tip->i_projid))
514 : return -EINVAL;
515 :
516 : /* Should never get a local format */
517 15580 : if (ifp->if_format == XFS_DINODE_FMT_LOCAL ||
518 15580 : tifp->if_format == XFS_DINODE_FMT_LOCAL)
519 : return -EINVAL;
520 :
521 : /*
522 : * If the target inode has fewer extents than the temporary inode, then
523 : * why did userspace call us?
524 : */
525 15580 : if (ifp->if_nextents < tifp->if_nextents)
526 : return -EINVAL;
527 :
528 : /*
529 : * If we have to use the (expensive) rmap swap method, we can
530 : * handle any number of extents and any format.
531 : */
532 15560 : if (xfs_has_rmapbt(ip->i_mount))
533 : return 0;
534 :
535 : /*
536 : * If the target inode is in extent format and the temp inode is in btree
537 : * format, then we will end up with the target inode in the wrong format,
538 : * as we already know there are fewer extents in the temp inode.
539 : */
540 15560 : if (ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
541 : tifp->if_format == XFS_DINODE_FMT_BTREE)
542 : return -EINVAL;
543 :
544 : /* Check temp in extent form to max in target */
545 15560 : if (tifp->if_format == XFS_DINODE_FMT_EXTENTS &&
546 14609 : tifp->if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
547 : return -EINVAL;
548 :
549 : /* Check target in extent form to max in temp */
550 15560 : if (ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
551 10211 : ifp->if_nextents > XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
552 : return -EINVAL;
553 :
554 : /*
555 : * If we are in a btree format, check that the temp root block will fit
556 : * in the target and that it has enough extents to be in btree format
557 : * in the target.
558 : *
559 : * Note that we have to be careful to allow btree->extent conversions
560 : * (a common defrag case) which will occur when the temp inode is in
561 : * extent format...
562 : */
563 15560 : if (tifp->if_format == XFS_DINODE_FMT_BTREE) {
564 951 : if (xfs_inode_has_attr_fork(ip) &&
565 767 : XFS_BMAP_BMDR_SPACE(tifp->if_broot) > xfs_inode_fork_boff(ip))
566 : return -EINVAL;
567 951 : if (tifp->if_nextents <= XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
568 : return -EINVAL;
569 : }
570 :
571 : /* Reciprocal target->temp btree format checks */
572 15560 : if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
573 5349 : if (xfs_inode_has_attr_fork(tip) &&
574 5162 : XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > xfs_inode_fork_boff(tip))
575 : return -EINVAL;
576 5349 : if (ifp->if_nextents <= XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
577 36 : return -EINVAL;
578 : }
579 :
580 : return 0;
581 : }
582 :
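The extent-count half of the checks above condenses to one rule: after the swap, an extent-format fork must still fit in its new inode's inline area, and a btree-format fork must still be too big to fit inline. Here is a standalone model of that rule, with the inline capacities passed in as plain numbers (standing in for what XFS_IFORK_MAXEXT() reports); the btree-root-size check is left out.

#include <stdbool.h>
#include <stdint.h>

enum fork_fmt { FMT_EXTENTS, FMT_BTREE };

/* Would a data fork in src_fmt with src_nextents records be legal in dst? */
static bool fork_fits_after_swap(enum fork_fmt src_fmt, uint64_t src_nextents,
                                 uint64_t dst_maxext)
{
        if (src_fmt == FMT_EXTENTS && src_nextents > dst_maxext)
                return false;   /* too many records for extent format */
        if (src_fmt == FMT_BTREE && src_nextents <= dst_maxext)
                return false;   /* would have to convert btree -> extents */
        return true;
}

Plugging in the numbers from the comment: a temp fork defragmented down to 7 extents is in btree format (its own capacity is only 6), and a target capacity of 7 makes the second test fail, which is why the fork-swap path rejects the request on filesystems without rmapbt.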
583 : /*
584 : * Fix up the owners of the bmbt blocks to refer to the current inode. The
585 : * change owner scan attempts to order all modified buffers in the current
586 : * transaction. In the event of ordered buffer failure, the offending buffer is
587 : * physically logged as a fallback and the scan returns -EAGAIN. We must roll
588 : * the transaction in this case to replenish the fallback log reservation and
589 : * restart the scan. This process repeats until the scan completes.
590 : */
591 : static int
592 6255 : xfs_swap_change_owner(
593 : struct xfs_trans **tpp,
594 : struct xfs_inode *ip,
595 : struct xfs_inode *tmpip)
596 : {
597 6255 : int error;
598 6255 : struct xfs_trans *tp = *tpp;
599 :
600 21053 : do {
601 13654 : error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, ip->i_ino,
602 : NULL);
603 : /* success or fatal error */
604 13654 : if (error != -EAGAIN)
605 : break;
606 :
607 7399 : error = xfs_trans_roll(tpp);
608 7399 : if (error)
609 : break;
610 7399 : tp = *tpp;
611 :
612 : /*
613 : * Redirty both inodes so they can relog and keep the log tail
614 : * moving forward.
615 : */
616 7399 : xfs_trans_ijoin(tp, ip, 0);
617 7399 : xfs_trans_ijoin(tp, tmpip, 0);
618 7399 : xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
619 7399 : xfs_trans_log_inode(tp, tmpip, XFS_ILOG_CORE);
620 : } while (true);
621 :
622 6255 : return error;
623 : }
624 :
625 : /* Swap the extents of two files by swapping data forks. */
626 : STATIC int
627 15519 : xfs_swap_extent_forks(
628 : struct xfs_trans **tpp,
629 : struct xfs_swapext_req *req)
630 : {
631 15519 : struct xfs_inode *ip = req->ip2;
632 15519 : struct xfs_inode *tip = req->ip1;
633 15519 : xfs_filblks_t aforkblks = 0;
634 15519 : xfs_filblks_t taforkblks = 0;
635 15519 : xfs_extnum_t junk;
636 15519 : uint64_t tmp;
637 15519 : int src_log_flags = XFS_ILOG_CORE;
638 15519 : int target_log_flags = XFS_ILOG_CORE;
639 15519 : int error;
640 :
641 : /*
642 : * Count the number of extended attribute blocks
643 : */
644 15519 : if (xfs_inode_has_attr_fork(ip) && ip->i_af.if_nextents > 0 &&
645 819 : ip->i_af.if_format != XFS_DINODE_FMT_LOCAL) {
646 819 : error = xfs_bmap_count_blocks(*tpp, ip, XFS_ATTR_FORK, &junk,
647 : &aforkblks);
648 819 : if (error)
649 : return error;
650 : }
651 15519 : if (xfs_inode_has_attr_fork(tip) && tip->i_af.if_nextents > 0 &&
652 0 : tip->i_af.if_format != XFS_DINODE_FMT_LOCAL) {
653 0 : error = xfs_bmap_count_blocks(*tpp, tip, XFS_ATTR_FORK, &junk,
654 : &taforkblks);
655 0 : if (error)
656 : return error;
657 : }
658 :
659 : /*
660 : * Btree format (v3) inodes have the inode number stamped in the bmbt
661 : * block headers. We can't start changing the bmbt blocks until the
662 : * inode owner change is logged so recovery does the right thing in the
663 : * event of a crash. Set the owner change log flags now and leave the
664 : * bmbt scan as the last step.
665 : */
666 15519 : if (xfs_has_v3inodes(ip->i_mount)) {
667 15519 : if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE)
668 5308 : target_log_flags |= XFS_ILOG_DOWNER;
669 15519 : if (tip->i_df.if_format == XFS_DINODE_FMT_BTREE)
670 947 : src_log_flags |= XFS_ILOG_DOWNER;
671 : }
672 :
673 : /*
674 : * Swap the data forks of the inodes
675 : */
676 15519 : swap(ip->i_df, tip->i_df);
677 :
678 : /*
679 : * Fix the on-disk inode values
680 : */
681 15519 : tmp = (uint64_t)ip->i_nblocks;
682 15519 : ip->i_nblocks = tip->i_nblocks - taforkblks + aforkblks;
683 15519 : tip->i_nblocks = tmp + taforkblks - aforkblks;
684 :
685 : /*
686 : * The extents in the source inode could still contain speculative
687 : * preallocation beyond EOF (e.g. the file is open but not modified
688 : * while defrag is in progress). In that case, we need to copy over the
689 : * number of delalloc blocks the data fork in the source inode is
690 : * tracking beyond EOF so that when the fork is truncated away when the
691 : * temporary inode is unlinked we don't underrun the i_delayed_blks
692 : * counter on that inode.
693 : */
694 15519 : ASSERT(tip->i_delayed_blks == 0);
695 15519 : tip->i_delayed_blks = ip->i_delayed_blks;
696 15519 : ip->i_delayed_blks = 0;
697 :
698 15519 : switch (ip->i_df.if_format) {
699 14572 : case XFS_DINODE_FMT_EXTENTS:
700 14572 : src_log_flags |= XFS_ILOG_DEXT;
701 14572 : break;
702 947 : case XFS_DINODE_FMT_BTREE:
703 947 : ASSERT(!xfs_has_v3inodes(ip->i_mount) ||
704 : (src_log_flags & XFS_ILOG_DOWNER));
705 947 : src_log_flags |= XFS_ILOG_DBROOT;
706 947 : break;
707 : }
708 :
709 15519 : switch (tip->i_df.if_format) {
710 10211 : case XFS_DINODE_FMT_EXTENTS:
711 10211 : target_log_flags |= XFS_ILOG_DEXT;
712 10211 : break;
713 5308 : case XFS_DINODE_FMT_BTREE:
714 5308 : target_log_flags |= XFS_ILOG_DBROOT;
715 5308 : ASSERT(!xfs_has_v3inodes(ip->i_mount) ||
716 : (target_log_flags & XFS_ILOG_DOWNER));
717 : break;
718 : }
719 :
720 : /* Do we have to swap reflink flags? */
721 15519 : if ((ip->i_diflags2 & XFS_DIFLAG2_REFLINK) ^
722 15519 : (tip->i_diflags2 & XFS_DIFLAG2_REFLINK)) {
723 0 : uint64_t f;
724 :
725 0 : f = ip->i_diflags2 & XFS_DIFLAG2_REFLINK;
726 0 : ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
727 0 : ip->i_diflags2 |= tip->i_diflags2 & XFS_DIFLAG2_REFLINK;
728 0 : tip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
729 0 : tip->i_diflags2 |= f & XFS_DIFLAG2_REFLINK;
730 : }
731 :
732 : /* Swap the cow forks. */
733 15519 : if (xfs_has_reflink(ip->i_mount)) {
734 0 : ASSERT(!ip->i_cowfp ||
735 : ip->i_cowfp->if_format == XFS_DINODE_FMT_EXTENTS);
736 0 : ASSERT(!tip->i_cowfp ||
737 : tip->i_cowfp->if_format == XFS_DINODE_FMT_EXTENTS);
738 :
739 0 : swap(ip->i_cowfp, tip->i_cowfp);
740 :
741 0 : if (ip->i_cowfp && ip->i_cowfp->if_bytes)
742 0 : xfs_inode_set_cowblocks_tag(ip);
743 : else
744 0 : xfs_inode_clear_cowblocks_tag(ip);
745 0 : if (tip->i_cowfp && tip->i_cowfp->if_bytes)
746 0 : xfs_inode_set_cowblocks_tag(tip);
747 : else
748 0 : xfs_inode_clear_cowblocks_tag(tip);
749 : }
750 :
751 15519 : xfs_trans_log_inode(*tpp, ip, src_log_flags);
752 15519 : xfs_trans_log_inode(*tpp, tip, target_log_flags);
753 :
754 : /*
755 : * The extent forks have been swapped, but crc=1,rmapbt=0 filesystems
756 : * have inode number owner values in the bmbt blocks that still refer to
757 : * the old inode. Scan each bmbt to fix up the owner values with the
758 : * inode number of the current inode.
759 : */
760 15519 : if (src_log_flags & XFS_ILOG_DOWNER) {
761 947 : error = xfs_swap_change_owner(tpp, ip, tip);
762 947 : if (error)
763 : return error;
764 : }
765 15519 : if (target_log_flags & XFS_ILOG_DOWNER) {
766 5308 : error = xfs_swap_change_owner(tpp, tip, ip);
767 5308 : if (error)
768 0 : return error;
769 : }
770 :
771 : return 0;
772 : }
773 :
774 : /*
775 : * There may be partially written rt extents lurking in the ranges to be
776 : * swapped. According to the rules for realtime files with big rt extents, we
777 : * must guarantee that an outside observer (an IO thread, realistically) can
778 : * never see multiple physical rt extents mapped to the same logical file rt
779 : * extent. The deferred bmap log intent items that we use under the hood
780 : * operate on single block mappings and not rt extents, which means we must
781 : * have a strategy to ensure that log recovery after a failure won't stop in
782 : * the middle of an rt extent.
783 : *
784 : * The preferred strategy is to use deferred extent swap log intent items to
785 : * track the status of the overall swap operation so that we can complete the
786 : * work during crash recovery. If that isn't possible, we fall back to
787 : * requiring the selected mappings in both forks to be aligned to rt extent
788 : * boundaries. As an aside, the old fork swap routine didn't have this
789 : * requirement, but at an extreme cost in flexibility (full files only, and no
790 : * support if rmapbt is enabled).
791 : */
792 : static bool
793 1242041 : xfs_xchg_range_need_rt_conversion(
794 : struct xfs_inode *ip,
795 : unsigned int xchg_flags)
796 : {
797 1242041 : struct xfs_mount *mp = ip->i_mount;
798 :
799 : /*
800 : * Caller got permission to use logged swapext, so log recovery will
801 : * finish the swap and not leave us with partially swapped rt extents
802 : * exposed to userspace.
803 : */
804 1242041 : if (xchg_flags & XFS_XCHG_RANGE_LOGGED)
805 : return false;
806 :
807 : /*
808 : * If we can't use log intent items at all, the only supported
809 : * operation is full fork swaps, so no conversions are needed.
810 : * The range requirements are enforced by the swapext code itself.
811 : */
812 1257684 : if (!xfs_swapext_supported(mp))
813 : return false;
814 :
815 : /* Conversion is only needed for realtime files with big rt extents */
816 1226398 : return xfs_inode_has_bigrtextents(ip);
817 : }
818 :
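A concrete picture of the hazard described above, for a hypothetical realtime extent size of 4 filesystem blocks: if a crash interrupts a non-atomic swap after only two of the four block mappings have been exchanged, one logical file rt extent ends up backed by blocks from two different physical rt extents. Nothing below is kernel API; it just prints the torn state that log recovery must never expose.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t rextsize = 4;  /* fs blocks per rt extent (hypothetical) */
        /* Physical rt blocks backing file blocks 0-3 of one logical extent. */
        uint64_t before[4] = { 100, 101, 102, 103 };    /* one physical extent */
        uint64_t torn[4]   = { 200, 201, 102, 103 };    /* crash after 2 swaps */

        for (int i = 0; i < 4; i++)
                printf("file block %d: rt extent %" PRIu64 " -> %" PRIu64 "\n",
                       i, before[i] / rextsize, torn[i] / rextsize);
        /* Before: rt extent 25 throughout.  After: 50, 50, 25, 25. */
        return 0;
}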
819 : /*
820 : * Check the alignment of an exchange request when the allocation unit size
821 : * isn't a power of two. The VFS helpers use (fast) bitmask-based alignment
822 : * checks, but here we have to use slow long division.
823 : */
824 : static int
825 5010 : xfs_xchg_range_check_rtalign(
826 : struct xfs_inode *ip1,
827 : struct xfs_inode *ip2,
828 : const struct xfs_exch_range *fxr)
829 : {
830 5010 : struct xfs_mount *mp = ip1->i_mount;
831 5010 : uint32_t rextbytes;
832 5010 : uint64_t length = fxr->length;
833 5010 : uint64_t blen;
834 5010 : loff_t size1, size2;
835 :
836 5010 : rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
837 5010 : size1 = i_size_read(VFS_I(ip1));
838 5010 : size2 = i_size_read(VFS_I(ip2));
839 :
840 : /* The start of both ranges must be aligned to a rt extent. */
841 10018 : if (!isaligned_64(fxr->file1_offset, rextbytes) ||
842 5008 : !isaligned_64(fxr->file2_offset, rextbytes))
843 2 : return -EINVAL;
844 :
845 : /*
846 : * If the caller asked for full files, check that the offset/length
847 : * values cover all of both files.
848 : */
849 5008 : if ((fxr->flags & XFS_EXCH_RANGE_FULL_FILES) &&
850 4933 : (fxr->file1_offset != 0 || fxr->file2_offset != 0 ||
851 4933 : fxr->length != size1 || fxr->length != size2))
852 : return -EDOM;
853 :
854 5008 : if (fxr->flags & XFS_EXCH_RANGE_TO_EOF)
855 1 : length = max_t(int64_t, size1 - fxr->file1_offset,
856 : size2 - fxr->file2_offset);
857 :
858 : /*
859 : * If the user wanted us to exchange up to the infile's EOF, round up
860 : * to the next rt extent boundary for this check. Do the same for the
861 : * outfile.
862 : *
863 : * Otherwise, reject the range length if it's not rt extent aligned.
864 : * We already confirmed the starting offsets' rt extent block
865 : * alignment.
866 : */
867 5008 : if (fxr->file1_offset + length == size1)
868 5004 : blen = roundup_64(size1, rextbytes) - fxr->file1_offset;
869 4 : else if (fxr->file2_offset + length == size2)
870 0 : blen = roundup_64(size2, rextbytes) - fxr->file2_offset;
871 4 : else if (!isaligned_64(length, rextbytes))
872 : return -EINVAL;
873 : else
874 : blen = length;
875 :
876 : /* Don't allow overlapped exchanges within the same file. */
877 5008 : if (ip1 == ip2 &&
878 74 : fxr->file2_offset + blen > fxr->file1_offset &&
879 0 : fxr->file1_offset + blen > fxr->file2_offset)
880 : return -EINVAL;
881 :
882 : /*
883 : * Ensure that we don't exchange a partial EOF rt extent into the
884 : * middle of another file.
885 : */
886 5008 : if (isaligned_64(length, rextbytes))
887 : return 0;
888 :
889 4708 : blen = length;
890 4708 : if (fxr->file2_offset + length < size2)
891 0 : blen = rounddown_64(blen, rextbytes);
892 :
893 4708 : if (fxr->file1_offset + blen < size1)
894 0 : blen = rounddown_64(blen, rextbytes);
895 :
896 4708 : return blen == length ? 0 : -EINVAL;
897 : }
898 :
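The reason this helper exists at all is that the bitmask trick used by the block-size checks earlier in the file is only valid for power-of-two units, while a realtime extent can be any multiple of the block size. A small standalone demonstration follows; isaligned_64() in the code above plays the role of the modulo test here.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Correct for any unit size, at the cost of a 64-bit division. */
static bool aligned_div(uint64_t value, uint64_t unit)
{
        return value % unit == 0;
}

/* Only correct when unit is a power of two (e.g. the fs block size). */
static bool aligned_mask(uint64_t value, uint64_t unit)
{
        return (value & (unit - 1)) == 0;
}

int main(void)
{
        /* A 3-block rt extent on a 4096-byte-block fs: 12288-byte units. */
        uint64_t rextbytes = 3 * 4096;

        assert(aligned_div(24576, rextbytes));  /* 2 rt extents: aligned */
        assert(!aligned_div(16384, rextbytes)); /* 4 fs blocks: misaligned */
        /* The mask test wrongly calls 16384 aligned because 12288 != 2^n. */
        assert(aligned_mask(16384, rextbytes));
        return 0;
}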
899 : /* Prepare two files to have their data exchanged. */
900 : int
901 1242845 : xfs_xchg_range_prep(
902 : struct file *file1,
903 : struct file *file2,
904 : struct xfs_exch_range *fxr,
905 : unsigned int xchg_flags)
906 : {
907 1242845 : struct xfs_inode *ip1 = XFS_I(file_inode(file1));
908 1242845 : struct xfs_inode *ip2 = XFS_I(file_inode(file2));
909 1242845 : unsigned int alloc_unit = xfs_inode_alloc_unitsize(ip2);
910 1242835 : int error;
911 :
912 1242835 : trace_xfs_xchg_range_prep(ip1, fxr, ip2, 0);
913 :
914 : /* Verify both files are either real-time or non-realtime */
915 3696447 : if (XFS_IS_REALTIME_INODE(ip1) != XFS_IS_REALTIME_INODE(ip2))
916 : return -EINVAL;
917 :
918 : /* Check non-power of two alignment issues, if necessary. */
919 1258869 : if (XFS_IS_REALTIME_INODE(ip2) && !is_power_of_2(alloc_unit)) {
920 5010 : error = xfs_xchg_range_check_rtalign(ip1, ip2, fxr);
921 5010 : if (error)
922 : return error;
923 :
924 : /* Do the VFS checks with the regular block alignment. */
925 5008 : alloc_unit = ip1->i_mount->m_sb.sb_blocksize;
926 : }
927 :
928 1242835 : error = xfs_exch_range_prep(file1, file2, fxr, alloc_unit);
929 1242842 : if (error || fxr->length == 0)
930 : return error;
931 :
932 : /* Attach dquots to both inodes before changing block maps. */
933 1242040 : error = xfs_qm_dqattach(ip2);
934 1242041 : if (error)
935 : return error;
936 1242041 : error = xfs_qm_dqattach(ip1);
937 1242041 : if (error)
938 : return error;
939 :
940 1242041 : trace_xfs_xchg_range_flush(ip1, fxr, ip2, 0);
941 :
942 : /* Flush the relevant ranges of both files. */
943 1242036 : error = xfs_flush_unmap_range(ip2, fxr->file2_offset, fxr->length);
944 1242036 : if (error)
945 : return error;
946 1242040 : error = xfs_flush_unmap_range(ip1, fxr->file1_offset, fxr->length);
947 1242040 : if (error)
948 : return error;
949 :
950 : /*
951 : * Cancel CoW fork preallocations for the ranges of both files. The
952 : * prep function should have flushed all the dirty data, so the only
953 : * extents remaining should be speculative.
954 : */
955 2484080 : if (xfs_inode_has_cow_data(ip1)) {
956 717315 : error = xfs_reflink_cancel_cow_range(ip1, fxr->file1_offset,
957 717315 : fxr->length, true);
958 717315 : if (error)
959 : return error;
960 : }
961 :
962 2484080 : if (xfs_inode_has_cow_data(ip2)) {
963 712271 : error = xfs_reflink_cancel_cow_range(ip2, fxr->file2_offset,
964 712271 : fxr->length, true);
965 712271 : if (error)
966 : return error;
967 : }
968 :
969 : /* Convert unwritten sub-extent mappings if required. */
970 1242040 : if (xfs_xchg_range_need_rt_conversion(ip2, xchg_flags)) {
971 0 : error = xfs_rtfile_convert_unwritten(ip2, fxr->file2_offset,
972 : fxr->length);
973 0 : if (error)
974 : return error;
975 :
976 0 : error = xfs_rtfile_convert_unwritten(ip1, fxr->file1_offset,
977 : fxr->length);
978 0 : if (error)
979 0 : return error;
980 : }
981 :
982 : return 0;
983 : }
984 :
985 : #define QRETRY_IP1 (0x1)
986 : #define QRETRY_IP2 (0x2)
987 :
988 : /*
989 : * Obtain a quota reservation to make sure we don't hit EDQUOT. We can skip
990 : * this if quota enforcement is disabled or if both inodes' dquots are the
991 : * same. The qretry structure must be initialized to zeroes before the first
992 : * call to this function.
993 : */
994 : STATIC int
995 1242457 : xfs_xchg_range_reserve_quota(
996 : struct xfs_trans *tp,
997 : const struct xfs_swapext_req *req,
998 : unsigned int *qretry)
999 : {
1000 1242457 : int64_t ddelta, rdelta;
1001 1242457 : int ip1_error = 0;
1002 1242457 : int error;
1003 :
1004 : /*
1005 : * Don't bother with a quota reservation if we're not enforcing them
1006 : * or the two inodes have the same dquots.
1007 : */
1008 1242457 : if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 ||
1009 13084 : (req->ip1->i_udquot == req->ip2->i_udquot &&
1010 13041 : req->ip1->i_gdquot == req->ip2->i_gdquot &&
1011 13026 : req->ip1->i_pdquot == req->ip2->i_pdquot))
1012 : return 0;
1013 :
1014 58 : *qretry = 0;
1015 :
1016 : /*
1017 : * For each file, compute the net gain in the number of regular blocks
1018 : * that will be mapped into that file and reserve that much quota. The
1019 : * quota counts must be able to absorb at least that much space.
1020 : */
1021 58 : ddelta = req->ip2_bcount - req->ip1_bcount;
1022 58 : rdelta = req->ip2_rtbcount - req->ip1_rtbcount;
1023 58 : if (ddelta > 0 || rdelta > 0) {
1024 23 : error = xfs_trans_reserve_quota_nblks(tp, req->ip1,
1025 : ddelta > 0 ? ddelta : 0,
1026 : rdelta > 0 ? rdelta : 0,
1027 : false);
1028 23 : if (error == -EDQUOT || error == -ENOSPC) {
1029 : /*
1030 : * Save this error and see what happens if we try to
1031 : * reserve quota for ip2. Then report both.
1032 : */
1033 12 : *qretry |= QRETRY_IP1;
1034 12 : ip1_error = error;
1035 12 : error = 0;
1036 : }
1037 23 : if (error)
1038 : return error;
1039 : }
1040 58 : if (ddelta < 0 || rdelta < 0) {
1041 5 : error = xfs_trans_reserve_quota_nblks(tp, req->ip2,
1042 : ddelta < 0 ? -ddelta : 0,
1043 : rdelta < 0 ? -rdelta : 0,
1044 : false);
1045 5 : if (error == -EDQUOT || error == -ENOSPC)
1046 0 : *qretry |= QRETRY_IP2;
1047 5 : if (error)
1048 : return error;
1049 : }
1050 58 : if (ip1_error)
1051 : return ip1_error;
1052 :
1053 : /*
1054 : * For each file, forcibly reserve the gross gain in mapped blocks so
1055 : * that we don't trip over any quota block reservation assertions.
1056 : * We must reserve the gross gain because the quota code subtracts from
1057 : * bcount the number of blocks that we unmap; it does not add that
1058 : * quantity back to the quota block reservation.
1059 : */
1060 46 : error = xfs_trans_reserve_quota_nblks(tp, req->ip1, req->ip1_bcount,
1061 46 : req->ip1_rtbcount, true);
1062 46 : if (error)
1063 : return error;
1064 :
1065 46 : return xfs_trans_reserve_quota_nblks(tp, req->ip2, req->ip2_bcount,
1066 46 : req->ip2_rtbcount, true);
1067 : }
1068 :
1069 : /*
1070 : * Get permission to use log-assisted atomic exchange of file extents.
1071 : *
1072 : * Callers must hold the IOLOCK and MMAPLOCK of both files. They must not be
1073 : * running any transactions or hold any ILOCKS. If @use_logging is set after a
1074 : * successful return, callers must call xfs_xchg_range_rele_log_assist after
1075 : * the exchange is completed.
1076 : */
1077 : int
1078 5818408 : xfs_xchg_range_grab_log_assist(
1079 : struct xfs_mount *mp,
1080 : bool force,
1081 : bool *use_logging)
1082 : {
1083 5818408 : int error = 0;
1084 :
1085 : /*
1086 : * Protect ourselves from an idle log clearing the atomic swapext
1087 : * log incompat feature bit.
1088 : */
1089 5818408 : xlog_use_incompat_feat(mp->m_log, XLOG_INCOMPAT_FEAT_SWAPEXT);
1090 5818442 : *use_logging = true;
1091 :
1092 : /*
1093 : * If log-assisted swapping is already enabled, the caller can use the
1094 : * log assisted swap functions with the log-incompat reference we got.
1095 : */
1096 11636884 : if (xfs_sb_version_haslogswapext(&mp->m_sb))
1097 : return 0;
1098 :
1099 : /*
1100 : * If the caller doesn't /require/ log-assisted swapping, drop the
1101 : * log-incompat feature protection and exit. The caller cannot use
1102 : * log assisted swapping.
1103 : */
1104 48562 : if (!force)
1105 31619 : goto drop_incompat;
1106 :
1107 : /*
1108 : * Caller requires log-assisted swapping but the fs feature set isn't
1109 : * rich enough to support it. Bail out.
1110 : */
1111 18943 : if (!xfs_swapext_supported(mp)) {
1112 1997 : error = -EOPNOTSUPP;
1113 1997 : goto drop_incompat;
1114 : }
1115 :
1116 14946 : error = xfs_add_incompat_log_feature(mp,
1117 : XFS_SB_FEAT_INCOMPAT_LOG_SWAPEXT);
1118 14949 : if (error)
1119 0 : goto drop_incompat;
1120 :
1121 14949 : xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SWAPEXT,
1122 : "EXPERIMENTAL atomic file range swap feature in use. Use at your own risk!");
1123 :
1124 : return 0;
1125 33616 : drop_incompat:
1126 33616 : xlog_drop_incompat_feat(mp->m_log, XLOG_INCOMPAT_FEAT_SWAPEXT);
1127 33616 : *use_logging = false;
1128 33616 : return error;
1129 : }
1130 :
1131 : /* Release permission to use log-assisted extent swapping. */
1132 : void
1133 4573290 : xfs_xchg_range_rele_log_assist(
1134 : struct xfs_mount *mp)
1135 : {
1136 5784126 : xlog_drop_incompat_feat(mp->m_log, XLOG_INCOMPAT_FEAT_SWAPEXT);
1137 1210842 : }
1138 :
1139 : /* Decide if we can use the old data fork exchange code. */
1140 : static inline bool
1141 15775 : xfs_xchg_use_forkswap(
1142 : const struct xfs_exch_range *fxr,
1143 : struct xfs_inode *ip1,
1144 : struct xfs_inode *ip2)
1145 : {
1146 15775 : if (!(fxr->flags & XFS_EXCH_RANGE_NONATOMIC))
1147 : return false;
1148 15775 : if (!(fxr->flags & XFS_EXCH_RANGE_FULL_FILES))
1149 : return false;
1150 15580 : if (fxr->flags & XFS_EXCH_RANGE_TO_EOF)
1151 : return false;
1152 15580 : if (fxr->file1_offset != 0 || fxr->file2_offset != 0)
1153 : return false;
1154 15580 : if (fxr->length != ip1->i_disk_size)
1155 : return false;
1156 15580 : if (fxr->length != ip2->i_disk_size)
1157 0 : return false;
1158 : return true;
1159 : }
1160 :
1161 : enum xchg_strategy {
1162 : SWAPEXT = 1, /* xfs_swapext() */
1163 : FORKSWAP = 2, /* exchange forks */
1164 : };
1165 :
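xfs_xchg_range() below picks between these two strategies. Here is a compressed standalone model of that decision, with the inputs reduced to booleans: got_log_assist corresponds to XFS_XCHG_RANGE_LOGGED, swapext_supported to xfs_swapext_supported(), and forkswap_ok to xfs_xchg_use_forkswap() plus the format checks.

#include <errno.h>
#include <stdbool.h>

enum { MODEL_SWAPEXT = 1, MODEL_FORKSWAP = 2 };

/* Standalone model of the strategy selection; not kernel code. */
static int pick_strategy(bool got_log_assist, bool swapext_supported,
                         bool forkswap_ok, int *strategy)
{
        if (got_log_assist || swapext_supported) {
                /* Log intent items can exchange arbitrary sub-ranges. */
                *strategy = MODEL_SWAPEXT;
                return 0;
        }
        if (forkswap_ok) {
                /* Legacy defrag path: whole-file, non-atomic fork swap. */
                *strategy = MODEL_FORKSWAP;
                return 0;
        }
        return -EOPNOTSUPP;     /* nothing can exchange the contents */
}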
1166 : /* Exchange the contents of two files. */
1167 : int
1168 1242466 : xfs_xchg_range(
1169 : struct xfs_inode *ip1,
1170 : struct xfs_inode *ip2,
1171 : const struct xfs_exch_range *fxr,
1172 : unsigned int xchg_flags)
1173 : {
1174 1242466 : struct xfs_mount *mp = ip1->i_mount;
1175 4969864 : struct xfs_swapext_req req = {
1176 : .ip1 = ip1,
1177 : .ip2 = ip2,
1178 : .whichfork = XFS_DATA_FORK,
1179 1242466 : .startoff1 = XFS_B_TO_FSBT(mp, fxr->file1_offset),
1180 1242466 : .startoff2 = XFS_B_TO_FSBT(mp, fxr->file2_offset),
1181 1242466 : .blockcount = XFS_B_TO_FSB(mp, fxr->length),
1182 : };
1183 1242466 : struct xfs_trans *tp;
1184 1242466 : unsigned int qretry;
1185 1242466 : unsigned int flags = 0;
1186 1242466 : bool retried = false;
1187 1242466 : enum xchg_strategy strategy;
1188 1242466 : int error;
1189 :
1190 1242466 : trace_xfs_xchg_range(ip1, fxr, ip2, xchg_flags);
1191 :
1192 1242459 : if (fxr->flags & XFS_EXCH_RANGE_TO_EOF)
1193 202 : req.req_flags |= XFS_SWAP_REQ_SET_SIZES;
1194 1242459 : if (fxr->flags & XFS_EXCH_RANGE_FILE1_WRITTEN)
1195 34 : req.req_flags |= XFS_SWAP_REQ_INO1_WRITTEN;
1196 1242459 : if (xchg_flags & XFS_XCHG_RANGE_LOGGED)
1197 1210840 : req.req_flags |= XFS_SWAP_REQ_LOGGED;
1198 :
1199 : /*
1200 : * Round the request length up to the nearest fundamental unit of
1201 : * allocation. The prep function already checked that the request
1202 : * offsets and length in @fxr are safe to round up.
1203 : */
1204 1242459 : if (XFS_IS_REALTIME_INODE(ip2))
1205 15775 : req.blockcount = roundup_64(req.blockcount,
1206 : mp->m_sb.sb_rextsize);
1207 :
1208 1242459 : error = xfs_xchg_range_estimate(&req);
1209 1242458 : if (error)
1210 : return error;
1211 :
1212 : /*
1213 : * We haven't decided which exchange strategy we want to use yet, but
1214 : * here we must choose if we want freed blocks during the swap to be
1215 : * added to the transaction block reservation (RES_FDBLKS) or freed
1216 : * into the global fdblocks. The legacy fork swap mechanism doesn't
1217 : * free any blocks, so it doesn't require it. It is also the only
1218 : * option that works for older filesystems.
1219 : *
1220 : * The bmap log intent items that were added with rmap and reflink can
1221 : * change the bmbt shape, so the intent-based swap strategies require
1222 : * us to set RES_FDBLKS.
1223 : */
1224 1242452 : if (xfs_has_lazysbcount(mp))
1225 1242452 : flags |= XFS_TRANS_RES_FDBLKS;
1226 :
1227 1242452 : retry:
1228 : /* Allocate the transaction, lock the inodes, and join them. */
1229 1242458 : error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, req.resblks, 0,
1230 : flags, &tp);
1231 1242465 : if (error)
1232 1 : return error;
1233 :
1234 1242464 : xfs_xchg_range_ilock(tp, ip1, ip2);
1235 :
1236 1242462 : trace_xfs_swap_extent_before(ip2, 0);
1237 1242461 : trace_xfs_swap_extent_before(ip1, 1);
1238 :
1239 1242460 : if (fxr->flags & XFS_EXCH_RANGE_FILE2_FRESH)
1240 31665 : trace_xfs_xchg_range_freshness(ip2, fxr);
1241 :
1242 : /*
1243 : * Now that we've excluded all other inode metadata changes by taking
1244 : * the ILOCK, repeat the freshness check.
1245 : */
1246 1242460 : error = xfs_exch_range_check_fresh(VFS_I(ip2), fxr);
1247 1242460 : if (error)
1248 0 : goto out_trans_cancel;
1249 :
1250 1242460 : error = xfs_swapext_check_extents(mp, &req);
1251 1242457 : if (error)
1252 0 : goto out_trans_cancel;
1253 :
1254 : /*
1255 : * Reserve ourselves some quota if any of them are in enforcing mode.
1256 : * In theory we only need enough to satisfy the change in the number
1257 : * of blocks between the two ranges being remapped.
1258 : */
1259 1242457 : error = xfs_xchg_range_reserve_quota(tp, &req, &qretry);
1260 1242451 : if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
1261 6 : xfs_trans_cancel(tp);
1262 6 : xfs_xchg_range_iunlock(ip1, ip2);
1263 6 : if (qretry & QRETRY_IP1)
1264 6 : xfs_blockgc_free_quota(ip1, 0);
1265 6 : if (qretry & QRETRY_IP2)
1266 0 : xfs_blockgc_free_quota(ip2, 0);
1267 6 : retried = true;
1268 6 : goto retry;
1269 : }
1270 1242445 : if (error)
1271 6 : goto out_trans_cancel;
1272 :
1273 1274045 : if ((xchg_flags & XFS_XCHG_RANGE_LOGGED) || xfs_swapext_supported(mp)) {
1274 : /*
1275 : * xfs_swapext() uses deferred bmap log intent items to swap
1276 : * extents between file forks. If the atomic log swap feature
1277 : * is enabled, it will also use swapext log intent items to
1278 : * restart the operation in case of failure.
1279 : *
1280 : * This means that we can use it if we previously obtained
1281 : * permission from the log to use log-assisted atomic extent
1282 : * swapping; or if the fs supports rmap or reflink and the
1283 : * user said NONATOMIC.
1284 : */
1285 : strategy = SWAPEXT;
1286 15775 : } else if (xfs_xchg_use_forkswap(fxr, ip1, ip2)) {
1287 : /*
1288 : * Exchange the file contents by using the old bmap fork
1289 : * exchange code, if we're a defrag tool doing a full file
1290 : * swap.
1291 : */
1292 15580 : strategy = FORKSWAP;
1293 :
1294 15580 : error = xfs_swap_extents_check_format(ip2, ip1);
1295 15580 : if (error) {
1296 56 : xfs_notice(mp,
1297 : "%s: inode 0x%llx format is incompatible for exchanging.",
1298 : __func__, ip2->i_ino);
1299 56 : goto out_trans_cancel;
1300 : }
1301 : } else {
1302 : /* We cannot exchange the file contents. */
1303 195 : error = -EOPNOTSUPP;
1304 195 : goto out_trans_cancel;
1305 : }
1306 :
1307 : /* If we got this far on a dry run, all parameters are ok. */
1308 1242188 : if (fxr->flags & XFS_EXCH_RANGE_DRY_RUN)
1309 302 : goto out_trans_cancel;
1310 :
1311 : /* Update the mtime and ctime of both files. */
1312 1241886 : if (xchg_flags & XFS_XCHG_RANGE_UPD_CMTIME1)
1313 1241889 : xfs_trans_ichgtime(tp, ip1,
1314 : XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1315 1241883 : if (xchg_flags & XFS_XCHG_RANGE_UPD_CMTIME2)
1316 1212837 : xfs_trans_ichgtime(tp, ip2,
1317 : XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1318 :
1319 1241891 : if (strategy == SWAPEXT) {
1320 1226372 : xfs_swapext(tp, &req);
1321 : } else {
1322 15519 : error = xfs_swap_extent_forks(&tp, &req);
1323 15519 : if (error)
1324 0 : goto out_trans_cancel;
1325 : }
1326 :
1327 : /*
1328 : * Force the log to persist metadata updates if the caller or the
1329 : * administrator requires this. The VFS prep function already flushed
1330 : * the relevant parts of the page cache.
1331 : */
1332 1241894 : if (xfs_has_wsync(mp) || (fxr->flags & XFS_EXCH_RANGE_FSYNC))
1333 24180 : xfs_trans_set_sync(tp);
1334 :
1335 1241894 : error = xfs_trans_commit(tp);
1336 :
1337 1241900 : trace_xfs_swap_extent_after(ip2, 0);
1338 1241898 : trace_xfs_swap_extent_after(ip1, 1);
1339 :
1340 1241900 : if (error)
1341 33 : goto out_unlock;
1342 :
1343 : /*
1344 : * If the caller wanted us to exchange the contents of two complete
1345 : * files of unequal length, exchange the incore sizes now. This should
1346 : * be safe because we flushed both files' page caches, moved all the
1347 : * extents, and updated the ondisk sizes.
1348 : */
1349 1241867 : if (fxr->flags & XFS_EXCH_RANGE_TO_EOF) {
1350 196 : loff_t temp;
1351 :
1352 196 : temp = i_size_read(VFS_I(ip2));
1353 196 : i_size_write(VFS_I(ip2), i_size_read(VFS_I(ip1)));
1354 196 : i_size_write(VFS_I(ip1), temp);
1355 : }
1356 :
1357 1241671 : out_unlock:
1358 1242459 : xfs_xchg_range_iunlock(ip1, ip2);
1359 1242459 : return error;
1360 :
1361 559 : out_trans_cancel:
1362 559 : xfs_trans_cancel(tp);
1363 559 : goto out_unlock;
1364 : }