Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-or-later
2 : /*
3 : * Copyright (C) 2020-2023 Oracle. All Rights Reserved.
4 : * Author: Darrick J. Wong <djwong@kernel.org>
5 : *
6 : * The xfs_swap_extent_* functions are:
7 : * Copyright (c) 2000-2006 Silicon Graphics, Inc.
8 : * Copyright (c) 2012 Red Hat, Inc.
9 : * All Rights Reserved.
10 : */
11 : #include "xfs.h"
12 : #include "xfs_shared.h"
13 : #include "xfs_format.h"
14 : #include "xfs_log_format.h"
15 : #include "xfs_trans_resv.h"
16 : #include "xfs_mount.h"
17 : #include "xfs_defer.h"
18 : #include "xfs_inode.h"
19 : #include "xfs_trans.h"
20 : #include "xfs_quota.h"
21 : #include "xfs_bmap_util.h"
22 : #include "xfs_bmap_btree.h"
23 : #include "xfs_reflink.h"
24 : #include "xfs_trace.h"
25 : #include "xfs_swapext.h"
26 : #include "xfs_xchgrange.h"
27 : #include "xfs_sb.h"
28 : #include "xfs_icache.h"
29 : #include "xfs_log.h"
30 : #include "xfs_rtalloc.h"
31 : #include "xfs_rtbitmap.h"
32 : #include <linux/fsnotify.h>
33 :
34 : /*
35 : * Generic code for exchanging ranges of two files via XFS_IOC_EXCHANGE_RANGE.
36 : * This part does not deal with XFS-specific data structures, and may some day
37 : * be ported to the VFS.
38 : *
39 : * The goal is to exchange fxr.length bytes starting at fxr.file1_offset in
40 : * file1 with the same number of bytes starting at fxr.file2_offset in file2.
41 : * Implementations must call xfs_exch_range_prep to prepare the two files
42 : * prior to taking locks; they must call xfs_exch_range_check_fresh once
43 : * the inode is locked to abort the call if file2 has changed; and they must
44 : * update the inode change and mod times of both files as part of the metadata
45 : * update. The timestamp updates must be done atomically as part of the data
46 : * exchange operation to ensure correctness of the freshness check.
47 : */
48 :
49 : /*
50 : * Check that both files' metadata agree with the snapshot that we took for
51 : * the range exchange request.
52 : *
53 : * This should be called after the filesystem has locked /all/ inode metadata
54 : * against modification.
55 : */
56 : STATIC int
57 3723718 : xfs_exch_range_check_fresh(
58 : struct inode *inode2,
59 : const struct xfs_exch_range *fxr)
60 : {
61 : /* Check that file2 hasn't otherwise been modified. */
62 3723718 : if ((fxr->flags & XFS_EXCH_RANGE_FILE2_FRESH) &&
63 64241 : (fxr->file2_ino != inode2->i_ino ||
64 64241 : fxr->file2_ctime != inode2->i_ctime.tv_sec ||
65 64240 : fxr->file2_ctime_nsec != inode2->i_ctime.tv_nsec ||
66 64211 : fxr->file2_mtime != inode2->i_mtime.tv_sec ||
67 64211 : fxr->file2_mtime_nsec != inode2->i_mtime.tv_nsec))
68 30 : return -EBUSY;
69 :
70 : return 0;
71 : }
72 :
/*
 * Performs necessary checks before doing a range exchange.
 *
 * Validate the request in @fxr against @file1 and @file2: inode state,
 * EOF placement, offset/length alignment to @blocksize, offset wraparound,
 * system file size limits, overlap within a single file, and the optional
 * file2 freshness snapshot.  May enlarge fxr->length when
 * XFS_EXCH_RANGE_TO_EOF is set.  Returns 0 if the exchange may proceed or
 * a negative errno for the first problem found.
 *
 * NOTE(review): blkmask arithmetic assumes @blocksize is a power of two;
 * the rt path with a non-power-of-two allocation unit is checked
 * separately in xfs_xchg_range_check_rtalign.
 */
STATIC int
xfs_exch_range_checks(
	struct file		*file1,
	struct file		*file2,
	struct xfs_exch_range	*fxr,
	unsigned int		blocksize)
{
	struct inode		*inode1 = file1->f_mapping->host;
	struct inode		*inode2 = file2->f_mapping->host;
	uint64_t		blkmask = blocksize - 1;
	int64_t			test_len;
	uint64_t		blen;
	loff_t			size1, size2;
	int			error;

	/* Don't touch certain kinds of inodes */
	if (IS_IMMUTABLE(inode1) || IS_IMMUTABLE(inode2))
		return -EPERM;
	if (IS_SWAPFILE(inode1) || IS_SWAPFILE(inode2))
		return -ETXTBSY;

	size1 = i_size_read(inode1);
	size2 = i_size_read(inode2);

	/* Ranges cannot start after EOF. */
	if (fxr->file1_offset > size1 || fxr->file2_offset > size2)
		return -EINVAL;

	/*
	 * If the caller asked for full files, check that the offset/length
	 * values cover all of both files.
	 */
	if ((fxr->flags & XFS_EXCH_RANGE_FULL_FILES) &&
	    (fxr->file1_offset != 0 || fxr->file2_offset != 0 ||
	     fxr->length != size1 || fxr->length != size2))
		return -EDOM;

	/*
	 * If the caller said to exchange to EOF, we set the length of the
	 * request large enough to cover everything to the end of both files.
	 */
	if (fxr->flags & XFS_EXCH_RANGE_TO_EOF)
		fxr->length = max_t(int64_t, size1 - fxr->file1_offset,
					     size2 - fxr->file2_offset);

	/* The start of both ranges must be aligned to an fs block. */
	if (!IS_ALIGNED(fxr->file1_offset, blocksize) ||
	    !IS_ALIGNED(fxr->file2_offset, blocksize))
		return -EINVAL;

	/* Ensure offsets don't wrap. */
	if (fxr->file1_offset + fxr->length < fxr->file1_offset ||
	    fxr->file2_offset + fxr->length < fxr->file2_offset)
		return -EINVAL;

	/*
	 * We require both ranges to be within EOF, unless we're exchanging
	 * to EOF.  xfs_xchg_range_prep already checked that both
	 * fxr->file1_offset and fxr->file2_offset are within EOF.
	 */
	if (!(fxr->flags & XFS_EXCH_RANGE_TO_EOF) &&
	    (fxr->file1_offset + fxr->length > size1 ||
	     fxr->file2_offset + fxr->length > size2))
		return -EINVAL;

	/*
	 * Make sure we don't hit any file size limits.  If we hit any size
	 * limits such that test_length was adjusted, we abort the whole
	 * operation.
	 */
	test_len = fxr->length;
	error = generic_write_check_limits(file2, fxr->file2_offset, &test_len);
	if (error)
		return error;
	error = generic_write_check_limits(file1, fxr->file1_offset, &test_len);
	if (error)
		return error;
	if (test_len != fxr->length)
		return -EINVAL;

	/*
	 * If the user wanted us to exchange up to the infile's EOF, round up
	 * to the next block boundary for this check.  Do the same for the
	 * outfile.
	 *
	 * Otherwise, reject the range length if it's not block aligned.  We
	 * already confirmed the starting offsets' block alignment.
	 */
	if (fxr->file1_offset + fxr->length == size1)
		blen = ALIGN(size1, blocksize) - fxr->file1_offset;
	else if (fxr->file2_offset + fxr->length == size2)
		blen = ALIGN(size2, blocksize) - fxr->file2_offset;
	else if (!IS_ALIGNED(fxr->length, blocksize))
		return -EINVAL;
	else
		blen = fxr->length;

	/* Don't allow overlapped exchanges within the same file. */
	if (inode1 == inode2 &&
	    fxr->file2_offset + blen > fxr->file1_offset &&
	    fxr->file1_offset + blen > fxr->file2_offset)
		return -EINVAL;

	/* If we already failed the freshness check, we're done. */
	error = xfs_exch_range_check_fresh(inode2, fxr);
	if (error)
		return error;

	/*
	 * Ensure that we don't exchange a partial EOF block into the middle of
	 * another file.  A block-multiple length never has a partial tail.
	 */
	if ((fxr->length & blkmask) == 0)
		return 0;

	/*
	 * The length has a partial tail block.  That is only acceptable when
	 * the tail lands at or beyond EOF in both files; otherwise the
	 * masking below shortens blen and the final comparison fails.
	 */
	blen = fxr->length;
	if (fxr->file2_offset + blen < size2)
		blen &= ~blkmask;

	if (fxr->file1_offset + blen < size1)
		blen &= ~blkmask;

	return blen == fxr->length ? 0 : -EINVAL;
}
198 :
/*
 * Check that the two inodes are eligible for range exchanges, the ranges make
 * sense, and then flush all dirty data.  Caller must ensure that the inodes
 * have been locked against any other modifications.
 *
 * Returns 0 on success or a negative errno.  On success (with a nonzero
 * length), pending direct I/O on both inodes has drained, the affected
 * pagecache ranges have been written back, and XFS_EXCH_RANGE_FSYNC has been
 * added to fxr->flags if either file or inode requires synchronous writes.
 */
int
xfs_exch_range_prep(
	struct file		*file1,
	struct file		*file2,
	struct xfs_exch_range	*fxr,
	unsigned int		blocksize)
{
	struct inode		*inode1 = file_inode(file1);
	struct inode		*inode2 = file_inode(file2);
	bool			same_inode = (inode1 == inode2);
	int			error;

	/* Check that we don't violate system file offset limits. */
	error = xfs_exch_range_checks(file1, file2, fxr, blocksize);
	if (error || fxr->length == 0)
		return error;

	/* Wait for the completion of any pending IOs on both files */
	inode_dio_wait(inode1);
	if (!same_inode)
		inode_dio_wait(inode2);

	/* Write back only the byte ranges involved in the exchange. */
	error = filemap_write_and_wait_range(inode1->i_mapping,
			fxr->file1_offset,
			fxr->file1_offset + fxr->length - 1);
	if (error)
		return error;

	error = filemap_write_and_wait_range(inode2->i_mapping,
			fxr->file2_offset,
			fxr->file2_offset + fxr->length - 1);
	if (error)
		return error;

	/*
	 * If the files or inodes involved require synchronous writes, amend
	 * the request to force the filesystem to flush all data and metadata
	 * to disk after the operation completes.
	 */
	if (((file1->f_flags | file2->f_flags) & (__O_SYNC | O_DSYNC)) ||
	    IS_SYNC(inode1) || IS_SYNC(inode2))
		fxr->flags |= XFS_EXCH_RANGE_FSYNC;

	return 0;
}
249 :
/*
 * Finish a range exchange operation, if it was successful.  Caller must
 * ensure that the inodes are still locked against any other modifications.
 * Strips setuid/setgid/security privileges from both files, once per
 * distinct inode, just as an ordinary write would.
 */
int
xfs_exch_range_finish(
	struct file		*file1,
	struct file		*file2)
{
	int			ret;

	ret = file_remove_privs(file1);

	/* A same-inode exchange only needs the privileges dropped once. */
	if (!ret && file_inode(file1) != file_inode(file2))
		ret = file_remove_privs(file2);

	return ret;
}
269 :
270 : /* Decide if it's ok to remap the selected range of a given file. */
271 : STATIC int
272 3724135 : xfs_exch_range_verify_area(
273 : struct file *file,
274 : loff_t pos,
275 : struct xfs_exch_range *fxr)
276 : {
277 3724135 : int64_t len = fxr->length;
278 :
279 3724135 : if (pos < 0)
280 : return -EINVAL;
281 :
282 3724135 : if (fxr->flags & XFS_EXCH_RANGE_TO_EOF)
283 732 : len = min_t(int64_t, len, i_size_read(file_inode(file)) - pos);
284 3724135 : return remap_verify_area(file, pos, len, true);
285 : }
286 :
/*
 * Prepare for and exchange parts of two files.
 *
 * Validates the request flags and padding, confirms both files are regular
 * files on the same filesystem, runs the generic read/write and range
 * checks in both directions, performs the XFS-specific exchange, and
 * finally emits fsnotify modify events.  Returns 0 on success or a
 * negative errno.
 */
static inline int
__xfs_exch_range(
	struct file		*file1,
	struct file		*file2,
	struct xfs_exch_range	*fxr)
{
	struct inode		*inode1 = file_inode(file1);
	struct inode		*inode2 = file_inode(file2);
	int			ret;

	/* Reject unknown flags and any nonzero padding in the request. */
	if ((fxr->flags & ~XFS_EXCH_RANGE_ALL_FLAGS) ||
	    memchr_inv(&fxr->pad, 0, sizeof(fxr->pad)))
		return -EINVAL;

	/* FULL_FILES and TO_EOF specify conflicting length policies. */
	if ((fxr->flags & XFS_EXCH_RANGE_FULL_FILES) &&
	    (fxr->flags & XFS_EXCH_RANGE_TO_EOF))
		return -EINVAL;

	/*
	 * The ioctl enforces that src and dest files are on the same mount.
	 * However, they only need to be on the same file system.
	 */
	if (inode1->i_sb != inode2->i_sb)
		return -EXDEV;

	/* This only works for regular files. */
	if (S_ISDIR(inode1->i_mode) || S_ISDIR(inode2->i_mode))
		return -EISDIR;
	if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
		return -EINVAL;

	/* Run the generic file checks for both (file, counterpart) orders. */
	ret = generic_file_rw_checks(file1, file2);
	if (ret < 0)
		return ret;

	ret = generic_file_rw_checks(file2, file1);
	if (ret < 0)
		return ret;

	/* Verify both ranges are remappable before touching anything. */
	ret = xfs_exch_range_verify_area(file1, fxr->file1_offset, fxr);
	if (ret)
		return ret;

	ret = xfs_exch_range_verify_area(file2, fxr->file2_offset, fxr);
	if (ret)
		return ret;

	ret = xfs_file_xchg_range(file1, file2, fxr);
	if (ret)
		return ret;

	/* Notify watchers that both files' contents changed. */
	fsnotify_modify(file1);
	if (file2 != file1)
		fsnotify_modify(file2);
	return 0;
}
344 :
/*
 * Exchange parts of two files.  Takes write access (freeze protection) on
 * file2 around the whole operation.
 */
int
xfs_exch_range(
	struct file		*file1,
	struct file		*file2,
	struct xfs_exch_range	*fxr)
{
	int			ret;

	file_start_write(file2);
	ret = __xfs_exch_range(file1, file2, fxr);
	file_end_write(file2);

	return ret;
}
359 :
360 : /* XFS-specific parts of XFS_IOC_EXCHANGE_RANGE */
361 :
/*
 * Exchanging ranges as a file operation.  This is the binding between the
 * VFS-level concepts and the XFS-specific implementation.
 *
 * Locks both inodes' IO/mmap locks, runs the XFS prep work, optionally
 * obtains log-assist (atomic swap) permission, performs the exchange, and
 * drops privileges on success.  Errors are traced against ip2 before
 * returning.
 */
int
xfs_file_xchg_range(
	struct file		*file1,
	struct file		*file2,
	struct xfs_exch_range	*fxr)
{
	struct inode		*inode1 = file_inode(file1);
	struct inode		*inode2 = file_inode(file2);
	struct xfs_inode	*ip1 = XFS_I(inode1);
	struct xfs_inode	*ip2 = XFS_I(inode2);
	struct xfs_mount	*mp = ip1->i_mount;
	unsigned int		priv_flags = 0;
	bool			use_logging = false;
	int			error;

	if (xfs_is_shutdown(mp))
		return -EIO;

	/* Update cmtime if the fd/inode don't forbid it. */
	if (likely(!(file1->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode1)))
		priv_flags |= XFS_XCHG_RANGE_UPD_CMTIME1;
	if (likely(!(file2->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode2)))
		priv_flags |= XFS_XCHG_RANGE_UPD_CMTIME2;

	/* Lock both files against IO */
	error = xfs_ilock2_io_mmap(ip1, ip2);
	if (error)
		goto out_err;

	/* Prepare and then exchange file contents. */
	error = xfs_xchg_range_prep(file1, file2, fxr, priv_flags);
	if (error)
		goto out_unlock;

	/* Get permission to use log-assisted file content swaps. */
	error = xfs_xchg_range_grab_log_assist(mp,
			!(fxr->flags & XFS_EXCH_RANGE_NONATOMIC),
			&use_logging);
	if (error)
		goto out_unlock;
	if (use_logging)
		priv_flags |= XFS_XCHG_RANGE_LOGGED;

	error = xfs_xchg_range(ip1, ip2, fxr, priv_flags);
	if (error)
		goto out_drop_feat;

	/*
	 * Finish the exchange by removing special file privileges like any
	 * other file write would do.  This may involve turning on support for
	 * logged xattrs if either file has security capabilities, which means
	 * xfs_xchg_range_grab_log_assist before xfs_attr_grab_log_assist.
	 */
	error = xfs_exch_range_finish(file1, file2);
	if (error)
		goto out_drop_feat;

	/*
	 * NOTE(review): the goto above targets the very next label, so
	 * success and failure fall through identically here; the error value
	 * is propagated via the trace/return below either way.
	 */
out_drop_feat:
	if (use_logging)
		xfs_xchg_range_rele_log_assist(mp);
out_unlock:
	xfs_iunlock2_io_mmap(ip1, ip2);
out_err:
	if (error)
		trace_xfs_file_xchg_range_error(ip2, error, _RET_IP_);
	return error;
}
433 :
434 : /* Lock (and optionally join) two inodes for a file range exchange. */
435 : void
436 8686402 : xfs_xchg_range_ilock(
437 : struct xfs_trans *tp,
438 : struct xfs_inode *ip1,
439 : struct xfs_inode *ip2)
440 : {
441 8686402 : if (ip1 != ip2)
442 5027087 : xfs_lock_two_inodes(ip1, XFS_ILOCK_EXCL,
443 : ip2, XFS_ILOCK_EXCL);
444 : else
445 3659315 : xfs_ilock(ip1, XFS_ILOCK_EXCL);
446 8682289 : if (tp) {
447 6726874 : xfs_trans_ijoin(tp, ip1, 0);
448 6733109 : if (ip2 != ip1)
449 4903443 : xfs_trans_ijoin(tp, ip2, 0);
450 : }
451 :
452 8689013 : }
453 :
454 : /* Unlock two inodes after a file range exchange operation. */
455 : void
456 3817240 : xfs_xchg_range_iunlock(
457 : struct xfs_inode *ip1,
458 : struct xfs_inode *ip2)
459 : {
460 3817240 : if (ip2 != ip1)
461 157926 : xfs_iunlock(ip2, XFS_ILOCK_EXCL);
462 3817241 : xfs_iunlock(ip1, XFS_ILOCK_EXCL);
463 3817241 : }
464 :
465 : /*
466 : * Estimate the resource requirements to exchange file contents between the two
467 : * files. The caller is required to hold the IOLOCK and the MMAPLOCK and to
468 : * have flushed both inodes' pagecache and active direct-ios.
469 : */
470 : int
471 1955380 : xfs_xchg_range_estimate(
472 : struct xfs_swapext_req *req)
473 : {
474 1955380 : int error;
475 :
476 1955380 : xfs_xchg_range_ilock(NULL, req->ip1, req->ip2);
477 1955383 : error = xfs_swapext_estimate(req);
478 1955381 : xfs_xchg_range_iunlock(req->ip1, req->ip2);
479 1955377 : return error;
480 : }
481 :
/*
 * We need to check that the format of the data fork in the temporary inode is
 * valid for the target inode before doing the swap. This is not a problem with
 * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
 * data fork depending on the space the attribute fork is taking so we can get
 * invalid formats on the target inode.
 *
 * E.g. target has space for 7 extents in extent format, temp inode only has
 * space for 6. If we defragment down to 7 extents, then the tmp format is a
 * btree, but when swapped it needs to be in extent format. Hence we can't just
 * blindly swap data forks on attr2 filesystems.
 *
 * Note that we check the swap in both directions so that we don't end up with
 * a corrupt temporary inode, either.
 *
 * Note that fixing the way xfs_fsr sets up the attribute fork in the source
 * inode will prevent this situation from occurring, so all we do here is
 * reject and log the attempt. basically we are putting the responsibility on
 * userspace to get this right.
 */
STATIC int
xfs_swap_extents_check_format(
	struct xfs_inode	*ip,	/* target inode */
	struct xfs_inode	*tip)	/* tmp inode */
{
	struct xfs_ifork	*ifp = &ip->i_df;
	struct xfs_ifork	*tifp = &tip->i_df;

	/* User/group/project quota ids must match if quotas are enforced. */
	if (XFS_IS_QUOTA_ON(ip->i_mount) &&
	    (!uid_eq(VFS_I(ip)->i_uid, VFS_I(tip)->i_uid) ||
	     !gid_eq(VFS_I(ip)->i_gid, VFS_I(tip)->i_gid) ||
	     ip->i_projid != tip->i_projid))
		return -EINVAL;

	/* Should never get a local format */
	if (ifp->if_format == XFS_DINODE_FMT_LOCAL ||
	    tifp->if_format == XFS_DINODE_FMT_LOCAL)
		return -EINVAL;

	/*
	 * if the target inode has fewer extents than the temporary inode then
	 * why did userspace call us?
	 */
	if (ifp->if_nextents < tifp->if_nextents)
		return -EINVAL;

	/*
	 * If we have to use the (expensive) rmap swap method, we can
	 * handle any number of extents and any format.
	 */
	if (xfs_has_rmapbt(ip->i_mount))
		return 0;

	/*
	 * if the target inode is in extent form and the temp inode is in btree
	 * form then we will end up with the target inode in the wrong format
	 * as we already know there are less extents in the temp inode.
	 */
	if (ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
	    tifp->if_format == XFS_DINODE_FMT_BTREE)
		return -EINVAL;

	/* Check temp in extent form to max in target */
	if (tifp->if_format == XFS_DINODE_FMT_EXTENTS &&
	    tifp->if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
		return -EINVAL;

	/* Check target in extent form to max in temp */
	if (ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
	    ifp->if_nextents > XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
		return -EINVAL;

	/*
	 * If we are in a btree format, check that the temp root block will fit
	 * in the target and that it has enough extents to be in btree format
	 * in the target.
	 *
	 * Note that we have to be careful to allow btree->extent conversions
	 * (a common defrag case) which will occur when the temp inode is in
	 * extent format...
	 */
	if (tifp->if_format == XFS_DINODE_FMT_BTREE) {
		/* Temp btree root must fit within target's data fork offset. */
		if (xfs_inode_has_attr_fork(ip) &&
		    xfs_bmap_bmdr_space(tifp->if_broot) > xfs_inode_fork_boff(ip))
			return -EINVAL;
		/* Too few extents to stay in btree format after the swap. */
		if (tifp->if_nextents <= XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
			return -EINVAL;
	}

	/* Reciprocal target->temp btree format checks */
	if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
		if (xfs_inode_has_attr_fork(tip) &&
		    xfs_bmap_bmdr_space(ip->i_df.if_broot) > xfs_inode_fork_boff(tip))
			return -EINVAL;
		if (ifp->if_nextents <= XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
			return -EINVAL;
	}

	return 0;
}
583 :
584 : /*
585 : * Fix up the owners of the bmbt blocks to refer to the current inode. The
586 : * change owner scan attempts to order all modified buffers in the current
587 : * transaction. In the event of ordered buffer failure, the offending buffer is
588 : * physically logged as a fallback and the scan returns -EAGAIN. We must roll
589 : * the transaction in this case to replenish the fallback log reservation and
590 : * restart the scan. This process repeats until the scan completes.
591 : */
592 : static int
593 1159 : xfs_swap_change_owner(
594 : struct xfs_trans **tpp,
595 : struct xfs_inode *ip,
596 : struct xfs_inode *tmpip)
597 : {
598 1159 : int error;
599 1159 : struct xfs_trans *tp = *tpp;
600 :
601 4255 : do {
602 2707 : error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, ip->i_ino,
603 : NULL);
604 : /* success or fatal error */
605 2707 : if (error != -EAGAIN)
606 : break;
607 :
608 1548 : error = xfs_trans_roll(tpp);
609 1548 : if (error)
610 : break;
611 1548 : tp = *tpp;
612 :
613 : /*
614 : * Redirty both inodes so they can relog and keep the log tail
615 : * moving forward.
616 : */
617 1548 : xfs_trans_ijoin(tp, ip, 0);
618 1548 : xfs_trans_ijoin(tp, tmpip, 0);
619 1548 : xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
620 1548 : xfs_trans_log_inode(tp, tmpip, XFS_ILOG_CORE);
621 : } while (true);
622 :
623 1159 : return error;
624 : }
625 :
/*
 * Swap the extents of two files by swapping data forks.
 *
 * Swaps the in-core data forks, block counts, delalloc counts, reflink
 * flags, and cow forks of req->ip1 (temporary) and req->ip2 (target), then
 * logs both inodes.  For v3 inodes whose data fork is in btree format, the
 * bmbt owner change is logged and the bmbt blocks are rewritten last via
 * xfs_swap_change_owner, which may roll *tpp.  Returns 0 or a negative
 * errno; on error the transaction state is whatever the failed step left.
 */
STATIC int
xfs_swap_extent_forks(
	struct xfs_trans	**tpp,
	struct xfs_swapext_req	*req)
{
	struct xfs_inode	*ip = req->ip2;
	struct xfs_inode	*tip = req->ip1;
	xfs_filblks_t		aforkblks = 0;
	xfs_filblks_t		taforkblks = 0;
	xfs_extnum_t		junk;
	uint64_t		tmp;
	int			src_log_flags = XFS_ILOG_CORE;
	int			target_log_flags = XFS_ILOG_CORE;
	int			error;

	/*
	 * Count the number of extended attribute blocks, so that the attr
	 * fork's contribution can be subtracted out of each i_nblocks below.
	 */
	if (xfs_inode_has_attr_fork(ip) && ip->i_af.if_nextents > 0 &&
	    ip->i_af.if_format != XFS_DINODE_FMT_LOCAL) {
		error = xfs_bmap_count_blocks(*tpp, ip, XFS_ATTR_FORK, &junk,
				&aforkblks);
		if (error)
			return error;
	}
	if (xfs_inode_has_attr_fork(tip) && tip->i_af.if_nextents > 0 &&
	    tip->i_af.if_format != XFS_DINODE_FMT_LOCAL) {
		error = xfs_bmap_count_blocks(*tpp, tip, XFS_ATTR_FORK, &junk,
				&taforkblks);
		if (error)
			return error;
	}

	/*
	 * Btree format (v3) inodes have the inode number stamped in the bmbt
	 * block headers. We can't start changing the bmbt blocks until the
	 * inode owner change is logged so recovery does the right thing in the
	 * event of a crash. Set the owner change log flags now and leave the
	 * bmbt scan as the last step.
	 */
	if (xfs_has_v3inodes(ip->i_mount)) {
		if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE)
			target_log_flags |= XFS_ILOG_DOWNER;
		if (tip->i_df.if_format == XFS_DINODE_FMT_BTREE)
			src_log_flags |= XFS_ILOG_DOWNER;
	}

	/*
	 * Swap the data forks of the inodes
	 */
	swap(ip->i_df, tip->i_df);

	/*
	 * Fix the on-disk inode values: each inode keeps its own attr fork
	 * block count but takes the other's data fork block count.
	 */
	tmp = (uint64_t)ip->i_nblocks;
	ip->i_nblocks = tip->i_nblocks - taforkblks + aforkblks;
	tip->i_nblocks = tmp + taforkblks - aforkblks;

	/*
	 * The extents in the source inode could still contain speculative
	 * preallocation beyond EOF (e.g. the file is open but not modified
	 * while defrag is in progress). In that case, we need to copy over the
	 * number of delalloc blocks the data fork in the source inode is
	 * tracking beyond EOF so that when the fork is truncated away when the
	 * temporary inode is unlinked we don't underrun the i_delayed_blks
	 * counter on that inode.
	 */
	ASSERT(tip->i_delayed_blks == 0);
	tip->i_delayed_blks = ip->i_delayed_blks;
	ip->i_delayed_blks = 0;

	/* Pick the data-fork log flags matching the post-swap formats. */
	switch (ip->i_df.if_format) {
	case XFS_DINODE_FMT_EXTENTS:
		src_log_flags |= XFS_ILOG_DEXT;
		break;
	case XFS_DINODE_FMT_BTREE:
		ASSERT(!xfs_has_v3inodes(ip->i_mount) ||
		       (src_log_flags & XFS_ILOG_DOWNER));
		src_log_flags |= XFS_ILOG_DBROOT;
		break;
	}

	switch (tip->i_df.if_format) {
	case XFS_DINODE_FMT_EXTENTS:
		target_log_flags |= XFS_ILOG_DEXT;
		break;
	case XFS_DINODE_FMT_BTREE:
		target_log_flags |= XFS_ILOG_DBROOT;
		ASSERT(!xfs_has_v3inodes(ip->i_mount) ||
		       (target_log_flags & XFS_ILOG_DOWNER));
		break;
	}

	/* Do we have to swap reflink flags? */
	if ((ip->i_diflags2 & XFS_DIFLAG2_REFLINK) ^
	    (tip->i_diflags2 & XFS_DIFLAG2_REFLINK)) {
		uint64_t	f;

		f = ip->i_diflags2 & XFS_DIFLAG2_REFLINK;
		ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
		ip->i_diflags2 |= tip->i_diflags2 & XFS_DIFLAG2_REFLINK;
		tip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
		tip->i_diflags2 |= f & XFS_DIFLAG2_REFLINK;
	}

	/* Swap the cow forks. */
	if (xfs_has_reflink(ip->i_mount)) {
		ASSERT(!ip->i_cowfp ||
		       ip->i_cowfp->if_format == XFS_DINODE_FMT_EXTENTS);
		ASSERT(!tip->i_cowfp ||
		       tip->i_cowfp->if_format == XFS_DINODE_FMT_EXTENTS);

		swap(ip->i_cowfp, tip->i_cowfp);

		/* Retag each inode to match its post-swap cow fork state. */
		if (ip->i_cowfp && ip->i_cowfp->if_bytes)
			xfs_inode_set_cowblocks_tag(ip);
		else
			xfs_inode_clear_cowblocks_tag(ip);
		if (tip->i_cowfp && tip->i_cowfp->if_bytes)
			xfs_inode_set_cowblocks_tag(tip);
		else
			xfs_inode_clear_cowblocks_tag(tip);
	}

	xfs_trans_log_inode(*tpp, ip, src_log_flags);
	xfs_trans_log_inode(*tpp, tip, target_log_flags);

	/*
	 * The extent forks have been swapped, but crc=1,rmapbt=0 filesystems
	 * have inode number owner values in the bmbt blocks that still refer to
	 * the old inode. Scan each bmbt to fix up the owner values with the
	 * inode number of the current inode.
	 */
	if (src_log_flags & XFS_ILOG_DOWNER) {
		error = xfs_swap_change_owner(tpp, ip, tip);
		if (error)
			return error;
	}
	if (target_log_flags & XFS_ILOG_DOWNER) {
		error = xfs_swap_change_owner(tpp, tip, ip);
		if (error)
			return error;
	}

	return 0;
}
774 :
775 : /*
776 : * There may be partially written rt extents lurking in the ranges to be
777 : * swapped. According to the rules for realtime files with big rt extents, we
778 : * must guarantee that an outside observer (an IO thread, realistically) never
779 : * can see multiple physical rt extents mapped to the same logical file rt
780 : * extent. The deferred bmap log intent items that we use under the hood
781 : * operate on single block mappings and not rt extents, which means we must
782 : * have a strategy to ensure that log recovery after a failure won't stop in
783 : * the middle of an rt extent.
784 : *
785 : * The preferred strategy is to use deferred extent swap log intent items to
786 : * track the status of the overall swap operation so that we can complete the
787 : * work during crash recovery. If that isn't possible, we fall back to
788 : * requiring the selected mappings in both forks to be aligned to rt extent
789 : * boundaries. As an aside, the old fork swap routine didn't have this
790 : * requirement, but at an extreme cost in flexibilty (full files only, and no
791 : * support if rmapbt is enabled).
792 : */
793 : static bool
794 1861311 : xfs_xchg_range_need_rt_conversion(
795 : struct xfs_inode *ip,
796 : unsigned int xchg_flags)
797 : {
798 1861311 : struct xfs_mount *mp = ip->i_mount;
799 :
800 : /*
801 : * Caller got permission to use logged swapext, so log recovery will
802 : * finish the swap and not leave us with partially swapped rt extents
803 : * exposed to userspace.
804 : */
805 1861311 : if (xchg_flags & XFS_XCHG_RANGE_LOGGED)
806 : return false;
807 :
808 : /*
809 : * If we can't use log intent items at all, the only supported
810 : * operation is full fork swaps, so no conversions are needed.
811 : * The range requirements are enforced by the swapext code itself.
812 : */
813 1864790 : if (!xfs_swapext_supported(mp))
814 : return false;
815 :
816 : /* Conversion is only needed for realtime files with big rt extents */
817 1857832 : return xfs_inode_has_bigrtextents(ip);
818 : }
819 :
/*
 * Check the alignment of an exchange request when the allocation unit size
 * isn't a power of two.  The VFS helpers use (fast) bitmask-based alignment
 * checks, but here we have to use slow long division.
 *
 * This mirrors the alignment portions of xfs_exch_range_checks, but with
 * the realtime extent size as the alignment unit.  Returns 0 if the request
 * is acceptably aligned, or a negative errno.
 */
static int
xfs_xchg_range_check_rtalign(
	struct xfs_inode		*ip1,
	struct xfs_inode		*ip2,
	const struct xfs_exch_range	*fxr)
{
	struct xfs_mount	*mp = ip1->i_mount;
	uint32_t		rextbytes;
	uint64_t		length = fxr->length;
	uint64_t		blen;
	loff_t			size1, size2;

	rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
	size1 = i_size_read(VFS_I(ip1));
	size2 = i_size_read(VFS_I(ip2));

	/* The start of both ranges must be aligned to a rt extent. */
	if (!isaligned_64(fxr->file1_offset, rextbytes) ||
	    !isaligned_64(fxr->file2_offset, rextbytes))
		return -EINVAL;

	/*
	 * If the caller asked for full files, check that the offset/length
	 * values cover all of both files.
	 */
	if ((fxr->flags & XFS_EXCH_RANGE_FULL_FILES) &&
	    (fxr->file1_offset != 0 || fxr->file2_offset != 0 ||
	     fxr->length != size1 || fxr->length != size2))
		return -EDOM;

	/* Exchanging to EOF covers everything to the end of both files. */
	if (fxr->flags & XFS_EXCH_RANGE_TO_EOF)
		length = max_t(int64_t, size1 - fxr->file1_offset,
					size2 - fxr->file2_offset);

	/*
	 * If the user wanted us to exchange up to the infile's EOF, round up
	 * to the next rt extent boundary for this check.  Do the same for the
	 * outfile.
	 *
	 * Otherwise, reject the range length if it's not rt extent aligned.
	 * We already confirmed the starting offsets' rt extent block
	 * alignment.
	 */
	if (fxr->file1_offset + length == size1)
		blen = roundup_64(size1, rextbytes) - fxr->file1_offset;
	else if (fxr->file2_offset + length == size2)
		blen = roundup_64(size2, rextbytes) - fxr->file2_offset;
	else if (!isaligned_64(length, rextbytes))
		return -EINVAL;
	else
		blen = length;

	/* Don't allow overlapped exchanges within the same file. */
	if (ip1 == ip2 &&
	    fxr->file2_offset + blen > fxr->file1_offset &&
	    fxr->file1_offset + blen > fxr->file2_offset)
		return -EINVAL;

	/*
	 * Ensure that we don't exchange a partial EOF rt extent into the
	 * middle of another file.  An rt-extent-multiple length has no
	 * partial tail and is always ok.
	 */
	if (isaligned_64(length, rextbytes))
		return 0;

	/*
	 * The length has a partial tail rt extent; only allow it when the
	 * tail reaches EOF in both files, i.e. rounding down changes nothing.
	 */
	blen = length;
	if (fxr->file2_offset + length < size2)
		blen = rounddown_64(blen, rextbytes);

	if (fxr->file1_offset + blen < size1)
		blen = rounddown_64(blen, rextbytes);

	return blen == length ? 0 : -EINVAL;
}
899 :
900 : /* Prepare two files to have their data exchanged. */
 : /*
 : * Returns 0 or a negative errno. On success (with a nonzero length) both
 : * files' dquots are attached, the affected page cache ranges have been
 : * flushed, CoW fork preallocations in those ranges have been cancelled,
 : * and -- when the rt conversion predicate says so -- unwritten sub-extent
 : * mappings have been converted so the swap can proceed.
 : */
901 : int
902 1862074 : xfs_xchg_range_prep(
903 : struct file *file1,
904 : struct file *file2,
905 : struct xfs_exch_range *fxr,
906 : unsigned int xchg_flags)
907 : {
908 1862074 : struct xfs_inode *ip1 = XFS_I(file_inode(file1));
909 1862074 : struct xfs_inode *ip2 = XFS_I(file_inode(file2));
 : /* Fundamental allocation unit of file2, used for alignment checks. */
910 1862074 : unsigned int alloc_unit = xfs_inode_alloc_unitsize(ip2);
911 1862065 : int error;
912 :
913 1862065 : trace_xfs_xchg_range_prep(ip1, fxr, ip2, 0);
914 :
915 : /* Verify both files are either real-time or non-realtime */
916 4288982 : if (XFS_IS_REALTIME_INODE(ip1) != XFS_IS_REALTIME_INODE(ip2))
917 : return -EINVAL;
918 :
919 : /* Check non-power of two alignment issues, if necessary. */
920 2510680 : if (XFS_IS_REALTIME_INODE(ip2) && !is_power_of_2(alloc_unit)) {
921 127480 : error = xfs_xchg_range_check_rtalign(ip1, ip2, fxr);
922 127481 : if (error)
923 : return error;
924 :
925 : /* Do the VFS checks with the regular block alignment. */
926 127467 : alloc_unit = ip1->i_mount->m_sb.sb_blocksize;
927 : }
928 :
 : /* Generic (VFS-level) prep; a zero length afterwards means nothing to do. */
929 1862055 : error = xfs_exch_range_prep(file1, file2, fxr, alloc_unit);
930 1862065 : if (error || fxr->length == 0)
931 : return error;
932 :
933 : /* Attach dquots to both inodes before changing block maps. */
934 1861314 : error = xfs_qm_dqattach(ip2);
935 1861309 : if (error)
936 : return error;
937 1861309 : error = xfs_qm_dqattach(ip1);
938 1861304 : if (error)
939 : return error;
940 :
941 1861303 : trace_xfs_xchg_range_flush(ip1, fxr, ip2, 0);
942 :
943 : /* Flush the relevant ranges of both files. */
944 1861305 : error = xfs_flush_unmap_range(ip2, fxr->file2_offset, fxr->length);
945 1861304 : if (error)
946 : return error;
947 1861303 : error = xfs_flush_unmap_range(ip1, fxr->file1_offset, fxr->length);
948 1861309 : if (error)
949 : return error;
950 :
951 : /*
952 : * Cancel CoW fork preallocations for the ranges of both files. The
953 : * prep function should have flushed all the dirty data, so the only
954 : * extents remaining should be speculative.
955 : */
956 3722618 : if (xfs_inode_has_cow_data(ip1)) {
957 960691 : error = xfs_reflink_cancel_cow_range(ip1, fxr->file1_offset,
958 960691 : fxr->length, true);
959 960691 : if (error)
960 : return error;
961 : }
962 :
963 3722618 : if (xfs_inode_has_cow_data(ip2)) {
964 953686 : error = xfs_reflink_cancel_cow_range(ip2, fxr->file2_offset,
965 953686 : fxr->length, true);
966 953686 : if (error)
967 : return error;
968 : }
969 :
970 : /* Convert unwritten sub-extent mappings if required. */
971 1861309 : if (xfs_xchg_range_need_rt_conversion(ip2, xchg_flags)) {
972 186609 : error = xfs_rtfile_convert_unwritten(ip2, fxr->file2_offset,
973 : fxr->length);
974 186610 : if (error)
975 : return error;
976 :
977 186610 : error = xfs_rtfile_convert_unwritten(ip1, fxr->file1_offset,
978 : fxr->length);
979 186611 : if (error)
980 0 : return error;
981 : }
982 :
983 : return 0;
984 : }
985 :
 : /*
 : * Bits for the qretry mask reported by xfs_xchg_range_reserve_quota:
 : * they tell the caller which inode's quota reservation failed with
 : * EDQUOT/ENOSPC, so it can run blockgc against that inode and retry.
 : */
986 : #define QRETRY_IP1 (0x1)
987 : #define QRETRY_IP2 (0x2)
988 :
989 : /*
990 : * Obtain a quota reservation to make sure we don't hit EDQUOT. We can skip
991 : * this if quota enforcement is disabled or if both inodes' dquots are the
992 : * same. The qretry structure must be initialized to zeroes before the first
993 : * call to this function.
 : *
 : * Returns 0 on success or a negative errno. On -EDQUOT/-ENOSPC, bits in
 : * @qretry (QRETRY_IP1/QRETRY_IP2) identify which inode's reservation
 : * failed so the caller can free speculative blocks and retry once.
994 : */
995 : STATIC int
996 1861801 : xfs_xchg_range_reserve_quota(
997 : struct xfs_trans *tp,
998 : const struct xfs_swapext_req *req,
999 : unsigned int *qretry)
1000 : {
1001 1861801 : int64_t ddelta, rdelta;
1002 1861801 : int ip1_error = 0;
1003 1861801 : int error;
1004 :
1005 : /*
1006 : * Don't bother with a quota reservation if we're not enforcing them
1007 : * or the two inodes have the same dquots.
1008 : */
1009 1861801 : if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 ||
1010 29077 : (req->ip1->i_udquot == req->ip2->i_udquot &&
1011 29004 : req->ip1->i_gdquot == req->ip2->i_gdquot &&
1012 28977 : req->ip1->i_pdquot == req->ip2->i_pdquot))
1013 : return 0;
1014 :
1015 100 : *qretry = 0;
1016 :
1017 : /*
1018 : * For each file, compute the net gain in the number of regular blocks
1019 : * that will be mapped into that file and reserve that much quota. The
1020 : * quota counts must be able to absorb at least that much space.
1021 : */
1022 100 : ddelta = req->ip2_bcount - req->ip1_bcount;
1023 100 : rdelta = req->ip2_rtbcount - req->ip1_rtbcount;
1024 100 : if (ddelta > 0 || rdelta > 0) {
1025 42 : error = xfs_trans_reserve_quota_nblks(tp, req->ip1,
1026 : ddelta > 0 ? ddelta : 0,
1027 : rdelta > 0 ? rdelta : 0,
1028 : false);
1029 42 : if (error == -EDQUOT || error == -ENOSPC) {
1030 : /*
1031 : * Save this error and see what happens if we try to
1032 : * reserve quota for ip2. Then report both.
1033 : */
1034 22 : *qretry |= QRETRY_IP1;
1035 22 : ip1_error = error;
 : /* Clear so the ip2 reservation below still runs. */
1036 22 : error = 0;
1037 : }
1038 42 : if (error)
1039 : return error;
1040 : }
1041 100 : if (ddelta < 0 || rdelta < 0) {
1042 9 : error = xfs_trans_reserve_quota_nblks(tp, req->ip2,
1043 : ddelta < 0 ? -ddelta : 0,
1044 : rdelta < 0 ? -rdelta : 0,
1045 : false);
1046 9 : if (error == -EDQUOT || error == -ENOSPC)
1047 0 : *qretry |= QRETRY_IP2;
1048 9 : if (error)
1049 : return error;
1050 : }
 : /* Report the deferred ip1 failure now that ip2 has been probed too. */
1051 100 : if (ip1_error)
1052 : return ip1_error;
1053 :
1054 : /*
1055 : * For each file, forcibly reserve the gross gain in mapped blocks so
1056 : * that we don't trip over any quota block reservation assertions.
1057 : * We must reserve the gross gain because the quota code subtracts from
1058 : * bcount the number of blocks that we unmap; it does not add that
1059 : * quantity back to the quota block reservation.
1060 : */
1061 78 : error = xfs_trans_reserve_quota_nblks(tp, req->ip1, req->ip1_bcount,
1062 78 : req->ip1_rtbcount, true);
1063 78 : if (error)
1064 : return error;
1065 :
1066 78 : return xfs_trans_reserve_quota_nblks(tp, req->ip2, req->ip2_bcount,
1067 78 : req->ip2_rtbcount, true);
1068 : }
1069 :
1070 : /*
1071 : * Get permission to use log-assisted atomic exchange of file extents.
1072 : *
1073 : * Callers must hold the IOLOCK and MMAPLOCK of both files. They must not be
1074 : * running any transactions or hold any ILOCKS. If @use_logging is set after a
1075 : * successful return, callers must call xfs_xchg_range_rele_log_assist after
1076 : * the exchange is completed.
 : *
 : * Three outcomes are possible:
 : * - 0 with *use_logging == true: log-assisted swap may be used and the
 : * log-incompat reference is held;
 : * - 0 with *use_logging == false: caller did not @force logging and
 : * must fall back to a non-logged strategy;
 : * - -EOPNOTSUPP (or the error from enabling the log feature) with
 : * *use_logging == false.
1077 : */
1078 : int
1079 6818952 : xfs_xchg_range_grab_log_assist(
1080 : struct xfs_mount *mp,
1081 : bool force,
1082 : bool *use_logging)
1083 : {
1084 6818952 : int error = 0;
1085 :
1086 : /*
1087 : * Protect ourselves from an idle log clearing the atomic swapext
1088 : * log incompat feature bit.
1089 : */
1090 6818952 : xlog_use_incompat_feat(mp->m_log, XLOG_INCOMPAT_FEAT_SWAPEXT);
1091 6814820 : *use_logging = true;
1092 :
1093 : /*
1094 : * If log-assisted swapping is already enabled, the caller can use the
1095 : * log assisted swap functions with the log-incompat reference we got.
1096 : */
1097 13629640 : if (xfs_sb_version_haslogswapext(&mp->m_sb))
1098 : return 0;
1099 :
1100 : /*
1101 : * If the caller doesn't /require/ log-assisted swapping, drop the
1102 : * log-incompat feature protection and exit. The caller cannot use
1103 : * log assisted swapping.
1104 : */
1105 57660 : if (!force)
1106 31739 : goto drop_incompat;
1107 :
1108 : /*
1109 : * Caller requires log-assisted swapping but the fs feature set isn't
1110 : * rich enough to support it. Bail out.
1111 : */
1112 26066 : if (!xfs_swapext_supported(mp)) {
1113 139 : error = -EOPNOTSUPP;
1114 139 : goto drop_incompat;
1115 : }
1116 :
 : /* Turn on the incompat log feature bit ondisk; warn once per mount. */
1117 25782 : error = xfs_add_incompat_log_feature(mp,
1118 : XFS_SB_FEAT_INCOMPAT_LOG_SWAPEXT);
1119 25793 : if (error)
1120 0 : goto drop_incompat;
1121 :
1122 25793 : xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SWAPEXT,
1123 : "EXPERIMENTAL atomic file range swap feature in use. Use at your own risk!");
1124 :
1125 : return 0;
1126 31878 : drop_incompat:
1127 31878 : xlog_drop_incompat_feat(mp->m_log, XLOG_INCOMPAT_FEAT_SWAPEXT);
1128 31878 : *use_logging = false;
1129 31878 : return error;
1130 : }
1131 :
1132 : /* Release permission to use log-assisted extent swapping. */
 : /*
 : * Pairs with xfs_xchg_range_grab_log_assist; call only when that
 : * function returned with *use_logging set to true.
 : */
1133 : void
1134 4953777 : xfs_xchg_range_rele_log_assist(
1135 : struct xfs_mount *mp)
1136 : {
1137 6783827 : xlog_drop_incompat_feat(mp->m_log, XLOG_INCOMPAT_FEAT_SWAPEXT);
1138 1830067 : }
1139 :
1140 : /* Decide if we can use the old data fork exchange code. */
 : /*
 : * True only for a non-atomic, whole-file swap: NONATOMIC and FULL_FILES
 : * set, TO_EOF clear, both offsets zero, and the request length equal to
 : * both files' on-disk sizes. This is the legacy defrag-style fork swap.
 : */
1141 : static inline bool
1142 3510 : xfs_xchg_use_forkswap(
1143 : const struct xfs_exch_range *fxr,
1144 : struct xfs_inode *ip1,
1145 : struct xfs_inode *ip2)
1146 : {
1147 3510 : if (!(fxr->flags & XFS_EXCH_RANGE_NONATOMIC))
1148 : return false;
1149 3510 : if (!(fxr->flags & XFS_EXCH_RANGE_FULL_FILES))
1150 : return false;
1151 3465 : if (fxr->flags & XFS_EXCH_RANGE_TO_EOF)
1152 : return false;
1153 3465 : if (fxr->file1_offset != 0 || fxr->file2_offset != 0)
1154 : return false;
1155 3465 : if (fxr->length != ip1->i_disk_size)
1156 : return false;
1157 3465 : if (fxr->length != ip2->i_disk_size)
1158 0 : return false;
1159 : return true;
1160 : }
1161 :
 : /* Mechanisms available to xfs_xchg_range for moving the file contents. */
1162 : enum xchg_strategy {
1163 : SWAPEXT = 1, /* xfs_swapext() */
1164 : FORKSWAP = 2, /* exchange forks */
1165 : };
1166 :
1167 : /* Exchange the contents of two files. */
 : /*
 : * Returns 0 or a negative errno. Retries once on -EDQUOT/-ENOSPC after
 : * freeing speculative preallocations via blockgc. ILOCKs are taken here
 : * via xfs_xchg_range_ilock and dropped before returning.
 : *
 : * NOTE(review): presumably the caller already holds the IOLOCK/MMAPLOCK
 : * of both files (see xfs_xchg_range_grab_log_assist's locking comment)
 : * -- confirm against the callers.
 : */
1168 : int
1169 1861811 : xfs_xchg_range(
1170 : struct xfs_inode *ip1,
1171 : struct xfs_inode *ip2,
1172 : const struct xfs_exch_range *fxr,
1173 : unsigned int xchg_flags)
1174 : {
1175 1861811 : struct xfs_mount *mp = ip1->i_mount;
1176 7447244 : struct xfs_swapext_req req = {
1177 : .ip1 = ip1,
1178 : .ip2 = ip2,
1179 : .whichfork = XFS_DATA_FORK,
1180 1861811 : .startoff1 = XFS_B_TO_FSBT(mp, fxr->file1_offset),
1181 1861811 : .startoff2 = XFS_B_TO_FSBT(mp, fxr->file2_offset),
1182 1861811 : .blockcount = XFS_B_TO_FSB(mp, fxr->length),
1183 : };
1184 1861811 : struct xfs_trans *tp;
 : /*
 : * NOTE(review): qretry is only written by
 : * xfs_xchg_range_reserve_quota on the enforced-quota path; the
 : * EDQUOT/ENOSPC retry below relies on that ordering -- confirm.
 : */
1185 1861811 : unsigned int qretry;
1186 1861811 : unsigned int flags = 0;
1187 1861811 : bool retried = false;
1188 1861811 : enum xchg_strategy strategy;
1189 1861811 : int error;
1190 :
1191 1861811 : trace_xfs_xchg_range(ip1, fxr, ip2, xchg_flags);
1192 :
 : /* Translate caller flags into swapext request flags. */
1193 1861803 : if (fxr->flags & XFS_EXCH_RANGE_TO_EOF)
1194 335 : req.req_flags |= XFS_SWAP_REQ_SET_SIZES;
1195 1861803 : if (fxr->flags & XFS_EXCH_RANGE_FILE1_WRITTEN)
1196 64 : req.req_flags |= XFS_SWAP_REQ_INO1_WRITTEN;
1197 1861803 : if (xchg_flags & XFS_XCHG_RANGE_LOGGED)
1198 1830063 : req.req_flags |= XFS_SWAP_REQ_LOGGED;
1199 :
1200 : /*
1201 : * Round the request length up to the nearest fundamental unit of
1202 : * allocation. The prep function already checked that the request
1203 : * offsets and length in @fxr are safe to round up.
1204 : */
1205 1861803 : if (XFS_IS_REALTIME_INODE(ip2))
1206 648475 : req.blockcount = xfs_rtb_roundup_rtx(mp, req.blockcount);
1207 :
 : /* Compute resource estimates (req.resblks et al.) for the swap. */
1208 1861801 : error = xfs_xchg_range_estimate(&req);
1209 1861809 : if (error)
1210 : return error;
1211 :
1212 : /*
1213 : * We haven't decided which exchange strategy we want to use yet, but
1214 : * here we must choose if we want freed blocks during the swap to be
1215 : * added to the transaction block reservation (RES_FDBLKS) or freed
1216 : * into the global fdblocks. The legacy fork swap mechanism doesn't
1217 : * free any blocks, so it doesn't require it. It is also the only
1218 : * option that works for older filesystems.
1219 : *
1220 : * The bmap log intent items that were added with rmap and reflink can
1221 : * change the bmbt shape, so the intent-based swap strategies require
1222 : * us to set RES_FDBLKS.
1223 : */
1224 1861799 : if (xfs_has_lazysbcount(mp))
1225 1861799 : flags |= XFS_TRANS_RES_FDBLKS;
1226 :
1227 1861799 : retry:
1228 : /* Allocate the transaction, lock the inodes, and join them. */
1229 1861810 : error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, req.resblks, 0,
1230 : flags, &tp);
1231 1861807 : if (error)
1232 1 : return error;
1233 :
1234 1861806 : xfs_xchg_range_ilock(tp, ip1, ip2);
1235 :
1236 1861813 : trace_xfs_swap_extent_before(ip2, 0);
1237 1861804 : trace_xfs_swap_extent_before(ip1, 1);
1238 :
1239 1861805 : if (fxr->flags & XFS_EXCH_RANGE_FILE2_FRESH)
1240 32091 : trace_xfs_xchg_range_freshness(ip2, fxr);
1241 :
1242 : /*
1243 : * Now that we've excluded all other inode metadata changes by taking
1244 : * the ILOCK, repeat the freshness check.
1245 : */
1246 1861805 : error = xfs_exch_range_check_fresh(VFS_I(ip2), fxr);
1247 1861805 : if (error)
1248 0 : goto out_trans_cancel;
1249 :
1250 1861805 : error = xfs_swapext_check_extents(mp, &req);
1251 1861799 : if (error)
1252 0 : goto out_trans_cancel;
1253 :
1254 : /*
1255 : * Reserve ourselves some quota if any of them are in enforcing mode.
1256 : * In theory we only need enough to satisfy the change in the number
1257 : * of blocks between the two ranges being remapped.
1258 : */
1259 1861799 : error = xfs_xchg_range_reserve_quota(tp, &req, &qretry);
1260 1861785 : if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
 : /*
 : * Out of quota/space: drop the transaction and locks, scrub
 : * speculative preallocations off the flagged inode(s), and
 : * try the whole sequence exactly once more.
 : */
1261 11 : xfs_trans_cancel(tp);
1262 11 : xfs_xchg_range_iunlock(ip1, ip2);
1263 11 : if (qretry & QRETRY_IP1)
1264 11 : xfs_blockgc_free_quota(ip1, 0);
1265 11 : if (qretry & QRETRY_IP2)
1266 0 : xfs_blockgc_free_quota(ip2, 0);
1267 11 : retried = true;
1268 11 : goto retry;
1269 : }
1270 1861774 : if (error)
1271 11 : goto out_trans_cancel;
1272 :
 : /* Pick the exchange strategy: intent-based swapext or legacy forkswap. */
1273 1893480 : if ((xchg_flags & XFS_XCHG_RANGE_LOGGED) || xfs_swapext_supported(mp)) {
1274 : /*
1275 : * xfs_swapext() uses deferred bmap log intent items to swap
1276 : * extents between file forks. If the atomic log swap feature
1277 : * is enabled, it will also use swapext log intent items to
1278 : * restart the operation in case of failure.
1279 : *
1280 : * This means that we can use it if we previously obtained
1281 : * permission from the log to use log-assisted atomic extent
1282 : * swapping; or if the fs supports rmap or reflink and the
1283 : * user said NONATOMIC.
1284 : */
1285 : strategy = SWAPEXT;
1286 3510 : } else if (xfs_xchg_use_forkswap(fxr, ip1, ip2)) {
1287 : /*
1288 : * Exchange the file contents by using the old bmap fork
1289 : * exchange code, if we're a defrag tool doing a full file
1290 : * swap.
1291 : */
1292 3465 : strategy = FORKSWAP;
1293 :
1294 3465 : error = xfs_swap_extents_check_format(ip2, ip1);
1295 3465 : if (error) {
1296 1 : xfs_notice(mp,
1297 : "%s: inode 0x%llx format is incompatible for exchanging.",
1298 : __func__, ip2->i_ino);
1299 1 : goto out_trans_cancel;
1300 : }
1301 : } else {
1302 : /* We cannot exchange the file contents. */
1303 45 : error = -EOPNOTSUPP;
1304 45 : goto out_trans_cancel;
1305 : }
1306 :
1307 : /* If we got this far on a dry run, all parameters are ok. */
1308 1861717 : if (fxr->flags & XFS_EXCH_RANGE_DRY_RUN)
1309 480 : goto out_trans_cancel;
1310 :
1311 : /* Update the mtime and ctime of both files. */
1312 1861237 : if (xchg_flags & XFS_XCHG_RANGE_UPD_CMTIME1)
1313 1861247 : xfs_trans_ichgtime(tp, ip1,
1314 : XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1315 1861237 : if (xchg_flags & XFS_XCHG_RANGE_UPD_CMTIME2)
1316 1831943 : xfs_trans_ichgtime(tp, ip2,
1317 : XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1318 :
1319 1861242 : if (strategy == SWAPEXT) {
1320 1857779 : xfs_swapext(tp, &req);
1321 : } else {
1322 3463 : error = xfs_swap_extent_forks(&tp, &req);
1323 3463 : if (error)
1324 0 : goto out_trans_cancel;
1325 : }
1326 :
1327 : /*
1328 : * Force the log to persist metadata updates if the caller or the
1329 : * administrator requires this. The VFS prep function already flushed
1330 : * the relevant parts of the page cache.
1331 : */
1332 1861256 : if (xfs_has_wsync(mp) || (fxr->flags & XFS_EXCH_RANGE_FSYNC))
1333 24468 : xfs_trans_set_sync(tp);
1334 :
1335 1861256 : error = xfs_trans_commit(tp);
1336 :
1337 1861266 : trace_xfs_swap_extent_after(ip2, 0);
1338 1861263 : trace_xfs_swap_extent_after(ip1, 1);
1339 :
1340 1861260 : if (error)
1341 52 : goto out_unlock;
1342 :
1343 : /*
1344 : * If the caller wanted us to exchange the contents of two complete
1345 : * files of unequal length, exchange the incore sizes now. This should
1346 : * be safe because we flushed both files' page caches, moved all the
1347 : * extents, and updated the ondisk sizes.
1348 : */
1349 1861208 : if (fxr->flags & XFS_EXCH_RANGE_TO_EOF) {
1350 326 : loff_t temp;
1351 :
1352 326 : temp = i_size_read(VFS_I(ip2));
1353 326 : i_size_write(VFS_I(ip2), i_size_read(VFS_I(ip1)));
1354 326 : i_size_write(VFS_I(ip1), temp);
1355 : }
1356 :
1357 1860882 : out_unlock:
1358 1861797 : xfs_xchg_range_iunlock(ip1, ip2);
1359 1861797 : return error;
1360 :
 : /* Cancel the transaction, then fall through to drop the ILOCKs. */
1361 537 : out_trans_cancel:
1362 537 : xfs_trans_cancel(tp);
1363 537 : goto out_unlock;
1364 : }
|