Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0+
2 : /*
3 : * Copyright (C) 2016 Oracle. All Rights Reserved.
4 : * Author: Darrick J. Wong <darrick.wong@oracle.com>
5 : */
6 : #include "xfs.h"
7 : #include "xfs_fs.h"
8 : #include "xfs_shared.h"
9 : #include "xfs_format.h"
10 : #include "xfs_log_format.h"
11 : #include "xfs_trans_resv.h"
12 : #include "xfs_mount.h"
13 : #include "xfs_defer.h"
14 : #include "xfs_inode.h"
15 : #include "xfs_trans.h"
16 : #include "xfs_bmap.h"
17 : #include "xfs_bmap_util.h"
18 : #include "xfs_trace.h"
19 : #include "xfs_icache.h"
20 : #include "xfs_btree.h"
21 : #include "xfs_refcount_btree.h"
22 : #include "xfs_refcount.h"
23 : #include "xfs_bmap_btree.h"
24 : #include "xfs_trans_space.h"
25 : #include "xfs_bit.h"
26 : #include "xfs_alloc.h"
27 : #include "xfs_quota.h"
28 : #include "xfs_reflink.h"
29 : #include "xfs_iomap.h"
30 : #include "xfs_ag.h"
31 : #include "xfs_ag_resv.h"
32 : #include "xfs_health.h"
33 : #include "xfs_rtrefcount_btree.h"
34 : #include "xfs_rtalloc.h"
35 : #include "xfs_rtgroup.h"
36 : #include "xfs_imeta.h"
37 : #include "xfs_rtbitmap.h"
38 :
39 : /*
40 : * Copy on Write of Shared Blocks
41 : *
42 : * XFS must preserve "the usual" file semantics even when two files share
43 : * the same physical blocks. This means that a write to one file must not
44 : * alter the blocks in a different file; the way that we'll do that is
45 : * through the use of a copy-on-write mechanism. At a high level, that
46 : * means that when we want to write to a shared block, we allocate a new
47 : * block, write the data to the new block, and if that succeeds we map the
48 : * new block into the file.
49 : *
50 : * XFS provides a "delayed allocation" mechanism that defers the allocation
51 : * of disk blocks to dirty-but-not-yet-mapped file blocks as long as
52 : * possible. This reduces fragmentation by enabling the filesystem to ask
53 : * for bigger chunks less often, which is exactly what we want for CoW.
54 : *
55 : * The delalloc mechanism begins when the kernel wants to make a block
56 : * writable (write_begin or page_mkwrite). If the offset is not mapped, we
57 : * create a delalloc mapping, which is a regular in-core extent, but without
58 : * a real startblock. (For delalloc mappings, the startblock encodes both
59 : * a flag that this is a delalloc mapping, and a worst-case estimate of how
60 : * many blocks might be required to put the mapping into the BMBT.) delalloc
61 : * mappings are a reservation against the free space in the filesystem;
62 : * adjacent mappings can also be combined into fewer larger mappings.
63 : *
64 : * As an optimization, the CoW extent size hint (cowextsz) creates
65 : * outsized aligned delalloc reservations in the hope of landing out of
66 : * order nearby CoW writes in a single extent on disk, thereby reducing
67 : * fragmentation and improving future performance.
68 : *
69 : * D: --RRRRRRSSSRRRRRRRR--- (data fork)
70 : * C: ------DDDDDDD--------- (CoW fork)
71 : *
72 : * When dirty pages are being written out (typically in writepage), the
73 : * delalloc reservations are converted into unwritten mappings by
74 : * allocating blocks and replacing the delalloc mapping with real ones.
75 : * A delalloc mapping can be replaced by several unwritten ones if the
76 : * free space is fragmented.
77 : *
78 : * D: --RRRRRRSSSRRRRRRRR---
79 : * C: ------UUUUUUU---------
80 : *
81 : * We want to adapt the delalloc mechanism for copy-on-write, since the
82 : * write paths are similar. The first two steps (creating the reservation
83 : * and allocating the blocks) are exactly the same as delalloc except that
84 : * the mappings must be stored in a separate CoW fork because we do not want
85 : * to disturb the mapping in the data fork until we're sure that the write
86 : * succeeded. IO completion in this case is the process of removing the old
87 : * mapping from the data fork and moving the new mapping from the CoW fork to
88 : * the data fork. This will be discussed shortly.
89 : *
90 : * For now, unaligned directio writes will be bounced back to the page cache.
91 : * Block-aligned directio writes will use the same mechanism as buffered
92 : * writes.
93 : *
94 : * Just prior to submitting the actual disk write requests, we convert
95 : * the extents representing the range of the file actually being written
96 : * (as opposed to extra pieces created for the cowextsize hint) to real
97 : * extents. This will become important in the next step:
98 : *
99 : * D: --RRRRRRSSSRRRRRRRR---
100 : * C: ------UUrrUUU---------
101 : *
102 : * CoW remapping must be done after the data block write completes,
103 : * because we don't want to destroy the old data fork map until we're sure
104 : * the new block has been written. Since the new mappings are kept in a
105 : * separate fork, we can simply iterate these mappings to find the ones
106 : * that cover the file blocks that we just CoW'd. For each extent, simply
107 : * unmap the corresponding range in the data fork, map the new range into
108 : * the data fork, and remove the extent from the CoW fork. Because of
109 : * the presence of the cowextsize hint, however, we must be careful
110 : * only to remap the blocks that we've actually written out -- we must
111 : * never remap delalloc reservations nor CoW staging blocks that have
112 : * yet to be written. This corresponds exactly to the real extents in
113 : * the CoW fork:
114 : *
115 : * D: --RRRRRRrrSRRRRRRRR---
116 : * C: ------UU--UUU---------
117 : *
118 : * Since the remapping operation can be applied to an arbitrary file
119 : * range, we record the need for the remap step as a flag in the ioend
120 : * instead of declaring a new IO type. This is required for direct io
121 : * because we only have ioend for the whole dio, and we have to be able to
122 : * remember the presence of unwritten blocks and CoW blocks with a single
123 : * ioend structure. Better yet, the more ground we can cover with one
124 : * ioend, the better.
125 : */
126 :
127 : /*
128 : * Given an AG extent, find the lowest-numbered run of shared blocks
129 : * within that range and return the range in fbno/flen. If
130 : * find_end_of_shared is true, return the longest contiguous extent of
131 : * shared blocks. If there are no shared extents, fbno and flen will
132 : * be set to NULLAGBLOCK and 0, respectively.
 *
 * The AGF buffer of @pag is read (under @tp) to build a refcount btree
 * cursor for the lookup; both the cursor and the buffer are released again
 * before returning.
 *
 * Returns 0 on success, or the error from reading the AGF or from
 * xfs_refcount_find_shared().
133 : */
134 : static int
135 794556659 : xfs_reflink_find_shared(
136 : struct xfs_perag *pag,
137 : struct xfs_trans *tp,
138 : xfs_agblock_t agbno,
139 : xfs_extlen_t aglen,
140 : xfs_agblock_t *fbno,
141 : xfs_extlen_t *flen,
142 : bool find_end_of_shared)
143 : {
144 794556659 : struct xfs_buf *agbp;
145 794556659 : struct xfs_btree_cur *cur;
146 794556659 : int error;
147 :
148 794556659 : error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
149 794559022 : if (error)
150 : return error;
151 :
152 794559046 : cur = xfs_refcountbt_init_cursor(pag->pag_mount, tp, agbp, pag);
153 :
154 794556187 : error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen,
155 : find_end_of_shared);
156 :
/* The cursor is torn down whether or not the lookup succeeded. */
157 794554503 : xfs_btree_del_cursor(cur, error);
158 :
/* Release the AGF buffer obtained above before reporting the result. */
159 794560635 : xfs_trans_brelse(tp, agbp);
160 794560635 : return error;
161 : }
162 :
163 : /*
164 : * Given an RT extent, find the lowest-numbered run of shared blocks
165 : * within that range and return the range in fbno/flen. If
166 : * find_end_of_shared is true, return the longest contiguous extent of
167 : * shared blocks. If there are no shared extents, fbno and flen will
168 : * be set to NULLRGBLOCK and 0, respectively.
 *
 * The lookup uses a realtime refcount btree cursor built from @rtg and
 * rtg->rtg_refcountip, and runs with XFS_RTGLOCK_REFCOUNT held on @rtg;
 * the lock is dropped again before returning.
 *
 * Returns 0 on success or the error from xfs_refcount_find_shared().
169 : */
170 : static int
171 13464042 : xfs_reflink_find_rtshared(
172 : struct xfs_rtgroup *rtg,
173 : struct xfs_trans *tp,
174 : xfs_agblock_t rtbno,
175 : xfs_extlen_t rtlen,
176 : xfs_agblock_t *fbno,
177 : xfs_extlen_t *flen,
178 : bool find_end_of_shared)
179 : {
180 13464042 : struct xfs_mount *mp = rtg->rtg_mount;
181 13464042 : struct xfs_btree_cur *cur;
182 13464042 : int error;
183 :
/*
 * The block arguments are typed xfs_agblock_t and
 * xfs_reflink_trim_around_shared() compares the result against
 * NULLAGBLOCK, so the two "no block" sentinels must be identical.
 */
184 13464042 : BUILD_BUG_ON(NULLRGBLOCK != NULLAGBLOCK);
185 :
/*
 * NOTE(review): the lock call is given NULL rather than @tp, while the
 * cursor below is given @tp -- presumably so the lock is not attached to
 * the transaction and can be dropped explicitly here; confirm.
 */
186 13464042 : xfs_rtgroup_lock(NULL, rtg, XFS_RTGLOCK_REFCOUNT);
187 13464564 : cur = xfs_rtrefcountbt_init_cursor(mp, tp, rtg, rtg->rtg_refcountip);
188 13464118 : error = xfs_refcount_find_shared(cur, rtbno, rtlen, fbno, flen,
189 : find_end_of_shared);
/* The cursor is torn down whether or not the lookup succeeded. */
190 13464493 : xfs_btree_del_cursor(cur, error);
191 13464869 : xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_REFCOUNT);
192 13464739 : return error;
193 : }
194 :
195 : /*
196 : * Trim the mapping to the next block where there's a change in the
197 : * shared/unshared status. More specifically, this means that we
198 : * find the lowest-numbered extent of shared blocks that coincides with
199 : * the given block mapping. If the shared extent overlaps the start of
200 : * the mapping, trim the mapping to the end of the shared extent. If
201 : * the shared region intersects the mapping, trim the mapping to the
202 : * start of the shared extent. If there are no shared regions that
203 : * overlap, just return the original extent.
 *
 * Only irec->br_blockcount is ever modified. On a 0 return, *shared says
 * whether the (possibly shortened) mapping starts on shared blocks. If the
 * refcount lookup fails, its error is returned and *shared is left
 * untouched.
204 : */
205 : int
206 36507041 : xfs_reflink_trim_around_shared(
207 : struct xfs_inode *ip,
208 : struct xfs_bmbt_irec *irec,
209 : bool *shared)
210 : {
211 36507041 : struct xfs_mount *mp = ip->i_mount;
212 36507041 : xfs_agblock_t orig_bno;
213 36507041 : xfs_agblock_t fbno;
214 36507041 : xfs_extlen_t flen;
215 36507041 : int error = 0;
216 :
217 : /* Holes, unwritten, and delalloc extents cannot be shared */
218 36507041 : if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) {
219 19933716 : *shared = false;
220 19933716 : return 0;
221 : }
222 :
223 16573529 : trace_xfs_reflink_trim_around_shared(ip, irec);
224 :
/*
 * Translate the mapping's start block into a group-relative block number
 * and query the refcount data of the rtgroup (realtime inodes) or AG
 * (everything else) that the start block belongs to. No transaction is
 * passed to the lookup.
 */
225 22596663 : if (XFS_IS_REALTIME_INODE(ip)) {
226 6023139 : struct xfs_rtgroup *rtg;
227 6023139 : xfs_rgnumber_t rgno;
228 :
229 6023139 : orig_bno = xfs_rtb_to_rgbno(mp, irec->br_startblock, &rgno);
230 6023136 : rtg = xfs_rtgroup_get(mp, rgno);
231 6023126 : error = xfs_reflink_find_rtshared(rtg, NULL, orig_bno,
232 6023126 : irec->br_blockcount, &fbno, &flen, true);
233 6023161 : xfs_rtgroup_put(rtg);
234 : } else {
235 10550359 : struct xfs_perag *pag;
236 :
237 10550359 : pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp,
238 : irec->br_startblock));
239 10550343 : orig_bno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock);
240 10550343 : error = xfs_reflink_find_shared(pag, NULL, orig_bno,
241 10550343 : irec->br_blockcount, &fbno, &flen, true);
242 10550397 : xfs_perag_put(pag);
243 : }
244 16573565 : if (error)
245 : return error;
246 :
247 16573470 : *shared = false;
248 16573470 : if (fbno == NULLAGBLOCK) {
249 : /* No shared blocks at all. */
250 : return 0;
251 : }
252 :
253 1934574 : if (fbno == orig_bno) {
254 : /*
255 : * The start of this extent is shared. Truncate the
256 : * mapping at the end of the shared region so that a
257 : * subsequent iteration starts at the start of the
258 : * unshared region.
259 : */
260 1898568 : irec->br_blockcount = flen;
261 1898568 : *shared = true;
262 1898568 : return 0;
263 : }
264 :
265 : /*
266 : * There's a shared extent midway through this extent.
267 : * Truncate the mapping at the start of the shared
268 : * extent so that a subsequent iteration starts at the
269 : * start of the shared region.
270 : */
271 36006 : irec->br_blockcount = fbno - orig_bno;
272 36006 : return 0;
273 : }
274 :
/*
 * Decide whether a write to @imap needs copy-on-write treatment, trimming
 * the mapping where necessary.
 *
 * For an always-COW inode, any mapping whose start block is not a "null"
 * (delalloc/hole) start block is reported as shared without being trimmed.
 * Everything else is handed to xfs_reflink_trim_around_shared(), whose
 * return value and *shared result are passed straight through.
 */
275 : int
276 27711787 : xfs_bmap_trim_cow(
277 : struct xfs_inode *ip,
278 : struct xfs_bmbt_irec *imap,
279 : bool *shared)
280 : {
281 : /* We can't update any real extents in always COW mode. */
282 27711787 : if (xfs_is_always_cow_inode(ip) &&
283 0 : !isnullstartblock(imap->br_startblock)) {
284 0 : *shared = true;
285 0 : return 0;
286 : }
287 :
288 : /* Trim the mapping to the nearest shared extent boundary. */
289 27711835 : return xfs_reflink_trim_around_shared(ip, imap, shared);
290 : }
291 :
/*
 * Convert the unwritten CoW fork extents that overlap the block range
 * [@offset_fsb, @offset_fsb + @count_fsb) into normal (XFS_EXT_NORM) ones.
 *
 * Both callers in this file invoke this with XFS_ILOCK_EXCL held on @ip.
 * The conversion is done via xfs_bmap_add_extent_unwritten_real() with a
 * NULL first argument and throw-away cursor/logflags outputs.
 *
 * Returns 0 on success (including when no CoW extent is found at or after
 * @offset_fsb), -EIO if a delalloc extent is encountered in the range, or
 * the error from the conversion helper.
 */
292 : static int
293 2002971 : xfs_reflink_convert_cow_locked(
294 : struct xfs_inode *ip,
295 : xfs_fileoff_t offset_fsb,
296 : xfs_filblks_t count_fsb)
297 : {
298 2002971 : struct xfs_iext_cursor icur;
299 2002971 : struct xfs_bmbt_irec got;
300 2002971 : struct xfs_btree_cur *dummy_cur = NULL;
301 2002971 : struct xfs_mount *mp = ip->i_mount;
302 2002971 : int dummy_logflags;
303 2002971 : int error = 0;
304 :
305 : /*
306 : * We can only remap full rt extents, so make sure that we convert the
307 : * entire extent. The caller must ensure that this is either a direct
308 : * write that's aligned to the rt extent size, or a buffered write for
309 : * which we've dirtied extra pages to make this work properly.
310 : */
311 2002971 : if (xfs_inode_needs_cow_around(ip)) {
312 0 : xfs_fileoff_t new_off;
313 :
/*
 * Move the start down to an rt extent boundary, grow the count by the
 * amount the start moved, then round the count up.
 */
314 0 : new_off = xfs_rtb_rounddown_rtx(mp, offset_fsb);
315 0 : count_fsb += offset_fsb - new_off;
316 0 : offset_fsb = new_off;
317 :
318 0 : count_fsb = xfs_rtb_roundup_rtx(mp, count_fsb);
319 : }
320 :
321 2002944 : if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
322 : return 0;
323 :
/*
 * Note that "continue" inside a do/while jumps to the loop condition, so
 * the skipped cases below still advance to the next CoW fork extent.
 */
324 3764054 : do {
325 3764054 : if (got.br_startoff >= offset_fsb + count_fsb)
326 : break;
/* Already a normal extent; nothing to convert. */
327 2004060 : if (got.br_state == XFS_EXT_NORM)
328 6 : continue;
/* A delalloc extent here is unexpected: warn once and fail. */
329 2004054 : if (WARN_ON_ONCE(isnullstartblock(got.br_startblock)))
330 : return -EIO;
331 :
/* Convert only the part of the extent that lies inside the range. */
332 2004054 : xfs_trim_extent(&got, offset_fsb, count_fsb);
333 2004046 : if (!got.br_blockcount)
334 0 : continue;
335 :
336 2004046 : got.br_state = XFS_EXT_NORM;
337 2004046 : error = xfs_bmap_add_extent_unwritten_real(NULL, ip,
338 : XFS_COW_FORK, &icur, &dummy_cur, &got,
339 : &dummy_logflags);
340 2003869 : if (error)
341 0 : return error;
342 2003875 : } while (xfs_iext_next_extent(ip->i_cowfp, &icur, &got));
343 :
344 : return error;
345 : }
346 :
347 : /*
 * Convert all of the unwritten CoW extents in a file's range to real ones.
 *
 * @offset and @count describe the range in bytes; @count must be nonzero.
 * The range is translated to filesystem blocks (start via XFS_B_TO_FSBT,
 * end via XFS_B_TO_FSB -- presumably rounding down and up respectively)
 * and converted with XFS_ILOCK_EXCL held for the duration.
 *
 * Returns the result of xfs_reflink_convert_cow_locked().
 */
348 : int
349 611513 : xfs_reflink_convert_cow(
350 : struct xfs_inode *ip,
351 : xfs_off_t offset,
352 : xfs_off_t count)
353 : {
354 611513 : struct xfs_mount *mp = ip->i_mount;
355 611513 : xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
356 611513 : xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
357 611513 : xfs_filblks_t count_fsb = end_fsb - offset_fsb;
358 611513 : int error;
359 :
360 611513 : ASSERT(count != 0);
361 :
362 611513 : xfs_ilock(ip, XFS_ILOCK_EXCL);
363 611476 : error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
364 611336 : xfs_iunlock(ip, XFS_ILOCK_EXCL);
365 611244 : return error;
366 : }
367 :
368 : /*
369 : * Find the extent that maps the given range in the COW fork. Even if the extent
370 : * is not shared we might have a preallocation for it in the COW fork. If so we
371 : * use that rather than trigger a new allocation.
 *
 * On a 0 return there are three possible outcomes:
 *
 *  - No CoW fork extent covers the start of @imap: @imap is trimmed so it
 *    ends where the next CoW fork extent begins (or is left at full length
 *    if there is none), *shared comes from xfs_bmap_trim_cow(), and *found
 *    is false. In this case cmap->br_startoff is greater than
 *    imap->br_startoff.
 *  - A delalloc CoW fork extent covers the start of @imap: @imap is trimmed
 *    to that extent, *shared is true and *found is false.
 *  - A real CoW fork extent covers the start of @imap: @cmap is trimmed to
 *    the original @imap range, and both *shared and *found are true.
 *
 * A nonzero return can only come from xfs_bmap_trim_cow().
372 : */
373 : static int
374 22159553 : xfs_find_trim_cow_extent(
375 : struct xfs_inode *ip,
376 : struct xfs_bmbt_irec *imap,
377 : struct xfs_bmbt_irec *cmap,
378 : bool *shared,
379 : bool *found)
380 : {
381 22159553 : xfs_fileoff_t offset_fsb = imap->br_startoff;
382 22159553 : xfs_filblks_t count_fsb = imap->br_blockcount;
383 22159553 : struct xfs_iext_cursor icur;
384 :
385 22159553 : *found = false;
386 :
387 : /*
388 : * If we don't find an overlapping extent, trim the range we need to
389 : * allocate to fit the hole we found.
390 : */
/*
 * When the lookup finds nothing, only cmap->br_startoff is filled in here:
 * it is pointed at the end of the requested range so that the comparison
 * below treats the whole range as a hole.
 */
391 22159553 : if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, cmap))
392 17968993 : cmap->br_startoff = offset_fsb + count_fsb;
393 22159479 : if (cmap->br_startoff > offset_fsb) {
394 20541857 : xfs_trim_extent(imap, imap->br_startoff,
395 : cmap->br_startoff - imap->br_startoff);
396 20541857 : return xfs_bmap_trim_cow(ip, imap, shared);
397 : }
398 :
/* A CoW fork extent covers the start of the range. */
399 1617622 : *shared = true;
400 1617622 : if (isnullstartblock(cmap->br_startblock)) {
/* Delalloc reservation: the caller still has to allocate blocks. */
401 913 : xfs_trim_extent(imap, cmap->br_startoff, cmap->br_blockcount);
402 913 : return 0;
403 : }
404 :
405 : /* real extent found - no need to allocate */
406 1616709 : xfs_trim_extent(cmap, offset_fsb, count_fsb);
407 1616709 : *found = true;
408 1616709 : return 0;
409 : }
410 :
/*
 * Trim @cmap to the range described by @imap and, if @convert_now is set
 * and @cmap is not already XFS_EXT_NORM, convert the CoW fork extents in
 * that range to normal state.
 *
 * On successful conversion cmap->br_state is updated to XFS_EXT_NORM so
 * that the caller's copy matches the fork. Returns 0 or the error from
 * xfs_reflink_convert_cow_locked().
 */
411 : static int
412 2156384 : xfs_reflink_convert_unwritten(
413 : struct xfs_inode *ip,
414 : struct xfs_bmbt_irec *imap,
415 : struct xfs_bmbt_irec *cmap,
416 : bool convert_now)
417 : {
418 2156384 : xfs_fileoff_t offset_fsb = imap->br_startoff;
419 2156384 : xfs_filblks_t count_fsb = imap->br_blockcount;
420 2156384 : int error;
421 :
422 : /*
423 : * cmap might be larger than imap due to cowextsize hint.
424 : */
425 2156384 : xfs_trim_extent(cmap, offset_fsb, count_fsb);
426 :
427 : /*
428 : * COW fork extents are supposed to remain unwritten until we're ready
429 : * to initiate a disk write. For direct I/O we are going to write the
430 : * data and need the conversion, but for buffered writes we're done.
431 : */
432 2156384 : if (!convert_now || cmap->br_state == XFS_EXT_NORM)
433 : return 0;
434 :
435 1391473 : trace_xfs_reflink_convert_cow(ip, cmap);
436 :
437 1391473 : error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
438 1391473 : if (!error)
439 1391473 : cmap->br_state = XFS_EXT_NORM;
440 :
441 : return error;
442 : }
443 :
/*
 * Allocate unwritten blocks in the CoW fork for the range described by
 * @imap, for the case where the CoW fork has no extent at the start of
 * that range.
 *
 * The inode lock described by *lockmode is dropped before the transaction
 * is allocated. *lockmode is kept up to date for the caller: it is 0 if
 * xfs_trans_alloc_inode() fails, and XFS_ILOCK_EXCL on every later return
 * path.
 *
 * Because the lock is dropped, the CoW fork is re-examined afterwards:
 *  - if the range is no longer shared, the transaction is cancelled and 0
 *    is returned;
 *  - if a real CoW extent has appeared, the transaction is cancelled and
 *    only the unwritten conversion step is performed.
 *
 * Returns 0, -ENOSPC if xfs_bmapi_write() mapped nothing, or an error from
 * the transaction, allocation or conversion calls.
 */
444 : static int
445 539271 : xfs_reflink_fill_cow_hole(
446 : struct xfs_inode *ip,
447 : struct xfs_bmbt_irec *imap,
448 : struct xfs_bmbt_irec *cmap,
449 : bool *shared,
450 : uint *lockmode,
451 : bool convert_now)
452 : {
453 539271 : struct xfs_mount *mp = ip->i_mount;
454 539271 : struct xfs_trans *tp;
455 539271 : xfs_filblks_t resaligned;
456 539271 : unsigned int dblocks = 0, rblocks = 0;
457 539271 : int nimaps;
458 539271 : int error;
459 539271 : bool found;
460 :
/*
 * Size the reservation from the mapping as aligned against the CoW extent
 * size hint. Realtime files reserve that many realtime blocks plus the
 * XFS_DIOSTRAT_SPACE_RES(mp, 0) data-device amount; other files reserve
 * data blocks only.
 */
461 539271 : resaligned = xfs_aligned_fsb_count(imap->br_startoff,
462 : imap->br_blockcount, xfs_get_cowextsz_hint(ip));
463 539271 : if (XFS_IS_REALTIME_INODE(ip)) {
464 313361 : dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
465 313361 : rblocks = resaligned;
466 : } else {
467 225910 : dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
468 225910 : rblocks = 0;
469 : }
470 :
/* Drop the caller's inode lock before allocating the transaction. */
471 539271 : xfs_iunlock(ip, *lockmode);
472 539271 : *lockmode = 0;
473 :
474 539271 : error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks,
475 : rblocks, false, &tp);
476 539271 : if (error)
477 : return error;
478 :
/*
 * NOTE(review): recording XFS_ILOCK_EXCL here implies that
 * xfs_trans_alloc_inode() returns with the inode locked exclusively --
 * confirm against that helper.
 */
479 539264 : *lockmode = XFS_ILOCK_EXCL;
480 :
/* The lock was dropped above, so look at the CoW fork again. */
481 539264 : error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
482 539264 : if (error || !*shared)
483 0 : goto out_trans_cancel;
484 :
/* A real CoW extent now covers the start of the range: nothing to allocate. */
485 539264 : if (found) {
486 0 : xfs_trans_cancel(tp);
487 0 : goto convert;
488 : }
489 :
490 : /* Allocate the entire reservation as unwritten blocks. */
491 539264 : nimaps = 1;
492 539264 : error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
493 : XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, cmap,
494 : &nimaps);
495 539264 : if (error)
496 19 : goto out_trans_cancel;
497 :
498 539245 : xfs_inode_set_cowblocks_tag(ip);
499 539245 : error = xfs_trans_commit(tp);
500 539245 : if (error)
501 : return error;
502 :
503 : /*
504 : * Allocation succeeded but the requested range was not even partially
505 : * satisfied? Bail out!
506 : */
507 539245 : if (nimaps == 0)
508 : return -ENOSPC;
509 :
510 539245 : convert:
511 539245 : return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);
512 :
/* Reached with error == 0 when the range turned out not to be shared. */
513 19 : out_trans_cancel:
514 19 : xfs_trans_cancel(tp);
515 19 : return error;
516 : }
517 :
/*
 * Replace a delalloc reservation in the CoW fork with unwritten blocks so
 * that the start of @imap ends up backed by a real CoW extent.
 *
 * Each pass of the loop drops the inode lock described by *lockmode,
 * allocates a transaction with no block reservation, re-examines the CoW
 * fork and allocates the delalloc extent found there. The loop repeats for
 * as long as the extent returned by xfs_bmapi_write() ends at or before
 * imap->br_startoff, i.e. has not yet reached the start of the range.
 *
 * *lockmode is kept up to date for the caller: it is 0 if
 * xfs_trans_alloc_inode() fails, and XFS_ILOCK_EXCL on every later return
 * path.
 *
 * Returns 0 (also when the range turns out not to be shared), -ENOSPC if
 * xfs_bmapi_write() mapped nothing, or an error from the transaction,
 * allocation or conversion calls.
 */
518 : static int
519 431 : xfs_reflink_fill_delalloc(
520 : struct xfs_inode *ip,
521 : struct xfs_bmbt_irec *imap,
522 : struct xfs_bmbt_irec *cmap,
523 : bool *shared,
524 : uint *lockmode,
525 : bool convert_now)
526 : {
527 431 : struct xfs_mount *mp = ip->i_mount;
528 483 : struct xfs_trans *tp;
529 483 : int nimaps;
530 483 : int error;
531 483 : bool found;
532 :
533 483 : do {
/* Drop the inode lock before allocating the transaction. */
534 483 : xfs_iunlock(ip, *lockmode);
535 483 : *lockmode = 0;
536 :
537 483 : error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, 0, 0,
538 : false, &tp);
539 483 : if (error)
540 0 : return error;
541 :
/*
 * NOTE(review): implies xfs_trans_alloc_inode() returns with the inode
 * locked exclusively -- confirm against that helper.
 */
542 483 : *lockmode = XFS_ILOCK_EXCL;
543 :
/* The lock was dropped above, so look at the CoW fork again. */
544 483 : error = xfs_find_trim_cow_extent(ip, imap, cmap, shared,
545 : &found);
546 483 : if (error || !*shared)
547 0 : goto out_trans_cancel;
548 :
/* A real CoW extent now covers the start of the range: stop allocating. */
549 483 : if (found) {
550 1 : xfs_trans_cancel(tp);
551 1 : break;
552 : }
553 :
554 482 : ASSERT(isnullstartblock(cmap->br_startblock) ||
555 : cmap->br_startblock == DELAYSTARTBLOCK);
556 :
557 : /*
558 : * Replace delalloc reservation with an unwritten extent.
559 : */
560 482 : nimaps = 1;
561 482 : error = xfs_bmapi_write(tp, ip, cmap->br_startoff,
562 : cmap->br_blockcount,
563 : XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0,
564 : cmap, &nimaps);
565 482 : if (error)
566 0 : goto out_trans_cancel;
567 :
568 482 : xfs_inode_set_cowblocks_tag(ip);
569 482 : error = xfs_trans_commit(tp);
570 482 : if (error)
571 0 : return error;
572 :
573 : /*
574 : * Allocation succeeded but the requested range was not even
575 : * partially satisfied? Bail out!
576 : */
577 482 : if (nimaps == 0)
578 : return -ENOSPC;
579 482 : } while (cmap->br_startoff + cmap->br_blockcount <= imap->br_startoff);
580 :
581 431 : return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);
582 :
/* Reached with error == 0 when the range turned out not to be shared. */
583 0 : out_trans_cancel:
584 0 : xfs_trans_cancel(tp);
585 0 : return error;
586 : }
587 :
588 : /*
 * Allocate all CoW reservations covering a range of blocks in a file.
 *
 * The caller must hold XFS_ILOCK_EXCL on @ip. The CoW fork is created here
 * if the inode does not have one yet. @imap is trimmed to the portion that
 * can be handled in one go and @cmap receives the CoW fork mapping when
 * there is one. *shared reports whether CoW is needed at all; when it is
 * false nothing is allocated.
 *
 * The allocation helpers may drop and retake the inode lock; *lockmode is
 * updated by them to reflect what the caller holds on return.
 *
 * If @convert_now is set, the resulting CoW extents are also converted
 * from unwritten to normal state.
 *
 * Returns 0 or an error from the lookup, allocation or conversion step.
 */
589 : int
590 21619744 : xfs_reflink_allocate_cow(
591 : struct xfs_inode *ip,
592 : struct xfs_bmbt_irec *imap,
593 : struct xfs_bmbt_irec *cmap,
594 : bool *shared,
595 : uint *lockmode,
596 : bool convert_now)
597 : {
598 21619744 : int error;
599 21619744 : bool found;
600 :
601 21619744 : ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
/* An inode without a CoW fork is not expected to be a reflink inode. */
602 21619813 : if (!ip->i_cowfp) {
603 0 : ASSERT(!xfs_is_reflink_inode(ip));
604 0 : xfs_ifork_init_cow(ip);
605 : }
606 :
607 21619813 : error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
608 21619757 : if (error || !*shared)
609 : return error;
610 :
611 : /* CoW fork has a real extent */
612 2156410 : if (found)
613 1616708 : return xfs_reflink_convert_unwritten(ip, imap, cmap,
614 : convert_now);
615 :
616 : /*
617 : * CoW fork does not have an extent and data extent is shared.
618 : * Allocate a real extent in the CoW fork.
619 : */
620 539702 : if (cmap->br_startoff > imap->br_startoff)
621 539271 : return xfs_reflink_fill_cow_hole(ip, imap, cmap, shared,
622 : lockmode, convert_now);
623 :
624 : /*
625 : * CoW fork has a delalloc reservation. Replace it with a real extent.
626 : * There may or may not be a data fork mapping.
627 : */
628 431 : if (isnullstartblock(cmap->br_startblock) ||
629 : cmap->br_startblock == DELAYSTARTBLOCK)
630 431 : return xfs_reflink_fill_delalloc(ip, imap, cmap, shared,
631 : lockmode, convert_now);
632 :
633 : /* Shouldn't get here. */
634 0 : ASSERT(0);
635 0 : return -EFSCORRUPTED;
636 : }
637 :
638 : /*
639 : * Cancel CoW reservations for some block range of an inode.
640 : *
641 : * If cancel_real is true this function cancels all COW fork extents for the
642 : * inode; if cancel_real is false, real extents are not cleared.
643 : *
644 : * Caller must have already joined the inode to the current transaction. The
645 : * inode will be joined to the transaction returned to the caller.
 *
 * The range is [@offset_fsb, @end_fsb) in filesystem blocks. *tpp may be
 * replaced, since it is handed to xfs_defer_finish() after each real
 * extent is queued for freeing. Unwritten CoW extents are freed regardless
 * of cancel_real; written ones are freed only when it is true. The CoW
 * blocks tag is cleared once the CoW fork holds no more extent data.
 *
 * Returns 0 or the first error hit while walking the fork; extents handled
 * before the error stay cancelled.
646 : */
647 : int
648 12296133 : xfs_reflink_cancel_cow_blocks(
649 : struct xfs_inode *ip,
650 : struct xfs_trans **tpp,
651 : xfs_fileoff_t offset_fsb,
652 : xfs_fileoff_t end_fsb,
653 : bool cancel_real)
654 : {
655 12296133 : struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
656 12296133 : struct xfs_mount *mp = ip->i_mount;
657 12296133 : struct xfs_bmbt_irec got, del;
658 12296133 : struct xfs_iext_cursor icur;
659 12296133 : bool isrt = XFS_IS_REALTIME_INODE(ip);
660 12296133 : int error = 0;
661 :
662 : /*
663 : * Shrink the range that we're cancelling if they don't align to the
664 : * realtime extent size, since we can only free full extents.
665 : */
666 12296133 : if (xfs_inode_needs_cow_around(ip)) {
667 0 : offset_fsb = xfs_rtb_roundup_rtx(mp, offset_fsb);
668 0 : end_fsb = xfs_rtb_rounddown_rtx(mp, end_fsb);
669 : }
670 :
671 24593968 : if (!xfs_inode_has_cow_data(ip))
672 : return 0;
/* Start from the last CoW fork extent before end_fsb, if there is one. */
673 607405 : if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
674 : return 0;
675 :
676 : /* Walk backwards until we're out of the I/O range... */
677 854558 : while (got.br_startoff + got.br_blockcount > offset_fsb) {
/* @del is the part of @got that falls inside the range being cancelled. */
678 522854 : del = got;
679 522854 : xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb);
680 :
681 : /* Extent delete may have bumped ext forward */
682 522849 : if (!del.br_blockcount) {
683 5888 : xfs_iext_prev(ifp, &icur);
684 5888 : goto next_extent;
685 : }
686 :
687 516961 : trace_xfs_reflink_cancel_cow(ip, &del);
688 :
689 516962 : if (isnullstartblock(del.br_startblock)) {
/* Delalloc reservation: only in-core state needs to be removed. */
690 20830 : error = xfs_bmap_del_extent_delay(ip, XFS_COW_FORK,
691 : &icur, &got, &del);
692 20830 : if (error)
693 : break;
694 496132 : } else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) {
695 496132 : ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER);
696 :
697 : /* Free the CoW orphan record. */
698 496132 : xfs_refcount_free_cow_extent(*tpp, isrt,
699 496132 : del.br_startblock, del.br_blockcount);
700 :
/* Queue the blocks for freeing, on the realtime device if applicable. */
701 645311 : error = xfs_free_extent_later(*tpp, del.br_startblock,
702 : del.br_blockcount, NULL,
703 : XFS_AG_RESV_NONE,
704 : isrt ? XFS_FREE_EXTENT_REALTIME : 0);
705 496127 : if (error)
706 : break;
707 :
708 : /* Roll the transaction */
709 496127 : error = xfs_defer_finish(tpp);
710 496135 : if (error)
711 : break;
712 :
713 : /* Remove the mapping from the CoW fork. */
714 496119 : xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
715 :
716 : /* Remove the quota reservation */
717 496121 : error = xfs_quota_unreserve_blkres(ip,
718 496121 : del.br_blockcount);
719 496120 : if (error)
720 : break;
721 : } else {
722 : /* Didn't do anything, push cursor back. */
723 0 : xfs_iext_prev(ifp, &icur);
724 : }
725 522838 : next_extent:
/* Reload @got from wherever the cursor now points; stop if nothing is there. */
726 522838 : if (!xfs_iext_get_extent(ifp, &icur, &got))
727 : break;
728 : }
729 :
730 : /* clear tag if cow fork is emptied */
731 583273 : if (!ifp->if_bytes)
732 247282 : xfs_inode_clear_cowblocks_tag(ip);
733 : return error;
734 : }
735 :
736 : /*
737 : * Cancel CoW reservations for some byte range of an inode.
738 : *
739 : * If cancel_real is true this function cancels all COW fork extents for the
740 : * inode; if cancel_real is false, real extents are not cleared.
 *
 * @offset and @count are in bytes. Passing NULLFILEOFF as @count leaves
 * the end of the range at NULLFILEOFF rather than computing it from
 * @offset + @count. The inode must already have a CoW fork.
 *
 * This function allocates its own transaction, takes XFS_ILOCK_EXCL and
 * joins the inode without automatic unlock, so the lock is released
 * explicitly on both the commit and the cancel path.
 *
 * Returns 0 or an error from transaction allocation, the cancel walk, or
 * the commit. The error tracepoint fires only for the first two.
741 : */
742 : int
743 348564 : xfs_reflink_cancel_cow_range(
744 : struct xfs_inode *ip,
745 : xfs_off_t offset,
746 : xfs_off_t count,
747 : bool cancel_real)
748 : {
749 348564 : struct xfs_trans *tp;
750 348564 : xfs_fileoff_t offset_fsb;
751 348564 : xfs_fileoff_t end_fsb;
752 348564 : int error;
753 :
754 348564 : trace_xfs_reflink_cancel_cow_range(ip, offset, count);
755 348637 : ASSERT(ip->i_cowfp);
756 :
757 348637 : offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
758 348637 : if (count == NULLFILEOFF)
759 : end_fsb = NULLFILEOFF;
760 : else
761 87009 : end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
762 :
763 : /* Start a rolling transaction to remove the mappings */
764 348637 : error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
765 : 0, 0, 0, &tp);
766 348640 : if (error)
767 0 : goto out;
768 :
769 348640 : xfs_ilock(ip, XFS_ILOCK_EXCL);
770 348639 : xfs_trans_ijoin(tp, ip, 0);
771 :
772 : /* Scrape out the old CoW reservations */
773 348639 : error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb,
774 : cancel_real);
775 348634 : if (error)
776 16 : goto out_cancel;
777 :
778 348618 : error = xfs_trans_commit(tp);
779 :
780 348624 : xfs_iunlock(ip, XFS_ILOCK_EXCL);
781 348624 : return error;
782 :
783 : out_cancel:
784 16 : xfs_trans_cancel(tp);
785 16 : xfs_iunlock(ip, XFS_ILOCK_EXCL);
786 16 : out:
787 16 : trace_xfs_reflink_cancel_cow_range_error(ip, error, _RET_IP_);
788 16 : return error;
789 : }
790 :
791 : #ifdef CONFIG_XFS_QUOTA
792 : /*
793 : * Update quota accounting for a remapping operation. When we're remapping
794 : * something from the CoW fork to the data fork, we must update the quota
795 : * accounting for delayed allocations. For remapping from the data fork to the
796 : * data fork, use regular block accounting.
 *
 * @is_cow selects the delayed-allocation counter (true) or the regular
 * block counter (false); realtime inodes use the realtime variants of
 * both. @blocks is the signed delta handed unchanged to
 * xfs_trans_mod_dquot_byino(). Without CONFIG_XFS_QUOTA this is a no-op
 * macro.
797 : */
798 : static inline void
799 89195184 : xfs_reflink_update_quota(
800 : struct xfs_trans *tp,
801 : struct xfs_inode *ip,
802 : bool is_cow,
803 : int64_t blocks)
804 : {
805 89195184 : unsigned int qflag;
806 :
807 89195184 : if (XFS_IS_REALTIME_INODE(ip)) {
808 25557497 : qflag = is_cow ? XFS_TRANS_DQ_DELRTBCOUNT :
809 : XFS_TRANS_DQ_RTBCOUNT;
810 : } else {
811 63637687 : qflag = is_cow ? XFS_TRANS_DQ_DELBCOUNT :
812 : XFS_TRANS_DQ_BCOUNT;
813 : }
814 89195184 : xfs_trans_mod_dquot_byino(tp, ip, qflag, blocks);
815 89194857 : }
816 : #else
817 : # define xfs_reflink_update_quota(tp, ip, is_cow, blocks) ((void)0)
818 : #endif
819 :
820 : /*
821 : * Remap part of the CoW fork into the data fork.
822 : *
823 : * We aim to remap the range starting at @offset_fsb and ending at @end_fsb
824 : * into the data fork; this function will remap what it can (at the end of the
825 : * range) and update @end_fsb appropriately. Each remap gets its own
826 : * transaction because we can end up merging and splitting bmbt blocks for
827 : * every remap operation and we'd like to keep the block reservation
828 : * requirements as low as possible.
829 : */
830 : STATIC int
831 2984788 : xfs_reflink_end_cow_extent(
832 : struct xfs_inode *ip,
833 : xfs_fileoff_t *offset_fsb,
834 : xfs_fileoff_t end_fsb)
835 : {
836 2984788 : struct xfs_iext_cursor icur;
837 2984788 : struct xfs_bmbt_irec got, del, data;
838 2984788 : struct xfs_mount *mp = ip->i_mount;
839 2984788 : struct xfs_trans *tp;
840 2984788 : struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
841 2984788 : unsigned int resblks;
842 2984788 : int nmaps;
843 2984788 : bool isrt = XFS_IS_REALTIME_INODE(ip);
844 2984788 : int error;
845 :
846 : /* No COW extents? That's easy! */
847 2984788 : if (ifp->if_bytes == 0) {
848 167 : *offset_fsb = end_fsb;
849 167 : return 0;
850 : }
851 :
852 2984621 : resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
853 2984621 : error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
854 : XFS_TRANS_RESERVE, &tp);
855 2984621 : if (error)
856 : return error;
857 :
858 : /*
859 : * Lock the inode. We have to ijoin without automatic unlock because
860 : * the lead transaction is the refcountbt record deletion; the data
861 : * fork update follows as a deferred log item.
862 : */
863 2984621 : xfs_ilock(ip, XFS_ILOCK_EXCL);
864 2984621 : xfs_trans_ijoin(tp, ip, 0);
865 :
866 2984621 : error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
867 : XFS_IEXT_REFLINK_END_COW_CNT);
868 2984621 : if (error == -EFBIG)
869 4 : error = xfs_iext_count_upgrade(tp, ip,
870 : XFS_IEXT_REFLINK_END_COW_CNT);
871 2984621 : if (error)
872 4 : goto out_cancel;
873 :
874 : /*
875 : * In case of racing, overlapping AIO writes no COW extents might be
876 : * left by the time I/O completes for the loser of the race. In that
877 : * case we are done.
878 : */
879 2984617 : if (!xfs_iext_lookup_extent(ip, ifp, *offset_fsb, &icur, &got) ||
880 2984192 : got.br_startoff >= end_fsb) {
881 19025 : *offset_fsb = end_fsb;
882 19025 : goto out_cancel;
883 : }
884 :
885 : /*
886 : * Only remap real extents that contain data. With AIO, speculative
887 : * preallocations can leak into the range we are called upon, and we
888 : * need to skip them. Preserve @got for the eventual CoW fork
889 : * deletion; from now on @del represents the mapping that we're
890 : * actually remapping.
891 : */
892 2968324 : while (!xfs_bmap_is_written_extent(&got)) {
893 2966 : if (!xfs_iext_next_extent(ifp, &icur, &got) ||
894 2965 : got.br_startoff >= end_fsb) {
895 234 : *offset_fsb = end_fsb;
896 234 : goto out_cancel;
897 : }
898 : }
899 2965358 : del = got;
900 :
901 : /* Grab the corresponding mapping in the data fork. */
902 2965358 : nmaps = 1;
903 2965358 : error = xfs_bmapi_read(ip, del.br_startoff, del.br_blockcount, &data,
904 : &nmaps, 0);
905 2965358 : if (error)
906 16 : goto out_cancel;
907 :
908 : /* We can only remap the smaller of the two extent sizes. */
909 2965342 : data.br_blockcount = min(data.br_blockcount, del.br_blockcount);
910 2965342 : del.br_blockcount = data.br_blockcount;
911 :
912 2965342 : trace_xfs_reflink_cow_remap_from(ip, &del);
913 2965342 : trace_xfs_reflink_cow_remap_to(ip, &data);
914 :
915 5558990 : if (xfs_bmap_is_real_extent(&data)) {
916 : /*
917 : * If the extent we're remapping is backed by storage (written
918 : * or not), unmap the extent and drop its refcount.
919 : */
920 2593648 : xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data);
921 2593648 : xfs_refcount_decrease_extent(tp, isrt, &data);
922 2593648 : xfs_reflink_update_quota(tp, ip, false, -data.br_blockcount);
923 371694 : } else if (data.br_startblock == DELAYSTARTBLOCK) {
924 19879 : int done;
925 :
926 : /*
927 : * If the extent we're remapping is a delalloc reservation,
928 : * we can use the regular bunmapi function to release the
929 : * incore state. Dropping the delalloc reservation takes care
930 : * of the quota reservation for us.
931 : */
932 19879 : error = xfs_bunmapi(NULL, ip, data.br_startoff,
933 : data.br_blockcount, 0, 1, &done);
934 19879 : if (error)
935 0 : goto out_cancel;
936 19879 : ASSERT(done);
937 : }
938 :
939 : /* Free the CoW orphan record. */
940 2965342 : xfs_refcount_free_cow_extent(tp, isrt, del.br_startblock,
941 2965342 : del.br_blockcount);
942 :
943 : /* Map the new blocks into the data fork. */
944 2965342 : xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, &del);
945 :
946 : /* Charge this new data fork mapping to the on-disk quota. */
947 2965342 : xfs_reflink_update_quota(tp, ip, true, del.br_blockcount);
948 :
949 : /* Remove the mapping from the CoW fork. */
950 2965342 : xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
951 :
952 2965342 : error = xfs_trans_commit(tp);
953 2965342 : xfs_iunlock(ip, XFS_ILOCK_EXCL);
954 2965342 : if (error)
955 : return error;
956 :
957 : /* Update the caller about how much progress we made. */
958 2965338 : *offset_fsb = del.br_startoff + del.br_blockcount;
959 2965338 : return 0;
960 :
961 19279 : out_cancel:
962 19279 : xfs_trans_cancel(tp);
963 19279 : xfs_iunlock(ip, XFS_ILOCK_EXCL);
964 19279 : return error;
965 : }
966 :
967 : /*
968 : * Remap parts of a file's data fork after a successful CoW.
    : *
    : * @ip: inode to process
    : * @offset: start of the byte range that was written
    : * @count: length of the byte range
    : *
    : * The byte range is converted to file block units and handed to
    : * xfs_reflink_end_cow_extent() repeatedly; that helper advances
    : * offset_fsb on each call, so the loop ends once the whole range has been
    : * consumed or the helper reports an error.
    : *
    : * Returns 0 on success or the first error returned by the helper.
969 : */
970 : int
971 1743149 : xfs_reflink_end_cow(
972 : struct xfs_inode *ip,
973 : xfs_off_t offset,
974 : xfs_off_t count)
975 : {
976 1743149 : struct xfs_mount *mp = ip->i_mount;
977 1743149 : xfs_fileoff_t offset_fsb;
978 1743149 : xfs_fileoff_t end_fsb;
979 1743149 : int error = 0;
980 :
981 1743149 : trace_xfs_reflink_end_cow(ip, offset, count);
982 :
983 1743149 : offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
984 1743149 : end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
985 :
986 : /*
987 : * Make sure the end is aligned with a rt extent (if desired), since
988 : * the end of the range could be EOF. The _convert_cow function should
989 : * have set us up to swap only full rt extents.
990 : */
991 1743149 : if (xfs_inode_needs_cow_around(ip)) {
992 0 : offset_fsb = xfs_rtb_rounddown_rtx(mp, offset_fsb);
993 0 : end_fsb = xfs_rtb_roundup_rtx(mp, end_fsb);
994 : }
995 :
996 : /*
997 : * Walk forwards until we've remapped the I/O range. The loop function
998 : * repeatedly cycles the ILOCK to allocate one transaction per remapped
999 : * extent.
1000 : *
1001 : * If we're being called by writeback then the pages will still
1002 : * have PageWriteback set, which prevents races with reflink remapping
1003 : * and truncate. Reflink remapping prevents races with writeback by
1004 : * taking the iolock and mmaplock before flushing the pages and
1005 : * remapping, which means there won't be any further writeback or page
1006 : * cache dirtying until the reflink completes.
1007 : *
1008 : * We should never have two threads issuing writeback for the same file
1009 : * region. There are also post-eof checks in the writeback
1010 : * preparation code so that we don't bother writing out pages that are
1011 : * about to be truncated.
1012 : *
1013 : * If we're being called as part of directio write completion, the dio
1014 : * count is still elevated, which reflink and truncate will wait for.
1015 : * Reflink remapping takes the iolock and mmaplock and waits for
1016 : * pending dio to finish, which should prevent any directio until the
1017 : * remap completes. Multiple concurrent directio writes to the same
1018 : * region are handled by end_cow processing only occurring for the
1019 : * threads which succeed; the outcome of multiple overlapping direct
1020 : * writes is not well defined anyway.
1021 : *
1022 : * It's possible that a buffered write and a direct write could collide
1023 : * here (the buffered write stumbles in after the dio flushes and
1024 : * invalidates the page cache and immediately queues writeback), but we
1025 : * have never supported this 100%. If either disk write succeeds the
1026 : * blocks will be remapped.
1027 : */
1028 4727937 : while (end_fsb > offset_fsb && !error)
1029 2984787 : error = xfs_reflink_end_cow_extent(ip, &offset_fsb, end_fsb);
1030 :
1031 1743150 : if (error)
1032 24 : trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
1033 1743150 : return error;
1034 : }
1035 :
1036 : /*
1037 : * Free all CoW staging blocks that are still referenced by the ondisk refcount
1038 : * metadata. The ondisk metadata does not track which inode created the
1039 : * staging extent, so callers must ensure that there are no cached inodes with
1040 : * live CoW staging extents.
     : *
     : * @mp: mount to scan
     : *
     : * Returns 0 immediately when xfs_has_reflink(mp) is false. Otherwise
     : * xfs_refcount_recover_cow_leftovers() is called for every perag, followed
     : * by xfs_refcount_recover_rtcow_leftovers() for every rtgroup. The walk
     : * stops at the first error; the group being visited is passed to
     : * xfs_perag_rele()/xfs_rtgroup_rele() before that error is returned
     : * (presumably because the iterator macros hold a reference that a normal
     : * loop exit would drop -- TODO confirm against the iterator definitions).
1041 : */
1042 : int
1043 11367 : xfs_reflink_recover_cow(
1044 : struct xfs_mount *mp)
1045 : {
1046 11367 : struct xfs_perag *pag;
1047 11367 : struct xfs_rtgroup *rtg;
1048 11367 : xfs_agnumber_t agno;
1049 11367 : xfs_rgnumber_t rgno;
1050 11367 : int error = 0;
1051 :
1052 11367 : if (!xfs_has_reflink(mp))
1053 : return 0;
1054 :
1055 56785 : for_each_perag(mp, agno, pag) {
1056 45426 : error = xfs_refcount_recover_cow_leftovers(mp, pag);
1057 45426 : if (error) {
1058 8 : xfs_perag_rele(pag);
1059 8 : return error;
1060 : }
1061 : }
1062 :
1063 11359 : for_each_rtgroup(mp, rgno, rtg) {
1064 0 : error = xfs_refcount_recover_rtcow_leftovers(mp, rtg);
1065 0 : if (error) {
1066 0 : xfs_rtgroup_rele(rtg);
1067 0 : return error;
1068 : }
1069 : }
1070 :
1071 : return 0;
1072 : }
1073 :
1074 : /*
1075 : * Reflinking (Block) Ranges of Two Files Together
1076 : *
1077 : * First, ensure that the reflink flag is set on both inodes. The flag is an
1078 : * optimization to avoid unnecessary refcount btree lookups in the write path.
1079 : *
1080 : * Now we can iteratively remap the range of extents (and holes) in src to the
1081 : * corresponding ranges in dest. Let drange and srange denote the ranges of
1082 : * logical blocks in dest and src touched by the reflink operation.
1083 : *
1084 : * While the length of drange is greater than zero,
1085 : * - Read src's bmbt at the start of srange ("imap")
1086 : * - If imap doesn't exist, make imap appear to start at the end of srange
1087 : * with zero length.
1088 : * - If imap starts before srange, advance imap to start at srange.
1089 : * - If imap goes beyond srange, truncate imap to end at the end of srange.
1090 : * - Punch (imap start - srange start + imap len) blocks from dest at
1091 : * offset (drange start).
1092 : * - If imap points to a real range of pblks,
1093 : * > Increase the refcount of the imap's pblks
1094 : * > Map imap's pblks into dest at the offset
1095 : * (drange start + imap start - srange start)
1096 : * - Advance drange and srange by (imap start - srange start + imap len)
1097 : *
1098 : * Finally, if the reflink made dest longer, update both the in-core and
1099 : * on-disk file sizes.
1100 : *
1101 : * ASCII Art Demonstration:
1102 : *
1103 : * Let's say we want to reflink this source file:
1104 : *
1105 : * ----SSSSSSS-SSSSS----SSSSSS (src file)
1106 : * <-------------------->
1107 : *
1108 : * into this destination file:
1109 : *
1110 : * --DDDDDDDDDDDDDDDDDDD--DDD (dest file)
1111 : * <-------------------->
1112 : * '-' means a hole, and 'S' and 'D' are written blocks in the src and dest.
1113 : * Observe that the range has different logical offsets in either file.
1114 : *
1115 : * Consider that the first extent in the source file doesn't line up with our
1116 : * reflink range. Unmapping and remapping are separate operations, so we can
1117 : * unmap more blocks from the destination file than we remap.
1118 : *
1119 : * ----SSSSSSS-SSSSS----SSSSSS
1120 : * <------->
1121 : * --DDDDD---------DDDDD--DDD
1122 : * <------->
1123 : *
1124 : * Now remap the source extent into the destination file:
1125 : *
1126 : * ----SSSSSSS-SSSSS----SSSSSS
1127 : * <------->
1128 : * --DDDDD--SSSSSSSDDDDD--DDD
1129 : * <------->
1130 : *
1131 : * Do likewise with the second hole and extent in our range. Holes in the
1132 : * unmap range don't affect our operation.
1133 : *
1134 : * ----SSSSSSS-SSSSS----SSSSSS
1135 : * <---->
1136 : * --DDDDD--SSSSSSS-SSSSS-DDD
1137 : * <---->
1138 : *
1139 : * Finally, unmap and remap part of the third extent. This will increase the
1140 : * size of the destination file.
1141 : *
1142 : * ----SSSSSSS-SSSSS----SSSSSS
1143 : * <----->
1144 : * --DDDDD--SSSSSSS-SSSSS----SSS
1145 : * <----->
1146 : *
1147 : * Once we update the destination file's i_size, we're done.
1148 : */
1149 :
1150 : /*
1151 : * Ensure the reflink bit is set in both inodes.
     : *
     : * @src: first inode of the pair
     : * @dest: second inode of the pair; may be the same inode as @src
     : *
     : * Returns 0 without starting a transaction if both inodes already report
     : * xfs_is_reflink_inode(). Otherwise a tr_ichange transaction is allocated
     : * and the ILOCK is taken exclusively on both inodes (only once when @src
     : * and @dest share an inode number). For each inode that lacks the flag,
     : * the inode is joined to the transaction with XFS_ILOCK_EXCL, the
     : * XFS_DIFLAG2_REFLINK bit is set, the inode core is logged and the CoW
     : * fork is initialised. An inode that already has the flag is unlocked
     : * right away instead of being joined.
     : *
     : * Returns 0 or the error from xfs_trans_alloc()/xfs_trans_commit(). Note
     : * that the error tracepoint always reports @dest, whichever step failed.
1152 : */
1153 : STATIC int
1154 197421863 : xfs_reflink_set_inode_flag(
1155 : struct xfs_inode *src,
1156 : struct xfs_inode *dest)
1157 : {
1158 197421863 : struct xfs_mount *mp = src->i_mount;
1159 197421863 : int error;
1160 197421863 : struct xfs_trans *tp;
1161 :
1162 197421863 : if (xfs_is_reflink_inode(src) && xfs_is_reflink_inode(dest))
1163 : return 0;
1164 :
1165 3760210 : error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
1166 3761792 : if (error)
1167 2 : goto out_error;
1168 :
1169 : /* Lock both files against IO */
1170 3761790 : if (src->i_ino == dest->i_ino)
1171 76209 : xfs_ilock(src, XFS_ILOCK_EXCL);
1172 : else
1173 3685581 : xfs_lock_two_inodes(src, XFS_ILOCK_EXCL, dest, XFS_ILOCK_EXCL);
1174 :
1175 3761785 : if (!xfs_is_reflink_inode(src)) {
1176 327289 : trace_xfs_reflink_set_inode_flag(src);
1177 327289 : xfs_trans_ijoin(tp, src, XFS_ILOCK_EXCL);
1178 327289 : src->i_diflags2 |= XFS_DIFLAG2_REFLINK;
1179 327289 : xfs_trans_log_inode(tp, src, XFS_ILOG_CORE);
1180 327289 : xfs_ifork_init_cow(src);
1181 : } else
1182 3434496 : xfs_iunlock(src, XFS_ILOCK_EXCL);
1183 :
1184 3761791 : if (src->i_ino == dest->i_ino)
1185 76209 : goto commit_flags;
1186 :
1187 3685582 : if (!xfs_is_reflink_inode(dest)) {
1188 3484781 : trace_xfs_reflink_set_inode_flag(dest);
1189 3484781 : xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);
1190 3484777 : dest->i_diflags2 |= XFS_DIFLAG2_REFLINK;
1191 3484777 : xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
1192 3484781 : xfs_ifork_init_cow(dest);
1193 : } else
1194 200801 : xfs_iunlock(dest, XFS_ILOCK_EXCL);
1195 :
1196 3761791 : commit_flags:
1197 3761791 : error = xfs_trans_commit(tp);
1198 3761789 : if (error)
1199 3 : goto out_error;
1200 : return error;
1201 :
1202 5 : out_error:
1203 5 : trace_xfs_reflink_set_inode_flag_error(dest, error, _RET_IP_);
1204 5 : return error;
1205 : }
1206 :
1207 : /*
1208 : * Update destination inode size & cowextsize hint, if necessary.
     : *
     : * @dest: inode to update
     : * @newlen: candidate new file size in bytes; only applied when it is
     : * larger than the current i_size
     : * @cowextsize: new CoW extent size hint; zero means leave the hint alone
     : * @remap_flags: not referenced by this function body
     : *
     : * Returns 0 without starting a transaction when @newlen does not exceed
     : * the current size and @cowextsize is zero. Otherwise a tr_ichange
     : * transaction updates both the VFS i_size and i_disk_size (if growing),
     : * stores the hint and sets XFS_DIFLAG2_COWEXTSIZE (if @cowextsize is
     : * nonzero), logs the inode core and commits.
     : *
     : * Returns 0 or the error from xfs_trans_alloc()/xfs_trans_commit().
1209 : */
1210 : int
1211 196825597 : xfs_reflink_update_dest(
1212 : struct xfs_inode *dest,
1213 : xfs_off_t newlen,
1214 : xfs_extlen_t cowextsize,
1215 : unsigned int remap_flags)
1216 : {
1217 196825597 : struct xfs_mount *mp = dest->i_mount;
1218 196825597 : struct xfs_trans *tp;
1219 196825597 : int error;
1220 :
1221 196825597 : if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
1222 : return 0;
1223 :
1224 2794278 : error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
1225 2791886 : if (error)
1226 0 : goto out_error;
1227 :
1228 2791886 : xfs_ilock(dest, XFS_ILOCK_EXCL);
1229 2791887 : xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);
1230 :
1231 2791882 : if (newlen > i_size_read(VFS_I(dest))) {
1232 2791876 : trace_xfs_reflink_update_inode_size(dest, newlen);
1233 2791880 : i_size_write(VFS_I(dest), newlen);
1234 2791880 : dest->i_disk_size = newlen;
1235 : }
1236 :
1237 2791886 : if (cowextsize) {
1238 6 : dest->i_cowextsize = cowextsize;
1239 6 : dest->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE;
1240 : }
1241 :
1242 2791886 : xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
1243 :
1244 2791887 : error = xfs_trans_commit(tp);
1245 2791887 : if (error)
1246 0 : goto out_error;
1247 : return error;
1248 :
1249 0 : out_error:
1250 0 : trace_xfs_reflink_update_inode_size_error(dest, error, _RET_IP_);
1251 0 : return error;
1252 : }
1253 :
1254 : /*
1255 : * Do we have enough reserve in this AG to handle a reflink? The refcount
1256 : * btree already reserved all the space it needs, but the rmap btree can grow
1257 : * infinitely, so we won't allow more reflinks when the AG is down to the
1258 : * btree reserves.
     : *
     : * @mp: mount the block belongs to
     : * @ip: inode being remapped; selects the realtime or the per-AG check
     : * @fsb: block number used to pick the rtgroup or AG to examine
     : *
     : * Always returns 0 when the filesystem has no rmap btree. For a realtime
     : * inode the rtgroup containing @fsb is looked up and -ENOSPC is returned
     : * if xfs_imeta_resv_critical() is true for either its rmap or refcount
     : * inode. For other inodes the AG containing @fsb is looked up and -ENOSPC
     : * is returned if either the XFS_AG_RESV_RMAPBT or XFS_AG_RESV_METADATA
     : * reservation is critical. Returns 0 otherwise.
     : *
     : * NOTE(review): the rtgroup/perag lookup results are used without a NULL
     : * check; this assumes @fsb always lies inside a valid group -- TODO
     : * confirm.
1259 : */
1260 : static int
1261 61781221 : xfs_reflink_ag_has_free_space(
1262 : struct xfs_mount *mp,
1263 : struct xfs_inode *ip,
1264 : xfs_fsblock_t fsb)
1265 : {
1266 61781221 : struct xfs_perag *pag;
1267 61781221 : xfs_agnumber_t agno;
1268 61781221 : int error = 0;
1269 :
1270 61781221 : if (!xfs_has_rmapbt(mp))
1271 : return 0;
1272 61781221 : if (XFS_IS_REALTIME_INODE(ip)) {
1273 7502544 : struct xfs_rtgroup *rtg;
1274 7502544 : xfs_rgnumber_t rgno;
1275 :
1276 7502544 : rgno = xfs_rtb_to_rgno(mp, fsb);
1277 7502544 : rtg = xfs_rtgroup_get(mp, rgno);
1278 15005012 : if (xfs_imeta_resv_critical(rtg->rtg_rmapip) ||
1279 7502499 : xfs_imeta_resv_critical(rtg->rtg_refcountip))
1280 : error = -ENOSPC;
1281 7502504 : xfs_rtgroup_put(rtg);
1282 7502504 : return error;
1283 : }
1284 :
1285 54278682 : agno = XFS_FSB_TO_AGNO(mp, fsb);
1286 54278682 : pag = xfs_perag_get(mp, agno);
1287 108557338 : if (xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) ||
1288 54278660 : xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA))
1289 : error = -ENOSPC;
1290 54278674 : xfs_perag_put(pag);
1291 54278674 : return error;
1292 : }
1293 :
1294 : /*
1295 : * Remap the given extent into the file. The dmap blockcount will be set to
1296 : * the number of blocks that were actually remapped.
     : *
     : * @ip: inode that receives the mapping
     : * @dmap: mapping to install; the caller in this file sets br_startoff to
     : * the destination file offset before calling, and br_blockcount is
     : * trimmed here to the length of the existing mapping at that offset
     : * @new_isize: upper bound applied to the computed size before i_size is
     : * raised
     : *
     : * Returns 0 on success. That includes the two "nothing to do" cases (the
     : * existing mapping already points at the same start block in the same
     : * state, or both mappings are unwritten), which cancel the transaction
     : * with error still 0. Returns -EFSCORRUPTED if the same start block is
     : * mapped with a different state, or the negative errno from whichever
     : * step failed.
     : *
     : * Locking as visible here: every exit taken after xfs_trans_alloc_inode()
     : * succeeds drops XFS_ILOCK_EXCL on @ip, while the allocation-failure exit
     : * skips the unlock.
1297 : */
1298 : STATIC int
1299 287373995 : xfs_reflink_remap_extent(
1300 : struct xfs_inode *ip,
1301 : struct xfs_bmbt_irec *dmap,
1302 : xfs_off_t new_isize)
1303 : {
1304 287373995 : struct xfs_bmbt_irec smap;
1305 287373995 : struct xfs_mount *mp = ip->i_mount;
1306 287373995 : struct xfs_trans *tp;
1307 287373995 : xfs_off_t newlen;
1308 287373995 : int64_t qdelta = 0;
1309 287373995 : unsigned int dblocks, rblocks, resblks;
1310 287373995 : bool quota_reserved = true;
1311 287373995 : bool smap_real;
1312 287373995 : bool dmap_written = xfs_bmap_is_written_extent(dmap);
1313 287373995 : bool isrt = XFS_IS_REALTIME_INODE(ip);
1314 287373995 : int iext_delta = 0;
1315 287373995 : int nimaps;
1316 287373995 : int error;
1317 :
1318 : /*
1319 : * Start a rolling transaction to switch the mappings.
1320 : *
1321 : * Adding a written extent to the extent map can cause a bmbt split,
1322 : * and removing a mapped extent from the extent can cause a bmbt split.
1323 : * The two operations cannot both cause a split since they operate on
1324 : * the same index in the bmap btree, so we only need a reservation for
1325 : * one bmbt split if either thing is happening. However, we haven't
1326 : * locked the inode yet, so we reserve assuming this is the case.
1327 : *
1328 : * The first allocation call tries to reserve enough space to handle
1329 : * mapping dmap into a sparse part of the file plus the bmbt split. We
1330 : * haven't locked the inode or read the existing mapping yet, so we do
1331 : * not know for sure that we need the space. This should succeed most
1332 : * of the time.
1333 : *
1334 : * If the first attempt fails, try again but reserving only enough
1335 : * space to handle a bmbt split. This is the hard minimum requirement,
1336 : * and we revisit quota reservations later when we know more about what
1337 : * we're remapping.
1338 : */
1339 287373995 : resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
1340 287373995 : if (XFS_IS_REALTIME_INODE(ip)) {
1341 100457850 : dblocks = resblks;
1342 100457850 : rblocks = dmap->br_blockcount;
1343 : } else {
1344 186916300 : dblocks = resblks + dmap->br_blockcount;
1345 186916300 : rblocks = 0;
1346 : }
1347 287373995 : error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
1348 : dblocks, rblocks, false, &tp);
1349 287379650 : if (error == -EDQUOT || error == -ENOSPC) {
1350 2374348 : quota_reserved = false;
1351 2374348 : error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
1352 : resblks, 0, false, &tp);
1353 : }
1354 287379320 : if (error)
1355 596854 : goto out;
1356 :
1357 : /*
1358 : * Read what's currently mapped in the destination file into smap.
1359 : * If smap isn't a hole, we will have to remove it before we can add
1360 : * dmap to the destination file.
1361 : */
1362 286782466 : nimaps = 1;
1363 286782466 : error = xfs_bmapi_read(ip, dmap->br_startoff, dmap->br_blockcount,
1364 : &smap, &nimaps, 0);
1365 286780343 : if (error)
1366 9 : goto out_cancel;
1367 286780334 : ASSERT(nimaps == 1 && smap.br_startoff == dmap->br_startoff);
1368 286780334 : smap_real = xfs_bmap_is_real_extent(&smap);
1369 :
1370 : /*
1371 : * We can only remap as many blocks as the smaller of the two extent
1372 : * maps, because we can only remap one extent at a time.
1373 : */
1374 286780334 : dmap->br_blockcount = min(dmap->br_blockcount, smap.br_blockcount);
1375 286780334 : ASSERT(dmap->br_blockcount == smap.br_blockcount);
1376 :
1377 286780334 : trace_xfs_reflink_remap_extent_dest(ip, &smap);
1378 :
1379 : /*
1380 : * Two extents mapped to the same physical block must not have
1381 : * different states; that's filesystem corruption. Move on to the next
1382 : * extent if they're both holes or both the same physical extent.
1383 : */
1384 286778070 : if (dmap->br_startblock == smap.br_startblock) {
1385 202840393 : if (dmap->br_state != smap.br_state) {
1386 0 : xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
1387 0 : error = -EFSCORRUPTED;
1388 : }
1389 202840393 : goto out_cancel;
1390 : }
1391 :
1392 : /* If both extents are unwritten, leave them alone. */
1393 83937677 : if (dmap->br_state == XFS_EXT_UNWRITTEN &&
1394 7858778 : smap.br_state == XFS_EXT_UNWRITTEN)
1395 301248 : goto out_cancel;
1396 :
1397 : /* No reflinking if the AG of the dest mapping is low on space. */
1398 83636429 : if (dmap_written) {
1399 61781223 : error = xfs_reflink_ag_has_free_space(mp, ip,
1400 : dmap->br_startblock);
1401 61781196 : if (error)
1402 6 : goto out_cancel;
1403 : }
1404 :
1405 : /*
1406 : * Increase quota reservation if we think the quota block counter for
1407 : * this file could increase.
1408 : *
1409 : * If we are mapping a written extent into the file, we need to have
1410 : * enough quota block count reservation to handle the blocks in that
1411 : * extent. We log only the delta to the quota block counts, so if the
1412 : * extent we're unmapping also has blocks allocated to it, we don't
1413 : * need a quota reservation for the extent itself.
1414 : *
1415 : * Note that if we're replacing a delalloc reservation with a written
1416 : * extent, we have to take the full quota reservation because removing
1417 : * the delalloc reservation gives the block count back to the quota
1418 : * count. This is suboptimal, but the VFS flushed the dest range
1419 : * before we started. That should have removed all the delalloc
1420 : * reservations, but we code defensively.
1421 : *
1422 : * xfs_trans_alloc_inode above already tried to grab an even larger
1423 : * quota reservation, and kicked off a blockgc scan if it couldn't.
1424 : * If we can't get a potentially smaller quota reservation now, we're
1425 : * done.
1426 : */
1427 83636396 : if (!quota_reserved && !smap_real && dmap_written) {
1428 17560 : if (XFS_IS_REALTIME_INODE(ip)) {
1429 0 : dblocks = 0;
1430 0 : rblocks = dmap->br_blockcount;
1431 : } else {
1432 17560 : dblocks = dmap->br_blockcount;
1433 17560 : rblocks = 0;
1434 : }
1435 17560 : error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, rblocks,
1436 : false);
1437 17560 : if (error)
1438 0 : goto out_cancel;
1439 : }
1440 :
     /*
      * Count one extent record change for removing the old mapping and one
      * for adding the new mapping; the total is checked against the data
      * fork's extent count limit below.
      */
1441 83636396 : if (smap_real)
1442 15551352 : ++iext_delta;
1443 :
1444 83636396 : if (dmap_written)
1445 61781193 : ++iext_delta;
1446 :
1447 83636396 : error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, iext_delta);
1448 83636241 : if (error == -EFBIG)
1449 6 : error = xfs_iext_count_upgrade(tp, ip, iext_delta);
1450 83636241 : if (error)
1451 6 : goto out_cancel;
1452 :
1453 83636235 : if (smap_real) {
1454 : /*
1455 : * If the extent we're unmapping is backed by storage (written
1456 : * or not), unmap the extent and drop its refcount.
1457 : */
1458 15551212 : xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &smap);
1459 15550755 : xfs_refcount_decrease_extent(tp, isrt, &smap);
1460 15550889 : qdelta -= smap.br_blockcount;
1461 68085023 : } else if (smap.br_startblock == DELAYSTARTBLOCK) {
1462 79 : int done;
1463 :
1464 : /*
1465 : * If the extent we're unmapping is a delalloc reservation,
1466 : * we can use the regular bunmapi function to release the
1467 : * incore state. Dropping the delalloc reservation takes care
1468 : * of the quota reservation for us.
1469 : */
1470 79 : error = xfs_bunmapi(NULL, ip, smap.br_startoff,
1471 : smap.br_blockcount, 0, 1, &done);
1472 79 : if (error)
1473 0 : goto out_cancel;
1474 79 : ASSERT(done);
1475 : }
1476 :
1477 : /*
1478 : * If the extent we're sharing is backed by written storage, increase
1479 : * its refcount and map it into the file.
1480 : */
1481 83635912 : if (dmap_written) {
1482 61781139 : xfs_refcount_increase_extent(tp, isrt, dmap);
1483 61781180 : xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, dmap);
1484 61781152 : qdelta += dmap->br_blockcount;
1485 : }
1486 :
1487 83635925 : xfs_reflink_update_quota(tp, ip, false, qdelta);
1488 :
1489 : /* Update dest isize if needed. */
1490 83635694 : newlen = XFS_FSB_TO_B(mp, dmap->br_startoff + dmap->br_blockcount);
1491 83635694 : newlen = min_t(xfs_off_t, newlen, new_isize);
1492 83635694 : if (newlen > i_size_read(VFS_I(ip))) {
1493 53800692 : trace_xfs_reflink_update_inode_size(ip, newlen);
1494 53800692 : i_size_write(VFS_I(ip), newlen);
1495 53800692 : ip->i_disk_size = newlen;
1496 53800692 : xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1497 : }
1498 :
1499 : /* Commit everything and unlock. */
1500 83635694 : error = xfs_trans_commit(tp);
1501 83636702 : goto out_unlock;
1502 :
1503 203141662 : out_cancel:
1504 203141662 : xfs_trans_cancel(tp);
1505 286778787 : out_unlock:
1506 286778787 : xfs_iunlock(ip, XFS_ILOCK_EXCL);
1507 287368429 : out:
1508 287368429 : if (error)
1509 597670 : trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
1510 287368438 : return error;
1511 : }
1512 :
1513 : /*
     : * Remap a range of one file to the other.
     : *
     : * @src: inode whose mappings are read
     : * @pos_in: byte offset of the range in @src
     : * @dest: inode that receives the mappings
     : * @pos_out: byte offset of the range in @dest
     : * @remap_len: length of the range in bytes
     : * @remapped: out parameter; always written before returning, even on
     : * error, with the smaller of @remap_len and the byte size of
     : * the blocks counted as remapped
     : *
     : * Each pass reads one mapping from @src under the data map shared lock,
     : * retargets it to the current destination offset and passes it to
     : * xfs_reflink_remap_extent(), which may shorten it; the offsets then
     : * advance by the length that was actually remapped.
     : *
     : * Returns 0, the error from xfs_bmapi_read() or
     : * xfs_reflink_remap_extent(), -EFSCORRUPTED if a delalloc mapping is
     : * found in the source range, or -EINTR if a fatal signal is pending.
     : * The -EINTR exit is taken before the counters advance, so the extent
     : * remapped in that final pass is not included in @remapped.
     : */
1514 : int
1515 197409030 : xfs_reflink_remap_blocks(
1516 : struct xfs_inode *src,
1517 : loff_t pos_in,
1518 : struct xfs_inode *dest,
1519 : loff_t pos_out,
1520 : loff_t remap_len,
1521 : loff_t *remapped)
1522 : {
1523 197409030 : struct xfs_bmbt_irec imap;
1524 197409030 : struct xfs_mount *mp = src->i_mount;
1525 197409030 : xfs_fileoff_t srcoff = XFS_B_TO_FSBT(mp, pos_in);
1526 197409030 : xfs_fileoff_t destoff = XFS_B_TO_FSBT(mp, pos_out);
1527 197409030 : xfs_filblks_t len;
1528 197409030 : xfs_filblks_t remapped_len = 0;
1529 197409030 : xfs_off_t new_isize = pos_out + remap_len;
1530 197409030 : int nimaps;
1531 197409030 : int error = 0;
1532 :
1533 197409030 : len = min_t(xfs_filblks_t, XFS_B_TO_FSB(mp, remap_len),
1534 : XFS_MAX_FILEOFF);
1535 :
1536 : /*
1537 : * Make sure the end is aligned with a rt extent (if desired), since
1538 : * the end of the range could be EOF.
1539 : */
1540 197409030 : if (xfs_inode_has_bigrtextents(dest))
1541 0 : len = xfs_rtb_roundup_rtx(mp, len);
1542 :
1543 197409030 : trace_xfs_reflink_remap_blocks(src, srcoff, len, dest, destoff);
1544 :
1545 484180453 : while (len > 0) {
1546 287358801 : unsigned int lock_mode;
1547 :
1548 : /* Read extent from the source file */
1549 287358801 : nimaps = 1;
1550 287358801 : lock_mode = xfs_ilock_data_map_shared(src);
1551 287366027 : error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0);
1552 287371356 : xfs_iunlock(src, lock_mode);
1553 287375642 : if (error)
1554 : break;
1555 : /*
1556 : * The caller supposedly flushed all dirty pages in the source
1557 : * file range, which means that writeback should have allocated
1558 : * or deleted all delalloc reservations in that range. If we
1559 : * find one, that's a good sign that something is seriously
1560 : * wrong here.
1561 : */
1562 287375581 : ASSERT(nimaps == 1 && imap.br_startoff == srcoff);
1563 287375581 : if (imap.br_startblock == DELAYSTARTBLOCK) {
1564 0 : ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1565 0 : xfs_bmap_mark_sick(src, XFS_DATA_FORK);
1566 0 : error = -EFSCORRUPTED;
1567 0 : break;
1568 : }
1569 :
1570 287375581 : trace_xfs_reflink_remap_extent_src(src, &imap);
1571 :
1572 : /* Remap into the destination file at the given offset. */
1573 287376186 : imap.br_startoff = destoff;
1574 287376186 : error = xfs_reflink_remap_extent(dest, &imap, new_isize);
1575 287365658 : if (error)
1576 : break;
1577 :
1578 286767980 : if (fatal_signal_pending(current)) {
1579 : error = -EINTR;
1580 : break;
1581 : }
1582 :
1583 : /* Advance drange/srange */
1584 286771423 : srcoff += imap.br_blockcount;
1585 286771423 : destoff += imap.br_blockcount;
1586 286771423 : len -= imap.br_blockcount;
1587 286771423 : remapped_len += imap.br_blockcount;
1588 : }
1589 :
1590 197424485 : if (error)
1591 597904 : trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_);
1592 197424485 : *remapped = min_t(loff_t, remap_len,
1593 : XFS_FSB_TO_B(src->i_mount, remapped_len));
1594 197424485 : return error;
1595 : }
1596 :
1597 : /*
1598 : * If we're reflinking to a point past the destination file's EOF, we must
1599 : * zero any speculative post-EOF preallocations that sit between the old EOF
1600 : * and the destination file offset.
     : *
     : * @ip: inode to zero
     : * @pos: byte offset at which the remap will start
     : *
     : * Returns 0 without doing anything when @pos is at or below the current
     : * i_size. Otherwise returns the result of xfs_zero_range() over the
     : * bytes from the current i_size up to @pos.
1601 : */
1602 : static int
1603 197424030 : xfs_reflink_zero_posteof(
1604 : struct xfs_inode *ip,
1605 : loff_t pos)
1606 : {
1607 197424030 : loff_t isize = i_size_read(VFS_I(ip));
1608 :
1609 197424030 : if (pos <= isize)
1610 : return 0;
1611 :
1612 3107520 : trace_xfs_zero_eof(ip, isize, pos - isize);
1613 3107518 : return xfs_zero_range(ip, isize, pos - isize, NULL);
1614 : }
1615 :
1616 : #ifdef CONFIG_XFS_RT
1617 : /*
     : * Adjust the length of the remap operation to end on a rt extent boundary.
     : *
     : * @src: source inode; supplies the mount and the EOF used below
     : * @pos_in: byte offset of the range in @src
     : * @dest: destination inode (only passed to the tracepoint here)
     : * @pos_out: byte offset of the range in @dest (only passed to the
     : * tracepoint here)
     : * @len: in/out request length in bytes; may be reduced
     : * @remap_flags: REMAP_FILE_* flags; REMAP_FILE_CAN_SHORTEN permits the
     : * reduction
     : *
     : * Returns 0 if the length is already a multiple of the rt extent size,
     : * if the source range ends exactly at the source file's EOF, or after
     : * trimming *@len. Returns -EINVAL if trimming is needed but not
     : * permitted. Without CONFIG_XFS_RT this is a macro that evaluates to 0.
     : */
1618 : STATIC int
1619 0 : xfs_reflink_remap_adjust_rtlen(
1620 : struct xfs_inode *src,
1621 : loff_t pos_in,
1622 : struct xfs_inode *dest,
1623 : loff_t pos_out,
1624 : loff_t *len,
1625 : unsigned int remap_flags)
1626 : {
1627 0 : struct xfs_mount *mp = src->i_mount;
1628 0 : uint32_t mod;
1629 :
1630 0 : div_u64_rem(*len, XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize), &mod);
1631 :
1632 : /*
1633 : * We previously checked the rtextent alignment of both offsets, so we
1634 : * now have to check the alignment of the length. The VFS remap prep
1635 : * function can change the length on us, so we can only make length
1636 : * adjustments after that. If the length is aligned to an rtextent,
1637 : * we're trivially good to go.
1638 : *
1639 : * Otherwise, the length is not aligned to an rt extent. If the source
1640 : * file's range ends at EOF, the VFS ensured that the dest file's range
1641 : * also ends at EOF. The actual remap function will round the (byte)
1642 : * length up to the nearest rtextent unit, so we're ok here too.
1643 : */
1644 0 : if (mod == 0 || pos_in + *len == i_size_read(VFS_I(src)))
1645 : return 0;
1646 :
1647 : /*
1648 : * Otherwise, the only thing we can do is round the request length down
1649 : * to an rt extent boundary. If the caller doesn't allow that, we are
1650 : * finished.
1651 : */
1652 0 : if (!(remap_flags & REMAP_FILE_CAN_SHORTEN))
1653 : return -EINVAL;
1654 :
1655 : /* Trim the partial rt extent (the remainder) off the end of the request. */
1656 0 : (*len) -= mod;
1657 0 : trace_xfs_reflink_remap_adjust_rtlen(src, pos_in, *len, dest, pos_out);
1658 0 : return 0;
1659 : }
1660 : #else
1661 : # define xfs_reflink_remap_adjust_rtlen(...) (0)
1662 : #endif /* CONFIG_XFS_RT */
1663 :
1664 : /*
1665 : * Check the alignment of a remap request when the allocation unit size isn't a
1666 : * power of two. The VFS helpers use (fast) bitmask-based alignment checks,
1667 : * but here we have to use slow long division.
     : *
     : * @ip_in: source inode
     : * @pos_in: byte offset of the range in @ip_in
     : * @ip_out: destination inode
     : * @pos_out: byte offset of the range in @ip_out
     : * @req_len: in/out request length in bytes; a value of zero is treated
     : * as "from @pos_in to the source file's EOF"; may be shortened
     : * @remap_flags: REMAP_FILE_* flags
     : *
     : * Returns -EINVAL if either start offset is not rt extent aligned, or if
     : * both ranges are in the same inode and overlap. Returns 0 if the length
     : * is rt extent aligned, or if the unaligned tail does not land inside the
     : * destination file (pos_out + length is not below the destination size).
     : * Otherwise the length must be rounded down: with REMAP_FILE_CAN_SHORTEN
     : * and a nonzero result, *@req_len is updated and 0 is returned; failing
     : * that, -EBADE is returned for REMAP_FILE_DEDUP requests and -EINVAL for
     : * everything else.
1668 : */
1669 : static int
1670 0 : xfs_reflink_remap_check_rtalign(
1671 : struct xfs_inode *ip_in,
1672 : loff_t pos_in,
1673 : struct xfs_inode *ip_out,
1674 : loff_t pos_out,
1675 : loff_t *req_len,
1676 : unsigned int remap_flags)
1677 : {
1678 0 : struct xfs_mount *mp = ip_in->i_mount;
1679 0 : uint32_t rextbytes;
1680 0 : loff_t in_size, out_size;
1681 0 : loff_t new_length, length = *req_len;
1682 0 : loff_t blen;
1683 :
1684 0 : rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
1685 0 : in_size = i_size_read(VFS_I(ip_in));
1686 0 : out_size = i_size_read(VFS_I(ip_out));
1687 :
1688 : /* The start of both ranges must be aligned to a rt extent. */
1689 0 : if (!isaligned_64(pos_in, rextbytes) ||
1690 0 : !isaligned_64(pos_out, rextbytes))
1691 : return -EINVAL;
1692 :
1693 0 : if (length == 0)
1694 0 : length = in_size - pos_in;
1695 :
1696 : /*
1697 : * If the user wanted us to remap up to the infile's EOF, round up
1698 : * to the next rt extent boundary for this check.
1699 : *
1700 : * Otherwise, round the range length down to a rt extent boundary. We
1701 : * already confirmed the starting offsets' extent alignment.
1702 : */
1703 0 : if (pos_in + length == in_size)
1704 0 : blen = roundup_64(in_size, rextbytes) - pos_in;
1705 : else
1706 0 : blen = rounddown_64(length, rextbytes);
1707 :
1708 : /* Don't allow overlapped remappings within the same file. */
1709 0 : if (ip_in == ip_out &&
1710 0 : pos_out + blen > pos_in &&
1711 0 : pos_in + blen > pos_out)
1712 : return -EINVAL;
1713 :
1714 : /*
1715 : * Ensure that we don't remap a partial EOF extent into the middle
1716 : * of another file.
1717 : */
1718 0 : if (isaligned_64(length, rextbytes))
1719 : return 0;
1720 :
1721 0 : new_length = length;
1722 0 : if (pos_out + length < out_size)
1723 0 : new_length = rounddown_64(new_length, rextbytes);
1724 :
1725 0 : if (new_length == length)
1726 : return 0;
1727 :
1728 : /*
1729 : * Return the shortened request if the caller permits it. If the
1730 : * request was shortened to zero rt extents, we know that the original
1731 : * arguments weren't valid in the first place.
1732 : */
1733 0 : if ((remap_flags & REMAP_FILE_CAN_SHORTEN) && new_length > 0) {
1734 0 : *req_len = new_length;
1735 0 : return 0;
1736 : }
1737 :
1738 0 : return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
1739 : }
1740 :
1741 : /*
1742 : * Prepare two files for range cloning. Upon a successful return both inodes
1743 : * will have the iolock and mmaplock held, the page cache of the out file will
1744 : * be truncated, and any leases on the out file will have been broken. This
1745 : * function borrows heavily from xfs_file_aio_write_checks.
1746 : *
1747 : * The VFS allows partial EOF blocks to "match" for dedupe even though it hasn't
1748 : * checked that the bytes beyond EOF physically match. Hence we cannot use the
1749 : * EOF block in the source dedupe range because it's not a complete block match,
1750 : * hence can introduce a corruption into the file that has it's block replaced.
1751 : *
1752 : * In similar fashion, the VFS file cloning also allows partial EOF blocks to be
1753 : * "block aligned" for the purposes of cloning entire files. However, if the
1754 : * source file range includes the EOF block and it lands within the existing EOF
1755 : * of the destination file, then we can expose stale data from beyond the source
1756 : * file EOF in the destination file.
1757 : *
1758 : * XFS doesn't support partial block sharing, so in both cases we have check
1759 : * these cases ourselves. For dedupe, we can simply round the length to dedupe
1760 : * down to the previous whole block and ignore the partial EOF block. While this
1761 : * means we can't dedupe the last block of a file, this is an acceptible
1762 : * tradeoff for simplicity on implementation.
1763 : *
1764 : * For cloning, we want to share the partial EOF block if it is also the new EOF
1765 : * block of the destination file. If the partial EOF block lies inside the
1766 : * existing destination EOF, then we have to abort the clone to avoid exposing
1767 : * stale data in the destination file. Hence we reject these clone attempts with
1768 : * -EINVAL in this case.
1769 : */
1770 : int
1771 272702656 : xfs_reflink_remap_prep(
1772 : struct file *file_in,
1773 : loff_t pos_in,
1774 : struct file *file_out,
1775 : loff_t pos_out,
1776 : loff_t *len,
1777 : unsigned int remap_flags)
1778 : {
1779 272702656 : struct inode *inode_in = file_inode(file_in);
1780 272702656 : struct xfs_inode *src = XFS_I(inode_in);
1781 272702656 : struct inode *inode_out = file_inode(file_out);
1782 272702656 : struct xfs_inode *dest = XFS_I(inode_out);
1783 272702656 : const struct iomap_ops *dax_read_ops = NULL;
1784 272702656 : unsigned int alloc_unit = xfs_inode_alloc_unitsize(dest);
1785 272701536 : int ret;
1786 :
1787 : /* Lock both files against IO */
1788 272701536 : ret = xfs_ilock2_io_mmap(src, dest);
1789 272722455 : if (ret)
1790 : return ret;
1791 :
1792 : /* Check file eligibility and prepare for block sharing. */
1793 272722455 : ret = -EINVAL;
1794 : /* Can't reflink between data and rt volumes */
1795 588288367 : if (XFS_IS_REALTIME_INODE(src) != XFS_IS_REALTIME_INODE(dest))
1796 0 : goto out_unlock;
1797 :
1798 : /* Don't share DAX file data with non-DAX file. */
1799 272722455 : if (IS_DAX(inode_in) != IS_DAX(inode_out))
1800 : goto out_unlock;
1801 :
1802 : /* Check non-power of two alignment issues, if necessary. */
1803 387660867 : if (XFS_IS_REALTIME_INODE(dest) && !is_power_of_2(alloc_unit)) {
1804 0 : ret = xfs_reflink_remap_check_rtalign(src, pos_in, dest,
1805 : pos_out, len, remap_flags);
1806 0 : if (ret)
1807 0 : goto out_unlock;
1808 :
1809 : /* Do the VFS checks with the regular block alignment. */
1810 0 : alloc_unit = src->i_mount->m_sb.sb_blocksize;
1811 : }
1812 :
1813 272722455 : if (IS_DAX(inode_in))
1814 : dax_read_ops = &xfs_read_iomap_ops;
1815 :
1816 272722455 : ret = __generic_remap_file_range_prep(file_in, pos_in, file_out,
1817 : pos_out, len, remap_flags, dax_read_ops, alloc_unit);
1818 272720854 : if (ret || *len == 0)
1819 75293322 : goto out_unlock;
1820 :
1821 : /* Make sure the end is aligned with a rt extent. */
1822 197427532 : if (xfs_inode_has_bigrtextents(src)) {
1823 0 : ret = xfs_reflink_remap_adjust_rtlen(src, pos_in, dest,
1824 : pos_out, len, remap_flags);
1825 0 : if (ret || *len == 0)
1826 0 : goto out_unlock;
1827 : }
1828 :
1829 : /* Attach dquots to dest inode before changing block map */
1830 197427532 : ret = xfs_qm_dqattach(dest);
1831 197425805 : if (ret)
1832 0 : goto out_unlock;
1833 :
1834 : /*
1835 : * Zero existing post-eof speculative preallocations in the destination
1836 : * file.
1837 : */
1838 197425805 : ret = xfs_reflink_zero_posteof(dest, pos_out);
1839 197425198 : if (ret)
1840 225 : goto out_unlock;
1841 :
1842 : /* Set flags and remap blocks. */
1843 197424973 : ret = xfs_reflink_set_inode_flag(src, dest);
1844 197424085 : if (ret)
1845 5 : goto out_unlock;
1846 :
1847 : /*
1848 : * Now that we've marked both inodes for reflink, make sure that all
1849 : * possible rt extents in both files' ranges are either wholly written,
1850 : * wholly unwritten, or holes. The bmap code requires that we align
1851 : * all unmap and remap requests to a rt extent boundary. We've already
1852 : * flushed the page cache and finished directio for the range that's
1853 : * being remapped, so we can convert the extents directly.
1854 : */
1855 197424080 : if (xfs_inode_has_bigrtextents(src)) {
1856 0 : ret = xfs_rtfile_convert_unwritten(src, pos_in, *len);
1857 0 : if (ret)
1858 0 : goto out_unlock;
1859 : }
1860 197424080 : if (xfs_inode_has_bigrtextents(dest)) {
1861 0 : ret = xfs_rtfile_convert_unwritten(dest, pos_out, *len);
1862 0 : if (ret)
1863 0 : goto out_unlock;
1864 : }
1865 :
1866 : /*
1867 : * If pos_out > EOF, we may have dirtied blocks between EOF and
1868 : * pos_out. In that case, we need to extend the flush and unmap to cover
1869 : * from EOF to the end of the copy length.
1870 : */
1871 394848160 : if (pos_out > XFS_ISIZE(dest)) {
1872 3105240 : loff_t flen = *len + (pos_out - XFS_ISIZE(dest));
1873 3105240 : ret = xfs_flush_unmap_range(dest, XFS_ISIZE(dest), flen);
1874 : } else {
1875 194318840 : ret = xfs_flush_unmap_range(dest, pos_out, *len);
1876 : }
1877 197418404 : if (ret)
1878 916 : goto out_unlock;
1879 :
1880 : return 0;
1881 75294468 : out_unlock:
1882 75294468 : xfs_iunlock2_io_mmap(src, dest);
1883 75294468 : return ret;
1884 : }
1885 :
1886 : /* Does this inode need the reflink flag? */
1887 : int
1888 23128660 : xfs_reflink_inode_has_shared_extents(
1889 : struct xfs_trans *tp,
1890 : struct xfs_inode *ip,
1891 : bool *has_shared)
1892 : {
1893 23128660 : struct xfs_bmbt_irec got;
1894 23128660 : struct xfs_mount *mp = ip->i_mount;
1895 23128660 : struct xfs_ifork *ifp;
1896 23128660 : struct xfs_iext_cursor icur;
1897 23128660 : bool found;
1898 23128660 : int error;
1899 :
1900 23128660 : ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
1901 23128660 : error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
1902 23128967 : if (error)
1903 : return error;
1904 :
1905 23128974 : *has_shared = false;
1906 23128974 : found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got);
1907 822691872 : while (found) {
1908 801131148 : xfs_agblock_t rbno;
1909 801131148 : xfs_extlen_t rlen;
1910 :
1911 801131148 : if (isnullstartblock(got.br_startblock) ||
1912 801128710 : got.br_state != XFS_EXT_NORM)
1913 9685066 : goto next;
1914 :
1915 798887781 : if (XFS_IS_REALTIME_INODE(ip)) {
1916 7440697 : struct xfs_rtgroup *rtg;
1917 7440697 : xfs_rgnumber_t rgno;
1918 7440697 : xfs_rgblock_t rgbno;
1919 :
1920 7440697 : rgbno = xfs_rtb_to_rgbno(mp, got.br_startblock, &rgno);
1921 7440741 : rtg = xfs_rtgroup_get(mp, rgno);
1922 7441045 : error = xfs_reflink_find_rtshared(rtg, tp, rgbno,
1923 7441045 : got.br_blockcount, &rbno, &rlen,
1924 : false);
1925 7441501 : xfs_rtgroup_put(rtg);
1926 : } else {
1927 784005385 : struct xfs_perag *pag;
1928 784005385 : xfs_agblock_t agbno;
1929 :
1930 784005385 : pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp,
1931 : got.br_startblock));
1932 784006100 : agbno = XFS_FSB_TO_AGBNO(mp, got.br_startblock);
1933 784006100 : error = xfs_reflink_find_shared(pag, tp, agbno,
1934 784006100 : got.br_blockcount, &rbno, &rlen,
1935 : false);
1936 784009943 : xfs_perag_put(pag);
1937 : }
1938 791450100 : if (error)
1939 1568235 : return error;
1940 :
1941 : /* Is there still a shared block here? */
1942 791450100 : if (rbno != NULLAGBLOCK) {
1943 1568235 : *has_shared = true;
1944 1568235 : return 0;
1945 : }
1946 789881865 : next:
1947 799566931 : found = xfs_iext_next_extent(ifp, &icur, &got);
1948 : }
1949 :
1950 : return 0;
1951 : }
1952 :
1953 : /*
1954 : * Clear the inode reflink flag if there are no shared extents.
1955 : *
1956 : * The caller is responsible for joining the inode to the transaction passed in.
1957 : * The inode will be joined to the transaction that is returned to the caller.
1958 : */
1959 : int
1960 14390 : xfs_reflink_clear_inode_flag(
1961 : struct xfs_inode *ip,
1962 : struct xfs_trans **tpp)
1963 : {
1964 14390 : bool needs_flag;
1965 14390 : int error = 0;
1966 :
1967 14390 : ASSERT(xfs_is_reflink_inode(ip));
1968 :
1969 14390 : error = xfs_reflink_inode_has_shared_extents(*tpp, ip, &needs_flag);
1970 14390 : if (error || needs_flag)
1971 : return error;
1972 :
1973 : /*
1974 : * We didn't find any shared blocks so turn off the reflink flag.
1975 : * First, get rid of any leftover CoW mappings.
1976 : */
1977 7473 : error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, XFS_MAX_FILEOFF,
1978 : true);
1979 7473 : if (error)
1980 : return error;
1981 :
1982 : /* Clear the inode flag. */
1983 7473 : trace_xfs_reflink_unset_inode_flag(ip);
1984 7473 : ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
1985 7473 : xfs_inode_clear_cowblocks_tag(ip);
1986 7473 : xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
1987 :
1988 7473 : return error;
1989 : }
1990 :
1991 : /*
1992 : * Clear the inode reflink flag if there are no shared extents and the size
1993 : * hasn't changed.
1994 : */
1995 : STATIC int
1996 38 : xfs_reflink_try_clear_inode_flag(
1997 : struct xfs_inode *ip)
1998 : {
1999 38 : struct xfs_mount *mp = ip->i_mount;
2000 38 : struct xfs_trans *tp;
2001 38 : int error = 0;
2002 :
2003 : /* Start a rolling transaction to remove the mappings */
2004 38 : error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
2005 38 : if (error)
2006 : return error;
2007 :
2008 38 : xfs_ilock(ip, XFS_ILOCK_EXCL);
2009 38 : xfs_trans_ijoin(tp, ip, 0);
2010 :
2011 38 : error = xfs_reflink_clear_inode_flag(ip, &tp);
2012 38 : if (error)
2013 0 : goto cancel;
2014 :
2015 38 : error = xfs_trans_commit(tp);
2016 38 : if (error)
2017 0 : goto out;
2018 :
2019 38 : xfs_iunlock(ip, XFS_ILOCK_EXCL);
2020 38 : return 0;
2021 : cancel:
2022 0 : xfs_trans_cancel(tp);
2023 0 : out:
2024 0 : xfs_iunlock(ip, XFS_ILOCK_EXCL);
2025 0 : return error;
2026 : }
2027 :
2028 : /*
2029 : * Pre-COW all shared blocks within a given byte range of a file and turn off
2030 : * the reflink flag if we unshare all of the file's blocks.
2031 : */
2032 : int
2033 66 : xfs_reflink_unshare(
2034 : struct xfs_inode *ip,
2035 : xfs_off_t offset,
2036 : xfs_off_t len)
2037 : {
2038 66 : struct inode *inode = VFS_I(ip);
2039 66 : int error;
2040 :
2041 66 : if (!xfs_is_reflink_inode(ip))
2042 : return 0;
2043 :
2044 40 : trace_xfs_reflink_unshare(ip, offset, len);
2045 :
2046 40 : inode_dio_wait(inode);
2047 :
2048 40 : if (IS_DAX(inode))
2049 : error = dax_file_unshare(inode, offset, len,
2050 : &xfs_dax_write_iomap_ops);
2051 : else
2052 40 : error = iomap_file_unshare(inode, offset, len,
2053 : &xfs_buffered_write_iomap_ops);
2054 40 : if (error)
2055 0 : goto out;
2056 :
2057 40 : error = filemap_write_and_wait_range(inode->i_mapping, offset,
2058 40 : offset + len - 1);
2059 40 : if (error)
2060 2 : goto out;
2061 :
2062 : /* Turn off the reflink flag if possible. */
2063 38 : error = xfs_reflink_try_clear_inode_flag(ip);
2064 38 : if (error)
2065 0 : goto out;
2066 : return 0;
2067 :
2068 2 : out:
2069 2 : trace_xfs_reflink_unshare_error(ip, error, _RET_IP_);
2070 2 : return error;
2071 : }
|