Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0+
2 : /*
3 : * Copyright (C) 2016 Oracle. All Rights Reserved.
4 : * Author: Darrick J. Wong <darrick.wong@oracle.com>
5 : */
6 : #include "xfs.h"
7 : #include "xfs_fs.h"
8 : #include "xfs_shared.h"
9 : #include "xfs_format.h"
10 : #include "xfs_log_format.h"
11 : #include "xfs_trans_resv.h"
12 : #include "xfs_mount.h"
13 : #include "xfs_defer.h"
14 : #include "xfs_inode.h"
15 : #include "xfs_trans.h"
16 : #include "xfs_bmap.h"
17 : #include "xfs_bmap_util.h"
18 : #include "xfs_trace.h"
19 : #include "xfs_icache.h"
20 : #include "xfs_btree.h"
21 : #include "xfs_refcount_btree.h"
22 : #include "xfs_refcount.h"
23 : #include "xfs_bmap_btree.h"
24 : #include "xfs_trans_space.h"
25 : #include "xfs_bit.h"
26 : #include "xfs_alloc.h"
27 : #include "xfs_quota.h"
28 : #include "xfs_reflink.h"
29 : #include "xfs_iomap.h"
30 : #include "xfs_ag.h"
31 : #include "xfs_ag_resv.h"
32 : #include "xfs_health.h"
33 : #include "xfs_rtrefcount_btree.h"
34 : #include "xfs_rtalloc.h"
35 : #include "xfs_rtgroup.h"
36 : #include "xfs_imeta.h"
37 : #include "xfs_rtbitmap.h"
38 :
39 : /*
40 : * Copy on Write of Shared Blocks
41 : *
42 : * XFS must preserve "the usual" file semantics even when two files share
43 : * the same physical blocks. This means that a write to one file must not
44 : * alter the blocks in a different file; the way that we'll do that is
45 : * through the use of a copy-on-write mechanism. At a high level, that
46 : * means that when we want to write to a shared block, we allocate a new
47 : * block, write the data to the new block, and if that succeeds we map the
48 : * new block into the file.
49 : *
50 : * XFS provides a "delayed allocation" mechanism that defers the allocation
51 : * of disk blocks to dirty-but-not-yet-mapped file blocks as long as
52 : * possible. This reduces fragmentation by enabling the filesystem to ask
53 : * for bigger chunks less often, which is exactly what we want for CoW.
54 : *
55 : * The delalloc mechanism begins when the kernel wants to make a block
56 : * writable (write_begin or page_mkwrite). If the offset is not mapped, we
57 : * create a delalloc mapping, which is a regular in-core extent, but without
58 : * a real startblock. (For delalloc mappings, the startblock encodes both
59 : * a flag that this is a delalloc mapping, and a worst-case estimate of how
60 : * many blocks might be required to put the mapping into the BMBT.) delalloc
61 : * mappings are a reservation against the free space in the filesystem;
62 : * adjacent mappings can also be combined into fewer larger mappings.
63 : *
64 : * As an optimization, the CoW extent size hint (cowextsz) creates
65 : * outsized aligned delalloc reservations in the hope of landing out of
66 : * order nearby CoW writes in a single extent on disk, thereby reducing
67 : * fragmentation and improving future performance.
68 : *
69 : * D: --RRRRRRSSSRRRRRRRR--- (data fork)
70 : * C: ------DDDDDDD--------- (CoW fork)
71 : *
72 : * When dirty pages are being written out (typically in writepage), the
73 : * delalloc reservations are converted into unwritten mappings by
74 : * allocating blocks and replacing the delalloc mapping with real ones.
75 : * A delalloc mapping can be replaced by several unwritten ones if the
76 : * free space is fragmented.
77 : *
78 : * D: --RRRRRRSSSRRRRRRRR---
79 : * C: ------UUUUUUU---------
80 : *
81 : * We want to adapt the delalloc mechanism for copy-on-write, since the
82 : * write paths are similar. The first two steps (creating the reservation
83 : * and allocating the blocks) are exactly the same as delalloc except that
84 : * the mappings must be stored in a separate CoW fork because we do not want
85 : * to disturb the mapping in the data fork until we're sure that the write
86 : * succeeded. IO completion in this case is the process of removing the old
87 : * mapping from the data fork and moving the new mapping from the CoW fork to
88 : * the data fork. This will be discussed shortly.
89 : *
90 : * For now, unaligned directio writes will be bounced back to the page cache.
91 : * Block-aligned directio writes will use the same mechanism as buffered
92 : * writes.
93 : *
94 : * Just prior to submitting the actual disk write requests, we convert
95 : * the extents representing the range of the file actually being written
96 : * (as opposed to extra pieces created for the cowextsize hint) to real
97 : * extents. This will become important in the next step:
98 : *
99 : * D: --RRRRRRSSSRRRRRRRR---
100 : * C: ------UUrrUUU---------
101 : *
102 : * CoW remapping must be done after the data block write completes,
103 : * because we don't want to destroy the old data fork map until we're sure
104 : * the new block has been written. Since the new mappings are kept in a
105 : * separate fork, we can simply iterate these mappings to find the ones
106 : * that cover the file blocks that we just CoW'd. For each extent, simply
107 : * unmap the corresponding range in the data fork, map the new range into
108 : * the data fork, and remove the extent from the CoW fork. Because of
109 : * the presence of the cowextsize hint, however, we must be careful
110 : * only to remap the blocks that we've actually written out -- we must
111 : * never remap delalloc reservations nor CoW staging blocks that have
112 : * yet to be written. This corresponds exactly to the real extents in
113 : * the CoW fork:
114 : *
115 : * D: --RRRRRRrrSRRRRRRRR---
116 : * C: ------UU--UUU---------
117 : *
118 : * Since the remapping operation can be applied to an arbitrary file
119 : * range, we record the need for the remap step as a flag in the ioend
120 : * instead of declaring a new IO type. This is required for direct io
121 : * because we only have ioend for the whole dio, and we have to be able to
122 : * remember the presence of unwritten blocks and CoW blocks with a single
123 : * ioend structure. Better yet, the more ground we can cover with one
124 : * ioend, the better.
125 : */
126 :
127 : /*
128 : * Given an AG extent, find the lowest-numbered run of shared blocks
129 : * within that range and return the range in fbno/flen. If
130 : * find_end_of_shared is true, return the longest contiguous extent of
131 : * shared blocks. If there are no shared extents, fbno and flen will
132 : * be set to NULLAGBLOCK and 0, respectively.
133 : */
134 : static int
135 788379267 : xfs_reflink_find_shared(
136 : struct xfs_perag *pag,
137 : struct xfs_trans *tp,
138 : xfs_agblock_t agbno,
139 : xfs_extlen_t aglen,
140 : xfs_agblock_t *fbno,
141 : xfs_extlen_t *flen,
142 : bool find_end_of_shared)
143 : {
144 788379267 : struct xfs_buf *agbp;
145 788379267 : struct xfs_btree_cur *cur;
146 788379267 : int error;
147 :
148 788379267 : error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
149 788386771 : if (error)
150 : return error;
151 :
152 788387620 : cur = xfs_refcountbt_init_cursor(pag->pag_mount, tp, agbp, pag);
153 :
154 788431977 : error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen,
155 : find_end_of_shared);
156 :
157 788353270 : xfs_btree_del_cursor(cur, error);
158 :
159 788420172 : xfs_trans_brelse(tp, agbp);
160 788420172 : return error;
161 : }
162 :
163 : /*
164 : * Given an RT extent, find the lowest-numbered run of shared blocks
165 : * within that range and return the range in fbno/flen. If
166 : * find_end_of_shared is true, return the longest contiguous extent of
167 : * shared blocks. If there are no shared extents, fbno and flen will
168 : * be set to NULLRGBLOCK and 0, respectively.
169 : */
170 : static int
171 62251591 : xfs_reflink_find_rtshared(
172 : struct xfs_rtgroup *rtg,
173 : struct xfs_trans *tp,
174 : xfs_agblock_t rtbno,
175 : xfs_extlen_t rtlen,
176 : xfs_agblock_t *fbno,
177 : xfs_extlen_t *flen,
178 : bool find_end_of_shared)
179 : {
180 62251591 : struct xfs_mount *mp = rtg->rtg_mount;
181 62251591 : struct xfs_btree_cur *cur;
182 62251591 : int error;
183 :
184 62251591 : BUILD_BUG_ON(NULLRGBLOCK != NULLAGBLOCK);
185 :
186 62251591 : xfs_rtgroup_lock(NULL, rtg, XFS_RTGLOCK_REFCOUNT);
187 62271708 : cur = xfs_rtrefcountbt_init_cursor(mp, tp, rtg, rtg->rtg_refcountip);
188 62287624 : error = xfs_refcount_find_shared(cur, rtbno, rtlen, fbno, flen,
189 : find_end_of_shared);
190 62265923 : xfs_btree_del_cursor(cur, error);
191 62284786 : xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_REFCOUNT);
192 62275993 : return error;
193 : }
194 :
195 : /*
196 : * Trim the mapping to the next block where there's a change in the
197 : * shared/unshared status. More specifically, this means that we
198 : * find the lowest-numbered extent of shared blocks that coincides with
199 : * the given block mapping. If the shared extent overlaps the start of
200 : * the mapping, trim the mapping to the end of the shared extent. If
201 : * the shared region intersects the mapping, trim the mapping to the
202 : * start of the shared extent. If there are no shared regions that
203 : * overlap, just return the original extent.
204 : */
205 : int
206 127515071 : xfs_reflink_trim_around_shared(
207 : struct xfs_inode *ip,
208 : struct xfs_bmbt_irec *irec,
209 : bool *shared)
210 : {
211 127515071 : struct xfs_mount *mp = ip->i_mount;
212 127515071 : xfs_agblock_t orig_bno;
213 127515071 : xfs_agblock_t fbno;
214 127515071 : xfs_extlen_t flen;
215 127515071 : int error = 0;
216 :
217 : /* Holes, unwritten, and delalloc extents cannot be shared */
218 127515071 : if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) {
219 77414720 : *shared = false;
220 77414720 : return 0;
221 : }
222 :
223 50100714 : trace_xfs_reflink_trim_around_shared(ip, irec);
224 :
225 77627213 : if (XFS_IS_REALTIME_INODE(ip)) {
226 27525937 : struct xfs_rtgroup *rtg;
227 27525937 : xfs_rgnumber_t rgno;
228 :
229 27525937 : orig_bno = xfs_rtb_to_rgbno(mp, irec->br_startblock, &rgno);
230 27525960 : rtg = xfs_rtgroup_get(mp, rgno);
231 27526474 : error = xfs_reflink_find_rtshared(rtg, NULL, orig_bno,
232 27526474 : irec->br_blockcount, &fbno, &flen, true);
233 27526585 : xfs_rtgroup_put(rtg);
234 : } else {
235 22574559 : struct xfs_perag *pag;
236 :
237 22574559 : pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp,
238 : irec->br_startblock));
239 22574957 : orig_bno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock);
240 22574879 : error = xfs_reflink_find_shared(pag, NULL, orig_bno,
241 22574879 : irec->br_blockcount, &fbno, &flen, true);
242 22574969 : xfs_perag_put(pag);
243 : }
244 50101750 : if (error)
245 : return error;
246 :
247 50101591 : *shared = false;
248 50101591 : if (fbno == NULLAGBLOCK) {
249 : /* No shared blocks at all. */
250 : return 0;
251 : }
252 :
253 4786842 : if (fbno == orig_bno) {
254 : /*
255 : * The start of this extent is shared. Truncate the
256 : * mapping at the end of the shared region so that a
257 : * subsequent iteration starts at the start of the
258 : * unshared region.
259 : */
260 4688946 : irec->br_blockcount = flen;
261 4688946 : *shared = true;
262 4688946 : return 0;
263 : }
264 :
265 : /*
266 : * There's a shared extent midway through this extent.
267 : * Truncate the mapping at the start of the shared
268 : * extent so that a subsequent iteration starts at the
269 : * start of the shared region.
270 : */
271 97896 : irec->br_blockcount = fbno - orig_bno;
272 97896 : return 0;
273 : }
274 :
275 : int
276 109713012 : xfs_bmap_trim_cow(
277 : struct xfs_inode *ip,
278 : struct xfs_bmbt_irec *imap,
279 : bool *shared)
280 : {
281 : /* We can't update any real extents in always COW mode. */
282 109713012 : if (xfs_is_always_cow_inode(ip) &&
283 1247093 : !isnullstartblock(imap->br_startblock)) {
284 1031706 : *shared = true;
285 1031706 : return 0;
286 : }
287 :
288 : /* Trim the mapping to the nearest shared extent boundary. */
289 108679666 : return xfs_reflink_trim_around_shared(ip, imap, shared);
290 : }
291 :
/*
 * Convert unwritten CoW fork extents in the given file block range to
 * written (XFS_EXT_NORM) state.  Caller holds the ILOCK.  Returns 0 or a
 * negative errno; -EIO if a delalloc mapping is unexpectedly found where a
 * real staging extent is required.
 */
static int
xfs_reflink_convert_cow_locked(
	struct xfs_inode	*ip,
	xfs_fileoff_t		offset_fsb,
	xfs_filblks_t		count_fsb)
{
	struct xfs_iext_cursor	icur;
	struct xfs_bmbt_irec	got;
	/* dummy_cur/dummy_logflags are throwaway outputs of the bmap call. */
	struct xfs_btree_cur	*dummy_cur = NULL;
	struct xfs_mount	*mp = ip->i_mount;
	int			dummy_logflags;
	int			error = 0;

	/*
	 * We can only remap full rt extents, so make sure that we convert the
	 * entire extent. The caller must ensure that this is either a direct
	 * write that's aligned to the rt extent size, or a buffered write for
	 * which we've dirtied extra pages to make this work properly.
	 */
	if (xfs_inode_needs_cow_around(ip)) {
		xfs_fileoff_t	new_off;

		/* Widen the range downward to the rt extent boundary... */
		new_off = xfs_rtb_rounddown_rtx(mp, offset_fsb);
		count_fsb += offset_fsb - new_off;
		offset_fsb = new_off;

		/* ...and upward to cover a whole number of rt extents. */
		count_fsb = xfs_rtb_roundup_rtx(mp, count_fsb);
	}

	if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
		return 0;

	/*
	 * Walk the CoW fork extents overlapping the range.  Note that
	 * "continue" in this do/while jumps to the xfs_iext_next_extent()
	 * condition, i.e. it advances to the next extent.
	 */
	do {
		if (got.br_startoff >= offset_fsb + count_fsb)
			break;
		/* Already written: nothing to convert. */
		if (got.br_state == XFS_EXT_NORM)
			continue;
		/* Delalloc here would mean missing staging blocks. */
		if (WARN_ON_ONCE(isnullstartblock(got.br_startblock)))
			return -EIO;

		xfs_trim_extent(&got, offset_fsb, count_fsb);
		if (!got.br_blockcount)
			continue;

		got.br_state = XFS_EXT_NORM;
		error = xfs_bmap_add_extent_unwritten_real(NULL, ip,
				XFS_COW_FORK, &icur, &dummy_cur, &got,
				&dummy_logflags);
		if (error)
			return error;
	} while (xfs_iext_next_extent(ip->i_cowfp, &icur, &got));

	return error;
}
346 :
347 : /* Convert all of the unwritten CoW extents in a file's range to real ones. */
348 : int
349 6271223 : xfs_reflink_convert_cow(
350 : struct xfs_inode *ip,
351 : xfs_off_t offset,
352 : xfs_off_t count)
353 : {
354 6271223 : struct xfs_mount *mp = ip->i_mount;
355 6271223 : xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
356 6271223 : xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
357 6271223 : xfs_filblks_t count_fsb = end_fsb - offset_fsb;
358 6271223 : int error;
359 :
360 6271223 : ASSERT(count != 0);
361 :
362 6271223 : xfs_ilock(ip, XFS_ILOCK_EXCL);
363 6271226 : error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
364 6271169 : xfs_iunlock(ip, XFS_ILOCK_EXCL);
365 6271211 : return error;
366 : }
367 :
368 : /*
369 : * Find the extent that maps the given range in the COW fork. Even if the extent
370 : * is not shared we might have a preallocation for it in the COW fork. If so we
371 : * use it that rather than trigger a new allocation.
372 : */
373 : static int
374 93897894 : xfs_find_trim_cow_extent(
375 : struct xfs_inode *ip,
376 : struct xfs_bmbt_irec *imap,
377 : struct xfs_bmbt_irec *cmap,
378 : bool *shared,
379 : bool *found)
380 : {
381 93897894 : xfs_fileoff_t offset_fsb = imap->br_startoff;
382 93897894 : xfs_filblks_t count_fsb = imap->br_blockcount;
383 93897894 : struct xfs_iext_cursor icur;
384 :
385 93897894 : *found = false;
386 :
387 : /*
388 : * If we don't find an overlapping extent, trim the range we need to
389 : * allocate to fit the hole we found.
390 : */
391 93897894 : if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, cmap))
392 75132363 : cmap->br_startoff = offset_fsb + count_fsb;
393 93898790 : if (cmap->br_startoff > offset_fsb) {
394 82428382 : xfs_trim_extent(imap, imap->br_startoff,
395 82428382 : cmap->br_startoff - imap->br_startoff);
396 82427219 : return xfs_bmap_trim_cow(ip, imap, shared);
397 : }
398 :
399 11470408 : *shared = true;
400 11470408 : if (isnullstartblock(cmap->br_startblock)) {
401 11323 : xfs_trim_extent(imap, cmap->br_startoff, cmap->br_blockcount);
402 11323 : return 0;
403 : }
404 :
405 : /* real extent found - no need to allocate */
406 11459085 : xfs_trim_extent(cmap, offset_fsb, count_fsb);
407 11459001 : *found = true;
408 11459001 : return 0;
409 : }
410 :
411 : static int
412 12688953 : xfs_reflink_convert_unwritten(
413 : struct xfs_inode *ip,
414 : struct xfs_bmbt_irec *imap,
415 : struct xfs_bmbt_irec *cmap,
416 : bool convert_now)
417 : {
418 12688953 : xfs_fileoff_t offset_fsb = imap->br_startoff;
419 12688953 : xfs_filblks_t count_fsb = imap->br_blockcount;
420 12688953 : int error;
421 :
422 : /*
423 : * cmap might larger than imap due to cowextsize hint.
424 : */
425 12688953 : xfs_trim_extent(cmap, offset_fsb, count_fsb);
426 :
427 : /*
428 : * COW fork extents are supposed to remain unwritten until we're ready
429 : * to initiate a disk write. For direct I/O we are going to write the
430 : * data and need the conversion, but for buffered writes we're done.
431 : */
432 12688806 : if (!convert_now || cmap->br_state == XFS_EXT_NORM)
433 : return 0;
434 :
435 6066658 : trace_xfs_reflink_convert_cow(ip, cmap);
436 :
437 6066657 : error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
438 6066659 : if (!error)
439 6066659 : cmap->br_state = XFS_EXT_NORM;
440 :
441 : return error;
442 : }
443 :
/*
 * Allocate a real unwritten extent in the COW fork to cover a hole over
 * shared data blocks.  Drops and reacquires the ILOCK (updating
 * *lockmode), so the COW fork must be re-checked after relocking.
 */
static int
xfs_reflink_fill_cow_hole(
	struct xfs_inode	*ip,
	struct xfs_bmbt_irec	*imap,
	struct xfs_bmbt_irec	*cmap,
	bool			*shared,
	uint			*lockmode,
	bool			convert_now)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	xfs_filblks_t		resaligned;
	unsigned int		dblocks = 0, rblocks = 0;
	int			nimaps;
	int			error;
	bool			found;

	/* Round the request up to the cowextsize allocation hint. */
	resaligned = xfs_aligned_fsb_count(imap->br_startoff,
		imap->br_blockcount, xfs_get_cowextsz_hint(ip));
	if (XFS_IS_REALTIME_INODE(ip)) {
		/* rt file data comes from the rt device; reserve both pools. */
		dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
		rblocks = resaligned;
	} else {
		dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
		rblocks = 0;
	}

	/*
	 * Transaction reservation may block, so drop the ILOCK first and
	 * retake it below via xfs_trans_alloc_inode.
	 */
	xfs_iunlock(ip, *lockmode);
	*lockmode = 0;

	error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks,
			rblocks, false, &tp);
	if (error)
		return error;

	*lockmode = XFS_ILOCK_EXCL;

	/* The COW fork may have changed while the lock was dropped. */
	error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
	if (error || !*shared)
		goto out_trans_cancel;

	if (found) {
		/* Someone else allocated it for us; no transaction needed. */
		xfs_trans_cancel(tp);
		goto convert;
	}

	/* Allocate the entire reservation as unwritten blocks. */
	nimaps = 1;
	error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
			XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, cmap,
			&nimaps);
	if (error)
		goto out_trans_cancel;

	/* Mark the inode as having CoW staging blocks for background reaping. */
	xfs_inode_set_cowblocks_tag(ip);
	error = xfs_trans_commit(tp);
	if (error)
		return error;

	/*
	 * Allocation succeeded but the requested range was not even partially
	 * satisfied?  Bail out!
	 */
	if (nimaps == 0)
		return -ENOSPC;

convert:
	return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);

out_trans_cancel:
	xfs_trans_cancel(tp);
	return error;
}
517 :
/*
 * Replace a delalloc reservation in the COW fork with a real unwritten
 * extent.  May loop because each bmapi call can convert only part of the
 * reservation; drops and reacquires the ILOCK each iteration (updating
 * *lockmode).
 */
static int
xfs_reflink_fill_delalloc(
	struct xfs_inode	*ip,
	struct xfs_bmbt_irec	*imap,
	struct xfs_bmbt_irec	*cmap,
	bool			*shared,
	uint			*lockmode,
	bool			convert_now)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	int			nimaps;
	int			error;
	bool			found;

	do {
		/* Drop the ILOCK before blocking on transaction reservation. */
		xfs_iunlock(ip, *lockmode);
		*lockmode = 0;

		error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, 0, 0,
				false, &tp);
		if (error)
			return error;

		*lockmode = XFS_ILOCK_EXCL;

		/* Re-check the COW fork: it may have changed while unlocked. */
		error = xfs_find_trim_cow_extent(ip, imap, cmap, shared,
				&found);
		if (error || !*shared)
			goto out_trans_cancel;

		if (found) {
			/* A real extent appeared; nothing left to fill. */
			xfs_trans_cancel(tp);
			break;
		}

		ASSERT(isnullstartblock(cmap->br_startblock) ||
		       cmap->br_startblock == DELAYSTARTBLOCK);

		/*
		 * Replace delalloc reservation with an unwritten extent.
		 */
		nimaps = 1;
		error = xfs_bmapi_write(tp, ip, cmap->br_startoff,
				cmap->br_blockcount,
				XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0,
				cmap, &nimaps);
		if (error)
			goto out_trans_cancel;

		/* Tag the inode for background CoW block reclamation. */
		xfs_inode_set_cowblocks_tag(ip);
		error = xfs_trans_commit(tp);
		if (error)
			return error;

		/*
		 * Allocation succeeded but the requested range was not even
		 * partially satisfied?  Bail out!
		 */
		if (nimaps == 0)
			return -ENOSPC;
	} while (cmap->br_startoff + cmap->br_blockcount <= imap->br_startoff);

	return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);

out_trans_cancel:
	xfs_trans_cancel(tp);
	return error;
}
587 :
588 : /* Allocate all CoW reservations covering a range of blocks in a file. */
589 : int
590 92668396 : xfs_reflink_allocate_cow(
591 : struct xfs_inode *ip,
592 : struct xfs_bmbt_irec *imap,
593 : struct xfs_bmbt_irec *cmap,
594 : bool *shared,
595 : uint *lockmode,
596 : bool convert_now)
597 : {
598 92668396 : int error;
599 92668396 : bool found;
600 :
601 92668396 : ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
602 92666945 : if (!ip->i_cowfp) {
603 5244 : ASSERT(!xfs_is_reflink_inode(ip));
604 5244 : xfs_ifork_init_cow(ip);
605 : }
606 :
607 92666945 : error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
608 92667981 : if (error || !*shared)
609 79977248 : return error;
610 :
611 : /* CoW fork has a real extent */
612 12690733 : if (found)
613 11458951 : return xfs_reflink_convert_unwritten(ip, imap, cmap,
614 : convert_now);
615 :
616 : /*
617 : * CoW fork does not have an extent and data extent is shared.
618 : * Allocate a real extent in the CoW fork.
619 : */
620 1231782 : if (cmap->br_startoff > imap->br_startoff)
621 1226139 : return xfs_reflink_fill_cow_hole(ip, imap, cmap, shared,
622 : lockmode, convert_now);
623 :
624 : /*
625 : * CoW fork has a delalloc reservation. Replace it with a real extent.
626 : * There may or may not be a data fork mapping.
627 : */
628 5643 : if (isnullstartblock(cmap->br_startblock) ||
629 : cmap->br_startblock == DELAYSTARTBLOCK)
630 5643 : return xfs_reflink_fill_delalloc(ip, imap, cmap, shared,
631 : lockmode, convert_now);
632 :
633 : /* Shouldn't get here. */
634 0 : ASSERT(0);
635 0 : return -EFSCORRUPTED;
636 : }
637 :
638 : /*
639 : * Cancel CoW reservations for some block range of an inode.
640 : *
641 : * If cancel_real is true this function cancels all COW fork extents for the
642 : * inode; if cancel_real is false, real extents are not cleared.
643 : *
644 : * Caller must have already joined the inode to the current transaction. The
645 : * inode will be joined to the transaction returned to the caller.
646 : */
647 : int
648 39327908 : xfs_reflink_cancel_cow_blocks(
649 : struct xfs_inode *ip,
650 : struct xfs_trans **tpp,
651 : xfs_fileoff_t offset_fsb,
652 : xfs_fileoff_t end_fsb,
653 : bool cancel_real)
654 : {
655 39327908 : struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
656 39327908 : struct xfs_mount *mp = ip->i_mount;
657 39327908 : struct xfs_bmbt_irec got, del;
658 39327908 : struct xfs_iext_cursor icur;
659 39327908 : bool isrt = XFS_IS_REALTIME_INODE(ip);
660 39327908 : int error = 0;
661 :
662 : /*
663 : * Shrink the range that we're cancelling if they don't align to the
664 : * realtime extent size, since we can only free full extents.
665 : */
666 39327908 : if (xfs_inode_needs_cow_around(ip)) {
667 670622 : offset_fsb = xfs_rtb_roundup_rtx(mp, offset_fsb);
668 670622 : end_fsb = xfs_rtb_rounddown_rtx(mp, end_fsb);
669 : }
670 :
671 78541228 : if (!xfs_inode_has_cow_data(ip))
672 : return 0;
673 11165002 : if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
674 : return 0;
675 :
676 : /* Walk backwards until we're out of the I/O range... */
677 20064887 : while (got.br_startoff + got.br_blockcount > offset_fsb) {
678 11419903 : del = got;
679 11419903 : xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb);
680 :
681 : /* Extent delete may have bumped ext forward */
682 11408824 : if (!del.br_blockcount) {
683 212842 : xfs_iext_prev(ifp, &icur);
684 212842 : goto next_extent;
685 : }
686 :
687 11195982 : trace_xfs_reflink_cancel_cow(ip, &del);
688 :
689 11197070 : if (isnullstartblock(del.br_startblock)) {
690 2780077 : error = xfs_bmap_del_extent_delay(ip, XFS_COW_FORK,
691 : &icur, &got, &del);
692 2798834 : if (error)
693 : break;
694 8416993 : } else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) {
695 6074458 : ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER);
696 :
697 : /* Free the CoW orphan record. */
698 6074458 : xfs_refcount_free_cow_extent(*tpp, isrt,
699 6074458 : del.br_startblock, del.br_blockcount);
700 :
701 11074682 : error = xfs_free_extent_later(*tpp, del.br_startblock,
702 : del.br_blockcount, NULL,
703 : XFS_AG_RESV_NONE,
704 : isrt ? XFS_FREE_EXTENT_REALTIME : 0);
705 6079084 : if (error)
706 : break;
707 :
708 : /* Roll the transaction */
709 6079084 : error = xfs_defer_finish(tpp);
710 6078842 : if (error)
711 : break;
712 :
713 : /* Remove the mapping from the CoW fork. */
714 6078826 : xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
715 :
716 : /* Remove the quota reservation */
717 6078351 : error = xfs_quota_unreserve_blkres(ip,
718 6078351 : del.br_blockcount);
719 6079444 : if (error)
720 : break;
721 : } else {
722 : /* Didn't do anything, push cursor back. */
723 2342535 : xfs_iext_prev(ifp, &icur);
724 : }
725 11433773 : next_extent:
726 11433773 : if (!xfs_iext_get_extent(ifp, &icur, &got))
727 : break;
728 : }
729 :
730 : /* clear tag if cow fork is emptied */
731 11064832 : if (!ifp->if_bytes)
732 1842227 : xfs_inode_clear_cowblocks_tag(ip);
733 : return error;
734 : }
735 :
736 : /*
737 : * Cancel CoW reservations for some byte range of an inode.
738 : *
739 : * If cancel_real is true this function cancels all COW fork extents for the
740 : * inode; if cancel_real is false, real extents are not cleared.
741 : */
742 : int
743 5747508 : xfs_reflink_cancel_cow_range(
744 : struct xfs_inode *ip,
745 : xfs_off_t offset,
746 : xfs_off_t count,
747 : bool cancel_real)
748 : {
749 5747508 : struct xfs_trans *tp;
750 5747508 : xfs_fileoff_t offset_fsb;
751 5747508 : xfs_fileoff_t end_fsb;
752 5747508 : int error;
753 :
754 5747508 : trace_xfs_reflink_cancel_cow_range(ip, offset, count);
755 5747309 : ASSERT(ip->i_cowfp);
756 :
757 5747309 : offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
758 5747309 : if (count == NULLFILEOFF)
759 : end_fsb = NULLFILEOFF;
760 : else
761 1914915 : end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
762 :
763 : /* Start a rolling transaction to remove the mappings */
764 5747309 : error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
765 : 0, 0, 0, &tp);
766 5746063 : if (error)
767 10 : goto out;
768 :
769 5746053 : xfs_ilock(ip, XFS_ILOCK_EXCL);
770 5746024 : xfs_trans_ijoin(tp, ip, 0);
771 :
772 : /* Scrape out the old CoW reservations */
773 5746804 : error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb,
774 : cancel_real);
775 5747803 : if (error)
776 16 : goto out_cancel;
777 :
778 5747787 : error = xfs_trans_commit(tp);
779 :
780 5747674 : xfs_iunlock(ip, XFS_ILOCK_EXCL);
781 5747674 : return error;
782 :
783 : out_cancel:
784 16 : xfs_trans_cancel(tp);
785 16 : xfs_iunlock(ip, XFS_ILOCK_EXCL);
786 26 : out:
787 26 : trace_xfs_reflink_cancel_cow_range_error(ip, error, _RET_IP_);
788 26 : return error;
789 : }
790 :
791 : #ifdef CONFIG_XFS_QUOTA
792 : /*
793 : * Update quota accounting for a remapping operation. When we're remapping
794 : * something from the CoW fork to the data fork, we must update the quota
795 : * accounting for delayed allocations. For remapping from the data fork to the
796 : * data fork, use regular block accounting.
797 : */
798 : static inline void
799 209967986 : xfs_reflink_update_quota(
800 : struct xfs_trans *tp,
801 : struct xfs_inode *ip,
802 : bool is_cow,
803 : int64_t blocks)
804 : {
805 209967986 : unsigned int qflag;
806 :
807 209967986 : if (XFS_IS_REALTIME_INODE(ip)) {
808 89888088 : qflag = is_cow ? XFS_TRANS_DQ_DELRTBCOUNT :
809 : XFS_TRANS_DQ_RTBCOUNT;
810 : } else {
811 120079898 : qflag = is_cow ? XFS_TRANS_DQ_DELBCOUNT :
812 : XFS_TRANS_DQ_BCOUNT;
813 : }
814 209967986 : xfs_trans_mod_dquot_byino(tp, ip, qflag, blocks);
815 209966657 : }
816 : #else
817 : # define xfs_reflink_update_quota(tp, ip, is_cow, blocks) ((void)0)
818 : #endif
819 :
820 : /*
821 : * Remap part of the CoW fork into the data fork.
822 : *
823 : * We aim to remap the range starting at @offset_fsb and ending at @end_fsb
824 : * into the data fork; this function will remap what it can (at the end of the
825 : * range) and update @end_fsb appropriately. Each remap gets its own
826 : * transaction because we can end up merging and splitting bmbt blocks for
827 : * every remap operation and we'd like to keep the block reservation
828 : * requirements as low as possible.
829 : */
830 : STATIC int
831 15000996 : xfs_reflink_end_cow_extent(
832 : struct xfs_inode *ip,
833 : xfs_fileoff_t *offset_fsb,
834 : xfs_fileoff_t end_fsb)
835 : {
836 15000996 : struct xfs_iext_cursor icur;
837 15000996 : struct xfs_bmbt_irec got, del, data;
838 15000996 : struct xfs_mount *mp = ip->i_mount;
839 15000996 : struct xfs_trans *tp;
840 15000996 : struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
841 15000996 : unsigned int resblks;
842 15000996 : int nmaps;
843 15000996 : bool isrt = XFS_IS_REALTIME_INODE(ip);
844 15000996 : int error;
845 :
846 : /* No COW extents? That's easy! */
847 15000996 : if (ifp->if_bytes == 0) {
848 6291 : *offset_fsb = end_fsb;
849 6291 : return 0;
850 : }
851 :
852 14994705 : resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
853 14994705 : error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
854 : XFS_TRANS_RESERVE, &tp);
855 14994696 : if (error)
856 : return error;
857 :
858 : /*
859 : * Lock the inode. We have to ijoin without automatic unlock because
860 : * the lead transaction is the refcountbt record deletion; the data
861 : * fork update follows as a deferred log item.
862 : */
863 14994696 : xfs_ilock(ip, XFS_ILOCK_EXCL);
864 14994705 : xfs_trans_ijoin(tp, ip, 0);
865 :
866 14994707 : error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
867 : XFS_IEXT_REFLINK_END_COW_CNT);
868 14994706 : if (error == -EFBIG)
869 16 : error = xfs_iext_count_upgrade(tp, ip,
870 : XFS_IEXT_REFLINK_END_COW_CNT);
871 14994706 : if (error)
872 16 : goto out_cancel;
873 :
874 : /*
875 : * In case of racing, overlapping AIO writes no COW extents might be
876 : * left by the time I/O completes for the loser of the race. In that
877 : * case we are done.
878 : */
879 14994690 : if (!xfs_iext_lookup_extent(ip, ifp, *offset_fsb, &icur, &got) ||
880 14968613 : got.br_startoff >= end_fsb) {
881 62445 : *offset_fsb = end_fsb;
882 62445 : goto out_cancel;
883 : }
884 :
885 : /*
886 : * Only remap real extents that contain data. With AIO, speculative
887 : * preallocations can leak into the range we are called upon, and we
888 : * need to skip them. Preserve @got for the eventual CoW fork
889 : * deletion; from now on @del represents the mapping that we're
890 : * actually remapping.
891 : */
892 14989655 : while (!xfs_bmap_is_written_extent(&got)) {
893 57856 : if (!xfs_iext_next_extent(ifp, &icur, &got) ||
894 57780 : got.br_startoff >= end_fsb) {
895 447 : *offset_fsb = end_fsb;
896 447 : goto out_cancel;
897 : }
898 : }
899 14931799 : del = got;
900 :
901 : /* Grab the corresponding mapping in the data fork. */
902 14931799 : nmaps = 1;
903 14931799 : error = xfs_bmapi_read(ip, del.br_startoff, del.br_blockcount, &data,
904 : &nmaps, 0);
905 14931799 : if (error)
906 5 : goto out_cancel;
907 :
908 : /* We can only remap the smaller of the two extent sizes. */
909 14931794 : data.br_blockcount = min(data.br_blockcount, del.br_blockcount);
910 14931794 : del.br_blockcount = data.br_blockcount;
911 :
912 14931794 : trace_xfs_reflink_cow_remap_from(ip, &del);
913 14931794 : trace_xfs_reflink_cow_remap_to(ip, &data);
914 :
915 25832007 : if (xfs_bmap_is_real_extent(&data)) {
916 : /*
917 : * If the extent we're remapping is backed by storage (written
918 : * or not), unmap the extent and drop its refcount.
919 : */
920 10900213 : xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data);
921 10900213 : xfs_refcount_decrease_extent(tp, isrt, &data);
922 10900213 : xfs_reflink_update_quota(tp, ip, false, -data.br_blockcount);
923 4031581 : } else if (data.br_startblock == DELAYSTARTBLOCK) {
924 233908 : int done;
925 :
926 : /*
927 : * If the extent we're remapping is a delalloc reservation,
928 : * we can use the regular bunmapi function to release the
929 : * incore state. Dropping the delalloc reservation takes care
930 : * of the quota reservation for us.
931 : */
932 233908 : error = xfs_bunmapi(NULL, ip, data.br_startoff,
933 : data.br_blockcount, 0, 1, &done);
934 233908 : if (error)
935 0 : goto out_cancel;
936 233908 : ASSERT(done);
937 : }
938 :
939 : /* Free the CoW orphan record. */
940 14931793 : xfs_refcount_free_cow_extent(tp, isrt, del.br_startblock,
941 14931793 : del.br_blockcount);
942 :
943 : /* Map the new blocks into the data fork. */
944 14931794 : xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, &del);
945 :
946 : /* Charge this new data fork mapping to the on-disk quota. */
947 14931794 : xfs_reflink_update_quota(tp, ip, true, del.br_blockcount);
948 :
949 : /* Remove the mapping from the CoW fork. */
950 14931794 : xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
951 :
952 14931794 : error = xfs_trans_commit(tp);
953 14931794 : xfs_iunlock(ip, XFS_ILOCK_EXCL);
954 14931794 : if (error)
955 : return error;
956 :
957 : /* Update the caller about how much progress we made. */
958 14931769 : *offset_fsb = del.br_startoff + del.br_blockcount;
959 14931769 : return 0;
960 :
961 62913 : out_cancel:
962 62913 : xfs_trans_cancel(tp);
963 62913 : xfs_iunlock(ip, XFS_ILOCK_EXCL);
964 62913 : return error;
965 : }
966 :
/*
 * Remap parts of a file's data fork after a successful CoW.
 *
 * @ip:     inode whose CoW staging extents should replace its data fork
 *          mappings
 * @offset: byte offset of the start of the range that finished CoW
 * @count:  byte length of that range
 *
 * Returns 0 on success or a negative errno.  Each extent is remapped in
 * its own transaction by xfs_reflink_end_cow_extent(), which also cycles
 * the ILOCK, so partial progress is possible before an error is returned.
 */
int
xfs_reflink_end_cow(
	struct xfs_inode		*ip,
	xfs_off_t			offset,
	xfs_off_t			count)
{
	struct xfs_mount		*mp = ip->i_mount;
	xfs_fileoff_t			offset_fsb;
	xfs_fileoff_t			end_fsb;
	int				error = 0;

	trace_xfs_reflink_end_cow(ip, offset, count);

	/* Round the start down and the end up to whole filesystem blocks. */
	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
	end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);

	/*
	 * Make sure the end is aligned with a rt extent (if desired), since
	 * the end of the range could be EOF.  The _convert_cow function should
	 * have set us up to swap only full rt extents.
	 */
	if (xfs_inode_needs_cow_around(ip)) {
		offset_fsb = xfs_rtb_rounddown_rtx(mp, offset_fsb);
		end_fsb = xfs_rtb_roundup_rtx(mp, end_fsb);
	}

	/*
	 * Walk forwards until we've remapped the I/O range.  The loop function
	 * repeatedly cycles the ILOCK to allocate one transaction per remapped
	 * extent.
	 *
	 * If we're being called by writeback then the pages will still
	 * have PageWriteback set, which prevents races with reflink remapping
	 * and truncate.  Reflink remapping prevents races with writeback by
	 * taking the iolock and mmaplock before flushing the pages and
	 * remapping, which means there won't be any further writeback or page
	 * cache dirtying until the reflink completes.
	 *
	 * We should never have two threads issuing writeback for the same file
	 * region.  There are also post-eof checks in the writeback
	 * preparation code so that we don't bother writing out pages that are
	 * about to be truncated.
	 *
	 * If we're being called as part of directio write completion, the dio
	 * count is still elevated, which reflink and truncate will wait for.
	 * Reflink remapping takes the iolock and mmaplock and waits for
	 * pending dio to finish, which should prevent any directio until the
	 * remap completes.  Multiple concurrent directio writes to the same
	 * region are handled by end_cow processing only occurring for the
	 * threads which succeed; the outcome of multiple overlapping direct
	 * writes is not well defined anyway.
	 *
	 * It's possible that a buffered write and a direct write could collide
	 * here (the buffered write stumbles in after the dio flushes and
	 * invalidates the page cache and immediately queues writeback), but we
	 * have never supported this 100%.  If either disk write succeeds the
	 * blocks will be remapped.
	 */
	while (end_fsb > offset_fsb && !error)
		error = xfs_reflink_end_cow_extent(ip, &offset_fsb, end_fsb);

	if (error)
		trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
	return error;
}
1035 :
1036 : /*
1037 : * Free all CoW staging blocks that are still referenced by the ondisk refcount
1038 : * metadata. The ondisk metadata does not track which inode created the
1039 : * staging extent, so callers must ensure that there are no cached inodes with
1040 : * live CoW staging extents.
1041 : */
1042 : int
1043 13570 : xfs_reflink_recover_cow(
1044 : struct xfs_mount *mp)
1045 : {
1046 13570 : struct xfs_perag *pag;
1047 13570 : struct xfs_rtgroup *rtg;
1048 13570 : xfs_agnumber_t agno;
1049 13570 : xfs_rgnumber_t rgno;
1050 13570 : int error = 0;
1051 :
1052 13570 : if (!xfs_has_reflink(mp))
1053 : return 0;
1054 :
1055 69079 : for_each_perag(mp, agno, pag) {
1056 56002 : error = xfs_refcount_recover_cow_leftovers(mp, pag);
1057 56002 : if (error) {
1058 30 : xfs_perag_rele(pag);
1059 30 : return error;
1060 : }
1061 : }
1062 :
1063 19402 : for_each_rtgroup(mp, rgno, rtg) {
1064 6325 : error = xfs_refcount_recover_rtcow_leftovers(mp, rtg);
1065 6325 : if (error) {
1066 0 : xfs_rtgroup_rele(rtg);
1067 0 : return error;
1068 : }
1069 : }
1070 :
1071 : return 0;
1072 : }
1073 :
1074 : /*
1075 : * Reflinking (Block) Ranges of Two Files Together
1076 : *
1077 : * First, ensure that the reflink flag is set on both inodes. The flag is an
1078 : * optimization to avoid unnecessary refcount btree lookups in the write path.
1079 : *
1080 : * Now we can iteratively remap the range of extents (and holes) in src to the
1081 : * corresponding ranges in dest. Let drange and srange denote the ranges of
1082 : * logical blocks in dest and src touched by the reflink operation.
1083 : *
1084 : * While the length of drange is greater than zero,
1085 : * - Read src's bmbt at the start of srange ("imap")
1086 : * - If imap doesn't exist, make imap appear to start at the end of srange
1087 : * with zero length.
1088 : * - If imap starts before srange, advance imap to start at srange.
1089 : * - If imap goes beyond srange, truncate imap to end at the end of srange.
1090 : * - Punch (imap start - srange start + imap len) blocks from dest at
1091 : * offset (drange start).
1092 : * - If imap points to a real range of pblks,
1093 : * > Increase the refcount of the imap's pblks
1094 : * > Map imap's pblks into dest at the offset
1095 : * (drange start + imap start - srange start)
1096 : * - Advance drange and srange by (imap start - srange start + imap len)
1097 : *
1098 : * Finally, if the reflink made dest longer, update both the in-core and
1099 : * on-disk file sizes.
1100 : *
1101 : * ASCII Art Demonstration:
1102 : *
1103 : * Let's say we want to reflink this source file:
1104 : *
1105 : * ----SSSSSSS-SSSSS----SSSSSS (src file)
1106 : * <-------------------->
1107 : *
1108 : * into this destination file:
1109 : *
1110 : * --DDDDDDDDDDDDDDDDDDD--DDD (dest file)
1111 : * <-------------------->
1112 : * '-' means a hole, and 'S' and 'D' are written blocks in the src and dest.
1113 : * Observe that the range has different logical offsets in either file.
1114 : *
1115 : * Consider that the first extent in the source file doesn't line up with our
1116 : * reflink range. Unmapping and remapping are separate operations, so we can
1117 : * unmap more blocks from the destination file than we remap.
1118 : *
1119 : * ----SSSSSSS-SSSSS----SSSSSS
1120 : * <------->
1121 : * --DDDDD---------DDDDD--DDD
1122 : * <------->
1123 : *
1124 : * Now remap the source extent into the destination file:
1125 : *
1126 : * ----SSSSSSS-SSSSS----SSSSSS
1127 : * <------->
1128 : * --DDDDD--SSSSSSSDDDDD--DDD
1129 : * <------->
1130 : *
1131 : * Do likewise with the second hole and extent in our range. Holes in the
1132 : * unmap range don't affect our operation.
1133 : *
1134 : * ----SSSSSSS-SSSSS----SSSSSS
1135 : * <---->
1136 : * --DDDDD--SSSSSSS-SSSSS-DDD
1137 : * <---->
1138 : *
1139 : * Finally, unmap and remap part of the third extent. This will increase the
1140 : * size of the destination file.
1141 : *
1142 : * ----SSSSSSS-SSSSS----SSSSSS
1143 : * <----->
1144 : * --DDDDD--SSSSSSS-SSSSS----SSS
1145 : * <----->
1146 : *
1147 : * Once we update the destination file's i_size, we're done.
1148 : */
1149 :
/*
 * Ensure the reflink bit is set in both inodes.
 *
 * @src:  source inode of the remap operation
 * @dest: destination inode of the remap operation (may equal @src)
 *
 * Returns 0 on success or a negative errno.  An inode that is joined to the
 * transaction with XFS_ILOCK_EXCL is unlocked by the commit; an inode that
 * already had the reflink flag is unlocked explicitly here instead.
 */
STATIC int
xfs_reflink_set_inode_flag(
	struct xfs_inode	*src,
	struct xfs_inode	*dest)
{
	struct xfs_mount	*mp = src->i_mount;
	int			error;
	struct xfs_trans	*tp;

	/* Fast path: both inodes already have the flag, nothing to log. */
	if (xfs_is_reflink_inode(src) && xfs_is_reflink_inode(dest))
		return 0;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
	if (error)
		goto out_error;

	/* Lock both files against IO */
	if (src->i_ino == dest->i_ino)
		xfs_ilock(src, XFS_ILOCK_EXCL);
	else
		xfs_lock_two_inodes(src, XFS_ILOCK_EXCL, dest, XFS_ILOCK_EXCL);

	/* Flag @src if needed; otherwise drop its lock right away. */
	if (!xfs_is_reflink_inode(src)) {
		trace_xfs_reflink_set_inode_flag(src);
		xfs_trans_ijoin(tp, src, XFS_ILOCK_EXCL);
		src->i_diflags2 |= XFS_DIFLAG2_REFLINK;
		xfs_trans_log_inode(tp, src, XFS_ILOG_CORE);
		/* A reflink inode needs an (empty) incore CoW fork. */
		xfs_ifork_init_cow(src);
	} else
		xfs_iunlock(src, XFS_ILOCK_EXCL);

	/* Same-inode remap: @src handling above covered everything. */
	if (src->i_ino == dest->i_ino)
		goto commit_flags;

	/* Flag @dest if needed; otherwise drop its lock right away. */
	if (!xfs_is_reflink_inode(dest)) {
		trace_xfs_reflink_set_inode_flag(dest);
		xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);
		dest->i_diflags2 |= XFS_DIFLAG2_REFLINK;
		xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
		/* A reflink inode needs an (empty) incore CoW fork. */
		xfs_ifork_init_cow(dest);
	} else
		xfs_iunlock(dest, XFS_ILOCK_EXCL);

commit_flags:
	error = xfs_trans_commit(tp);
	if (error)
		goto out_error;
	return error;

out_error:
	trace_xfs_reflink_set_inode_flag_error(dest, error, _RET_IP_);
	return error;
}
1206 :
1207 : /*
1208 : * Update destination inode size & cowextsize hint, if necessary.
1209 : */
1210 : int
1211 241977421 : xfs_reflink_update_dest(
1212 : struct xfs_inode *dest,
1213 : xfs_off_t newlen,
1214 : xfs_extlen_t cowextsize,
1215 : unsigned int remap_flags)
1216 : {
1217 241977421 : struct xfs_mount *mp = dest->i_mount;
1218 241977421 : struct xfs_trans *tp;
1219 241977421 : int error;
1220 :
1221 241977421 : if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
1222 : return 0;
1223 :
1224 3793747 : error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
1225 3785342 : if (error)
1226 0 : goto out_error;
1227 :
1228 3785342 : xfs_ilock(dest, XFS_ILOCK_EXCL);
1229 3785284 : xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);
1230 :
1231 3785257 : if (newlen > i_size_read(VFS_I(dest))) {
1232 3785266 : trace_xfs_reflink_update_inode_size(dest, newlen);
1233 3785200 : i_size_write(VFS_I(dest), newlen);
1234 3785200 : dest->i_disk_size = newlen;
1235 : }
1236 :
1237 3785191 : if (cowextsize) {
1238 27 : dest->i_cowextsize = cowextsize;
1239 27 : dest->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE;
1240 : }
1241 :
1242 3785191 : xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
1243 :
1244 3785312 : error = xfs_trans_commit(tp);
1245 3785347 : if (error)
1246 0 : goto out_error;
1247 : return error;
1248 :
1249 0 : out_error:
1250 0 : trace_xfs_reflink_update_inode_size_error(dest, error, _RET_IP_);
1251 0 : return error;
1252 : }
1253 :
1254 : /*
1255 : * Do we have enough reserve in this AG to handle a reflink? The refcount
1256 : * btree already reserved all the space it needs, but the rmap btree can grow
1257 : * infinitely, so we won't allow more reflinks when the AG is down to the
1258 : * btree reserves.
1259 : */
1260 : static int
1261 136258997 : xfs_reflink_ag_has_free_space(
1262 : struct xfs_mount *mp,
1263 : struct xfs_inode *ip,
1264 : xfs_fsblock_t fsb)
1265 : {
1266 136258997 : struct xfs_perag *pag;
1267 136258997 : xfs_agnumber_t agno;
1268 136258997 : int error = 0;
1269 :
1270 136258997 : if (!xfs_has_rmapbt(mp))
1271 : return 0;
1272 136258997 : if (XFS_IS_REALTIME_INODE(ip)) {
1273 60236071 : struct xfs_rtgroup *rtg;
1274 60236071 : xfs_rgnumber_t rgno;
1275 :
1276 60236071 : rgno = xfs_rtb_to_rgno(mp, fsb);
1277 60236038 : rtg = xfs_rtgroup_get(mp, rgno);
1278 120472135 : if (xfs_imeta_resv_critical(rtg->rtg_rmapip) ||
1279 60236012 : xfs_imeta_resv_critical(rtg->rtg_refcountip))
1280 : error = -ENOSPC;
1281 60236037 : xfs_rtgroup_put(rtg);
1282 60236037 : return error;
1283 : }
1284 :
1285 76022926 : agno = XFS_FSB_TO_AGNO(mp, fsb);
1286 76022926 : pag = xfs_perag_get(mp, agno);
1287 152044581 : if (xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) ||
1288 76021660 : xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA))
1289 : error = -ENOSPC;
1290 76022853 : xfs_perag_put(pag);
1291 76022853 : return error;
1292 : }
1293 :
/*
 * Remap the given extent into the file.  The dmap blockcount will be set to
 * the number of blocks that were actually remapped.
 *
 * @ip:        destination inode
 * @dmap:      mapping to install; br_startoff is the destination file offset
 * @new_isize: maximum file size the caller allows this remap to expose
 *
 * Returns 0 on success (including the "nothing to do" cases that cancel the
 * transaction) or a negative errno.
 */
STATIC int
xfs_reflink_remap_extent(
	struct xfs_inode	*ip,
	struct xfs_bmbt_irec	*dmap,
	xfs_off_t		new_isize)
{
	struct xfs_bmbt_irec	smap;
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	xfs_off_t		newlen;
	int64_t			qdelta = 0;
	unsigned int		dblocks, rblocks, resblks;
	bool			quota_reserved = true;
	bool			smap_real;
	bool			dmap_written = xfs_bmap_is_written_extent(dmap);
	bool			isrt = XFS_IS_REALTIME_INODE(ip);
	int			iext_delta = 0;
	int			nimaps;
	int			error;

	/*
	 * Start a rolling transaction to switch the mappings.
	 *
	 * Adding a written extent to the extent map can cause a bmbt split,
	 * and removing a mapped extent from the extent can cause a bmbt split.
	 * The two operations cannot both cause a split since they operate on
	 * the same index in the bmap btree, so we only need a reservation for
	 * one bmbt split if either thing is happening.  However, we haven't
	 * locked the inode yet, so we reserve assuming this is the case.
	 *
	 * The first allocation call tries to reserve enough space to handle
	 * mapping dmap into a sparse part of the file plus the bmbt split.  We
	 * haven't locked the inode or read the existing mapping yet, so we do
	 * not know for sure that we need the space.  This should succeed most
	 * of the time.
	 *
	 * If the first attempt fails, try again but reserving only enough
	 * space to handle a bmbt split.  This is the hard minimum requirement,
	 * and we revisit quota reservations later when we know more about what
	 * we're remapping.
	 */
	resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
	if (XFS_IS_REALTIME_INODE(ip)) {
		dblocks = resblks;
		rblocks = dmap->br_blockcount;
	} else {
		dblocks = resblks + dmap->br_blockcount;
		rblocks = 0;
	}
	error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
			dblocks, rblocks, false, &tp);
	if (error == -EDQUOT || error == -ENOSPC) {
		/* Retry with the minimal reservation; see comment above. */
		quota_reserved = false;
		error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
				resblks, 0, false, &tp);
	}
	if (error)
		goto out;

	/*
	 * Read what's currently mapped in the destination file into smap.
	 * If smap isn't a hole, we will have to remove it before we can add
	 * dmap to the destination file.
	 */
	nimaps = 1;
	error = xfs_bmapi_read(ip, dmap->br_startoff, dmap->br_blockcount,
			&smap, &nimaps, 0);
	if (error)
		goto out_cancel;
	ASSERT(nimaps == 1 && smap.br_startoff == dmap->br_startoff);
	smap_real = xfs_bmap_is_real_extent(&smap);

	/*
	 * We can only remap as many blocks as the smaller of the two extent
	 * maps, because we can only remap one extent at a time.
	 */
	dmap->br_blockcount = min(dmap->br_blockcount, smap.br_blockcount);
	ASSERT(dmap->br_blockcount == smap.br_blockcount);

	trace_xfs_reflink_remap_extent_dest(ip, &smap);

	/*
	 * Two extents mapped to the same physical block must not have
	 * different states; that's filesystem corruption.  Move on to the next
	 * extent if they're both holes or both the same physical extent.
	 */
	if (dmap->br_startblock == smap.br_startblock) {
		if (dmap->br_state != smap.br_state) {
			xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
			error = -EFSCORRUPTED;
		}
		goto out_cancel;
	}

	/* If both extents are unwritten, leave them alone. */
	if (dmap->br_state == XFS_EXT_UNWRITTEN &&
	    smap.br_state == XFS_EXT_UNWRITTEN)
		goto out_cancel;

	/* No reflinking if the AG of the dest mapping is low on space. */
	if (dmap_written) {
		error = xfs_reflink_ag_has_free_space(mp, ip,
				dmap->br_startblock);
		if (error)
			goto out_cancel;
	}

	/*
	 * Increase quota reservation if we think the quota block counter for
	 * this file could increase.
	 *
	 * If we are mapping a written extent into the file, we need to have
	 * enough quota block count reservation to handle the blocks in that
	 * extent.  We log only the delta to the quota block counts, so if the
	 * extent we're unmapping also has blocks allocated to it, we don't
	 * need a quota reservation for the extent itself.
	 *
	 * Note that if we're replacing a delalloc reservation with a written
	 * extent, we have to take the full quota reservation because removing
	 * the delalloc reservation gives the block count back to the quota
	 * count.  This is suboptimal, but the VFS flushed the dest range
	 * before we started.  That should have removed all the delalloc
	 * reservations, but we code defensively.
	 *
	 * xfs_trans_alloc_inode above already tried to grab an even larger
	 * quota reservation, and kicked off a blockgc scan if it couldn't.
	 * If we can't get a potentially smaller quota reservation now, we're
	 * done.
	 */
	if (!quota_reserved && !smap_real && dmap_written) {
		if (XFS_IS_REALTIME_INODE(ip)) {
			dblocks = 0;
			rblocks = dmap->br_blockcount;
		} else {
			dblocks = dmap->br_blockcount;
			rblocks = 0;
		}
		error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, rblocks,
				false);
		if (error)
			goto out_cancel;
	}

	/* Count how many data fork mappings this remap may add. */
	if (smap_real)
		++iext_delta;

	if (dmap_written)
		++iext_delta;

	error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, iext_delta);
	if (error == -EFBIG)
		error = xfs_iext_count_upgrade(tp, ip, iext_delta);
	if (error)
		goto out_cancel;

	if (smap_real) {
		/*
		 * If the extent we're unmapping is backed by storage (written
		 * or not), unmap the extent and drop its refcount.
		 */
		xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &smap);
		xfs_refcount_decrease_extent(tp, isrt, &smap);
		qdelta -= smap.br_blockcount;
	} else if (smap.br_startblock == DELAYSTARTBLOCK) {
		int		done;

		/*
		 * If the extent we're unmapping is a delalloc reservation,
		 * we can use the regular bunmapi function to release the
		 * incore state.  Dropping the delalloc reservation takes care
		 * of the quota reservation for us.
		 */
		error = xfs_bunmapi(NULL, ip, smap.br_startoff,
				smap.br_blockcount, 0, 1, &done);
		if (error)
			goto out_cancel;
		ASSERT(done);
	}

	/*
	 * If the extent we're sharing is backed by written storage, increase
	 * its refcount and map it into the file.
	 */
	if (dmap_written) {
		xfs_refcount_increase_extent(tp, isrt, dmap);
		xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, dmap);
		qdelta += dmap->br_blockcount;
	}

	/* Log the net change in the quota block count. */
	xfs_reflink_update_quota(tp, ip, false, qdelta);

	/* Update dest isize if needed. */
	newlen = XFS_FSB_TO_B(mp, dmap->br_startoff + dmap->br_blockcount);
	newlen = min_t(xfs_off_t, newlen, new_isize);
	if (newlen > i_size_read(VFS_I(ip))) {
		trace_xfs_reflink_update_inode_size(ip, newlen);
		i_size_write(VFS_I(ip), newlen);
		ip->i_disk_size = newlen;
		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
	}

	/* Commit everything and unlock. */
	error = xfs_trans_commit(tp);
	goto out_unlock;

out_cancel:
	xfs_trans_cancel(tp);
out_unlock:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
	if (error)
		trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
	return error;
}
1512 :
/*
 * Remap a range of one file to the other.
 *
 * @src:       source inode
 * @pos_in:    byte offset of the range in @src
 * @dest:      destination inode
 * @pos_out:   byte offset of the range in @dest
 * @remap_len: byte length of the range to remap
 * @remapped:  out; set to the number of bytes actually remapped, which may be
 *             less than @remap_len when an error or signal stops the loop
 *
 * Returns 0 on success or a negative errno; partial progress is reported
 * through @remapped either way.
 */
int
xfs_reflink_remap_blocks(
	struct xfs_inode	*src,
	loff_t			pos_in,
	struct xfs_inode	*dest,
	loff_t			pos_out,
	loff_t			remap_len,
	loff_t			*remapped)
{
	struct xfs_bmbt_irec	imap;
	struct xfs_mount	*mp = src->i_mount;
	xfs_fileoff_t		srcoff = XFS_B_TO_FSBT(mp, pos_in);
	xfs_fileoff_t		destoff = XFS_B_TO_FSBT(mp, pos_out);
	xfs_filblks_t		len;
	xfs_filblks_t		remapped_len = 0;
	xfs_off_t		new_isize = pos_out + remap_len;
	int			nimaps;
	int			error = 0;

	len = min_t(xfs_filblks_t, XFS_B_TO_FSB(mp, remap_len),
			XFS_MAX_FILEOFF);

	/*
	 * Make sure the end is aligned with a rt extent (if desired), since
	 * the end of the range could be EOF.
	 */
	if (xfs_inode_has_bigrtextents(dest))
		len = xfs_rtb_roundup_rtx(mp, len);

	trace_xfs_reflink_remap_blocks(src, srcoff, len, dest, destoff);

	while (len > 0) {
		unsigned int	lock_mode;

		/* Read extent from the source file */
		nimaps = 1;
		lock_mode = xfs_ilock_data_map_shared(src);
		error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0);
		xfs_iunlock(src, lock_mode);
		if (error)
			break;
		/*
		 * The caller supposedly flushed all dirty pages in the source
		 * file range, which means that writeback should have allocated
		 * or deleted all delalloc reservations in that range.  If we
		 * find one, that's a good sign that something is seriously
		 * wrong here.
		 */
		ASSERT(nimaps == 1 && imap.br_startoff == srcoff);
		if (imap.br_startblock == DELAYSTARTBLOCK) {
			ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
			xfs_bmap_mark_sick(src, XFS_DATA_FORK);
			error = -EFSCORRUPTED;
			break;
		}

		trace_xfs_reflink_remap_extent_src(src, &imap);

		/* Remap into the destination file at the given offset. */
		imap.br_startoff = destoff;
		error = xfs_reflink_remap_extent(dest, &imap, new_isize);
		if (error)
			break;

		/* Stay responsive to fatal signals on long remaps. */
		if (fatal_signal_pending(current)) {
			error = -EINTR;
			break;
		}

		/* Advance drange/srange */
		srcoff += imap.br_blockcount;
		destoff += imap.br_blockcount;
		len -= imap.br_blockcount;
		remapped_len += imap.br_blockcount;
	}

	if (error)
		trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_);
	/* Report progress in bytes, clamped to the requested length. */
	*remapped = min_t(loff_t, remap_len,
			XFS_FSB_TO_B(src->i_mount, remapped_len));
	return error;
}
1596 :
1597 : /*
1598 : * If we're reflinking to a point past the destination file's EOF, we must
1599 : * zero any speculative post-EOF preallocations that sit between the old EOF
1600 : * and the destination file offset.
1601 : */
1602 : static int
1603 243386967 : xfs_reflink_zero_posteof(
1604 : struct xfs_inode *ip,
1605 : loff_t pos)
1606 : {
1607 243386967 : loff_t isize = i_size_read(VFS_I(ip));
1608 :
1609 243386967 : if (pos <= isize)
1610 : return 0;
1611 :
1612 4689477 : trace_xfs_zero_eof(ip, isize, pos - isize);
1613 4689421 : return xfs_zero_range(ip, isize, pos - isize, NULL);
1614 : }
1615 :
#ifdef CONFIG_XFS_RT
/* Adjust the length of the remap operation to end on a rt extent boundary. */
STATIC int
xfs_reflink_remap_adjust_rtlen(
	struct xfs_inode	*src,
	loff_t			pos_in,
	struct xfs_inode	*dest,
	loff_t			pos_out,
	loff_t			*len,
	unsigned int		remap_flags)
{
	struct xfs_mount	*mp = src->i_mount;
	uint32_t		leftover;

	/* How many bytes past the last full rt extent does the length run? */
	div_u64_rem(*len, XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize), &leftover);

	/*
	 * We previously checked the rtextent alignment of both offsets, so we
	 * now have to check the alignment of the length.  The VFS remap prep
	 * function can change the length on us, so we can only make length
	 * adjustments after that.  A length that already ends on an rt extent
	 * boundary is trivially fine.
	 */
	if (leftover == 0)
		return 0;

	/*
	 * The length is unaligned but ends at the source file's EOF; the VFS
	 * ensured the dest range also ends at EOF, and the actual remap
	 * function rounds the byte length up to a full rt extent, so this
	 * case is fine too.
	 */
	if (pos_in + *len == i_size_read(VFS_I(src)))
		return 0;

	/*
	 * Otherwise the only remedy is shortening the request down to an rt
	 * extent boundary, which the caller must explicitly permit.
	 */
	if (!(remap_flags & REMAP_FILE_CAN_SHORTEN))
		return -EINVAL;

	/* Trim the unaligned tail so the range ends on an rt extent. */
	*len -= leftover;
	trace_xfs_reflink_remap_adjust_rtlen(src, pos_in, *len, dest, pos_out);
	return 0;
}
#else
# define xfs_reflink_remap_adjust_rtlen(...)	(0)
#endif /* CONFIG_XFS_RT */
1663 :
/*
 * Check the alignment of a remap request when the allocation unit size isn't a
 * power of two.  The VFS helpers use (fast) bitmask-based alignment checks,
 * but here we have to use slow long division.
 *
 * @ip_in/@pos_in:   source inode and byte offset
 * @ip_out/@pos_out: destination inode and byte offset
 * @req_len:         in/out byte length; may be shortened when the caller
 *                   passes REMAP_FILE_CAN_SHORTEN
 * @remap_flags:     REMAP_FILE_* flags from the VFS
 *
 * Returns 0 if the request is acceptable (possibly after shortening
 * *req_len), -EINVAL for unfixable misalignment, or -EBADE for a dedupe
 * request whose tail cannot match.
 */
static int
xfs_reflink_remap_check_rtalign(
	struct xfs_inode	*ip_in,
	loff_t			pos_in,
	struct xfs_inode	*ip_out,
	loff_t			pos_out,
	loff_t			*req_len,
	unsigned int		remap_flags)
{
	struct xfs_mount	*mp = ip_in->i_mount;
	uint32_t		rextbytes;
	loff_t			in_size, out_size;
	loff_t			new_length, length = *req_len;
	loff_t			blen;

	rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
	in_size = i_size_read(VFS_I(ip_in));
	out_size = i_size_read(VFS_I(ip_out));

	/* The start of both ranges must be aligned to a rt extent. */
	if (!isaligned_64(pos_in, rextbytes) ||
	    !isaligned_64(pos_out, rextbytes))
		return -EINVAL;

	/*
	 * Length 0 means "to EOF" for the checks below; note that *req_len is
	 * deliberately left at 0 in that case.
	 */
	if (length == 0)
		length = in_size - pos_in;

	/*
	 * If the user wanted us to exchange up to the infile's EOF, round up
	 * to the next block boundary for this check.
	 *
	 * Otherwise, reject the range length if it's not extent aligned.  We
	 * already confirmed the starting offsets' extent alignment.
	 */
	if (pos_in + length == in_size)
		blen = roundup_64(in_size, rextbytes) - pos_in;
	else
		blen = rounddown_64(length, rextbytes);

	/* Don't allow overlapped remappings within the same file. */
	if (ip_in == ip_out &&
	    pos_out + blen > pos_in &&
	    pos_in + blen > pos_out)
		return -EINVAL;

	/*
	 * Ensure that we don't exchange a partial EOF extent into the middle
	 * of another file.
	 */
	if (isaligned_64(length, rextbytes))
		return 0;

	/* Only round the length down when the tail lands inside dest's EOF. */
	new_length = length;
	if (pos_out + length < out_size)
		new_length = rounddown_64(new_length, rextbytes);

	if (new_length == length)
		return 0;

	/*
	 * Return the shortened request if the caller permits it.  If the
	 * request was shortened to zero rt extents, we know that the original
	 * arguments weren't valid in the first place.
	 */
	if ((remap_flags & REMAP_FILE_CAN_SHORTEN) && new_length > 0) {
		*req_len = new_length;
		return 0;
	}

	/* Dedupe reports a content mismatch; clone reports bad arguments. */
	return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
}
1740 :
1741 : /*
1742 : * Prepare two files for range cloning. Upon a successful return both inodes
1743 : * will have the iolock and mmaplock held, the page cache of the out file will
1744 : * be truncated, and any leases on the out file will have been broken. This
1745 : * function borrows heavily from xfs_file_aio_write_checks.
1746 : *
1747 : * The VFS allows partial EOF blocks to "match" for dedupe even though it hasn't
1748 : * checked that the bytes beyond EOF physically match. Hence we cannot use the
1749 : * EOF block in the source dedupe range because it's not a complete block match,
 * hence can introduce a corruption into the file that has its block replaced.
1751 : *
1752 : * In similar fashion, the VFS file cloning also allows partial EOF blocks to be
1753 : * "block aligned" for the purposes of cloning entire files. However, if the
1754 : * source file range includes the EOF block and it lands within the existing EOF
1755 : * of the destination file, then we can expose stale data from beyond the source
1756 : * file EOF in the destination file.
1757 : *
 * XFS doesn't support partial block sharing, so in both cases we have to check
 * these cases ourselves. For dedupe, we can simply round the length to dedupe
1760 : * down to the previous whole block and ignore the partial EOF block. While this
 * means we can't dedupe the last block of a file, this is an acceptable
 * tradeoff for simplicity of implementation.
1763 : *
1764 : * For cloning, we want to share the partial EOF block if it is also the new EOF
1765 : * block of the destination file. If the partial EOF block lies inside the
1766 : * existing destination EOF, then we have to abort the clone to avoid exposing
1767 : * stale data in the destination file. Hence we reject these clone attempts with
1768 : * -EINVAL in this case.
1769 : */
int
xfs_reflink_remap_prep(
	struct file		*file_in,
	loff_t			pos_in,
	struct file		*file_out,
	loff_t			pos_out,
	loff_t			*len,
	unsigned int		remap_flags)
{
	struct inode		*inode_in = file_inode(file_in);
	struct xfs_inode	*src = XFS_I(inode_in);
	struct inode		*inode_out = file_inode(file_out);
	struct xfs_inode	*dest = XFS_I(inode_out);
	const struct iomap_ops	*dax_read_ops = NULL;
	unsigned int		alloc_unit = xfs_inode_alloc_unitsize(dest);
	int			ret;

	/* Lock both files against IO */
	ret = xfs_ilock2_io_mmap(src, dest);
	if (ret)
		return ret;

	/* Check file eligibility and prepare for block sharing. */
	ret = -EINVAL;
	/* Can't reflink between data and rt volumes */
	if (XFS_IS_REALTIME_INODE(src) != XFS_IS_REALTIME_INODE(dest))
		goto out_unlock;

	/* Don't share DAX file data with non-DAX file. */
	if (IS_DAX(inode_in) != IS_DAX(inode_out))
		goto out_unlock;

	/*
	 * Check non-power of two alignment issues, if necessary.  This only
	 * applies to realtime files whose allocation unit (the rt extent
	 * size) is not a power of two, since those cannot be handled by the
	 * generic VFS alignment checks below.
	 */
	if (XFS_IS_REALTIME_INODE(dest) && !is_power_of_2(alloc_unit)) {
		ret = xfs_reflink_remap_check_rtalign(src, pos_in, dest,
				pos_out, len, remap_flags);
		if (ret)
			goto out_unlock;

		/* Do the VFS checks with the regular block alignment. */
		alloc_unit = src->i_mount->m_sb.sb_blocksize;
	}

	/* Pass the DAX-aware read ops to the VFS prep helper for DAX files. */
	if (IS_DAX(inode_in))
		dax_read_ops = &xfs_read_iomap_ops;

	ret = __generic_remap_file_range_prep(file_in, pos_in, file_out,
			pos_out, len, remap_flags, dax_read_ops, alloc_unit);
	/* *len == 0 means the VFS shortened the request to nothing. */
	if (ret || *len == 0)
		goto out_unlock;

	/* Make sure the end is aligned with a rt extent. */
	if (xfs_inode_has_bigrtextents(src)) {
		ret = xfs_reflink_remap_adjust_rtlen(src, pos_in, dest,
				pos_out, len, remap_flags);
		if (ret || *len == 0)
			goto out_unlock;
	}

	/* Attach dquots to dest inode before changing block map */
	ret = xfs_qm_dqattach(dest);
	if (ret)
		goto out_unlock;

	/*
	 * Zero existing post-eof speculative preallocations in the destination
	 * file.
	 */
	ret = xfs_reflink_zero_posteof(dest, pos_out);
	if (ret)
		goto out_unlock;

	/* Set flags and remap blocks. */
	ret = xfs_reflink_set_inode_flag(src, dest);
	if (ret)
		goto out_unlock;

	/*
	 * Now that we've marked both inodes for reflink, make sure that all
	 * possible rt extents in both files' ranges are either wholly written,
	 * wholly unwritten, or holes.  The bmap code requires that we align
	 * all unmap and remap requests to a rt extent boundary.  We've already
	 * flushed the page cache and finished directio for the range that's
	 * being remapped, so we can convert the extents directly.
	 */
	if (xfs_inode_has_bigrtextents(src)) {
		ret = xfs_rtfile_convert_unwritten(src, pos_in, *len);
		if (ret)
			goto out_unlock;
	}
	if (xfs_inode_has_bigrtextents(dest)) {
		ret = xfs_rtfile_convert_unwritten(dest, pos_out, *len);
		if (ret)
			goto out_unlock;
	}

	/*
	 * If pos_out > EOF, we may have dirtied blocks between EOF and
	 * pos_out. In that case, we need to extend the flush and unmap to cover
	 * from EOF to the end of the copy length.
	 */
	if (pos_out > XFS_ISIZE(dest)) {
		loff_t	flen = *len + (pos_out - XFS_ISIZE(dest));
		ret = xfs_flush_unmap_range(dest, XFS_ISIZE(dest), flen);
	} else {
		ret = xfs_flush_unmap_range(dest, pos_out, *len);
	}
	if (ret)
		goto out_unlock;

	return 0;
out_unlock:
	/* Drop the IO and mmap locks taken at the top of this function. */
	xfs_iunlock2_io_mmap(src, dest);
	return ret;
}
1885 :
/*
 * Does this inode need the reflink flag?
 *
 * Walk every data fork mapping and ask the (rt)refcount code whether any of
 * its blocks are still shared.  On success, *has_shared is true if at least
 * one shared block was found.  Mappings with no real startblock and mappings
 * that are not in the written (XFS_EXT_NORM) state are skipped.
 */
int
xfs_reflink_inode_has_shared_extents(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip,
	bool			*has_shared)
{
	struct xfs_bmbt_irec	got;
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_ifork	*ifp;
	struct xfs_iext_cursor	icur;
	bool			found;
	int			error;

	/* Make sure the in-core extent list is populated before walking it. */
	ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
	error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
	if (error)
		return error;

	*has_shared = false;
	found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got);
	while (found) {
		xfs_agblock_t	rbno;
		xfs_extlen_t	rlen;

		/* Skip mappings without a real written block. */
		if (isnullstartblock(got.br_startblock) ||
		    got.br_state != XFS_EXT_NORM)
			goto next;

		if (XFS_IS_REALTIME_INODE(ip)) {
			/* Realtime file: query the rtgroup's refcount data. */
			struct xfs_rtgroup	*rtg;
			xfs_rgnumber_t		rgno;
			xfs_rgblock_t		rgbno;

			rgbno = xfs_rtb_to_rgbno(mp, got.br_startblock, &rgno);
			rtg = xfs_rtgroup_get(mp, rgno);
			error = xfs_reflink_find_rtshared(rtg, tp, rgbno,
					got.br_blockcount, &rbno, &rlen,
					false);
			xfs_rtgroup_put(rtg);
		} else {
			/* Data device: query the AG's refcount btree. */
			struct xfs_perag	*pag;
			xfs_agblock_t		agbno;

			pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp,
						got.br_startblock));
			agbno = XFS_FSB_TO_AGBNO(mp, got.br_startblock);
			error = xfs_reflink_find_shared(pag, tp, agbno,
					got.br_blockcount, &rbno, &rlen,
					false);
			xfs_perag_put(pag);
		}
		if (error)
			return error;

		/* Is there still a shared block here? */
		if (rbno != NULLAGBLOCK) {
			*has_shared = true;
			return 0;
		}
next:
		found = xfs_iext_next_extent(ifp, &icur, &got);
	}

	return 0;
}
1952 :
1953 : /*
1954 : * Clear the inode reflink flag if there are no shared extents.
1955 : *
1956 : * The caller is responsible for joining the inode to the transaction passed in.
1957 : * The inode will be joined to the transaction that is returned to the caller.
1958 : */
1959 : int
1960 91278 : xfs_reflink_clear_inode_flag(
1961 : struct xfs_inode *ip,
1962 : struct xfs_trans **tpp)
1963 : {
1964 91278 : bool needs_flag;
1965 91278 : int error = 0;
1966 :
1967 91278 : ASSERT(xfs_is_reflink_inode(ip));
1968 :
1969 91278 : error = xfs_reflink_inode_has_shared_extents(*tpp, ip, &needs_flag);
1970 91278 : if (error || needs_flag)
1971 82401 : return error;
1972 :
1973 : /*
1974 : * We didn't find any shared blocks so turn off the reflink flag.
1975 : * First, get rid of any leftover CoW mappings.
1976 : */
1977 8877 : error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, XFS_MAX_FILEOFF,
1978 : true);
1979 8877 : if (error)
1980 : return error;
1981 :
1982 : /* Clear the inode flag. */
1983 8877 : trace_xfs_reflink_unset_inode_flag(ip);
1984 8877 : ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
1985 8877 : xfs_inode_clear_cowblocks_tag(ip);
1986 8877 : xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
1987 :
1988 8877 : return error;
1989 : }
1990 :
1991 : /*
1992 : * Clear the inode reflink flag if there are no shared extents and the size
1993 : * hasn't changed.
1994 : */
1995 : STATIC int
1996 180 : xfs_reflink_try_clear_inode_flag(
1997 : struct xfs_inode *ip)
1998 : {
1999 180 : struct xfs_mount *mp = ip->i_mount;
2000 180 : struct xfs_trans *tp;
2001 180 : int error = 0;
2002 :
2003 : /* Start a rolling transaction to remove the mappings */
2004 180 : error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
2005 180 : if (error)
2006 : return error;
2007 :
2008 180 : xfs_ilock(ip, XFS_ILOCK_EXCL);
2009 180 : xfs_trans_ijoin(tp, ip, 0);
2010 :
2011 180 : error = xfs_reflink_clear_inode_flag(ip, &tp);
2012 180 : if (error)
2013 0 : goto cancel;
2014 :
2015 180 : error = xfs_trans_commit(tp);
2016 180 : if (error)
2017 0 : goto out;
2018 :
2019 180 : xfs_iunlock(ip, XFS_ILOCK_EXCL);
2020 180 : return 0;
2021 : cancel:
2022 0 : xfs_trans_cancel(tp);
2023 0 : out:
2024 0 : xfs_iunlock(ip, XFS_ILOCK_EXCL);
2025 0 : return error;
2026 : }
2027 :
2028 : /*
2029 : * Pre-COW all shared blocks within a given byte range of a file and turn off
2030 : * the reflink flag if we unshare all of the file's blocks.
2031 : */
2032 : int
2033 315 : xfs_reflink_unshare(
2034 : struct xfs_inode *ip,
2035 : xfs_off_t offset,
2036 : xfs_off_t len)
2037 : {
2038 315 : struct inode *inode = VFS_I(ip);
2039 315 : int error;
2040 :
2041 315 : if (!xfs_is_reflink_inode(ip))
2042 : return 0;
2043 :
2044 188 : trace_xfs_reflink_unshare(ip, offset, len);
2045 :
2046 188 : inode_dio_wait(inode);
2047 :
2048 188 : if (IS_DAX(inode))
2049 0 : error = dax_file_unshare(inode, offset, len,
2050 : &xfs_dax_write_iomap_ops);
2051 : else
2052 188 : error = iomap_file_unshare(inode, offset, len,
2053 : &xfs_buffered_write_iomap_ops);
2054 188 : if (error)
2055 0 : goto out;
2056 :
2057 188 : error = filemap_write_and_wait_range(inode->i_mapping, offset,
2058 188 : offset + len - 1);
2059 188 : if (error)
2060 8 : goto out;
2061 :
2062 : /* Turn off the reflink flag if possible. */
2063 180 : error = xfs_reflink_try_clear_inode_flag(ip);
2064 180 : if (error)
2065 0 : goto out;
2066 : return 0;
2067 :
2068 8 : out:
2069 8 : trace_xfs_reflink_unshare_error(ip, error, _RET_IP_);
2070 8 : return error;
2071 : }
|