// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_alloc.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_refcount.h"
#include "xfs_refcount_btree.h"
#include "xfs_extent_busy.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_quota.h"
#include "xfs_qm.h"
#include "xfs_bmap.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_attr_remote.h"
#include "xfs_defer.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
#include "scrub/reap.h"

/*
 * Disposal of Blocks from Old Metadata
 *
 * Now that we've constructed a new btree to replace the damaged one, we want
 * to dispose of the blocks that (we think) the old btree was using.
 * Previously, we used the rmapbt to collect the extents (bitmap) with the
 * rmap owner corresponding to the tree we rebuilt, collected extents for any
 * blocks with the same rmap owner that are owned by another data structure
 * (sublist), and subtracted sublist from bitmap. In theory the extents
 * remaining in bitmap are the old btree's blocks.
 *
 * Unfortunately, it's possible that the btree was crosslinked with other
 * blocks on disk. The rmap data can tell us if there are multiple owners, so
 * if the rmapbt says there is an owner of this block other than @oinfo, then
 * the block is crosslinked. Remove the reverse mapping and continue.
 *
 * If there is one rmap record, we can free the block, which removes the
 * reverse mapping but doesn't add the block to the free space. Our repair
 * strategy is to hope the other metadata objects crosslinked on this block
 * will be rebuilt (atop different blocks), thereby removing all the cross
 * links.
 *
 * If there are no rmap records at all, we also free the block. If the btree
 * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't
 * supposed to be an rmap record and everything is ok. For other btrees there
 * had to have been an rmap entry for the block to have ended up on @bitmap,
 * so if it's gone now there's something wrong and the fs will shut down.
 *
 * Note: If there are multiple rmap records with only the same rmap owner as
 * the btree we're trying to rebuild and the block is indeed owned by another
 * data structure with the same rmap owner, then the block will be in sublist
 * and therefore doesn't need disposal. If there are multiple rmap records
 * with only the same rmap owner but the block is not owned by something with
 * the same rmap owner, the block will be freed.
 *
 * The caller is responsible for locking the AG headers/inode for the entire
 * rebuild operation so that nothing else can sneak in and change the incore
 * state while we're not looking. We must also invalidate any buffers
 * associated with @bitmap.
 */
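
/*
 * Quick summary of the disposal rules laid out above (an editorial aid, not
 * upstream text):
 *
 *      rmapbt state for a block                disposition
 *      ------------------------------------    ---------------------------
 *      owners besides @oinfo                   crosslinked: drop our rmap
 *      exactly one rmap record (ours)          free the extent
 *      no rmap record, free-space btree        free the extent (expected)
 *      no rmap record, any other btree         corruption; fs shuts down
 */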

/* Information about reaping extents after a repair. */
struct xreap_state {
        struct xfs_scrub        *sc;

        /* Reverse mapping owner and metadata reservation type. */
        const struct xfs_owner_info     *oinfo;
        enum xfs_ag_resv_type   resv;

        /* If true, roll the transaction before reaping the next extent. */
        bool                    force_roll;

        /* Number of deferred reaps attached to the current transaction. */
        unsigned int            deferred;

        /* Number of invalidated buffers logged to the current transaction. */
        unsigned int            invalidated;

        /* Number of deferred reaps queued during the whole reap sequence. */
        unsigned long long      total_deferred;
};

/* Put a block back on the AGFL. */
STATIC int
xreap_put_freelist(
        struct xfs_scrub        *sc,
        xfs_agblock_t           agbno)
{
        struct xfs_buf          *agfl_bp;
        int                     error;

        /* Make sure there's space on the freelist. */
        error = xrep_fix_freelist(sc, 0);
        if (error)
                return error;

        /*
         * Since we're "freeing" a lost block onto the AGFL, we have to
         * create an rmap for the block prior to merging it or else other
         * parts will break.
         */
        error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, 1,
                        &XFS_RMAP_OINFO_AG);
        if (error)
                return error;

        /* Put the block on the AGFL. */
        error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
        if (error)
                return error;

        error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp,
                        agfl_bp, agbno, 0);
        if (error)
                return error;
        xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1,
                        XFS_EXTENT_BUSY_SKIP_DISCARD);

        return 0;
}

/* Are there any uncommitted reap operations? */
static inline bool xreap_dirty(const struct xreap_state *rs)
{
        if (rs->force_roll)
                return true;
        if (rs->deferred)
                return true;
        if (rs->invalidated)
                return true;
        if (rs->total_deferred)
                return true;
        return false;
}

#define XREAP_MAX_DEFERRED      (128)
#define XREAP_MAX_BINVAL        (2048)

/*
 * Decide if we want to roll the transaction after reaping an extent. We
 * don't want to overrun the transaction reservation, so we prohibit more
 * than 128 EFIs per transaction. For the same reason, we limit the number
 * of buffer invalidations to 2048.
 */
static inline bool xreap_want_roll(const struct xreap_state *rs)
{
        if (rs->force_roll)
                return true;
        if (rs->deferred > XREAP_MAX_DEFERRED)
                return true;
        if (rs->invalidated > XREAP_MAX_BINVAL)
                return true;
        return false;
}

static inline void xreap_reset(struct xreap_state *rs)
{
        rs->total_deferred += rs->deferred;
        rs->deferred = 0;
        rs->invalidated = 0;
        rs->force_roll = false;
}

#define XREAP_MAX_DEFER_CHAIN   (2048)

/*
 * Decide if we want to finish the deferred ops that are attached to the
 * scrub transaction. We don't want to queue huge chains of deferred ops
 * because that can consume a lot of log space and kernel memory. Hence we
 * trigger an xfs_defer_finish if there are more than 2048 deferred reap
 * operations or the caller did some real work.
 */
static inline bool
xreap_want_defer_finish(const struct xreap_state *rs)
{
        if (rs->force_roll)
                return true;
        if (rs->total_deferred > XREAP_MAX_DEFER_CHAIN)
                return true;
        return false;
}

static inline void xreap_defer_finish_reset(struct xreap_state *rs)
{
        rs->total_deferred = 0;
        rs->deferred = 0;
        rs->invalidated = 0;
        rs->force_roll = false;
}

/*
 * Compute the maximum length of a buffer cache scan (in units of sectors),
 * given a quantity of fs blocks.
 */
xfs_daddr_t
xrep_bufscan_max_sectors(
        struct xfs_mount        *mp,
        xfs_extlen_t            fsblocks)
{
        int                     max_fsbs;

        /* Remote xattr values are the largest buffers that we support. */
        max_fsbs = xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX);

        return XFS_FSB_TO_BB(mp, min_t(xfs_extlen_t, fsblocks, max_fsbs));
}
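
/*
 * Worked example for the clamp above (an editorial sketch; the exact figure
 * depends on the fs geometry and on how xfs_attr3_rmt_blocks() accounts for
 * per-block remote value headers): on a V5 filesystem with 4k blocks, a 64k
 * (XFS_XATTR_SIZE_MAX) remote value spans roughly 17 fs blocks, so no scan
 * ever probes for buffers longer than that, however large the extent being
 * reaped happens to be.
 */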

/*
 * Return an incore buffer from a sector scan, or NULL if there are no
 * buffers left to return.
 */
struct xfs_buf *
xrep_bufscan_advance(
        struct xfs_mount        *mp,
        struct xrep_bufscan     *scan)
{
        scan->__sector_count += scan->daddr_step;
        while (scan->__sector_count <= scan->max_sectors) {
                struct xfs_buf  *bp = NULL;
                int             error;

                error = xfs_buf_incore(mp->m_ddev_targp, scan->daddr,
                                scan->__sector_count, XBF_LIVESCAN, &bp);
                if (!error)
                        return bp;

                scan->__sector_count += scan->daddr_step;
        }

        return NULL;
}
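
/*
 * All of the binval loops below share the same scan pattern; a minimal
 * sketch, drawn directly from xreap_agextent_binval() further down (mp,
 * agno, bno, and len come from the enclosing loop):
 *
 *      struct xrep_bufscan     scan = {
 *              .daddr          = XFS_AGB_TO_DADDR(mp, agno, bno),
 *              .max_sectors    = xrep_bufscan_max_sectors(mp, len),
 *              .daddr_step     = XFS_FSB_TO_BB(mp, 1),
 *      };
 *      struct xfs_buf          *bp;
 *
 *      while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) {
 *              xfs_trans_bjoin(sc->tp, bp);
 *              xfs_trans_binval(sc->tp, bp);
 *      }
 */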

/* Try to invalidate the incore buffers for an extent that we're freeing. */
STATIC void
xreap_agextent_binval(
        struct xreap_state      *rs,
        xfs_agblock_t           agbno,
        xfs_extlen_t            *aglenp)
{
        struct xfs_scrub        *sc = rs->sc;
        struct xfs_perag        *pag = sc->sa.pag;
        struct xfs_mount        *mp = sc->mp;
        xfs_agnumber_t          agno = sc->sa.pag->pag_agno;
        xfs_agblock_t           agbno_next = agbno + *aglenp;
        xfs_agblock_t           bno = agbno;

        /*
         * Avoid invalidating AG headers and post-EOFS blocks because we never
         * own those.
         */
        if (!xfs_verify_agbno(pag, agbno) ||
            !xfs_verify_agbno(pag, agbno_next - 1))
                return;

        /*
         * If there are incore buffers for these blocks, invalidate them. We
         * assume that the lack of any other known owners means that the
         * buffer can be locked without risk of deadlocking. The buffer cache
         * cannot detect aliasing, so employ nested loops to scan for incore
         * buffers of any plausible size.
         */
        while (bno < agbno_next) {
                struct xrep_bufscan     scan = {
                        .daddr          = XFS_AGB_TO_DADDR(mp, agno, bno),
                        .max_sectors    = xrep_bufscan_max_sectors(mp,
                                                        agbno_next - bno),
                        .daddr_step     = XFS_FSB_TO_BB(mp, 1),
                };
                struct xfs_buf  *bp;

                while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) {
                        xfs_trans_bjoin(sc->tp, bp);
                        xfs_trans_binval(sc->tp, bp);
                        rs->invalidated++;

                        /*
                         * Stop invalidating if we've hit the limit; we should
                         * still have enough reservation left to free however
                         * far we've gotten.
                         */
                        if (rs->invalidated > XREAP_MAX_BINVAL) {
                                *aglenp -= agbno_next - bno;
                                goto out;
                        }
                }

                bno++;
        }

out:
        trace_xreap_agextent_binval(sc->sa.pag, agbno, *aglenp);
}

/*
 * Figure out the longest run of blocks that we can dispose of with a single
 * call. Cross-linked blocks should have their reverse mappings removed, but
 * single-owner extents can be freed. AGFL blocks can only be put back one at
 * a time.
 */
STATIC int
xreap_agextent_select(
        struct xreap_state      *rs,
        xfs_agblock_t           agbno,
        xfs_agblock_t           agbno_next,
        bool                    *crosslinked,
        xfs_extlen_t            *aglenp)
{
        struct xfs_scrub        *sc = rs->sc;
        struct xfs_btree_cur    *cur;
        xfs_agblock_t           bno = agbno + 1;
        xfs_extlen_t            len = 1;
        int                     error;

        /*
         * Determine if there are any other rmap records covering the first
         * block of this extent. If so, the block is crosslinked.
         */
        cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
                        sc->sa.pag);
        error = xfs_rmap_has_other_keys(cur, agbno, 1, rs->oinfo,
                        crosslinked);
        if (error)
                goto out_cur;

        /* AGFL blocks can only be dealt with one at a time. */
        if (rs->resv == XFS_AG_RESV_AGFL)
                goto out_found;

        /*
         * Figure out how many of the subsequent blocks have the same
         * crosslink status.
         */
        while (bno < agbno_next) {
                bool            also_crosslinked;

                error = xfs_rmap_has_other_keys(cur, bno, 1, rs->oinfo,
                                &also_crosslinked);
                if (error)
                        goto out_cur;

                if (*crosslinked != also_crosslinked)
                        break;

                len++;
                bno++;
        }

out_found:
        *aglenp = len;
        trace_xreap_agextent_select(sc->sa.pag, agbno, len, *crosslinked);
out_cur:
        xfs_btree_del_cursor(cur, error);
        return error;
}

/*
 * Dispose of as much of the beginning of this AG extent as possible. The
 * number of blocks disposed of will be returned in @aglenp.
 */
STATIC int
xreap_agextent_iter(
        struct xreap_state      *rs,
        xfs_agblock_t           agbno,
        xfs_extlen_t            *aglenp,
        bool                    crosslinked)
{
        struct xfs_scrub        *sc = rs->sc;
        xfs_fsblock_t           fsbno;
        int                     error = 0;

        fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, agbno);

        /*
         * If there are other rmappings, this block is crosslinked and must
         * not be freed. Remove the reverse mapping and move on. Otherwise,
         * we were the only owner of the block, so free the extent, which
         * will also remove the rmap.
         *
         * XXX: XFS doesn't support detecting the case where a single block
         * metadata structure is crosslinked with a multi-block structure
         * because the buffer cache doesn't detect aliasing problems, so we
         * can't fix 100% of crosslinking problems (yet). The verifiers will
         * blow on writeout, the filesystem will shut down, and the admin gets
         * to run xfs_repair.
         */
        if (crosslinked) {
                trace_xreap_dispose_unmap_extent(sc->sa.pag, agbno, *aglenp);

                rs->force_roll = true;

                if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
                        /*
                         * If we're unmapping CoW staging extents, remove the
                         * records from the refcountbt, which will remove the
                         * rmap record as well.
                         */
                        xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp);
                        return 0;
                }

                return xfs_rmap_free(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno,
                                *aglenp, rs->oinfo);
        }

        trace_xreap_dispose_free_extent(sc->sa.pag, agbno, *aglenp);

        /*
         * Invalidate as many buffers as we can, starting at agbno. If this
         * function sets *aglenp to zero, the transaction is full of logged
         * buffer invalidations, so we need to return early so that we can
         * roll and retry.
         */
        xreap_agextent_binval(rs, agbno, aglenp);
        if (*aglenp == 0) {
                ASSERT(xreap_want_roll(rs));
                return 0;
        }

        /*
         * If we're getting rid of CoW staging extents, use deferred work
         * items to remove the refcountbt records (which removes the rmap
         * records) and free the extent. We're not worried about the system
         * going down here because log recovery walks the refcount btree to
         * clean out the CoW staging extents.
         */
        if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
                ASSERT(rs->resv == XFS_AG_RESV_NONE);

                xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp);
                error = __xfs_free_extent_later(sc->tp, fsbno, *aglenp, NULL,
                                rs->resv, true);
                if (error)
                        return error;

                rs->force_roll = true;
                return 0;
        }

        /* Put blocks back on the AGFL one at a time. */
        if (rs->resv == XFS_AG_RESV_AGFL) {
                ASSERT(*aglenp == 1);
                error = xreap_put_freelist(sc, agbno);
                if (error)
                        return error;

                rs->force_roll = true;
                return 0;
        }

        /*
         * Use deferred frees to get rid of the old btree blocks to try to
         * minimize the window in which we could crash and lose the old
         * blocks.
         */
        error = __xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo,
                        rs->resv, true);
        if (error)
                return error;

        rs->deferred++;
        return 0;
}

/*
 * Break an AG metadata extent into sub-extents by fate (crosslinked, not
 * crosslinked), and dispose of each sub-extent separately.
 */
STATIC int
xreap_agmeta_extent(
        uint64_t                fsbno,
        uint64_t                len,
        void                    *priv)
{
        struct xreap_state      *rs = priv;
        struct xfs_scrub        *sc = rs->sc;
        xfs_agblock_t           agbno = fsbno;
        xfs_agblock_t           agbno_next = agbno + len;
        int                     error = 0;

        ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
        ASSERT(sc->ip == NULL);

        while (agbno < agbno_next) {
                xfs_extlen_t    aglen;
                bool            crosslinked;

                error = xreap_agextent_select(rs, agbno, agbno_next,
                                &crosslinked, &aglen);
                if (error)
                        return error;

                error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked);
                if (error)
                        return error;

                if (xreap_want_defer_finish(rs)) {
                        error = xrep_defer_finish(sc);
                        if (error)
                                return error;
                        xreap_defer_finish_reset(rs);
                } else if (xreap_want_roll(rs)) {
                        error = xrep_roll_ag_trans(sc);
                        if (error)
                                return error;
                        xreap_reset(rs);
                }

                agbno += aglen;
        }

        return 0;
}

/* Dispose of every block of every AG metadata extent in the bitmap. */
int
xrep_reap_agblocks(
        struct xfs_scrub        *sc,
        struct xagb_bitmap      *bitmap,
        const struct xfs_owner_info     *oinfo,
        enum xfs_ag_resv_type   type)
{
        struct xreap_state      rs = {
                .sc             = sc,
                .oinfo          = oinfo,
                .resv           = type,
        };
        int                     error;

        ASSERT(xfs_has_rmapbt(sc->mp));
        ASSERT(sc->ip == NULL);

        error = xagb_bitmap_walk(bitmap, xreap_agmeta_extent, &rs);
        if (error)
                return error;

        if (xreap_dirty(&rs))
                return xrep_defer_finish(sc);

        return 0;
}
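
/*
 * Sketch of a typical caller, for orientation only (the bitmap helpers are
 * assumptions based on scrub/bitmap.h, and the collection step is elided):
 *
 *      struct xagb_bitmap      old_blocks;
 *
 *      xagb_bitmap_init(&old_blocks);
 *      (record the rebuilt btree's old blocks here, minus anything that
 *       another structure still owns)
 *      error = xrep_reap_agblocks(sc, &old_blocks, &XFS_RMAP_OINFO_AG,
 *                      XFS_AG_RESV_NONE);
 *      xagb_bitmap_destroy(&old_blocks);
 */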

/*
 * Break a file metadata extent into sub-extents by fate (crosslinked, not
 * crosslinked), and dispose of each sub-extent separately. The extent must
 * not cross an AG boundary.
 */
STATIC int
xreap_fsmeta_extent(
        uint64_t                fsbno,
        uint64_t                len,
        void                    *priv)
{
        struct xreap_state      *rs = priv;
        struct xfs_scrub        *sc = rs->sc;
        xfs_agnumber_t          agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
        xfs_agblock_t           agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
        xfs_agblock_t           agbno_next = agbno + len;
        int                     error = 0;

        ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
        ASSERT(sc->ip != NULL);
        ASSERT(!sc->sa.pag);

        /*
         * We're reaping blocks after repairing file metadata, which means
         * that we have to init the xchk_ag structure ourselves.
         */
        sc->sa.pag = xfs_perag_get(sc->mp, agno);
        if (!sc->sa.pag)
                return -EFSCORRUPTED;

        error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &sc->sa.agf_bp);
        if (error)
                goto out_pag;

        while (agbno < agbno_next) {
                xfs_extlen_t    aglen;
                bool            crosslinked;

                error = xreap_agextent_select(rs, agbno, agbno_next,
                                &crosslinked, &aglen);
                if (error)
                        goto out_agf;

                error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked);
                if (error)
                        goto out_agf;

                if (xreap_want_defer_finish(rs)) {
                        /*
                         * Hold the AGF buffer across the deferred chain
                         * processing.
                         */
                        error = xrep_defer_finish(sc);
                        if (error)
                                goto out_agf;
                        xreap_defer_finish_reset(rs);
                } else if (xreap_want_roll(rs)) {
                        /*
                         * Hold the AGF buffer across the transaction roll so
                         * that we don't have to reattach it to the scrub
                         * context.
                         */
                        xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
                        error = xfs_trans_roll_inode(&sc->tp, sc->ip);
                        xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
                        if (error)
                                goto out_agf;
                        xreap_reset(rs);
                }

                agbno += aglen;
        }

out_agf:
        xfs_trans_brelse(sc->tp, sc->sa.agf_bp);
        sc->sa.agf_bp = NULL;
out_pag:
        xfs_perag_put(sc->sa.pag);
        sc->sa.pag = NULL;
        return error;
}

/*
 * Dispose of every block of every fs metadata extent in the bitmap.
 * Do not use this to dispose of the mappings in an ondisk inode fork.
 */
int
xrep_reap_fsblocks(
        struct xfs_scrub        *sc,
        struct xfsb_bitmap      *bitmap,
        const struct xfs_owner_info     *oinfo)
{
        struct xreap_state      rs = {
                .sc             = sc,
                .oinfo          = oinfo,
                .resv           = XFS_AG_RESV_NONE,
        };
        int                     error;

        ASSERT(xfs_has_rmapbt(sc->mp));
        ASSERT(sc->ip != NULL);

        error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs);
        if (error)
                return error;

        if (xreap_dirty(&rs))
                return xrep_defer_finish(sc);

        return 0;
}

/*
 * Metadata files are not supposed to share blocks with anything else.
 * If blocks are shared, we remove the reverse mapping (thus reducing the
 * crosslink factor); if blocks are not shared, we also need to free them.
 *
 * This first step determines the longest subset of the passed-in imap
 * (starting at its beginning) that is either crosslinked or not crosslinked.
 * The blockcount will be adjusted down as needed.
 */
STATIC int
xreap_bmapi_select(
        struct xfs_scrub        *sc,
        struct xfs_inode        *ip,
        int                     whichfork,
        struct xfs_bmbt_irec    *imap,
        bool                    *crosslinked)
{
        struct xfs_owner_info   oinfo;
        struct xfs_btree_cur    *cur;
        xfs_filblks_t           len = 1;
        xfs_agblock_t           bno;
        xfs_agblock_t           agbno;
        xfs_agblock_t           agbno_next;
        int                     error;

        agbno = XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock);
        agbno_next = agbno + imap->br_blockcount;

        cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
                        sc->sa.pag);

        xfs_rmap_ino_owner(&oinfo, ip->i_ino, whichfork, imap->br_startoff);
        error = xfs_rmap_has_other_keys(cur, agbno, 1, &oinfo, crosslinked);
        if (error)
                goto out_cur;

        bno = agbno + 1;
        while (bno < agbno_next) {
                bool            also_crosslinked;

                oinfo.oi_offset++;
                error = xfs_rmap_has_other_keys(cur, bno, 1, &oinfo,
                                &also_crosslinked);
                if (error)
                        goto out_cur;

                if (also_crosslinked != *crosslinked)
                        break;

                len++;
                bno++;
        }

        imap->br_blockcount = len;
        trace_xreap_bmapi_select(sc->sa.pag, agbno, len, *crosslinked);
out_cur:
        xfs_btree_del_cursor(cur, error);
        return error;
}

/*
 * Decide if this buffer can be joined to a transaction. This is true for
 * most buffers, but there are two cases that we want to catch: large remote
 * xattr value buffers are not logged and can overflow the buffer log item
 * dirty bitmap size; and oversized cached buffers if things have really gone
 * haywire.
 */
static inline bool
xreap_buf_loggable(
        const struct xfs_buf    *bp)
{
        int                     i;

        for (i = 0; i < bp->b_map_count; i++) {
                int             chunks;
                int             map_size;

                chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len),
                                XFS_BLF_CHUNK);
                map_size = DIV_ROUND_UP(chunks, NBWORD);
                if (map_size > XFS_BLF_DATAMAP_SIZE)
                        return false;
        }

        return true;
}
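
/*
 * Worked example for the check above (editorial; assumes XFS_BLF_CHUNK is
 * 128 bytes and XFS_BLF_DATAMAP_SIZE is sized for XFS_MAX_BLOCKSIZE, 64k):
 * a 64k buffer map dirties 65536 / 128 = 512 chunks, which just fits in the
 * dirty bitmap, so anything longer (e.g. a multi-block remote xattr value
 * with its per-block headers) fails the check and is staled without being
 * logged.
 */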

/*
 * Invalidate any buffers for this file mapping. The @imap blockcount may be
 * adjusted downward if we need to roll the transaction.
 */
STATIC int
xreap_bmapi_binval(
        struct xfs_scrub        *sc,
        struct xfs_inode        *ip,
        int                     whichfork,
        struct xfs_bmbt_irec    *imap)
{
        struct xfs_mount        *mp = sc->mp;
        struct xfs_perag        *pag = sc->sa.pag;
        int                     bmap_flags = xfs_bmapi_aflag(whichfork);
        xfs_fileoff_t           off;
        xfs_fileoff_t           max_off;
        xfs_extlen_t            scan_blocks;
        xfs_agnumber_t          agno = sc->sa.pag->pag_agno;
        xfs_agblock_t           bno;
        xfs_agblock_t           agbno;
        xfs_agblock_t           agbno_next;
        unsigned int            invalidated = 0;
        int                     error;

        /*
         * Avoid invalidating AG headers and post-EOFS blocks because we never
         * own those.
         */
        agbno = bno = XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock);
        agbno_next = agbno + imap->br_blockcount;
        if (!xfs_verify_agbno(pag, agbno) ||
            !xfs_verify_agbno(pag, agbno_next - 1))
                return 0;

        /*
         * Buffers for file blocks can span multiple contiguous mappings.
         * This means that for each block in the mapping, there could exist
         * an xfs_buf indexed by that block with any length up to the maximum
         * buffer size (remote xattr values) or to the next hole in the fork.
         * To set up our binval scan, first we need to figure out the location
         * of the next hole.
         */
        off = imap->br_startoff + imap->br_blockcount;
        max_off = off + xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX);
        while (off < max_off) {
                struct xfs_bmbt_irec    hmap;
                int                     nhmaps = 1;

                error = xfs_bmapi_read(ip, off, max_off - off, &hmap,
                                &nhmaps, bmap_flags);
                if (error)
                        return error;
                if (nhmaps != 1 || hmap.br_startblock == DELAYSTARTBLOCK) {
                        ASSERT(0);
                        return -EFSCORRUPTED;
                }

                if (!xfs_bmap_is_real_extent(&hmap))
                        break;

                off = hmap.br_startoff + hmap.br_blockcount;
        }
        scan_blocks = off - imap->br_startoff;

        trace_xreap_bmapi_binval_scan(sc, imap, scan_blocks);

        /*
         * If there are incore buffers for these blocks, invalidate them. If
         * we can't (try)lock the buffer we assume it's owned by someone else
         * and leave it alone. The buffer cache cannot detect aliasing, so
         * employ nested loops to detect incore buffers of any plausible size.
         */
        while (bno < agbno_next) {
                struct xrep_bufscan     scan = {
                        .daddr          = XFS_AGB_TO_DADDR(mp, agno, bno),
                        .max_sectors    = xrep_bufscan_max_sectors(mp,
                                                        scan_blocks),
                        .daddr_step     = XFS_FSB_TO_BB(mp, 1),
                };
                struct xfs_buf  *bp;

                while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) {
                        if (xreap_buf_loggable(bp)) {
                                xfs_trans_bjoin(sc->tp, bp);
                                xfs_trans_binval(sc->tp, bp);
                        } else {
                                xfs_buf_stale(bp);
                                xfs_buf_relse(bp);
                        }
                        invalidated++;

                        /*
                         * Stop invalidating if we've hit the limit; we should
                         * still have enough reservation left to free however
                         * much of the mapping we've seen so far.
                         */
                        if (invalidated > XREAP_MAX_BINVAL) {
                                imap->br_blockcount = agbno_next - bno;
                                goto out;
                        }
                }

                bno++;
                scan_blocks--;
        }

out:
        trace_xreap_bmapi_binval(sc->sa.pag, agbno, imap->br_blockcount);
        return 0;
}

/*
 * Dispose of as much of the beginning of this file fork mapping as possible.
 * The number of blocks disposed of is returned in @imap->br_blockcount.
 */
STATIC int
xrep_reap_bmapi_iter(
        struct xfs_scrub        *sc,
        struct xfs_inode        *ip,
        int                     whichfork,
        struct xfs_bmbt_irec    *imap,
        bool                    crosslinked)
{
        int                     error;

        if (crosslinked) {
                /*
                 * If there are other rmappings, this block is crosslinked
                 * and must not be freed. Remove the reverse mapping, leave
                 * the buffer cache in its possibly confused state, and move
                 * on. We don't want to risk discarding valid data buffers
                 * from anybody else who thinks they own the block, even
                 * though that runs the risk of stale buffer warnings in the
                 * future.
                 */
                trace_xreap_dispose_unmap_extent(sc->sa.pag,
                                XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock),
                                imap->br_blockcount);

                /*
                 * Schedule removal of the mapping from the fork. We use
                 * deferred log intents in this function to control the exact
                 * sequence of metadata updates.
                 */
                xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap);
                xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT,
                                -(int64_t)imap->br_blockcount);
                xfs_rmap_unmap_extent(sc->tp, ip, whichfork, imap);
                return 0;
        }

        /*
         * If the block is not crosslinked, we can invalidate all the incore
         * buffers for the extent, and then free the extent. This is a bit of
         * a mess since we don't detect discontiguous buffers that are indexed
         * by a block starting before the first block of the extent but
         * overlap anyway.
         */
        trace_xreap_dispose_free_extent(sc->sa.pag,
                        XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock),
                        imap->br_blockcount);

        /*
         * Invalidate as many buffers as we can, starting at the beginning of
         * this mapping. If this function sets blockcount to zero, the
         * transaction is full of logged buffer invalidations, so we need to
         * return early so that we can roll and retry.
         */
        error = xreap_bmapi_binval(sc, ip, whichfork, imap);
        if (error || imap->br_blockcount == 0)
                return error;

        /*
         * Schedule removal of the mapping from the fork. We use deferred log
         * intents in this function to control the exact sequence of metadata
         * updates.
         */
        xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap);
        xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT,
                        -(int64_t)imap->br_blockcount);
        return __xfs_free_extent_later(sc->tp, imap->br_startblock,
                        imap->br_blockcount, NULL, XFS_AG_RESV_NONE, true);
}

/*
 * Dispose of as much of this file extent as we can. Upon successful return,
 * the imap will reflect the mapping that was removed from the fork.
 */
STATIC int
xreap_ifork_extent(
        struct xfs_scrub        *sc,
        struct xfs_inode        *ip,
        int                     whichfork,
        struct xfs_bmbt_irec    *imap)
{
        xfs_agnumber_t          agno;
        bool                    crosslinked;
        int                     error;

        ASSERT(sc->sa.pag == NULL);

        trace_xreap_ifork_extent(sc, ip, whichfork, imap);

        agno = XFS_FSB_TO_AGNO(sc->mp, imap->br_startblock);
        sc->sa.pag = xfs_perag_get(sc->mp, agno);
        if (!sc->sa.pag)
                return -EFSCORRUPTED;

        error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &sc->sa.agf_bp);
        if (error)
                goto out_pag;

        /*
         * Decide the fate of the blocks at the beginning of the mapping, then
         * update the mapping to use it with the unmap calls.
         */
        error = xreap_bmapi_select(sc, ip, whichfork, imap, &crosslinked);
        if (error)
                goto out_agf;

        error = xrep_reap_bmapi_iter(sc, ip, whichfork, imap, crosslinked);
        if (error)
                goto out_agf;

out_agf:
        xfs_trans_brelse(sc->tp, sc->sa.agf_bp);
        sc->sa.agf_bp = NULL;
out_pag:
        xfs_perag_put(sc->sa.pag);
        sc->sa.pag = NULL;
        return error;
}

/*
 * Dispose of each block mapped to the given fork of the given file. Callers
 * must hold ILOCK_EXCL, and ip can only be sc->ip or sc->tempip. The fork
 * must not have any delalloc reservations.
 */
int
xrep_reap_ifork(
        struct xfs_scrub        *sc,
        struct xfs_inode        *ip,
        int                     whichfork)
{
        xfs_fileoff_t           off = 0;
        int                     bmap_flags = xfs_bmapi_aflag(whichfork);
        int                     error;

        ASSERT(xfs_has_rmapbt(sc->mp));
        ASSERT(ip == sc->ip || ip == sc->tempip);
        ASSERT(whichfork == XFS_ATTR_FORK || !XFS_IS_REALTIME_INODE(ip));

        while (off < XFS_MAX_FILEOFF) {
                struct xfs_bmbt_irec    imap;
                int                     nimaps = 1;

                /* Read the next extent, skip past holes and delalloc. */
                error = xfs_bmapi_read(ip, off, XFS_MAX_FILEOFF - off, &imap,
                                &nimaps, bmap_flags);
                if (error)
                        return error;
                if (nimaps != 1 || imap.br_startblock == DELAYSTARTBLOCK) {
                        ASSERT(0);
                        return -EFSCORRUPTED;
                }

                /*
                 * If this is a real space mapping, reap as much of it as we
                 * can in a single transaction.
                 */
                if (xfs_bmap_is_real_extent(&imap)) {
                        error = xreap_ifork_extent(sc, ip, whichfork, &imap);
                        if (error)
                                return error;

                        error = xfs_defer_finish(&sc->tp);
                        if (error)
                                return error;
                }

                off = imap.br_startoff + imap.br_blockcount;
        }

        return 0;
}
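
/*
 * Example invocation (a sketch; per the comment above, the caller must hold
 * ILOCK_EXCL, and, as an additional assumption here, must already have the
 * inode joined to sc->tp so that the unmap intents can be logged):
 *
 *      error = xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK);
 *      if (error)
 *              goto out;
 */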