Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-or-later
2 : /*
3 : * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
4 : * Author: Darrick J. Wong <djwong@kernel.org>
5 : */
6 : #include "xfs.h"
7 : #include "xfs_fs.h"
8 : #include "xfs_shared.h"
9 : #include "xfs_format.h"
10 : #include "xfs_trans_resv.h"
11 : #include "xfs_mount.h"
12 : #include "xfs_btree.h"
13 : #include "xfs_log_format.h"
14 : #include "xfs_trans.h"
15 : #include "xfs_sb.h"
16 : #include "xfs_inode.h"
17 : #include "xfs_alloc.h"
18 : #include "xfs_alloc_btree.h"
19 : #include "xfs_ialloc.h"
20 : #include "xfs_ialloc_btree.h"
21 : #include "xfs_rmap.h"
22 : #include "xfs_rmap_btree.h"
23 : #include "xfs_refcount_btree.h"
24 : #include "xfs_extent_busy.h"
25 : #include "xfs_ag.h"
26 : #include "xfs_ag_resv.h"
27 : #include "xfs_quota.h"
28 : #include "xfs_qm.h"
29 : #include "xfs_defer.h"
30 : #include "xfs_errortag.h"
31 : #include "xfs_error.h"
32 : #include "xfs_reflink.h"
33 : #include "xfs_health.h"
34 : #include "xfs_buf_xfile.h"
35 : #include "xfs_da_format.h"
36 : #include "xfs_da_btree.h"
37 : #include "xfs_attr.h"
38 : #include "xfs_dir2.h"
39 : #include "xfs_rtrmap_btree.h"
40 : #include "xfs_rtbitmap.h"
41 : #include "xfs_rtgroup.h"
42 : #include "xfs_rtalloc.h"
43 : #include "xfs_imeta.h"
44 : #include "xfs_rtrefcount_btree.h"
45 : #include "scrub/scrub.h"
46 : #include "scrub/common.h"
47 : #include "scrub/trace.h"
48 : #include "scrub/repair.h"
49 : #include "scrub/bitmap.h"
50 : #include "scrub/stats.h"
51 : #include "scrub/xfile.h"
52 : #include "scrub/attr_repair.h"
53 :
54 : /*
55 : * Attempt to repair some metadata, if the metadata is corrupt and userspace
56 : * told us to fix it. This function returns -EAGAIN to mean "re-run scrub",
57 : * and will set run->repair_succeeded if it thinks it repaired anything.
58 : */
59 : int
60 18771777 : xrep_attempt(
61 : struct xfs_scrub *sc,
62 : struct xchk_stats_run *run)
63 : {
64 18771777 : u64 repair_start;
65 18771777 : int error = 0;
66 :
67 18771777 : trace_xrep_attempt(XFS_I(file_inode(sc->file)), sc->sm, error);
68 :
69 18770150 : xchk_ag_btcur_free(&sc->sa);
70 18770813 : xchk_rtgroup_btcur_free(&sc->sr);
71 :
72 : /* Repair whatever's broken. */
73 18770804 : ASSERT(sc->ops->repair);
74 18770804 : run->repair_attempted = true;
75 18770804 : repair_start = xchk_stats_now();
76 18770260 : error = sc->ops->repair(sc);
77 18767567 : trace_xrep_done(XFS_I(file_inode(sc->file)), sc->sm, error);
78 18762400 : run->repair_ns += xchk_stats_elapsed_ns(repair_start);
79 18764335 : switch (error) {
80 18376600 : case 0:
81 : /*
82 : * Repair succeeded. Commit the fixes and perform a second
83 : * scrub so that we can tell userspace if we fixed the problem.
84 : */
85 18376600 : sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
86 18376600 : sc->flags |= XREP_ALREADY_FIXED;
87 18376600 : run->repair_succeeded = true;
88 18376600 : return -EAGAIN;
89 213 : case -ECHRNG:
90 213 : sc->flags |= XCHK_NEED_DRAIN;
91 213 : run->retries++;
92 213 : return -EAGAIN;
93 0 : case -EDEADLOCK:
94 : /* Tell the caller to try again having grabbed all the locks. */
95 0 : if (!(sc->flags & XCHK_TRY_HARDER)) {
96 0 : sc->flags |= XCHK_TRY_HARDER;
97 0 : run->retries++;
98 0 : return -EAGAIN;
99 : }
100 : /*
101 : * We tried harder but still couldn't grab all the resources
102 : * we needed to fix it. The corruption has not been fixed,
103 : * so exit to userspace with the scan's output flags unchanged.
104 : */
105 : return 0;
106 387522 : default:
107 : /*
108 : * EAGAIN tells the caller to re-scrub, so we cannot return
109 : * that here.
110 : */
111 387522 : ASSERT(error != -EAGAIN);
112 : return error;
113 : }
114 : }
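/*
 * Editorial sketch (not part of the original source): how a scrub driver
 * loop might consume xrep_attempt()'s -EAGAIN contract.  The real
 * dispatch lives in xfs_scrub_metadata(), which also tears down and
 * re-acquires resources and checks XREP_ALREADY_FIXED between passes;
 * this hypothetical helper shows only the retry shape.
 */
static int
xrep_retry_sketch(
	struct xfs_scrub	*sc,
	struct xchk_stats_run	*run)
{
	int			error;

	do {
		/* (Re-)run the scrubber to classify the metadata. */
		error = sc->ops->scrub(sc);
		if (error)
			return error;
		if (!xrep_will_attempt(sc))
			return 0;	/* nothing (more) to fix */
		/* -EAGAIN from repair means "scrub this again". */
		error = xrep_attempt(sc, run);
	} while (error == -EAGAIN);

	return error;
}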
115 :
116 : /*
117 : * Complain about unfixable problems in the filesystem. We don't log
118 : * corruptions when IFLAG_REPAIR wasn't set on the assumption that the driver
119 : * program is xfs_scrub, which will call back with IFLAG_REPAIR set if the
120 : * administrator isn't running xfs_scrub in no-repairs mode.
121 : *
122 : * Use this helper function because _ratelimited silently declares a static
123 : * structure to track rate limiting information.
124 : */
125 : void
126 0 : xrep_failure(
127 : struct xfs_mount *mp)
128 : {
129 0 : xfs_alert_ratelimited(mp,
130 : "Corruption not fixed during online repair. Unmount and run xfs_repair.");
131 0 : }
132 :
133 : /*
134 : * Repair probe -- userspace uses this to probe if we're willing to repair a
135 : * given mountpoint.
136 : */
137 : int
138 4066 : xrep_probe(
139 : struct xfs_scrub *sc)
140 : {
141 4066 : int error = 0;
142 :
143 4066 : if (xchk_should_terminate(sc, &error))
144 0 : return error;
145 :
146 : return 0;
147 : }
148 :
149 : /*
150 : * Roll a transaction, keeping the AG headers locked and reinitializing
151 : * the btree cursors.
152 : */
153 : int
154 639900 : xrep_roll_ag_trans(
155 : struct xfs_scrub *sc)
156 : {
157 639900 : int error;
158 :
159 : /*
160 : * Keep the AG header buffers locked while we roll the transaction.
161 : * Ensure that both AG buffers are dirty and held when we roll the
162 : * transaction so that they move forward in the log without losing the
163 : * bli (and hence the bli type) when the transaction commits.
164 : *
165 : * Normal code would never hold clean buffers across a roll, but repair
166 : * needs both buffers to maintain a total lock on the AG.
167 : */
168 639900 : if (sc->sa.agi_bp) {
169 639900 : xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp, XFS_AGI_MAGICNUM);
170 640151 : xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
171 : }
172 :
173 639582 : if (sc->sa.agf_bp) {
174 639582 : xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_MAGICNUM);
175 640306 : xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
176 : }
177 :
178 : /*
179 : * Roll the transaction. We still hold the AG header buffers locked
180 : * regardless of whether or not that succeeds. On failure, the buffers
181 : * will be released during teardown on our way out of the kernel. If
182 : * successful, join the buffers to the new transaction and move on.
183 : */
184 640257 : error = xfs_trans_roll(&sc->tp);
185 639248 : if (error)
186 : return error;
187 :
188 : /* Join the AG headers to the new transaction. */
189 639248 : if (sc->sa.agi_bp)
190 639248 : xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
191 639514 : if (sc->sa.agf_bp)
192 639514 : xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
193 :
194 : return 0;
195 : }
196 :
197 : /* Roll the scrub transaction, holding the primary metadata locked. */
198 : int
199 8348322 : xrep_roll_trans(
200 : struct xfs_scrub *sc)
201 : {
202 8348322 : if (!sc->ip)
203 4679 : return xrep_roll_ag_trans(sc);
204 8343643 : return xfs_trans_roll_inode(&sc->tp, sc->ip);
205 : }
206 :
207 : /* Finish all deferred work attached to the repair transaction. */
208 : int
209 1129624 : xrep_defer_finish(
210 : struct xfs_scrub *sc)
211 : {
212 1129624 : int error;
213 :
214 : /*
215 : * Keep the AG header buffers locked while we complete deferred work
216 : * items. Ensure that both AG buffers are dirty and held when we roll
217 : * the transaction so that they move forward in the log without losing
218 : * the bli (and hence the bli type) when the transaction commits.
219 : *
220 : * Normal code would never hold clean buffers across a roll, but repair
221 : * needs both buffers to maintain a total lock on the AG.
222 : */
223 1129624 : if (sc->sa.agi_bp) {
224 843973 : xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp, XFS_AGI_MAGICNUM);
225 844532 : xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
226 : }
227 :
228 1129082 : if (sc->sa.agf_bp) {
229 859017 : xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_MAGICNUM);
230 860839 : xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
231 : }
232 :
233 : /*
234 : * Finish all deferred work items. We still hold the AG header buffers
235 : * locked regardless of whether or not that succeeds. On failure, the
236 : * buffers will be released during teardown on our way out of the
237 : * kernel. If successful, join the buffers to the new transaction
238 : * and move on.
239 : */
240 1130654 : error = xfs_defer_finish(&sc->tp);
241 1129788 : if (error)
242 : return error;
243 :
244 : /*
245 : * Release the hold that we set above because defer_finish won't do
246 : * that for us. The defer roll code redirties held buffers after each
247 : * roll, so the AG header buffers should be ready for logging.
248 : */
249 1129788 : if (sc->sa.agi_bp)
250 844136 : xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
251 1128199 : if (sc->sa.agf_bp)
252 858132 : xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);
253 :
254 : return 0;
255 : }
256 :
257 : /*
258 : * Does the given AG have enough space to rebuild a btree? Neither AG
259 : * reservation can be critical, and we must have enough space (factoring
260 : * in AG reservations) to construct a whole btree.
261 : */
262 : bool
263 0 : xrep_ag_has_space(
264 : struct xfs_perag *pag,
265 : xfs_extlen_t nr_blocks,
266 : enum xfs_ag_resv_type type)
267 : {
268 0 : return !xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) &&
269 0 : !xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA) &&
270 0 : pag->pagf_freeblks > xfs_ag_resv_needed(pag, type) + nr_blocks;
271 : }
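/*
 * Editorial sketch: a btree rebuild helper would typically gate new tree
 * construction on the predicate above.  The wrapper and its nr_blocks
 * argument (which would come from one of the *_calc_size estimates) are
 * hypothetical; only xrep_ag_has_space() and XFS_AG_RESV_NONE are real.
 */
static int
xrep_check_space_sketch(
	struct xfs_scrub	*sc,
	xfs_extlen_t		nr_blocks)
{
	if (!xrep_ag_has_space(sc->sa.pag, nr_blocks, XFS_AG_RESV_NONE))
		return -ENOSPC;
	return 0;
}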
272 :
273 : /*
274 : * Figure out how many blocks to reserve for an AG repair. We calculate the
275 : * worst case estimate for the number of blocks we'd need to rebuild one of
276 : * any type of per-AG btree.
277 : */
278 : xfs_extlen_t
279 6750954 : xrep_calc_ag_resblks(
280 : struct xfs_scrub *sc)
281 : {
282 6750954 : struct xfs_mount *mp = sc->mp;
283 6750954 : struct xfs_scrub_metadata *sm = sc->sm;
284 6750954 : struct xfs_perag *pag;
285 6750954 : struct xfs_buf *bp;
286 6750954 : xfs_agino_t icount = NULLAGINO;
287 6750954 : xfs_extlen_t aglen = NULLAGBLOCK;
288 6750954 : xfs_extlen_t usedlen;
289 6750954 : xfs_extlen_t freelen;
290 6750954 : xfs_extlen_t bnobt_sz;
291 6750954 : xfs_extlen_t inobt_sz;
292 6750954 : xfs_extlen_t rmapbt_sz;
293 6750954 : xfs_extlen_t refcbt_sz;
294 6750954 : int error;
295 :
296 6750954 : if (!(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
297 : return 0;
298 :
299 3009864 : pag = xfs_perag_get(mp, sm->sm_agno);
300 6026588 : if (xfs_perag_initialised_agi(pag)) {
301 : /* Use in-core icount if possible. */
302 3013294 : icount = pag->pagi_count;
303 : } else {
304 : /* Try to get the actual counters from disk. */
305 0 : error = xfs_ialloc_read_agi(pag, NULL, &bp);
306 0 : if (!error) {
307 0 : icount = pag->pagi_count;
308 0 : xfs_buf_relse(bp);
309 : }
310 : }
311 :
312 : /* Now grab the block counters from the AGF. */
313 3013294 : error = xfs_alloc_read_agf(pag, NULL, 0, &bp);
314 3012331 : if (error) {
315 0 : aglen = pag->block_count;
316 0 : freelen = aglen;
317 0 : usedlen = aglen;
318 : } else {
319 3012331 : struct xfs_agf *agf = bp->b_addr;
320 :
321 3012331 : aglen = be32_to_cpu(agf->agf_length);
322 3012331 : freelen = be32_to_cpu(agf->agf_freeblks);
323 3012331 : usedlen = aglen - freelen;
324 3012331 : xfs_buf_relse(bp);
325 : }
326 :
327 : /* If the icount is impossible, make some worst-case assumptions. */
328 3013188 : if (icount == NULLAGINO ||
329 : !xfs_verify_agino(pag, icount)) {
330 495682 : icount = pag->agino_max - pag->agino_min + 1;
331 : }
332 :
333 : /* If the block counts are impossible, make worst-case assumptions. */
334 3013188 : if (aglen == NULLAGBLOCK ||
335 3013047 : aglen != pag->block_count ||
336 : freelen >= aglen) {
337 221 : aglen = pag->block_count;
338 221 : freelen = aglen;
339 221 : usedlen = aglen;
340 : }
341 3013188 : xfs_perag_put(pag);
342 :
343 3013676 : trace_xrep_calc_ag_resblks(mp, sm->sm_agno, icount, aglen,
344 : freelen, usedlen);
345 :
346 : /*
347 : * Figure out how many blocks we'd need worst case to rebuild
348 : * each type of btree. Note that we can only rebuild the
349 : * bnobt/cntbt or inobt/finobt as pairs.
350 : */
351 3013340 : bnobt_sz = 2 * xfs_allocbt_calc_size(mp, freelen);
352 3011825 : if (xfs_has_sparseinodes(mp))
353 3011825 : inobt_sz = xfs_iallocbt_calc_size(mp, icount /
354 : XFS_INODES_PER_HOLEMASK_BIT);
355 : else
356 0 : inobt_sz = xfs_iallocbt_calc_size(mp, icount /
357 : XFS_INODES_PER_CHUNK);
358 3011289 : if (xfs_has_finobt(mp))
359 3011385 : inobt_sz *= 2;
360 3011289 : if (xfs_has_reflink(mp))
361 2994423 : refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen);
362 : else
363 : refcbt_sz = 0;
364 6021076 : if (xfs_has_rmapbt(mp)) {
365 : /*
366 : * Guess how many blocks we need to rebuild the rmapbt.
367 : * For non-reflink filesystems we can't have more records than
368 : * used blocks. However, with reflink it's possible to have
369 : * more than one rmap record per AG block. We don't know how
370 : * many rmaps there could be in the AG, so we start off with
371 : * what we hope is a generous over-estimation.
372 : */
373 2994268 : if (xfs_has_reflink(mp))
374 2993807 : rmapbt_sz = xfs_rmapbt_calc_size(mp,
375 2993807 : (unsigned long long)aglen * 2);
376 : else
377 461 : rmapbt_sz = xfs_rmapbt_calc_size(mp, usedlen);
378 : } else {
379 : rmapbt_sz = 0;
380 : }
381 :
382 3011425 : trace_xrep_calc_ag_resblks_btsize(mp, sm->sm_agno, bnobt_sz,
383 : inobt_sz, rmapbt_sz, refcbt_sz);
384 :
385 3011694 : return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz));
386 : }
387 :
388 : #ifdef CONFIG_XFS_RT
389 : /*
390 : * Figure out how many blocks to reserve for a rtgroup repair. We calculate
391 : * the worst case estimate for the number of blocks we'd need to rebuild one of
392 : * any type of per-rtgroup btree.
393 : */
394 : xfs_extlen_t
395 297364 : xrep_calc_rtgroup_resblks(
396 : struct xfs_scrub *sc)
397 : {
398 297364 : struct xfs_mount *mp = sc->mp;
399 297364 : struct xfs_scrub_metadata *sm = sc->sm;
400 297364 : struct xfs_rtgroup *rtg;
401 297364 : xfs_extlen_t usedlen;
402 297364 : xfs_extlen_t rmapbt_sz = 0;
403 :
404 297364 : if (!(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
405 : return 0;
406 :
407 95846 : rtg = xfs_rtgroup_get(mp, sm->sm_agno);
408 95887 : usedlen = rtg->rtg_blockcount;
409 95887 : xfs_rtgroup_put(rtg);
410 :
411 95896 : if (xfs_has_rmapbt(mp))
412 95897 : rmapbt_sz = xfs_rtrmapbt_calc_size(mp, usedlen);
413 :
414 95846 : trace_xrep_calc_rtgroup_resblks_btsize(mp, sm->sm_agno, usedlen,
415 : rmapbt_sz);
416 :
417 95846 : return rmapbt_sz;
418 : }
419 : #endif /* CONFIG_XFS_RT */
420 :
421 : /*
422 : * Reconstructing per-AG Btrees
423 : *
424 : * When a space btree is corrupt, we don't bother trying to fix it. Instead,
425 : * we scan secondary space metadata to derive the records that should be in
426 : * the damaged btree, initialize a fresh btree root, and insert the records.
427 : * Note that for rebuilding the rmapbt we scan all the primary data to
428 : * generate the new records.
429 : *
430 : * However, that leaves the matter of removing all the metadata describing the
431 : * old broken structure. For primary metadata we use the rmap data to collect
432 : * every extent with a matching rmap owner (bitmap); we then iterate all other
433 : * metadata structures with the same rmap owner to collect the extents that
434 : * cannot be removed (sublist). We then subtract sublist from bitmap to
435 : * derive the blocks that were used by the old btree. These blocks can be
436 : * reaped.
437 : *
438 : * For rmapbt reconstructions we must use different tactics for extent
439 : * collection. First we iterate all primary metadata (this excludes the old
440 : * rmapbt, obviously) to generate new rmap records. The gaps in the rmap
441 : * records are collected as bitmap. The bnobt records are collected as
442 : * sublist. As with the other btrees we subtract sublist from bitmap, and the
443 : * result (since the rmapbt lives in the free space) are the blocks from the
444 : * old rmapbt.
445 : */
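/*
 * Editorial sketch of the bitmap/sublist subtraction described above,
 * using the xagb_bitmap helpers from scrub/bitmap.h.  Exact helper names
 * and signatures vary by kernel version, so treat this as an assumed,
 * pseudocode-grade illustration rather than the shipped implementation.
 */
static inline int
xrep_old_btree_blocks_sketch(
	struct xagb_bitmap	*bitmap,	/* all extents with this rmap owner */
	struct xagb_bitmap	*sublist)	/* extents that must be kept */
{
	/* bitmap -= sublist; whatever remains belonged to the old btree. */
	return xagb_bitmap_disunion(bitmap, sublist);
}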
446 :
447 : /* Ensure the freelist is the correct size. */
448 : int
449 37870 : xrep_fix_freelist(
450 : struct xfs_scrub *sc,
451 : int alloc_flags)
452 : {
453 37870 : struct xfs_alloc_arg args = {0};
454 :
455 37870 : args.mp = sc->mp;
456 37870 : args.tp = sc->tp;
457 37870 : args.agno = sc->sa.pag->pag_agno;
458 37870 : args.alignment = 1;
459 37870 : args.pag = sc->sa.pag;
460 :
461 37870 : return xfs_alloc_fix_freelist(&args, alloc_flags);
462 : }
463 :
464 : /*
465 : * Finding per-AG Btree Roots for AGF/AGI Reconstruction
466 : *
467 : * If the AGF or AGI becomes slightly corrupted, it may be necessary to rebuild
468 : * the AG headers by using the rmap data to rummage through the AG looking for
469 : * btree roots. This is not guaranteed to work if the AG is heavily damaged
470 : * or the rmap data are corrupt.
471 : *
472 : * Callers of xrep_find_ag_btree_roots must lock the AGF and AGFL
473 : * buffers if the AGF is being rebuilt; or the AGF and AGI buffers if the
474 : * AGI is being rebuilt. It must maintain these locks until it's safe for
475 : * other threads to change the btrees' shapes. The caller provides
476 : * information about the btrees to look for by passing in an array of
477 : * xrep_find_ag_btree with the (rmap owner, buf_ops, magic) fields set.
478 : * The (root, height) fields will be set on return if anything is found. The
479 : * last element of the array should have a NULL buf_ops to mark the end of the
480 : * array.
481 : *
482 : * For every rmapbt record matching any of the rmap owners in btree_info,
483 : * read each block referenced by the rmap record. If the block is a btree
484 : * block from this filesystem matching any of the magic numbers and has a
485 : * level higher than what we've already seen, remember the block and the
486 : * height of the tree required to have such a block. When the call completes,
487 : * we return the highest block we've found for each btree description; those
488 : * should be the roots.
489 : */
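/*
 * Editorial sketch: how a caller such as the AGF repair code might fill
 * the btree_info array described above.  The rmap owners and buf_ops
 * symbols are real; the surrounding function is hypothetical, and the
 * exact set of fields to initialize (magic, maxlevels, ...) varies by
 * kernel version.
 */
static int
xrep_findroot_usage_sketch(
	struct xfs_scrub		*sc,
	struct xfs_buf			*agf_bp,
	struct xfs_buf			*agfl_bp)
{
	struct xrep_find_ag_btree	fab[] = {
		{
			.rmap_owner	= XFS_RMAP_OWN_AG,
			.buf_ops	= &xfs_bnobt_buf_ops,
		},
		{
			.rmap_owner	= XFS_RMAP_OWN_INOBT,
			.buf_ops	= &xfs_inobt_buf_ops,
		},
		{
			.buf_ops	= NULL,	/* sentinel ends the array */
		},
	};

	/* On success, fab[i].root and fab[i].height describe each root. */
	return xrep_find_ag_btree_roots(sc, agf_bp, fab, agfl_bp);
}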
490 :
491 : struct xrep_findroot {
492 : struct xfs_scrub *sc;
493 : struct xfs_buf *agfl_bp;
494 : struct xfs_agf *agf;
495 : struct xrep_find_ag_btree *btree_info;
496 : };
497 :
498 : /* See if our block is in the AGFL. */
499 : STATIC int
500 735305869 : xrep_findroot_agfl_walk(
501 : struct xfs_mount *mp,
502 : xfs_agblock_t bno,
503 : void *priv)
504 : {
505 735305869 : xfs_agblock_t *agbno = priv;
506 :
507 735305869 : return (*agbno == bno) ? -ECANCELED : 0;
508 : }
509 :
510 : /* Does this block match the btree information passed in? */
511 : STATIC int
512 82744852 : xrep_findroot_block(
513 : struct xrep_findroot *ri,
514 : struct xrep_find_ag_btree *fab,
515 : uint64_t owner,
516 : xfs_agblock_t agbno,
517 : bool *done_with_block)
518 : {
519 82744852 : struct xfs_mount *mp = ri->sc->mp;
520 82744852 : struct xfs_buf *bp;
521 82744852 : struct xfs_btree_block *btblock;
522 82744852 : xfs_daddr_t daddr;
523 82744852 : int block_level;
524 82744852 : int error = 0;
525 :
526 82744852 : daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.pag->pag_agno, agbno);
527 :
528 : /*
529 : * Blocks in the AGFL have stale contents that might just happen to
530 : * have a matching magic and uuid. We don't want to pull these blocks
531 : * in as part of a tree root, so we have to filter out the AGFL stuff
532 : * here. If the AGFL looks insane we'll just refuse to repair.
533 : */
534 82744852 : if (owner == XFS_RMAP_OWN_AG) {
535 81360524 : error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp,
536 : xrep_findroot_agfl_walk, &agbno);
537 81360343 : if (error == -ECANCELED)
538 : return 0;
539 76532011 : if (error)
540 : return error;
541 : }
542 :
543 : /*
544 : * Read the buffer into memory so that we can see if it's a match for
545 : * our btree type. We have no clue if it is beforehand, and we want to
546 : * avoid xfs_trans_read_buf's behavior of dumping the DONE state (which
547 : * will cause needless disk reads in subsequent calls to this function)
548 : * and logging metadata verifier failures.
549 : *
550 : * Therefore, pass in NULL buffer ops. If the buffer was already in
551 : * memory from some other caller it will already have b_ops assigned.
552 : * If it was in memory from a previous unsuccessful findroot_block
553 : * call, the buffer won't have b_ops but it should be clean and ready
554 : * for us to try to verify if the read call succeeds. The same applies
555 : * if the buffer wasn't in memory at all.
556 : *
557 : * Note: If we never match a btree type with this buffer, it will be
558 : * left in memory with NULL b_ops. This shouldn't be a problem unless
559 : * the buffer gets written.
560 : */
561 77916339 : error = xfs_trans_read_buf(mp, ri->sc->tp, mp->m_ddev_targp, daddr,
562 : mp->m_bsize, 0, &bp, NULL);
563 77916833 : if (error)
564 : return error;
565 :
566 : /* Ensure the block magic matches the btree type we're looking for. */
567 77916833 : btblock = XFS_BUF_TO_BLOCK(bp);
568 77916833 : ASSERT(fab->buf_ops->magic[1] != 0);
569 77916833 : if (btblock->bb_magic != fab->buf_ops->magic[1])
570 50121125 : goto out;
571 :
572 : /*
573 : * If the buffer already has ops applied and they're not the ones for
574 : * this btree type, we know this block doesn't match the btree and we
575 : * can bail out.
576 : *
577 : * If the buffer ops match ours, someone else has already validated
578 : * the block for us, so we can move on to checking if this is a root
579 : * block candidate.
580 : *
581 : * If the buffer does not have ops, nobody has successfully validated
582 : * the contents and the buffer cannot be dirty. If the magic, uuid,
583 : * and structure match this btree type then we'll move on to checking
584 : * if it's a root block candidate. If there is no match, bail out.
585 : */
586 27795708 : if (bp->b_ops) {
587 27795708 : if (bp->b_ops != fab->buf_ops)
588 0 : goto out;
589 : } else {
590 0 : ASSERT(!xfs_trans_buf_is_dirty(bp));
591 0 : if (!uuid_equal(&btblock->bb_u.s.bb_uuid,
592 0 : &mp->m_sb.sb_meta_uuid))
593 0 : goto out;
594 : /*
595 : * Read verifiers can reference b_ops, so we set the pointer
596 : * here. If the verifier fails we'll reset the buffer state
597 : * to what it was before we touched the buffer.
598 : */
599 0 : bp->b_ops = fab->buf_ops;
600 0 : fab->buf_ops->verify_read(bp);
601 0 : if (bp->b_error) {
602 0 : bp->b_ops = NULL;
603 0 : bp->b_error = 0;
604 0 : goto out;
605 : }
606 :
607 : /*
608 : * Some read verifiers will (re)set b_ops, so we must be
609 : * careful not to change b_ops after running the verifier.
610 : */
611 : }
612 :
613 : /*
614 : * This block passes the magic/uuid and verifier tests for this btree
615 : * type. We don't need the caller to try the other tree types.
616 : */
617 27795708 : *done_with_block = true;
618 :
619 : /*
620 : * Compare this btree block's level to the height of the current
621 : * candidate root block.
622 : *
623 : * If the level matches the root we found previously, throw away both
624 : * blocks because there can't be two candidate roots.
625 : *
626 : * If level is lower in the tree than the root we found previously,
627 : * ignore this block.
628 : */
629 27795708 : block_level = xfs_btree_get_level(btblock);
630 27795708 : if (block_level + 1 == fab->height) {
631 775438 : fab->root = NULLAGBLOCK;
632 775438 : goto out;
633 27020270 : } else if (block_level < fab->height) {
634 25233633 : goto out;
635 : }
636 :
637 : /*
638 : * This is the highest block in the tree that we've found so far.
639 : * Update the btree height to reflect what we've learned from this
640 : * block.
641 : */
642 1786637 : fab->height = block_level + 1;
643 :
644 : /*
645 : * If this block doesn't have sibling pointers, then it's the new root
646 : * block candidate. Otherwise, the root will be found farther up the
647 : * tree.
648 : */
649 1786637 : if (btblock->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) &&
650 : btblock->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
651 1314056 : fab->root = agbno;
652 : else
653 472581 : fab->root = NULLAGBLOCK;
654 :
655 1786637 : trace_xrep_findroot_block(mp, ri->sc->sa.pag->pag_agno, agbno,
656 1786637 : be32_to_cpu(btblock->bb_magic), fab->height - 1);
657 77916707 : out:
658 77916707 : xfs_trans_brelse(ri->sc->tp, bp);
659 77916707 : return error;
660 : }
661 :
662 : /*
663 : * Do any of the blocks in this rmap record match one of the btrees we're
664 : * looking for?
665 : */
666 : STATIC int
667 7205333076 : xrep_findroot_rmap(
668 : struct xfs_btree_cur *cur,
669 : const struct xfs_rmap_irec *rec,
670 : void *priv)
671 : {
672 7205333076 : struct xrep_findroot *ri = priv;
673 7205333076 : struct xrep_find_ag_btree *fab;
674 7205333076 : xfs_agblock_t b;
675 7205333076 : bool done;
676 7205333076 : int error = 0;
677 :
678 : /* Ignore anything that isn't AG metadata. */
679 7205333076 : if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
680 : return 0;
681 :
682 : /* Otherwise scan each block + btree type. */
683 2859929369 : for (b = 0; b < rec->rm_blockcount; b++) {
684 2667898563 : done = false;
685 10524403141 : for (fab = ri->btree_info; fab->buf_ops; fab++) {
686 7884285434 : if (rec->rm_owner != fab->rmap_owner)
687 7801555105 : continue;
688 82730329 : error = xrep_findroot_block(ri, fab,
689 82730329 : rec->rm_owner, rec->rm_startblock + b,
690 : &done);
691 82745214 : if (error)
692 0 : return error;
693 82745214 : if (done)
694 : break;
695 : }
696 : }
697 :
698 : return 0;
699 : }
700 :
701 : /* Find the roots of the per-AG btrees described in btree_info. */
702 : int
703 445181 : xrep_find_ag_btree_roots(
704 : struct xfs_scrub *sc,
705 : struct xfs_buf *agf_bp,
706 : struct xrep_find_ag_btree *btree_info,
707 : struct xfs_buf *agfl_bp)
708 : {
709 445181 : struct xfs_mount *mp = sc->mp;
710 445181 : struct xrep_findroot ri;
711 445181 : struct xrep_find_ag_btree *fab;
712 445181 : struct xfs_btree_cur *cur;
713 445181 : int error;
714 :
715 445181 : ASSERT(xfs_buf_islocked(agf_bp));
716 445181 : ASSERT(agfl_bp == NULL || xfs_buf_islocked(agfl_bp));
717 :
718 445181 : ri.sc = sc;
719 445181 : ri.btree_info = btree_info;
720 445181 : ri.agf = agf_bp->b_addr;
721 445181 : ri.agfl_bp = agfl_bp;
722 1758210 : for (fab = btree_info; fab->buf_ops; fab++) {
723 1313088 : ASSERT(agfl_bp || fab->rmap_owner != XFS_RMAP_OWN_AG);
724 1313088 : ASSERT(XFS_RMAP_NON_INODE_OWNER(fab->rmap_owner));
725 1313029 : fab->root = NULLAGBLOCK;
726 1313029 : fab->height = 0;
727 : }
728 :
729 445122 : cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
730 445220 : error = xfs_rmap_query_all(cur, xrep_findroot_rmap, &ri);
731 445251 : xfs_btree_del_cursor(cur, error);
732 :
733 445318 : return error;
734 : }
735 :
736 : #ifdef CONFIG_XFS_QUOTA
737 : /* Update some quota flags in the superblock. */
738 : void
739 11334 : xrep_update_qflags(
740 : struct xfs_scrub *sc,
741 : unsigned int clear_flags,
742 : unsigned int set_flags)
743 : {
744 11334 : struct xfs_mount *mp = sc->mp;
745 11334 : struct xfs_buf *bp;
746 :
747 11334 : mutex_lock(&mp->m_quotainfo->qi_quotaofflock);
748 11334 : if ((mp->m_qflags & clear_flags) == 0 &&
749 5667 : (mp->m_qflags & set_flags) == set_flags)
750 0 : goto no_update;
751 :
752 11334 : mp->m_qflags &= ~clear_flags;
753 11334 : mp->m_qflags |= set_flags;
754 :
755 11334 : spin_lock(&mp->m_sb_lock);
756 11334 : mp->m_sb.sb_qflags &= ~clear_flags;
757 11334 : mp->m_sb.sb_qflags |= set_flags;
758 11334 : spin_unlock(&mp->m_sb_lock);
759 :
760 : /*
761 : * Update the quota flags in the ondisk superblock without touching
762 : * the summary counters. We have not quiesced inode chunk allocation,
763 : * so we cannot coordinate with updates to the icount and ifree percpu
764 : * counters.
765 : */
766 11334 : bp = xfs_trans_getsb(sc->tp);
767 11334 : xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
768 11334 : xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF);
769 11334 : xfs_trans_log_buf(sc->tp, bp, 0, sizeof(struct xfs_dsb) - 1);
770 :
771 11334 : no_update:
772 11334 : mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
773 11334 : }
774 :
775 : /* Force a quotacheck the next time we mount. */
776 : void
777 0 : xrep_force_quotacheck(
778 : struct xfs_scrub *sc,
779 : xfs_dqtype_t type)
780 : {
781 0 : uint flag;
782 :
783 0 : flag = xfs_quota_chkd_flag(type);
784 0 : if (!(flag & sc->mp->m_qflags))
785 : return;
786 :
787 0 : xrep_update_qflags(sc, flag, 0);
788 : }
789 :
790 : /*
791 : * Attach dquots to this inode, or schedule quotacheck to fix them.
792 : *
793 : * This function ensures that the appropriate dquots are attached to an inode.
794 : * We cannot allow the dquot code to allocate an on-disk dquot block here
795 : * because we're already in transaction context. The on-disk dquot should
796 : * already exist anyway. If the quota code signals corruption or missing quota
797 : * information, schedule quotacheck, which will repair corruptions in the quota
798 : * metadata.
799 : */
800 : int
801 17315816 : xrep_ino_dqattach(
802 : struct xfs_scrub *sc)
803 : {
804 17315816 : int error;
805 :
806 17315816 : ASSERT(sc->tp != NULL);
807 17315816 : ASSERT(sc->ip != NULL);
808 :
809 17315816 : error = xfs_qm_dqattach(sc->ip);
810 17314427 : switch (error) {
811 0 : case -EFSBADCRC:
812 : case -EFSCORRUPTED:
813 : case -ENOENT:
814 0 : xfs_err_ratelimited(sc->mp,
815 : "inode %llu repair encountered quota error %d, quotacheck forced.",
816 : (unsigned long long)sc->ip->i_ino, error);
817 0 : if (XFS_IS_UQUOTA_ON(sc->mp) && !sc->ip->i_udquot)
818 0 : xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
819 0 : if (XFS_IS_GQUOTA_ON(sc->mp) && !sc->ip->i_gdquot)
820 0 : xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
821 0 : if (XFS_IS_PQUOTA_ON(sc->mp) && !sc->ip->i_pdquot)
822 0 : xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
823 : fallthrough;
824 : case -ESRCH:
825 : error = 0;
826 : break;
827 : default:
828 : break;
829 : }
830 :
831 17314427 : return error;
832 : }
833 : #endif /* CONFIG_XFS_QUOTA */
834 :
835 : /*
836 : * Ensure that the inode being repaired is ready to handle a certain number of
837 : * extents, or return EFSCORRUPTED. Caller must hold the ILOCK of the inode
838 : * being repaired and have joined it to the scrub transaction.
839 : */
840 : int
841 3340405 : xrep_ino_ensure_extent_count(
842 : struct xfs_scrub *sc,
843 : int whichfork,
844 : xfs_extnum_t nextents)
845 : {
846 3340405 : xfs_extnum_t max_extents;
847 3340405 : bool large_extcount;
848 :
849 3340405 : large_extcount = xfs_inode_has_large_extent_counts(sc->ip);
850 3340405 : max_extents = xfs_iext_max_nextents(large_extcount, whichfork);
851 3340405 : if (nextents <= max_extents)
852 : return 0;
853 0 : if (large_extcount)
854 : return -EFSCORRUPTED;
855 0 : if (!xfs_has_large_extent_counts(sc->mp))
856 : return -EFSCORRUPTED;
857 :
858 0 : max_extents = xfs_iext_max_nextents(true, whichfork);
859 0 : if (nextents > max_extents)
860 : return -EFSCORRUPTED;
861 :
862 0 : sc->ip->i_diflags2 |= XFS_DIFLAG2_NREXT64;
863 0 : xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
864 0 : return 0;
865 : }
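/*
 * Editorial sketch: a fork rebuilder would call the helper above before
 * installing nrecs new mappings, so that inodes which cannot grow a
 * large extent counter fail cleanly with -EFSCORRUPTED instead of
 * overflowing.  The caller shown here is hypothetical.
 */
static int
xrep_extent_count_usage_sketch(
	struct xfs_scrub	*sc,
	xfs_extnum_t		nrecs)
{
	int			error;

	/* ILOCK must be held and the inode joined to sc->tp, per above. */
	error = xrep_ino_ensure_extent_count(sc, XFS_DATA_FORK, nrecs);
	if (error)
		return error;

	/* ...now it is safe to install the new data fork mappings... */
	return 0;
}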
866 :
867 : /* Initialize all the btree cursors for an AG repair. */
868 : void
869 22368958 : xrep_ag_btcur_init(
870 : struct xfs_scrub *sc,
871 : struct xchk_ag *sa)
872 : {
873 22368958 : struct xfs_mount *mp = sc->mp;
874 :
875 : /* Set up a bnobt cursor for cross-referencing. */
876 22368958 : if (sc->sm->sm_type != XFS_SCRUB_TYPE_BNOBT &&
877 : sc->sm->sm_type != XFS_SCRUB_TYPE_CNTBT) {
878 22230377 : sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
879 : sc->sa.pag, XFS_BTNUM_BNO);
880 22232045 : sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
881 : sc->sa.pag, XFS_BTNUM_CNT);
882 : }
883 :
884 : /* Set up an inobt cursor for cross-referencing. */
885 22369428 : if (sc->sm->sm_type != XFS_SCRUB_TYPE_INOBT &&
886 : sc->sm->sm_type != XFS_SCRUB_TYPE_FINOBT) {
887 22228204 : sa->ino_cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp,
888 : sa->agi_bp, XFS_BTNUM_INO);
889 22232112 : if (xfs_has_finobt(mp))
890 22228438 : sa->fino_cur = xfs_inobt_init_cursor(sc->sa.pag,
891 : sc->tp, sa->agi_bp, XFS_BTNUM_FINO);
892 : }
893 :
894 : /* Set up a rmapbt cursor for cross-referencing. */
895 22375921 : if (sc->sm->sm_type != XFS_SCRUB_TYPE_RMAPBT &&
896 : xfs_has_rmapbt(mp))
897 22310743 : sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp,
898 : sc->sa.pag);
899 :
900 : /* Set up a refcountbt cursor for cross-referencing. */
901 22378162 : if (sc->sm->sm_type != XFS_SCRUB_TYPE_REFCNTBT &&
902 : xfs_has_reflink(mp))
903 22280711 : sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
904 : sa->agf_bp, sc->sa.pag);
905 22377698 : }
906 :
907 : /*
908 : * Reinitialize the in-core AG state after a repair by rereading the AGF
909 : * buffer. We had better get the same AGF buffer as the one that's attached
910 : * to the scrub context.
911 : */
912 : int
913 266472 : xrep_reinit_pagf(
914 : struct xfs_scrub *sc)
915 : {
916 266472 : struct xfs_perag *pag = sc->sa.pag;
917 266472 : struct xfs_buf *bp;
918 266472 : int error;
919 :
920 266472 : ASSERT(pag);
921 532944 : ASSERT(xfs_perag_initialised_agf(pag));
922 :
923 266472 : clear_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
924 266569 : error = xfs_alloc_read_agf(pag, sc->tp, 0, &bp);
925 266459 : if (error)
926 : return error;
927 :
928 266459 : if (bp != sc->sa.agf_bp) {
929 0 : ASSERT(bp == sc->sa.agf_bp);
930 0 : return -EFSCORRUPTED;
931 : }
932 :
933 : return 0;
934 : }
935 :
936 : /*
937 : * Reinitialize the in-core AG state after a repair by rereading the AGI
938 : * buffer. We had better get the same AGI buffer as the one that's attached
939 : * to the scrub context.
940 : */
941 : int
942 144666 : xrep_reinit_pagi(
943 : struct xfs_scrub *sc)
944 : {
945 144666 : struct xfs_perag *pag = sc->sa.pag;
946 144666 : struct xfs_buf *bp;
947 144666 : int error;
948 :
949 144666 : ASSERT(pag);
950 289332 : ASSERT(xfs_perag_initialised_agi(pag));
951 :
952 144666 : clear_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
953 144699 : error = xfs_ialloc_read_agi(pag, sc->tp, &bp);
954 144606 : if (error)
955 : return error;
956 :
957 144606 : if (bp != sc->sa.agi_bp) {
958 0 : ASSERT(bp == sc->sa.agi_bp);
959 0 : return -EFSCORRUPTED;
960 : }
961 :
962 : return 0;
963 : }
964 :
965 : /*
966 : * Given an active reference to a perag structure, load AG headers and cursors.
967 : * This should only be called to scan an AG while repairing file-based metadata.
968 : */
969 : int
970 21928795 : xrep_ag_init(
971 : struct xfs_scrub *sc,
972 : struct xfs_perag *pag,
973 : struct xchk_ag *sa)
974 : {
975 21928795 : int error;
976 :
977 21928795 : ASSERT(!sa->pag);
978 :
979 21928795 : error = xfs_ialloc_read_agi(pag, sc->tp, &sa->agi_bp);
980 21922618 : if (error)
981 : return error;
982 :
983 21923034 : error = xfs_alloc_read_agf(pag, sc->tp, 0, &sa->agf_bp);
984 21926427 : if (error)
985 : return error;
986 :
987 : /* Grab our own passive reference from the caller's ref. */
988 21926892 : sa->pag = xfs_perag_hold(pag);
989 21930655 : xrep_ag_btcur_init(sc, sa);
990 21930655 : return 0;
991 : }
992 :
993 : #ifdef CONFIG_XFS_RT
994 : /* Initialize all the btree cursors for a RT repair. */
995 : void
996 5486253 : xrep_rtgroup_btcur_init(
997 : struct xfs_scrub *sc,
998 : struct xchk_rt *sr)
999 : {
1000 5486253 : struct xfs_mount *mp = sc->mp;
1001 :
1002 5486253 : ASSERT(sr->rtg != NULL);
1003 :
1004 5486253 : if (sc->sm->sm_type != XFS_SCRUB_TYPE_RTRMAPBT &&
1005 5475112 : (sr->rtlock_flags & XFS_RTGLOCK_RMAP) &&
1006 5474962 : xfs_has_rtrmapbt(mp))
1007 5474401 : sr->rmap_cur = xfs_rtrmapbt_init_cursor(mp, sc->tp, sr->rtg,
1008 5474859 : sr->rtg->rtg_rmapip);
1009 :
1010 5485795 : if (sc->sm->sm_type != XFS_SCRUB_TYPE_RTREFCBT &&
1011 5457168 : (sr->rtlock_flags & XFS_RTGLOCK_REFCOUNT) &&
1012 5457167 : xfs_has_rtreflink(mp))
1013 5454081 : sr->refc_cur = xfs_rtrefcountbt_init_cursor(mp, sc->tp,
1014 5454087 : sr->rtg, sr->rtg->rtg_refcountip);
1015 5485789 : }
1016 :
1017 : /*
1018 : * Given a reference to a rtgroup structure, lock rtgroup btree inodes and
1019 : * create btree cursors. Must only be called to repair a regular rt file.
1020 : */
1021 : int
1022 5397466 : xrep_rtgroup_init(
1023 : struct xfs_scrub *sc,
1024 : struct xfs_rtgroup *rtg,
1025 : struct xchk_rt *sr,
1026 : unsigned int rtglock_flags)
1027 : {
1028 5397466 : ASSERT(sr->rtg == NULL);
1029 :
1030 5397466 : xfs_rtgroup_lock(NULL, rtg, rtglock_flags);
1031 5398445 : sr->rtlock_flags = rtglock_flags;
1032 :
1033 : /* Grab our own passive reference from the caller's ref. */
1034 5398445 : sr->rtg = xfs_rtgroup_hold(rtg);
1035 5398532 : xrep_rtgroup_btcur_init(sc, sr);
1036 5397610 : return 0;
1037 : }
1038 :
1039 : /*
1040 : * Ensure that all rt blocks in the given range are not marked free. If
1041 : * @must_align is true, then both ends must be aligned to a rt extent.
1042 : */
1043 : int
1044 72652946 : xrep_require_rtext_inuse(
1045 : struct xfs_scrub *sc,
1046 : xfs_rtblock_t rtbno,
1047 : xfs_filblks_t len,
1048 : bool must_align)
1049 : {
1050 72652946 : struct xfs_mount *mp = sc->mp;
1051 72652946 : xfs_rtxnum_t startrtx;
1052 72652946 : xfs_rtxnum_t endrtx;
1053 72652946 : xfs_extlen_t mod;
1054 72652946 : bool is_free = false;
1055 72652946 : int error;
1056 :
1057 72652946 : startrtx = xfs_rtb_to_rtx(mp, rtbno, &mod);
1058 72652968 : if (must_align && mod != 0)
1059 : return -EFSCORRUPTED;
1060 :
1061 72652968 : endrtx = xfs_rtb_to_rtx(mp, rtbno + len - 1, &mod);
1062 72652949 : if (must_align && mod != mp->m_sb.sb_rextsize - 1)
1063 : return -EFSCORRUPTED;
1064 :
1065 72652949 : error = xfs_rtalloc_extent_is_free(mp, sc->tp, startrtx,
1066 72652949 : endrtx - startrtx + 1, &is_free);
1067 72654035 : if (error)
1068 : return error;
1069 72654035 : if (is_free)
1070 0 : return -EFSCORRUPTED;
1071 :
1072 : return 0;
1073 : }
1074 : #endif /* CONFIG_XFS_RT */
1075 :
1076 : /* Reinitialize the per-AG block reservation for the AG we just fixed. */
1077 : int
1078 1621382696 : xrep_reset_perag_resv(
1079 : struct xfs_scrub *sc)
1080 : {
1081 1621382696 : int error;
1082 :
1083 1621382696 : if (!(sc->flags & XREP_RESET_PERAG_RESV))
1084 : return 0;
1085 :
1086 267560 : ASSERT(sc->sa.pag != NULL);
1087 267560 : ASSERT(sc->ops->type == ST_PERAG);
1088 267560 : ASSERT(sc->tp);
1089 :
1090 267560 : sc->flags &= ~XREP_RESET_PERAG_RESV;
1091 267560 : xfs_ag_resv_free(sc->sa.pag);
1092 268766 : error = xfs_ag_resv_init(sc->sa.pag, sc->tp);
1093 268308 : if (error == -ENOSPC) {
1094 0 : xfs_err(sc->mp,
1095 : "Insufficient free space to reset per-AG reservation for AG %u after repair.",
1096 : sc->sa.pag->pag_agno);
1097 0 : error = 0;
1098 : }
1099 :
1100 : return error;
1101 : }
1102 :
1103 : /* Decide if we are going to call the repair function for a scrub type. */
1104 : bool
1105 19165997 : xrep_will_attempt(
1106 : struct xfs_scrub *sc)
1107 : {
1108 : /* Userspace asked us to rebuild the structure regardless. */
1109 19165997 : if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD)
1110 : return true;
1111 :
1112 : /* Let debug users force us into the repair routines. */
1113 4926 : if (XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR))
1114 : return true;
1115 :
1116 : /* Metadata is corrupt or failed cross-referencing. */
1117 4878 : if (xchk_needs_repair(sc->sm))
1118 510 : return true;
1119 :
1120 : return false;
1121 : }
1122 :
1123 : /* Try to fix some part of a metadata inode by calling another scrubber. */
1124 : STATIC int
1125 389285 : xrep_metadata_inode_subtype(
1126 : struct xfs_scrub *sc,
1127 : unsigned int scrub_type)
1128 : {
1129 389285 : __u32 smtype = sc->sm->sm_type;
1130 389285 : __u32 smflags = sc->sm->sm_flags;
1131 389285 : int error;
1132 :
1133 : /*
1134 : * Let's see if the inode needs repair. We're going to open-code calls
1135 : * to the scrub and repair functions so that we can hang on to the
1136 : * resources that we already acquired instead of using the standard
1137 : * setup/teardown routines.
1138 : */
1139 389285 : sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
1140 389285 : sc->sm->sm_type = scrub_type;
1141 :
1142 389285 : switch (scrub_type) {
1143 129877 : case XFS_SCRUB_TYPE_INODE:
1144 129877 : error = xchk_inode(sc);
1145 129877 : break;
1146 129878 : case XFS_SCRUB_TYPE_BMBTD:
1147 129878 : error = xchk_bmap_data(sc);
1148 129878 : break;
1149 129530 : case XFS_SCRUB_TYPE_BMBTA:
1150 129530 : error = xchk_bmap_attr(sc);
1151 129530 : break;
1152 0 : default:
1153 0 : ASSERT(0);
1154 0 : error = -EFSCORRUPTED;
1155 : }
1156 389286 : if (error)
1157 97 : goto out;
1158 :
1159 389189 : if (!xrep_will_attempt(sc))
1160 0 : goto out;
1161 :
1162 : /*
1163 : * Repair some part of the inode. This will potentially join the inode
1164 : * to the transaction.
1165 : */
1166 389189 : switch (scrub_type) {
1167 129878 : case XFS_SCRUB_TYPE_INODE:
1168 129878 : error = xrep_inode(sc);
1169 129878 : break;
1170 129878 : case XFS_SCRUB_TYPE_BMBTD:
1171 129878 : error = xrep_bmap(sc, XFS_DATA_FORK, false);
1172 129878 : break;
1173 129433 : case XFS_SCRUB_TYPE_BMBTA:
1174 129433 : error = xrep_bmap(sc, XFS_ATTR_FORK, false);
1175 129433 : break;
1176 : }
1177 389185 : if (error)
1178 352 : goto out;
1179 :
1180 : /*
1181 : * Finish all deferred intent items and then roll the transaction so
1182 : * that the inode will not be joined to the transaction when we exit
1183 : * the function.
1184 : */
1185 388833 : error = xfs_defer_finish(&sc->tp);
1186 388838 : if (error)
1187 0 : goto out;
1188 388838 : error = xfs_trans_roll(&sc->tp);
1189 388838 : if (error)
1190 0 : goto out;
1191 :
1192 : /*
1193 : * Clear the corruption flags and re-check the metadata that we just
1194 : * repaired.
1195 : */
1196 388838 : sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
1197 :
1198 388838 : switch (scrub_type) {
1199 129877 : case XFS_SCRUB_TYPE_INODE:
1200 129877 : error = xchk_inode(sc);
1201 129877 : break;
1202 129528 : case XFS_SCRUB_TYPE_BMBTD:
1203 129528 : error = xchk_bmap_data(sc);
1204 129528 : break;
1205 129433 : case XFS_SCRUB_TYPE_BMBTA:
1206 129433 : error = xchk_bmap_attr(sc);
1207 129433 : break;
1208 : }
1209 388841 : if (error)
1210 0 : goto out;
1211 :
1212 : /* If corruption persists, the repair has failed. */
1213 388841 : if (xchk_needs_repair(sc->sm)) {
1214 0 : error = -EFSCORRUPTED;
1215 0 : goto out;
1216 : }
1217 388841 : out:
1218 389290 : sc->sm->sm_type = smtype;
1219 389290 : sc->sm->sm_flags = smflags;
1220 389290 : return error;
1221 : }
1222 :
1223 : /*
1224 : * Repair the ondisk forks of a metadata inode. The caller must ensure that
1225 : * sc->ip points to the metadata inode and the ILOCK is held on that inode.
1226 : * The inode must not be joined to the transaction before the call, and will
1227 : * not be afterwards.
1228 : */
1229 : int
1230 129878 : xrep_metadata_inode_forks(
1231 : struct xfs_scrub *sc)
1232 : {
1233 129878 : bool dirty = false;
1234 129878 : int error;
1235 :
1236 : /* Repair the inode record and the data fork. */
1237 129878 : error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE);
1238 129878 : if (error)
1239 : return error;
1240 :
1241 129878 : error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD);
1242 129878 : if (error)
1243 : return error;
1244 :
1245 : /* Make sure the attr fork looks ok before we delete it. */
1246 129530 : error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTA);
1247 129530 : if (error)
1248 : return error;
1249 :
1250 : /* Clear the reflink flag since metadata never shares. */
1251 129433 : if (xfs_is_reflink_inode(sc->ip)) {
1252 0 : dirty = true;
1253 0 : xfs_trans_ijoin(sc->tp, sc->ip, 0);
1254 0 : error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp);
1255 0 : if (error)
1256 : return error;
1257 : }
1258 :
1259 : /*
1260 : * Clear the attr fork since metadata shouldn't have one unless
1261 : * parent pointers and the metadata directory tree are enabled.
1262 : */
1263 129433 : if (xfs_inode_hasattr(sc->ip) &&
1264 129433 : !(xfs_has_parent(sc->mp) && xfs_has_metadir(sc->mp))) {
1265 0 : if (!dirty) {
1266 0 : dirty = true;
1267 0 : xfs_trans_ijoin(sc->tp, sc->ip, 0);
1268 : }
1269 0 : error = xrep_xattr_reset_fork(sc);
1270 0 : if (error)
1271 : return error;
1272 : }
1273 :
1274 : /*
1275 : * If we modified the inode, roll the transaction but don't rejoin the
1276 : * inode to the new transaction because xrep_bmap_data can do that.
1277 : */
1278 129433 : if (dirty) {
1279 0 : error = xfs_trans_roll(&sc->tp);
1280 0 : if (error)
1281 0 : return error;
1282 : dirty = false;
1283 : }
1284 :
1285 : return 0;
1286 : }
1287 :
1288 : /*
1289 : * Set a file's link count, being careful about integer overflows. Returns
1290 : * true if we had to correct an integer overflow.
1291 : */
1292 : bool
1293 197822 : xrep_set_nlink(
1294 : struct xfs_inode *ip,
1295 : uint64_t nlink)
1296 : {
1297 197822 : bool ret = false;
1298 :
1299 197822 : if (nlink > XFS_NLINK_PINNED) {
1300 : /*
1301 : * The observed link count will overflow the nlink field.
1302 : *
1303 : * The VFS won't let users create more hardlinks if the link
1304 : * count is larger than XFS_MAXLINK, but it will let them
1305 : * delete hardlinks. XFS_MAXLINK is half of XFS_NLINK_PINNED,
1306 : * which means that sysadmins could actually fix this situation
1307 : * by deleting links and calling us again.
1308 : *
1309 : * Set the link count to the largest possible value that will
1310 : * fit in the field. This will buy us the most possible time
1311 : * to avoid a UAF should the sysadmins start deleting links.
1312 : * As long as the link count stays above MAXLINK the undercount
1313 : * problem will not get worse.
1314 : */
1315 0 : BUILD_BUG_ON((uint64_t)XFS_MAXLINK >= XFS_NLINK_PINNED);
1316 :
1317 0 : nlink = XFS_NLINK_PINNED;
1318 0 : ret = true;
1319 : }
1320 :
1321 197822 : set_nlink(VFS_I(ip), nlink);
1322 :
1323 197808 : if (VFS_I(ip)->i_nlink == 0) {
1324 : /* had better be on an unlinked list */
1325 0 : ASSERT(xfs_inode_on_unlinked_list(ip));
1326 0 : if (!xfs_inode_on_unlinked_list(ip))
1327 0 : xfs_emerg(ip->i_mount, "IUNLINK ino 0x%llx nlink %u prevun 0x%x nextun 0x%x", ip->i_ino, VFS_I(ip)->i_nlink, ip->i_prev_unlinked, ip->i_next_unlinked);
1328 : } else {
1329 : /* had better not be on an unlinked list */
1330 197808 : ASSERT(!xfs_inode_on_unlinked_list(ip));
1331 197808 : if (xfs_inode_on_unlinked_list(ip))
1332 0 : xfs_emerg(ip->i_mount, "IUNLINK ino 0x%llx nlink %u prevun 0x%x nextun 0x%x", ip->i_ino, VFS_I(ip)->i_nlink, ip->i_prev_unlinked, ip->i_next_unlinked);
1333 : }
1334 :
1335 197808 : return ret;
1336 : }
1337 :
1338 : /*
1339 : * Set up an xfile and a buffer cache so that we can use the xfbtree. Buffer
1340 : * target initialization registers a shrinker, so we cannot be in transaction
1341 : * context. Park our resources in the scrub context and let the teardown
1342 : * function take care of them at the right time.
1343 : */
1344 : int
1345 167811 : xrep_setup_buftarg(
1346 : struct xfs_scrub *sc,
1347 : const char *descr)
1348 : {
1349 167811 : ASSERT(sc->tp == NULL);
1350 :
1351 167811 : return xfile_alloc_buftarg(sc->mp, descr, &sc->xfile_buftarg);
1352 : }
1353 :
1354 : /*
1355 : * Create a dummy transaction for use in a live update hook function. This
1356 : * function MUST NOT be called from regular repair code because the current
1357 : * process' transaction is saved via the cookie.
1358 : */
1359 : int
1360 804794 : xrep_trans_alloc_hook_dummy(
1361 : struct xfs_mount *mp,
1362 : void **cookiep,
1363 : struct xfs_trans **tpp)
1364 : {
1365 804794 : int error;
1366 :
1367 804794 : *cookiep = current->journal_info;
1368 804794 : current->journal_info = NULL;
1369 :
1370 804794 : error = xfs_trans_alloc_empty(mp, tpp);
1371 804794 : if (!error)
1372 : return 0;
1373 :
1374 0 : current->journal_info = *cookiep;
1375 0 : *cookiep = NULL;
1376 0 : return error;
1377 : }
1378 :
1379 : /* Cancel a dummy transaction used by a live update hook function. */
1380 : void
1381 804794 : xrep_trans_cancel_hook_dummy(
1382 : void **cookiep,
1383 : struct xfs_trans *tp)
1384 : {
1385 804794 : xfs_trans_cancel(tp);
1386 804794 : current->journal_info = *cookiep;
1387 804794 : *cookiep = NULL;
1388 804794 : }
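/*
 * Editorial sketch of the alloc/cancel pairing for a live update hook.
 * The body in the middle is a placeholder; real users (for example the
 * rmap repair hooks) apply the live update to an in-memory btree there.
 */
static int
xrep_hook_dummy_usage_sketch(
	struct xfs_mount	*mp)
{
	struct xfs_trans	*tp;
	void			*cookie;
	int			error;

	/* Park the running transaction (if any) and get an empty one. */
	error = xrep_trans_alloc_hook_dummy(mp, &cookie, &tp);
	if (error)
		return error;

	/* ...apply the live update using @tp... */

	/* Cancel the dummy and restore the saved transaction cookie. */
	xrep_trans_cancel_hook_dummy(&cookie, tp);
	return error;
}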
1389 :
1390 : /*
1391 : * See if this buffer can pass the given ->verify_struct() function.
1392 : *
1393 : * If the buffer already has ops attached and they're not the ones that were
1394 : * passed in, we reject the buffer. Otherwise, we perform the structure test
1395 : * (note that we do not check CRCs) and return the outcome of the test. The
1396 : * buffer ops and error state are left unchanged.
1397 : */
1398 : bool
1399 83722 : xrep_buf_verify_struct(
1400 : struct xfs_buf *bp,
1401 : const struct xfs_buf_ops *ops)
1402 : {
1403 83722 : const struct xfs_buf_ops *old_ops = bp->b_ops;
1404 83722 : xfs_failaddr_t fa;
1405 83722 : int old_error;
1406 :
1407 83722 : if (old_ops) {
1408 83722 : if (old_ops != ops)
1409 : return false;
1410 : }
1411 :
1412 83722 : old_error = bp->b_error;
1413 83722 : bp->b_ops = ops;
1414 83722 : fa = bp->b_ops->verify_struct(bp);
1415 83722 : bp->b_ops = old_ops;
1416 83722 : bp->b_error = old_error;
1417 :
1418 83722 : return fa == NULL;
1419 : }
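/*
 * Editorial sketch: salvage code reads a candidate block with NULL
 * buf_ops (compare xrep_findroot_block above) and then uses
 * xrep_buf_verify_struct() to decide whether the contents are sane
 * enough to copy records from.  Checking against the attr leaf ops is
 * an assumed example; any buf_ops with a ->verify_struct would do.
 */
static bool
xrep_salvage_block_sketch(
	struct xfs_buf		*bp)
{
	return xrep_buf_verify_struct(bp, &xfs_attr3_leaf_buf_ops);
}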
1420 :
1421 : /* Are we looking at a realtime metadata inode? */
1422 : bool
1423 5897588 : xrep_is_rtmeta_ino(
1424 : struct xfs_scrub *sc,
1425 : struct xfs_rtgroup *rtg,
1426 : xfs_ino_t ino)
1427 : {
1428 : /*
1429 : * All filesystems have rt bitmap and summary inodes, even if they
1430 : * don't have an rt section.
1431 : */
1432 5897588 : if (ino == sc->mp->m_rbmip->i_ino)
1433 : return true;
1434 5849132 : if (ino == sc->mp->m_rsumip->i_ino)
1435 : return true;
1436 :
1437 : /* Newer rt metadata files are not guaranteed to exist */
1438 5397578 : if (rtg->rtg_rmapip && ino == rtg->rtg_rmapip->i_ino)
1439 : return true;
1440 5397578 : if (rtg->rtg_refcountip && ino == rtg->rtg_refcountip->i_ino)
1441 0 : return true;
1442 :
1443 : return false;
1444 : }
1445 :
1446 : /* Check the sanity of a rmap record for a metadata btree inode. */
1447 : int
1448 42805 : xrep_check_ino_btree_mapping(
1449 : struct xfs_scrub *sc,
1450 : const struct xfs_rmap_irec *rec)
1451 : {
1452 42805 : enum xbtree_recpacking outcome;
1453 42805 : int error;
1454 :
1455 : /*
1456 : * Metadata btree inodes never have extended attributes, and all blocks
1457 : * should have the bmbt block flag set.
1458 : */
1459 42805 : if ((rec->rm_flags & XFS_RMAP_ATTR_FORK) ||
1460 : !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))
1461 : return -EFSCORRUPTED;
1462 :
1463 : /* Make sure the block is within the AG. */
1464 42805 : if (!xfs_verify_agbext(sc->sa.pag, rec->rm_startblock,
1465 42805 : rec->rm_blockcount))
1466 : return -EFSCORRUPTED;
1467 :
1468 : /* Make sure this isn't free space. */
1469 42805 : error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rm_startblock,
1470 : rec->rm_blockcount, &outcome);
1471 42805 : if (error)
1472 : return error;
1473 42805 : if (outcome != XBTREE_RECPACKING_EMPTY)
1474 0 : return -EFSCORRUPTED;
1475 :
1476 : return 0;
1477 : }
1478 :
1479 : /*
1480 : * Reset the block count of the inode being repaired, and adjust the dquot
1481 : * block usage to match. The inode must not have an xattr fork.
1482 : */
1483 : void
1484 39435 : xrep_inode_set_nblocks(
1485 : struct xfs_scrub *sc,
1486 : int64_t new_blocks)
1487 : {
1488 39435 : int64_t delta;
1489 :
1490 39435 : delta = new_blocks - sc->ip->i_nblocks;
1491 39435 : sc->ip->i_nblocks = new_blocks;
1492 :
1493 39435 : xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
1494 39442 : if (delta != 0)
1495 1098 : xfs_trans_mod_dquot_byino(sc->tp, sc->ip, XFS_TRANS_DQ_BCOUNT,
1496 : delta);
1497 39442 : }
1498 :
1499 : /* Reset the block reservation for a metadata inode. */
1500 : int
1501 39416 : xrep_reset_imeta_reservation(
1502 : struct xfs_scrub *sc)
1503 : {
1504 39416 : struct xfs_inode *ip = sc->ip;
1505 39416 : int64_t delta;
1506 39416 : int error;
1507 :
1508 39416 : delta = ip->i_nblocks + ip->i_delayed_blks - ip->i_meta_resv_asked;
1509 39416 : if (delta == 0)
1510 : return 0;
1511 :
1512 1098 : if (delta > 0) {
1513 979 : int64_t give_back;
1514 :
1515 : /* Too many blocks, free from the incore reservation. */
1516 979 : give_back = min_t(uint64_t, delta, ip->i_delayed_blks);
1517 979 : if (give_back > 0) {
1518 979 : xfs_mod_delalloc(ip->i_mount, -give_back);
1519 979 : xfs_mod_fdblocks(ip->i_mount, give_back, true);
1520 979 : ip->i_delayed_blks -= give_back;
1521 : }
1522 :
1523 979 : return 0;
1524 : }
1525 :
1526 : /* Not enough reservation, try to add more. @delta is negative here. */
1527 119 : error = xfs_mod_fdblocks(sc->mp, delta, true);
1528 119 : while (error == -ENOSPC) {
1529 0 : delta++;
1530 0 : if (delta == 0) {
1531 0 : xfs_warn(sc->mp,
1532 : "Insufficient free space to reset space reservation for inode 0x%llx after repair.",
1533 : ip->i_ino);
1534 0 : return 0;
1535 : }
1536 0 : error = xfs_mod_fdblocks(sc->mp, delta, true);
1537 : }
1538 119 : if (error)
1539 : return error;
1540 :
1541 119 : xfs_mod_delalloc(sc->mp, -delta);
1542 119 : ip->i_delayed_blks += -delta;
1543 119 : return 0;
1544 : }
|