Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-or-later
2 : /*
3 : * Copyright (C) 2019-2023 Oracle. All Rights Reserved.
4 : * Author: Darrick J. Wong <djwong@kernel.org>
5 : */
6 : #include "xfs.h"
7 : #include "xfs_fs.h"
8 : #include "xfs_shared.h"
9 : #include "xfs_format.h"
10 : #include "xfs_trans_resv.h"
11 : #include "xfs_mount.h"
12 : #include "xfs_alloc.h"
13 : #include "xfs_ialloc.h"
14 : #include "xfs_health.h"
15 : #include "xfs_btree.h"
16 : #include "xfs_ag.h"
17 : #include "xfs_rtbitmap.h"
18 : #include "xfs_inode.h"
19 : #include "xfs_icache.h"
20 : #include "scrub/scrub.h"
21 : #include "scrub/common.h"
22 : #include "scrub/trace.h"
23 : #include "scrub/fscounters.h"
24 :
25 : /*
26 : * FS Summary Counters
27 : * ===================
28 : *
29 : * The basics of filesystem summary counter checking are that we iterate the
30 : * AGs counting the number of free blocks, free space btree blocks, per-AG
31 : * reservations, inodes, delayed allocation reservations, and free inodes.
32 : * Then we compare what we computed against the in-core counters.
33 : *
34 : * However, the reality is that summary counters are a tricky beast to check.
35 : * While we /could/ freeze the filesystem and scramble around the AGs counting
36 : * the free blocks, in practice we prefer not to do that for a scan because
37 : * freezing is costly. To get around this, we added a per-cpu counter of the
38 : * delalloc reservations so that we can rotor around the AGs relatively
39 : * quickly, and we allow the counts to be slightly off because we're not taking
40 : * any locks while we do this.
41 : *
42 : * So the first thing we do is warm up the buffer cache in the setup routine by
43 : * walking all the AGs to make sure the incore per-AG structure has been
44 : * initialized. The expected value calculation then iterates the incore per-AG
45 : * structures as quickly as it can. We snapshot the percpu counters before and
46 : * after this operation and use the difference in counter values to guess at
47 : * our tolerance for mismatch between expected and actual counter values.
48 : */
49 :
50 : /*
51 : * Since the expected value computation is lockless but only browses incore
52 : * values, the percpu counters should be fairly close to each other. However,
53 : * we'll allow ourselves to be off by at least this (arbitrary) amount.
54 : */
55 : #define XCHK_FSCOUNT_MIN_VARIANCE (512)
56 :
57 : /*
58 : * Make sure the per-AG structure has been initialized from the on-disk header
59 : * contents and trust that the incore counters match the ondisk counters. (The
60 : * AGF and AGI scrubbers check them, and a normal xfs_scrub run checks the
61 : * summary counters after checking all AG headers). Do this from the setup
62 : * function so that the inner AG aggregation loop runs as quickly as possible.
63 : *
64 : * This function runs during the setup phase /before/ we start checking any
65 : * metadata.
66 : */
STATIC int
xchk_fscount_warmup(
	struct xfs_scrub	*sc)
{
	struct xfs_mount	*mp = sc->mp;
	struct xfs_buf		*agi_bp = NULL;
	struct xfs_buf		*agf_bp = NULL;
	struct xfs_perag	*pag = NULL;
	xfs_agnumber_t		agno;
	int			error = 0;

	for_each_perag(mp, agno, pag) {
		if (xchk_should_terminate(sc, &error))
			break;
		/* Both headers already read in?  Skip this AG entirely. */
		if (xfs_perag_initialised_agi(pag) &&
		    xfs_perag_initialised_agf(pag))
			continue;

		/* Lock both AG headers. */
		error = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp);
		if (error)
			break;
		error = xfs_alloc_read_agf(pag, sc->tp, 0, &agf_bp);
		if (error)
			break;

		/*
		 * These are supposed to be initialized by the header read
		 * function.
		 */
		if (!xfs_perag_initialised_agi(pag) ||
		    !xfs_perag_initialised_agf(pag)) {
			error = -EFSCORRUPTED;
			break;
		}

		/* Drop both buffer locks before moving on to the next AG. */
		xfs_buf_relse(agf_bp);
		agf_bp = NULL;
		xfs_buf_relse(agi_bp);
		agi_bp = NULL;
	}

	/*
	 * If we broke out of the loop early, release whatever is still held:
	 * the AG header buffers of a partially processed AG, and the active
	 * perag reference that for_each_perag leaves behind on a break.
	 */
	if (agf_bp)
		xfs_buf_relse(agf_bp);
	if (agi_bp)
		xfs_buf_relse(agi_bp);
	if (pag)
		xfs_perag_rele(pag);
	return error;
}
117 :
118 : static inline int
119 44658 : xchk_fsfreeze(
120 : struct xfs_scrub *sc)
121 : {
122 44658 : int error;
123 :
124 44658 : error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL);
125 44658 : trace_xchk_fsfreeze(sc, error);
126 44658 : return error;
127 : }
128 :
129 : static inline int
130 44658 : xchk_fsthaw(
131 : struct xfs_scrub *sc)
132 : {
133 44658 : int error;
134 :
135 : /* This should always succeed, we have a kernel freeze */
136 44658 : error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL);
137 44658 : trace_xchk_fsthaw(sc, error);
138 44658 : return error;
139 : }
140 :
141 : /*
142 : * We couldn't stabilize the filesystem long enough to sample all the variables
143 : * that comprise the summary counters and compare them to the percpu counters.
144 : * We need to disable all writer threads, which means taking the first two
145 : * freeze levels to put userspace to sleep, and the third freeze level to
146 : * prevent background threads from starting new transactions. Take one level
147 : * more to prevent other callers from unfreezing the filesystem while we run.
148 : */
149 : STATIC int
150 44658 : xchk_fscounters_freeze(
151 : struct xfs_scrub *sc)
152 : {
153 44658 : struct xchk_fscounters *fsc = sc->buf;
154 44658 : int error = 0;
155 :
156 44658 : if (sc->flags & XCHK_HAVE_FREEZE_PROT) {
157 30590 : sc->flags &= ~XCHK_HAVE_FREEZE_PROT;
158 30590 : mnt_drop_write_file(sc->file);
159 : }
160 :
161 : /* Try to grab a kernel freeze. */
162 44658 : while ((error = xchk_fsfreeze(sc)) == -EBUSY) {
163 0 : if (xchk_should_terminate(sc, &error))
164 0 : return error;
165 :
166 0 : delay(HZ / 10);
167 : }
168 44658 : if (error)
169 : return error;
170 :
171 44658 : fsc->frozen = true;
172 44658 : return 0;
173 : }
174 :
175 : /* Thaw the filesystem after checking or repairing fscounters. */
176 : STATIC void
177 148973 : xchk_fscounters_cleanup(
178 : void *buf)
179 : {
180 148973 : struct xchk_fscounters *fsc = buf;
181 148973 : struct xfs_scrub *sc = fsc->sc;
182 148973 : int error;
183 :
184 148973 : if (!fsc->frozen)
185 : return;
186 :
187 44658 : error = xchk_fsthaw(sc);
188 44658 : if (error)
189 0 : xfs_emerg(sc->mp, "still frozen after scrub, err=%d", error);
190 : else
191 44658 : fsc->frozen = false;
192 : }
193 :
194 : int
195 148973 : xchk_setup_fscounters(
196 : struct xfs_scrub *sc)
197 : {
198 148973 : struct xchk_fscounters *fsc;
199 148973 : int error;
200 :
201 : /*
202 : * If the AGF doesn't track btreeblks, we have to lock the AGF to count
203 : * btree block usage by walking the actual btrees.
204 : */
205 148973 : if (!xfs_has_lazysbcount(sc->mp))
206 0 : xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
207 :
208 148973 : sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS);
209 148973 : if (!sc->buf)
210 : return -ENOMEM;
211 148973 : sc->buf_cleanup = xchk_fscounters_cleanup;
212 148973 : fsc = sc->buf;
213 148973 : fsc->sc = sc;
214 :
215 148973 : xfs_icount_range(sc->mp, &fsc->icount_min, &fsc->icount_max);
216 :
217 : /* We must get the incore counters set up before we can proceed. */
218 148973 : error = xchk_fscount_warmup(sc);
219 148973 : if (error)
220 : return error;
221 :
222 : /*
223 : * Pause all writer activity in the filesystem while we're scrubbing to
224 : * reduce the likelihood of background perturbations to the counters
225 : * throwing off our calculations.
226 : *
227 : * If we're repairing, we need to prevent any other thread from
228 : * changing the global fs summary counters while we're repairing them.
229 : * This requires the fs to be frozen, which will disable background
230 : * reclaim and purge all inactive inodes.
231 : */
232 276690 : if ((sc->flags & XCHK_TRY_HARDER) || xchk_could_repair(sc)) {
233 44658 : error = xchk_fscounters_freeze(sc);
234 44658 : if (error)
235 : return error;
236 : }
237 :
238 148973 : return xchk_trans_alloc_empty(sc);
239 : }
240 :
241 : /*
242 : * Part 1: Collecting filesystem summary counts. For each AG, we add its
243 : * summary counts (total inodes, free inodes, free data blocks) to an incore
244 : * copy of the overall filesystem summary counts.
245 : *
246 : * To avoid false corruption reports in part 2, any failure in this part must
247 : * set the INCOMPLETE flag even when a negative errno is returned. This care
248 : * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED,
249 : * ECANCELED) that are absorbed into a scrub state flag update by
250 : * xchk_*_process_error. Scrub and repair share the same incore data
251 : * structures, so the INCOMPLETE flag is critical to prevent a repair based on
252 : * insufficient information.
253 : */
254 :
255 : /* Count free space btree blocks manually for pre-lazysbcount filesystems. */
static int
xchk_fscount_btreeblks(
	struct xfs_scrub	*sc,
	struct xchk_fscounters	*fsc,
	xfs_agnumber_t		agno)
{
	xfs_extlen_t		blocks;
	int			error;

	/* Attach to this AG and lock its headers so the btrees are stable. */
	error = xchk_ag_init_existing(sc, agno, &sc->sa);
	if (error)
		goto out_free;

	/*
	 * Add the bnobt and cntbt block counts to the expected fdblocks
	 * value.  Each total is decremented by one -- presumably to exclude
	 * a block that is accounted elsewhere (NOTE(review): confirm against
	 * xfs_btree_count_blocks semantics for the root block).
	 */
	error = xfs_btree_count_blocks(sc->sa.bno_cur, &blocks);
	if (error)
		goto out_free;
	fsc->fdblocks += blocks - 1;

	error = xfs_btree_count_blocks(sc->sa.cnt_cur, &blocks);
	if (error)
		goto out_free;
	fsc->fdblocks += blocks - 1;

out_free:
	/* Release the AG headers and cursors regardless of outcome. */
	xchk_ag_free(sc, &sc->sa);
	return error;
}
283 :
284 : /*
285 : * Calculate what the global in-core counters ought to be from the incore
286 : * per-AG structure. Callers can compare this to the actual in-core counters
287 : * to estimate by how much both in-core and on-disk counters need to be
288 : * adjusted.
289 : */
STATIC int
xchk_fscount_aggregate_agcounts(
	struct xfs_scrub	*sc,
	struct xchk_fscounters	*fsc)
{
	struct xfs_mount	*mp = sc->mp;
	struct xfs_perag	*pag;
	uint64_t		delayed;
	xfs_agnumber_t		agno;
	int			tries = 8;	/* bounded retries if counters churn */
	int			error = 0;

retry:
	/* Restart the aggregation from scratch on every (re)try. */
	fsc->icount = 0;
	fsc->ifree = 0;
	fsc->fdblocks = 0;

	for_each_perag(mp, agno, pag) {
		if (xchk_should_terminate(sc, &error))
			break;

		/* This somehow got unset since the warmup? */
		if (!xfs_perag_initialised_agi(pag) ||
		    !xfs_perag_initialised_agf(pag)) {
			error = -EFSCORRUPTED;
			break;
		}

		/* Count all the inodes */
		fsc->icount += pag->pagi_count;
		fsc->ifree += pag->pagi_freecount;

		/* Add up the free/freelist/bnobt/cntbt blocks */
		fsc->fdblocks += pag->pagf_freeblks;
		fsc->fdblocks += pag->pagf_flcount;
		if (xfs_has_lazysbcount(sc->mp)) {
			fsc->fdblocks += pag->pagf_btreeblks;
		} else {
			/* Old format: walk the btrees to count their blocks. */
			error = xchk_fscount_btreeblks(sc, fsc, agno);
			if (error)
				break;
		}

		/*
		 * Per-AG reservations are taken out of the incore counters,
		 * so they must be left out of the free blocks computation.
		 */
		fsc->fdblocks -= pag->pag_meta_resv.ar_reserved;
		fsc->fdblocks -= pag->pag_rmapbt_resv.ar_orig_reserved;
		/*
		 * AGs that currently prohibit allocation also have their free
		 * space excluded from the incore counters -- NOTE(review):
		 * presumably matching xfs_ag_fdblocks accounting; confirm.
		 */
		if (xfs_perag_prohibits_alloc(pag))
			fsc->fdblocks -= xfs_ag_fdblocks(pag);
	}
	/* for_each_perag leaves an active reference behind on early break. */
	if (pag)
		xfs_perag_rele(pag);
	if (error) {
		/* Partial aggregation must never be compared or repaired. */
		xchk_set_incomplete(sc);
		return error;
	}

	/*
	 * The global incore space reservation is taken from the incore
	 * counters, so leave that out of the computation.
	 */
	fsc->fdblocks -= mp->m_resblks_avail;

	/*
	 * Delayed allocation reservations are taken out of the incore counters
	 * but not recorded on disk, so leave them and their indlen blocks out
	 * of the computation.
	 */
	delayed = percpu_counter_sum(&mp->m_delalloc_blks);
	fsc->fdblocks -= delayed;

	trace_xchk_fscounters_calc(mp, fsc->icount, fsc->ifree, fsc->fdblocks,
			delayed);

	/* Bail out if the values we compute are totally nonsense. */
	if (fsc->icount < fsc->icount_min || fsc->icount > fsc->icount_max ||
	    fsc->fdblocks > mp->m_sb.sb_dblocks ||
	    fsc->ifree > fsc->icount_max)
		return -EFSCORRUPTED;

	/*
	 * If ifree > icount then we probably had some perturbation in the
	 * counters while we were calculating things.  We'll try a few times
	 * to maintain ifree <= icount before giving up.
	 */
	if (fsc->ifree > fsc->icount) {
		if (tries--)
			goto retry;
		return -EDEADLOCK;
	}

	return 0;
}
386 :
387 : #ifdef CONFIG_XFS_RT
388 : STATIC int
389 115744561 : xchk_fscount_add_frextent(
390 : struct xfs_mount *mp,
391 : struct xfs_trans *tp,
392 : const struct xfs_rtalloc_rec *rec,
393 : void *priv)
394 : {
395 115744561 : struct xchk_fscounters *fsc = priv;
396 115744561 : int error = 0;
397 :
398 115744561 : fsc->frextents += rec->ar_extcount;
399 :
400 115744561 : xchk_should_terminate(fsc->sc, &error);
401 115744561 : return error;
402 : }
403 :
404 : /* Calculate the number of free realtime extents from the realtime bitmap. */
405 : STATIC int
406 148967 : xchk_fscount_count_frextents(
407 : struct xfs_scrub *sc,
408 : struct xchk_fscounters *fsc)
409 : {
410 148967 : struct xfs_mount *mp = sc->mp;
411 148967 : int error;
412 :
413 148967 : fsc->frextents = 0;
414 148967 : if (!xfs_has_realtime(mp))
415 : return 0;
416 :
417 76848 : xfs_rtbitmap_lock_shared(sc->mp, XFS_RBMLOCK_BITMAP);
418 76848 : error = xfs_rtalloc_query_all(sc->mp, sc->tp,
419 : xchk_fscount_add_frextent, fsc);
420 76848 : if (error) {
421 0 : xchk_set_incomplete(sc);
422 0 : goto out_unlock;
423 : }
424 :
425 76848 : out_unlock:
426 76848 : xfs_rtbitmap_unlock_shared(sc->mp, XFS_RBMLOCK_BITMAP);
427 76848 : return error;
428 : }
429 : #else
STATIC int
xchk_fscount_count_frextents(
	struct xfs_scrub	*sc,
	struct xchk_fscounters	*fsc)
{
	/* No realtime support compiled in, so there are no rt extents. */
	fsc->frextents = 0;
	return 0;
}
438 : #endif /* CONFIG_XFS_RT */
439 :
440 : /*
441 : * Part 2: Comparing filesystem summary counters. All we have to do here is
442 : * sum the percpu counters and compare them to what we've observed.
443 : */
444 :
445 : /*
446 : * Is the @counter reasonably close to the @expected value?
447 : *
448 : * We neither locked nor froze anything in the filesystem while aggregating the
449 : * per-AG data to compute the @expected value, which means that the counter
450 : * could have changed. We know the @old_value of the summation of the counter
451 : * before the aggregation, and we re-sum the counter now. If the expected
452 : * value falls between the two summations, we're ok.
453 : *
454 : * Otherwise, we /might/ have a problem. If the change in the summations is
455 : * more than we want to tolerate, the filesystem is probably busy and we should
456 : * just send back INCOMPLETE and see if userspace will try again.
457 : *
458 : * If we're repairing then we require an exact match.
459 : */
460 : static inline bool
461 595868 : xchk_fscount_within_range(
462 : struct xfs_scrub *sc,
463 : const int64_t old_value,
464 : struct percpu_counter *counter,
465 : uint64_t expected)
466 : {
467 595868 : int64_t min_value, max_value;
468 595868 : int64_t curr_value = percpu_counter_sum(counter);
469 :
470 595868 : trace_xchk_fscounters_within_range(sc->mp, expected, curr_value,
471 : old_value);
472 :
473 : /* Negative values are always wrong. */
474 595868 : if (curr_value < 0)
475 : return false;
476 :
477 : /* Exact matches are always ok. */
478 595868 : if (curr_value == expected)
479 : return true;
480 :
481 : /* We require exact matches when repair is running. */
482 85553 : if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
483 : return false;
484 :
485 68524 : min_value = min(old_value, curr_value);
486 68524 : max_value = max(old_value, curr_value);
487 :
488 : /* Within the before-and-after range is ok. */
489 68524 : if (expected >= min_value && expected <= max_value)
490 49444 : return true;
491 :
492 : /* Everything else is bad. */
493 : return false;
494 : }
495 :
496 : /* Check the superblock counters. */
int
xchk_fscounters(
	struct xfs_scrub	*sc)
{
	struct xfs_mount	*mp = sc->mp;
	struct xchk_fscounters	*fsc = sc->buf;
	int64_t			icount, ifree, fdblocks, frextents;
	bool			try_again = false;
	int			error;

	/*
	 * Snapshot the percpu counters before aggregation so the before/after
	 * pair bounds how much the counters drifted during the scan.
	 */
	icount = percpu_counter_sum(&mp->m_icount);
	ifree = percpu_counter_sum(&mp->m_ifree);
	fdblocks = percpu_counter_sum(&mp->m_fdblocks);
	frextents = percpu_counter_sum(&mp->m_frextents);

	/* No negative values, please! */
	if (icount < 0 || ifree < 0)
		xchk_set_corrupt(sc);

	/*
	 * If the filesystem is not frozen, the counter summation calls above
	 * can race with xfs_mod_freecounter, which subtracts a requested space
	 * reservation from the counter and undoes the subtraction if that made
	 * the counter go negative.  Therefore, it's possible to see negative
	 * values here, and we should only flag that as a corruption if we
	 * froze the fs.  This is much more likely to happen with frextents
	 * since there are no reserved pools.
	 */
	if (fdblocks < 0 || frextents < 0) {
		if (!fsc->frozen)
			return -EDEADLOCK;

		xchk_set_corrupt(sc);
		return 0;
	}

	/* See if icount is obviously wrong. */
	if (icount < fsc->icount_min || icount > fsc->icount_max)
		xchk_set_corrupt(sc);

	/* See if fdblocks is obviously wrong. */
	if (fdblocks > mp->m_sb.sb_dblocks)
		xchk_set_corrupt(sc);

	/* See if frextents is obviously wrong. */
	if (frextents > mp->m_sb.sb_rextents)
		xchk_set_corrupt(sc);

	/*
	 * If ifree exceeds icount by more than the minimum variance then
	 * something's probably wrong with the counters.
	 */
	if (ifree > icount && ifree - icount > XCHK_FSCOUNT_MIN_VARIANCE)
		xchk_set_corrupt(sc);

	/* Walk the incore AG headers to calculate the expected counters. */
	error = xchk_fscount_aggregate_agcounts(sc, fsc);
	if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error))
		return error;

	/* Count the free extents counter for rt volumes. */
	error = xchk_fscount_count_frextents(sc, fsc);
	if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error))
		return error;
	/* Never compare against a partial aggregation. */
	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
		return 0;

	/*
	 * Compare the in-core counters with whatever we counted.  If the fs is
	 * frozen, we treat the discrepancy as a corruption because the freeze
	 * should have stabilized the counter values.  Otherwise, we need
	 * userspace to call us back having granted us freeze permission;
	 * -EDEADLOCK below signals that retry-with-freeze request.
	 */
	if (!xchk_fscount_within_range(sc, icount, &mp->m_icount,
			fsc->icount)) {
		if (fsc->frozen)
			xchk_set_corrupt(sc);
		else
			try_again = true;
	}

	if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree)) {
		if (fsc->frozen)
			xchk_set_corrupt(sc);
		else
			try_again = true;
	}

	if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks,
			fsc->fdblocks)) {
		if (fsc->frozen)
			xchk_set_corrupt(sc);
		else
			try_again = true;
	}

	if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents,
			fsc->frextents)) {
		if (fsc->frozen)
			xchk_set_corrupt(sc);
		else
			try_again = true;
	}

	if (try_again)
		return -EDEADLOCK;

	return 0;
}
|