Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-or-later
2 : /*
3 : * Copyright (C) 2019-2023 Oracle. All Rights Reserved.
4 : * Author: Darrick J. Wong <djwong@kernel.org>
5 : */
6 : #include "xfs.h"
7 : #include "xfs_fs.h"
8 : #include "xfs_shared.h"
9 : #include "xfs_format.h"
10 : #include "xfs_trans_resv.h"
11 : #include "xfs_log_format.h"
12 : #include "xfs_trans.h"
13 : #include "xfs_mount.h"
14 : #include "xfs_alloc.h"
15 : #include "xfs_ialloc.h"
16 : #include "xfs_health.h"
17 : #include "xfs_btree.h"
18 : #include "xfs_ag.h"
19 : #include "xfs_rtalloc.h"
20 : #include "xfs_inode.h"
21 : #include "xfs_icache.h"
22 : #include "scrub/scrub.h"
23 : #include "scrub/common.h"
24 : #include "scrub/trace.h"
25 : #include "scrub/fscounters.h"
26 :
27 : /*
28 : * FS Summary Counters
29 : * ===================
30 : *
31 : * The basics of filesystem summary counter checking are that we iterate the
32 : * AGs counting the number of free blocks, free space btree blocks, per-AG
33 : * reservations, inodes, delayed allocation reservations, and free inodes.
34 : * Then we compare what we computed against the in-core counters.
35 : *
36 : * However, the reality is that summary counters are a tricky beast to check.
37 : * While we /could/ freeze the filesystem and scramble around the AGs counting
38 : * the free blocks, in practice we prefer not to do that for a scan because
39 : * freezing is costly. To get around this, we added a per-cpu counter of the
40 : * delalloc reservations so that we can rotor around the AGs relatively
41 : * quickly, and we allow the counts to be slightly off because we're not taking
42 : * any locks while we do this.
43 : *
44 : * So the first thing we do is warm up the buffer cache in the setup routine by
45 : * walking all the AGs to make sure the incore per-AG structure has been
46 : * initialized. The expected value calculation then iterates the incore per-AG
47 : * structures as quickly as it can. We snapshot the percpu counters before and
48 : * after this operation and use the difference in counter values to guess at
49 : * our tolerance for mismatch between expected and actual counter values.
50 : */
51 :
52 : /*
53 : * Since the expected value computation is lockless but only browses incore
54 : * values, the percpu counters should be fairly close to each other. However,
55 : * we tolerate ifree exceeding icount by up to this (arbitrary) amount.
56 : */
57 : #define XCHK_FSCOUNT_MIN_VARIANCE (512)
58 :
59 : /*
60 : * Make sure the per-AG structure has been initialized from the on-disk header
61 : * contents and trust that the incore counters match the ondisk counters. (The
62 : * AGF and AGI scrubbers check them, and a normal xfs_scrub run checks the
63 : * summary counters after checking all AG headers). Do this from the setup
64 : * function so that the inner AG aggregation loop runs as quickly as possible.
65 : *
66 : * This function runs during the setup phase /before/ we start checking any
67 : * metadata.
68 : */
69 : STATIC int
70 360244 : xchk_fscount_warmup(
71 : struct xfs_scrub *sc)
72 : {
73 360244 : struct xfs_mount *mp = sc->mp;
74 360244 : struct xfs_buf *agi_bp = NULL;
75 360244 : struct xfs_buf *agf_bp = NULL;
76 360244 : struct xfs_perag *pag = NULL;
77 360244 : xfs_agnumber_t agno;
78 360244 : int error = 0;
79 :
80 1809235 : for_each_perag(mp, agno, pag) {
81 1448991 : if (xchk_should_terminate(sc, &error))
82 : break;
83 4346973 : if (xfs_perag_initialised_agi(pag) &&
84 : xfs_perag_initialised_agf(pag))
85 1448991 : continue;
86 :
87 : /* Lock both AG headers. */
88 0 : error = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp);
89 0 : if (error)
90 : break;
91 0 : error = xfs_alloc_read_agf(pag, sc->tp, 0, &agf_bp);
92 0 : if (error)
93 : break;
94 :
95 : /*
96 : * These are supposed to be initialized by the header read
97 : * function.
98 : */
99 0 : if (!xfs_perag_initialised_agi(pag) ||
100 : !xfs_perag_initialised_agf(pag)) {
101 0 : error = -EFSCORRUPTED;
102 0 : break;
103 : }
104 :
105 0 : xfs_buf_relse(agf_bp);
106 0 : agf_bp = NULL;
107 0 : xfs_buf_relse(agi_bp);
108 0 : agi_bp = NULL;
109 : }
110 :
111 360244 : if (agf_bp)
112 0 : xfs_buf_relse(agf_bp);
113 360244 : if (agi_bp)
114 0 : xfs_buf_relse(agi_bp);
115 360244 : if (pag)
116 0 : xfs_perag_rele(pag);
117 360244 : return error;
118 : }
119 :
120 : static inline int
121 26515 : xchk_fsfreeze(
122 : struct xfs_scrub *sc)
123 : {
124 26515 : int error;
125 :
126 26515 : error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL);
127 26515 : trace_xchk_fsfreeze(sc, error);
128 26515 : return error;
129 : }
130 :
131 : static inline int
132 26515 : xchk_fsthaw(
133 : struct xfs_scrub *sc)
134 : {
135 26515 : int error;
136 :
137 : /* This should always succeed, we have a kernel freeze */
138 26515 : error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL);
139 26515 : trace_xchk_fsthaw(sc, error);
140 26515 : return error;
141 : }
142 :
143 : /*
144 : * We couldn't stabilize the filesystem long enough to sample all the variables
145 : * that comprise the summary counters and compare them to the percpu counters.
146 : * We need to disable all writer threads, which means taking the first two
147 : * freeze levels to put userspace to sleep, and the third freeze level to
148 : * prevent background threads from starting new transactions. Take one level
149 : * more to prevent other callers from unfreezing the filesystem while we run.
150 : */
151 : STATIC int
152 26515 : xchk_fscounters_freeze(
153 : struct xfs_scrub *sc)
154 : {
155 26515 : struct xchk_fscounters *fsc = sc->buf;
156 26515 : int error = 0;
157 :
158 26515 : if (sc->flags & XCHK_HAVE_FREEZE_PROT) {
159 10676 : sc->flags &= ~XCHK_HAVE_FREEZE_PROT;
160 10676 : mnt_drop_write_file(sc->file);
161 : }
162 :
163 : /* Try to grab a kernel freeze. */
164 26515 : while ((error = xchk_fsfreeze(sc)) == -EBUSY) {
165 0 : if (xchk_should_terminate(sc, &error))
166 0 : return error;
167 :
168 0 : delay(HZ / 10);
169 : }
170 26515 : if (error)
171 : return error;
172 :
173 26515 : fsc->frozen = true;
174 26515 : return 0;
175 : }
176 :
177 : /* Thaw the filesystem after checking or repairing fscounters. */
178 : STATIC void
179 360244 : xchk_fscounters_cleanup(
180 : void *buf)
181 : {
182 360244 : struct xchk_fscounters *fsc = buf;
183 360244 : struct xfs_scrub *sc = fsc->sc;
184 360244 : int error;
185 :
186 360244 : if (!fsc->frozen)
187 : return;
188 :
189 26515 : error = xchk_fsthaw(sc);
190 26515 : if (error)
191 0 : xfs_emerg(sc->mp, "still frozen after scrub, err=%d", error);
192 : else
193 26515 : fsc->frozen = false;
194 : }
195 :
196 : int
197 360244 : xchk_setup_fscounters(
198 : struct xfs_scrub *sc)
199 : {
200 360244 : struct xchk_fscounters *fsc;
201 360244 : int error;
202 :
203 : /*
204 : * If the AGF doesn't track btreeblks, we have to lock the AGF to count
205 : * btree block usage by walking the actual btrees.
206 : */
207 360244 : if (!xfs_has_lazysbcount(sc->mp))
208 0 : xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
209 :
210 360244 : sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS);
211 360244 : if (!sc->buf)
212 : return -ENOMEM;
213 360244 : sc->buf_cleanup = xchk_fscounters_cleanup;
214 360244 : fsc = sc->buf;
215 360244 : fsc->sc = sc;
216 :
217 360244 : xfs_icount_range(sc->mp, &fsc->icount_min, &fsc->icount_max);
218 :
219 : /* We must get the incore counters set up before we can proceed. */
220 360244 : error = xchk_fscount_warmup(sc);
221 360244 : if (error)
222 : return error;
223 :
224 : /*
225 : * Pause all writer activity in the filesystem while we're scrubbing to
226 : * reduce the likelihood of background perturbations to the counters
227 : * throwing off our calculations.
228 : */
229 360244 : if (sc->flags & XCHK_TRY_HARDER) {
230 26515 : error = xchk_fscounters_freeze(sc);
231 26515 : if (error)
232 : return error;
233 : }
234 :
235 360244 : return xfs_trans_alloc_empty(sc->mp, &sc->tp);
236 : }
237 :
238 : /*
239 : * Part 1: Collecting filesystem summary counts. For each AG, we add its
240 : * summary counts (total inodes, free inodes, free data blocks) to an incore
241 : * copy of the overall filesystem summary counts.
242 : *
243 : * To avoid false corruption reports in part 2, any failure in this part must
244 : * set the INCOMPLETE flag even when a negative errno is returned. This care
245 : * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED,
246 : * ECANCELED) that are absorbed into a scrub state flag update by
247 : * xchk_*_process_error.
248 : */
249 :
250 : /* Count free space btree blocks manually for pre-lazysbcount filesystems. */
251 : static int
252 0 : xchk_fscount_btreeblks(
253 : struct xfs_scrub *sc,
254 : struct xchk_fscounters *fsc,
255 : xfs_agnumber_t agno)
256 : {
257 0 : xfs_extlen_t blocks;
258 0 : int error;
259 :
260 0 : error = xchk_ag_init_existing(sc, agno, &sc->sa);
261 0 : if (error)
262 0 : goto out_free;
263 :
264 0 : error = xfs_btree_count_blocks(sc->sa.bno_cur, &blocks);
265 0 : if (error)
266 0 : goto out_free;
267 0 : fsc->fdblocks += blocks - 1;
268 :
269 0 : error = xfs_btree_count_blocks(sc->sa.cnt_cur, &blocks);
270 0 : if (error)
271 0 : goto out_free;
272 0 : fsc->fdblocks += blocks - 1;
273 :
274 0 : out_free:
275 0 : xchk_ag_free(sc, &sc->sa);
276 0 : return error;
277 : }
278 :
279 : /*
280 : * Calculate what the global in-core counters ought to be from the incore
281 : * per-AG structure. Callers can compare this to the actual in-core counters
282 : * to estimate by how much both in-core and on-disk counters need to be
283 : * adjusted.
284 : */
285 : STATIC int
286 360244 : xchk_fscount_aggregate_agcounts(
287 : struct xfs_scrub *sc,
288 : struct xchk_fscounters *fsc)
289 : {
290 360244 : struct xfs_mount *mp = sc->mp;
291 360244 : struct xfs_perag *pag;
292 360244 : uint64_t delayed;
293 360244 : xfs_agnumber_t agno;
294 360244 : int tries = 8;
295 360244 : int error = 0;
296 :
297 360244 : retry:
298 360244 : fsc->icount = 0;
299 360244 : fsc->ifree = 0;
300 360244 : fsc->fdblocks = 0;
301 :
302 1809231 : for_each_perag(mp, agno, pag) {
303 1448988 : if (xchk_should_terminate(sc, &error))
304 : break;
305 :
306 : /* This somehow got unset since the warmup? */
307 4346961 : if (!xfs_perag_initialised_agi(pag) ||
308 : !xfs_perag_initialised_agf(pag)) {
309 0 : error = -EFSCORRUPTED;
310 0 : break;
311 : }
312 :
313 : /* Count all the inodes */
314 1448987 : fsc->icount += pag->pagi_count;
315 1448987 : fsc->ifree += pag->pagi_freecount;
316 :
317 : /* Add up the free/freelist/bnobt/cntbt blocks */
318 1448987 : fsc->fdblocks += pag->pagf_freeblks;
319 1448987 : fsc->fdblocks += pag->pagf_flcount;
320 1448987 : if (xfs_has_lazysbcount(sc->mp)) {
321 1448987 : fsc->fdblocks += pag->pagf_btreeblks;
322 : } else {
323 0 : error = xchk_fscount_btreeblks(sc, fsc, agno);
324 0 : if (error)
325 : break;
326 : }
327 :
328 : /*
329 : * Per-AG reservations are taken out of the incore counters,
330 : * so they must be left out of the free blocks computation.
331 : */
332 1448987 : fsc->fdblocks -= pag->pag_meta_resv.ar_reserved;
333 1448987 : fsc->fdblocks -= pag->pag_rmapbt_resv.ar_orig_reserved;
334 :
335 : }
336 360244 : if (pag)
337 1 : xfs_perag_rele(pag);
338 360244 : if (error) {
339 1 : xchk_set_incomplete(sc);
340 1 : return error;
341 : }
342 :
343 : /*
344 : * The global incore space reservation is taken from the incore
345 : * counters, so leave that out of the computation.
346 : */
347 360243 : fsc->fdblocks -= mp->m_resblks_avail;
348 :
349 : /*
350 : * Delayed allocation reservations are taken out of the incore counters
351 : * but not recorded on disk, so leave them and their indlen blocks out
352 : * of the computation.
353 : */
354 360243 : delayed = percpu_counter_sum(&mp->m_delalloc_blks);
355 360243 : fsc->fdblocks -= delayed;
356 :
357 360243 : trace_xchk_fscounters_calc(mp, fsc->icount, fsc->ifree, fsc->fdblocks,
358 : delayed);
359 :
360 :
361 : /* Bail out if the values we compute are totally nonsense. */
362 360243 : if (fsc->icount < fsc->icount_min || fsc->icount > fsc->icount_max ||
363 360243 : fsc->fdblocks > mp->m_sb.sb_dblocks ||
364 360243 : fsc->ifree > fsc->icount_max)
365 : return -EFSCORRUPTED;
366 :
367 : /*
368 : * If ifree > icount then we probably had some perturbation in the
369 : * counters while we were calculating things. We'll try a few times
370 : * to maintain ifree <= icount before giving up.
371 : */
372 360243 : if (fsc->ifree > fsc->icount) {
373 0 : if (tries--)
374 0 : goto retry;
375 : return -EDEADLOCK;
376 : }
377 :
378 : return 0;
379 : }
380 :
381 : #ifdef CONFIG_XFS_RT
382 : STATIC int
383 21007624 : xchk_fscount_add_frextent(
384 : struct xfs_mount *mp,
385 : struct xfs_trans *tp,
386 : const struct xfs_rtalloc_rec *rec,
387 : void *priv)
388 : {
389 21007624 : struct xchk_fscounters *fsc = priv;
390 21007624 : int error = 0;
391 :
392 21007624 : fsc->frextents += rec->ar_extcount;
393 :
394 21007624 : xchk_should_terminate(fsc->sc, &error);
395 21007624 : return error;
396 : }
397 :
398 : /* Calculate the number of free realtime extents from the realtime bitmap. */
399 : STATIC int
400 360243 : xchk_fscount_count_frextents(
401 : struct xfs_scrub *sc,
402 : struct xchk_fscounters *fsc)
403 : {
404 360243 : struct xfs_mount *mp = sc->mp;
405 360243 : int error;
406 :
407 360243 : fsc->frextents = 0;
408 360243 : if (!xfs_has_realtime(mp))
409 : return 0;
410 :
411 146504 : xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
412 146504 : error = xfs_rtalloc_query_all(sc->mp, sc->tp,
413 : xchk_fscount_add_frextent, fsc);
414 146504 : if (error) {
415 0 : xchk_set_incomplete(sc);
416 0 : goto out_unlock;
417 : }
418 :
419 146504 : out_unlock:
420 146504 : xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
421 146504 : return error;
422 : }
423 : #else
424 : STATIC int
425 : xchk_fscount_count_frextents(
426 : struct xfs_scrub *sc,
427 : struct xchk_fscounters *fsc)
428 : {
429 : fsc->frextents = 0;
430 : return 0;
431 : }
432 : #endif /* CONFIG_XFS_RT */
433 :
434 : /*
435 : * Part 2: Comparing filesystem summary counters. All we have to do here is
436 : * sum the percpu counters and compare them to what we've observed.
437 : */
438 :
439 : /*
440 : * Is the @counter reasonably close to the @expected value?
441 : *
442 : * We neither locked nor froze anything in the filesystem while aggregating the
443 : * per-AG data to compute the @expected value, which means that the counter
444 : * could have changed. We know the @old_value of the summation of the counter
445 : * before the aggregation, and we re-sum the counter now. If the expected
446 : * value falls between the two summations, we're ok.
447 : *
448 : * Otherwise, we /might/ have a problem. If the change in the summations is
449 : * more than we want to tolerate, the filesystem is probably busy and we should
450 : * just send back INCOMPLETE and see if userspace will try again.
451 : *
452 : * If we're repairing then we require an exact match.
453 : */
454 : static inline bool
455 1440972 : xchk_fscount_within_range(
456 : struct xfs_scrub *sc,
457 : const int64_t old_value,
458 : struct percpu_counter *counter,
459 : uint64_t expected)
460 : {
461 1440972 : int64_t min_value, max_value;
462 1440972 : int64_t curr_value = percpu_counter_sum(counter);
463 :
464 1440972 : trace_xchk_fscounters_within_range(sc->mp, expected, curr_value,
465 : old_value);
466 :
467 : /* Negative values are always wrong. */
468 1440972 : if (curr_value < 0)
469 : return false;
470 :
471 : /* Exact matches are always ok. */
472 1440972 : if (curr_value == expected)
473 : return true;
474 :
475 107092 : min_value = min(old_value, curr_value);
476 107092 : max_value = max(old_value, curr_value);
477 :
478 : /* Within the before-and-after range is ok. */
479 107092 : if (expected >= min_value && expected <= max_value)
480 75574 : return true;
481 :
482 : /* Everything else is bad. */
483 : return false;
484 : }
485 :
486 : /* Check the superblock counters. */
487 : int
488 360244 : xchk_fscounters(
489 : struct xfs_scrub *sc)
490 : {
491 360244 : struct xfs_mount *mp = sc->mp;
492 360244 : struct xchk_fscounters *fsc = sc->buf;
493 360244 : int64_t icount, ifree, fdblocks, frextents;
494 360244 : bool try_again = false;
495 360244 : int error;
496 :
497 : /* Snapshot the percpu counters. */
498 360244 : icount = percpu_counter_sum(&mp->m_icount);
499 360244 : ifree = percpu_counter_sum(&mp->m_ifree);
500 360244 : fdblocks = percpu_counter_sum(&mp->m_fdblocks);
501 360244 : frextents = percpu_counter_sum(&mp->m_frextents);
502 :
503 : /* No negative values, please! */
504 360244 : if (icount < 0 || ifree < 0)
505 0 : xchk_set_corrupt(sc);
506 :
507 : /*
508 : * If the filesystem is not frozen, the counter summation calls above
509 : * can race with xfs_mod_freecounter, which subtracts a requested space
510 : * reservation from the counter and undoes the subtraction if that made
511 : * the counter go negative. Therefore, it's possible to see negative
512 : * values here, and we should only flag that as a corruption if we
513 : * froze the fs. This is much more likely to happen with frextents
514 : * since there are no reserved pools.
515 : */
516 360244 : if (fdblocks < 0 || frextents < 0) {
517 0 : if (!fsc->frozen)
518 : return -EDEADLOCK;
519 :
520 0 : xchk_set_corrupt(sc);
521 0 : return 0;
522 : }
523 :
524 : /* See if icount is obviously wrong. */
525 360244 : if (icount < fsc->icount_min || icount > fsc->icount_max)
526 0 : xchk_set_corrupt(sc);
527 :
528 : /* See if fdblocks is obviously wrong. */
529 360244 : if (fdblocks > mp->m_sb.sb_dblocks)
530 0 : xchk_set_corrupt(sc);
531 :
532 : /* See if frextents is obviously wrong. */
533 360244 : if (frextents > mp->m_sb.sb_rextents)
534 0 : xchk_set_corrupt(sc);
535 :
536 : /*
537 : * If ifree exceeds icount by more than the minimum variance then
538 : * something's probably wrong with the counters.
539 : */
540 360244 : if (ifree > icount && ifree - icount > XCHK_FSCOUNT_MIN_VARIANCE)
541 0 : xchk_set_corrupt(sc);
542 :
543 : /* Walk the incore AG headers to calculate the expected counters. */
544 360244 : error = xchk_fscount_aggregate_agcounts(sc, fsc);
545 360244 : if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error))
546 1 : return error;
547 :
548 : /* Count the free extents counter for rt volumes. */
549 360243 : error = xchk_fscount_count_frextents(sc, fsc);
550 360243 : if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error))
551 0 : return error;
552 360243 : if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
553 : return 0;
554 :
555 : /*
556 : * Compare the in-core counters with whatever we counted. If the fs is
557 : * frozen, we treat the discrepancy as a corruption because the freeze
558 : * should have stabilized the counter values. Otherwise, we need
559 : * userspace to call us back having granted us freeze permission.
560 : */
561 360243 : if (!xchk_fscount_within_range(sc, icount, &mp->m_icount,
562 : fsc->icount)) {
563 1 : if (fsc->frozen)
564 0 : xchk_set_corrupt(sc);
565 : else
566 : try_again = true;
567 : }
568 :
569 360243 : if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree)) {
570 2029 : if (fsc->frozen)
571 0 : xchk_set_corrupt(sc);
572 : else
573 : try_again = true;
574 : }
575 :
576 360243 : if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks,
577 : fsc->fdblocks)) {
578 25987 : if (fsc->frozen)
579 0 : xchk_set_corrupt(sc);
580 : else
581 : try_again = true;
582 : }
583 :
584 360243 : if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents,
585 : fsc->frextents)) {
586 3501 : if (fsc->frozen)
587 0 : xchk_set_corrupt(sc);
588 : else
589 : try_again = true;
590 : }
591 :
592 356742 : if (try_again)
593 26515 : return -EDEADLOCK;
594 :
595 : return 0;
596 : }
|