Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-or-later
2 : /*
3 : * Copyright (C) 2019-2023 Oracle. All Rights Reserved.
4 : * Author: Darrick J. Wong <djwong@kernel.org>
5 : */
6 : #include "xfs.h"
7 : #include "xfs_fs.h"
8 : #include "xfs_shared.h"
9 : #include "xfs_format.h"
10 : #include "xfs_trans_resv.h"
11 : #include "xfs_mount.h"
12 : #include "xfs_alloc.h"
13 : #include "xfs_ialloc.h"
14 : #include "xfs_health.h"
15 : #include "xfs_btree.h"
16 : #include "xfs_ag.h"
17 : #include "xfs_rtalloc.h"
18 : #include "xfs_inode.h"
19 : #include "xfs_icache.h"
20 : #include "scrub/scrub.h"
21 : #include "scrub/common.h"
22 : #include "scrub/trace.h"
23 : #include "scrub/fscounters.h"
24 :
25 : /*
26 : * FS Summary Counters
27 : * ===================
28 : *
29 : * The basics of filesystem summary counter checking are that we iterate the
30 : * AGs counting the number of free blocks, free space btree blocks, per-AG
31 : * reservations, inodes, delayed allocation reservations, and free inodes.
32 : * Then we compare what we computed against the in-core counters.
33 : *
34 : * However, the reality is that summary counters are a tricky beast to check.
35 : * While we /could/ freeze the filesystem and scramble around the AGs counting
36 : * the free blocks, in practice we prefer not do that for a scan because
37 : * freezing is costly. To get around this, we added a per-cpu counter of the
38 : * delalloc reservations so that we can rotor around the AGs relatively
39 : * quickly, and we allow the counts to be slightly off because we're not taking
40 : * any locks while we do this.
41 : *
42 : * So the first thing we do is warm up the buffer cache in the setup routine by
43 : * walking all the AGs to make sure the incore per-AG structure has been
44 : * initialized. The expected value calculation then iterates the incore per-AG
45 : * structures as quickly as it can. We snapshot the percpu counters before and
46 : * after this operation and use the difference in counter values to guess at
47 : * our tolerance for mismatch between expected and actual counter values.
48 : */
49 :
50 : /*
51 : * Since the expected value computation is lockless but only browses incore
52 : * values, the percpu counters should be fairly close to each other. However,
53 : * we'll allow ourselves to be off by at least this (arbitrary) amount.
54 : */
55 : #define XCHK_FSCOUNT_MIN_VARIANCE (512)
56 :
57 : /*
58 : * Make sure the per-AG structure has been initialized from the on-disk header
59 : * contents and trust that the incore counters match the ondisk counters. (The
60 : * AGF and AGI scrubbers check them, and a normal xfs_scrub run checks the
61 : * summary counters after checking all AG headers). Do this from the setup
62 : * function so that the inner AG aggregation loop runs as quickly as possible.
63 : *
64 : * This function runs during the setup phase /before/ we start checking any
65 : * metadata.
66 : */
STATIC int
xchk_fscount_warmup(
	struct xfs_scrub	*sc)
{
	struct xfs_mount	*mp = sc->mp;
	struct xfs_buf		*agi_bp = NULL;
	struct xfs_buf		*agf_bp = NULL;
	struct xfs_perag	*pag = NULL;
	xfs_agnumber_t		agno;
	int			error = 0;

	for_each_perag(mp, agno, pag) {
		if (xchk_should_terminate(sc, &error))
			break;
		/* Both headers already initialised?  Nothing to warm up. */
		if (xfs_perag_initialised_agi(pag) &&
		    xfs_perag_initialised_agf(pag))
			continue;

		/* Lock both AG headers. */
		error = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp);
		if (error)
			break;
		error = xfs_alloc_read_agf(pag, sc->tp, 0, &agf_bp);
		if (error)
			break;

		/*
		 * These are supposed to be initialized by the header read
		 * function.
		 */
		if (!xfs_perag_initialised_agi(pag) ||
		    !xfs_perag_initialised_agf(pag)) {
			error = -EFSCORRUPTED;
			break;
		}

		/* Release both buffers before moving to the next AG. */
		xfs_buf_relse(agf_bp);
		agf_bp = NULL;
		xfs_buf_relse(agi_bp);
		agi_bp = NULL;
	}

	/*
	 * If we broke out of the loop early, release whatever buffers are
	 * still held and drop the active perag reference left by the
	 * interrupted iteration.
	 */
	if (agf_bp)
		xfs_buf_relse(agf_bp);
	if (agi_bp)
		xfs_buf_relse(agi_bp);
	if (pag)
		xfs_perag_rele(pag);
	return error;
}
117 :
118 : static inline int
119 22303 : xchk_fsfreeze(
120 : struct xfs_scrub *sc)
121 : {
122 22303 : int error;
123 :
124 22303 : error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL);
125 22303 : trace_xchk_fsfreeze(sc, error);
126 22303 : return error;
127 : }
128 :
129 : static inline int
130 22303 : xchk_fsthaw(
131 : struct xfs_scrub *sc)
132 : {
133 22303 : int error;
134 :
135 : /* This should always succeed, we have a kernel freeze */
136 22303 : error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL);
137 22303 : trace_xchk_fsthaw(sc, error);
138 22303 : return error;
139 : }
140 :
141 : /*
142 : * We couldn't stabilize the filesystem long enough to sample all the variables
143 : * that comprise the summary counters and compare them to the percpu counters.
144 : * We need to disable all writer threads, which means taking the first two
145 : * freeze levels to put userspace to sleep, and the third freeze level to
146 : * prevent background threads from starting new transactions. Take one level
147 : * more to prevent other callers from unfreezing the filesystem while we run.
148 : */
STATIC int
xchk_fscounters_freeze(
	struct xfs_scrub	*sc)
{
	struct xchk_fscounters	*fsc = sc->buf;
	int			error = 0;

	/*
	 * Drop the write access that scrub setup took on our behalf.
	 * NOTE(review): presumably the freeze below cannot complete while
	 * this context still holds a write reference -- confirm against the
	 * mnt_want_write/freeze_super interaction.
	 */
	if (sc->flags & XCHK_HAVE_FREEZE_PROT) {
		sc->flags &= ~XCHK_HAVE_FREEZE_PROT;
		mnt_drop_write_file(sc->file);
	}

	/* Try to grab a kernel freeze. */
	while ((error = xchk_fsfreeze(sc)) == -EBUSY) {
		if (xchk_should_terminate(sc, &error))
			return error;

		/* Freeze holder contention; back off briefly and retry. */
		delay(HZ / 10);
	}
	if (error)
		return error;

	/* Record the freeze so that the buf_cleanup hook thaws the fs. */
	fsc->frozen = true;
	return 0;
}
174 :
175 : /* Thaw the filesystem after checking or repairing fscounters. */
176 : STATIC void
177 103764 : xchk_fscounters_cleanup(
178 : void *buf)
179 : {
180 103764 : struct xchk_fscounters *fsc = buf;
181 103764 : struct xfs_scrub *sc = fsc->sc;
182 103764 : int error;
183 :
184 103764 : if (!fsc->frozen)
185 : return;
186 :
187 22303 : error = xchk_fsthaw(sc);
188 22303 : if (error)
189 0 : xfs_emerg(sc->mp, "still frozen after scrub, err=%d", error);
190 : else
191 22303 : fsc->frozen = false;
192 : }
193 :
int
xchk_setup_fscounters(
	struct xfs_scrub	*sc)
{
	struct xchk_fscounters	*fsc;
	int			error;

	/*
	 * If the AGF doesn't track btreeblks, we have to lock the AGF to count
	 * btree block usage by walking the actual btrees.
	 */
	if (!xfs_has_lazysbcount(sc->mp))
		xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);

	sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS);
	if (!sc->buf)
		return -ENOMEM;
	/* The cleanup hook thaws the fs if the freeze below succeeds. */
	sc->buf_cleanup = xchk_fscounters_cleanup;
	fsc = sc->buf;
	fsc->sc = sc;

	/* Precompute the plausible range for the inode count check. */
	xfs_icount_range(sc->mp, &fsc->icount_min, &fsc->icount_max);

	/* We must get the incore counters set up before we can proceed. */
	error = xchk_fscount_warmup(sc);
	if (error)
		return error;

	/*
	 * Pause all writer activity in the filesystem while we're scrubbing to
	 * reduce the likelihood of background perturbations to the counters
	 * throwing off our calculations.
	 *
	 * If we're repairing, we need to prevent any other thread from
	 * changing the global fs summary counters while we're repairing them.
	 * This requires the fs to be frozen, which will disable background
	 * reclaim and purge all inactive inodes.
	 */
	if ((sc->flags & XCHK_TRY_HARDER) || xchk_could_repair(sc)) {
		error = xchk_fscounters_freeze(sc);
		if (error)
			return error;
	}

	return xchk_trans_alloc_empty(sc);
}
240 :
241 : /*
242 : * Part 1: Collecting filesystem summary counts. For each AG, we add its
243 : * summary counts (total inodes, free inodes, free data blocks) to an incore
244 : * copy of the overall filesystem summary counts.
245 : *
246 : * To avoid false corruption reports in part 2, any failure in this part must
247 : * set the INCOMPLETE flag even when a negative errno is returned. This care
248 : * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED,
249 : * ECANCELED) that are absorbed into a scrub state flag update by
250 : * xchk_*_process_error. Scrub and repair share the same incore data
251 : * structures, so the INCOMPLETE flag is critical to prevent a repair based on
252 : * insufficient information.
253 : */
254 :
255 : /* Count free space btree blocks manually for pre-lazysbcount filesystems. */
static int
xchk_fscount_btreeblks(
	struct xfs_scrub	*sc,
	struct xchk_fscounters	*fsc,
	xfs_agnumber_t		agno)
{
	xfs_extlen_t		blocks;
	int			error;

	/* Attach to this AG and lock its headers so the btrees hold still. */
	error = xchk_ag_init_existing(sc, agno, &sc->sa);
	if (error)
		goto out_free;

	/*
	 * Add the bnobt and cntbt block counts to the free block tally.
	 * NOTE(review): one block is excluded from each count; presumably
	 * the root block is accounted elsewhere -- confirm against the
	 * pre-lazysbcount ondisk accounting rules.
	 */
	error = xfs_btree_count_blocks(sc->sa.bno_cur, &blocks);
	if (error)
		goto out_free;
	fsc->fdblocks += blocks - 1;

	error = xfs_btree_count_blocks(sc->sa.cnt_cur, &blocks);
	if (error)
		goto out_free;
	fsc->fdblocks += blocks - 1;

out_free:
	xchk_ag_free(sc, &sc->sa);
	return error;
}
283 :
284 : /*
285 : * Calculate what the global in-core counters ought to be from the incore
286 : * per-AG structure. Callers can compare this to the actual in-core counters
287 : * to estimate by how much both in-core and on-disk counters need to be
288 : * adjusted.
289 : */
STATIC int
xchk_fscount_aggregate_agcounts(
	struct xfs_scrub	*sc,
	struct xchk_fscounters	*fsc)
{
	struct xfs_mount	*mp = sc->mp;
	struct xfs_perag	*pag;
	uint64_t		delayed;
	xfs_agnumber_t		agno;
	int			tries = 8;	/* retries if ifree > icount */
	int			error = 0;

retry:
	/* Start each attempt from a clean slate. */
	fsc->icount = 0;
	fsc->ifree = 0;
	fsc->fdblocks = 0;

	for_each_perag(mp, agno, pag) {
		if (xchk_should_terminate(sc, &error))
			break;

		/* This somehow got unset since the warmup? */
		if (!xfs_perag_initialised_agi(pag) ||
		    !xfs_perag_initialised_agf(pag)) {
			error = -EFSCORRUPTED;
			break;
		}

		/* Count all the inodes */
		fsc->icount += pag->pagi_count;
		fsc->ifree += pag->pagi_freecount;

		/* Add up the free/freelist/bnobt/cntbt blocks */
		fsc->fdblocks += pag->pagf_freeblks;
		fsc->fdblocks += pag->pagf_flcount;
		if (xfs_has_lazysbcount(sc->mp)) {
			fsc->fdblocks += pag->pagf_btreeblks;
		} else {
			/* Old format: walk the btrees to count their blocks. */
			error = xchk_fscount_btreeblks(sc, fsc, agno);
			if (error)
				break;
		}

		/*
		 * Per-AG reservations are taken out of the incore counters,
		 * so they must be left out of the free blocks computation.
		 */
		fsc->fdblocks -= pag->pag_meta_resv.ar_reserved;
		fsc->fdblocks -= pag->pag_rmapbt_resv.ar_orig_reserved;

	}
	/* Drop the perag reference still held after an early loop exit. */
	if (pag)
		xfs_perag_rele(pag);
	if (error) {
		/* See Part 1 comment: incomplete data must never feed repair. */
		xchk_set_incomplete(sc);
		return error;
	}

	/*
	 * The global incore space reservation is taken from the incore
	 * counters, so leave that out of the computation.
	 */
	fsc->fdblocks -= mp->m_resblks_avail;

	/*
	 * Delayed allocation reservations are taken out of the incore counters
	 * but not recorded on disk, so leave them and their indlen blocks out
	 * of the computation.
	 */
	delayed = percpu_counter_sum(&mp->m_delalloc_blks);
	fsc->fdblocks -= delayed;

	trace_xchk_fscounters_calc(mp, fsc->icount, fsc->ifree, fsc->fdblocks,
			delayed);


	/* Bail out if the values we compute are totally nonsense. */
	if (fsc->icount < fsc->icount_min || fsc->icount > fsc->icount_max ||
	    fsc->fdblocks > mp->m_sb.sb_dblocks ||
	    fsc->ifree > fsc->icount_max)
		return -EFSCORRUPTED;

	/*
	 * If ifree > icount then we probably had some perturbation in the
	 * counters while we were calculating things.  We'll try a few times
	 * to maintain ifree <= icount before giving up.
	 */
	if (fsc->ifree > fsc->icount) {
		if (tries--)
			goto retry;
		return -EDEADLOCK;
	}

	return 0;
}
385 :
386 : #ifdef CONFIG_XFS_RT
387 : STATIC int
388 25855567 : xchk_fscount_add_frextent(
389 : struct xfs_mount *mp,
390 : struct xfs_trans *tp,
391 : const struct xfs_rtalloc_rec *rec,
392 : void *priv)
393 : {
394 25855567 : struct xchk_fscounters *fsc = priv;
395 25855567 : int error = 0;
396 :
397 25855567 : fsc->frextents += rec->ar_extcount;
398 :
399 25855567 : xchk_should_terminate(fsc->sc, &error);
400 25855567 : return error;
401 : }
402 :
403 : /* Calculate the number of free realtime extents from the realtime bitmap. */
404 : STATIC int
405 103761 : xchk_fscount_count_frextents(
406 : struct xfs_scrub *sc,
407 : struct xchk_fscounters *fsc)
408 : {
409 103761 : struct xfs_mount *mp = sc->mp;
410 103761 : int error;
411 :
412 103761 : fsc->frextents = 0;
413 103761 : if (!xfs_has_realtime(mp))
414 : return 0;
415 :
416 54018 : xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
417 54018 : error = xfs_rtalloc_query_all(sc->mp, sc->tp,
418 : xchk_fscount_add_frextent, fsc);
419 54018 : if (error) {
420 1 : xchk_set_incomplete(sc);
421 1 : goto out_unlock;
422 : }
423 :
424 54017 : out_unlock:
425 54018 : xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
426 54018 : return error;
427 : }
428 : #else
/* Realtime support is compiled out; there are no rt extents to count. */
STATIC int
xchk_fscount_count_frextents(
	struct xfs_scrub	*sc,
	struct xchk_fscounters	*fsc)
{
	fsc->frextents = 0;
	return 0;
}
437 : #endif /* CONFIG_XFS_RT */
438 :
439 : /*
440 : * Part 2: Comparing filesystem summary counters. All we have to do here is
441 : * sum the percpu counters and compare them to what we've observed.
442 : */
443 :
444 : /*
445 : * Is the @counter reasonably close to the @expected value?
446 : *
447 : * We neither locked nor froze anything in the filesystem while aggregating the
448 : * per-AG data to compute the @expected value, which means that the counter
449 : * could have changed. We know the @old_value of the summation of the counter
450 : * before the aggregation, and we re-sum the counter now. If the expected
451 : * value falls between the two summations, we're ok.
452 : *
453 : * Otherwise, we /might/ have a problem. If the change in the summations is
454 : * more than we want to tolerate, the filesystem is probably busy and we should
455 : * just send back INCOMPLETE and see if userspace will try again.
456 : *
457 : * If we're repairing then we require an exact match.
458 : */
static inline bool
xchk_fscount_within_range(
	struct xfs_scrub	*sc,
	const int64_t		old_value,
	struct percpu_counter	*counter,
	uint64_t		expected)
{
	int64_t			min_value, max_value;
	int64_t			curr_value = percpu_counter_sum(counter);

	trace_xchk_fscounters_within_range(sc->mp, expected, curr_value,
			old_value);

	/* Negative values are always wrong. */
	if (curr_value < 0)
		return false;

	/* Exact matches are always ok. */
	if (curr_value == expected)
		return true;

	/* We require exact matches when repair is running. */
	if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
		return false;

	min_value = min(old_value, curr_value);
	max_value = max(old_value, curr_value);

	/*
	 * Within the before-and-after range is ok.
	 * NOTE(review): expected is uint64_t while min/max are int64_t, so
	 * these comparisons promote to unsigned; this is only safe while
	 * old_value >= 0, which the caller's negative-snapshot checks appear
	 * to ensure -- confirm.
	 */
	if (expected >= min_value && expected <= max_value)
		return true;

	/* Everything else is bad. */
	return false;
}
494 :
495 : /* Check the superblock counters. */
int
xchk_fscounters(
	struct xfs_scrub	*sc)
{
	struct xfs_mount	*mp = sc->mp;
	struct xchk_fscounters	*fsc = sc->buf;
	int64_t			icount, ifree, fdblocks, frextents;
	bool			try_again = false;
	int			error;

	/* Snapshot the percpu counters. */
	icount = percpu_counter_sum(&mp->m_icount);
	ifree = percpu_counter_sum(&mp->m_ifree);
	fdblocks = percpu_counter_sum(&mp->m_fdblocks);
	frextents = percpu_counter_sum(&mp->m_frextents);

	/* No negative values, please! */
	if (icount < 0 || ifree < 0)
		xchk_set_corrupt(sc);

	/*
	 * If the filesystem is not frozen, the counter summation calls above
	 * can race with xfs_mod_freecounter, which subtracts a requested space
	 * reservation from the counter and undoes the subtraction if that made
	 * the counter go negative.  Therefore, it's possible to see negative
	 * values here, and we should only flag that as a corruption if we
	 * froze the fs.  This is much more likely to happen with frextents
	 * since there are no reserved pools.
	 */
	if (fdblocks < 0 || frextents < 0) {
		if (!fsc->frozen)
			return -EDEADLOCK;

		xchk_set_corrupt(sc);
		return 0;
	}

	/* See if icount is obviously wrong. */
	if (icount < fsc->icount_min || icount > fsc->icount_max)
		xchk_set_corrupt(sc);

	/* See if fdblocks is obviously wrong. */
	if (fdblocks > mp->m_sb.sb_dblocks)
		xchk_set_corrupt(sc);

	/* See if frextents is obviously wrong. */
	if (frextents > mp->m_sb.sb_rextents)
		xchk_set_corrupt(sc);

	/*
	 * If ifree exceeds icount by more than the minimum variance then
	 * something's probably wrong with the counters.
	 */
	if (ifree > icount && ifree - icount > XCHK_FSCOUNT_MIN_VARIANCE)
		xchk_set_corrupt(sc);

	/* Walk the incore AG headers to calculate the expected counters. */
	error = xchk_fscount_aggregate_agcounts(sc, fsc);
	if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error))
		return error;

	/* Count the free extents counter for rt volumes. */
	error = xchk_fscount_count_frextents(sc, fsc);
	if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error))
		return error;
	/* A counting pass bailed out early; don't judge partial data. */
	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
		return 0;

	/*
	 * Compare the in-core counters with whatever we counted.  If the fs is
	 * frozen, we treat the discrepancy as a corruption because the freeze
	 * should have stabilized the counter values.  Otherwise, we need
	 * userspace to call us back having granted us freeze permission.
	 */
	if (!xchk_fscount_within_range(sc, icount, &mp->m_icount,
			fsc->icount)) {
		if (fsc->frozen)
			xchk_set_corrupt(sc);
		else
			try_again = true;
	}

	if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree)) {
		if (fsc->frozen)
			xchk_set_corrupt(sc);
		else
			try_again = true;
	}

	if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks,
			fsc->fdblocks)) {
		if (fsc->frozen)
			xchk_set_corrupt(sc);
		else
			try_again = true;
	}

	if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents,
			fsc->frextents)) {
		if (fsc->frozen)
			xchk_set_corrupt(sc);
		else
			try_again = true;
	}

	/* Ask userspace to retry with freeze permission granted. */
	if (try_again)
		return -EDEADLOCK;

	return 0;
}
|