Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-or-later
2 : /*
3 : * Copyright (C) 2017-2023 Oracle. All Rights Reserved.
4 : * Author: Darrick J. Wong <djwong@kernel.org>
5 : */
6 : #include "xfs.h"
7 : #include "xfs_fs.h"
8 : #include "xfs_shared.h"
9 : #include "xfs_format.h"
10 : #include "xfs_trans_resv.h"
11 : #include "xfs_mount.h"
12 : #include "xfs_btree.h"
13 : #include "xfs_btree_staging.h"
14 : #include "xfs_log_format.h"
15 : #include "xfs_trans.h"
16 : #include "xfs_inode.h"
17 : #include "xfs_icache.h"
18 : #include "xfs_alloc.h"
19 : #include "xfs_alloc_btree.h"
20 : #include "xfs_ialloc.h"
21 : #include "xfs_ialloc_btree.h"
22 : #include "xfs_refcount_btree.h"
23 : #include "xfs_rmap.h"
24 : #include "xfs_rmap_btree.h"
25 : #include "xfs_log.h"
26 : #include "xfs_trans_priv.h"
27 : #include "xfs_da_format.h"
28 : #include "xfs_da_btree.h"
29 : #include "xfs_dir2_priv.h"
30 : #include "xfs_attr.h"
31 : #include "xfs_reflink.h"
32 : #include "xfs_ag.h"
33 : #include "xfs_error.h"
34 : #include "xfs_quota.h"
35 : #include "xfs_swapext.h"
36 : #include "scrub/scrub.h"
37 : #include "scrub/common.h"
38 : #include "scrub/trace.h"
39 : #include "scrub/repair.h"
40 : #include "scrub/health.h"
41 :
42 : /* Common code for the metadata scrubbers. */
43 :
44 : /*
45 : * Handling operational errors.
46 : *
47 : * The *_process_error() family of functions are used to process error return
48 : * codes from functions called as part of a scrub operation.
49 : *
50 : * If there's no error, we return true to tell the caller that it's ok
51 : * to move on to the next check in its list.
52 : *
53 : * For non-verifier errors (e.g. ENOMEM) we return false to tell the
54 : * caller that something bad happened, and we preserve *error so that
55 : * the caller can return *error up the stack to userspace.
56 : *
57 : * Verifier errors (EFSBADCRC/EFSCORRUPTED) are recorded by setting
58 : * OFLAG_CORRUPT in sm_flags and *error is cleared. In other words,
59 : * we track verifier errors (and failed scrub checks) via OFLAG_CORRUPT,
60 : * not via return codes. We return false to tell the caller that
61 : * something bad happened. Since the error has been cleared, the caller
62 : * will (presumably) return that zero and scrubbing will move on to
63 : * whatever's next.
64 : *
65 : * ftrace can be used to record the precise metadata location and the
66 : * approximate code location of the failed operation.
67 : */
68 :
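/*
 * To illustrate the convention above, a typical caller (hypothetical
 * sketch; agno and agbno are assumed locals) looks like this.  A false
 * return means bail out: *error is still set for operational errors,
 * or has been cleared after OFLAG_CORRUPT was set for verifier errors:
 *
 *	error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &sc->sa.agf_bp);
 *	if (!xchk_process_error(sc, agno, agbno, &error))
 *		return error;
 */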
69 : /* Check for operational errors. */
70 : static bool
71 226204873 : __xchk_process_error(
72 : struct xfs_scrub *sc,
73 : xfs_agnumber_t agno,
74 : xfs_agblock_t bno,
75 : int *error,
76 : __u32 errflag,
77 : void *ret_ip)
78 : {
79 226204873 : switch (*error) {
80 : case 0:
81 : return true;
82 84840 : case -EDEADLOCK:
83 : case -ECHRNG:
84 : /* Used to restart an op with deadlock avoidance. */
85 169680 : trace_xchk_deadlock_retry(
86 84840 : sc->ip ? sc->ip : XFS_I(file_inode(sc->file)),
87 : sc->sm, *error);
88 84840 : break;
89 127 : case -ECANCELED:
90 : /*
91 : * ECANCELED here means that the caller set one of the scrub
92 : * outcome flags (corrupt, xfail, xcorrupt) and wants to exit
93 : * quickly. Set error to zero and do not continue.
94 : */
95 127 : trace_xchk_op_error(sc, agno, bno, *error, ret_ip);
96 127 : *error = 0;
97 127 : break;
98 99 : case -EFSBADCRC:
99 : case -EFSCORRUPTED:
100 : /* Note the badness but don't abort. */
101 99 : sc->sm->sm_flags |= errflag;
102 99 : xchk_whine(sc->mp, "type %s agno 0x%x agbno 0x%x error %d errflag 0x%x ret_ip %pS",
103 99 : xchk_type_string(sc->sm->sm_type),
104 : agno,
105 : bno,
106 : *error,
107 : errflag,
108 : ret_ip);
109 99 : *error = 0;
110 146 : fallthrough;
111 146 : default:
112 146 : if (*error)
113 47 : xchk_whine(sc->mp, "type %s agno 0x%x agbno 0x%x error %d ret_ip %pS",
114 47 : xchk_type_string(sc->sm->sm_type),
115 : agno,
116 : bno,
117 : *error,
118 : ret_ip);
119 146 : trace_xchk_op_error(sc, agno, bno, *error, ret_ip);
120 146 : break;
121 : }
122 : return false;
123 : }
124 :
125 : bool
126 4654179 : xchk_process_error(
127 : struct xfs_scrub *sc,
128 : xfs_agnumber_t agno,
129 : xfs_agblock_t bno,
130 : int *error)
131 : {
132 4654179 : return __xchk_process_error(sc, agno, bno, error,
133 : XFS_SCRUB_OFLAG_CORRUPT, __return_address);
134 : }
135 :
136 : bool
137 221866793 : xchk_xref_process_error(
138 : struct xfs_scrub *sc,
139 : xfs_agnumber_t agno,
140 : xfs_agblock_t bno,
141 : int *error)
142 : {
143 221866793 : return __xchk_process_error(sc, agno, bno, error,
144 : XFS_SCRUB_OFLAG_XFAIL, __return_address);
145 : }
146 :
147 : /* Check for operational errors for a file offset. */
148 : static bool
149 1519428763 : __xchk_fblock_process_error(
150 : struct xfs_scrub *sc,
151 : int whichfork,
152 : xfs_fileoff_t offset,
153 : int *error,
154 : __u32 errflag,
155 : void *ret_ip)
156 : {
157 1519428763 : switch (*error) {
158 : case 0:
159 : return true;
160 0 : case -EDEADLOCK:
161 : case -ECHRNG:
162 : /* Used to restart an op with deadlock avoidance. */
163 0 : trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
164 0 : break;
165 11 : case -ECANCELED:
166 : /*
167 : * ECANCELED here means that the caller set one of the scrub
168 : * outcome flags (corrupt, xfail, xcorrupt) and wants to exit
169 : * quickly. Set error to zero and do not continue.
170 : */
171 11 : trace_xchk_file_op_error(sc, whichfork, offset, *error,
172 : ret_ip);
173 11 : *error = 0;
174 11 : break;
175 0 : case -EFSBADCRC:
176 : case -EFSCORRUPTED:
177 : /* Note the badness but don't abort. */
178 0 : sc->sm->sm_flags |= errflag;
179 0 : xchk_whine(sc->mp, "ino 0x%llx fork %d type %s offset %llu error %d errflag 0x%x ret_ip %pS",
180 0 : sc->ip->i_ino,
181 : whichfork,
182 0 : xchk_type_string(sc->sm->sm_type),
183 : offset,
184 : *error,
185 : errflag,
186 : ret_ip);
187 0 : *error = 0;
188 0 : fallthrough;
189 0 : default:
190 0 : if (*error)
191 0 : xchk_whine(sc->mp, "ino 0x%llx fork %d type %s offset %llu error %d ret_ip %pS",
192 0 : sc->ip->i_ino,
193 : whichfork,
194 0 : xchk_type_string(sc->sm->sm_type),
195 : offset,
196 : *error,
197 : ret_ip);
198 0 : trace_xchk_file_op_error(sc, whichfork, offset, *error,
199 : ret_ip);
200 0 : break;
201 : }
202 : return false;
203 : }
204 :
205 : bool
206 885302899 : xchk_fblock_process_error(
207 : struct xfs_scrub *sc,
208 : int whichfork,
209 : xfs_fileoff_t offset,
210 : int *error)
211 : {
212 885382042 : return __xchk_fblock_process_error(sc, whichfork, offset, error,
213 : XFS_SCRUB_OFLAG_CORRUPT, __return_address);
214 : }
215 :
216 : bool
217 634789887 : xchk_fblock_xref_process_error(
218 : struct xfs_scrub *sc,
219 : int whichfork,
220 : xfs_fileoff_t offset,
221 : int *error)
222 : {
223 634789887 : return __xchk_fblock_process_error(sc, whichfork, offset, error,
224 : XFS_SCRUB_OFLAG_XFAIL, __return_address);
225 : }
226 :
227 : /*
228 : * Handling scrub corruption/optimization/warning checks.
229 : *
230 : * The *_set_{corrupt,preen,warning}() family of functions are used to
231 : * record the presence of metadata that is incorrect (corrupt), could be
232 : * optimized somehow (preen), or should be flagged for administrative
233 : * review but is not incorrect (warn).
234 : *
235 : * ftrace can be used to record the precise metadata location and
236 : * approximate code location of the failed check.
237 : */
238 :
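/*
 * For example, a hypothetical check would record its verdict with one
 * of these setters and keep going; the outcome travels to userspace in
 * sm_flags rather than in a return code:
 *
 *	if (be32_to_cpu(agf->agf_freeblks) > pag->block_count)
 *		xchk_block_set_corrupt(sc, sc->sa.agf_bp);
 */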
239 : /* Record a block which could be optimized. */
240 : void
241 1480544 : xchk_block_set_preen(
242 : struct xfs_scrub *sc,
243 : struct xfs_buf *bp)
244 : {
245 1480544 : sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
246 1480544 : trace_xchk_block_preen(sc, xfs_buf_daddr(bp), __return_address);
247 1480392 : }
248 :
249 : /*
250 : * Record an inode which could be optimized. No buffer is passed in,
251 : * so the trace data will identify the inode by number rather than
252 : * by block location.
253 : */
254 : void
255 1970585 : xchk_ino_set_preen(
256 : struct xfs_scrub *sc,
257 : xfs_ino_t ino)
258 : {
259 1970585 : sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
260 1970585 : trace_xchk_ino_preen(sc, ino, __return_address);
261 1970544 : }
262 :
263 : /* Record something being wrong with the filesystem primary superblock. */
264 : void
265 0 : xchk_set_corrupt(
266 : struct xfs_scrub *sc)
267 : {
268 0 : sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
269 0 : xchk_whine(sc->mp, "type %s ret_ip %pS", xchk_type_string(sc->sm->sm_type),
270 : __return_address);
271 0 : trace_xchk_fs_error(sc, 0, __return_address);
272 0 : }
273 :
274 : /* Record a corrupt block. */
275 : void
276 0 : xchk_block_set_corrupt(
277 : struct xfs_scrub *sc,
278 : struct xfs_buf *bp)
279 : {
280 0 : sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
281 0 : trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address);
282 0 : xchk_whine(sc->mp, "type %s agno 0x%x agbno 0x%x ret_ip %pS",
283 0 : xchk_type_string(sc->sm->sm_type),
284 : xfs_daddr_to_agno(sc->mp, xfs_buf_daddr(bp)),
285 : xfs_daddr_to_agbno(sc->mp, xfs_buf_daddr(bp)),
286 : __return_address);
287 0 : }
288 :
289 : #ifdef CONFIG_XFS_QUOTA
290 : /* Record a corrupt quota counter. */
291 : void
292 0 : xchk_qcheck_set_corrupt(
293 : struct xfs_scrub *sc,
294 : unsigned int dqtype,
295 : xfs_dqid_t id)
296 : {
297 0 : sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
298 0 : xchk_whine(sc->mp, "type %s dqtype %u id %u ret_ip %pS",
299 0 : xchk_type_string(sc->sm->sm_type), dqtype, id, __return_address);
300 0 : trace_xchk_qcheck_error(sc, dqtype, id, __return_address);
301 0 : }
302 : #endif /* CONFIG_XFS_QUOTA */
303 :
304 : /* Record a corruption while cross-referencing. */
305 : void
306 0 : xchk_block_xref_set_corrupt(
307 : struct xfs_scrub *sc,
308 : struct xfs_buf *bp)
309 : {
310 0 : sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
311 0 : trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address);
312 0 : xchk_whine(sc->mp, "type %s agno 0x%x agbno 0x%x ret_ip %pS",
313 0 : xchk_type_string(sc->sm->sm_type),
314 : xfs_daddr_to_agno(sc->mp, xfs_buf_daddr(bp)),
315 : xfs_daddr_to_agbno(sc->mp, xfs_buf_daddr(bp)),
316 : __return_address);
317 0 : }
318 :
319 : /*
320 : * Record a corrupt inode. No buffer is passed in, so the trace
321 : * data will identify the inode by number rather than by block
322 : * location.
323 : */
324 : void
325 0 : xchk_ino_set_corrupt(
326 : struct xfs_scrub *sc,
327 : xfs_ino_t ino)
328 : {
329 0 : sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
330 0 : xchk_whine(sc->mp, "ino 0x%llx type %s ret_ip %pS",
331 0 : ino, xchk_type_string(sc->sm->sm_type), __return_address);
332 0 : trace_xchk_ino_error(sc, ino, __return_address);
333 0 : }
334 :
335 : /* Record a corruption while cross-referencing with an inode. */
336 : void
337 0 : xchk_ino_xref_set_corrupt(
338 : struct xfs_scrub *sc,
339 : xfs_ino_t ino)
340 : {
341 0 : sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
342 0 : xchk_whine(sc->mp, "ino 0x%llx type %s ret_ip %pS",
343 0 : ino, xchk_type_string(sc->sm->sm_type), __return_address);
344 0 : trace_xchk_ino_error(sc, ino, __return_address);
345 0 : }
346 :
347 : /* Record corruption in a block indexed by a file fork. */
348 : void
349 23 : xchk_fblock_set_corrupt(
350 : struct xfs_scrub *sc,
351 : int whichfork,
352 : xfs_fileoff_t offset)
353 : {
354 23 : sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
355 23 : xchk_whine(sc->mp, "ino 0x%llx fork %d type %s offset %llu ret_ip %pS",
356 23 : sc->ip->i_ino,
357 : whichfork,
358 23 : xchk_type_string(sc->sm->sm_type),
359 : offset,
360 : __return_address);
361 23 : trace_xchk_fblock_error(sc, whichfork, offset, __return_address);
362 23 : }
363 :
364 : /* Record a corruption while cross-referencing a fork block. */
365 : void
366 10 : xchk_fblock_xref_set_corrupt(
367 : struct xfs_scrub *sc,
368 : int whichfork,
369 : xfs_fileoff_t offset)
370 : {
371 10 : sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
372 10 : xchk_whine(sc->mp, "ino 0x%llx fork %d type %s offset %llu ret_ip %pS",
373 10 : sc->ip->i_ino,
374 : whichfork,
375 10 : xchk_type_string(sc->sm->sm_type),
376 : offset,
377 : __return_address);
378 10 : trace_xchk_fblock_error(sc, whichfork, offset, __return_address);
379 10 : }
380 :
381 : /*
382 : * Warn about an inode that needs administrative review but is not
383 : * incorrect.
384 : */
385 : void
386 0 : xchk_ino_set_warning(
387 : struct xfs_scrub *sc,
388 : xfs_ino_t ino)
389 : {
390 0 : sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING;
391 0 : xchk_whine(sc->mp, "ino 0x%llx type %s ret_ip %pS",
392 0 : ino, xchk_type_string(sc->sm->sm_type), __return_address);
393 0 : trace_xchk_ino_warning(sc, ino, __return_address);
394 0 : }
395 :
396 : /* Warn about a block indexed by a file fork that needs review. */
397 : void
398 147 : xchk_fblock_set_warning(
399 : struct xfs_scrub *sc,
400 : int whichfork,
401 : xfs_fileoff_t offset)
402 : {
403 147 : sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING;
404 147 : xchk_whine(sc->mp, "ino 0x%llx fork %d type %s offset %llu ret_ip %pS",
405 147 : sc->ip->i_ino,
406 : whichfork,
407 147 : xchk_type_string(sc->sm->sm_type),
408 : offset,
409 : __return_address);
410 147 : trace_xchk_fblock_warning(sc, whichfork, offset, __return_address);
411 147 : }
412 :
413 : /* Signal an incomplete scrub. */
414 : void
415 279 : xchk_set_incomplete(
416 : struct xfs_scrub *sc)
417 : {
418 279 : sc->sm->sm_flags |= XFS_SCRUB_OFLAG_INCOMPLETE;
419 279 : trace_xchk_incomplete(sc, __return_address);
420 279 : }
421 :
422 : /*
423 : * rmap scrubbing -- compute the number of blocks with a given owner,
424 : * at least according to the reverse mapping data.
425 : */
426 :
427 : struct xchk_rmap_ownedby_info {
428 : const struct xfs_owner_info *oinfo;
429 : xfs_filblks_t *blocks;
430 : };
431 :
432 : STATIC int
433 7603199393 : xchk_count_rmap_ownedby_irec(
434 : struct xfs_btree_cur *cur,
435 : const struct xfs_rmap_irec *rec,
436 : void *priv)
437 : {
438 7603199393 : struct xchk_rmap_ownedby_info *sroi = priv;
439 7603199393 : bool irec_attr;
440 7603199393 : bool oinfo_attr;
441 :
442 7603199393 : irec_attr = rec->rm_flags & XFS_RMAP_ATTR_FORK;
443 7603199393 : oinfo_attr = sroi->oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK;
444 :
445 7603199393 : if (rec->rm_owner != sroi->oinfo->oi_owner)
446 : return 0;
447 :
448 37072509 : if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) || irec_attr == oinfo_attr)
449 37072509 : (*sroi->blocks) += rec->rm_blockcount;
450 :
451 : return 0;
452 : }
453 :
454 : /*
455 : * Calculate the number of blocks the rmap thinks are owned by something.
456 : * The caller should pass us an rmapbt cursor.
457 : */
458 : int
459 1194720 : xchk_count_rmap_ownedby_ag(
460 : struct xfs_scrub *sc,
461 : struct xfs_btree_cur *cur,
462 : const struct xfs_owner_info *oinfo,
463 : xfs_filblks_t *blocks)
464 : {
465 1194720 : struct xchk_rmap_ownedby_info sroi = {
466 : .oinfo = oinfo,
467 : .blocks = blocks,
468 : };
469 :
470 1194720 : *blocks = 0;
471 1194720 : return xfs_rmap_query_all(cur, xchk_count_rmap_ownedby_irec,
472 : &sroi);
473 : }
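/*
 * Example of a cross-reference using this helper (hypothetical sketch,
 * not a caller in this file):
 *
 *	struct xfs_owner_info	oinfo;
 *	xfs_filblks_t		blocks;
 *
 *	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
 *	error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, &oinfo,
 *			&blocks);
 *	if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
 *		return;
 */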
474 :
475 : /*
476 : * AG scrubbing
477 : *
478 : * These helpers facilitate locking an allocation group's header
479 : * buffers, setting up cursors for all btrees that are present, and
480 : * cleaning everything up once we're through.
481 : */
482 :
483 : /* Decide if we want to return an AG header read failure. */
484 : static inline bool
485 : want_ag_read_header_failure(
486 : struct xfs_scrub *sc,
487 : unsigned int type)
488 : {
489 : /* Return all AG header read failures when scanning btrees. */
490 0 : if (sc->sm->sm_type != XFS_SCRUB_TYPE_AGF &&
491 0 : sc->sm->sm_type != XFS_SCRUB_TYPE_AGFL &&
492 : sc->sm->sm_type != XFS_SCRUB_TYPE_AGI)
493 : return true;
494 : /*
495 : * If we're scanning a given type of AG header, we only want to
496 : * see read failures from that specific header. We'd like the
497 : * other headers to cross-check them, but this isn't required.
498 : */
499 0 : if (sc->sm->sm_type == type)
500 : return true;
501 : return false;
502 : }
503 :
504 : /*
505 : * Grab the AG header buffers for the attached perag structure.
506 : *
507 : * The headers should be released by xchk_ag_free, but as a failsafe we attach
508 : * all the buffers we grab to the scrub transaction so they'll all be freed
509 : * when we cancel it.
510 : */
511 : static inline int
512 441668254 : xchk_perag_read_headers(
513 : struct xfs_scrub *sc,
514 : struct xchk_ag *sa)
515 : {
516 441668254 : int error;
517 :
518 441668254 : error = xfs_ialloc_read_agi(sa->pag, sc->tp, &sa->agi_bp);
519 443312096 : if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI))
520 : return error;
521 :
522 443312096 : error = xfs_alloc_read_agf(sa->pag, sc->tp, 0, &sa->agf_bp);
523 443382986 : if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF))
524 0 : return error;
525 :
526 : return 0;
527 : }
528 :
529 : /*
530 : * Grab the AG headers for the attached perag structure and wait for pending
531 : * intents to drain.
532 : */
533 : int
534 442318538 : xchk_perag_drain_and_lock(
535 : struct xfs_scrub *sc)
536 : {
537 442318538 : struct xchk_ag *sa = &sc->sa;
538 442318538 : int error = 0;
539 :
540 442318538 : ASSERT(sa->pag != NULL);
541 442318538 : ASSERT(sa->agi_bp == NULL);
542 442318538 : ASSERT(sa->agf_bp == NULL);
543 :
544 442369130 : do {
545 442369130 : if (xchk_should_terminate(sc, &error))
546 9 : return error;
547 :
548 441754829 : error = xchk_perag_read_headers(sc, sa);
549 443324277 : if (error)
550 0 : return error;
551 :
552 : /*
553 : * If we've grabbed an inode for scrubbing then we assume that
554 : * holding its ILOCK will suffice to coordinate with any intent
555 : * chains involving this inode.
556 : */
557 443324277 : if (sc->ip)
558 : return 0;
559 :
560 : /*
561 : * Decide if this AG is quiet enough for all metadata to be
562 : * consistent with each other. XFS allows the AG header buffer
563 : * locks to cycle across transaction rolls while processing
564 : * chains of deferred ops, which means that there could be
565 : * other threads in the middle of processing a chain of
566 : * deferred ops. For regular operations we are careful about
567 : * ordering operations to prevent collisions between threads
568 : * (which is why we don't need a per-AG lock), but scrub and
569 : * repair have to serialize against chained operations.
570 : *
571 : * We just locked all the AG header buffers; now take a look
572 : * to see if there are any intents in progress. If there are,
573 : * drop the AG headers and wait for the intents to drain.
574 : * Since we hold all the AG header locks for the duration of
575 : * the scrub, this is the only time we have to sample the
576 : * intents counter; any threads increasing it after this point
577 : * can't possibly be in the middle of a chain of AG metadata
578 : * updates.
579 : *
580 : * Obviously, this should be slanted against scrub and in favor
581 : * of runtime threads.
582 : */
583 4528953 : if (!xfs_perag_intent_busy(sa->pag))
584 : return 0;
585 :
586 172584 : if (sa->agf_bp) {
587 172584 : xfs_trans_brelse(sc->tp, sa->agf_bp);
588 172584 : sa->agf_bp = NULL;
589 : }
590 :
591 172584 : if (sa->agi_bp) {
592 172584 : xfs_trans_brelse(sc->tp, sa->agi_bp);
593 172584 : sa->agi_bp = NULL;
594 : }
595 :
596 172584 : if (!(sc->flags & XCHK_FSGATES_DRAIN))
597 : return -ECHRNG;
598 50592 : error = xfs_perag_intent_drain(sa->pag);
599 50592 : if (error == -ERESTARTSYS)
600 0 : error = -EINTR;
601 50592 : } while (!error);
602 :
603 : return error;
604 : }
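/*
 * Sketch of the assumed retry contract for the -ECHRNG return above:
 * the scrub core enables the drain fsgate and restarts the operation
 * from the top, e.g.:
 *
 *	error = xchk_perag_drain_and_lock(sc);
 *	if (error == -ECHRNG) {
 *		xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
 *		... tear down and rerun this scrubber ...
 *	}
 */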
605 :
606 : /*
607 : * Grab the per-AG structure, grab all AG header buffers, and wait until there
608 : * aren't any pending intents. Returns -ENOENT if we can't grab the perag
609 : * structure.
610 : */
611 : int
612 440824348 : xchk_ag_read_headers(
613 : struct xfs_scrub *sc,
614 : xfs_agnumber_t agno,
615 : struct xchk_ag *sa)
616 : {
617 440824348 : struct xfs_mount *mp = sc->mp;
618 :
619 440824348 : ASSERT(!sa->pag);
620 440824348 : sa->pag = xfs_perag_get(mp, agno);
621 442682549 : if (!sa->pag)
622 : return -ENOENT;
623 :
624 442682549 : return xchk_perag_drain_and_lock(sc);
625 : }
626 :
627 : /* Release all the AG btree cursors. */
628 : void
629 1603082813 : xchk_ag_btcur_free(
630 : struct xchk_ag *sa)
631 : {
632 1603082813 : if (sa->refc_cur)
633 343936485 : xfs_btree_del_cursor(sa->refc_cur, XFS_BTREE_ERROR);
634 1603230621 : if (sa->rmap_cur)
635 344189864 : xfs_btree_del_cursor(sa->rmap_cur, XFS_BTREE_ERROR);
636 1603120230 : if (sa->fino_cur)
637 455698374 : xfs_btree_del_cursor(sa->fino_cur, XFS_BTREE_ERROR);
638 1603655692 : if (sa->ino_cur)
639 456225523 : xfs_btree_del_cursor(sa->ino_cur, XFS_BTREE_ERROR);
640 1603732702 : if (sa->cnt_cur)
641 456408871 : xfs_btree_del_cursor(sa->cnt_cur, XFS_BTREE_ERROR);
642 1603560326 : if (sa->bno_cur)
643 456312042 : xfs_btree_del_cursor(sa->bno_cur, XFS_BTREE_ERROR);
644 :
645 1603635400 : sa->refc_cur = NULL;
646 1603635400 : sa->rmap_cur = NULL;
647 1603635400 : sa->fino_cur = NULL;
648 1603635400 : sa->ino_cur = NULL;
649 1603635400 : sa->bno_cur = NULL;
650 1603635400 : sa->cnt_cur = NULL;
651 1603635400 : }
652 :
653 : /* Initialize all the btree cursors for an AG. */
654 : void
655 442628447 : xchk_ag_btcur_init(
656 : struct xfs_scrub *sc,
657 : struct xchk_ag *sa)
658 : {
659 442628447 : struct xfs_mount *mp = sc->mp;
660 :
661 885500148 : if (sa->agf_bp &&
662 442926685 : xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_BNO)) {
663 : /* Set up a bnobt cursor for cross-referencing. */
664 442864809 : sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
665 : sa->pag, XFS_BTNUM_BNO);
666 : }
667 :
668 886334168 : if (sa->agf_bp &&
669 443084062 : xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_CNT)) {
670 : /* Set up a cntbt cursor for cross-referencing. */
671 443376020 : sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
672 : sa->pag, XFS_BTNUM_CNT);
673 : }
674 :
676 : /* Set up an inobt cursor for cross-referencing. */
676 886658901 : if (sa->agi_bp &&
677 443445443 : xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_INO)) {
678 443367105 : sa->ino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, sa->agi_bp,
679 : XFS_BTNUM_INO);
680 : }
681 :
682 : /* Set up a finobt cursor for cross-referencing. */
683 886517036 : if (sa->agi_bp && xfs_has_finobt(mp) &&
684 443190674 : xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_FINO)) {
685 443306045 : sa->fino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, sa->agi_bp,
686 : XFS_BTNUM_FINO);
687 : }
688 :
689 : /* Set up a rmapbt cursor for cross-referencing. */
690 774612317 : if (sa->agf_bp && xfs_has_rmapbt(mp) &&
691 331286631 : xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_RMAP)) {
692 331171576 : sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp,
693 : sa->pag);
694 : }
695 :
696 : /* Set up a refcountbt cursor for cross-referencing. */
697 774531507 : if (sa->agf_bp && xfs_has_reflink(mp) &&
698 331157029 : xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_REFC)) {
699 331169101 : sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
700 : sa->agf_bp, sa->pag);
701 : }
702 443339357 : }
703 :
704 : /* Release the AG header context and btree cursors. */
705 : void
706 1554419700 : xchk_ag_free(
707 : struct xfs_scrub *sc,
708 : struct xchk_ag *sa)
709 : {
710 1554419700 : xchk_ag_btcur_free(sa);
711 1554083626 : xrep_reset_perag_resv(sc);
712 1556338311 : if (sa->agf_bp) {
713 456083302 : xfs_trans_brelse(sc->tp, sa->agf_bp);
714 456313961 : sa->agf_bp = NULL;
715 : }
716 1556568970 : if (sa->agi_bp) {
717 456310738 : xfs_trans_brelse(sc->tp, sa->agi_bp);
718 456344161 : sa->agi_bp = NULL;
719 : }
720 1556602393 : if (sa->pag) {
721 456465131 : xfs_perag_put(sa->pag);
722 456408126 : sa->pag = NULL;
723 : }
724 1556545388 : }
725 :
726 : /*
727 : * For scrub, grab the perag structure, the AGI, and the AGF headers, in that
728 : * order. Locking order requires us to get the AGI before the AGF. We use the
729 : * transaction to avoid deadlocking on crosslinked metadata buffers; either the
730 : * caller passes one in (bmap scrub) or we have to create a transaction
731 : * ourselves. Returns ENOENT if the perag struct cannot be grabbed.
732 : */
733 : int
734 438934173 : xchk_ag_init(
735 : struct xfs_scrub *sc,
736 : xfs_agnumber_t agno,
737 : struct xchk_ag *sa)
738 : {
739 438934173 : int error;
740 :
741 438934173 : error = xchk_ag_read_headers(sc, agno, sa);
742 441271531 : if (error)
743 : return error;
744 :
745 441193183 : xchk_ag_btcur_init(sc, sa);
746 441193183 : return 0;
747 : }
748 :
749 : /* Per-scrubber setup functions */
750 :
751 : void
752 226541150 : xchk_trans_cancel(
753 : struct xfs_scrub *sc)
754 : {
755 226541150 : xfs_trans_cancel(sc->tp);
756 226550513 : sc->tp = NULL;
757 0 : }
758 :
759 : int
760 107405304 : xchk_trans_alloc_empty(
761 : struct xfs_scrub *sc)
762 : {
763 107405304 : return xfs_trans_alloc_empty(sc->mp, &sc->tp);
764 : }
765 :
766 : /*
767 : * Grab an empty transaction so that we can re-grab locked buffers if
768 : * one of our btrees turns out to be cyclic.
769 : *
770 : * If we're going to repair something, we need to ask for the largest possible
771 : * log reservation so that we can handle the worst case scenario for metadata
772 : * updates while rebuilding a metadata item. We also need to reserve as many
773 : * blocks in the head transaction as we think we're going to need to rebuild
774 : * the metadata object.
775 : */
776 : int
777 1108143053 : xchk_trans_alloc(
778 : struct xfs_scrub *sc,
779 : uint resblks)
780 : {
781 1108143053 : if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
782 87100140 : return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate,
783 : resblks, 0, 0, &sc->tp);
784 :
785 1021042913 : return xchk_trans_alloc_empty(sc);
786 : }
787 :
788 : /* Set us up with a transaction and an empty context. */
789 : int
790 4839978 : xchk_setup_fs(
791 : struct xfs_scrub *sc)
792 : {
793 4839978 : uint resblks;
794 :
795 4839978 : resblks = xrep_calc_ag_resblks(sc);
796 4839202 : return xchk_trans_alloc(sc, resblks);
797 : }
798 :
799 : /* Set us up with AG headers and btree cursors. */
800 : int
801 1740734 : xchk_setup_ag_btree(
802 : struct xfs_scrub *sc,
803 : bool force_log)
804 : {
805 1740734 : struct xfs_mount *mp = sc->mp;
806 1740734 : int error;
807 :
808 : /*
809 : * If the caller asks us to checkpont the log, do so. This
810 : * expensive operation should be performed infrequently and only
811 : * as a last resort. Any caller that sets force_log should
812 : * document why they need to do so.
813 : */
814 1740734 : if (force_log) {
815 0 : error = xchk_checkpoint_log(mp);
816 0 : if (error)
817 : return error;
818 : }
819 :
820 1740734 : error = xchk_setup_fs(sc);
821 1741702 : if (error)
822 : return error;
823 :
824 1740523 : return xchk_ag_init(sc, sc->sm->sm_agno, &sc->sa);
825 : }
826 :
827 : /* Push everything out of the log onto disk. */
828 : int
829 0 : xchk_checkpoint_log(
830 : struct xfs_mount *mp)
831 : {
832 0 : int error;
833 :
834 0 : error = xfs_log_force(mp, XFS_LOG_SYNC);
835 0 : if (error)
836 : return error;
837 0 : xfs_ail_push_all_sync(mp->m_ail);
838 0 : return 0;
839 : }
840 :
841 : /* Verify that an inode is allocated ondisk, then return its cached inode. */
842 : int
843 2810613077 : xchk_iget(
844 : struct xfs_scrub *sc,
845 : xfs_ino_t inum,
846 : struct xfs_inode **ipp)
847 : {
848 2810613077 : return xfs_iget(sc->mp, sc->tp, inum, XFS_IGET_UNTRUSTED, 0, ipp);
849 : }
850 :
851 : /*
852 : * Try to grab an inode in a manner that avoids races with physical inode
853 : * allocation. If we can't, return the locked AGI buffer so that the caller
854 : * can single-step the loading process to see where things went wrong.
855 : * Callers must have a valid scrub transaction.
856 : *
857 : * If the iget succeeds, return 0, a NULL AGI, and the inode.
858 : *
859 : * If the iget fails, return the error, the locked AGI, and a NULL inode. This
860 : * can include -EINVAL and -ENOENT for invalid inode numbers or inodes that are
861 : * no longer allocated; or any other corruption or runtime error.
862 : *
863 : * If the AGI read fails, return the error, a NULL AGI, and NULL inode.
864 : *
865 : * If a fatal signal is pending, return -EINTR, a NULL AGI, and a NULL inode.
866 : */
867 : int
868 439354 : xchk_iget_agi(
869 : struct xfs_scrub *sc,
870 : xfs_ino_t inum,
871 : struct xfs_buf **agi_bpp,
872 : struct xfs_inode **ipp)
873 : {
874 439354 : struct xfs_mount *mp = sc->mp;
875 439354 : struct xfs_trans *tp = sc->tp;
876 439354 : struct xfs_perag *pag;
877 439354 : int error;
878 :
879 439354 : ASSERT(sc->tp != NULL);
880 :
881 439354 : again:
882 439592 : *agi_bpp = NULL;
883 439592 : *ipp = NULL;
884 439592 : error = 0;
885 :
886 439592 : if (xchk_should_terminate(sc, &error))
887 1 : return error;
888 :
889 : /*
890 : * Attach the AGI buffer to the scrub transaction to avoid deadlocks
891 : * in the iget cache miss path.
892 : */
893 439591 : pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
894 439591 : error = xfs_ialloc_read_agi(pag, tp, agi_bpp);
895 439591 : xfs_perag_put(pag);
896 439591 : if (error)
897 0 : return error;
898 :
899 439591 : error = xfs_iget(mp, tp, inum,
900 : XFS_IGET_NORETRY | XFS_IGET_UNTRUSTED, 0, ipp);
901 439591 : if (error == -EAGAIN) {
902 : /*
903 : * The inode may be in core but temporarily unavailable and may
904 : * require the AGI buffer before it can be returned. Drop the
905 : * AGI buffer and retry the lookup.
906 : *
907 : * Incore lookup will fail with EAGAIN on a cache hit if the
908 : * inode is queued to the inactivation list. The inactivation
909 : * worker may remove the inode from the unlinked list and hence
910 : * needs the AGI.
911 : *
912 : * Hence xchk_iget_agi() needs to drop the AGI lock on EAGAIN
913 : * to allow inodegc to make progress and move the inode to
914 : * IRECLAIMABLE state where xfs_iget will be able to return it
915 : * again if it can lock the inode.
916 : */
917 238 : xfs_trans_brelse(tp, *agi_bpp);
918 238 : delay(1);
919 238 : goto again;
920 : }
921 439353 : if (error)
922 : return error;
923 :
924 : /* We got the inode, so we can release the AGI. */
925 427183 : ASSERT(*ipp != NULL);
926 427183 : xfs_trans_brelse(tp, *agi_bpp);
927 427183 : *agi_bpp = NULL;
928 427183 : return 0;
929 : }
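/*
 * A caller handling the outcomes enumerated above might look like this
 * (hypothetical sketch, compressed from xchk_iget_for_scrubbing below):
 *
 *	error = xchk_iget_agi(sc, inum, &agi_bp, &ip);
 *	if (!error)
 *		return xchk_install_handle_inode(sc, ip);
 *	if (error == -EINVAL && agi_bp != NULL)
 *		... probe the inobt while the AGI is still locked ...
 */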
930 :
931 : #ifdef CONFIG_XFS_QUOTA
932 : /*
933 : * Try to attach dquots to this inode if we think we might want to repair it.
934 : * Callers must not hold any ILOCKs. If the dquots are broken and cannot be
935 : * attached, a quotacheck will be scheduled.
936 : */
937 : int
938 1086978065 : xchk_ino_dqattach(
939 : struct xfs_scrub *sc)
940 : {
941 1086978065 : ASSERT(sc->tp != NULL);
942 1086978065 : ASSERT(sc->ip != NULL);
943 :
944 2173956130 : if (!xchk_could_repair(sc))
945 : return 0;
946 :
947 48109200 : return xrep_ino_dqattach(sc);
948 : }
949 : #endif
950 :
951 : /* Install an inode that we opened by handle for scrubbing. */
952 : int
953 683518532 : xchk_install_handle_inode(
954 : struct xfs_scrub *sc,
955 : struct xfs_inode *ip)
956 : {
957 683518532 : if (VFS_I(ip)->i_generation != sc->sm->sm_gen) {
958 1216441 : xchk_irele(sc, ip);
959 1216441 : return -ENOENT;
960 : }
961 :
962 682302091 : sc->ip = ip;
963 682302091 : return 0;
964 : }
965 :
966 : /*
967 : * Install an already-referenced inode for scrubbing. Get our own reference to
968 : * the inode to make disposal simpler. The inode must not be in I_FREEING or
969 : * I_WILL_FREE state!
970 : */
971 : int
972 405549045 : xchk_install_live_inode(
973 : struct xfs_scrub *sc,
974 : struct xfs_inode *ip)
975 : {
976 405549045 : if (!igrab(VFS_I(ip))) {
977 0 : xchk_ino_set_corrupt(sc, ip->i_ino);
978 0 : return -EFSCORRUPTED;
979 : }
980 :
981 407734942 : sc->ip = ip;
982 407734942 : return 0;
983 : }
984 :
985 : /*
986 : * In preparation to scrub metadata structures that hang off of an inode,
987 : * grab either the inode referenced in the scrub control structure or the
988 : * inode passed in. If the inumber does not reference an allocated inode
989 : * record, the function returns ENOENT to end the scrub early. The inode
990 : * is not locked.
991 : */
992 : int
993 912696641 : xchk_iget_for_scrubbing(
994 : struct xfs_scrub *sc)
995 : {
996 912696641 : struct xfs_imap imap;
997 912696641 : struct xfs_mount *mp = sc->mp;
998 912696641 : struct xfs_perag *pag;
999 912696641 : struct xfs_buf *agi_bp;
1000 912696641 : struct xfs_inode *ip_in = XFS_I(file_inode(sc->file));
1001 912696641 : struct xfs_inode *ip = NULL;
1002 912696641 : xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, sc->sm->sm_ino);
1003 912696641 : int error;
1004 :
1005 912696641 : ASSERT(sc->tp == NULL);
1006 :
1007 : /* We want to scan the inode we already had opened. */
1008 912696641 : if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino)
1009 338560831 : return xchk_install_live_inode(sc, ip_in);
1010 :
1011 : /* Reject internal metadata files and obviously bad inode numbers. */
1012 574135810 : if (xfs_internal_inum(mp, sc->sm->sm_ino))
1013 : return -ENOENT;
1014 572875280 : if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino))
1015 : return -ENOENT;
1016 :
1017 : /* Try a regular untrusted iget. */
1018 573714164 : error = xchk_iget(sc, sc->sm->sm_ino, &ip);
1019 574973540 : if (!error)
1020 571743069 : return xchk_install_handle_inode(sc, ip);
1021 3230471 : if (error == -ENOENT)
1022 : return error;
1023 9441 : if (error != -EINVAL)
1024 0 : goto out_error;
1025 :
1026 : /*
1027 : * EINVAL with IGET_UNTRUSTED probably means one of several things:
1028 : * userspace gave us an inode number that doesn't correspond to fs
1029 : * space; the inode btree lacks a record for this inode; or there is a
1030 : * record, and it says this inode is free.
1031 : *
1032 : * We want to look up this inode in the inobt to distinguish two
1033 : * scenarios: (1) the inobt says the inode is free, in which case
1034 : * there's nothing to do; and (2) the inobt says the inode is
1035 : * allocated, but loading it failed due to corruption.
1036 : *
1037 : * Allocate a transaction and grab the AGI to prevent inobt activity
1038 : * in this AG. Retry the iget in case someone allocated a new inode
1039 : * after the first iget failed.
1040 : */
1041 9441 : error = xchk_trans_alloc(sc, 0);
1042 9441 : if (error)
1043 0 : goto out_error;
1044 :
1045 9441 : error = xchk_iget_agi(sc, sc->sm->sm_ino, &agi_bp, &ip);
1046 9441 : if (error == 0) {
1047 : /* Actually got the inode, so install it. */
1048 0 : xchk_trans_cancel(sc);
1049 0 : return xchk_install_handle_inode(sc, ip);
1050 : }
1051 9441 : if (error == -ENOENT)
1052 0 : goto out_gone;
1053 9441 : if (error != -EINVAL)
1054 0 : goto out_cancel;
1055 :
1056 : /* Ensure that we have protected against inode allocation/freeing. */
1057 9441 : if (agi_bp == NULL) {
1058 0 : ASSERT(agi_bp != NULL);
1059 0 : error = -ECANCELED;
1060 0 : goto out_cancel;
1061 : }
1062 :
1063 : /*
1064 : * Untrusted iget failed a second time. Let's try an inobt lookup.
1066 : * If the inobt thinks this inode cannot exist inside the
1067 : * filesystem or is not allocated, return ENOENT to signal that the check
1067 : * can be skipped.
1068 : *
1069 : * If the lookup returns corruption, we'll mark this inode corrupt and
1070 : * exit to userspace. There's little chance of fixing anything until
1071 : * the inobt is straightened out, but there's nothing we can do here.
1072 : *
1073 : * If the lookup encounters any other error, exit to userspace.
1074 : *
1075 : * If the lookup succeeds, something else must be very wrong in the fs
1076 : * such that setting up the incore inode failed in some strange way.
1077 : * Treat those as corruptions.
1078 : */
1079 9441 : pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino));
1080 9441 : if (!pag) {
1081 0 : error = -EFSCORRUPTED;
1082 0 : goto out_cancel;
1083 : }
1084 :
1085 9441 : error = xfs_imap(pag, sc->tp, sc->sm->sm_ino, &imap,
1086 : XFS_IGET_UNTRUSTED);
1087 9441 : xfs_perag_put(pag);
1088 9441 : if (error == -EINVAL || error == -ENOENT)
1089 9441 : goto out_gone;
1090 0 : if (!error)
1091 0 : error = -EFSCORRUPTED;
1092 :
1093 0 : out_cancel:
1094 0 : xchk_trans_cancel(sc);
1095 0 : out_error:
1096 0 : xchk_whine(mp, "type %s agno 0x%x agbno 0x%x error %d ret_ip %pS",
1097 0 : xchk_type_string(sc->sm->sm_type), agno,
1098 0 : XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino), error,
1099 : __return_address);
1100 0 : trace_xchk_op_error(sc, agno, XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino),
1101 : error, __return_address);
1102 0 : return error;
1103 9441 : out_gone:
1104 : /* The file is gone, so there's nothing to check. */
1105 9441 : xchk_trans_cancel(sc);
1106 9441 : return -ENOENT;
1107 : }
1108 :
1109 : /* Release an inode, possibly dropping it in the process. */
1110 : void
1111 40336141964 : xchk_irele(
1112 : struct xfs_scrub *sc,
1113 : struct xfs_inode *ip)
1114 : {
1115 40336141964 : if (current->journal_info != NULL) {
1116 39110155127 : ASSERT(current->journal_info == sc->tp);
1117 :
1118 : /*
1119 : * If we are in a transaction, we /cannot/ drop the inode
1120 : * ourselves, because the VFS will trigger writeback, which
1121 : * can require a transaction. Clear DONTCACHE to force the
1122 : * inode to the LRU, where someone else can take care of
1123 : * dropping it.
1124 : *
1125 : * Note that when we grabbed our reference to the inode, it
1126 : * could have had an active ref and DONTCACHE set if a sysadmin
1127 : * is trying to coerce a change in file access mode. icache
1128 : * hits do not clear DONTCACHE, so we must do it here.
1129 : */
1130 39110155127 : spin_lock(&VFS_I(ip)->i_lock);
1131 39400989309 : VFS_I(ip)->i_state &= ~I_DONTCACHE;
1132 39400989309 : spin_unlock(&VFS_I(ip)->i_lock);
1133 1225986837 : } else if (atomic_read(&VFS_I(ip)->i_count) == 1) {
1134 : /*
1135 : * If this is the last reference to the inode and the caller
1136 : * permits it, set DONTCACHE to avoid thrashing.
1137 : */
1138 191758289 : d_mark_dontcache(VFS_I(ip));
1139 : }
1140 :
1141 40590165149 : xfs_irele(ip);
1142 40712853871 : }
1143 :
1144 : /*
1145 : * Set us up to scrub metadata mapped by a file's fork. Callers must not use
1146 : * this to operate on user-accessible regular file data because the MMAPLOCK is
1147 : * not taken.
1148 : */
1149 : int
1150 404980124 : xchk_setup_inode_contents(
1151 : struct xfs_scrub *sc,
1152 : unsigned int resblks)
1153 : {
1154 404980124 : int error;
1155 :
1156 404980124 : error = xchk_iget_for_scrubbing(sc);
1157 406599686 : if (error)
1158 : return error;
1159 :
1160 : /* Lock the inode so the VFS cannot touch this file. */
1161 404626894 : xchk_ilock(sc, XFS_IOLOCK_EXCL);
1162 :
1163 403831534 : error = xchk_trans_alloc(sc, resblks);
1164 403892392 : if (error)
1165 0 : goto out;
1166 :
1167 403892392 : error = xchk_ino_dqattach(sc);
1168 403748230 : if (error)
1169 0 : goto out;
1170 :
1171 403748230 : xchk_ilock(sc, XFS_ILOCK_EXCL);
1172 : out:
1173 : /* scrub teardown will unlock and release the inode for us */
1174 : return error;
1175 : }
1176 :
1177 : void
1178 1548868588 : xchk_ilock(
1179 : struct xfs_scrub *sc,
1180 : unsigned int ilock_flags)
1181 : {
1182 2357243712 : xfs_ilock(sc->ip, ilock_flags);
1183 403831534 : sc->ilock_flags |= ilock_flags;
1184 403908398 : }
1185 :
1186 : bool
1187 103334651 : xchk_ilock_nowait(
1188 : struct xfs_scrub *sc,
1189 : unsigned int ilock_flags)
1190 : {
1191 103334651 : if (xfs_ilock_nowait(sc->ip, ilock_flags)) {
1192 103329877 : sc->ilock_flags |= ilock_flags;
1193 103329877 : return true;
1194 : }
1195 :
1196 : return false;
1197 : }
1198 :
1199 : void
1200 1328528256 : xchk_iunlock(
1201 : struct xfs_scrub *sc,
1202 : unsigned int ilock_flags)
1203 : {
1204 1328528256 : sc->ilock_flags &= ~ilock_flags;
1205 1328528256 : xfs_iunlock(sc->ip, ilock_flags);
1206 1329206703 : }
1207 :
1208 : /*
1209 : * Predicate that decides if we need to evaluate the cross-reference check.
1210 : * If there was an error accessing the cross-reference btree, just delete
1211 : * the cursor and skip the check.
1212 : */
1213 : bool
1214 12060933202 : xchk_should_check_xref(
1215 : struct xfs_scrub *sc,
1216 : int *error,
1217 : struct xfs_btree_cur **curpp)
1218 : {
1219 : /* No point in xref if we already know we're corrupt. */
1220 12060933202 : if (xchk_skip_xref(sc->sm))
1221 : return false;
1222 :
1223 12060933202 : if (*error == 0)
1224 : return true;
1225 :
1226 0 : if (curpp) {
1227 : /* If we've already given up on xref, just bail out. */
1228 0 : if (!*curpp)
1229 : return false;
1230 :
1231 : /* xref error, delete cursor and bail out. */
1232 0 : xfs_btree_del_cursor(*curpp, XFS_BTREE_ERROR);
1233 0 : *curpp = NULL;
1234 : }
1235 :
1236 0 : sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL;
1237 0 : xchk_whine(sc->mp, "type %s xref error %d ret_ip %pS",
1238 0 : xchk_type_string(sc->sm->sm_type),
1239 : *error,
1240 : __return_address);
1241 0 : trace_xchk_xref_error(sc, *error, __return_address);
1242 :
1243 : /*
1244 : * Errors encountered during cross-referencing with another
1245 : * data structure should not cause this scrubber to abort.
1246 : */
1247 0 : *error = 0;
1248 0 : return false;
1249 : }
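/*
 * Example of a cross-reference helper built on this predicate
 * (hypothetical sketch; xfs_alloc_has_records and the
 * XBTREE_RECPACKING outcome are assumed from the allocbt query API):
 *
 *	error = xfs_alloc_has_records(sc->sa.bno_cur, agbno, len,
 *			&outcome);
 *	if (!xchk_should_check_xref(sc, &error, &sc->sa.bno_cur))
 *		return;
 *	if (outcome != XBTREE_RECPACKING_EMPTY)
 *		xchk_block_xref_set_corrupt(sc, sc->sa.agf_bp);
 */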
1250 :
1251 : /* Run the structure verifiers on in-memory buffers to detect bad memory. */
1252 : void
1253 118067238 : xchk_buffer_recheck(
1254 : struct xfs_scrub *sc,
1255 : struct xfs_buf *bp)
1256 : {
1257 118067238 : xfs_failaddr_t fa;
1258 :
1259 118067238 : if (bp->b_ops == NULL) {
1260 0 : xchk_block_set_corrupt(sc, bp);
1261 0 : return;
1262 : }
1263 118067238 : if (bp->b_ops->verify_struct == NULL) {
1264 0 : xchk_set_incomplete(sc);
1265 0 : return;
1266 : }
1267 118067238 : fa = bp->b_ops->verify_struct(bp);
1268 118222739 : if (!fa)
1269 : return;
1270 0 : sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
1271 0 : trace_xchk_block_error(sc, xfs_buf_daddr(bp), fa);
1272 0 : xchk_whine(sc->mp, "type %s agno 0x%x agbno 0x%x ret_ip %pS",
1273 0 : xchk_type_string(sc->sm->sm_type),
1274 : xfs_daddr_to_agno(sc->mp, xfs_buf_daddr(bp)),
1275 : xfs_daddr_to_agbno(sc->mp, xfs_buf_daddr(bp)),
1276 : fa);
1277 : }
1278 :
1279 : static inline int
1280 314752 : xchk_metadata_inode_subtype(
1281 : struct xfs_scrub *sc,
1282 : unsigned int scrub_type)
1283 : {
1284 314752 : __u32 smtype = sc->sm->sm_type;
1285 314752 : int error;
1286 :
1287 314752 : sc->sm->sm_type = scrub_type;
1288 :
1289 314752 : switch (scrub_type) {
1290 157368 : case XFS_SCRUB_TYPE_INODE:
1291 157368 : error = xchk_inode(sc);
1292 157368 : break;
1293 157384 : case XFS_SCRUB_TYPE_BMBTD:
1294 157384 : error = xchk_bmap_data(sc);
1295 157384 : break;
1296 0 : default:
1297 0 : ASSERT(0);
1298 0 : error = -EFSCORRUPTED;
1299 0 : break;
1300 : }
1301 :
1302 314768 : sc->sm->sm_type = smtype;
1303 314768 : return error;
1304 : }
1305 :
1306 : /*
1307 : * Scrub the attr/data forks of a metadata inode. The metadata inode must be
1308 : * pointed to by sc->ip and the ILOCK must be held.
1309 : */
1310 : int
1311 157372 : xchk_metadata_inode_forks(
1312 : struct xfs_scrub *sc)
1313 : {
1314 157372 : bool shared;
1315 157372 : int error;
1316 :
1317 157372 : if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
1318 : return 0;
1319 :
1320 : /* Check the inode record. */
1321 157370 : error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE);
1322 157384 : if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
1323 : return error;
1324 :
1325 : /* Metadata inodes don't live on the rt device. */
1326 157384 : if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) {
1327 0 : xchk_ino_set_corrupt(sc, sc->ip->i_ino);
1328 0 : return 0;
1329 : }
1330 :
1331 : /* They should never participate in reflink. */
1332 157384 : if (xfs_is_reflink_inode(sc->ip)) {
1333 0 : xchk_ino_set_corrupt(sc, sc->ip->i_ino);
1334 0 : return 0;
1335 : }
1336 :
1337 : /* They also should never have extended attributes. */
1338 157384 : if (xfs_inode_hasattr(sc->ip)) {
1339 0 : xchk_ino_set_corrupt(sc, sc->ip->i_ino);
1340 0 : return 0;
1341 : }
1342 :
1343 : /* Invoke the data fork scrubber. */
1344 157384 : error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD);
1345 157384 : if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
1346 : return error;
1347 :
1348 : /* Look for incorrect shared blocks. */
1349 157384 : if (xfs_has_reflink(sc->mp)) {
1350 79143 : error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip,
1351 : &shared);
1352 158286 : if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0,
1353 : &error))
1354 0 : return error;
1355 79143 : if (shared)
1356 0 : xchk_ino_set_corrupt(sc, sc->ip->i_ino);
1357 : }
1358 :
1359 : return 0;
1360 : }
1361 :
1362 : /*
1363 : * Enable filesystem hooks (i.e. runtime code patching) before starting a scrub
1364 : * operation. Callers must not hold any locks that intersect with the CPU
1365 : * hotplug lock (e.g. writeback locks) because code patching must halt the CPUs
1366 : * to change kernel code.
1367 : */
1368 : void
1369 48167070 : xchk_fsgates_enable(
1370 : struct xfs_scrub *sc,
1371 : unsigned int scrub_fsgates)
1372 : {
1373 48167070 : ASSERT(!(scrub_fsgates & ~XCHK_FSGATES_ALL));
1374 48167070 : ASSERT(!(sc->flags & scrub_fsgates));
1375 :
1376 48167070 : trace_xchk_fsgates_enable(sc, scrub_fsgates);
1377 :
1378 48153715 : if (scrub_fsgates & XCHK_FSGATES_DRAIN)
1379 122908 : xfs_drain_wait_enable();
1380 :
1381 48153715 : if (scrub_fsgates & XCHK_FSGATES_QUOTA)
1382 18677 : xfs_dqtrx_hook_enable();
1383 :
1384 48153715 : if (scrub_fsgates & XCHK_FSGATES_DIRENTS)
1385 47997254 : xfs_dir_hook_enable();
1386 :
1387 48208667 : if (scrub_fsgates & XCHK_FSGATES_RMAP)
1388 15937 : xfs_rmap_hook_enable();
1389 :
1390 48208666 : sc->flags |= scrub_fsgates;
1391 48208666 : }
1392 :
1393 : /*
1394 : * Decide if this is this a cached inode that's also allocated. The caller
1395 : * must hold a reference to an AG and the AGI buffer lock to prevent inodes
1396 : * from being allocated or freed.
1397 : *
1398 : * Look up an inode by number in the given file system. If the inode number
1399 : * is invalid, return -EINVAL. If the inode is not in cache, return -ENODATA.
1400 : * If the inode is being reclaimed, return -ENODATA because we know the inode
1401 : * cache cannot be updating the ondisk metadata.
1402 : *
1403 : * Otherwise, the incore inode is the one we want, and it is either live,
1404 : * somewhere in the inactivation machinery, or reclaimable. The inode is
1405 : * allocated if i_mode is nonzero. In all three cases, the cached inode will
1406 : * be more up to date than the ondisk inode buffer, so we must use the incore
1407 : * i_mode.
1408 : */
1409 : int
1410 2990818176 : xchk_inode_is_allocated(
1411 : struct xfs_scrub *sc,
1412 : xfs_agino_t agino,
1413 : bool *inuse)
1414 : {
1415 2990818176 : struct xfs_mount *mp = sc->mp;
1416 2990818176 : struct xfs_perag *pag = sc->sa.pag;
1417 2990818176 : xfs_ino_t ino;
1418 2990818176 : struct xfs_inode *ip;
1419 2990818176 : int error;
1420 :
1421 : /* caller must hold perag reference */
1422 2990818176 : if (pag == NULL) {
1423 0 : ASSERT(pag != NULL);
1424 0 : return -EINVAL;
1425 : }
1426 :
1427 : /* caller must have AGI buffer */
1428 2990818176 : if (sc->sa.agi_bp == NULL) {
1429 0 : ASSERT(sc->sa.agi_bp != NULL);
1430 0 : return -EINVAL;
1431 : }
1432 :
1433 : /* reject inode numbers outside existing AGs */
1434 2990818176 : ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino);
1435 2990818176 : if (!xfs_verify_ino(mp, ino))
1436 : return -EINVAL;
1437 :
1438 2990277190 : error = -ENODATA;
1439 2990277190 : rcu_read_lock();
1440 2990160291 : ip = radix_tree_lookup(&pag->pag_ici_root, agino);
1441 2990455945 : if (!ip) {
1442 : /* cache miss */
1443 59793656 : goto out_rcu;
1444 : }
1445 :
1446 : /*
1447 : * If the inode number doesn't match, the incore inode got reused
1448 : * during an RCU grace period and the radix tree hasn't been updated.
1449 : * This isn't the inode we want.
1450 : */
1451 2930662289 : spin_lock(&ip->i_flags_lock);
1452 2932890113 : if (ip->i_ino != ino)
1453 0 : goto out_skip;
1454 :
1455 2932890113 : trace_xchk_inode_is_allocated(ip);
1456 :
1457 : /*
1458 : * We have an incore inode that matches the inode we want, and the
1459 : * caller holds the perag structure and the AGI buffer. Let's check
1460 : * our assumptions below:
1461 : */
1462 :
1463 : #ifdef DEBUG
1464 : /*
1465 : * (1) If the incore inode is live (i.e. referenced from the dcache),
1466 : * it will not be INEW, nor will it be in the inactivation or reclaim
1467 : * machinery. The ondisk inode had better be allocated. This is the
1468 : * most trivial case.
1469 : */
1470 2931358436 : if (!(ip->i_flags & (XFS_NEED_INACTIVE | XFS_INEW | XFS_IRECLAIMABLE |
1471 : XFS_INACTIVATING))) {
1472 : /* live inode */
1473 2929942003 : ASSERT(VFS_I(ip)->i_mode != 0);
1474 : }
1475 :
1476 : /*
1477 : * If the incore inode is INEW, there are several possibilities:
1478 : *
1479 : * (2) For a file that is being created, note that we allocate the
1480 : * ondisk inode before allocating, initializing, and adding the incore
1481 : * inode to the radix tree.
1482 : *
1483 : * (3) If the incore inode is being recycled, the inode has to be
1484 : * allocated because we don't allow freed inodes to be recycled.
1485 : * Recycling doesn't touch i_mode.
1486 : */
1487 2931358436 : if (ip->i_flags & XFS_INEW) {
1488 : /* created on disk already or recycling */
1489 912 : ASSERT(VFS_I(ip)->i_mode != 0);
1490 : }
1491 :
1492 : /*
1493 : * (4) If the inode is queued for inactivation (NEED_INACTIVE) but
1494 : * inactivation has not started (!INACTIVATING), it is still allocated.
1495 : */
1496 2931358436 : if ((ip->i_flags & XFS_NEED_INACTIVE) &&
1497 : !(ip->i_flags & XFS_INACTIVATING)) {
1498 : /* definitely before difree */
1499 21300 : ASSERT(VFS_I(ip)->i_mode != 0);
1500 : }
1501 : #endif
1502 :
1503 : /*
1504 : * If the incore inode is undergoing inactivation (INACTIVATING), there
1505 : * are two possibilities:
1506 : *
1507 : * (5) It is before the point where it would get freed ondisk, in which
1508 : * case i_mode is still nonzero.
1509 : *
1510 : * (6) It has already been freed, in which case i_mode is zero.
1511 : *
1512 : * We don't take the ILOCK here, but difree and dialloc update the AGI,
1513 : * and we've taken the AGI buffer lock, which prevents that from
1514 : * happening.
1515 : */
1516 :
1517 : /*
1518 : * (7) Inodes undergoing inactivation (INACTIVATING) or queued for
1519 : * reclaim (IRECLAIMABLE) could be allocated or free. i_mode still
1520 : * reflects the ondisk state.
1521 : */
1522 :
1523 : /*
1524 : * (8) If the inode is in IFLUSHING, it's safe to query i_mode because
1525 : * the flush code uses i_mode to format the ondisk inode.
1526 : */
1527 :
1528 : /*
1529 : * (9) If the inode is in IRECLAIM and was reachable via the radix
1530 : * tree, it still has the same i_mode as it did before it entered
1531 : * reclaim. The inode object is still alive because we hold the RCU
1532 : * read lock.
1533 : */
1534 :
1535 2931358436 : *inuse = VFS_I(ip)->i_mode != 0;
1536 2931358436 : error = 0;
1537 :
1538 2931358436 : out_skip:
1539 2931358436 : spin_unlock(&ip->i_flags_lock);
1540 2992521856 : out_rcu:
1541 2992521856 : rcu_read_unlock();
1542 2992521856 : return error;
1543 : }
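/*
 * Example usage (hypothetical sketch; irec_free is an assumed local
 * holding the ondisk free bit for this inode): an inobt scrubber can
 * compare the ondisk free mask against the incore answer:
 *
 *	error = xchk_inode_is_allocated(sc, agino, &inuse);
 *	if (error == -ENODATA)
 *		... fall back to reading the inode cluster buffer ...
 *	else if (!error && irec_free == inuse)
 *		xchk_btree_set_corrupt(sc, cur, 0);
 */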
1544 :
1545 : /* Complain about failures... */
1546 : void
1547 327 : xchk_whine(
1548 : const struct xfs_mount *mp,
1549 : const char *fmt,
1550 : ...)
1551 : {
1552 327 : struct va_format vaf;
1553 327 : va_list args;
1554 :
1555 327 : va_start(args, fmt);
1556 :
1557 327 : vaf.fmt = fmt;
1558 327 : vaf.va = &args;
1559 :
1560 327 : printk(KERN_INFO "XFS (%s) %pS: %pV\n", mp->m_super->s_id,
1561 : __return_address, &vaf);
1562 327 : va_end(args);
1563 :
1564 327 : if (xfs_error_level >= XFS_ERRLEVEL_HIGH)
1565 0 : xfs_stack_trace();
1566 327 : }