Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-or-later
2 : /*
3 : * Copyright (C) 2017-2023 Oracle. All Rights Reserved.
4 : * Author: Darrick J. Wong <djwong@kernel.org>
5 : */
6 : #include "xfs.h"
7 : #include "xfs_fs.h"
8 : #include "xfs_shared.h"
9 : #include "xfs_format.h"
10 : #include "xfs_trans_resv.h"
11 : #include "xfs_mount.h"
12 : #include "xfs_log_format.h"
13 : #include "xfs_trans.h"
14 : #include "xfs_inode.h"
15 : #include "xfs_quota.h"
16 : #include "xfs_qm.h"
17 : #include "xfs_scrub.h"
18 : #include "xfs_btree.h"
19 : #include "xfs_btree_staging.h"
20 : #include "xfs_buf_xfile.h"
21 : #include "xfs_rmap.h"
22 : #include "xfs_xchgrange.h"
23 : #include "xfs_swapext.h"
24 : #include "xfs_da_format.h"
25 : #include "xfs_da_btree.h"
26 : #include "xfs_xattr.h"
27 : #include "xfs_dir2.h"
28 : #include "xfs_icache.h"
29 : #include "scrub/scrub.h"
30 : #include "scrub/common.h"
31 : #include "scrub/trace.h"
32 : #include "scrub/repair.h"
33 : #include "scrub/health.h"
34 : #include "scrub/stats.h"
35 : #include "scrub/xfile.h"
36 : #include "scrub/tempfile.h"
37 : #include "scrub/orphanage.h"
38 :
39 : /*
40 : * Online Scrub and Repair
41 : *
42 : * Traditionally, XFS (the kernel driver) did not know how to check or
43 : * repair on-disk data structures. That task was left to the xfs_check
44 : * and xfs_repair tools, both of which require taking the filesystem
45 : * offline for a thorough but time consuming examination. Online
46 : * scrub & repair, on the other hand, enables us to check the metadata
47 : * for obvious errors while carefully stepping around the filesystem's
48 : * ongoing operations, locking rules, etc.
49 : *
50 : * Given that most XFS metadata consist of records stored in a btree,
51 : * most of the checking functions iterate the btree blocks themselves
52 : * looking for irregularities. When a record block is encountered, each
53 : * record can be checked for obviously bad values. Record values can
54 : * also be cross-referenced against other btrees to look for potential
55 : * misunderstandings between pieces of metadata.
56 : *
57 : * It is expected that the checkers responsible for per-AG metadata
58 : * structures will lock the AG headers (AGI, AGF, AGFL), iterate the
59 : * metadata structure, and perform any relevant cross-referencing before
60 : * unlocking the AG and returning the results to userspace. These
61 : * scrubbers must not keep an AG locked for too long to avoid tying up
62 : * the block and inode allocators.
63 : *
64 : * Block maps and b-trees rooted in an inode present a special challenge
65 : * because they can involve extents from any AG. The general scrubber
66 : * structure of lock -> check -> xref -> unlock still holds, but AG
67 : * locking order rules /must/ be obeyed to avoid deadlocks. The
68 : * ordering rule, of course, is that we must lock in increasing AG
69 : * order. Helper functions are provided to track which AG headers we've
70 : * already locked. If we detect an imminent locking order violation, we
71 : * can signal a potential deadlock, in which case the scrubber can jump
72 : * out to the top level, lock all the AGs in order, and retry the scrub.
73 : *
74 : * For file data (directories, extended attributes, symlinks) scrub, we
75 : * can simply lock the inode and walk the data. For btree data
76 : * (directories and attributes) we follow the same btree-scrubbing
77 : * strategy outlined previously to check the records.
78 : *
79 : * We use a bit of trickery with transactions to avoid buffer deadlocks
80 : * if there is a cycle in the metadata. The basic problem is that
81 : * travelling down a btree involves locking the current buffer at each
82 : * tree level. If a pointer should somehow point back to a buffer that
83 : * we've already examined, we will deadlock due to the second buffer
84 : * locking attempt. Note however that grabbing a buffer in transaction
85 : * context links the locked buffer to the transaction. If we try to
86 : * re-grab the buffer in the context of the same transaction, we avoid
87 : * the second lock attempt and continue. Between the verifier and the
88 : * scrubber, something will notice that something is amiss and report
89 : * the corruption. Therefore, each scrubber will allocate an empty
90 : * transaction, attach buffers to it, and cancel the transaction at the
91 : * end of the scrub run. Cancelling a non-dirty transaction simply
92 : * unlocks the buffers.
93 : *
94 : * There are four pieces of data that scrub can communicate to
95 : * userspace. The first is the error code (errno), which can be used to
96 : * communicate operational errors in performing the scrub. There are
97 : * also three flags that can be set in the scrub context. If the data
98 : * structure itself is corrupt, the CORRUPT flag will be set. If
99 : * the metadata is correct but otherwise suboptimal, the PREEN flag
100 : * will be set.
101 : *
102 : * We perform secondary validation of filesystem metadata by
103 : * cross-referencing every record with all other available metadata.
104 : * For example, for block mapping extents, we verify that there are no
105 : * records in the free space and inode btrees corresponding to that
106 : * space extent and that there is a corresponding entry in the reverse
107 : * mapping btree. Inconsistent metadata is noted by setting the
108 : * XCORRUPT flag; btree query function errors are noted by setting the
109 : * XFAIL flag and deleting the cursor to prevent further attempts to
110 : * cross-reference with a defective btree.
111 : *
112 : * If a piece of metadata proves corrupt or suboptimal, the userspace
113 : * program can ask the kernel to apply some tender loving care (TLC) to
114 : * the metadata object by setting the REPAIR flag and re-calling the
115 : * scrub ioctl. "Corruption" is defined by metadata violating the
116 : * on-disk specification; operations cannot continue if the violation is
117 : * left untreated. It is possible for XFS to continue if an object is
118 : * "suboptimal", however performance may be degraded. Repairs are
119 : * usually performed by rebuilding the metadata entirely out of
120 : * redundant metadata. Optimizing, on the other hand, can sometimes be
121 : * done without rebuilding entire structures.
122 : *
123 : * Generally speaking, the repair code has the following code structure:
124 : * Lock -> scrub -> repair -> commit -> re-lock -> re-scrub -> unlock.
125 : * The first check helps us figure out if we need to rebuild or simply
126 : * optimize the structure so that the rebuild knows what to do. The
127 : * second check evaluates the completeness of the repair; that is what
128 : * is reported to userspace.
129 : *
130 : * A quick note on symbol prefixes:
131 : * - "xfs_" are general XFS symbols.
132 : * - "xchk_" are symbols related to metadata checking.
133 : * - "xrep_" are symbols related to metadata repair.
134 : * - "xfs_scrub_" are symbols that tie online fsck to the rest of XFS.
135 : */
136 :
/*
 * Scrub probe -- a no-op scrubber that lets userspace (xfs_scrub) discover
 * whether this kernel is willing to scrub or repair a mounted filesystem.
 * All of the interesting work (validating the ioctl inputs and preparing the
 * filesystem for a scrub or repair) is done by the common dispatch code
 * before this function is called; userspace then inspects the returned errno
 * and structure state to decide, in broad terms, what is supported.
 *
 * Returns 0, or the error supplied by xchk_should_terminate() if that helper
 * reports that the operation should stop.
 */
static int
xchk_probe(
	struct xfs_scrub	*sc)
{
	int			ret = 0;

	/* On a termination request the helper fills in ret for us. */
	return xchk_should_terminate(sc, &ret) ? ret : 0;
}
158 :
159 : /* Scrub setup and teardown */
160 :
161 : #define FSGATES_MASK (XCHK_FSGATES_ALL | XREP_FSGATES_ALL)
162 : static inline void
163 506464975 : xchk_fsgates_disable(
164 : struct xfs_scrub *sc)
165 : {
166 506464975 : if (!(sc->flags & FSGATES_MASK))
167 : return;
168 :
169 12112493 : trace_xchk_fsgates_disable(sc, sc->flags & FSGATES_MASK);
170 :
171 12112588 : if (sc->flags & XCHK_FSGATES_DRAIN)
172 69513 : xfs_defer_drain_wait_disable();
173 :
174 12112588 : if (sc->flags & XCHK_FSGATES_QUOTA)
175 11554 : xfs_dqtrx_hook_disable();
176 :
177 12112588 : if (sc->flags & XCHK_FSGATES_DIRENTS)
178 11957528 : xfs_dir_hook_disable();
179 :
180 12113279 : if (sc->flags & XCHK_FSGATES_RMAP)
181 12663 : xfs_rmap_hook_disable();
182 :
183 12113291 : if (sc->flags & XREP_FSGATES_ATOMIC_XCHG)
184 408136 : xfs_xchg_range_rele_log_assist(sc->mp);
185 :
186 12113291 : if (sc->flags & XREP_FSGATES_LARP)
187 363945 : xfs_attr_rele_log_assist(sc->mp);
188 :
189 12113291 : sc->flags &= ~FSGATES_MASK;
190 : }
191 : #undef FSGATES_MASK
192 :
193 : /* Free all the resources and finish the transactions. */
194 : STATIC int
195 506176000 : xchk_teardown(
196 : struct xfs_scrub *sc,
197 : int error)
198 : {
199 506176000 : xchk_ag_free(sc, &sc->sa);
200 506327132 : xchk_rtgroup_btcur_free(&sc->sr);
201 :
202 506401782 : if (sc->tp) {
203 503272187 : if (error == 0 && (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
204 3525557 : error = xfs_trans_commit(sc->tp);
205 : else
206 499746630 : xfs_trans_cancel(sc->tp);
207 503311820 : sc->tp = NULL;
208 : }
209 506441415 : if (sc->sr.rtg)
210 297015 : xchk_rtgroup_free(sc, &sc->sr);
211 : else
212 506151177 : xchk_rt_unlock(sc, &sc->sr);
213 506460527 : if (sc->ip) {
214 500209571 : if (sc->ilock_flags)
215 499685412 : xchk_iunlock(sc, sc->ilock_flags);
216 500179558 : xchk_irele(sc, sc->ip);
217 500219712 : sc->ip = NULL;
218 : }
219 506470668 : if (sc->flags & XCHK_HAVE_FREEZE_PROT) {
220 3526927 : sc->flags &= ~XCHK_HAVE_FREEZE_PROT;
221 3526927 : mnt_drop_write_file(sc->file);
222 : }
223 506470600 : if (sc->xfile_buftarg) {
224 80697 : xfile_free_buftarg(sc->xfile_buftarg);
225 80716 : sc->xfile_buftarg = NULL;
226 : }
227 506470619 : if (sc->xfile) {
228 104981 : xfile_destroy(sc->xfile);
229 104981 : sc->xfile = NULL;
230 : }
231 506470619 : if (sc->buf) {
232 101175497 : if (sc->buf_cleanup)
233 89407160 : sc->buf_cleanup(sc->buf);
234 101176016 : kvfree(sc->buf);
235 101176000 : sc->buf_cleanup = NULL;
236 101176000 : sc->buf = NULL;
237 : }
238 :
239 506471122 : xrep_tempfile_rele(sc);
240 506462152 : xrep_orphanage_rele(sc);
241 506461363 : xchk_fsgates_disable(sc);
242 506465163 : return error;
243 : }
244 :
245 : /* Scrubbing dispatch. */
246 :
247 : static const struct xchk_meta_ops meta_scrub_ops[] = {
248 : [XFS_SCRUB_TYPE_PROBE] = { /* ioctl presence test */
249 : .type = ST_NONE,
250 : .setup = xchk_setup_fs,
251 : .scrub = xchk_probe,
252 : .repair = xrep_probe,
253 : },
254 : [XFS_SCRUB_TYPE_SB] = { /* superblock */
255 : .type = ST_PERAG,
256 : .setup = xchk_setup_agheader,
257 : .scrub = xchk_superblock,
258 : .repair = xrep_superblock,
259 : },
260 : [XFS_SCRUB_TYPE_AGF] = { /* agf */
261 : .type = ST_PERAG,
262 : .setup = xchk_setup_agheader,
263 : .scrub = xchk_agf,
264 : .repair = xrep_agf,
265 : },
266 : [XFS_SCRUB_TYPE_AGFL]= { /* agfl */
267 : .type = ST_PERAG,
268 : .setup = xchk_setup_agheader,
269 : .scrub = xchk_agfl,
270 : .repair = xrep_agfl,
271 : },
272 : [XFS_SCRUB_TYPE_AGI] = { /* agi */
273 : .type = ST_PERAG,
274 : .setup = xchk_setup_agheader,
275 : .scrub = xchk_agi,
276 : .repair = xrep_agi,
277 : },
278 : [XFS_SCRUB_TYPE_BNOBT] = { /* bnobt */
279 : .type = ST_PERAG,
280 : .setup = xchk_setup_ag_allocbt,
281 : .scrub = xchk_bnobt,
282 : .repair = xrep_allocbt,
283 : .repair_eval = xrep_revalidate_allocbt,
284 : },
285 : [XFS_SCRUB_TYPE_CNTBT] = { /* cntbt */
286 : .type = ST_PERAG,
287 : .setup = xchk_setup_ag_allocbt,
288 : .scrub = xchk_cntbt,
289 : .repair = xrep_allocbt,
290 : .repair_eval = xrep_revalidate_allocbt,
291 : },
292 : [XFS_SCRUB_TYPE_INOBT] = { /* inobt */
293 : .type = ST_PERAG,
294 : .setup = xchk_setup_ag_iallocbt,
295 : .scrub = xchk_inobt,
296 : .repair = xrep_iallocbt,
297 : .repair_eval = xrep_revalidate_iallocbt,
298 : },
299 : [XFS_SCRUB_TYPE_FINOBT] = { /* finobt */
300 : .type = ST_PERAG,
301 : .setup = xchk_setup_ag_iallocbt,
302 : .scrub = xchk_finobt,
303 : .has = xfs_has_finobt,
304 : .repair = xrep_iallocbt,
305 : .repair_eval = xrep_revalidate_iallocbt,
306 : },
307 : [XFS_SCRUB_TYPE_RMAPBT] = { /* rmapbt */
308 : .type = ST_PERAG,
309 : .setup = xchk_setup_ag_rmapbt,
310 : .scrub = xchk_rmapbt,
311 : .has = xfs_has_rmapbt,
312 : .repair = xrep_rmapbt,
313 : },
314 : [XFS_SCRUB_TYPE_REFCNTBT] = { /* refcountbt */
315 : .type = ST_PERAG,
316 : .setup = xchk_setup_ag_refcountbt,
317 : .scrub = xchk_refcountbt,
318 : .has = xfs_has_reflink,
319 : .repair = xrep_refcountbt,
320 : },
321 : [XFS_SCRUB_TYPE_INODE] = { /* inode record */
322 : .type = ST_INODE,
323 : .setup = xchk_setup_inode,
324 : .scrub = xchk_inode,
325 : .repair = xrep_inode,
326 : },
327 : [XFS_SCRUB_TYPE_BMBTD] = { /* inode data fork */
328 : .type = ST_INODE,
329 : .setup = xchk_setup_inode_bmap,
330 : .scrub = xchk_bmap_data,
331 : .repair = xrep_bmap_data,
332 : },
333 : [XFS_SCRUB_TYPE_BMBTA] = { /* inode attr fork */
334 : .type = ST_INODE,
335 : .setup = xchk_setup_inode_bmap,
336 : .scrub = xchk_bmap_attr,
337 : .repair = xrep_bmap_attr,
338 : },
339 : [XFS_SCRUB_TYPE_BMBTC] = { /* inode CoW fork */
340 : .type = ST_INODE,
341 : .setup = xchk_setup_inode_bmap,
342 : .scrub = xchk_bmap_cow,
343 : .repair = xrep_bmap_cow,
344 : },
345 : [XFS_SCRUB_TYPE_DIR] = { /* directory */
346 : .type = ST_INODE,
347 : .setup = xchk_setup_directory,
348 : .scrub = xchk_directory,
349 : .repair = xrep_directory,
350 : },
351 : [XFS_SCRUB_TYPE_XATTR] = { /* extended attributes */
352 : .type = ST_INODE,
353 : .setup = xchk_setup_xattr,
354 : .scrub = xchk_xattr,
355 : .repair = xrep_xattr,
356 : },
357 : [XFS_SCRUB_TYPE_SYMLINK] = { /* symbolic link */
358 : .type = ST_INODE,
359 : .setup = xchk_setup_symlink,
360 : .scrub = xchk_symlink,
361 : .repair = xrep_symlink,
362 : },
363 : [XFS_SCRUB_TYPE_PARENT] = { /* parent pointers */
364 : .type = ST_INODE,
365 : .setup = xchk_setup_parent,
366 : .scrub = xchk_parent,
367 : .repair = xrep_parent,
368 : },
369 : [XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */
370 : .type = ST_FS,
371 : .setup = xchk_setup_rtbitmap,
372 : .scrub = xchk_rtbitmap,
373 : .has = xfs_has_realtime,
374 : .repair = xrep_rtbitmap,
375 : },
376 : [XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */
377 : .type = ST_FS,
378 : .setup = xchk_setup_rtsummary,
379 : .scrub = xchk_rtsummary,
380 : .has = xfs_has_realtime,
381 : .repair = xrep_rtsummary,
382 : },
383 : [XFS_SCRUB_TYPE_UQUOTA] = { /* user quota */
384 : .type = ST_FS,
385 : .setup = xchk_setup_quota,
386 : .scrub = xchk_quota,
387 : .repair = xrep_quota,
388 : },
389 : [XFS_SCRUB_TYPE_GQUOTA] = { /* group quota */
390 : .type = ST_FS,
391 : .setup = xchk_setup_quota,
392 : .scrub = xchk_quota,
393 : .repair = xrep_quota,
394 : },
395 : [XFS_SCRUB_TYPE_PQUOTA] = { /* project quota */
396 : .type = ST_FS,
397 : .setup = xchk_setup_quota,
398 : .scrub = xchk_quota,
399 : .repair = xrep_quota,
400 : },
401 : [XFS_SCRUB_TYPE_FSCOUNTERS] = { /* fs summary counters */
402 : .type = ST_FS,
403 : .setup = xchk_setup_fscounters,
404 : .scrub = xchk_fscounters,
405 : .repair = xrep_fscounters,
406 : },
407 : [XFS_SCRUB_TYPE_QUOTACHECK] = { /* quota counters */
408 : .type = ST_FS,
409 : .setup = xchk_setup_quotacheck,
410 : .scrub = xchk_quotacheck,
411 : .repair = xrep_quotacheck,
412 : },
413 : [XFS_SCRUB_TYPE_NLINKS] = { /* inode link counts */
414 : .type = ST_FS,
415 : .setup = xchk_setup_nlinks,
416 : .scrub = xchk_nlinks,
417 : .repair = xrep_nlinks,
418 : },
419 : [XFS_SCRUB_TYPE_HEALTHY] = { /* fs healthy; clean all reminders */
420 : .type = ST_FS,
421 : .setup = xchk_setup_fs,
422 : .scrub = xchk_health_record,
423 : .repair = xrep_notsupported,
424 : },
425 : [XFS_SCRUB_TYPE_DIRTREE] = { /* directory tree structure */
426 : .type = ST_INODE,
427 : .setup = xchk_setup_dirtree,
428 : .scrub = xchk_dirtree,
429 : .has = xfs_has_parent,
430 : .repair = xrep_dirtree,
431 : },
432 : [XFS_SCRUB_TYPE_RGSUPER] = { /* realtime group superblock */
433 : .type = ST_RTGROUP,
434 : .setup = xchk_setup_rgsuperblock,
435 : .scrub = xchk_rgsuperblock,
436 : .has = xfs_has_rtgroups,
437 : .repair = xrep_rgsuperblock,
438 : },
439 : [XFS_SCRUB_TYPE_RGBITMAP] = { /* realtime group bitmap */
440 : .type = ST_RTGROUP,
441 : .setup = xchk_setup_rgbitmap,
442 : .scrub = xchk_rgbitmap,
443 : .has = xfs_has_rtgroups,
444 : .repair = xrep_rgbitmap,
445 : },
446 : [XFS_SCRUB_TYPE_RTRMAPBT] = { /* realtime group rmapbt */
447 : .type = ST_RTGROUP,
448 : .setup = xchk_setup_rtrmapbt,
449 : .scrub = xchk_rtrmapbt,
450 : .has = xfs_has_rtrmapbt,
451 : .repair = xrep_rtrmapbt,
452 : },
453 : [XFS_SCRUB_TYPE_RTREFCBT] = { /* realtime refcountbt */
454 : .type = ST_RTGROUP,
455 : .setup = xchk_setup_rtrefcountbt,
456 : .scrub = xchk_rtrefcountbt,
457 : .has = xfs_has_rtreflink,
458 : .repair = xrep_rtrefcountbt,
459 : },
460 : };
461 :
462 : static int
463 504250996 : xchk_validate_inputs(
464 : struct xfs_mount *mp,
465 : struct xfs_scrub_metadata *sm)
466 : {
467 504250996 : int error;
468 504250996 : const struct xchk_meta_ops *ops;
469 :
470 504250996 : error = -EINVAL;
471 : /* Check our inputs. */
472 504250996 : sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
473 504250996 : if (sm->sm_flags & ~XFS_SCRUB_FLAGS_IN)
474 0 : goto out;
475 : /* sm_reserved[] must be zero */
476 1008609708 : if (memchr_inv(sm->sm_reserved, 0, sizeof(sm->sm_reserved)))
477 0 : goto out;
478 :
479 504358712 : error = -ENOENT;
480 : /* Do we know about this type of metadata? */
481 504358712 : if (sm->sm_type >= XFS_SCRUB_TYPE_NR)
482 0 : goto out;
483 504358712 : ops = &meta_scrub_ops[sm->sm_type];
484 504358712 : if (ops->setup == NULL || ops->scrub == NULL)
485 0 : goto out;
486 : /* Does this fs even support this type of metadata? */
487 504358712 : if (ops->has && !ops->has(mp))
488 9954 : goto out;
489 :
490 504344517 : error = -EINVAL;
491 : /* restricting fields must be appropriate for type */
492 504344517 : switch (ops->type) {
493 290014 : case ST_NONE:
494 : case ST_FS:
495 290014 : if (sm->sm_ino || sm->sm_gen || sm->sm_agno)
496 0 : goto out;
497 : break;
498 2199132 : case ST_PERAG:
499 2199132 : if (sm->sm_ino || sm->sm_gen ||
500 2199132 : sm->sm_agno >= mp->m_sb.sb_agcount)
501 0 : goto out;
502 : break;
503 501633451 : case ST_INODE:
504 501633451 : if (sm->sm_agno || (sm->sm_gen && !sm->sm_ino))
505 0 : goto out;
506 : break;
507 221920 : case ST_RTGROUP:
508 221920 : if (sm->sm_ino || sm->sm_gen)
509 0 : goto out;
510 221920 : if (!xfs_has_rtgroups(mp) && sm->sm_agno != 0)
511 0 : goto out;
512 221920 : if (xfs_has_rtgroups(mp) && sm->sm_agno >= mp->m_sb.sb_rgcount)
513 0 : goto out;
514 : break;
515 0 : default:
516 0 : goto out;
517 : }
518 :
519 : /* No rebuild without repair. */
520 504344517 : if ((sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) &&
521 : !(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
522 : return -EINVAL;
523 :
524 : /*
525 : * We only want to repair read-write v5+ filesystems. Defer the check
526 : * for ops->repair until after our scrub confirms that we need to
527 : * perform repairs so that we avoid failing due to not supporting
528 : * repairing an object that doesn't need repairs.
529 : */
530 504344517 : if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) {
531 1766123 : error = -EOPNOTSUPP;
532 1766123 : if (!xfs_has_crc(mp))
533 0 : goto out;
534 :
535 1766123 : error = -EROFS;
536 3532246 : if (xfs_is_readonly(mp))
537 819 : goto out;
538 : }
539 :
540 : error = 0;
541 : out:
542 : return error;
543 : }
544 :
545 : #ifdef CONFIG_XFS_ONLINE_REPAIR
546 428545254 : static inline void xchk_postmortem(struct xfs_scrub *sc)
547 : {
548 : /*
549 : * Userspace asked us to repair something, we repaired it, rescanned
550 : * it, and the rescan says it's still broken. Scream about this in
551 : * the system logs.
552 : */
553 428545254 : if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
554 1745109 : (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
555 : XFS_SCRUB_OFLAG_XCORRUPT)))
556 0 : xrep_failure(sc->mp);
557 428545254 : }
558 : #else
559 : static inline void xchk_postmortem(struct xfs_scrub *sc)
560 : {
561 : /*
562 : * Userspace asked us to scrub something, it's broken, and we have no
563 : * way of fixing it. Scream in the logs.
564 : */
565 : if (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
566 : XFS_SCRUB_OFLAG_XCORRUPT))
567 : xfs_alert_ratelimited(sc->mp,
568 : "Corruption detected during scrub.");
569 : }
570 : #endif /* CONFIG_XFS_ONLINE_REPAIR */
571 :
572 : static inline void
573 1743576 : repair_outcomes(struct xfs_scrub *sc, int error)
574 : {
575 1743576 : struct xfs_scrub_metadata *sm = sc->sm;
576 1743576 : const char *wut = NULL;
577 :
578 1743576 : if (sc->flags & XREP_ALREADY_FIXED) {
579 : wut = "*** REPAIR SUCCESS";
580 : error = 0;
581 2829 : } else if (error == -EBUSY) {
582 : wut = "??? FILESYSTEM BUSY";
583 : } else if (error == -EAGAIN) {
584 : wut = "??? REPAIR DEFERRED";
585 : } else if (error == -ECANCELED) {
586 : wut = "??? REPAIR CANCELLED";
587 : } else if (error == -EINTR) {
588 : wut = "??? REPAIR INTERRUPTED";
589 2684 : } else if (error != -EOPNOTSUPP && error != -ENOENT) {
590 0 : wut = "!!! REPAIR FAILED";
591 0 : xfs_info(sc->mp,
592 : "%s ino 0x%llx type %s agno 0x%x inum 0x%llx gen 0x%x flags 0x%x error %d",
593 : wut, XFS_I(file_inode(sc->file))->i_ino,
594 : xchk_type_string(sm->sm_type), sm->sm_agno,
595 : sm->sm_ino, sm->sm_gen, sm->sm_flags, error);
596 0 : return;
597 : } else {
598 : return;
599 : }
600 :
601 1740892 : xfs_info_ratelimited(sc->mp,
602 : "%s ino 0x%llx type %s agno 0x%x inum 0x%llx gen 0x%x flags 0x%x error %d",
603 : wut, XFS_I(file_inode(sc->file))->i_ino,
604 : xchk_type_string(sm->sm_type), sm->sm_agno, sm->sm_ino,
605 : sm->sm_gen, sm->sm_flags, error);
606 : }
607 :
/*
 * Dispatch metadata scrubbing.
 *
 * @file: the open file the ioctl was issued against; its inode supplies the
 *        mount, and it is used to take write access when repairing
 * @sm:   the scrub request; sm_flags is updated with the outcome
 *
 * Validates the request, allocates a scrub context, and then runs
 * setup -> scrub (-> repair -> teardown -> setup -> re-scrub) using the
 * handlers in meta_scrub_ops[sm->sm_type].  The operation restarts from
 * retry_op when setup or scrub returns -EDEADLOCK or -ECHRNG for the first
 * time, or when the repair attempt returns -EAGAIN.
 *
 * Returns 0 or a negative errno.  -EFSCORRUPTED and -EFSBADCRC are not
 * returned to the caller; they are converted into XFS_SCRUB_OFLAG_CORRUPT
 * in sm->sm_flags with a return value of 0.
 */
int
xfs_scrub_metadata(
	struct file			*file,
	struct xfs_scrub_metadata	*sm)
{
	struct xchk_stats_run		run = { };
	struct xfs_scrub		*sc;
	struct xfs_mount		*mp = XFS_I(file_inode(file))->i_mount;
	u64				check_start;
	int				error = 0;

	/* The dispatch table must have exactly one slot per scrub type. */
	BUILD_BUG_ON(sizeof(meta_scrub_ops) !=
		(sizeof(struct xchk_meta_ops) * XFS_SCRUB_TYPE_NR));

	trace_xchk_start(XFS_I(file_inode(file)), sm, error);

	/* Forbidden if we are shut down or mounted norecovery. */
	error = -ESHUTDOWN;
	if (xfs_is_shutdown(mp))
		goto out;
	error = -ENOTRECOVERABLE;
	if (xfs_has_norecovery(mp))
		goto out;

	error = xchk_validate_inputs(mp, sm);
	if (error)
		goto out;

	xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SCRUB,
 "EXPERIMENTAL online scrub feature in use. Use at your own risk!");

	sc = kzalloc(sizeof(struct xfs_scrub), XCHK_GFP_FLAGS);
	if (!sc) {
		error = -ENOMEM;
		goto out;
	}

	/* sm_type was range-checked by xchk_validate_inputs above. */
	sc->mp = mp;
	sc->file = file;
	sc->sm = sm;
	sc->ops = &meta_scrub_ops[sm->sm_type];
	sc->sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type);
retry_op:
	/*
	 * When repairs are allowed, prevent freezing or readonly remount while
	 * scrub is running with a real transaction.
	 */
	if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) {
		error = mnt_want_write_file(sc->file);
		if (error)
			goto out_sc;

		/* xchk_teardown drops the write access when this is set. */
		sc->flags |= XCHK_HAVE_FREEZE_PROT;
	}

	/* Set up for the operation. */
	error = sc->ops->setup(sc);
	/* Each of the two retry modes is entered at most once. */
	if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER))
		goto try_harder;
	if (error == -ECHRNG && !(sc->flags & XCHK_NEED_DRAIN))
		goto need_drain;
	if (error)
		goto out_teardown;

	/*
	 * Scrub for errors.  After a repair has been made, use the type's
	 * repair_eval function instead of its scrub function, if it has one.
	 */
	check_start = xchk_stats_now();
	if ((sc->flags & XREP_ALREADY_FIXED) && sc->ops->repair_eval != NULL)
		error = sc->ops->repair_eval(sc);
	else
		error = sc->ops->scrub(sc);
	run.scrub_ns += xchk_stats_elapsed_ns(check_start);
	if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER))
		goto try_harder;
	if (error == -ECHRNG && !(sc->flags & XCHK_NEED_DRAIN))
		goto need_drain;
	/* Skip the health update and repair for failed or partial scans. */
	if (error || (sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE))
		goto out_teardown;

	xchk_update_health(sc);

	if (xchk_could_repair(sc)) {
		/*
		 * If userspace asked for a repair but it wasn't necessary,
		 * report that back to userspace.
		 */
		if (!xrep_will_attempt(sc)) {
			sc->sm->sm_flags |= XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED;
			goto out_nofix;
		}

		/*
		 * If it's broken, userspace wants us to fix it, and we haven't
		 * already tried to fix it, then attempt a repair.
		 */
		error = xrep_attempt(sc, &run);
		repair_outcomes(sc, error);
		if (error == -EAGAIN) {
			/*
			 * Either the repair function succeeded or it couldn't
			 * get all the resources it needs; either way, we go
			 * back to the beginning and call the scrub function.
			 */
			error = xchk_teardown(sc, 0);
			if (error) {
				xrep_failure(mp);
				goto out_sc;
			}
			goto retry_op;
		}
	}

out_nofix:
	xchk_postmortem(sc);
out_teardown:
	error = xchk_teardown(sc, error);
out_sc:
	/* Reached directly only when the context has already been torn down. */
	kfree(sc);
out:
	trace_xchk_done(XFS_I(file_inode(file)), sm, error);
	/* Report corruption through the output flags, not the return value. */
	if (error == -EFSCORRUPTED || error == -EFSBADCRC) {
		sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
		error = 0;
	}
	/* Runs that ended in -ENOENT are not added to the statistics. */
	if (error != -ENOENT)
		xchk_stats_merge(mp, sm, &run);
	return error;
need_drain:
	/*
	 * Setup or scrub returned -ECHRNG.  Tear down everything we hold,
	 * then retry once with XCHK_NEED_DRAIN set.
	 */
	error = xchk_teardown(sc, 0);
	if (error)
		goto out_sc;
	sc->flags |= XCHK_NEED_DRAIN;
	run.retries++;
	goto retry_op;
try_harder:
	/*
	 * Scrubbers return -EDEADLOCK to mean 'try harder'.  Tear down
	 * everything we hold, then set up again with preparation for
	 * worst-case scenarios.
	 */
	error = xchk_teardown(sc, 0);
	if (error)
		goto out_sc;
	sc->flags |= XCHK_TRY_HARDER;
	run.retries++;
	goto retry_op;
}
755 :
756 : /* Decide if there have been any scrub failures up to this point. */
757 : static inline bool
758 233709211 : xfs_scrubv_previous_failures(
759 : struct xfs_mount *mp,
760 : struct xfs_scrub_vec_head *vhead,
761 : struct xfs_scrub_vec *barrier_vec)
762 : {
763 233709211 : struct xfs_scrub_vec *v;
764 233709211 : __u32 failmask;
765 :
766 233709211 : failmask = barrier_vec->sv_flags & XFS_SCRUB_FLAGS_OUT;
767 :
768 934725735 : for (v = vhead->svh_vecs; v < barrier_vec; v++) {
769 701016538 : if (v->sv_type == XFS_SCRUB_TYPE_BARRIER)
770 233634930 : continue;
771 :
772 : /*
773 : * Runtime errors count as a previous failure, except the ones
774 : * used to ask userspace to retry.
775 : */
776 467381608 : if (v->sv_ret && v->sv_ret != -EBUSY && v->sv_ret != -ENOENT &&
777 : v->sv_ret != -EUSERS)
778 : return true;
779 :
780 : /*
781 : * If any of the out-flags on the scrub vector match the mask
782 : * that was set on the barrier vector, that's a previous fail.
783 : */
784 467381608 : if (v->sv_flags & failmask)
785 : return true;
786 : }
787 :
788 : return false;
789 : }
790 :
791 : /* Vectored scrub implementation to reduce ioctl calls. */
792 : int
793 89747385 : xfs_scrubv_metadata(
794 : struct file *file,
795 : struct xfs_scrub_vec_head *vhead)
796 : {
797 89747385 : struct xfs_inode *ip_in = XFS_I(file_inode(file));
798 89747385 : struct xfs_mount *mp = ip_in->i_mount;
799 89747385 : struct xfs_inode *ip = NULL;
800 89747385 : struct xfs_scrub_vec *v;
801 89747385 : bool set_dontcache = false;
802 89747385 : unsigned int i;
803 89747385 : int error = 0;
804 :
805 89747385 : BUILD_BUG_ON(sizeof(struct xfs_scrub_vec_head) ==
806 : sizeof(struct xfs_scrub_metadata));
807 89747385 : BUILD_BUG_ON(XFS_IOC_SCRUB_METADATA == XFS_IOC_SCRUBV_METADATA);
808 :
809 89747385 : trace_xchk_scrubv_start(ip_in, vhead);
810 :
811 89749198 : if (vhead->svh_flags & ~XFS_SCRUB_VEC_FLAGS_ALL)
812 : return -EINVAL;
813 825456596 : for (i = 0, v = vhead->svh_vecs; i < vhead->svh_nr; i++, v++) {
814 735706013 : if (v->sv_reserved)
815 : return -EINVAL;
816 735706013 : if (v->sv_type == XFS_SCRUB_TYPE_BARRIER &&
817 233711474 : (v->sv_flags & ~XFS_SCRUB_FLAGS_OUT))
818 : return -EINVAL;
819 :
820 : /*
821 : * If we detect at least one inode-type scrub, we might
822 : * consider setting dontcache at the end.
823 : */
824 735706013 : if (v->sv_type < XFS_SCRUB_TYPE_NR &&
825 501980133 : meta_scrub_ops[v->sv_type].type == ST_INODE)
826 501461383 : set_dontcache = true;
827 :
828 735706013 : trace_xchk_scrubv_item(mp, vhead, v);
829 : }
830 :
831 : /*
832 : * If the caller provided us with a nonzero inode number that isn't the
833 : * ioctl file, try to grab a reference to it to eliminate all further
834 : * untrusted inode lookups. If we can't get the inode, let each scrub
835 : * function try again.
836 : */
837 89750583 : if (vhead->svh_ino != ip_in->i_ino) {
838 67587584 : xfs_iget(mp, NULL, vhead->svh_ino, XFS_IGET_UNTRUSTED, 0, &ip);
839 67586444 : if (ip && (VFS_I(ip)->i_generation != vhead->svh_gen ||
840 72969 : (xfs_is_metadir_inode(ip) &&
841 72969 : !S_ISDIR(VFS_I(ip)->i_mode)))) {
842 140391 : xfs_irele(ip);
843 140391 : ip = NULL;
844 : }
845 : }
846 89749443 : if (!ip) {
847 22837601 : if (!igrab(VFS_I(ip_in)))
848 : return -EFSCORRUPTED;
849 22837590 : ip = ip_in;
850 : }
851 :
852 : /* Run all the scrubbers. */
853 825296961 : for (i = 0, v = vhead->svh_vecs; i < vhead->svh_nr; i++, v++) {
854 735545566 : struct xfs_scrub_metadata sm = {
855 735545566 : .sm_type = v->sv_type,
856 735545566 : .sm_flags = v->sv_flags,
857 735545566 : .sm_ino = vhead->svh_ino,
858 735545566 : .sm_gen = vhead->svh_gen,
859 735545566 : .sm_agno = vhead->svh_agno,
860 : };
861 :
862 735545566 : if (v->sv_type == XFS_SCRUB_TYPE_BARRIER) {
863 233705958 : if (xfs_scrubv_previous_failures(mp, vhead, v)) {
864 14 : v->sv_ret = -ECANCELED;
865 14 : trace_xchk_scrubv_barrier_fail(mp, vhead, v);
866 107 : break;
867 : }
868 :
869 233705944 : continue;
870 : }
871 :
872 501839608 : v->sv_ret = xfs_scrub_metadata(file, &sm);
873 501838349 : v->sv_flags = sm.sm_flags;
874 :
875 : /* Leave the inode in memory if something's wrong with it. */
876 501838349 : if (xchk_needs_repair(&sm))
877 4795708 : set_dontcache = false;
878 :
879 501838349 : if (vhead->svh_rest_us) {
880 0 : ktime_t expires;
881 :
882 0 : expires = ktime_add_ns(ktime_get(),
883 : vhead->svh_rest_us * 1000);
884 0 : set_current_state(TASK_KILLABLE);
885 0 : schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
886 : }
887 501838349 : if (fatal_signal_pending(current)) {
888 : error = -EINTR;
889 : break;
890 : }
891 : }
892 :
893 : /*
894 : * If we're holding the only reference to this inode and the scan was
895 : * clean, mark it dontcache so that we don't pollute the cache.
896 : */
897 89751488 : if (set_dontcache && atomic_read(&VFS_I(ip)->i_count) == 1)
898 9713547 : d_mark_dontcache(VFS_I(ip));
899 89751487 : xfs_irele(ip);
900 89751487 : return error;
901 : }
|