LCOV - code coverage report
Current view: top level - fs/xfs/scrub - health.c (source / functions) Hit Total Coverage
Test: fstests of 6.5.0-rc4-xfsa @ Mon Jul 31 20:08:27 PDT 2023 Lines: 90 109 82.6 %
Date: 2023-07-31 20:08:27 Functions: 5 5 100.0 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-or-later
       2             : /*
       3             :  * Copyright (C) 2019-2023 Oracle.  All Rights Reserved.
       4             :  * Author: Darrick J. Wong <djwong@kernel.org>
       5             :  */
       6             : #include "xfs.h"
       7             : #include "xfs_fs.h"
       8             : #include "xfs_shared.h"
       9             : #include "xfs_format.h"
      10             : #include "xfs_trans_resv.h"
      11             : #include "xfs_mount.h"
      12             : #include "xfs_btree.h"
      13             : #include "xfs_trans_resv.h"
      14             : #include "xfs_mount.h"
      15             : #include "xfs_ag.h"
      16             : #include "xfs_health.h"
      17             : #include "xfs_rtgroup.h"
      18             : #include "scrub/scrub.h"
      19             : #include "scrub/health.h"
      20             : #include "scrub/common.h"
      21             : 
      22             : /*
      23             :  * Scrub and In-Core Filesystem Health Assessments
      24             :  * ===============================================
      25             :  *
      26             :  * Online scrub and repair have the time and the ability to perform stronger
      27             :  * checks than we can do from the metadata verifiers, because they can
      28             :  * cross-reference records between data structures.  Therefore, scrub is in a
      29             :  * good position to update the online filesystem health assessments to reflect
      30             :  * the good/bad state of the data structure.
      31             :  *
      32             :  * We therefore extend scrub in the following ways to achieve this:
      33             :  *
      34             :  * 1. Create a "sick_mask" field in the scrub context.  When we're setting up a
      35             :  * scrub call, set this to the default XFS_SICK_* flag(s) for the selected
      36             :  * scrub type (call it A).  Scrub and repair functions can override the default
      37             :  * sick_mask value if they choose.
      38             :  *
      39             :  * 2. If the scrubber returns a runtime error code, we exit making no changes
      40             :  * to the incore sick state.
      41             :  *
      42             :  * 3. If the scrubber finds that A is clean, use sick_mask to clear the incore
      43             :  * sick flags before exiting.
      44             :  *
      45             :  * 4. If the scrubber finds that A is corrupt, use sick_mask to set the incore
      46             :  * sick flags.  If the user didn't want to repair then we exit, leaving the
      47             :  * metadata structure unfixed and the sick flag set.
      48             :  *
      49             :  * 5. Now we know that A is corrupt and the user wants to repair, so run the
      50             :  * repairer.  If the repairer returns an error code, we exit with that error
      51             :  * code, having made no further changes to the incore sick state.
      52             :  *
      53             :  * 6. If repair rebuilds A correctly and the subsequent re-scrub of A is clean,
      54             :  * use sick_mask to clear the incore sick flags.  This should have the effect
      55             :  * that A is no longer marked sick.
      56             :  *
      57             :  * 7. If repair rebuilds A incorrectly, the re-scrub will find it corrupt and
      58             :  * use sick_mask to set the incore sick flags.  This should have no externally
      59             :  * visible effect since we already set them in step (4).
      60             :  *
      61             :  * There are some complications to this story, however.  For certain types of
      62             :  * complementary metadata indices (e.g. inobt/finobt), it is easier to rebuild
      63             :  * both structures at the same time.  The following principles apply to this
      64             :  * type of repair strategy:
      65             :  *
      66             :  * 8. Any repair function that rebuilds multiple structures should update
      67             :  * sick_mask to reflect whatever other structures are rebuilt, and
      68             :  * verify that all the rebuilt structures can pass a scrub check.  The outcomes
      69             :  * of 5-7 still apply, but with a sick_mask that covers everything being
      70             :  * rebuilt.
      71             :  */
      72             : 
      73             : /* Map our scrub type to a sick mask and the health group that it updates. */
      74             : 
      75             : enum xchk_health_group {
      76             :         XHG_FS = 1,
      77             :         XHG_RT,
      78             :         XHG_AG,
      79             :         XHG_INO,
      80             :         XHG_RTGROUP,
      81             : };
      82             : 
      83             : struct xchk_health_map {
      84             :         enum xchk_health_group  group;
      85             :         unsigned int            sick_mask;
      86             : };
      87             : 
      88             : static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = {
      89             :         [XFS_SCRUB_TYPE_SB]             = { XHG_AG,  XFS_SICK_AG_SB },
      90             :         [XFS_SCRUB_TYPE_AGF]            = { XHG_AG,  XFS_SICK_AG_AGF },
      91             :         [XFS_SCRUB_TYPE_AGFL]           = { XHG_AG,  XFS_SICK_AG_AGFL },
      92             :         [XFS_SCRUB_TYPE_AGI]            = { XHG_AG,  XFS_SICK_AG_AGI },
      93             :         [XFS_SCRUB_TYPE_BNOBT]          = { XHG_AG,  XFS_SICK_AG_BNOBT },
      94             :         [XFS_SCRUB_TYPE_CNTBT]          = { XHG_AG,  XFS_SICK_AG_CNTBT },
      95             :         [XFS_SCRUB_TYPE_INOBT]          = { XHG_AG,  XFS_SICK_AG_INOBT },
      96             :         [XFS_SCRUB_TYPE_FINOBT]         = { XHG_AG,  XFS_SICK_AG_FINOBT },
      97             :         [XFS_SCRUB_TYPE_RMAPBT]         = { XHG_AG,  XFS_SICK_AG_RMAPBT },
      98             :         [XFS_SCRUB_TYPE_REFCNTBT]       = { XHG_AG,  XFS_SICK_AG_REFCNTBT },
      99             :         [XFS_SCRUB_TYPE_INODE]          = { XHG_INO, XFS_SICK_INO_CORE },
     100             :         [XFS_SCRUB_TYPE_BMBTD]          = { XHG_INO, XFS_SICK_INO_BMBTD },
     101             :         [XFS_SCRUB_TYPE_BMBTA]          = { XHG_INO, XFS_SICK_INO_BMBTA },
     102             :         [XFS_SCRUB_TYPE_BMBTC]          = { XHG_INO, XFS_SICK_INO_BMBTC },
     103             :         [XFS_SCRUB_TYPE_DIR]            = { XHG_INO, XFS_SICK_INO_DIR },
     104             :         [XFS_SCRUB_TYPE_XATTR]          = { XHG_INO, XFS_SICK_INO_XATTR },
     105             :         [XFS_SCRUB_TYPE_SYMLINK]        = { XHG_INO, XFS_SICK_INO_SYMLINK },
     106             :         [XFS_SCRUB_TYPE_PARENT]         = { XHG_INO, XFS_SICK_INO_PARENT },
     107             :         [XFS_SCRUB_TYPE_RTBITMAP]       = { XHG_RT,  XFS_SICK_RT_BITMAP },
     108             :         [XFS_SCRUB_TYPE_RTSUM]          = { XHG_RT,  XFS_SICK_RT_SUMMARY },
     109             :         [XFS_SCRUB_TYPE_UQUOTA]         = { XHG_FS,  XFS_SICK_FS_UQUOTA },
     110             :         [XFS_SCRUB_TYPE_GQUOTA]         = { XHG_FS,  XFS_SICK_FS_GQUOTA },
     111             :         [XFS_SCRUB_TYPE_PQUOTA]         = { XHG_FS,  XFS_SICK_FS_PQUOTA },
     112             :         [XFS_SCRUB_TYPE_FSCOUNTERS]     = { XHG_FS,  XFS_SICK_FS_COUNTERS },
     113             :         [XFS_SCRUB_TYPE_QUOTACHECK]     = { XHG_FS,  XFS_SICK_FS_QUOTACHECK },
     114             :         [XFS_SCRUB_TYPE_NLINKS]         = { XHG_FS,  XFS_SICK_FS_NLINKS },
     115             :         [XFS_SCRUB_TYPE_DIRTREE]        = { XHG_INO, XFS_SICK_INO_DIRTREE },
     116             :         [XFS_SCRUB_TYPE_RGSUPER]        = { XHG_RTGROUP, XFS_SICK_RT_SUPER },
     117             :         [XFS_SCRUB_TYPE_RTRMAPBT]       = { XHG_RTGROUP, XFS_SICK_RT_RMAPBT },
     118             :         [XFS_SCRUB_TYPE_RTREFCBT]       = { XHG_RTGROUP, XFS_SICK_RT_REFCNTBT },
     119             : };
     120             : 
     121             : /* Return the health status mask for this scrub type. */
     122             : unsigned int
     123   504610802 : xchk_health_mask_for_scrub_type(
     124             :         __u32                   scrub_type)
     125             : {
     126   504610802 :         return type_to_health_flag[scrub_type].sick_mask;
     127             : }
     128             : 
     129             : /*
     130             :  * Scrub gave the filesystem a clean bill of health, so clear all the indirect
     131             :  * markers of past problems (for the fs, rt, every AG, and every rtgroup) so
     132             :  * that we can be healthy again.
     133             :  */
     134             : STATIC void
     135       11155 : xchk_mark_all_healthy(
     136             :         struct xfs_mount        *mp)
     137             : {
     138       11155 :         struct xfs_perag        *pag;
     139       11155 :         struct xfs_rtgroup      *rtg;
     140       11155 :         xfs_agnumber_t          agno;
     141       11155 :         xfs_rgnumber_t          rgno;
     142             : 
     143       11155 :         xfs_fs_mark_healthy(mp, XFS_SICK_FS_INDIRECT);
     144       11155 :         xfs_rt_mark_healthy(mp, XFS_SICK_RT_INDIRECT);
     145       63794 :         for_each_perag(mp, agno, pag)
     146       52639 :                 xfs_ag_mark_healthy(pag, XFS_SICK_AG_INDIRECT);
     147       30001 :         for_each_rtgroup(mp, rgno, rtg)
     148       18846 :                 xfs_rtgroup_mark_healthy(rtg, XFS_SICK_RT_INDIRECT);
     149       11155 : }
     150             : 
     151             : /*
     152             :  * Update filesystem health assessments based on what we found and did.
     153             :  *
     154             :  * If the scrubber finds errors, we mark sick whatever's mentioned in
     155             :  * sick_mask, no matter whether this is a first scan or an
     156             :  * evaluation of repair effectiveness.
     157             :  *
     158             :  * Otherwise, no direct corruption was found, so mark whatever's in
     159             :  * sick_mask as healthy.
     160             :  */
     161             : void
     162   430269231 : xchk_update_health(
     163             :         struct xfs_scrub        *sc)
     164             : {
     165   430269231 :         struct xfs_perag        *pag;
     166   430269231 :         struct xfs_rtgroup      *rtg;
     167   430269231 :         bool                    bad;
     168             : 
     169             :         /*
     170             :          * The HEALTHY scrub type is a request from userspace to clear all the
     171             :          * indirect flags after a clean scan of the entire filesystem.  As such
     172             :          * there's no sick flag defined for it, so we branch here ahead of the
     173             :          * mask check.
     174             :          */
     175   430269231 :         if (sc->sm->sm_type == XFS_SCRUB_TYPE_HEALTHY &&
     176             :             !(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) {
     177       11155 :                 xchk_mark_all_healthy(sc->mp);
     178       11155 :                 return;
     179             :         }
     180             : 
     181   430258076 :         if (!sc->sick_mask)
     182             :                 return;
     183             : 
     184   430101116 :         bad = (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
     185             :                                    XFS_SCRUB_OFLAG_XCORRUPT));
     186   430101116 :         switch (type_to_health_flag[sc->sm->sm_type].group) {
     187     2926413 :         case XHG_AG:
     188     2926413 :                 pag = xfs_perag_get(sc->mp, sc->sm->sm_agno);
     189     2926408 :                 if (bad) {
     190          18 :                         xfs_ag_mark_sick(pag, sc->sick_mask);
     191          18 :                         xfs_ag_mark_checked(pag, sc->sick_mask);
     192             :                 } else
     193     2926390 :                         xfs_ag_mark_healthy(pag, sc->sick_mask);
     194     2926399 :                 xfs_perag_put(pag);
     195     2926399 :                 break;
     196   426699160 :         case XHG_INO:
     197   426699160 :                 if (!sc->ip)
     198             :                         return;
     199   426699160 :                 if (bad) {
     200           6 :                         unsigned int    mask = sc->sick_mask;
     201             : 
     202             :                         /*
     203             :                          * If we're coming in for repairs then we don't want
     204             :                          * sickness flags to propagate to the incore health
     205             :                          * status if the inode gets inactivated before we can
     206             :                          * fix it.
     207             :                          */
     208           6 :                         if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
     209           0 :                                 mask |= XFS_SICK_INO_FORGET;
     210           6 :                         xfs_inode_mark_sick(sc->ip, mask);
     211           6 :                         xfs_inode_mark_checked(sc->ip, sc->sick_mask);
     212             :                 } else
     213   426699154 :                         xfs_inode_mark_healthy(sc->ip, sc->sick_mask);
     214             :                 break;
     215      242786 :         case XHG_FS:
     216      242786 :                 if (bad) {
     217           0 :                         xfs_fs_mark_sick(sc->mp, sc->sick_mask);
     218           0 :                         xfs_fs_mark_checked(sc->mp, sc->sick_mask);
     219             :                 } else
     220      242786 :                         xfs_fs_mark_healthy(sc->mp, sc->sick_mask);
     221             :                 break;
     222       95153 :         case XHG_RT:
     223       95153 :                 if (bad) {
     224           0 :                         xfs_rt_mark_sick(sc->mp, sc->sick_mask);
     225           0 :                         xfs_rt_mark_checked(sc->mp, sc->sick_mask);
     226             :                 } else
     227       95153 :                         xfs_rt_mark_healthy(sc->mp, sc->sick_mask);
     228             :                 break;
     229      137604 :         case XHG_RTGROUP:
     230      137604 :                 rtg = xfs_rtgroup_get(sc->mp, sc->sm->sm_agno);
     231      137602 :                 if (bad) {
     232           0 :                         xfs_rtgroup_mark_sick(rtg, sc->sick_mask);
     233           0 :                         xfs_rtgroup_mark_checked(rtg, sc->sick_mask);
     234             :                 } else
     235      137602 :                         xfs_rtgroup_mark_healthy(rtg, sc->sick_mask);
     236      137604 :                 xfs_rtgroup_put(rtg);
     237      137604 :                 break;
     238           0 :         default:
     239           0 :                 ASSERT(0);
     240           0 :                 break;
     241             :         }
     242             : }
     243             : 
     244             : /* Is the given per-AG btree healthy enough for scanning? */
     245             : bool
     246  3522596366 : xchk_ag_btree_healthy_enough(
     247             :         struct xfs_scrub        *sc,
     248             :         struct xfs_perag        *pag,
     249             :         xfs_btnum_t             btnum)
     250             : {
     251  3522596366 :         unsigned int            mask = 0;
     252             : 
     253             :         /*
     254             :          * We always want the cursor if it's the same type as whatever we're
     255             :          * scrubbing, even if we already know the structure is corrupt.
     256             :          *
     257             :          * Otherwise, we're only interested in the btree for cross-referencing.
     258             :          * If we know the btree is bad then don't bother, just set XFAIL.
     259             :          */
     260  3522596366 :         switch (btnum) {
     261   587102859 :         case XFS_BTNUM_BNO:
     262   587102859 :                 if (sc->sm->sm_type == XFS_SCRUB_TYPE_BNOBT)
     263             :                         return true;
     264             :                 mask = XFS_SICK_AG_BNOBT;
     265             :                 break;
     266   587102967 :         case XFS_BTNUM_CNT:
     267   587102967 :                 if (sc->sm->sm_type == XFS_SCRUB_TYPE_CNTBT)
     268             :                         return true;
     269             :                 mask = XFS_SICK_AG_CNTBT;
     270             :                 break;
     271   587103053 :         case XFS_BTNUM_INO:
     272   587103053 :                 if (sc->sm->sm_type == XFS_SCRUB_TYPE_INOBT)
     273             :                         return true;
     274             :                 mask = XFS_SICK_AG_INOBT;
     275             :                 break;
     276   587102696 :         case XFS_BTNUM_FINO:
     277   587102696 :                 if (sc->sm->sm_type == XFS_SCRUB_TYPE_FINOBT)
     278             :                         return true;
     279             :                 mask = XFS_SICK_AG_FINOBT;
     280             :                 break;
     281   587094663 :         case XFS_BTNUM_RMAP:
     282   587094663 :                 if (sc->sm->sm_type == XFS_SCRUB_TYPE_RMAPBT)
     283             :                         return true;
     284             :                 mask = XFS_SICK_AG_RMAPBT;
     285             :                 break;
     286   587090128 :         case XFS_BTNUM_REFC:
     287   587090128 :                 if (sc->sm->sm_type == XFS_SCRUB_TYPE_REFCNTBT)
     288             :                         return true;
     289             :                 mask = XFS_SICK_AG_REFCNTBT;
     290             :                 break;
     291           0 :         default:
     292           0 :                 ASSERT(0);
     293           0 :                 return true;
     294             :         }
     295             : 
     296             :         /*
     297             :          * If we just repaired some AG metadata, sc->sick_mask will reflect all
     298             :          * the per-AG metadata types that were repaired.  Exclude these from
     299             :          * the filesystem health query because we have not yet updated the
     300             :          * health status and we want everything to be scanned.
     301             :          */
     302  3521487528 :         if ((sc->flags & XREP_ALREADY_FIXED) &&
     303   384108016 :             type_to_health_flag[sc->sm->sm_type].group == XHG_AG)
     304     4155614 :                 mask &= ~sc->sick_mask;
     305             : 
     306  3521487528 :         if (xfs_ag_has_sickness(pag, mask)) {
     307           0 :                 sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL;
     308           0 :                 return false;
     309             :         }
     310             : 
     311             :         return true;
     312             : }
     313             : 
     314             : /*
     315             :  * Quick scan to double-check that there isn't any evidence of lingering
     316             :  * primary health problems.  If we're still clear, then the health update will
     317             :  * take care of clearing the indirect evidence.
     318             :  */
     319             : int
     320       11155 : xchk_health_record(
     321             :         struct xfs_scrub        *sc)
     322             : {
     323       11155 :         struct xfs_mount        *mp = sc->mp;
     324       11155 :         struct xfs_perag        *pag;
     325       11155 :         struct xfs_rtgroup      *rtg;
     326       11155 :         xfs_agnumber_t          agno;
     327       11155 :         xfs_rgnumber_t          rgno;
     328             : 
     329       11155 :         unsigned int            sick;
     330       11155 :         unsigned int            checked;
     331             : 
     332       11155 :         xfs_fs_measure_sickness(mp, &sick, &checked);
     333       11155 :         if (sick & XFS_SICK_FS_PRIMARY)
     334           0 :                 xchk_set_corrupt(sc);
     335             : 
     336       11155 :         xfs_rt_measure_sickness(mp, &sick, &checked);
     337       11155 :         if (sick & XFS_SICK_RT_PRIMARY)
     338           0 :                 xchk_set_corrupt(sc);
     339             : 
     340       63794 :         for_each_perag(mp, agno, pag) {
     341       52639 :                 xfs_ag_measure_sickness(pag, &sick, &checked);
     342       52639 :                 if (sick & XFS_SICK_AG_PRIMARY)
     343           0 :                         xchk_set_corrupt(sc);
     344             :         }
     345             : 
     346       30001 :         for_each_rtgroup(mp, rgno, rtg) {
     347       18846 :                 xfs_rtgroup_measure_sickness(rtg, &sick, &checked);
     348       18846 :                 if (sick & XFS_SICK_RT_PRIMARY)
     349           0 :                         xchk_set_corrupt(sc);
     350             :         }
     351             : 
     352       11155 :         return 0;
     353             : }

Generated by: LCOV version 1.14