LCOV - code coverage report
Current view: top level - fs/xfs/scrub - repair.c (source / functions) Hit Total Coverage
Test: fstests of 6.5.0-rc3-djwx @ Mon Jul 31 20:08:22 PDT 2023 Lines: 161 335 48.1 %
Date: 2023-07-31 20:08:22 Functions: 10 20 50.0 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-or-later
       2             : /*
       3             :  * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
       4             :  * Author: Darrick J. Wong <djwong@kernel.org>
       5             :  */
       6             : #include "xfs.h"
       7             : #include "xfs_fs.h"
       8             : #include "xfs_shared.h"
       9             : #include "xfs_format.h"
      10             : #include "xfs_trans_resv.h"
      11             : #include "xfs_mount.h"
      12             : #include "xfs_btree.h"
      13             : #include "xfs_log_format.h"
      14             : #include "xfs_trans.h"
      15             : #include "xfs_sb.h"
      16             : #include "xfs_inode.h"
      17             : #include "xfs_alloc.h"
      18             : #include "xfs_alloc_btree.h"
      19             : #include "xfs_ialloc.h"
      20             : #include "xfs_ialloc_btree.h"
      21             : #include "xfs_rmap.h"
      22             : #include "xfs_rmap_btree.h"
      23             : #include "xfs_refcount_btree.h"
      24             : #include "xfs_extent_busy.h"
      25             : #include "xfs_ag.h"
      26             : #include "xfs_ag_resv.h"
      27             : #include "xfs_quota.h"
      28             : #include "xfs_qm.h"
      29             : #include "scrub/scrub.h"
      30             : #include "scrub/common.h"
      31             : #include "scrub/trace.h"
      32             : #include "scrub/repair.h"
      33             : #include "scrub/bitmap.h"
      34             : 
      35             : /*
      36             :  * Attempt to repair some metadata, if the metadata is corrupt and userspace
      37             :  * told us to fix it.  This function returns -EAGAIN to mean "re-run scrub",
       38             :  * and sets XREP_ALREADY_FIXED in sc->flags if it thinks it repaired anything.
      39             :  */
      40             : int
      41   316679776 : xrep_attempt(
      42             :         struct xfs_scrub        *sc)
      43             : {
      44   316679776 :         int                     error = 0;
      45             : 
      46   316679776 :         trace_xrep_attempt(XFS_I(file_inode(sc->file)), sc->sm, error);
      47             : 
      48   316407764 :         xchk_ag_btcur_free(&sc->sa);
      49             : 
      50             :         /* Repair whatever's broken. */
      51   316426239 :         ASSERT(sc->ops->repair);
      52   316426239 :         error = sc->ops->repair(sc);
      53   316460650 :         trace_xrep_done(XFS_I(file_inode(sc->file)), sc->sm, error);
      54   316385379 :         switch (error) {
      55     1021998 :         case 0:
      56             :                 /*
      57             :                  * Repair succeeded.  Commit the fixes and perform a second
      58             :                  * scrub so that we can tell userspace if we fixed the problem.
      59             :                  */
      60     1021998 :                 sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
      61     1021998 :                 sc->flags |= XREP_ALREADY_FIXED;
      62     1021998 :                 return -EAGAIN;
      63           0 :         case -ECHRNG:
      64           0 :                 sc->flags |= XCHK_NEED_DRAIN;
      65           0 :                 return -EAGAIN;
      66           0 :         case -EDEADLOCK:
      67             :                 /* Tell the caller to try again having grabbed all the locks. */
      68           0 :                 if (!(sc->flags & XCHK_TRY_HARDER)) {
      69           0 :                         sc->flags |= XCHK_TRY_HARDER;
      70           0 :                         return -EAGAIN;
      71             :                 }
      72             :                 /*
      73             :                  * We tried harder but still couldn't grab all the resources
      74             :                  * we needed to fix it.  The corruption has not been fixed,
      75             :                  * so exit to userspace with the scan's output flags unchanged.
      76             :                  */
      77             :                 return 0;
      78   315363381 :         default:
      79             :                 /*
      80             :                  * EAGAIN tells the caller to re-scrub, so we cannot return
      81             :                  * that here.
      82             :                  */
      83   315363381 :                 ASSERT(error != -EAGAIN);
      84             :                 return error;
      85             :         }
      86             : }
      87             : 
      88             : /*
      89             :  * Complain about unfixable problems in the filesystem.  We don't log
      90             :  * corruptions when IFLAG_REPAIR wasn't set on the assumption that the driver
      91             :  * program is xfs_scrub, which will call back with IFLAG_REPAIR set if the
      92             :  * administrator isn't running xfs_scrub in no-repairs mode.
      93             :  *
      94             :  * Use this helper function because _ratelimited silently declares a static
      95             :  * structure to track rate limiting information.
      96             :  */
      97             : void
      98   315504377 : xrep_failure(
      99             :         struct xfs_mount        *mp)
     100             : {
     101   315504377 :         xfs_alert_ratelimited(mp,
     102             : "Corruption not fixed during online repair.  Unmount and run xfs_repair.");
     103   315597253 : }
     104             : 
     105             : /*
     106             :  * Repair probe -- userspace uses this to probe if we're willing to repair a
     107             :  * given mountpoint.
     108             :  */
     109             : int
     110       51014 : xrep_probe(
     111             :         struct xfs_scrub        *sc)
     112             : {
     113       51014 :         int                     error = 0;
     114             : 
     115       51014 :         if (xchk_should_terminate(sc, &error))
     116           0 :                 return error;
     117             : 
     118             :         return 0;
     119             : }
     120             : 
     121             : /*
     122             :  * Roll a transaction, keeping the AG headers locked and reinitializing
     123             :  * the btree cursors.
     124             :  */
     125             : int
     126      205281 : xrep_roll_ag_trans(
     127             :         struct xfs_scrub        *sc)
     128             : {
     129      205281 :         int                     error;
     130             : 
     131             :         /*
     132             :          * Keep the AG header buffers locked while we roll the transaction.
     133             :          * Ensure that both AG buffers are dirty and held when we roll the
     134             :          * transaction so that they move forward in the log without losing the
     135             :          * bli (and hence the bli type) when the transaction commits.
     136             :          *
     137             :          * Normal code would never hold clean buffers across a roll, but repair
     138             :          * needs both buffers to maintain a total lock on the AG.
     139             :          */
     140      205281 :         if (sc->sa.agi_bp) {
     141      205281 :                 xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp, XFS_AGI_MAGICNUM);
     142      205300 :                 xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
     143             :         }
     144             : 
     145      205042 :         if (sc->sa.agf_bp) {
     146      205042 :                 xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_MAGICNUM);
     147      205482 :                 xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
     148             :         }
     149             : 
     150             :         /*
     151             :          * Roll the transaction.  We still hold the AG header buffers locked
     152             :          * regardless of whether or not that succeeds.  On failure, the buffers
     153             :          * will be released during teardown on our way out of the kernel.  If
     154             :          * successful, join the buffers to the new transaction and move on.
     155             :          */
     156      205461 :         error = xfs_trans_roll(&sc->tp);
     157      205288 :         if (error)
     158             :                 return error;
     159             : 
     160             :         /* Join the AG headers to the new transaction. */
     161      205288 :         if (sc->sa.agi_bp)
     162      205288 :                 xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
     163      205073 :         if (sc->sa.agf_bp)
     164      205073 :                 xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
     165             : 
     166             :         return 0;
     167             : }
     168             : 
     169             : /*
     170             :  * Does the given AG have enough space to rebuild a btree?  Neither AG
     171             :  * reservation can be critical, and we must have enough space (factoring
     172             :  * in AG reservations) to construct a whole btree.
     173             :  */
     174             : bool
     175           0 : xrep_ag_has_space(
     176             :         struct xfs_perag        *pag,
     177             :         xfs_extlen_t            nr_blocks,
     178             :         enum xfs_ag_resv_type   type)
     179             : {
     180           0 :         return  !xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) &&
     181           0 :                 !xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA) &&
     182           0 :                 pag->pagf_freeblks > xfs_ag_resv_needed(pag, type) + nr_blocks;
     183             : }
     184             : 
     185             : /*
     186             :  * Figure out how many blocks to reserve for an AG repair.  We calculate the
     187             :  * worst case estimate for the number of blocks we'd need to rebuild one of
     188             :  * any type of per-AG btree.
     189             :  */
     190             : xfs_extlen_t
     191    11525236 : xrep_calc_ag_resblks(
     192             :         struct xfs_scrub                *sc)
     193             : {
     194    11525236 :         struct xfs_mount                *mp = sc->mp;
     195    11525236 :         struct xfs_scrub_metadata       *sm = sc->sm;
     196    11525236 :         struct xfs_perag                *pag;
     197    11525236 :         struct xfs_buf                  *bp;
     198    11525236 :         xfs_agino_t                     icount = NULLAGINO;
     199    11525236 :         xfs_extlen_t                    aglen = NULLAGBLOCK;
     200    11525236 :         xfs_extlen_t                    usedlen;
     201    11525236 :         xfs_extlen_t                    freelen;
     202    11525236 :         xfs_extlen_t                    bnobt_sz;
     203    11525236 :         xfs_extlen_t                    inobt_sz;
     204    11525236 :         xfs_extlen_t                    rmapbt_sz;
     205    11525236 :         xfs_extlen_t                    refcbt_sz;
     206    11525236 :         int                             error;
     207             : 
     208    11525236 :         if (!(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
     209             :                 return 0;
     210             : 
     211     3558399 :         pag = xfs_perag_get(mp, sm->sm_agno);
     212     7123144 :         if (xfs_perag_initialised_agi(pag)) {
     213             :                 /* Use in-core icount if possible. */
     214     3561572 :                 icount = pag->pagi_count;
     215             :         } else {
     216             :                 /* Try to get the actual counters from disk. */
     217           0 :                 error = xfs_ialloc_read_agi(pag, NULL, &bp);
     218           0 :                 if (!error) {
     219           0 :                         icount = pag->pagi_count;
     220           0 :                         xfs_buf_relse(bp);
     221             :                 }
     222             :         }
     223             : 
     224             :         /* Now grab the block counters from the AGF. */
     225     3561572 :         error = xfs_alloc_read_agf(pag, NULL, 0, &bp);
     226     3558122 :         if (error) {
     227           0 :                 aglen = pag->block_count;
     228           0 :                 freelen = aglen;
     229           0 :                 usedlen = aglen;
     230             :         } else {
     231     3558122 :                 struct xfs_agf  *agf = bp->b_addr;
     232             : 
     233     3558122 :                 aglen = be32_to_cpu(agf->agf_length);
     234     3558122 :                 freelen = be32_to_cpu(agf->agf_freeblks);
     235     3558122 :                 usedlen = aglen - freelen;
     236     3558122 :                 xfs_buf_relse(bp);
     237             :         }
     238             : 
     239             :         /* If the icount is impossible, make some worst-case assumptions. */
     240     3559807 :         if (icount == NULLAGINO ||
     241             :             !xfs_verify_agino(pag, icount)) {
     242     1702548 :                 icount = pag->agino_max - pag->agino_min + 1;
     243             :         }
     244             : 
     245             :         /* If the block counts are impossible, make worst-case assumptions. */
     246     3559807 :         if (aglen == NULLAGBLOCK ||
     247     3559234 :             aglen != pag->block_count ||
     248             :             freelen >= aglen) {
     249         590 :                 aglen = pag->block_count;
     250         590 :                 freelen = aglen;
     251         590 :                 usedlen = aglen;
     252             :         }
     253     3559807 :         xfs_perag_put(pag);
     254             : 
     255     3561153 :         trace_xrep_calc_ag_resblks(mp, sm->sm_agno, icount, aglen,
     256             :                         freelen, usedlen);
     257             : 
     258             :         /*
     259             :          * Figure out how many blocks we'd need worst case to rebuild
     260             :          * each type of btree.  Note that we can only rebuild the
     261             :          * bnobt/cntbt or inobt/finobt as pairs.
     262             :          */
     263     3559706 :         bnobt_sz = 2 * xfs_allocbt_calc_size(mp, freelen);
     264     3558050 :         if (xfs_has_sparseinodes(mp))
     265     3558050 :                 inobt_sz = xfs_iallocbt_calc_size(mp, icount /
     266             :                                 XFS_INODES_PER_HOLEMASK_BIT);
     267             :         else
     268           0 :                 inobt_sz = xfs_iallocbt_calc_size(mp, icount /
     269             :                                 XFS_INODES_PER_CHUNK);
     270     3557833 :         if (xfs_has_finobt(mp))
     271     3557882 :                 inobt_sz *= 2;
     272     3557833 :         if (xfs_has_reflink(mp))
     273     2681145 :                 refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen);
     274             :         else
     275             :                 refcbt_sz = 0;
     276     3557014 :         if (xfs_has_rmapbt(mp)) {
     277             :                 /*
     278             :                  * Guess how many blocks we need to rebuild the rmapbt.
     279             :                  * For non-reflink filesystems we can't have more records than
     280             :                  * used blocks.  However, with reflink it's possible to have
     281             :                  * more than one rmap record per AG block.  We don't know how
     282             :                  * many rmaps there could be in the AG, so we start off with
      283             :                  * what we hope is a generous over-estimation.
     284             :                  */
     285     2680419 :                 if (xfs_has_reflink(mp))
     286     2680419 :                         rmapbt_sz = xfs_rmapbt_calc_size(mp,
     287     2680419 :                                         (unsigned long long)aglen * 2);
     288             :                 else
     289           0 :                         rmapbt_sz = xfs_rmapbt_calc_size(mp, usedlen);
     290             :         } else {
     291             :                 rmapbt_sz = 0;
     292             :         }
     293             : 
     294     3557157 :         trace_xrep_calc_ag_resblks_btsize(mp, sm->sm_agno, bnobt_sz,
     295             :                         inobt_sz, rmapbt_sz, refcbt_sz);
     296             : 
     297     3556944 :         return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz));
     298             : }
     299             : 
     300             : /* Allocate a block in an AG. */
     301             : int
     302           0 : xrep_alloc_ag_block(
     303             :         struct xfs_scrub                *sc,
     304             :         const struct xfs_owner_info     *oinfo,
     305             :         xfs_fsblock_t                   *fsbno,
     306             :         enum xfs_ag_resv_type           resv)
     307             : {
     308           0 :         struct xfs_alloc_arg            args = {0};
     309           0 :         xfs_agblock_t                   bno;
     310           0 :         int                             error;
     311             : 
     312           0 :         switch (resv) {
     313           0 :         case XFS_AG_RESV_AGFL:
     314             :         case XFS_AG_RESV_RMAPBT:
     315           0 :                 error = xfs_alloc_get_freelist(sc->sa.pag, sc->tp,
     316             :                                 sc->sa.agf_bp, &bno, 1);
     317           0 :                 if (error)
     318             :                         return error;
     319           0 :                 if (bno == NULLAGBLOCK)
     320             :                         return -ENOSPC;
     321           0 :                 xfs_extent_busy_reuse(sc->mp, sc->sa.pag, bno, 1, false);
     322           0 :                 *fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, bno);
     323           0 :                 if (resv == XFS_AG_RESV_RMAPBT)
     324           0 :                         xfs_ag_resv_rmapbt_alloc(sc->mp, sc->sa.pag->pag_agno);
     325             :                 return 0;
     326             :         default:
     327           0 :                 break;
     328             :         }
     329             : 
     330           0 :         args.tp = sc->tp;
     331           0 :         args.mp = sc->mp;
     332           0 :         args.pag = sc->sa.pag;
     333           0 :         args.oinfo = *oinfo;
     334           0 :         args.minlen = 1;
     335           0 :         args.maxlen = 1;
     336           0 :         args.prod = 1;
     337           0 :         args.resv = resv;
     338             : 
     339           0 :         error = xfs_alloc_vextent_this_ag(&args, sc->sa.pag->pag_agno);
     340           0 :         if (error)
     341             :                 return error;
     342           0 :         if (args.fsbno == NULLFSBLOCK)
     343             :                 return -ENOSPC;
     344           0 :         ASSERT(args.len == 1);
     345           0 :         *fsbno = args.fsbno;
     346             : 
     347           0 :         return 0;
     348             : }
     349             : 
     350             : /* Initialize a new AG btree root block with zero entries. */
     351             : int
     352           0 : xrep_init_btblock(
     353             :         struct xfs_scrub                *sc,
     354             :         xfs_fsblock_t                   fsb,
     355             :         struct xfs_buf                  **bpp,
     356             :         xfs_btnum_t                     btnum,
     357             :         const struct xfs_buf_ops        *ops)
     358             : {
     359           0 :         struct xfs_trans                *tp = sc->tp;
     360           0 :         struct xfs_mount                *mp = sc->mp;
     361           0 :         struct xfs_buf                  *bp;
     362           0 :         int                             error;
     363             : 
     364           0 :         trace_xrep_init_btblock(mp, XFS_FSB_TO_AGNO(mp, fsb),
     365           0 :                         XFS_FSB_TO_AGBNO(mp, fsb), btnum);
     366             : 
     367           0 :         ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.pag->pag_agno);
     368           0 :         error = xfs_trans_get_buf(tp, mp->m_ddev_targp,
     369           0 :                         XFS_FSB_TO_DADDR(mp, fsb), XFS_FSB_TO_BB(mp, 1), 0,
     370             :                         &bp);
     371           0 :         if (error)
     372             :                 return error;
     373           0 :         xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
     374           0 :         xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.pag->pag_agno);
     375           0 :         xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
     376           0 :         xfs_trans_log_buf(tp, bp, 0, BBTOB(bp->b_length) - 1);
     377           0 :         bp->b_ops = ops;
     378           0 :         *bpp = bp;
     379             : 
     380           0 :         return 0;
     381             : }
     382             : 
     383             : /*
     384             :  * Reconstructing per-AG Btrees
     385             :  *
     386             :  * When a space btree is corrupt, we don't bother trying to fix it.  Instead,
     387             :  * we scan secondary space metadata to derive the records that should be in
     388             :  * the damaged btree, initialize a fresh btree root, and insert the records.
     389             :  * Note that for rebuilding the rmapbt we scan all the primary data to
     390             :  * generate the new records.
     391             :  *
     392             :  * However, that leaves the matter of removing all the metadata describing the
     393             :  * old broken structure.  For primary metadata we use the rmap data to collect
     394             :  * every extent with a matching rmap owner (bitmap); we then iterate all other
     395             :  * metadata structures with the same rmap owner to collect the extents that
     396             :  * cannot be removed (sublist).  We then subtract sublist from bitmap to
     397             :  * derive the blocks that were used by the old btree.  These blocks can be
     398             :  * reaped.
     399             :  *
     400             :  * For rmapbt reconstructions we must use different tactics for extent
     401             :  * collection.  First we iterate all primary metadata (this excludes the old
     402             :  * rmapbt, obviously) to generate new rmap records.  The gaps in the rmap
     403             :  * records are collected as bitmap.  The bnobt records are collected as
     404             :  * sublist.  As with the other btrees we subtract sublist from bitmap, and the
      405             :  * result (since the rmapbt lives in the free space) is the set of blocks from
      406             :  * the old rmapbt.
     407             :  *
     408             :  * Disposal of Blocks from Old per-AG Btrees
     409             :  *
     410             :  * Now that we've constructed a new btree to replace the damaged one, we want
     411             :  * to dispose of the blocks that (we think) the old btree was using.
     412             :  * Previously, we used the rmapbt to collect the extents (bitmap) with the
     413             :  * rmap owner corresponding to the tree we rebuilt, collected extents for any
     414             :  * blocks with the same rmap owner that are owned by another data structure
     415             :  * (sublist), and subtracted sublist from bitmap.  In theory the extents
     416             :  * remaining in bitmap are the old btree's blocks.
     417             :  *
     418             :  * Unfortunately, it's possible that the btree was crosslinked with other
     419             :  * blocks on disk.  The rmap data can tell us if there are multiple owners, so
     420             :  * if the rmapbt says there is an owner of this block other than @oinfo, then
     421             :  * the block is crosslinked.  Remove the reverse mapping and continue.
     422             :  *
     423             :  * If there is one rmap record, we can free the block, which removes the
     424             :  * reverse mapping but doesn't add the block to the free space.  Our repair
     425             :  * strategy is to hope the other metadata objects crosslinked on this block
     426             :  * will be rebuilt (atop different blocks), thereby removing all the cross
     427             :  * links.
     428             :  *
     429             :  * If there are no rmap records at all, we also free the block.  If the btree
     430             :  * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't
     431             :  * supposed to be a rmap record and everything is ok.  For other btrees there
     432             :  * had to have been an rmap entry for the block to have ended up on @bitmap,
     433             :  * so if it's gone now there's something wrong and the fs will shut down.
     434             :  *
     435             :  * Note: If there are multiple rmap records with only the same rmap owner as
     436             :  * the btree we're trying to rebuild and the block is indeed owned by another
     437             :  * data structure with the same rmap owner, then the block will be in sublist
     438             :  * and therefore doesn't need disposal.  If there are multiple rmap records
     439             :  * with only the same rmap owner but the block is not owned by something with
     440             :  * the same rmap owner, the block will be freed.
     441             :  *
     442             :  * The caller is responsible for locking the AG headers for the entire rebuild
     443             :  * operation so that nothing else can sneak in and change the AG state while
     444             :  * we're not looking.  We also assume that the caller already invalidated any
     445             :  * buffers associated with @bitmap.
     446             :  */
     447             : 
     448             : static int
     449           0 : xrep_invalidate_block(
     450             :         uint64_t                fsbno,
     451             :         void                    *priv)
     452             : {
     453           0 :         struct xfs_scrub        *sc = priv;
     454           0 :         struct xfs_buf          *bp;
     455           0 :         int                     error;
     456             : 
     457             :         /* Skip AG headers and post-EOFS blocks */
     458           0 :         if (!xfs_verify_fsbno(sc->mp, fsbno))
     459             :                 return 0;
     460             : 
     461           0 :         error = xfs_buf_incore(sc->mp->m_ddev_targp,
     462           0 :                         XFS_FSB_TO_DADDR(sc->mp, fsbno),
     463           0 :                         XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK, &bp);
     464           0 :         if (error)
     465             :                 return 0;
     466             : 
     467           0 :         xfs_trans_bjoin(sc->tp, bp);
     468           0 :         xfs_trans_binval(sc->tp, bp);
     469           0 :         return 0;
     470             : }
     471             : 
     472             : /*
     473             :  * Invalidate buffers for per-AG btree blocks we're dumping.  This function
     474             :  * is not intended for use with file data repairs; we have bunmapi for that.
     475             :  */
     476             : int
     477           0 : xrep_invalidate_blocks(
     478             :         struct xfs_scrub        *sc,
     479             :         struct xbitmap          *bitmap)
     480             : {
     481             :         /*
     482             :          * For each block in each extent, see if there's an incore buffer for
     483             :          * exactly that block; if so, invalidate it.  The buffer cache only
     484             :          * lets us look for one buffer at a time, so we have to look one block
     485             :          * at a time.  Avoid invalidating AG headers and post-EOFS blocks
     486             :          * because we never own those; and if we can't TRYLOCK the buffer we
     487             :          * assume it's owned by someone else.
     488             :          */
     489           0 :         return xbitmap_walk_bits(bitmap, xrep_invalidate_block, sc);
     490             : }
     491             : 
     492             : /* Ensure the freelist is the correct size. */
     493             : int
     494           0 : xrep_fix_freelist(
     495             :         struct xfs_scrub        *sc,
     496             :         bool                    can_shrink)
     497             : {
     498           0 :         struct xfs_alloc_arg    args = {0};
     499             : 
     500           0 :         args.mp = sc->mp;
     501           0 :         args.tp = sc->tp;
     502           0 :         args.agno = sc->sa.pag->pag_agno;
     503           0 :         args.alignment = 1;
     504           0 :         args.pag = sc->sa.pag;
     505             : 
     506           0 :         return xfs_alloc_fix_freelist(&args,
     507             :                         can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK);
     508             : }
     509             : 
     510             : /* Information about reaping extents after a repair. */
     511             : struct xrep_reap_state {
     512             :         struct xfs_scrub                *sc;
     513             : 
     514             :         /* Reverse mapping owner and metadata reservation type. */
     515             :         const struct xfs_owner_info     *oinfo;
     516             :         enum xfs_ag_resv_type           resv;
     517             : };
     518             : 
     519             : /*
     520             :  * Put a block back on the AGFL.
                     :  *
                     :  * Makes room on the freelist with xrep_fix_freelist(), records an
                     :  * XFS_RMAP_OINFO_AG reverse mapping for the single block @agbno, adds the
                     :  * block to the AGFL, and finally inserts a busy extent for it with
                     :  * XFS_EXTENT_BUSY_SKIP_DISCARD.  Returns 0, or the error from the first
                     :  * step that fails.
     521             :  */
     522             : STATIC int
     523           0 : xrep_put_freelist(
     524             :         struct xfs_scrub        *sc,
     525             :         xfs_agblock_t           agbno)
     526             : {
     527           0 :         struct xfs_buf          *agfl_bp;
     528           0 :         int                     error;
     529             : 
     530             :         /* Make sure there's space on the freelist. */
     531           0 :         error = xrep_fix_freelist(sc, true);
     532           0 :         if (error)
     533             :                 return error;
     534             : 
     535             :         /*
     536             :          * Since we're "freeing" a lost block onto the AGFL, we have to
     537             :          * create an rmap for the block prior to merging it or else other
     538             :          * parts will break.
     539             :          */
     540           0 :         error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, 1,
     541             :                         &XFS_RMAP_OINFO_AG);
     542           0 :         if (error)
     543             :                 return error;
     544             : 
     545             :         /* Put the block on the AGFL. */
     546           0 :         error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
     547           0 :         if (error)
     548             :                 return error;
     549             : 
     550           0 :         error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp,
     551             :                         agfl_bp, agbno, 0);
     552           0 :         if (error)
     553             :                 return error;
     554           0 :         xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1,
     555             :                         XFS_EXTENT_BUSY_SKIP_DISCARD);
     556             : 
     557           0 :         return 0;
     558             : }
     559             : 
     560             : /*
                     :  * Dispose of a single block.
                     :  *
                     :  * Called from xrep_reap_extents() through xbitmap_walk_bits(); @priv is the
                     :  * xrep_reap_state.  If xfs_rmap_has_other_keys() finds another rmap for
                     :  * the block, only the rmap for rs->oinfo is removed.  Otherwise the block
                     :  * goes back on the AGFL when rs->resv is XFS_AG_RESV_AGFL, or is freed
                     :  * with xfs_free_extent().  On success the transaction is rolled before
                     :  * returning; on failure the error is returned without rolling.
                     :  */
     561             : STATIC int
     562           0 : xrep_reap_block(
     563             :         uint64_t                        fsbno,
     564             :         void                            *priv)
     565             : {
     566           0 :         struct xrep_reap_state          *rs = priv;
     567           0 :         struct xfs_scrub                *sc = rs->sc;
     568           0 :         struct xfs_btree_cur            *cur;
     569           0 :         struct xfs_buf                  *agf_bp = NULL;
     570           0 :         xfs_agblock_t                   agbno;
     571           0 :         bool                            has_other_rmap;
     572           0 :         int                             error;
     573             : 
     574           0 :         ASSERT(sc->ip != NULL ||
     575             :                XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno);
     576           0 :         trace_xrep_dispose_btree_extent(sc->mp,
     577           0 :                         XFS_FSB_TO_AGNO(sc->mp, fsbno),
     578           0 :                         XFS_FSB_TO_AGBNO(sc->mp, fsbno), 1);
     579             : 
     580           0 :         agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
     581           0 :         ASSERT(XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno);
     582             : 
     583             :         /*
     584             :          * If we are repairing per-inode metadata, we need to read in the AGF
     585             :          * buffer.  Otherwise, we're repairing a per-AG structure, so reuse
     586             :          * the AGF buffer that the setup functions already grabbed.
     587             :          */
     588           0 :         if (sc->ip) {
     589           0 :                 error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp);
     590           0 :                 if (error)
     591             :                         return error;
     592             :         } else {
     593           0 :                 agf_bp = sc->sa.agf_bp;
     594             :         }
     595           0 :         cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, sc->sa.pag);
     596             : 
     597             :         /* Can we find any other rmappings? */
     598           0 :         error = xfs_rmap_has_other_keys(cur, agbno, 1, rs->oinfo,
     599             :                         &has_other_rmap);
     600           0 :         xfs_btree_del_cursor(cur, error);
     601           0 :         if (error)
     602           0 :                 goto out_free;
     603             : 
     604             :         /*
     605             :          * If there are other rmappings, this block is cross linked and must
     606             :          * not be freed.  Remove the reverse mapping and move on.  Otherwise,
     607             :          * we were the only owner of the block, so free the extent, which will
     608             :          * also remove the rmap.
     609             :          *
     610             :          * XXX: XFS doesn't support detecting the case where a single block
     611             :          * metadata structure is crosslinked with a multi-block structure
     612             :          * because the buffer cache doesn't detect aliasing problems, so we
     613             :          * can't fix 100% of crosslinking problems (yet).  The verifiers will
     614             :          * blow on writeout, the filesystem will shut down, and the admin gets
     615             :          * to run xfs_repair.
     616             :          */
     617           0 :         if (has_other_rmap)
     618           0 :                 error = xfs_rmap_free(sc->tp, agf_bp, sc->sa.pag, agbno,
     619             :                                         1, rs->oinfo);
     620           0 :         else if (rs->resv == XFS_AG_RESV_AGFL)
     621           0 :                 error = xrep_put_freelist(sc, agbno);
     622             :         else
     623           0 :                 error = xfs_free_extent(sc->tp, sc->sa.pag, agbno, 1, rs->oinfo,
     624             :                                 rs->resv);
     625           0 :         if (agf_bp != sc->sa.agf_bp)
     626           0 :                 xfs_trans_brelse(sc->tp, agf_bp);
     627           0 :         if (error)
     628             :                 return error;
     629             : 
     630           0 :         if (sc->ip)
     631           0 :                 return xfs_trans_roll_inode(&sc->tp, sc->ip);
     632           0 :         return xrep_roll_ag_trans(sc);
     633             : 
     634             : out_free:
     635           0 :         if (agf_bp != sc->sa.agf_bp)
     636           0 :                 xfs_trans_brelse(sc->tp, agf_bp);
     637             :         return error;
     638             : }
     639             : 
     640             : /*
                     :  * Dispose of every block of every extent in the bitmap.
                     :  *
                     :  * Walks @bitmap with xrep_reap_block(), passing along @oinfo as the rmap
                     :  * owner and @type as the reservation type.  Asserts that the filesystem
                     :  * has an rmapbt.  Returns whatever xbitmap_walk_bits() returns.
                     :  */
     641             : int
     642      205361 : xrep_reap_extents(
     643             :         struct xfs_scrub                *sc,
     644             :         struct xbitmap                  *bitmap,
     645             :         const struct xfs_owner_info     *oinfo,
     646             :         enum xfs_ag_resv_type           type)
     647             : {
     648      205361 :         struct xrep_reap_state          rs = {
     649             :                 .sc                     = sc,
     650             :                 .oinfo                  = oinfo,
     651             :                 .resv                   = type,
     652             :         };
     653             : 
     654      205361 :         ASSERT(xfs_has_rmapbt(sc->mp));
     655             : 
     656      205361 :         return xbitmap_walk_bits(bitmap, xrep_reap_block, &rs);
     657             : }
     658             : 
     659             : /*
     660             :  * Finding per-AG Btree Roots for AGF/AGI Reconstruction
     661             :  *
     662             :  * If the AGF or AGI become slightly corrupted, it may be necessary to rebuild
     663             :  * the AG headers by using the rmap data to rummage through the AG looking for
     664             :  * btree roots.  This is not guaranteed to work if the AG is heavily damaged
     665             :  * or the rmap data are corrupt.
     666             :  *
     667             :  * Callers of xrep_find_ag_btree_roots must lock the AGF and AGFL
     668             :  * buffers if the AGF is being rebuilt; or the AGF and AGI buffers if the
     669             :  * AGI is being rebuilt.  They must maintain these locks until it's safe for
     670             :  * other threads to change the btrees' shapes.  The caller provides
     671             :  * information about the btrees to look for by passing in an array of
     672             :  * xrep_find_ag_btree with the (rmap owner, buf_ops, magic) fields set.
     673             :  * The (root, height) fields will be set on return if anything is found.  The
     674             :  * last element of the array should have a NULL buf_ops to mark the end of the
     675             :  * array.
     676             :  *
     677             :  * For every rmapbt record matching any of the rmap owners in btree_info,
     678             :  * read each block referenced by the rmap record.  If the block is a btree
     679             :  * block from this filesystem matching any of the magic numbers and has a
     680             :  * level higher than what we've already seen, remember the block and the
     681             :  * height of the tree required to have such a block.  When the call completes,
     682             :  * we return the highest block we've found for each btree description; those
     683             :  * should be the roots.
                     :  *
                     :  * struct xrep_findroot below is the state passed to the rmap walk: the
                     :  * scrub state, the AGFL buffer and AGF header used to filter out AGFL
                     :  * blocks, and the caller's btree_info array.
     684             :  */
     685             : 
     686             : struct xrep_findroot {
     687             :         struct xfs_scrub                *sc;
     688             :         struct xfs_buf                  *agfl_bp;
     689             :         struct xfs_agf                  *agf;
     690             :         struct xrep_find_ag_btree       *btree_info;
     691             : };
     692             : 
     693             : /*
                     :  * See if our block is in the AGFL.
                     :  *
                     :  * Callback for xfs_agfl_walk(); @priv points to the AG block number being
                     :  * looked for.  Returns -ECANCELED when @bno is that block, which
                     :  * xrep_findroot_block() takes to mean the block is on the AGFL, and 0
                     :  * otherwise.
                     :  */
     694             : STATIC int
     695   510615011 : xrep_findroot_agfl_walk(
     696             :         struct xfs_mount        *mp,
     697             :         xfs_agblock_t           bno,
     698             :         void                    *priv)
     699             : {
     700   510615011 :         xfs_agblock_t           *agbno = priv;
     701             : 
     702   510615011 :         return (*agbno == bno) ? -ECANCELED : 0;
     703             : }
     704             : 
     705             : /*
                     :  * Does this block match the btree information passed in?
                     :  *
                     :  * Reads AG block @agbno and tests it against @fab: magic number, then
                     :  * either the buffer ops already attached to the buffer or, if there are
                     :  * none, the metadata uuid and @fab's read verifier.  On a match,
                     :  * *@done_with_block is set, and @fab->height and @fab->root are updated if
                     :  * the block sits at a higher level than anything seen so far.  Blocks
                     :  * with @owner == XFS_RMAP_OWN_AG that are found on the AGFL are skipped.
                     :  * Returns 0, or the error from the AGFL walk or the buffer read.
                     :  */
     706             : STATIC int
     707    55309484 : xrep_findroot_block(
     708             :         struct xrep_findroot            *ri,
     709             :         struct xrep_find_ag_btree       *fab,
     710             :         uint64_t                        owner,
     711             :         xfs_agblock_t                   agbno,
     712             :         bool                            *done_with_block)
     713             : {
     714    55309484 :         struct xfs_mount                *mp = ri->sc->mp;
     715    55309484 :         struct xfs_buf                  *bp;
     716    55309484 :         struct xfs_btree_block          *btblock;
     717    55309484 :         xfs_daddr_t                     daddr;
     718    55309484 :         int                             block_level;
     719    55309484 :         int                             error = 0;
     720             : 
     721    55309484 :         daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.pag->pag_agno, agbno);
     722             : 
     723             :         /*
     724             :          * Blocks in the AGFL have stale contents that might just happen to
     725             :          * have a matching magic and uuid.  We don't want to pull these blocks
     726             :          * in as part of a tree root, so we have to filter out the AGFL stuff
     727             :          * here.  If the AGFL looks insane we'll just refuse to repair.
     728             :          */
     729    55309484 :         if (owner == XFS_RMAP_OWN_AG) {
     730    54208776 :                 error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp,
     731             :                                 xrep_findroot_agfl_walk, &agbno);
     732    54210137 :                 if (error == -ECANCELED)
     733             :                         return 0;
     734    50055798 :                 if (error)
     735             :                         return error;
     736             :         }
     737             : 
     738             :         /*
     739             :          * Read the buffer into memory so that we can see if it's a match for
     740             :          * our btree type.  We have no clue if it is beforehand, and we want to
     741             :          * avoid xfs_trans_read_buf's behavior of dumping the DONE state (which
     742             :          * will cause needless disk reads in subsequent calls to this function)
     743             :          * and logging metadata verifier failures.
     744             :          *
     745             :          * Therefore, pass in NULL buffer ops.  If the buffer was already in
     746             :          * memory from some other caller it will already have b_ops assigned.
     747             :          * If it was in memory from a previous unsuccessful findroot_block
     748             :          * call, the buffer won't have b_ops but it should be clean and ready
     749             :          * for us to try to verify if the read call succeeds.  The same applies
     750             :          * if the buffer wasn't in memory at all.
     751             :          *
     752             :          * Note: If we never match a btree type with this buffer, it will be
     753             :          * left in memory with NULL b_ops.  This shouldn't be a problem unless
     754             :          * the buffer gets written.
     755             :          */
     756    51156506 :         error = xfs_trans_read_buf(mp, ri->sc->tp, mp->m_ddev_targp, daddr,
     757             :                         mp->m_bsize, 0, &bp, NULL);
     758    51156604 :         if (error)
     759             :                 return error;
     760             : 
     761             :         /* Ensure the block magic matches the btree type we're looking for. */
     762    51156604 :         btblock = XFS_BUF_TO_BLOCK(bp);
     763    51156604 :         ASSERT(fab->buf_ops->magic[1] != 0);
     764    51156604 :         if (btblock->bb_magic != fab->buf_ops->magic[1])
     765    32540774 :                 goto out;
     766             : 
     767             :         /*
     768             :          * If the buffer already has ops applied and they're not the ones for
     769             :          * this btree type, we know this block doesn't match the btree and we
     770             :          * can bail out.
     771             :          *
     772             :          * If the buffer ops match ours, someone else has already validated
     773             :          * the block for us, so we can move on to checking if this is a root
     774             :          * block candidate.
     775             :          *
     776             :          * If the buffer does not have ops, nobody has successfully validated
     777             :          * the contents and the buffer cannot be dirty.  If the magic, uuid,
     778             :          * and structure match this btree type then we'll move on to checking
     779             :          * if it's a root block candidate.  If there is no match, bail out.
     780             :          */
     781    18615830 :         if (bp->b_ops) {
     782    18615830 :                 if (bp->b_ops != fab->buf_ops)
     783           0 :                         goto out;
     784             :         } else {
     785           0 :                 ASSERT(!xfs_trans_buf_is_dirty(bp));
     786           0 :                 if (!uuid_equal(&btblock->bb_u.s.bb_uuid,
     787           0 :                                 &mp->m_sb.sb_meta_uuid))
     788           0 :                         goto out;
     789             :                 /*
     790             :                  * Read verifiers can reference b_ops, so we set the pointer
     791             :                  * here.  If the verifier fails we'll reset the buffer state
     792             :                  * to what it was before we touched the buffer.
     793             :                  */
     794           0 :                 bp->b_ops = fab->buf_ops;
     795           0 :                 fab->buf_ops->verify_read(bp);
     796           0 :                 if (bp->b_error) {
     797           0 :                         bp->b_ops = NULL;
     798           0 :                         bp->b_error = 0;
     799           0 :                         goto out;
     800             :                 }
     801             : 
     802             :                 /*
     803             :                  * Some read verifiers will (re)set b_ops, so we must be
     804             :                  * careful not to change b_ops after running the verifier.
     805             :                  */
     806             :         }
     807             : 
     808             :         /*
     809             :          * This block passes the magic/uuid and verifier tests for this btree
     810             :          * type.  We don't need the caller to try the other tree types.
     811             :          */
     812    18615830 :         *done_with_block = true;
     813             : 
     814             :         /*
     815             :          * Compare this btree block's level to the height of the current
     816             :          * candidate root block.
     817             :          *
     818             :          * If the level matches the root we found previously, throw away both
     819             :          * blocks because there can't be two candidate roots.
     820             :          *
     821             :          * If level is lower in the tree than the root we found previously,
     822             :          * ignore this block.
     823             :          */
     824    18615830 :         block_level = xfs_btree_get_level(btblock);
     825    18615830 :         if (block_level + 1 == fab->height) {
     826      481410 :                 fab->root = NULLAGBLOCK;
     827      481410 :                 goto out;
     828    18134420 :         } else if (block_level < fab->height) {
     829    16651679 :                 goto out;
     830             :         }
     831             : 
     832             :         /*
     833             :          * This is the highest block in the tree that we've found so far.
     834             :          * Update the btree height to reflect what we've learned from this
     835             :          * block.
     836             :          */
     837     1482741 :         fab->height = block_level + 1;
     838             : 
     839             :         /*
     840             :          * If this block doesn't have sibling pointers, then it's the new root
     841             :          * block candidate.  Otherwise, the root will be found farther up the
     842             :          * tree.
     843             :          */
     844     1482741 :         if (btblock->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) &&
     845             :             btblock->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
     846     1204727 :                 fab->root = agbno;
     847             :         else
     848      278014 :                 fab->root = NULLAGBLOCK;
     849             : 
     850     1482741 :         trace_xrep_findroot_block(mp, ri->sc->sa.pag->pag_agno, agbno,
     851     1482741 :                         be32_to_cpu(btblock->bb_magic), fab->height - 1);
     852    51156519 : out:
     853    51156519 :         xfs_trans_brelse(ri->sc->tp, bp);
     854    51156519 :         return error;
     855             : }
     856             : 
     857             : /*
     858             :  * Do any of the blocks in this rmap record match one of the btrees we're
     859             :  * looking for?
                     :  *
                     :  * Callback for xfs_rmap_query_all(); @priv is the xrep_findroot.  Records
                     :  * whose owner fails XFS_RMAP_NON_INODE_OWNER() are ignored.  For each
                     :  * block in the record, every btree_info entry whose rmap_owner equals the
                     :  * record's owner is tried with xrep_findroot_block() until one of them
                     :  * reports that it is done with the block.  Returns 0, or the first error
                     :  * from xrep_findroot_block().
     860             :  */
     861             : STATIC int
     862  4838775843 : xrep_findroot_rmap(
     863             :         struct xfs_btree_cur            *cur,
     864             :         const struct xfs_rmap_irec      *rec,
     865             :         void                            *priv)
     866             : {
     867  4838775843 :         struct xrep_findroot            *ri = priv;
     868  4838775843 :         struct xrep_find_ag_btree       *fab;
     869  4838775843 :         xfs_agblock_t                   b;
     870  4838775843 :         bool                            done;
     871  4838775843 :         int                             error = 0;
     872             : 
     873             :         /* Ignore anything that isn't AG metadata. */
     874  4838775843 :         if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
     875             :                 return 0;
     876             : 
     877             :         /* Otherwise scan each block + btree type. */
     878  2303146838 :         for (b = 0; b < rec->rm_blockcount; b++) {
     879  2201071200 :                 done = false;
     880  8623366824 :                 for (fab = ri->btree_info; fab->buf_ops; fab++) {
     881  6440840758 :                         if (rec->rm_owner != fab->rmap_owner)
     882  6385601570 :                                 continue;
     883    55239188 :                         error = xrep_findroot_block(ri, fab,
     884    55239188 :                                         rec->rm_owner, rec->rm_startblock + b,
     885             :                                         &done);
     886    55310639 :                         if (error)
     887           0 :                                 return error;
     888    55310639 :                         if (done)
     889             :                                 break;
     890             :                 }
     891             :         }
     892             : 
     893             :         return 0;
     894             : }
     895             : 
     896             : /*
                     :  * Find the roots of the per-AG btrees described in btree_info.
                     :  *
                     :  * Resets root to NULLAGBLOCK and height to 0 in every @btree_info entry
                     :  * (the array ends at the first entry with NULL buf_ops), then runs
                     :  * xrep_findroot_rmap() over every record in the rmapbt.  Asserts that
                     :  * @agf_bp is locked, that @agfl_bp is locked if it is non-NULL, and that
                     :  * @agfl_bp is provided whenever an entry uses XFS_RMAP_OWN_AG.  Returns
                     :  * the result of xfs_rmap_query_all().
                     :  */
     897             : int
     898      406003 : xrep_find_ag_btree_roots(
     899             :         struct xfs_scrub                *sc,
     900             :         struct xfs_buf                  *agf_bp,
     901             :         struct xrep_find_ag_btree       *btree_info,
     902             :         struct xfs_buf                  *agfl_bp)
     903             : {
     904      406003 :         struct xfs_mount                *mp = sc->mp;
     905      406003 :         struct xrep_findroot            ri;
     906      406003 :         struct xrep_find_ag_btree       *fab;
     907      406003 :         struct xfs_btree_cur            *cur;
     908      406003 :         int                             error;
     909             : 
     910      406003 :         ASSERT(xfs_buf_islocked(agf_bp));
     911      406003 :         ASSERT(agfl_bp == NULL || xfs_buf_islocked(agfl_bp));
     912             : 
     913      406003 :         ri.sc = sc;
     914      406003 :         ri.btree_info = btree_info;
     915      406003 :         ri.agf = agf_bp->b_addr;
     916      406003 :         ri.agfl_bp = agfl_bp;
     917     1609016 :         for (fab = btree_info; fab->buf_ops; fab++) {
     918     1203032 :                 ASSERT(agfl_bp || fab->rmap_owner != XFS_RMAP_OWN_AG);
     919     1203032 :                 ASSERT(XFS_RMAP_NON_INODE_OWNER(fab->rmap_owner));
     920     1203013 :                 fab->root = NULLAGBLOCK;
     921     1203013 :                 fab->height = 0;
     922             :         }
     923             : 
     924      405984 :         cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
     925      406381 :         error = xfs_rmap_query_all(cur, xrep_findroot_rmap, &ri);
     926      406437 :         xfs_btree_del_cursor(cur, error);
     927             : 
     928      406499 :         return error;
     929             : }
     930             : 
     931             : /*
                     :  * Force a quotacheck the next time we mount.
                     :  *
                     :  * Looks up the flag for @type with xfs_quota_chkd_flag() and, if it is
                     :  * currently set in m_qflags, clears it from both m_qflags and the incore
                     :  * superblock's sb_qflags and logs the superblock in sc->tp.  The whole
                     :  * update runs under qi_quotaofflock; sb_qflags is changed under
                     :  * m_sb_lock.  Does nothing if the flag is already clear in m_qflags.
                     :  */
     932             : void
     933           0 : xrep_force_quotacheck(
     934             :         struct xfs_scrub        *sc,
     935             :         xfs_dqtype_t            type)
     936             : {
     937           0 :         uint                    flag;
     938             : 
     939           0 :         flag = xfs_quota_chkd_flag(type);
     940           0 :         if (!(flag & sc->mp->m_qflags))
     941             :                 return;
     942             : 
     943           0 :         mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock);
     944           0 :         sc->mp->m_qflags &= ~flag;
     945           0 :         spin_lock(&sc->mp->m_sb_lock);
     946           0 :         sc->mp->m_sb.sb_qflags &= ~flag;
     947           0 :         spin_unlock(&sc->mp->m_sb_lock);
     948           0 :         xfs_log_sb(sc->tp);
     949           0 :         mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
     950             : }
     951             : 
     952             : /*
     953             :  * Attach dquots to this inode, or schedule quotacheck to fix them.
     954             :  *
     955             :  * This function ensures that the appropriate dquots are attached to an inode.
     956             :  * We cannot allow the dquot code to allocate an on-disk dquot block here
     957             :  * because we're already in transaction context with the inode locked.  The
     958             :  * on-disk dquot should already exist anyway.  If the quota code signals
     959             :  * corruption or missing quota information, schedule quotacheck, which will
     960             :  * repair corruptions in the quota metadata.
                     :  *
                     :  * Returns 0 if xfs_qm_dqattach_locked() succeeds or fails with -EFSBADCRC,
                     :  * -EFSCORRUPTED, -ENOENT or -ESRCH.  For the first three, an error is
                     :  * logged and quotacheck is forced for each quota type that is enabled but
                     :  * still has no dquot attached to the inode.  Any other error is returned
                     :  * unchanged.
     961             :  */
     962             : int
     963           0 : xrep_ino_dqattach(
     964             :         struct xfs_scrub        *sc)
     965             : {
     966           0 :         int                     error;
     967             : 
     968           0 :         error = xfs_qm_dqattach_locked(sc->ip, false);
     969           0 :         switch (error) {
     970           0 :         case -EFSBADCRC:
     971             :         case -EFSCORRUPTED:
     972             :         case -ENOENT:
     973           0 :                 xfs_err_ratelimited(sc->mp,
     974             : "inode %llu repair encountered quota error %d, quotacheck forced.",
     975             :                                 (unsigned long long)sc->ip->i_ino, error);
     976           0 :                 if (XFS_IS_UQUOTA_ON(sc->mp) && !sc->ip->i_udquot)
     977           0 :                         xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
     978           0 :                 if (XFS_IS_GQUOTA_ON(sc->mp) && !sc->ip->i_gdquot)
     979           0 :                         xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
     980           0 :                 if (XFS_IS_PQUOTA_ON(sc->mp) && !sc->ip->i_pdquot)
     981           0 :                         xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
     982             :                 fallthrough;
     983             :         case -ESRCH:
     984             :                 error = 0;
     985             :                 break;
     986             :         default:
     987             :                 break;
     988             :         }
     989             : 
     990           0 :         return error;
     991             : }

Generated by: LCOV version 1.14