LCOV - code coverage report
Current view: top level - fs/xfs/scrub - repair.c (source / functions) Hit Total Coverage
Test: fstests of 6.5.0-rc3-achx @ Mon Jul 31 20:08:12 PDT 2023 Lines: 372 473 78.6 %
Date: 2023-07-31 20:08:12 Functions: 27 30 90.0 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-or-later
       2             : /*
       3             :  * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
       4             :  * Author: Darrick J. Wong <djwong@kernel.org>
       5             :  */
       6             : #include "xfs.h"
       7             : #include "xfs_fs.h"
       8             : #include "xfs_shared.h"
       9             : #include "xfs_format.h"
      10             : #include "xfs_trans_resv.h"
      11             : #include "xfs_mount.h"
      12             : #include "xfs_btree.h"
      13             : #include "xfs_log_format.h"
      14             : #include "xfs_trans.h"
      15             : #include "xfs_sb.h"
      16             : #include "xfs_inode.h"
      17             : #include "xfs_alloc.h"
      18             : #include "xfs_alloc_btree.h"
      19             : #include "xfs_ialloc.h"
      20             : #include "xfs_ialloc_btree.h"
      21             : #include "xfs_rmap.h"
      22             : #include "xfs_rmap_btree.h"
      23             : #include "xfs_refcount_btree.h"
      24             : #include "xfs_extent_busy.h"
      25             : #include "xfs_ag.h"
      26             : #include "xfs_ag_resv.h"
      27             : #include "xfs_quota.h"
      28             : #include "xfs_qm.h"
      29             : #include "xfs_defer.h"
      30             : #include "xfs_errortag.h"
      31             : #include "xfs_error.h"
      32             : #include "xfs_reflink.h"
      33             : #include "xfs_health.h"
      34             : #include "xfs_buf_xfile.h"
      35             : #include "xfs_da_format.h"
      36             : #include "xfs_da_btree.h"
      37             : #include "xfs_attr.h"
      38             : #include "xfs_dir2.h"
      39             : #include "scrub/scrub.h"
      40             : #include "scrub/common.h"
      41             : #include "scrub/trace.h"
      42             : #include "scrub/repair.h"
      43             : #include "scrub/bitmap.h"
      44             : #include "scrub/stats.h"
      45             : #include "scrub/xfile.h"
      46             : #include "scrub/attr_repair.h"
      47             : 
/*
 * Attempt to repair some metadata, if the metadata is corrupt and userspace
 * told us to fix it.  This function returns -EAGAIN to mean "re-run scrub",
 * and sets run->repair_succeeded if it thinks it repaired anything.
 */
int
xrep_attempt(
	struct xfs_scrub	*sc,
	struct xchk_stats_run	*run)
{
	u64			repair_start;
	int			error = 0;

	trace_xrep_attempt(XFS_I(file_inode(sc->file)), sc->sm, error);

	/* Release the AG btree cursors held from the scrub pass. */
	xchk_ag_btcur_free(&sc->sa);

	/* Repair whatever's broken. */
	ASSERT(sc->ops->repair);
	run->repair_attempted = true;
	repair_start = xchk_stats_now();
	error = sc->ops->repair(sc);
	trace_xrep_done(XFS_I(file_inode(sc->file)), sc->sm, error);
	run->repair_ns += xchk_stats_elapsed_ns(repair_start);
	switch (error) {
	case 0:
		/*
		 * Repair succeeded.  Commit the fixes and perform a second
		 * scrub so that we can tell userspace if we fixed the problem.
		 */
		sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
		sc->flags |= XREP_ALREADY_FIXED;
		run->repair_succeeded = true;
		return -EAGAIN;
	case -ECHRNG:
		/*
		 * Repair wants the scrub re-run with the intent drain engaged;
		 * flag that and retry.
		 */
		sc->flags |= XCHK_NEED_DRAIN;
		run->retries++;
		return -EAGAIN;
	case -EDEADLOCK:
		/* Tell the caller to try again having grabbed all the locks. */
		if (!(sc->flags & XCHK_TRY_HARDER)) {
			sc->flags |= XCHK_TRY_HARDER;
			run->retries++;
			return -EAGAIN;
		}
		/*
		 * We tried harder but still couldn't grab all the resources
		 * we needed to fix it.  The corruption has not been fixed,
		 * so exit to userspace with the scan's output flags unchanged.
		 */
		return 0;
	default:
		/*
		 * EAGAIN tells the caller to re-scrub, so we cannot return
		 * that here.
		 */
		ASSERT(error != -EAGAIN);
		return error;
	}
}
     108             : 
/*
 * Complain about unfixable problems in the filesystem.  We don't log
 * corruptions when IFLAG_REPAIR wasn't set on the assumption that the driver
 * program is xfs_scrub, which will call back with IFLAG_REPAIR set if the
 * administrator isn't running xfs_scrub in no-repairs mode.
 *
 * Use this helper function because _ratelimited silently declares a static
 * structure to track rate limiting information.
 */
void
xrep_failure(
	struct xfs_mount	*mp)
{
	/* Message rate limiting is keyed to this one call site. */
	xfs_alert_ratelimited(mp,
"Corruption not fixed during online repair.  Unmount and run xfs_repair.");
}
     125             : 
     126             : /*
     127             :  * Repair probe -- userspace uses this to probe if we're willing to repair a
     128             :  * given mountpoint.
     129             :  */
     130             : int
     131        4862 : xrep_probe(
     132             :         struct xfs_scrub        *sc)
     133             : {
     134        4862 :         int                     error = 0;
     135             : 
     136        4862 :         if (xchk_should_terminate(sc, &error))
     137           0 :                 return error;
     138             : 
     139             :         return 0;
     140             : }
     141             : 
/*
 * Roll a transaction while keeping the AG header buffers locked, then rejoin
 * those buffers to the new transaction.
 */
int
xrep_roll_ag_trans(
	struct xfs_scrub	*sc)
{
	int			error;

	/*
	 * Keep the AG header buffers locked while we roll the transaction.
	 * Ensure that both AG buffers are dirty and held when we roll the
	 * transaction so that they move forward in the log without losing the
	 * bli (and hence the bli type) when the transaction commits.
	 *
	 * Normal code would never hold clean buffers across a roll, but repair
	 * needs both buffers to maintain a total lock on the AG.
	 */
	if (sc->sa.agi_bp) {
		xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp, XFS_AGI_MAGICNUM);
		xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
	}

	if (sc->sa.agf_bp) {
		xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_MAGICNUM);
		xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
	}

	/*
	 * Roll the transaction.  We still hold the AG header buffers locked
	 * regardless of whether or not that succeeds.  On failure, the buffers
	 * will be released during teardown on our way out of the kernel.  If
	 * successful, join the buffers to the new transaction and move on.
	 */
	error = xfs_trans_roll(&sc->tp);
	if (error)
		return error;

	/* Join the AG headers to the new transaction. */
	if (sc->sa.agi_bp)
		xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
	if (sc->sa.agf_bp)
		xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);

	return 0;
}
     189             : 
     190             : /* Roll the scrub transaction, holding the primary metadata locked. */
     191             : int
     192     7151044 : xrep_roll_trans(
     193             :         struct xfs_scrub        *sc)
     194             : {
     195     7151044 :         if (!sc->ip)
     196        3101 :                 return xrep_roll_ag_trans(sc);
     197     7147943 :         return xfs_trans_roll_inode(&sc->tp, sc->ip);
     198             : }
     199             : 
/* Finish all deferred work attached to the repair transaction. */
int
xrep_defer_finish(
	struct xfs_scrub	*sc)
{
	int			error;

	/*
	 * Keep the AG header buffers locked while we complete deferred work
	 * items.  Ensure that both AG buffers are dirty and held when we roll
	 * the transaction so that they move forward in the log without losing
	 * the bli (and hence the bli type) when the transaction commits.
	 *
	 * Normal code would never hold clean buffers across a roll, but repair
	 * needs both buffers to maintain a total lock on the AG.
	 */
	if (sc->sa.agi_bp) {
		xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp, XFS_AGI_MAGICNUM);
		xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
	}

	if (sc->sa.agf_bp) {
		xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_MAGICNUM);
		xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
	}

	/*
	 * Finish all deferred work items.  We still hold the AG header buffers
	 * locked regardless of whether or not that succeeds.  On failure, the
	 * buffers will be released during teardown on our way out of the
	 * kernel.  If successful, join the buffers to the new transaction
	 * and move on.
	 */
	error = xfs_defer_finish(&sc->tp);
	if (error)
		return error;

	/*
	 * Release the hold that we set above because defer_finish won't do
	 * that for us.  The defer roll code redirties held buffers after each
	 * roll, so the AG header buffers should be ready for logging.
	 * (Contrast with xrep_roll_ag_trans, which rejoins the buffers.)
	 */
	if (sc->sa.agi_bp)
		xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
	if (sc->sa.agf_bp)
		xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);

	return 0;
}
     249             : 
     250             : /*
     251             :  * Does the given AG have enough space to rebuild a btree?  Neither AG
     252             :  * reservation can be critical, and we must have enough space (factoring
     253             :  * in AG reservations) to construct a whole btree.
     254             :  */
     255             : bool
     256           0 : xrep_ag_has_space(
     257             :         struct xfs_perag        *pag,
     258             :         xfs_extlen_t            nr_blocks,
     259             :         enum xfs_ag_resv_type   type)
     260             : {
     261           0 :         return  !xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) &&
     262           0 :                 !xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA) &&
     263           0 :                 pag->pagf_freeblks > xfs_ag_resv_needed(pag, type) + nr_blocks;
     264             : }
     265             : 
/*
 * Figure out how many blocks to reserve for an AG repair.  We calculate the
 * worst case estimate for the number of blocks we'd need to rebuild one of
 * any type of per-AG btree.  Returns 0 if the caller isn't asking for repair.
 */
xfs_extlen_t
xrep_calc_ag_resblks(
	struct xfs_scrub		*sc)
{
	struct xfs_mount		*mp = sc->mp;
	struct xfs_scrub_metadata	*sm = sc->sm;
	struct xfs_perag		*pag;
	struct xfs_buf			*bp;
	xfs_agino_t			icount = NULLAGINO;
	xfs_extlen_t			aglen = NULLAGBLOCK;
	xfs_extlen_t			usedlen;
	xfs_extlen_t			freelen;
	xfs_extlen_t			bnobt_sz;
	xfs_extlen_t			inobt_sz;
	xfs_extlen_t			rmapbt_sz;
	xfs_extlen_t			refcbt_sz;
	int				error;

	/* No reservation needed for a dry-run scrub. */
	if (!(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
		return 0;

	pag = xfs_perag_get(mp, sm->sm_agno);
	if (xfs_perag_initialised_agi(pag)) {
		/* Use in-core icount if possible. */
		icount = pag->pagi_count;
	} else {
		/* Try to get the actual counters from disk. */
		error = xfs_ialloc_read_agi(pag, NULL, &bp);
		if (!error) {
			icount = pag->pagi_count;
			xfs_buf_relse(bp);
		}
		/* On error, icount stays NULLAGINO; worst case applies below. */
	}

	/* Now grab the block counters from the AGF. */
	error = xfs_alloc_read_agf(pag, NULL, 0, &bp);
	if (error) {
		/* Fall back to the in-core AG size; assume it's all free. */
		aglen = pag->block_count;
		freelen = aglen;
		usedlen = aglen;
	} else {
		struct xfs_agf	*agf = bp->b_addr;

		aglen = be32_to_cpu(agf->agf_length);
		freelen = be32_to_cpu(agf->agf_freeblks);
		usedlen = aglen - freelen;
		xfs_buf_relse(bp);
	}

	/* If the icount is impossible, make some worst-case assumptions. */
	if (icount == NULLAGINO ||
	    !xfs_verify_agino(pag, icount)) {
		icount = pag->agino_max - pag->agino_min + 1;
	}

	/* If the block counts are impossible, make worst-case assumptions. */
	if (aglen == NULLAGBLOCK ||
	    aglen != pag->block_count ||
	    freelen >= aglen) {
		aglen = pag->block_count;
		freelen = aglen;
		usedlen = aglen;
	}
	xfs_perag_put(pag);

	trace_xrep_calc_ag_resblks(mp, sm->sm_agno, icount, aglen,
			freelen, usedlen);

	/*
	 * Figure out how many blocks we'd need worst case to rebuild
	 * each type of btree.  Note that we can only rebuild the
	 * bnobt/cntbt or inobt/finobt as pairs.
	 */
	bnobt_sz = 2 * xfs_allocbt_calc_size(mp, freelen);
	if (xfs_has_sparseinodes(mp))
		inobt_sz = xfs_iallocbt_calc_size(mp, icount /
				XFS_INODES_PER_HOLEMASK_BIT);
	else
		inobt_sz = xfs_iallocbt_calc_size(mp, icount /
				XFS_INODES_PER_CHUNK);
	if (xfs_has_finobt(mp))
		inobt_sz *= 2;
	if (xfs_has_reflink(mp))
		refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen);
	else
		refcbt_sz = 0;
	if (xfs_has_rmapbt(mp)) {
		/*
		 * Guess how many blocks we need to rebuild the rmapbt.
		 * For non-reflink filesystems we can't have more records than
		 * used blocks.  However, with reflink it's possible to have
		 * more than one rmap record per AG block.  We don't know how
		 * many rmaps there could be in the AG, so we start off with
		 * what we hope is an generous over-estimation.
		 */
		if (xfs_has_reflink(mp))
			rmapbt_sz = xfs_rmapbt_calc_size(mp,
					(unsigned long long)aglen * 2);
		else
			rmapbt_sz = xfs_rmapbt_calc_size(mp, usedlen);
	} else {
		rmapbt_sz = 0;
	}

	trace_xrep_calc_ag_resblks_btsize(mp, sm->sm_agno, bnobt_sz,
			inobt_sz, rmapbt_sz, refcbt_sz);

	/* Reserve enough for the largest of the four rebuild candidates. */
	return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz));
}
     380             : 
     381             : /*
     382             :  * Reconstructing per-AG Btrees
     383             :  *
     384             :  * When a space btree is corrupt, we don't bother trying to fix it.  Instead,
     385             :  * we scan secondary space metadata to derive the records that should be in
     386             :  * the damaged btree, initialize a fresh btree root, and insert the records.
     387             :  * Note that for rebuilding the rmapbt we scan all the primary data to
     388             :  * generate the new records.
     389             :  *
     390             :  * However, that leaves the matter of removing all the metadata describing the
     391             :  * old broken structure.  For primary metadata we use the rmap data to collect
     392             :  * every extent with a matching rmap owner (bitmap); we then iterate all other
     393             :  * metadata structures with the same rmap owner to collect the extents that
     394             :  * cannot be removed (sublist).  We then subtract sublist from bitmap to
     395             :  * derive the blocks that were used by the old btree.  These blocks can be
     396             :  * reaped.
     397             :  *
     398             :  * For rmapbt reconstructions we must use different tactics for extent
     399             :  * collection.  First we iterate all primary metadata (this excludes the old
     400             :  * rmapbt, obviously) to generate new rmap records.  The gaps in the rmap
     401             :  * records are collected as bitmap.  The bnobt records are collected as
     402             :  * sublist.  As with the other btrees we subtract sublist from bitmap, and the
     403             :  * result (since the rmapbt lives in the free space) are the blocks from the
     404             :  * old rmapbt.
     405             :  */
     406             : 
     407             : /* Ensure the freelist is the correct size. */
     408             : int
     409       18863 : xrep_fix_freelist(
     410             :         struct xfs_scrub        *sc,
     411             :         int                     alloc_flags)
     412             : {
     413       18863 :         struct xfs_alloc_arg    args = {0};
     414             : 
     415       18863 :         args.mp = sc->mp;
     416       18863 :         args.tp = sc->tp;
     417       18863 :         args.agno = sc->sa.pag->pag_agno;
     418       18863 :         args.alignment = 1;
     419       18863 :         args.pag = sc->sa.pag;
     420             : 
     421       18863 :         return xfs_alloc_fix_freelist(&args, alloc_flags);
     422             : }
     423             : 
     424             : /*
     425             :  * Finding per-AG Btree Roots for AGF/AGI Reconstruction
     426             :  *
     427             :  * If the AGF or AGI become slightly corrupted, it may be necessary to rebuild
     428             :  * the AG headers by using the rmap data to rummage through the AG looking for
     429             :  * btree roots.  This is not guaranteed to work if the AG is heavily damaged
     430             :  * or the rmap data are corrupt.
     431             :  *
     432             :  * Callers of xrep_find_ag_btree_roots must lock the AGF and AGFL
     433             :  * buffers if the AGF is being rebuilt; or the AGF and AGI buffers if the
     434             :  * AGI is being rebuilt.  It must maintain these locks until it's safe for
     435             :  * other threads to change the btrees' shapes.  The caller provides
     436             :  * information about the btrees to look for by passing in an array of
     437             :  * xrep_find_ag_btree with the (rmap owner, buf_ops, magic) fields set.
     438             :  * The (root, height) fields will be set on return if anything is found.  The
     439             :  * last element of the array should have a NULL buf_ops to mark the end of the
     440             :  * array.
     441             :  *
     442             :  * For every rmapbt record matching any of the rmap owners in btree_info,
     443             :  * read each block referenced by the rmap record.  If the block is a btree
     444             :  * block from this filesystem matching any of the magic numbers and has a
     445             :  * level higher than what we've already seen, remember the block and the
     446             :  * height of the tree required to have such a block.  When the call completes,
     447             :  * we return the highest block we've found for each btree description; those
     448             :  * should be the roots.
     449             :  */
     450             : 
/* State for one AG btree root scan; see the block comment above. */
struct xrep_findroot {
	struct xfs_scrub		*sc;
	/* Locked AGFL buffer; used to reject stale free-list blocks. */
	struct xfs_buf			*agfl_bp;
	struct xfs_agf			*agf;
	/* Caller-supplied descriptions of the btrees being sought. */
	struct xrep_find_ag_btree	*btree_info;
};
     457             : 
     458             : /* See if our block is in the AGFL. */
     459             : STATIC int
     460   567292009 : xrep_findroot_agfl_walk(
     461             :         struct xfs_mount        *mp,
     462             :         xfs_agblock_t           bno,
     463             :         void                    *priv)
     464             : {
     465   567292009 :         xfs_agblock_t           *agbno = priv;
     466             : 
     467   567292009 :         return (*agbno == bno) ? -ECANCELED : 0;
     468             : }
     469             : 
     470             : /* Does this block match the btree information passed in? */
     471             : STATIC int
     472    59064814 : xrep_findroot_block(
     473             :         struct xrep_findroot            *ri,
     474             :         struct xrep_find_ag_btree       *fab,
     475             :         uint64_t                        owner,
     476             :         xfs_agblock_t                   agbno,
     477             :         bool                            *done_with_block)
     478             : {
     479    59064814 :         struct xfs_mount                *mp = ri->sc->mp;
     480    59064814 :         struct xfs_buf                  *bp;
     481    59064814 :         struct xfs_btree_block          *btblock;
     482    59064814 :         xfs_daddr_t                     daddr;
     483    59064814 :         int                             block_level;
     484    59064814 :         int                             error = 0;
     485             : 
     486    59064814 :         daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.pag->pag_agno, agbno);
     487             : 
     488             :         /*
     489             :          * Blocks in the AGFL have stale contents that might just happen to
     490             :          * have a matching magic and uuid.  We don't want to pull these blocks
     491             :          * in as part of a tree root, so we have to filter out the AGFL stuff
     492             :          * here.  If the AGFL looks insane we'll just refuse to repair.
     493             :          */
     494    59064814 :         if (owner == XFS_RMAP_OWN_AG) {
     495    58384500 :                 error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp,
     496             :                                 xrep_findroot_agfl_walk, &agbno);
     497    58384498 :                 if (error == -ECANCELED)
     498             :                         return 0;
     499    56157915 :                 if (error)
     500             :                         return error;
     501             :         }
     502             : 
     503             :         /*
     504             :          * Read the buffer into memory so that we can see if it's a match for
     505             :          * our btree type.  We have no clue if it is beforehand, and we want to
     506             :          * avoid xfs_trans_read_buf's behavior of dumping the DONE state (which
     507             :          * will cause needless disk reads in subsequent calls to this function)
     508             :          * and logging metadata verifier failures.
     509             :          *
     510             :          * Therefore, pass in NULL buffer ops.  If the buffer was already in
     511             :          * memory from some other caller it will already have b_ops assigned.
     512             :          * If it was in memory from a previous unsuccessful findroot_block
     513             :          * call, the buffer won't have b_ops but it should be clean and ready
     514             :          * for us to try to verify if the read call succeeds.  The same applies
     515             :          * if the buffer wasn't in memory at all.
     516             :          *
     517             :          * Note: If we never match a btree type with this buffer, it will be
     518             :          * left in memory with NULL b_ops.  This shouldn't be a problem unless
     519             :          * the buffer gets written.
     520             :          */
     521    56838229 :         error = xfs_trans_read_buf(mp, ri->sc->tp, mp->m_ddev_targp, daddr,
     522             :                         mp->m_bsize, 0, &bp, NULL);
     523    56838252 :         if (error)
     524             :                 return error;
     525             : 
     526             :         /* Ensure the block magic matches the btree type we're looking for. */
     527    56838252 :         btblock = XFS_BUF_TO_BLOCK(bp);
     528    56838252 :         ASSERT(fab->buf_ops->magic[1] != 0);
     529    56838252 :         if (btblock->bb_magic != fab->buf_ops->magic[1])
     530    36520731 :                 goto out;
     531             : 
     532             :         /*
     533             :          * If the buffer already has ops applied and they're not the ones for
     534             :          * this btree type, we know this block doesn't match the btree and we
     535             :          * can bail out.
     536             :          *
     537             :          * If the buffer ops match ours, someone else has already validated
     538             :          * the block for us, so we can move on to checking if this is a root
     539             :          * block candidate.
     540             :          *
     541             :          * If the buffer does not have ops, nobody has successfully validated
     542             :          * the contents and the buffer cannot be dirty.  If the magic, uuid,
     543             :          * and structure match this btree type then we'll move on to checking
     544             :          * if it's a root block candidate.  If there is no match, bail out.
     545             :          */
     546    20317521 :         if (bp->b_ops) {
     547    20317521 :                 if (bp->b_ops != fab->buf_ops)
     548           0 :                         goto out;
     549             :         } else {
     550           0 :                 ASSERT(!xfs_trans_buf_is_dirty(bp));
     551           0 :                 if (!uuid_equal(&btblock->bb_u.s.bb_uuid,
     552           0 :                                 &mp->m_sb.sb_meta_uuid))
     553           0 :                         goto out;
     554             :                 /*
     555             :                  * Read verifiers can reference b_ops, so we set the pointer
     556             :                  * here.  If the verifier fails we'll reset the buffer state
     557             :                  * to what it was before we touched the buffer.
     558             :                  */
     559           0 :                 bp->b_ops = fab->buf_ops;
     560           0 :                 fab->buf_ops->verify_read(bp);
     561           0 :                 if (bp->b_error) {
     562           0 :                         bp->b_ops = NULL;
     563           0 :                         bp->b_error = 0;
     564           0 :                         goto out;
     565             :                 }
     566             : 
     567             :                 /*
     568             :                  * Some read verifiers will (re)set b_ops, so we must be
     569             :                  * careful not to change b_ops after running the verifier.
     570             :                  */
     571             :         }
     572             : 
     573             :         /*
     574             :          * This block passes the magic/uuid and verifier tests for this btree
     575             :          * type.  We don't need the caller to try the other tree types.
     576             :          */
     577    20317521 :         *done_with_block = true;
     578             : 
     579             :         /*
     580             :          * Compare this btree block's level to the height of the current
     581             :          * candidate root block.
     582             :          *
     583             :          * If the level matches the root we found previously, throw away both
     584             :          * blocks because there can't be two candidate roots.
     585             :          *
     586             :          * If level is lower in the tree than the root we found previously,
     587             :          * ignore this block.
     588             :          */
     589    20317521 :         block_level = xfs_btree_get_level(btblock);
     590    20317521 :         if (block_level + 1 == fab->height) {
     591      470141 :                 fab->root = NULLAGBLOCK;
     592      470141 :                 goto out;
     593    19847380 :         } else if (block_level < fab->height) {
     594    19018006 :                 goto out;
     595             :         }
     596             : 
     597             :         /*
     598             :          * This is the highest block in the tree that we've found so far.
     599             :          * Update the btree height to reflect what we've learned from this
     600             :          * block.
     601             :          */
     602      829374 :         fab->height = block_level + 1;
     603             : 
     604             :         /*
     605             :          * If this block doesn't have sibling pointers, then it's the new root
     606             :          * block candidate.  Otherwise, the root will be found farther up the
     607             :          * tree.
     608             :          */
     609      829374 :         if (btblock->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) &&
     610             :             btblock->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
     611      545241 :                 fab->root = agbno;
     612             :         else
     613      284133 :                 fab->root = NULLAGBLOCK;
     614             : 
     615      829374 :         trace_xrep_findroot_block(mp, ri->sc->sa.pag->pag_agno, agbno,
     616      829374 :                         be32_to_cpu(btblock->bb_magic), fab->height - 1);
     617    56838252 : out:
     618    56838252 :         xfs_trans_brelse(ri->sc->tp, bp);
     619    56838252 :         return error;
     620             : }
     621             : 
     622             : /*
     623             :  * Do any of the blocks in this rmap record match one of the btrees we're
     624             :  * looking for?
     625             :  */
     626             : STATIC int
     627  5353377347 : xrep_findroot_rmap(
     628             :         struct xfs_btree_cur            *cur,
     629             :         const struct xfs_rmap_irec      *rec,
     630             :         void                            *priv)
     631             : {
     632  5353377347 :         struct xrep_findroot            *ri = priv;
     633  5353377347 :         struct xrep_find_ag_btree       *fab;
     634  5353377347 :         xfs_agblock_t                   b;
     635  5353377347 :         bool                            done;
     636  5353377347 :         int                             error = 0;
     637             : 
     638             :         /* Ignore anything that isn't AG metadata. */
     639  5353377347 :         if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
     640             :                 return 0;
     641             : 
     642             :         /* Otherwise scan each block + btree type. */
     643  1496387359 :         for (b = 0; b < rec->rm_blockcount; b++) {
     644  1387580913 :                 done = false;
     645  5411885507 :                 for (fab = ri->btree_info; fab->buf_ops; fab++) {
     646  4044617117 :                         if (rec->rm_owner != fab->rmap_owner)
     647  3985557294 :                                 continue;
     648    59059823 :                         error = xrep_findroot_block(ri, fab,
     649    59059823 :                                         rec->rm_owner, rec->rm_startblock + b,
     650             :                                         &done);
     651    59064818 :                         if (error)
     652           0 :                                 return error;
     653    59064818 :                         if (done)
     654             :                                 break;
     655             :                 }
     656             :         }
     657             : 
     658             :         return 0;
     659             : }
     660             : 
     661             : /* Find the roots of the per-AG btrees described in btree_info. */
     662             : int
     663      184425 : xrep_find_ag_btree_roots(
     664             :         struct xfs_scrub                *sc,
     665             :         struct xfs_buf                  *agf_bp,
     666             :         struct xrep_find_ag_btree       *btree_info,
     667             :         struct xfs_buf                  *agfl_bp)
     668             : {
     669      184425 :         struct xfs_mount                *mp = sc->mp;
     670      184425 :         struct xrep_findroot            ri;
     671      184425 :         struct xrep_find_ag_btree       *fab;
     672      184425 :         struct xfs_btree_cur            *cur;
     673      184425 :         int                             error;
     674             : 
     675      184425 :         ASSERT(xfs_buf_islocked(agf_bp));
     676      184425 :         ASSERT(agfl_bp == NULL || xfs_buf_islocked(agfl_bp));
     677             : 
     678      184425 :         ri.sc = sc;
     679      184425 :         ri.btree_info = btree_info;
     680      184425 :         ri.agf = agf_bp->b_addr;
     681      184425 :         ri.agfl_bp = agfl_bp;
     682      729652 :         for (fab = btree_info; fab->buf_ops; fab++) {
     683      545230 :                 ASSERT(agfl_bp || fab->rmap_owner != XFS_RMAP_OWN_AG);
     684      545230 :                 ASSERT(XFS_RMAP_NON_INODE_OWNER(fab->rmap_owner));
     685      545227 :                 fab->root = NULLAGBLOCK;
     686      545227 :                 fab->height = 0;
     687             :         }
     688             : 
     689      184422 :         cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
     690      184428 :         error = xfs_rmap_query_all(cur, xrep_findroot_rmap, &ri);
     691      184425 :         xfs_btree_del_cursor(cur, error);
     692             : 
     693      184427 :         return error;
     694             : }
     695             : 
     696             : #ifdef CONFIG_XFS_QUOTA
/*
 * Update some quota flags in the superblock.
 *
 * Clear @clear_flags and set @set_flags in both the incore mount quota
 * flags and the superblock quota flags, then flush the superblock to the
 * ondisk buffer via the scrub transaction.  Holds qi_quotaofflock across
 * the whole update.
 */
void
xrep_update_qflags(
	struct xfs_scrub	*sc,
	unsigned int		clear_flags,
	unsigned int		set_flags)
{
	struct xfs_mount	*mp = sc->mp;
	struct xfs_buf		*bp;

	mutex_lock(&mp->m_quotainfo->qi_quotaofflock);
	/* Nothing to do if the flags already have the desired values. */
	if ((mp->m_qflags & clear_flags) == 0 &&
	    (mp->m_qflags & set_flags) == set_flags)
		goto no_update;

	/* Update the incore mount flags... */
	mp->m_qflags &= ~clear_flags;
	mp->m_qflags |= set_flags;

	/* ...and the incore superblock copy, under m_sb_lock. */
	spin_lock(&mp->m_sb_lock);
	mp->m_sb.sb_qflags &= ~clear_flags;
	mp->m_sb.sb_qflags |= set_flags;
	spin_unlock(&mp->m_sb_lock);

	/*
	 * Update the quota flags in the ondisk superblock without touching
	 * the summary counters.  We have not quiesced inode chunk allocation,
	 * so we cannot coordinate with updates to the icount and ifree percpu
	 * counters.
	 */
	bp = xfs_trans_getsb(sc->tp);
	xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
	xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF);
	xfs_trans_log_buf(sc->tp, bp, 0, sizeof(struct xfs_dsb) - 1);

no_update:
	mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
}
     734             : 
     735             : /* Force a quotacheck the next time we mount. */
     736             : void
     737           0 : xrep_force_quotacheck(
     738             :         struct xfs_scrub        *sc,
     739             :         xfs_dqtype_t            type)
     740             : {
     741           0 :         uint                    flag;
     742             : 
     743           0 :         flag = xfs_quota_chkd_flag(type);
     744           0 :         if (!(flag & sc->mp->m_qflags))
     745             :                 return;
     746             : 
     747           0 :         xrep_update_qflags(sc, flag, 0);
     748             : }
     749             : 
     750             : /*
     751             :  * Attach dquots to this inode, or schedule quotacheck to fix them.
     752             :  *
     753             :  * This function ensures that the appropriate dquots are attached to an inode.
     754             :  * We cannot allow the dquot code to allocate an on-disk dquot block here
     755             :  * because we're already in transaction context.  The on-disk dquot should
     756             :  * already exist anyway.  If the quota code signals corruption or missing quota
     757             :  * information, schedule quotacheck, which will repair corruptions in the quota
     758             :  * metadata.
     759             :  */
     760             : int
     761    48112188 : xrep_ino_dqattach(
     762             :         struct xfs_scrub        *sc)
     763             : {
     764    48112188 :         int                     error;
     765             : 
     766    48112188 :         ASSERT(sc->tp != NULL);
     767    48112188 :         ASSERT(sc->ip != NULL);
     768             : 
     769    48112188 :         error = xfs_qm_dqattach(sc->ip);
     770    48098760 :         switch (error) {
     771           0 :         case -EFSBADCRC:
     772             :         case -EFSCORRUPTED:
     773             :         case -ENOENT:
     774           0 :                 xfs_err_ratelimited(sc->mp,
     775             : "inode %llu repair encountered quota error %d, quotacheck forced.",
     776             :                                 (unsigned long long)sc->ip->i_ino, error);
     777           0 :                 if (XFS_IS_UQUOTA_ON(sc->mp) && !sc->ip->i_udquot)
     778           0 :                         xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
     779           0 :                 if (XFS_IS_GQUOTA_ON(sc->mp) && !sc->ip->i_gdquot)
     780           0 :                         xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
     781           0 :                 if (XFS_IS_PQUOTA_ON(sc->mp) && !sc->ip->i_pdquot)
     782           0 :                         xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
     783             :                 fallthrough;
     784             :         case -ESRCH:
     785             :                 error = 0;
     786             :                 break;
     787             :         default:
     788             :                 break;
     789             :         }
     790             : 
     791    48098760 :         return error;
     792             : }
     793             : #endif /* CONFIG_XFS_QUOTA */
     794             : 
     795             : /*
     796             :  * Ensure that the inode being repaired is ready to handle a certain number of
     797             :  * extents, or return EFSCORRUPTED.  Caller must hold the ILOCK of the inode
     798             :  * being repaired and have joined it to the scrub transaction.
     799             :  */
     800             : int
     801     2568665 : xrep_ino_ensure_extent_count(
     802             :         struct xfs_scrub        *sc,
     803             :         int                     whichfork,
     804             :         xfs_extnum_t            nextents)
     805             : {
     806     2568665 :         xfs_extnum_t            max_extents;
     807     2568665 :         bool                    large_extcount;
     808             : 
     809     2568665 :         large_extcount = xfs_inode_has_large_extent_counts(sc->ip);
     810     2568665 :         max_extents = xfs_iext_max_nextents(large_extcount, whichfork);
     811     2568665 :         if (nextents <= max_extents)
     812             :                 return 0;
     813           0 :         if (large_extcount)
     814             :                 return -EFSCORRUPTED;
     815           0 :         if (!xfs_has_large_extent_counts(sc->mp))
     816             :                 return -EFSCORRUPTED;
     817             : 
     818           0 :         max_extents = xfs_iext_max_nextents(true, whichfork);
     819           0 :         if (nextents > max_extents)
     820             :                 return -EFSCORRUPTED;
     821             : 
     822           0 :         sc->ip->i_diflags2 |= XFS_DIFLAG2_NREXT64;
     823           0 :         xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
     824           0 :         return 0;
     825             : }
     826             : 
/*
 * Initialize all the btree cursors for an AG repair.
 *
 * A cursor is created for every per-AG btree except the one being scrubbed
 * (identified by sc->sm->sm_type), so repair code can cross-reference
 * against the other btrees.  Cursors for optional features (finobt, rmapbt,
 * refcountbt) are only set up when the feature is enabled.
 */
void
xrep_ag_btcur_init(
	struct xfs_scrub	*sc,
	struct xchk_ag		*sa)
{
	struct xfs_mount	*mp = sc->mp;

	/* Set up a bnobt cursor for cross-referencing. */
	if (sc->sm->sm_type != XFS_SCRUB_TYPE_BNOBT &&
	    sc->sm->sm_type != XFS_SCRUB_TYPE_CNTBT) {
		sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
				sc->sa.pag, XFS_BTNUM_BNO);
		sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
				sc->sa.pag, XFS_BTNUM_CNT);
	}

	/* Set up an inobt cursor for cross-referencing. */
	if (sc->sm->sm_type != XFS_SCRUB_TYPE_INOBT &&
	    sc->sm->sm_type != XFS_SCRUB_TYPE_FINOBT) {
		sa->ino_cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp,
				sa->agi_bp, XFS_BTNUM_INO);
		if (xfs_has_finobt(mp))
			sa->fino_cur = xfs_inobt_init_cursor(sc->sa.pag,
					sc->tp, sa->agi_bp, XFS_BTNUM_FINO);
	}

	/* Set up a rmapbt cursor for cross-referencing. */
	if (sc->sm->sm_type != XFS_SCRUB_TYPE_RMAPBT &&
	    xfs_has_rmapbt(mp))
		sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp,
				sc->sa.pag);

	/* Set up a refcountbt cursor for cross-referencing. */
	if (sc->sm->sm_type != XFS_SCRUB_TYPE_REFCNTBT &&
	    xfs_has_reflink(mp))
		sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
				sa->agf_bp, sc->sa.pag);
}
     866             : 
     867             : /*
     868             :  * Reinitialize the in-core AG state after a repair by rereading the AGF
     869             :  * buffer.  We had better get the same AGF buffer as the one that's attached
     870             :  * to the scrub context.
     871             :  */
     872             : int
     873       85912 : xrep_reinit_pagf(
     874             :         struct xfs_scrub        *sc)
     875             : {
     876       85912 :         struct xfs_perag        *pag = sc->sa.pag;
     877       85912 :         struct xfs_buf          *bp;
     878       85912 :         int                     error;
     879             : 
     880       85912 :         ASSERT(pag);
     881      171824 :         ASSERT(xfs_perag_initialised_agf(pag));
     882             : 
     883       85912 :         clear_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
     884       85978 :         error = xfs_alloc_read_agf(pag, sc->tp, 0, &bp);
     885       85983 :         if (error)
     886             :                 return error;
     887             : 
     888       85983 :         if (bp != sc->sa.agf_bp) {
     889           0 :                 ASSERT(bp == sc->sa.agf_bp);
     890           0 :                 return -EFSCORRUPTED;
     891             :         }
     892             : 
     893             :         return 0;
     894             : }
     895             : 
     896             : /*
     897             :  * Reinitialize the in-core AG state after a repair by rereading the AGI
     898             :  * buffer.  We had better get the same AGI buffer as the one that's attached
     899             :  * to the scrub context.
     900             :  */
     901             : int
     902       73710 : xrep_reinit_pagi(
     903             :         struct xfs_scrub        *sc)
     904             : {
     905       73710 :         struct xfs_perag        *pag = sc->sa.pag;
     906       73710 :         struct xfs_buf          *bp;
     907       73710 :         int                     error;
     908             : 
     909       73710 :         ASSERT(pag);
     910      147420 :         ASSERT(xfs_perag_initialised_agi(pag));
     911             : 
     912       73710 :         clear_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
     913       73729 :         error = xfs_ialloc_read_agi(pag, sc->tp, &bp);
     914       73689 :         if (error)
     915             :                 return error;
     916             : 
     917       73689 :         if (bp != sc->sa.agi_bp) {
     918           0 :                 ASSERT(bp == sc->sa.agi_bp);
     919           0 :                 return -EFSCORRUPTED;
     920             :         }
     921             : 
     922             :         return 0;
     923             : }
     924             : 
/*
 * Given an active reference to a perag structure, load AG headers and cursors.
 * This should only be called to scan an AG while repairing file-based metadata.
 *
 * On success, @sa is populated with the AGI and AGF buffers, a passive perag
 * reference, and the cross-reference btree cursors from xrep_ag_btcur_init.
 * Returns a negative errno if either AG header cannot be read.
 */
int
xrep_ag_init(
	struct xfs_scrub	*sc,
	struct xfs_perag	*pag,
	struct xchk_ag		*sa)
{
	int			error;

	/* Caller must not already have an AG attached to the scrub context. */
	ASSERT(!sa->pag);

	/*
	 * AGI is read before the AGF.  NOTE(review): presumably this matches
	 * the AG header lock ordering used elsewhere in XFS -- confirm before
	 * reordering.
	 */
	error = xfs_ialloc_read_agi(pag, sc->tp, &sa->agi_bp);
	if (error)
		return error;

	error = xfs_alloc_read_agf(pag, sc->tp, 0, &sa->agf_bp);
	if (error)
		return error;

	/* Grab our own passive reference from the caller's ref. */
	sa->pag = xfs_perag_hold(pag);
	xrep_ag_btcur_init(sc, sa);
	return 0;
}
     952             : 
     953             : /* Reinitialize the per-AG block reservation for the AG we just fixed. */
     954             : int
     955  1556329355 : xrep_reset_perag_resv(
     956             :         struct xfs_scrub        *sc)
     957             : {
     958  1556329355 :         int                     error;
     959             : 
     960  1556329355 :         if (!(sc->flags & XREP_RESET_PERAG_RESV))
     961             :                 return 0;
     962             : 
     963      118973 :         ASSERT(sc->sa.pag != NULL);
     964      118973 :         ASSERT(sc->ops->type == ST_PERAG);
     965      118973 :         ASSERT(sc->tp);
     966             : 
     967      118973 :         sc->flags &= ~XREP_RESET_PERAG_RESV;
     968      118973 :         error = xfs_ag_resv_free(sc->sa.pag);
     969      119359 :         if (error)
     970           0 :                 goto out;
     971      119359 :         error = xfs_ag_resv_init(sc->sa.pag, sc->tp);
     972      119137 :         if (error == -ENOSPC) {
     973           0 :                 xfs_err(sc->mp,
     974             : "Insufficient free space to reset per-AG reservation for AG %u after repair.",
     975             :                                 sc->sa.pag->pag_agno);
     976           0 :                 error = 0;
     977             :         }
     978             : 
     979      119137 : out:
     980             :         return error;
     981             : }
     982             : 
     983             : /* Decide if we are going to call the repair function for a scrub type. */
     984             : bool
     985    48937209 : xrep_will_attempt(
     986             :         struct xfs_scrub        *sc)
     987             : {
     988             :         /* Userspace asked us to rebuild the structure regardless. */
     989    48937209 :         if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD)
     990             :                 return true;
     991             : 
     992             :         /* Let debug users force us into the repair routines. */
     993        5745 :         if (XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR))
     994             :                 return true;
     995             : 
     996             :         /* Metadata is corrupt or failed cross-referencing. */
     997        5711 :         if (xchk_needs_repair(sc->sm))
     998         507 :                 return true;
     999             : 
    1000             :         return false;
    1001             : }
    1002             : 
/*
 * Try to fix some part of a metadata inode by calling another scrubber.
 *
 * Temporarily retargets sc->sm at @scrub_type, runs the matching check
 * function, repairs the metadata if the check flagged damage, then rechecks
 * the result.  The caller's sm_type/sm_flags are saved on entry and restored
 * on every exit path.  Returns 0 if the metadata checks clean (before or
 * after repair), or a negative errno.
 */
STATIC int
xrep_metadata_inode_subtype(
	struct xfs_scrub	*sc,
	unsigned int		scrub_type)
{
	__u32			smtype = sc->sm->sm_type;
	__u32			smflags = sc->sm->sm_flags;
	int			error;

	/*
	 * Let's see if the inode needs repair.  We're going to open-code calls
	 * to the scrub and repair functions so that we can hang on to the
	 * resources that we already acquired instead of using the standard
	 * setup/teardown routines.
	 */
	sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
	sc->sm->sm_type = scrub_type;

	/* Phase 1: check the metadata. */
	switch (scrub_type) {
	case XFS_SCRUB_TYPE_INODE:
		error = xchk_inode(sc);
		break;
	case XFS_SCRUB_TYPE_BMBTD:
		error = xchk_bmap_data(sc);
		break;
	case XFS_SCRUB_TYPE_BMBTA:
		error = xchk_bmap_attr(sc);
		break;
	default:
		/* Only the three subtypes above are supported. */
		ASSERT(0);
		error = -EFSCORRUPTED;
	}
	if (error)
		goto out;

	if (!xrep_will_attempt(sc))
		goto out;

	/*
	 * Repair some part of the inode.  This will potentially join the inode
	 * to the transaction.
	 */
	switch (scrub_type) {
	case XFS_SCRUB_TYPE_INODE:
		error = xrep_inode(sc);
		break;
	case XFS_SCRUB_TYPE_BMBTD:
		error = xrep_bmap(sc, XFS_DATA_FORK, false);
		break;
	case XFS_SCRUB_TYPE_BMBTA:
		error = xrep_bmap(sc, XFS_ATTR_FORK, false);
		break;
	}
	if (error)
		goto out;

	/*
	 * Finish all deferred intent items and then roll the transaction so
	 * that the inode will not be joined to the transaction when we exit
	 * the function.
	 */
	error = xfs_defer_finish(&sc->tp);
	if (error)
		goto out;
	error = xfs_trans_roll(&sc->tp);
	if (error)
		goto out;

	/*
	 * Clear the corruption flags and re-check the metadata that we just
	 * repaired.
	 */
	sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;

	/* Phase 3: recheck the repaired metadata. */
	switch (scrub_type) {
	case XFS_SCRUB_TYPE_INODE:
		error = xchk_inode(sc);
		break;
	case XFS_SCRUB_TYPE_BMBTD:
		error = xchk_bmap_data(sc);
		break;
	case XFS_SCRUB_TYPE_BMBTA:
		error = xchk_bmap_attr(sc);
		break;
	}
	if (error)
		goto out;

	/* If corruption persists, the repair has failed. */
	if (xchk_needs_repair(sc->sm)) {
		error = -EFSCORRUPTED;
		goto out;
	}
out:
	/* Restore the caller's scrub type and flags. */
	sc->sm->sm_type = smtype;
	sc->sm->sm_flags = smflags;
	return error;
}
    1102             : 
    1103             : /*
    1104             :  * Repair the ondisk forks of a metadata inode.  The caller must ensure that
    1105             :  * sc->ip points to the metadata inode and the ILOCK is held on that inode.
    1106             :  * The inode must not be joined to the transaction before the call, and will
    1107             :  * not be afterwards.
    1108             :  */
    1109             : int
    1110        7216 : xrep_metadata_inode_forks(
    1111             :         struct xfs_scrub        *sc)
    1112             : {
    1113        7216 :         bool                    dirty = false;
    1114        7216 :         int                     error;
    1115             : 
    1116             :         /* Repair the inode record and the data fork. */
    1117        7216 :         error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE);
    1118        7216 :         if (error)
    1119             :                 return error;
    1120             : 
    1121        7216 :         error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD);
    1122        7216 :         if (error)
    1123             :                 return error;
    1124             : 
    1125             :         /* Make sure the attr fork looks ok before we delete it. */
    1126        5454 :         error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTA);
    1127        5454 :         if (error)
    1128             :                 return error;
    1129             : 
    1130             :         /* Clear the reflink flag since metadata never shares. */
    1131           0 :         if (xfs_is_reflink_inode(sc->ip)) {
    1132           0 :                 dirty = true;
    1133           0 :                 xfs_trans_ijoin(sc->tp, sc->ip, 0);
    1134           0 :                 error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp);
    1135           0 :                 if (error)
    1136             :                         return error;
    1137             :         }
    1138             : 
    1139             :         /* Clear the attr forks since metadata shouldn't have that. */
    1140           0 :         if (xfs_inode_hasattr(sc->ip)) {
    1141           0 :                 if (!dirty) {
    1142           0 :                         dirty = true;
    1143           0 :                         xfs_trans_ijoin(sc->tp, sc->ip, 0);
    1144             :                 }
    1145           0 :                 error = xrep_xattr_reset_fork(sc);
    1146           0 :                 if (error)
    1147             :                         return error;
    1148             :         }
    1149             : 
    1150             :         /*
    1151             :          * If we modified the inode, roll the transaction but don't rejoin the
    1152             :          * inode to the new transaction because xrep_bmap_data can do that.
    1153             :          */
    1154           0 :         if (dirty) {
    1155           0 :                 error = xfs_trans_roll(&sc->tp);
    1156           0 :                 if (error)
    1157           0 :                         return error;
    1158             :                 dirty = false;
    1159             :         }
    1160             : 
    1161             :         return 0;
    1162             : }
    1163             : 
    1164             : /*
    1165             :  * Set a file's link count, being careful about integer overflows.  Returns
    1166             :  * true if we had to correct an integer overflow.
    1167             :  */
    1168             : bool
    1169      192947 : xrep_set_nlink(
    1170             :         struct xfs_inode        *ip,
    1171             :         uint64_t                nlink)
    1172             : {
    1173      192947 :         bool                    ret = false;
    1174             : 
    1175      192947 :         if (nlink > XFS_NLINK_PINNED) {
    1176             :                 /*
    1177             :                  * The observed link count will overflow the nlink field.
    1178             :                  *
    1179             :                  * The VFS won't let users create more hardlinks if the link
    1180             :                  * count is larger than XFS_MAXLINK, but it will let them
    1181             :                  * delete hardlinks.  XFS_MAXLINK is half of XFS_NLINK_PINNED,
    1182             :                  * which means that sysadmins could actually fix this situation
    1183             :                  * by deleting links and calling us again.
    1184             :                  *
    1185             :                  * Set the link count to the largest possible value that will
    1186             :                  * fit in the field.  This will buy us the most possible time
    1187             :                  * to avoid a UAF should the sysadmins start deleting links.
    1188             :                  * As long as the link count stays above MAXLINK the undercount
    1189             :                  * problem will not get worse.
    1190             :                  */
    1191           0 :                 BUILD_BUG_ON((uint64_t)XFS_MAXLINK >= XFS_NLINK_PINNED);
    1192             : 
    1193           0 :                 nlink = XFS_NLINK_PINNED;
    1194           0 :                 ret = true;
    1195             :         }
    1196             : 
    1197      192947 :         set_nlink(VFS_I(ip), nlink);
    1198             : 
    1199      192936 :         if (VFS_I(ip)->i_nlink == 0) {
    1200             :                 /* had better be on an unlinked list */
    1201           0 :                 ASSERT(xfs_inode_on_unlinked_list(ip));
    1202           0 :                 if (!xfs_inode_on_unlinked_list(ip))
    1203           0 :                         xfs_emerg(ip->i_mount, "IUNLINK ino 0x%llx nlink %u prevun 0x%x nextun 0x%x", ip->i_ino, VFS_I(ip)->i_nlink, ip->i_prev_unlinked, ip->i_next_unlinked);
    1204             :         } else {
    1205             :                 /* had better not be on an unlinked list */
    1206      192936 :                 ASSERT(!xfs_inode_on_unlinked_list(ip));
    1207      192936 :                 if (xfs_inode_on_unlinked_list(ip))
    1208           0 :                         xfs_emerg(ip->i_mount, "IUNLINK ino 0x%llx nlink %u prevun 0x%x nextun 0x%x", ip->i_ino, VFS_I(ip)->i_nlink, ip->i_prev_unlinked, ip->i_next_unlinked);
    1209             :         }
    1210             : 
    1211      192936 :         return ret;
    1212             : }
    1213             : 
    1214             : /*
    1215             :  * Set up an xfile and a buffer cache so that we can use the xfbtree.  Buffer
    1216             :  * target initialization registers a shrinker, so we cannot be in transaction
    1217             :  * context.  Park our resources in the scrub context and let the teardown
    1218             :  * function take care of them at the right time.
    1219             :  */
    1220             : int
    1221       47054 : xrep_setup_buftarg(
    1222             :         struct xfs_scrub        *sc,
    1223             :         const char              *descr)
    1224             : {
    1225       47054 :         ASSERT(sc->tp == NULL);
    1226             : 
    1227       47054 :         return xfile_alloc_buftarg(sc->mp, descr, &sc->xfile_buftarg);
    1228             : }
    1229             : 
    1230             : /*
    1231             :  * Create a dummy transaction for use in a live update hook function.  This
    1232             :  * function MUST NOT be called from regular repair code because the current
    1233             :  * process' transaction is saved via the cookie.
    1234             :  */
    1235             : int
    1236      197889 : xrep_trans_alloc_hook_dummy(
    1237             :         struct xfs_mount        *mp,
    1238             :         void                    **cookiep,
    1239             :         struct xfs_trans        **tpp)
    1240             : {
    1241      197889 :         int                     error;
    1242             : 
    1243      197889 :         *cookiep = current->journal_info;
    1244      197889 :         current->journal_info = NULL;
    1245             : 
    1246      197889 :         error = xfs_trans_alloc_empty(mp, tpp);
    1247      197889 :         if (!error)
    1248             :                 return 0;
    1249             : 
    1250           0 :         current->journal_info = *cookiep;
    1251           0 :         *cookiep = NULL;
    1252           0 :         return error;
    1253             : }
    1254             : 
/*
 * Cancel a dummy transaction used by a live update hook function.
 *
 * Counterpart to xrep_trans_alloc_hook_dummy(): cancels the empty
 * transaction @tp, then restores the caller's saved transaction context
 * from *@cookiep and clears the cookie.
 *
 * NOTE(review): the cancel must happen before the restore — presumably
 * xfs_trans_cancel() manipulates current->journal_info for @tp, which
 * would clobber the restored value if the order were reversed; confirm
 * against xfs_trans_cancel()'s context handling.
 */
void
xrep_trans_cancel_hook_dummy(
	void			**cookiep,
	struct xfs_trans	*tp)
{
	xfs_trans_cancel(tp);
	current->journal_info = *cookiep;
	*cookiep = NULL;
}
    1265             : 
    1266             : /*
    1267             :  * See if this buffer can pass the given ->verify_struct() function.
    1268             :  *
    1269             :  * If the buffer already has ops attached and they're not the ones that were
    1270             :  * passed in, we reject the buffer.  Otherwise, we perform the structure test
    1271             :  * (note that we do not check CRCs) and return the outcome of the test.  The
    1272             :  * buffer ops and error state are left unchanged.
    1273             :  */
    1274             : bool
    1275       55900 : xrep_buf_verify_struct(
    1276             :         struct xfs_buf                  *bp,
    1277             :         const struct xfs_buf_ops        *ops)
    1278             : {
    1279       55900 :         const struct xfs_buf_ops        *old_ops = bp->b_ops;
    1280       55900 :         xfs_failaddr_t                  fa;
    1281       55900 :         int                             old_error;
    1282             : 
    1283       55900 :         if (old_ops) {
    1284       55900 :                 if (old_ops != ops)
    1285             :                         return false;
    1286             :         }
    1287             : 
    1288       55900 :         old_error = bp->b_error;
    1289       55900 :         bp->b_ops = ops;
    1290       55900 :         fa = bp->b_ops->verify_struct(bp);
    1291       55900 :         bp->b_ops = old_ops;
    1292       55900 :         bp->b_error = old_error;
    1293             : 
    1294       55900 :         return fa == NULL;
    1295             : }

Generated by: LCOV version 1.14