fs/xfs/scrub/repair.c

   1 // SPDX-License-Identifier: GPL-2.0+
   2 /*
   3  * Copyright (C) 2018 Oracle.  All Rights Reserved.
   4  * Author: Darrick J. Wong <darrick.wong@oracle.com>
   5  */
   6 #include "xfs.h"
   7 #include "xfs_fs.h"
   8 #include "xfs_shared.h"
   9 #include "xfs_format.h"
  10 #include "xfs_trans_resv.h"
  11 #include "xfs_mount.h"
  12 #include "xfs_btree.h"
  13 #include "xfs_log_format.h"
  14 #include "xfs_trans.h"
  15 #include "xfs_sb.h"
  16 #include "xfs_inode.h"
  17 #include "xfs_alloc.h"
  18 #include "xfs_alloc_btree.h"
  19 #include "xfs_ialloc.h"
  20 #include "xfs_ialloc_btree.h"
  21 #include "xfs_rmap.h"
  22 #include "xfs_rmap_btree.h"
  23 #include "xfs_refcount_btree.h"
  24 #include "xfs_extent_busy.h"
  25 #include "xfs_ag_resv.h"
  26 #include "xfs_quota.h"
  27 #include "scrub/scrub.h"
  28 #include "scrub/common.h"
  29 #include "scrub/trace.h"
  30 #include "scrub/repair.h"
  31 #include "scrub/bitmap.h"
  32
  33 /*
  34  * Attempt to repair some metadata, if the metadata is corrupt and userspace
  35  * told us to fix it.  This function returns -EAGAIN to mean "re-run scrub",
  36  * and will set *fixed to true if it thinks it repaired anything.
  37  */
  38 int
  39 xrep_attempt(
  40         struct xfs_scrub        *sc)
  41 {
  42         int                     error = 0;
  43
  44         trace_xrep_attempt(XFS_I(file_inode(sc->file)), sc->sm, error);
  45
  46         xchk_ag_btcur_free(&sc->sa);
  47
  48         /* Repair whatever's broken. */
  49         ASSERT(sc->ops->repair);
  50         error = sc->ops->repair(sc);
  51         trace_xrep_done(XFS_I(file_inode(sc->file)), sc->sm, error);
  52         switch (error) {
  53         case 0:
  54                 /*
  55                  * Repair succeeded.  Commit the fixes and perform a second
  56                  * scrub so that we can tell userspace if we fixed the problem.
  57                  */
  58                 sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
  59                 sc->flags |= XREP_ALREADY_FIXED;
  60                 return -EAGAIN;
  61         case -EDEADLOCK:
  62         case -EAGAIN:
  63                 /* Tell the caller to try again having grabbed all the locks. */
  64                 if (!(sc->flags & XCHK_TRY_HARDER)) {
  65                         sc->flags |= XCHK_TRY_HARDER;
  66                         return -EAGAIN;
  67                 }
  68                 /*
  69                  * We tried harder but still couldn't grab all the resources
  70                  * we needed to fix it.  The corruption has not been fixed,
  71                  * so report back to userspace.
  72                  */
  73                 return -EFSCORRUPTED;
  74         default:
  75                 return error;
  76         }
  77 }
  78
  79 /*
  80  * Complain about unfixable problems in the filesystem.  We don't log
  81  * corruptions when IFLAG_REPAIR wasn't set on the assumption that the driver
  82  * program is xfs_scrub, which will call back with IFLAG_REPAIR set if the
  83  * administrator isn't running xfs_scrub in no-repairs mode.
  84  *
  85  * Use this helper function because _ratelimited silently declares a static
  86  * structure to track rate limiting information.
  87  */
  88 void
  89 xrep_failure(
  90         struct xfs_mount        *mp)
  91 {
  92         xfs_alert_ratelimited(mp,
  93 "Corruption not fixed during online repair.  Unmount and run xfs_repair.");
  94 }
  95
  96 /*
  97  * Repair probe -- userspace uses this to probe if we're willing to repair a
  98  * given mountpoint.
  99  */
 100 int
 101 xrep_probe(
 102         struct xfs_scrub        *sc)
 103 {
 104         int                     error = 0;
 105
 106         if (xchk_should_terminate(sc, &error))
 107                 return error;
 108
 109         return 0;
 110 }
 111
 112 /*
 113  * Roll a transaction, keeping the AG headers locked and reinitializing
 114  * the btree cursors.
 115  */
 116 int
 117 xrep_roll_ag_trans(
 118         struct xfs_scrub        *sc)
 119 {
 120         int                     error;
 121
 122         /* Keep the AG header buffers locked so we can keep going. */
 123         if (sc->sa.agi_bp)
 124                 xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
 125         if (sc->sa.agf_bp)
 126                 xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
 127         if (sc->sa.agfl_bp)
 128                 xfs_trans_bhold(sc->tp, sc->sa.agfl_bp);
 129
 130         /*
 131          * Roll the transaction.  We still own the buffer and the buffer lock
 132          * regardless of whether or not the roll succeeds.  If the roll fails,
 133          * the buffers will be released during teardown on our way out of the
 134          * kernel.  If it succeeds, we join them to the new transaction and
 135          * move on.
 136          */
 137         error = xfs_trans_roll(&sc->tp);
 138         if (error)
 139                 return error;
 140
 141         /* Join AG headers to the new transaction. */
 142         if (sc->sa.agi_bp)
 143                 xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
 144         if (sc->sa.agf_bp)
 145                 xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
 146         if (sc->sa.agfl_bp)
 147                 xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp);
 148
 149         return 0;
 150 }
 151
 152 /*
 153  * Does the given AG have enough space to rebuild a btree?  Neither AG
 154  * reservation can be critical, and we must have enough space (factoring
 155  * in AG reservations) to construct a whole btree.
 156  */
 157 bool
 158 xrep_ag_has_space(
 159         struct xfs_perag        *pag,
 160         xfs_extlen_t            nr_blocks,
 161         enum xfs_ag_resv_type   type)
 162 {
 163         return  !xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) &&
 164                 !xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA) &&
 165                 pag->pagf_freeblks > xfs_ag_resv_needed(pag, type) + nr_blocks;
 166 }
 167
 168 /*
 169  * Figure out how many blocks to reserve for an AG repair.  We calculate the
 170  * worst case estimate for the number of blocks we'd need to rebuild one of
 171  * any type of per-AG btree.
 172  */
 173 xfs_extlen_t
 174 xrep_calc_ag_resblks(
 175         struct xfs_scrub                *sc)
 176 {
 177         struct xfs_mount                *mp = sc->mp;
 178         struct xfs_scrub_metadata       *sm = sc->sm;
 179         struct xfs_perag                *pag;
 180         struct xfs_buf                  *bp;
 181         xfs_agino_t                     icount = NULLAGINO;
 182         xfs_extlen_t                    aglen = NULLAGBLOCK;
 183         xfs_extlen_t                    usedlen;
 184         xfs_extlen_t                    freelen;
 185         xfs_extlen_t                    bnobt_sz;
 186         xfs_extlen_t                    inobt_sz;
 187         xfs_extlen_t                    rmapbt_sz;
 188         xfs_extlen_t                    refcbt_sz;
 189         int                             error;
 190
 191         if (!(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
 192                 return 0;
 193
 194         pag = xfs_perag_get(mp, sm->sm_agno);
 195         if (pag->pagi_init) {
 196                 /* Use in-core icount if possible. */
 197                 icount = pag->pagi_count;
 198         } else {
 199                 /* Try to get the actual counters from disk. */
 200                 error = xfs_ialloc_read_agi(mp, NULL, sm->sm_agno, &bp);
 201                 if (!error) {
 202                         icount = pag->pagi_count;
 203                         xfs_buf_relse(bp);
 204                 }
 205         }
 206
 207         /* Now grab the block counters from the AGF. */
 208         error = xfs_alloc_read_agf(mp, NULL, sm->sm_agno, 0, &bp);
 209         if (error) {
 210                 aglen = xfs_ag_block_count(mp, sm->sm_agno);
 211                 freelen = aglen;
 212                 usedlen = aglen;
 213         } else {
 214                 struct xfs_agf  *agf = bp->b_addr;
 215
 216                 aglen = be32_to_cpu(agf->agf_length);
 217                 freelen = be32_to_cpu(agf->agf_freeblks);
 218                 usedlen = aglen - freelen;
 219                 xfs_buf_relse(bp);
 220         }
 221         xfs_perag_put(pag);
 222
 223         /* If the icount is impossible, make some worst-case assumptions. */
 224         if (icount == NULLAGINO ||
 225             !xfs_verify_agino(mp, sm->sm_agno, icount)) {
 226                 xfs_agino_t     first, last;
 227
 228                 xfs_agino_range(mp, sm->sm_agno, &first, &last);
 229                 icount = last - first + 1;
 230         }
 231
 232         /* If the block counts are impossible, make worst-case assumptions. */
 233         if (aglen == NULLAGBLOCK ||
 234             aglen != xfs_ag_block_count(mp, sm->sm_agno) ||
 235             freelen >= aglen) {
 236                 aglen = xfs_ag_block_count(mp, sm->sm_agno);
 237                 freelen = aglen;
 238                 usedlen = aglen;
 239         }
 240
 241         trace_xrep_calc_ag_resblks(mp, sm->sm_agno, icount, aglen,
 242                         freelen, usedlen);
 243
 244         /*
 245          * Figure out how many blocks we'd need worst case to rebuild
 246          * each type of btree.  Note that we can only rebuild the
 247          * bnobt/cntbt or inobt/finobt as pairs.
 248          */
 249         bnobt_sz = 2 * xfs_allocbt_calc_size(mp, freelen);
 250         if (xfs_sb_version_hassparseinodes(&mp->m_sb))
 251                 inobt_sz = xfs_iallocbt_calc_size(mp, icount /
 252                                 XFS_INODES_PER_HOLEMASK_BIT);
 253         else
 254                 inobt_sz = xfs_iallocbt_calc_size(mp, icount /
 255                                 XFS_INODES_PER_CHUNK);
 256         if (xfs_sb_version_hasfinobt(&mp->m_sb))
 257                 inobt_sz *= 2;
 258         if (xfs_sb_version_hasreflink(&mp->m_sb))
 259                 refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen);
 260         else
 261                 refcbt_sz = 0;
 262         if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
 263                 /*
 264                  * Guess how many blocks we need to rebuild the rmapbt.
 265                  * For non-reflink filesystems we can't have more records than
 266                  * used blocks.  However, with reflink it's possible to have
 267                  * more than one rmap record per AG block.  We don't know how
 268                  * many rmaps there could be in the AG, so we start off with
 269                  * what we hope is an generous over-estimation.
 270                  */
 271                 if (xfs_sb_version_hasreflink(&mp->m_sb))
 272                         rmapbt_sz = xfs_rmapbt_calc_size(mp,
 273                                         (unsigned long long)aglen * 2);
 274                 else
 275                         rmapbt_sz = xfs_rmapbt_calc_size(mp, usedlen);
 276         } else {
 277                 rmapbt_sz = 0;
 278         }
 279
 280         trace_xrep_calc_ag_resblks_btsize(mp, sm->sm_agno, bnobt_sz,
 281                         inobt_sz, rmapbt_sz, refcbt_sz);
 282
 283         return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz));
 284 }
 285
 286 /* Allocate a block in an AG. */
 287 int
 288 xrep_alloc_ag_block(
 289         struct xfs_scrub                *sc,
 290         const struct xfs_owner_info     *oinfo,
 291         xfs_fsblock_t                   *fsbno,
 292         enum xfs_ag_resv_type           resv)
 293 {
 294         struct xfs_alloc_arg            args = {0};
 295         xfs_agblock_t                   bno;
 296         int                             error;
 297
 298         switch (resv) {
 299         case XFS_AG_RESV_AGFL:
 300         case XFS_AG_RESV_RMAPBT:
 301                 error = xfs_alloc_get_freelist(sc->tp, sc->sa.agf_bp, &bno, 1);
 302                 if (error)
 303                         return error;
 304                 if (bno == NULLAGBLOCK)
 305                         return -ENOSPC;
 306                 xfs_extent_busy_reuse(sc->mp, sc->sa.agno, bno,
 307                                 1, false);
 308                 *fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, bno);
 309                 if (resv == XFS_AG_RESV_RMAPBT)
 310                         xfs_ag_resv_rmapbt_alloc(sc->mp, sc->sa.agno);
 311                 return 0;
 312         default:
 313                 break;
 314         }
 315
 316         args.tp = sc->tp;
 317         args.mp = sc->mp;
 318         args.oinfo = *oinfo;
 319         args.fsbno = XFS_AGB_TO_FSB(args.mp, sc->sa.agno, 0);
 320         args.minlen = 1;
 321         args.maxlen = 1;
 322         args.prod = 1;
 323         args.type = XFS_ALLOCTYPE_THIS_AG;
 324         args.resv = resv;
 325
 326         error = xfs_alloc_vextent(&args);
 327         if (error)
 328                 return error;
 329         if (args.fsbno == NULLFSBLOCK)
 330                 return -ENOSPC;
 331         ASSERT(args.len == 1);
 332         *fsbno = args.fsbno;
 333
 334         return 0;
 335 }
 336
 337 /* Initialize a new AG btree root block with zero entries. */
 338 int
 339 xrep_init_btblock(
 340         struct xfs_scrub                *sc,
 341         xfs_fsblock_t                   fsb,
 342         struct xfs_buf                  **bpp,
 343         xfs_btnum_t                     btnum,
 344         const struct xfs_buf_ops        *ops)
 345 {
 346         struct xfs_trans                *tp = sc->tp;
 347         struct xfs_mount                *mp = sc->mp;
 348         struct xfs_buf                  *bp;
 349         int                             error;
 350
 351         trace_xrep_init_btblock(mp, XFS_FSB_TO_AGNO(mp, fsb),
 352                         XFS_FSB_TO_AGBNO(mp, fsb), btnum);
 353
 354         ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.agno);
 355         error = xfs_trans_get_buf(tp, mp->m_ddev_targp,
 356                         XFS_FSB_TO_DADDR(mp, fsb), XFS_FSB_TO_BB(mp, 1), 0,
 357                         &bp);
 358         if (error)
 359                 return error;
 360         xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
 361         xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno);
 362         xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
 363         xfs_trans_log_buf(tp, bp, 0, BBTOB(bp->b_length) - 1);
 364         bp->b_ops = ops;
 365         *bpp = bp;
 366
 367         return 0;
 368 }
 369
 370 /*
 371  * Reconstructing per-AG Btrees
 372  *
 373  * When a space btree is corrupt, we don't bother trying to fix it.  Instead,
 374  * we scan secondary space metadata to derive the records that should be in
 375  * the damaged btree, initialize a fresh btree root, and insert the records.
 376  * Note that for rebuilding the rmapbt we scan all the primary data to
 377  * generate the new records.
 378  *
 379  * However, that leaves the matter of removing all the metadata describing the
 380  * old broken structure.  For primary metadata we use the rmap data to collect
 381  * every extent with a matching rmap owner (bitmap); we then iterate all other
 382  * metadata structures with the same rmap owner to collect the extents that
 383  * cannot be removed (sublist).  We then subtract sublist from bitmap to
 384  * derive the blocks that were used by the old btree.  These blocks can be
 385  * reaped.
 386  *
 387  * For rmapbt reconstructions we must use different tactics for extent
 388  * collection.  First we iterate all primary metadata (this excludes the old
 389  * rmapbt, obviously) to generate new rmap records.  The gaps in the rmap
 390  * records are collected as bitmap.  The bnobt records are collected as
 391  * sublist.  As with the other btrees we subtract sublist from bitmap, and the
 392  * result (since the rmapbt lives in the free space) are the blocks from the
 393  * old rmapbt.
 394  *
 395  * Disposal of Blocks from Old per-AG Btrees
 396  *
 397  * Now that we've constructed a new btree to replace the damaged one, we want
 398  * to dispose of the blocks that (we think) the old btree was using.
 399  * Previously, we used the rmapbt to collect the extents (bitmap) with the
 400  * rmap owner corresponding to the tree we rebuilt, collected extents for any
 401  * blocks with the same rmap owner that are owned by another data structure
 402  * (sublist), and subtracted sublist from bitmap.  In theory the extents
 403  * remaining in bitmap are the old btree's blocks.
 404  *
 * Unfortunately, it's possible that the btree was crosslinked with other
 * blocks on disk.  The rmap data can tell us if there are multiple owners, so
 * if the rmapbt says there is an owner of this block other than @oinfo, then
 * the block is crosslinked.  In that case we remove only the reverse mapping
 * for @oinfo, which doesn't add the block to the free space, and continue.
 * Our repair strategy is to hope the other metadata objects crosslinked on
 * this block will be rebuilt (atop different blocks), thereby removing all
 * the cross links.
 *
 * If there is one rmap record, we can free the block, which also removes the
 * reverse mapping.
 415  *
 416  * If there are no rmap records at all, we also free the block.  If the btree
 417  * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't
 418  * supposed to be a rmap record and everything is ok.  For other btrees there
 419  * had to have been an rmap entry for the block to have ended up on @bitmap,
 420  * so if it's gone now there's something wrong and the fs will shut down.
 421  *
 422  * Note: If there are multiple rmap records with only the same rmap owner as
 423  * the btree we're trying to rebuild and the block is indeed owned by another
 424  * data structure with the same rmap owner, then the block will be in sublist
 425  * and therefore doesn't need disposal.  If there are multiple rmap records
 426  * with only the same rmap owner but the block is not owned by something with
 427  * the same rmap owner, the block will be freed.
 428  *
 429  * The caller is responsible for locking the AG headers for the entire rebuild
 430  * operation so that nothing else can sneak in and change the AG state while
 431  * we're not looking.  We also assume that the caller already invalidated any
 432  * buffers associated with @bitmap.
 433  */
 434
 435 /*
 436  * Invalidate buffers for per-AG btree blocks we're dumping.  This function
 437  * is not intended for use with file data repairs; we have bunmapi for that.
 438  */
 439 int
 440 xrep_invalidate_blocks(
 441         struct xfs_scrub        *sc,
 442         struct xbitmap          *bitmap)
 443 {
 444         struct xbitmap_range    *bmr;
 445         struct xbitmap_range    *n;
 446         struct xfs_buf          *bp;
 447         xfs_fsblock_t           fsbno;
 448
 449         /*
 450          * For each block in each extent, see if there's an incore buffer for
 451          * exactly that block; if so, invalidate it.  The buffer cache only
 452          * lets us look for one buffer at a time, so we have to look one block
 453          * at a time.  Avoid invalidating AG headers and post-EOFS blocks
 454          * because we never own those; and if we can't TRYLOCK the buffer we
 455          * assume it's owned by someone else.
 456          */
 457         for_each_xbitmap_block(fsbno, bmr, n, bitmap) {
 458                 /* Skip AG headers and post-EOFS blocks */
 459                 if (!xfs_verify_fsbno(sc->mp, fsbno))
 460                         continue;
 461                 bp = xfs_buf_incore(sc->mp->m_ddev_targp,
 462                                 XFS_FSB_TO_DADDR(sc->mp, fsbno),
 463                                 XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK);
 464                 if (bp) {
 465                         xfs_trans_bjoin(sc->tp, bp);
 466                         xfs_trans_binval(sc->tp, bp);
 467                 }
 468         }
 469
 470         return 0;
 471 }
 472
 473 /* Ensure the freelist is the correct size. */
 474 int
 475 xrep_fix_freelist(
 476         struct xfs_scrub        *sc,
 477         bool                    can_shrink)
 478 {
 479         struct xfs_alloc_arg    args = {0};
 480
 481         args.mp = sc->mp;
 482         args.tp = sc->tp;
 483         args.agno = sc->sa.agno;
 484         args.alignment = 1;
 485         args.pag = sc->sa.pag;
 486
 487         return xfs_alloc_fix_freelist(&args,
 488                         can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK);
 489 }
 490
 491 /*
 492  * Put a block back on the AGFL.
 493  */
 494 STATIC int
 495 xrep_put_freelist(
 496         struct xfs_scrub        *sc,
 497         xfs_agblock_t           agbno)
 498 {
 499         int                     error;
 500
 501         /* Make sure there's space on the freelist. */
 502         error = xrep_fix_freelist(sc, true);
 503         if (error)
 504                 return error;
 505
 506         /*
 507          * Since we're "freeing" a lost block onto the AGFL, we have to
 508          * create an rmap for the block prior to merging it or else other
 509          * parts will break.
 510          */
 511         error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.agno, agbno, 1,
 512                         &XFS_RMAP_OINFO_AG);
 513         if (error)
 514                 return error;
 515
 516         /* Put the block on the AGFL. */
 517         error = xfs_alloc_put_freelist(sc->tp, sc->sa.agf_bp, sc->sa.agfl_bp,
 518                         agbno, 0);
 519         if (error)
 520                 return error;
 521         xfs_extent_busy_insert(sc->tp, sc->sa.agno, agbno, 1,
 522                         XFS_EXTENT_BUSY_SKIP_DISCARD);
 523
 524         return 0;
 525 }
 526
 527 /* Dispose of a single block. */
 528 STATIC int
 529 xrep_reap_block(
 530         struct xfs_scrub                *sc,
 531         xfs_fsblock_t                   fsbno,
 532         const struct xfs_owner_info     *oinfo,
 533         enum xfs_ag_resv_type           resv)
 534 {
 535         struct xfs_btree_cur            *cur;
 536         struct xfs_buf                  *agf_bp = NULL;
 537         xfs_agnumber_t                  agno;
 538         xfs_agblock_t                   agbno;
 539         bool                            has_other_rmap;
 540         int                             error;
 541
 542         agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
 543         agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
 544
 545         /*
 546          * If we are repairing per-inode metadata, we need to read in the AGF
 547          * buffer.  Otherwise, we're repairing a per-AG structure, so reuse
 548          * the AGF buffer that the setup functions already grabbed.
 549          */
 550         if (sc->ip) {
 551                 error = xfs_alloc_read_agf(sc->mp, sc->tp, agno, 0, &agf_bp);
 552                 if (error)
 553                         return error;
 554         } else {
 555                 agf_bp = sc->sa.agf_bp;
 556         }
 557         cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, agno);
 558
 559         /* Can we find any other rmappings? */
 560         error = xfs_rmap_has_other_keys(cur, agbno, 1, oinfo, &has_other_rmap);
 561         xfs_btree_del_cursor(cur, error);
 562         if (error)
 563                 goto out_free;
 564
 565         /*
 566          * If there are other rmappings, this block is cross linked and must
 567          * not be freed.  Remove the reverse mapping and move on.  Otherwise,
 568          * we were the only owner of the block, so free the extent, which will
 569          * also remove the rmap.
 570          *
 571          * XXX: XFS doesn't support detecting the case where a single block
 572          * metadata structure is crosslinked with a multi-block structure
 573          * because the buffer cache doesn't detect aliasing problems, so we
 574          * can't fix 100% of crosslinking problems (yet).  The verifiers will
 575          * blow on writeout, the filesystem will shut down, and the admin gets
 576          * to run xfs_repair.
 577          */
 578         if (has_other_rmap)
 579                 error = xfs_rmap_free(sc->tp, agf_bp, agno, agbno, 1, oinfo);
 580         else if (resv == XFS_AG_RESV_AGFL)
 581                 error = xrep_put_freelist(sc, agbno);
 582         else
 583                 error = xfs_free_extent(sc->tp, fsbno, 1, oinfo, resv);
 584         if (agf_bp != sc->sa.agf_bp)
 585                 xfs_trans_brelse(sc->tp, agf_bp);
 586         if (error)
 587                 return error;
 588
 589         if (sc->ip)
 590                 return xfs_trans_roll_inode(&sc->tp, sc->ip);
 591         return xrep_roll_ag_trans(sc);
 592
 593 out_free:
 594         if (agf_bp != sc->sa.agf_bp)
 595                 xfs_trans_brelse(sc->tp, agf_bp);
 596         return error;
 597 }
 598
 599 /* Dispose of every block of every extent in the bitmap. */
 600 int
 601 xrep_reap_extents(
 602         struct xfs_scrub                *sc,
 603         struct xbitmap                  *bitmap,
 604         const struct xfs_owner_info     *oinfo,
 605         enum xfs_ag_resv_type           type)
 606 {
 607         struct xbitmap_range            *bmr;
 608         struct xbitmap_range            *n;
 609         xfs_fsblock_t                   fsbno;
 610         int                             error = 0;
 611
 612         ASSERT(xfs_sb_version_hasrmapbt(&sc->mp->m_sb));
 613
 614         for_each_xbitmap_block(fsbno, bmr, n, bitmap) {
 615                 ASSERT(sc->ip != NULL ||
 616                        XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.agno);
 617                 trace_xrep_dispose_btree_extent(sc->mp,
 618                                 XFS_FSB_TO_AGNO(sc->mp, fsbno),
 619                                 XFS_FSB_TO_AGBNO(sc->mp, fsbno), 1);
 620
 621                 error = xrep_reap_block(sc, fsbno, oinfo, type);
 622                 if (error)
 623                         break;
 624         }
 625
 626         return error;
 627 }
 628
 629 /*
 630  * Finding per-AG Btree Roots for AGF/AGI Reconstruction
 631  *
 632  * If the AGF or AGI become slightly corrupted, it may be necessary to rebuild
 633  * the AG headers by using the rmap data to rummage through the AG looking for
 634  * btree roots.  This is not guaranteed to work if the AG is heavily damaged
 635  * or the rmap data are corrupt.
 636  *
 637  * Callers of xrep_find_ag_btree_roots must lock the AGF and AGFL
 638  * buffers if the AGF is being rebuilt; or the AGF and AGI buffers if the
 639  * AGI is being rebuilt.  It must maintain these locks until it's safe for
 640  * other threads to change the btrees' shapes.  The caller provides
 641  * information about the btrees to look for by passing in an array of
 642  * xrep_find_ag_btree with the (rmap owner, buf_ops, magic) fields set.
 643  * The (root, height) fields will be set on return if anything is found.  The
 644  * last element of the array should have a NULL buf_ops to mark the end of the
 645  * array.
 646  *
 647  * For every rmapbt record matching any of the rmap owners in btree_info,
 648  * read each block referenced by the rmap record.  If the block is a btree
 649  * block from this filesystem matching any of the magic numbers and has a
 650  * level higher than what we've already seen, remember the block and the
 651  * height of the tree required to have such a block.  When the call completes,
 652  * we return the highest block we've found for each btree description; those
 653  * should be the roots.
 654  */
 655
/* State shared by the helpers that search an AG for btree root blocks. */
struct xrep_findroot {
	/* Scrub context; supplies the mount, transaction and AG number. */
	struct xfs_scrub		*sc;
	/* AGFL buffer, walked to filter out blocks that sit on the AGFL. */
	struct xfs_buf			*agfl_bp;
	/* AGF header handed to the AGFL walk. */
	struct xfs_agf			*agf;
	/*
	 * Descriptions of the btrees to look for -- presumably the caller's
	 * array of xrep_find_ag_btree entries; not used in the code visible
	 * here, confirm against xrep_find_ag_btree_roots.
	 */
	struct xrep_find_ag_btree	*btree_info;
};
 662
 663 /* See if our block is in the AGFL. */
 664 STATIC int
 665 xrep_findroot_agfl_walk(
 666         struct xfs_mount        *mp,
 667         xfs_agblock_t           bno,
 668         void                    *priv)
 669 {
 670         xfs_agblock_t           *agbno = priv;
 671
 672         return (*agbno == bno) ? -ECANCELED : 0;
 673 }
 674
 675 /* Does this block match the btree information passed in? */
 676 STATIC int
 677 xrep_findroot_block(
 678         struct xrep_findroot            *ri,
 679         struct xrep_find_ag_btree       *fab,
 680         uint64_t                        owner,
 681         xfs_agblock_t                   agbno,
 682         bool                            *done_with_block)
 683 {
 684         struct xfs_mount                *mp = ri->sc->mp;
 685         struct xfs_buf                  *bp;
 686         struct xfs_btree_block          *btblock;
 687         xfs_daddr_t                     daddr;
 688         int                             block_level;
 689         int                             error = 0;
 690
 691         daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.agno, agbno);
 692
 693         /*
 694          * Blocks in the AGFL have stale contents that might just happen to
 695          * have a matching magic and uuid.  We don't want to pull these blocks
 696          * in as part of a tree root, so we have to filter out the AGFL stuff
 697          * here.  If the AGFL looks insane we'll just refuse to repair.
 698          */
 699         if (owner == XFS_RMAP_OWN_AG) {
 700                 error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp,
 701                                 xrep_findroot_agfl_walk, &agbno);
 702                 if (error == -ECANCELED)
 703                         return 0;
 704                 if (error)
 705                         return error;
 706         }
 707
 708         /*
 709          * Read the buffer into memory so that we can see if it's a match for
 710          * our btree type.  We have no clue if it is beforehand, and we want to
 711          * avoid xfs_trans_read_buf's behavior of dumping the DONE state (which
 712          * will cause needless disk reads in subsequent calls to this function)
 713          * and logging metadata verifier failures.
 714          *
 715          * Therefore, pass in NULL buffer ops.  If the buffer was already in
 716          * memory from some other caller it will already have b_ops assigned.
 717          * If it was in memory from a previous unsuccessful findroot_block
 718          * call, the buffer won't have b_ops but it should be clean and ready
 719          * for us to try to verify if the read call succeeds.  The same applies
 720          * if the buffer wasn't in memory at all.
 721          *
 722          * Note: If we never match a btree type with this buffer, it will be
 723          * left in memory with NULL b_ops.  This shouldn't be a problem unless
 724          * the buffer gets written.
 725          */
 726         error = xfs_trans_read_buf(mp, ri->sc->tp, mp->m_ddev_targp, daddr,
 727                         mp->m_bsize, 0, &bp, NULL);
 728         if (error)
 729                 return error;
 730
 731         /* Ensure the block magic matches the btree type we're looking for. */
 732         btblock = XFS_BUF_TO_BLOCK(bp);
 733         ASSERT(fab->buf_ops->magic[1] != 0);
 734         if (btblock->bb_magic != fab->buf_ops->magic[1])
 735                 goto out;
 736
 737         /*
 738          * If the buffer already has ops applied and they're not the ones for
 739          * this btree type, we know this block doesn't match the btree and we
 740          * can bail out.
 741          *
 742          * If the buffer ops match ours, someone else has already validated
 743          * the block for us, so we can move on to checking if this is a root
 744          * block candidate.
 745          *
 746          * If the buffer does not have ops, nobody has successfully validated
 747          * the contents and the buffer cannot be dirty.  If the magic, uuid,
 748          * and structure match this btree type then we'll move on to checking
 749          * if it's a root block candidate.  If there is no match, bail out.
 750          */
 751         if (bp->b_ops) {
 752                 if (bp->b_ops != fab->buf_ops)
 753                         goto out;
 754         } else {
 755                 ASSERT(!xfs_trans_buf_is_dirty(bp));
 756                 if (!uuid_equal(&btblock->bb_u.s.bb_uuid,
 757                                 &mp->m_sb.sb_meta_uuid))
 758                         goto out;
 759                 /*
 760                  * Read verifiers can reference b_ops, so we set the pointer
 761                  * here.  If the verifier fails we'll reset the buffer state
 762                  * to what it was before we touched the buffer.
 763                  */
 764                 bp->b_ops = fab->buf_ops;
 765                 fab->buf_ops->verify_read(bp);
 766                 if (bp->b_error) {
 767                         bp->b_ops = NULL;
 768                         bp->b_error = 0;
 769                         goto out;
 770                 }
 771
 772                 /*
 773                  * Some read verifiers will (re)set b_ops, so we must be
 774                  * careful not to change b_ops after running the verifier.
 775                  */
 776         }
 777
 778         /*
 779          * This block passes the magic/uuid and verifier tests for this btree
 780          * type.  We don't need the caller to try the other tree types.
 781          */
 782         *done_with_block = true;
 783
 784         /*
 785          * Compare this btree block's level to the height of the current
 786          * candidate root block.
 787          *
 788          * If the level matches the root we found previously, throw away both
 789          * blocks because there can't be two candidate roots.
 790          *
 791          * If level is lower in the tree than the root we found previously,
 792          * ignore this block.
 793          */
 794         block_level = xfs_btree_get_level(btblock);
 795         if (block_level + 1 == fab->height) {
 796                 fab->root = NULLAGBLOCK;
 797                 goto out;
 798         } else if (block_level < fab->height) {
 799                 goto out;
 800         }
 801
 802         /*
 803          * This is the highest block in the tree that we've found so far.
 804          * Update the btree height to reflect what we've learned from this
 805          * block.
 806          */
 807         fab->height = block_level + 1;
 808
 809         /*
 810          * If this block doesn't have sibling pointers, then it's the new root
 811          * block candidate.  Otherwise, the root will be found farther up the
 812          * tree.
 813          */
 814         if (btblock->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) &&
 815             btblock->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
 816                 fab->root = agbno;
 817         else
 818                 fab->root = NULLAGBLOCK;
 819
 820         trace_xrep_findroot_block(mp, ri->sc->sa.agno, agbno,
 821                         be32_to_cpu(btblock->bb_magic), fab->height - 1);
 822 out:
 823         xfs_trans_brelse(ri->sc->tp, bp);
 824         return error;
 825 }
 826
 827 /*
 828  * Do any of the blocks in this rmap record match one of the btrees we're
 829  * looking for?
 830  */
 831 STATIC int
 832 xrep_findroot_rmap(
 833         struct xfs_btree_cur            *cur,
 834         struct xfs_rmap_irec            *rec,
 835         void                            *priv)
 836 {
 837         struct xrep_findroot            *ri = priv;
 838         struct xrep_find_ag_btree       *fab;
 839         xfs_agblock_t                   b;
 840         bool                            done;
 841         int                             error = 0;
 842
 843         /* Ignore anything that isn't AG metadata. */
 844         if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
 845                 return 0;
 846
 847         /* Otherwise scan each block + btree type. */
 848         for (b = 0; b < rec->rm_blockcount; b++) {
 849                 done = false;
 850                 for (fab = ri->btree_info; fab->buf_ops; fab++) {
 851                         if (rec->rm_owner != fab->rmap_owner)
 852                                 continue;
 853                         error = xrep_findroot_block(ri, fab,
 854                                         rec->rm_owner, rec->rm_startblock + b,
 855                                         &done);
 856                         if (error)
 857                                 return error;
 858                         if (done)
 859                                 break;
 860                 }
 861         }
 862
 863         return 0;
 864 }
 865
 866 /* Find the roots of the per-AG btrees described in btree_info. */
 867 int
 868 xrep_find_ag_btree_roots(
 869         struct xfs_scrub                *sc,
 870         struct xfs_buf                  *agf_bp,
 871         struct xrep_find_ag_btree       *btree_info,
 872         struct xfs_buf                  *agfl_bp)
 873 {
 874         struct xfs_mount                *mp = sc->mp;
 875         struct xrep_findroot            ri;
 876         struct xrep_find_ag_btree       *fab;
 877         struct xfs_btree_cur            *cur;
 878         int                             error;
 879
 880         ASSERT(xfs_buf_islocked(agf_bp));
 881         ASSERT(agfl_bp == NULL || xfs_buf_islocked(agfl_bp));
 882
 883         ri.sc = sc;
 884         ri.btree_info = btree_info;
 885         ri.agf = agf_bp->b_addr;
 886         ri.agfl_bp = agfl_bp;
 887         for (fab = btree_info; fab->buf_ops; fab++) {
 888                 ASSERT(agfl_bp || fab->rmap_owner != XFS_RMAP_OWN_AG);
 889                 ASSERT(XFS_RMAP_NON_INODE_OWNER(fab->rmap_owner));
 890                 fab->root = NULLAGBLOCK;
 891                 fab->height = 0;
 892         }
 893
 894         cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno);
 895         error = xfs_rmap_query_all(cur, xrep_findroot_rmap, &ri);
 896         xfs_btree_del_cursor(cur, error);
 897
 898         return error;
 899 }
 900
 901 /* Force a quotacheck the next time we mount. */
 902 void
 903 xrep_force_quotacheck(
 904         struct xfs_scrub        *sc,
 905         xfs_dqtype_t            type)
 906 {
 907         uint                    flag;
 908
 909         flag = xfs_quota_chkd_flag(type);
 910         if (!(flag & sc->mp->m_qflags))
 911                 return;
 912
 913         sc->mp->m_qflags &= ~flag;
 914         spin_lock(&sc->mp->m_sb_lock);
 915         sc->mp->m_sb.sb_qflags &= ~flag;
 916         spin_unlock(&sc->mp->m_sb_lock);
 917         xfs_log_sb(sc->tp);
 918 }
 919
 920 /*
 921  * Attach dquots to this inode, or schedule quotacheck to fix them.
 922  *
 923  * This function ensures that the appropriate dquots are attached to an inode.
 924  * We cannot allow the dquot code to allocate an on-disk dquot block here
 925  * because we're already in transaction context with the inode locked.  The
 926  * on-disk dquot should already exist anyway.  If the quota code signals
 927  * corruption or missing quota information, schedule quotacheck, which will
 928  * repair corruptions in the quota metadata.
 929  */
 930 int
 931 xrep_ino_dqattach(
 932         struct xfs_scrub        *sc)
 933 {
 934         int                     error;
 935
 936         error = xfs_qm_dqattach_locked(sc->ip, false);
 937         switch (error) {
 938         case -EFSBADCRC:
 939         case -EFSCORRUPTED:
 940         case -ENOENT:
 941                 xfs_err_ratelimited(sc->mp,
 942 "inode %llu repair encountered quota error %d, quotacheck forced.",
 943                                 (unsigned long long)sc->ip->i_ino, error);
 944                 if (XFS_IS_UQUOTA_ON(sc->mp) && !sc->ip->i_udquot)
 945                         xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
 946                 if (XFS_IS_GQUOTA_ON(sc->mp) && !sc->ip->i_gdquot)
 947                         xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
 948                 if (XFS_IS_PQUOTA_ON(sc->mp) && !sc->ip->i_pdquot)
 949                         xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
 950                 /* fall through */
 951         case -ESRCH:
 952                 error = 0;
 953                 break;
 954         default:
 955                 break;
 956         }
 957
 958         return error;
 959 }