fs/xfs/xfs_iwalk.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * Copyright (C) 2019 Oracle.  All Rights Reserved.
   4  * Author: Darrick J. Wong <darrick.wong@oracle.com>
   5  */
   6 #include "xfs.h"
   7 #include "xfs_fs.h"
   8 #include "xfs_shared.h"
   9 #include "xfs_format.h"
  10 #include "xfs_log_format.h"
  11 #include "xfs_trans_resv.h"
  12 #include "xfs_mount.h"
  13 #include "xfs_inode.h"
  14 #include "xfs_btree.h"
  15 #include "xfs_ialloc.h"
  16 #include "xfs_ialloc_btree.h"
  17 #include "xfs_iwalk.h"
  18 #include "xfs_error.h"
  19 #include "xfs_trace.h"
  20 #include "xfs_icache.h"
  21 #include "xfs_health.h"
  22 #include "xfs_trans.h"
  23 #include "xfs_pwork.h"
  24 #include "xfs_ag.h"
  25 #include "xfs_bit.h"
  26
  27 /*
  28  * Walking Inodes in the Filesystem
  29  * ================================
  30  *
  31  * This iterator function walks a subset of filesystem inodes in increasing
  32  * order from @startino until there are no more inodes.  For each allocated
  33  * inode it finds, it calls a walk function with the relevant inode number and
  34  * a pointer to caller-provided data.  The walk function can return the usual
  35  * negative error code to stop the iteration; 0 to continue the iteration; or
  36  * -ECANCELED to stop the iteration.  This return value is returned to the
  37  * caller.
  38  *
  39  * Internally, we allow the walk function to do anything, which means that we
  40  * cannot maintain the inobt cursor or our lock on the AGI buffer.  We
  41  * therefore cache the inobt records in kernel memory and only call the walk
  42  * function when our memory buffer is full.  @nr_recs is the number of records
  43  * that we've cached, and @sz_recs is the size of our cache.
  44  *
  45  * It is the responsibility of the walk function to ensure it accesses
  46  * allocated inodes, as the inobt records may be stale by the time they are
  47  * acted upon.
  48  */
  49
/*
 * State for walking the inobt records and/or inodes of one AG.  The same
 * structure is reused for each AG by the single-threaded walkers, and one is
 * allocated per AG by the threaded walker.
 */
struct xfs_iwalk_ag {
        /*
         * Parallel work control data.  This is embedded (not a pointer) so
         * that the threaded worker can recover this structure from it with
         * container_of().  The single-threaded walkers initialize it to
         * XFS_PWORK_SINGLE_THREADED.
         */
        struct xfs_pwork                pwork;

        /* Filesystem being walked. */
        struct xfs_mount                *mp;

        /*
         * Transaction handed to the inobt cursor and to the walk functions.
         * If @drop_trans is set this is cancelled and set to NULL before the
         * walk functions run.
         */
        struct xfs_trans                *tp;

        /* Per-AG structure of the AG currently being walked. */
        struct xfs_perag                *pag;

        /* Where do we start the traversal? */
        xfs_ino_t                       startino;

        /*
         * Last inode number covered by the most recent inobt record we read
         * (record start + XFS_INODES_PER_CHUNK - 1).  Callers initialize this
         * to NULLFSINO before the first record is read.
         */
        xfs_ino_t                       lastino;

        /* Array of inobt records we cache. */
        struct xfs_inobt_rec_incore     *recs;

        /* Number of entries allocated for the @recs array. */
        unsigned int                    sz_recs;

        /* Number of entries in the @recs array that are in use. */
        unsigned int                    nr_recs;

        /*
         * Walk functions and their data pointer.  Either function may be
         * NULL.  @inobt_walk_fn is called once per cached inobt record;
         * @iwalk_fn is called once per inode whose bit is clear in the
         * record's free mask.
         */
        xfs_iwalk_fn                    iwalk_fn;
        xfs_inobt_walk_fn               inobt_walk_fn;
        void                            *data;

        /*
         * Make it look like the inodes up to startino are free so that
         * bulkstat can start its inode iteration at the correct place without
         * needing to special case everywhere.
         */
        unsigned int                    trim_start:1;

        /* Skip empty inobt records? */
        unsigned int                    skip_empty:1;

        /*
         * Cancel the (hopefully empty) transaction before calling the walk
         * functions, and allocate a fresh empty one afterwards if the walk
         * is going to continue.
         */
        unsigned int                    drop_trans:1;
};
  91
  92 /*
  93  * Loop over all clusters in a chunk for a given incore inode allocation btree
  94  * record.  Do a readahead if there are any allocated inodes in that cluster.
  95  */
  96 STATIC void
  97 xfs_iwalk_ichunk_ra(
  98         struct xfs_mount                *mp,
  99         struct xfs_perag                *pag,
 100         struct xfs_inobt_rec_incore     *irec)
 101 {
 102         struct xfs_ino_geometry         *igeo = M_IGEO(mp);
 103         xfs_agblock_t                   agbno;
 104         struct blk_plug                 plug;
 105         int                             i;      /* inode chunk index */
 106
 107         agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino);
 108
 109         blk_start_plug(&plug);
 110         for (i = 0; i < XFS_INODES_PER_CHUNK; i += igeo->inodes_per_cluster) {
 111                 xfs_inofree_t   imask;
 112
 113                 imask = xfs_inobt_maskn(i, igeo->inodes_per_cluster);
 114                 if (imask & ~irec->ir_free) {
 115                         xfs_btree_reada_bufs(mp, pag->pag_agno, agbno,
 116                                         igeo->blocks_per_cluster,
 117                                         &xfs_inode_buf_ops);
 118                 }
 119                 agbno += igeo->blocks_per_cluster;
 120         }
 121         blk_finish_plug(&plug);
 122 }
 123
 124 /*
 125  * Set the bits in @irec's free mask that correspond to the inodes before
 126  * @agino so that we skip them.  This is how we restart an inode walk that was
 127  * interrupted in the middle of an inode record.
 128  */
 129 STATIC void
 130 xfs_iwalk_adjust_start(
 131         xfs_agino_t                     agino,  /* starting inode of chunk */
 132         struct xfs_inobt_rec_incore     *irec)  /* btree record */
 133 {
 134         int                             idx;    /* index into inode chunk */
 135
 136         idx = agino - irec->ir_startino;
 137
 138         irec->ir_free |= xfs_inobt_maskn(0, idx);
 139         irec->ir_freecount = hweight64(irec->ir_free);
 140 }
 141
 142 /* Allocate memory for a walk. */
 143 STATIC int
 144 xfs_iwalk_alloc(
 145         struct xfs_iwalk_ag     *iwag)
 146 {
 147         size_t                  size;
 148
 149         ASSERT(iwag->recs == NULL);
 150         iwag->nr_recs = 0;
 151
 152         /* Allocate a prefetch buffer for inobt records. */
 153         size = iwag->sz_recs * sizeof(struct xfs_inobt_rec_incore);
 154         iwag->recs = kmalloc(size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
 155         if (iwag->recs == NULL)
 156                 return -ENOMEM;
 157
 158         return 0;
 159 }
 160
 161 /* Free memory we allocated for a walk. */
 162 STATIC void
 163 xfs_iwalk_free(
 164         struct xfs_iwalk_ag     *iwag)
 165 {
 166         kfree(iwag->recs);
 167         iwag->recs = NULL;
 168 }
 169
 170 /* For each inuse inode in each cached inobt record, call our function. */
 171 STATIC int
 172 xfs_iwalk_ag_recs(
 173         struct xfs_iwalk_ag     *iwag)
 174 {
 175         struct xfs_mount        *mp = iwag->mp;
 176         struct xfs_trans        *tp = iwag->tp;
 177         struct xfs_perag        *pag = iwag->pag;
 178         xfs_ino_t               ino;
 179         unsigned int            i, j;
 180         int                     error;
 181
 182         for (i = 0; i < iwag->nr_recs; i++) {
 183                 struct xfs_inobt_rec_incore     *irec = &iwag->recs[i];
 184
 185                 trace_xfs_iwalk_ag_rec(mp, pag->pag_agno, irec);
 186
 187                 if (xfs_pwork_want_abort(&iwag->pwork))
 188                         return 0;
 189
 190                 if (iwag->inobt_walk_fn) {
 191                         error = iwag->inobt_walk_fn(mp, tp, pag->pag_agno, irec,
 192                                         iwag->data);
 193                         if (error)
 194                                 return error;
 195                 }
 196
 197                 if (!iwag->iwalk_fn)
 198                         continue;
 199
 200                 for (j = 0; j < XFS_INODES_PER_CHUNK; j++) {
 201                         if (xfs_pwork_want_abort(&iwag->pwork))
 202                                 return 0;
 203
 204                         /* Skip if this inode is free */
 205                         if (XFS_INOBT_MASK(j) & irec->ir_free)
 206                                 continue;
 207
 208                         /* Otherwise call our function. */
 209                         ino = XFS_AGINO_TO_INO(mp, pag->pag_agno,
 210                                                 irec->ir_startino + j);
 211                         error = iwag->iwalk_fn(mp, tp, ino, iwag->data);
 212                         if (error)
 213                                 return error;
 214                 }
 215         }
 216
 217         return 0;
 218 }
 219
 220 /* Delete cursor and let go of AGI. */
 221 static inline void
 222 xfs_iwalk_del_inobt(
 223         struct xfs_trans        *tp,
 224         struct xfs_btree_cur    **curpp,
 225         struct xfs_buf          **agi_bpp,
 226         int                     error)
 227 {
 228         if (*curpp) {
 229                 xfs_btree_del_cursor(*curpp, error);
 230                 *curpp = NULL;
 231         }
 232         if (*agi_bpp) {
 233                 xfs_trans_brelse(tp, *agi_bpp);
 234                 *agi_bpp = NULL;
 235         }
 236 }
 237
 238 /*
 239  * Set ourselves up for walking inobt records starting from a given point in
 240  * the filesystem.
 241  *
 242  * If caller passed in a nonzero start inode number, load the record from the
 243  * inobt and make the record look like all the inodes before agino are free so
 244  * that we skip them, and then move the cursor to the next inobt record.  This
 245  * is how we support starting an iwalk in the middle of an inode chunk.
 246  *
 247  * If the caller passed in a start number of zero, move the cursor to the first
 248  * inobt record.
 249  *
 250  * The caller is responsible for cleaning up the cursor and buffer pointer
 251  * regardless of the error status.
 252  */
 253 STATIC int
 254 xfs_iwalk_ag_start(
 255         struct xfs_iwalk_ag     *iwag,
 256         xfs_agino_t             agino,
 257         struct xfs_btree_cur    **curpp,
 258         struct xfs_buf          **agi_bpp,
 259         int                     *has_more)
 260 {
 261         struct xfs_mount        *mp = iwag->mp;
 262         struct xfs_trans        *tp = iwag->tp;
 263         struct xfs_perag        *pag = iwag->pag;
 264         struct xfs_inobt_rec_incore *irec;
 265         int                     error;
 266
 267         /* Set up a fresh cursor and empty the inobt cache. */
 268         iwag->nr_recs = 0;
 269         error = xfs_inobt_cur(pag, tp, XFS_BTNUM_INO, curpp, agi_bpp);
 270         if (error)
 271                 return error;
 272
 273         /* Starting at the beginning of the AG?  That's easy! */
 274         if (agino == 0)
 275                 return xfs_inobt_lookup(*curpp, 0, XFS_LOOKUP_GE, has_more);
 276
 277         /*
 278          * Otherwise, we have to grab the inobt record where we left off, stuff
 279          * the record into our cache, and then see if there are more records.
 280          * We require a lookup cache of at least two elements so that the
 281          * caller doesn't have to deal with tearing down the cursor to walk the
 282          * records.
 283          */
 284         error = xfs_inobt_lookup(*curpp, agino, XFS_LOOKUP_LE, has_more);
 285         if (error)
 286                 return error;
 287
 288         /*
 289          * If the LE lookup at @agino yields no records, jump ahead to the
 290          * inobt cursor increment to see if there are more records to process.
 291          */
 292         if (!*has_more)
 293                 goto out_advance;
 294
 295         /* Get the record, should always work */
 296         irec = &iwag->recs[iwag->nr_recs];
 297         error = xfs_inobt_get_rec(*curpp, irec, has_more);
 298         if (error)
 299                 return error;
 300         if (XFS_IS_CORRUPT(mp, *has_more != 1))
 301                 return -EFSCORRUPTED;
 302
 303         iwag->lastino = XFS_AGINO_TO_INO(mp, pag->pag_agno,
 304                                 irec->ir_startino + XFS_INODES_PER_CHUNK - 1);
 305
 306         /*
 307          * If the LE lookup yielded an inobt record before the cursor position,
 308          * skip it and see if there's another one after it.
 309          */
 310         if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino)
 311                 goto out_advance;
 312
 313         /*
 314          * If agino fell in the middle of the inode record, make it look like
 315          * the inodes up to agino are free so that we don't return them again.
 316          */
 317         if (iwag->trim_start)
 318                 xfs_iwalk_adjust_start(agino, irec);
 319
 320         /*
 321          * The prefetch calculation is supposed to give us a large enough inobt
 322          * record cache that grab_ichunk can stage a partial first record and
 323          * the loop body can cache a record without having to check for cache
 324          * space until after it reads an inobt record.
 325          */
 326         iwag->nr_recs++;
 327         ASSERT(iwag->nr_recs < iwag->sz_recs);
 328
 329 out_advance:
 330         return xfs_btree_increment(*curpp, 0, has_more);
 331 }
 332
 333 /*
 334  * The inobt record cache is full, so preserve the inobt cursor state and
 335  * run callbacks on the cached inobt records.  When we're done, restore the
 336  * cursor state to wherever the cursor would have been had the cache not been
 337  * full (and therefore we could've just incremented the cursor) if *@has_more
 338  * is true.  On exit, *@has_more will indicate whether or not the caller should
 339  * try for more inode records.
 340  */
 341 STATIC int
 342 xfs_iwalk_run_callbacks(
 343         struct xfs_iwalk_ag             *iwag,
 344         struct xfs_btree_cur            **curpp,
 345         struct xfs_buf                  **agi_bpp,
 346         int                             *has_more)
 347 {
 348         struct xfs_mount                *mp = iwag->mp;
 349         struct xfs_inobt_rec_incore     *irec;
 350         xfs_agino_t                     next_agino;
 351         int                             error;
 352
 353         next_agino = XFS_INO_TO_AGINO(mp, iwag->lastino) + 1;
 354
 355         ASSERT(iwag->nr_recs > 0);
 356
 357         /* Delete cursor but remember the last record we cached... */
 358         xfs_iwalk_del_inobt(iwag->tp, curpp, agi_bpp, 0);
 359         irec = &iwag->recs[iwag->nr_recs - 1];
 360         ASSERT(next_agino >= irec->ir_startino + XFS_INODES_PER_CHUNK);
 361
 362         if (iwag->drop_trans) {
 363                 xfs_trans_cancel(iwag->tp);
 364                 iwag->tp = NULL;
 365         }
 366
 367         error = xfs_iwalk_ag_recs(iwag);
 368         if (error)
 369                 return error;
 370
 371         /* ...empty the cache... */
 372         iwag->nr_recs = 0;
 373
 374         if (!has_more)
 375                 return 0;
 376
 377         if (iwag->drop_trans) {
 378                 error = xfs_trans_alloc_empty(mp, &iwag->tp);
 379                 if (error)
 380                         return error;
 381         }
 382
 383         /* ...and recreate the cursor just past where we left off. */
 384         error = xfs_inobt_cur(iwag->pag, iwag->tp, XFS_BTNUM_INO, curpp,
 385                         agi_bpp);
 386         if (error)
 387                 return error;
 388
 389         return xfs_inobt_lookup(*curpp, next_agino, XFS_LOOKUP_GE, has_more);
 390 }
 391
/*
 * Walk all inodes in a single AG, from @iwag->startino to the end of the AG.
 *
 * Inobt records are read into @iwag->recs one at a time.  Whenever the cache
 * fills up, xfs_iwalk_run_callbacks() is called to process the cached
 * records; any records still cached when the loop ends cleanly are processed
 * by one last call.  The cursor and AGI buffer are released before returning.
 *
 * Returns 0 on success or if the parallel work was asked to abort,
 * -EFSCORRUPTED if the inobt records do not move forward, or whatever nonzero
 * value the btree code or the walk functions returned.
 */
STATIC int
xfs_iwalk_ag(
        struct xfs_iwalk_ag             *iwag)
{
        struct xfs_mount                *mp = iwag->mp;
        struct xfs_perag                *pag = iwag->pag;
        struct xfs_buf                  *agi_bp = NULL;
        struct xfs_btree_cur            *cur = NULL;
        xfs_agino_t                     agino;
        int                             has_more;
        int                             error = 0;

        /* Set up our cursor at the right place in the inode btree. */
        ASSERT(pag->pag_agno == XFS_INO_TO_AGNO(mp, iwag->startino));
        agino = XFS_INO_TO_AGINO(mp, iwag->startino);
        error = xfs_iwalk_ag_start(iwag, agino, &cur, &agi_bp, &has_more);

        /*
         * has_more is only examined when error is zero, i.e. after
         * xfs_iwalk_ag_start() succeeded.
         */
        while (!error && has_more) {
                struct xfs_inobt_rec_incore     *irec;
                xfs_ino_t                       rec_fsino;

                cond_resched();
                /* On abort, records still in the cache are not walked. */
                if (xfs_pwork_want_abort(&iwag->pwork))
                        goto out;

                /* Fetch the inobt record. */
                irec = &iwag->recs[iwag->nr_recs];
                error = xfs_inobt_get_rec(cur, irec, &has_more);
                if (error || !has_more)
                        break;

                /* Make sure that we always move forward. */
                rec_fsino = XFS_AGINO_TO_INO(mp, pag->pag_agno, irec->ir_startino);
                if (iwag->lastino != NULLFSINO &&
                    XFS_IS_CORRUPT(mp, iwag->lastino >= rec_fsino)) {
                        error = -EFSCORRUPTED;
                        goto out;
                }
                /* Remember the last inode number this record covers. */
                iwag->lastino = rec_fsino + XFS_INODES_PER_CHUNK - 1;

                /*
                 * No allocated inodes in this chunk; skip it.  nr_recs is
                 * not bumped, so the slot is reused for the next record.
                 */
                if (iwag->skip_empty && irec->ir_freecount == irec->ir_count) {
                        error = xfs_btree_increment(cur, 0, &has_more);
                        if (error)
                                break;
                        continue;
                }

                /*
                 * Start readahead for this inode chunk in anticipation of
                 * walking the inodes.
                 */
                if (iwag->iwalk_fn)
                        xfs_iwalk_ichunk_ra(mp, pag, irec);

                /*
                 * If there's space in the buffer for more records, increment
                 * the btree cursor and grab more.
                 */
                if (++iwag->nr_recs < iwag->sz_recs) {
                        error = xfs_btree_increment(cur, 0, &has_more);
                        if (error || !has_more)
                                break;
                        continue;
                }

                /*
                 * Otherwise, we need to save cursor state and run the callback
                 * function on the cached records.  The run_callbacks function
                 * is supposed to return a cursor pointing to the record where
                 * we would be if we had been able to increment like above.
                 */
                ASSERT(has_more);
                error = xfs_iwalk_run_callbacks(iwag, &cur, &agi_bp, &has_more);
        }

        /* Nothing left in the cache, or we failed: just clean up. */
        if (iwag->nr_recs == 0 || error)
                goto out;

        /* Walk the unprocessed records in the cache. */
        error = xfs_iwalk_run_callbacks(iwag, &cur, &agi_bp, &has_more);

out:
        /* Safe even if the cursor and buffer were already released. */
        xfs_iwalk_del_inobt(iwag->tp, &cur, &agi_bp, error);
        return error;
}
 479
/*
 * We experimentally determined that the reduction in ioctl call overhead
 * diminishes when userspace asks for more than 2048 inodes, so we'll cap
 * prefetch at this point.  xfs_iwalk_prefetch() uses this value both as the
 * default when the caller asks for 0 inodes and as the upper bound.
 */
#define IWALK_MAX_INODE_PREFETCH        (2048U)
 486
 487 /*
 488  * Given the number of inodes to prefetch, set the number of inobt records that
 489  * we cache in memory, which controls the number of inodes we try to read
 490  * ahead.  Set the maximum if @inodes == 0.
 491  */
 492 static inline unsigned int
 493 xfs_iwalk_prefetch(
 494         unsigned int            inodes)
 495 {
 496         unsigned int            inobt_records;
 497
 498         /*
 499          * If the caller didn't tell us the number of inodes they wanted,
 500          * assume the maximum prefetch possible for best performance.
 501          * Otherwise, cap prefetch at that maximum so that we don't start an
 502          * absurd amount of prefetch.
 503          */
 504         if (inodes == 0)
 505                 inodes = IWALK_MAX_INODE_PREFETCH;
 506         inodes = min(inodes, IWALK_MAX_INODE_PREFETCH);
 507
 508         /* Round the inode count up to a full chunk. */
 509         inodes = round_up(inodes, XFS_INODES_PER_CHUNK);
 510
 511         /*
 512          * In order to convert the number of inodes to prefetch into an
 513          * estimate of the number of inobt records to cache, we require a
 514          * conversion factor that reflects our expectations of the average
 515          * loading factor of an inode chunk.  Based on data gathered, most
 516          * (but not all) filesystems manage to keep the inode chunks totally
 517          * full, so we'll underestimate slightly so that our readahead will
 518          * still deliver the performance we want on aging filesystems:
 519          *
 520          * inobt = inodes / (INODES_PER_CHUNK * (4 / 5));
 521          *
 522          * The funny math is to avoid integer division.
 523          */
 524         inobt_records = (inodes * 5) / (4 * XFS_INODES_PER_CHUNK);
 525
 526         /*
 527          * Allocate enough space to prefetch at least two inobt records so that
 528          * we can cache both the record where the iwalk started and the next
 529          * record.  This simplifies the AG inode walk loop setup code.
 530          */
 531         return max(inobt_records, 2U);
 532 }
 533
 534 /*
 535  * Walk all inodes in the filesystem starting from @startino.  The @iwalk_fn
 536  * will be called for each allocated inode, being passed the inode's number and
 537  * @data.  @max_prefetch controls how many inobt records' worth of inodes we
 538  * try to readahead.
 539  */
 540 int
 541 xfs_iwalk(
 542         struct xfs_mount        *mp,
 543         struct xfs_trans        *tp,
 544         xfs_ino_t               startino,
 545         unsigned int            flags,
 546         xfs_iwalk_fn            iwalk_fn,
 547         unsigned int            inode_records,
 548         void                    *data)
 549 {
 550         struct xfs_iwalk_ag     iwag = {
 551                 .mp             = mp,
 552                 .tp             = tp,
 553                 .iwalk_fn       = iwalk_fn,
 554                 .data           = data,
 555                 .startino       = startino,
 556                 .sz_recs        = xfs_iwalk_prefetch(inode_records),
 557                 .trim_start     = 1,
 558                 .skip_empty     = 1,
 559                 .pwork          = XFS_PWORK_SINGLE_THREADED,
 560                 .lastino        = NULLFSINO,
 561         };
 562         struct xfs_perag        *pag;
 563         xfs_agnumber_t          agno = XFS_INO_TO_AGNO(mp, startino);
 564         int                     error;
 565
 566         ASSERT(agno < mp->m_sb.sb_agcount);
 567         ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL));
 568
 569         error = xfs_iwalk_alloc(&iwag);
 570         if (error)
 571                 return error;
 572
 573         for_each_perag_from(mp, agno, pag) {
 574                 iwag.pag = pag;
 575                 error = xfs_iwalk_ag(&iwag);
 576                 if (error)
 577                         break;
 578                 iwag.startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
 579                 if (flags & XFS_INOBT_WALK_SAME_AG)
 580                         break;
 581                 iwag.pag = NULL;
 582         }
 583
 584         if (iwag.pag)
 585                 xfs_perag_rele(pag);
 586         xfs_iwalk_free(&iwag);
 587         return error;
 588 }
 589
 590 /* Run per-thread iwalk work. */
 591 static int
 592 xfs_iwalk_ag_work(
 593         struct xfs_mount        *mp,
 594         struct xfs_pwork        *pwork)
 595 {
 596         struct xfs_iwalk_ag     *iwag;
 597         int                     error = 0;
 598
 599         iwag = container_of(pwork, struct xfs_iwalk_ag, pwork);
 600         if (xfs_pwork_want_abort(pwork))
 601                 goto out;
 602
 603         error = xfs_iwalk_alloc(iwag);
 604         if (error)
 605                 goto out;
 606         /*
 607          * Grab an empty transaction so that we can use its recursive buffer
 608          * locking abilities to detect cycles in the inobt without deadlocking.
 609          */
 610         error = xfs_trans_alloc_empty(mp, &iwag->tp);
 611         if (error)
 612                 goto out;
 613         iwag->drop_trans = 1;
 614
 615         error = xfs_iwalk_ag(iwag);
 616         if (iwag->tp)
 617                 xfs_trans_cancel(iwag->tp);
 618         xfs_iwalk_free(iwag);
 619 out:
 620         xfs_perag_put(iwag->pag);
 621         kfree(iwag);
 622         return error;
 623 }
 624
 625 /*
 626  * Walk all the inodes in the filesystem using multiple threads to process each
 627  * AG.
 628  */
 629 int
 630 xfs_iwalk_threaded(
 631         struct xfs_mount        *mp,
 632         xfs_ino_t               startino,
 633         unsigned int            flags,
 634         xfs_iwalk_fn            iwalk_fn,
 635         unsigned int            inode_records,
 636         bool                    polled,
 637         void                    *data)
 638 {
 639         struct xfs_pwork_ctl    pctl;
 640         struct xfs_perag        *pag;
 641         xfs_agnumber_t          agno = XFS_INO_TO_AGNO(mp, startino);
 642         int                     error;
 643
 644         ASSERT(agno < mp->m_sb.sb_agcount);
 645         ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL));
 646
 647         error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk");
 648         if (error)
 649                 return error;
 650
 651         for_each_perag_from(mp, agno, pag) {
 652                 struct xfs_iwalk_ag     *iwag;
 653
 654                 if (xfs_pwork_ctl_want_abort(&pctl))
 655                         break;
 656
 657                 iwag = kzalloc(sizeof(struct xfs_iwalk_ag),
 658                                 GFP_KERNEL | __GFP_NOFAIL);
 659                 iwag->mp = mp;
 660
 661                 /*
 662                  * perag is being handed off to async work, so take a passive
 663                  * reference for the async work to release.
 664                  */
 665                 iwag->pag = xfs_perag_hold(pag);
 666                 iwag->iwalk_fn = iwalk_fn;
 667                 iwag->data = data;
 668                 iwag->startino = startino;
 669                 iwag->sz_recs = xfs_iwalk_prefetch(inode_records);
 670                 iwag->lastino = NULLFSINO;
 671                 xfs_pwork_queue(&pctl, &iwag->pwork);
 672                 startino = XFS_AGINO_TO_INO(mp, pag->pag_agno + 1, 0);
 673                 if (flags & XFS_INOBT_WALK_SAME_AG)
 674                         break;
 675         }
 676         if (pag)
 677                 xfs_perag_rele(pag);
 678         if (polled)
 679                 xfs_pwork_poll(&pctl);
 680         return xfs_pwork_destroy(&pctl);
 681 }
 682
/*
 * Allow callers to cache up to a page's worth of inobt records.  This reflects
 * the existing inumbers prefetching behavior.  Since the inobt walk does not
 * itself do anything with the inobt records, we can set a fairly high limit
 * here.  xfs_inobt_walk_prefetch() uses this value both as the default when
 * the caller asks for 0 records and as the upper bound.
 */
#define MAX_INOBT_WALK_PREFETCH \
        (PAGE_SIZE / sizeof(struct xfs_inobt_rec_incore))
 691
 692 /*
 693  * Given the number of records that the user wanted, set the number of inobt
 694  * records that we buffer in memory.  Set the maximum if @inobt_records == 0.
 695  */
 696 static inline unsigned int
 697 xfs_inobt_walk_prefetch(
 698         unsigned int            inobt_records)
 699 {
 700         /*
 701          * If the caller didn't tell us the number of inobt records they
 702          * wanted, assume the maximum prefetch possible for best performance.
 703          */
 704         if (inobt_records == 0)
 705                 inobt_records = MAX_INOBT_WALK_PREFETCH;
 706
 707         /*
 708          * Allocate enough space to prefetch at least two inobt records so that
 709          * we can cache both the record where the iwalk started and the next
 710          * record.  This simplifies the AG inode walk loop setup code.
 711          */
 712         inobt_records = max(inobt_records, 2U);
 713
 714         /*
 715          * Cap prefetch at that maximum so that we don't use an absurd amount
 716          * of memory.
 717          */
 718         return min_t(unsigned int, inobt_records, MAX_INOBT_WALK_PREFETCH);
 719 }
 720
 721 /*
 722  * Walk all inode btree records in the filesystem starting from @startino.  The
 723  * @inobt_walk_fn will be called for each btree record, being passed the incore
 724  * record and @data.  @max_prefetch controls how many inobt records we try to
 725  * cache ahead of time.
 726  */
 727 int
 728 xfs_inobt_walk(
 729         struct xfs_mount        *mp,
 730         struct xfs_trans        *tp,
 731         xfs_ino_t               startino,
 732         unsigned int            flags,
 733         xfs_inobt_walk_fn       inobt_walk_fn,
 734         unsigned int            inobt_records,
 735         void                    *data)
 736 {
 737         struct xfs_iwalk_ag     iwag = {
 738                 .mp             = mp,
 739                 .tp             = tp,
 740                 .inobt_walk_fn  = inobt_walk_fn,
 741                 .data           = data,
 742                 .startino       = startino,
 743                 .sz_recs        = xfs_inobt_walk_prefetch(inobt_records),
 744                 .pwork          = XFS_PWORK_SINGLE_THREADED,
 745                 .lastino        = NULLFSINO,
 746         };
 747         struct xfs_perag        *pag;
 748         xfs_agnumber_t          agno = XFS_INO_TO_AGNO(mp, startino);
 749         int                     error;
 750
 751         ASSERT(agno < mp->m_sb.sb_agcount);
 752         ASSERT(!(flags & ~XFS_INOBT_WALK_FLAGS_ALL));
 753
 754         error = xfs_iwalk_alloc(&iwag);
 755         if (error)
 756                 return error;
 757
 758         for_each_perag_from(mp, agno, pag) {
 759                 iwag.pag = pag;
 760                 error = xfs_iwalk_ag(&iwag);
 761                 if (error)
 762                         break;
 763                 iwag.startino = XFS_AGINO_TO_INO(mp, pag->pag_agno + 1, 0);
 764                 if (flags & XFS_INOBT_WALK_SAME_AG)
 765                         break;
 766                 iwag.pag = NULL;
 767         }
 768
 769         if (iwag.pag)
 770                 xfs_perag_rele(pag);
 771         xfs_iwalk_free(&iwag);
 772         return error;
 773 }