fs/xfs/xfs_log_recover.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   4  * All Rights Reserved.
   5  */
   6 #include "xfs.h"
   7 #include "xfs_fs.h"
   8 #include "xfs_shared.h"
   9 #include "xfs_format.h"
  10 #include "xfs_log_format.h"
  11 #include "xfs_trans_resv.h"
  12 #include "xfs_bit.h"
  13 #include "xfs_sb.h"
  14 #include "xfs_mount.h"
  15 #include "xfs_defer.h"
  16 #include "xfs_inode.h"
  17 #include "xfs_trans.h"
  18 #include "xfs_log.h"
  19 #include "xfs_log_priv.h"
  20 #include "xfs_log_recover.h"
  21 #include "xfs_trans_priv.h"
  22 #include "xfs_alloc.h"
  23 #include "xfs_ialloc.h"
  24 #include "xfs_trace.h"
  25 #include "xfs_icache.h"
  26 #include "xfs_error.h"
  27 #include "xfs_buf_item.h"
  28
  29 #define BLK_AVG(blk1, blk2)     ((blk1+blk2) >> 1)
  30
  31 STATIC int
  32 xlog_find_zeroed(
  33         struct xlog     *,
  34         xfs_daddr_t     *);
  35 STATIC int
  36 xlog_clear_stale_blocks(
  37         struct xlog     *,
  38         xfs_lsn_t);
  39 #if defined(DEBUG)
  40 STATIC void
  41 xlog_recover_check_summary(
  42         struct xlog *);
  43 #else
  44 #define xlog_recover_check_summary(log)
  45 #endif
  46 STATIC int
  47 xlog_do_recovery_pass(
  48         struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *);
  49
  50 /*
  51  * Sector aligned buffer routines for buffer create/read/write/access
  52  */
  53
  54 /*
  55  * Verify the log-relative block number and length in basic blocks are valid for
  56  * an operation involving the given XFS log buffer. Returns true if the fields
  57  * are valid, false otherwise.
  58  */
  59 static inline bool
  60 xlog_verify_bno(
  61         struct xlog     *log,
  62         xfs_daddr_t     blk_no,
  63         int             bbcount)
  64 {
  65         if (blk_no < 0 || blk_no >= log->l_logBBsize)
  66                 return false;
  67         if (bbcount <= 0 || (blk_no + bbcount) > log->l_logBBsize)
  68                 return false;
  69         return true;
  70 }
  71
  72 /*
  73  * Allocate a buffer to hold log data.  The buffer needs to be able to map to
  74  * a range of nbblks basic blocks at any valid offset within the log.
  75  */
  76 static char *
  77 xlog_alloc_buffer(
  78         struct xlog     *log,
  79         int             nbblks)
  80 {
  81         int align_mask = xfs_buftarg_dma_alignment(log->l_targ);
  82
  83         /*
  84          * Pass log block 0 since we don't have an addr yet, buffer will be
  85          * verified on read.
  86          */
  87         if (XFS_IS_CORRUPT(log->l_mp, !xlog_verify_bno(log, 0, nbblks))) {
  88                 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
  89                         nbblks);
  90                 return NULL;
  91         }
  92
  93         /*
  94          * We do log I/O in units of log sectors (a power-of-2 multiple of the
  95          * basic block size), so we round up the requested size to accommodate
  96          * the basic blocks required for complete log sectors.
  97          *
  98          * In addition, the buffer may be used for a non-sector-aligned block
  99          * offset, in which case an I/O of the requested size could extend
 100          * beyond the end of the buffer.  If the requested size is only 1 basic
 101          * block it will never straddle a sector boundary, so this won't be an
 102          * issue.  Nor will this be a problem if the log I/O is done in basic
 103          * blocks (sector size 1).  But otherwise we extend the buffer by one
 104          * extra log sector to ensure there's space to accommodate this
 105          * possibility.
 106          */
 107         if (nbblks > 1 && log->l_sectBBsize > 1)
 108                 nbblks += log->l_sectBBsize;
 109         nbblks = round_up(nbblks, log->l_sectBBsize);
 110         return kmem_alloc_io(BBTOB(nbblks), align_mask, KM_MAYFAIL | KM_ZERO);
 111 }
 112
 113 /*
 114  * Return the address of the start of the given block number's data
 115  * in a log buffer.  The buffer covers a log sector-aligned region.
 116  */
 117 static inline unsigned int
 118 xlog_align(
 119         struct xlog     *log,
 120         xfs_daddr_t     blk_no)
 121 {
 122         return BBTOB(blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1));
 123 }
 124
 125 static int
 126 xlog_do_io(
 127         struct xlog             *log,
 128         xfs_daddr_t             blk_no,
 129         unsigned int            nbblks,
 130         char                    *data,
 131         unsigned int            op)
 132 {
 133         int                     error;
 134
 135         if (XFS_IS_CORRUPT(log->l_mp, !xlog_verify_bno(log, blk_no, nbblks))) {
 136                 xfs_warn(log->l_mp,
 137                          "Invalid log block/length (0x%llx, 0x%x) for buffer",
 138                          blk_no, nbblks);
 139                 return -EFSCORRUPTED;
 140         }
 141
 142         blk_no = round_down(blk_no, log->l_sectBBsize);
 143         nbblks = round_up(nbblks, log->l_sectBBsize);
 144         ASSERT(nbblks > 0);
 145
 146         error = xfs_rw_bdev(log->l_targ->bt_bdev, log->l_logBBstart + blk_no,
 147                         BBTOB(nbblks), data, op);
 148         if (error && !XFS_FORCED_SHUTDOWN(log->l_mp)) {
 149                 xfs_alert(log->l_mp,
 150                           "log recovery %s I/O error at daddr 0x%llx len %d error %d",
 151                           op == REQ_OP_WRITE ? "write" : "read",
 152                           blk_no, nbblks, error);
 153         }
 154         return error;
 155 }
 156
 157 STATIC int
 158 xlog_bread_noalign(
 159         struct xlog     *log,
 160         xfs_daddr_t     blk_no,
 161         int             nbblks,
 162         char            *data)
 163 {
 164         return xlog_do_io(log, blk_no, nbblks, data, REQ_OP_READ);
 165 }
 166
 167 STATIC int
 168 xlog_bread(
 169         struct xlog     *log,
 170         xfs_daddr_t     blk_no,
 171         int             nbblks,
 172         char            *data,
 173         char            **offset)
 174 {
 175         int             error;
 176
 177         error = xlog_do_io(log, blk_no, nbblks, data, REQ_OP_READ);
 178         if (!error)
 179                 *offset = data + xlog_align(log, blk_no);
 180         return error;
 181 }
 182
 183 STATIC int
 184 xlog_bwrite(
 185         struct xlog     *log,
 186         xfs_daddr_t     blk_no,
 187         int             nbblks,
 188         char            *data)
 189 {
 190         return xlog_do_io(log, blk_no, nbblks, data, REQ_OP_WRITE);
 191 }
 192
 193 #ifdef DEBUG
 194 /*
 195  * dump debug superblock and log record information
 196  */
 197 STATIC void
 198 xlog_header_check_dump(
 199         xfs_mount_t             *mp,
 200         xlog_rec_header_t       *head)
 201 {
 202         xfs_debug(mp, "%s:  SB : uuid = %pU, fmt = %d",
 203                 __func__, &mp->m_sb.sb_uuid, XLOG_FMT);
 204         xfs_debug(mp, "    log : uuid = %pU, fmt = %d",
 205                 &head->h_fs_uuid, be32_to_cpu(head->h_fmt));
 206 }
 207 #else
 208 #define xlog_header_check_dump(mp, head)
 209 #endif
 210
 211 /*
 212  * check log record header for recovery
 213  */
 214 STATIC int
 215 xlog_header_check_recover(
 216         xfs_mount_t             *mp,
 217         xlog_rec_header_t       *head)
 218 {
 219         ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
 220
 221         /*
 222          * IRIX doesn't write the h_fmt field and leaves it zeroed
 223          * (XLOG_FMT_UNKNOWN). This stops us from trying to recover
 224          * a dirty log created in IRIX.
 225          */
 226         if (XFS_IS_CORRUPT(mp, head->h_fmt != cpu_to_be32(XLOG_FMT))) {
 227                 xfs_warn(mp,
 228         "dirty log written in incompatible format - can't recover");
 229                 xlog_header_check_dump(mp, head);
 230                 return -EFSCORRUPTED;
 231         }
 232         if (XFS_IS_CORRUPT(mp, !uuid_equal(&mp->m_sb.sb_uuid,
 233                                            &head->h_fs_uuid))) {
 234                 xfs_warn(mp,
 235         "dirty log entry has mismatched uuid - can't recover");
 236                 xlog_header_check_dump(mp, head);
 237                 return -EFSCORRUPTED;
 238         }
 239         return 0;
 240 }
 241
 242 /*
 243  * read the head block of the log and check the header
 244  */
 245 STATIC int
 246 xlog_header_check_mount(
 247         xfs_mount_t             *mp,
 248         xlog_rec_header_t       *head)
 249 {
 250         ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
 251
 252         if (uuid_is_null(&head->h_fs_uuid)) {
 253                 /*
 254                  * IRIX doesn't write the h_fs_uuid or h_fmt fields. If
 255                  * h_fs_uuid is null, we assume this log was last mounted
 256                  * by IRIX and continue.
 257                  */
 258                 xfs_warn(mp, "null uuid in log - IRIX style log");
 259         } else if (XFS_IS_CORRUPT(mp, !uuid_equal(&mp->m_sb.sb_uuid,
 260                                                   &head->h_fs_uuid))) {
 261                 xfs_warn(mp, "log has mismatched uuid - can't recover");
 262                 xlog_header_check_dump(mp, head);
 263                 return -EFSCORRUPTED;
 264         }
 265         return 0;
 266 }
 267
 268 /*
 269  * This routine finds (to an approximation) the first block in the physical
 270  * log which contains the given cycle.  It uses a binary search algorithm.
 271  * Note that the algorithm can not be perfect because the disk will not
 272  * necessarily be perfect.
 273  */
 274 STATIC int
 275 xlog_find_cycle_start(
 276         struct xlog     *log,
 277         char            *buffer,
 278         xfs_daddr_t     first_blk,
 279         xfs_daddr_t     *last_blk,
 280         uint            cycle)
 281 {
 282         char            *offset;
 283         xfs_daddr_t     mid_blk;
 284         xfs_daddr_t     end_blk;
 285         uint            mid_cycle;
 286         int             error;
 287
 288         end_blk = *last_blk;
 289         mid_blk = BLK_AVG(first_blk, end_blk);
 290         while (mid_blk != first_blk && mid_blk != end_blk) {
 291                 error = xlog_bread(log, mid_blk, 1, buffer, &offset);
 292                 if (error)
 293                         return error;
 294                 mid_cycle = xlog_get_cycle(offset);
 295                 if (mid_cycle == cycle)
 296                         end_blk = mid_blk;   /* last_half_cycle == mid_cycle */
 297                 else
 298                         first_blk = mid_blk; /* first_half_cycle == mid_cycle */
 299                 mid_blk = BLK_AVG(first_blk, end_blk);
 300         }
 301         ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
 302                (mid_blk == end_blk && mid_blk-1 == first_blk));
 303
 304         *last_blk = end_blk;
 305
 306         return 0;
 307 }
 308
 309 /*
 310  * Check that a range of blocks does not contain stop_on_cycle_no.
 311  * Fill in *new_blk with the block offset where such a block is
 312  * found, or with -1 (an invalid block number) if there is no such
 313  * block in the range.  The scan needs to occur from front to back
 314  * and the pointer into the region must be updated since a later
 315  * routine will need to perform another test.
 316  */
 317 STATIC int
 318 xlog_find_verify_cycle(
 319         struct xlog     *log,
 320         xfs_daddr_t     start_blk,
 321         int             nbblks,
 322         uint            stop_on_cycle_no,
 323         xfs_daddr_t     *new_blk)
 324 {
 325         xfs_daddr_t     i, j;
 326         uint            cycle;
 327         char            *buffer;
 328         xfs_daddr_t     bufblks;
 329         char            *buf = NULL;
 330         int             error = 0;
 331
 332         /*
 333          * Greedily allocate a buffer big enough to handle the full
 334          * range of basic blocks we'll be examining.  If that fails,
 335          * try a smaller size.  We need to be able to read at least
 336          * a log sector, or we're out of luck.
 337          */
 338         bufblks = 1 << ffs(nbblks);
 339         while (bufblks > log->l_logBBsize)
 340                 bufblks >>= 1;
 341         while (!(buffer = xlog_alloc_buffer(log, bufblks))) {
 342                 bufblks >>= 1;
 343                 if (bufblks < log->l_sectBBsize)
 344                         return -ENOMEM;
 345         }
 346
 347         for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
 348                 int     bcount;
 349
 350                 bcount = min(bufblks, (start_blk + nbblks - i));
 351
 352                 error = xlog_bread(log, i, bcount, buffer, &buf);
 353                 if (error)
 354                         goto out;
 355
 356                 for (j = 0; j < bcount; j++) {
 357                         cycle = xlog_get_cycle(buf);
 358                         if (cycle == stop_on_cycle_no) {
 359                                 *new_blk = i+j;
 360                                 goto out;
 361                         }
 362
 363                         buf += BBSIZE;
 364                 }
 365         }
 366
 367         *new_blk = -1;
 368
 369 out:
 370         kmem_free(buffer);
 371         return error;
 372 }
 373
 374 static inline int
 375 xlog_logrec_hblks(struct xlog *log, struct xlog_rec_header *rh)
 376 {
 377         if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
 378                 int     h_size = be32_to_cpu(rh->h_size);
 379
 380                 if ((be32_to_cpu(rh->h_version) & XLOG_VERSION_2) &&
 381                     h_size > XLOG_HEADER_CYCLE_SIZE)
 382                         return DIV_ROUND_UP(h_size, XLOG_HEADER_CYCLE_SIZE);
 383         }
 384         return 1;
 385 }
 386
 387 /*
 388  * Potentially backup over partial log record write.
 389  *
 390  * In the typical case, last_blk is the number of the block directly after
 391  * a good log record.  Therefore, we subtract one to get the block number
 392  * of the last block in the given buffer.  extra_bblks contains the number
 393  * of blocks we would have read on a previous read.  This happens when the
 394  * last log record is split over the end of the physical log.
 395  *
 396  * extra_bblks is the number of blocks potentially verified on a previous
 397  * call to this routine.
 398  */
 399 STATIC int
 400 xlog_find_verify_log_record(
 401         struct xlog             *log,
 402         xfs_daddr_t             start_blk,
 403         xfs_daddr_t             *last_blk,
 404         int                     extra_bblks)
 405 {
 406         xfs_daddr_t             i;
 407         char                    *buffer;
 408         char                    *offset = NULL;
 409         xlog_rec_header_t       *head = NULL;
 410         int                     error = 0;
 411         int                     smallmem = 0;
 412         int                     num_blks = *last_blk - start_blk;
 413         int                     xhdrs;
 414
 415         ASSERT(start_blk != 0 || *last_blk != start_blk);
 416
 417         buffer = xlog_alloc_buffer(log, num_blks);
 418         if (!buffer) {
 419                 buffer = xlog_alloc_buffer(log, 1);
 420                 if (!buffer)
 421                         return -ENOMEM;
 422                 smallmem = 1;
 423         } else {
 424                 error = xlog_bread(log, start_blk, num_blks, buffer, &offset);
 425                 if (error)
 426                         goto out;
 427                 offset += ((num_blks - 1) << BBSHIFT);
 428         }
 429
 430         for (i = (*last_blk) - 1; i >= 0; i--) {
 431                 if (i < start_blk) {
 432                         /* valid log record not found */
 433                         xfs_warn(log->l_mp,
 434                 "Log inconsistent (didn't find previous header)");
 435                         ASSERT(0);
 436                         error = -EFSCORRUPTED;
 437                         goto out;
 438                 }
 439
 440                 if (smallmem) {
 441                         error = xlog_bread(log, i, 1, buffer, &offset);
 442                         if (error)
 443                                 goto out;
 444                 }
 445
 446                 head = (xlog_rec_header_t *)offset;
 447
 448                 if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
 449                         break;
 450
 451                 if (!smallmem)
 452                         offset -= BBSIZE;
 453         }
 454
 455         /*
 456          * We hit the beginning of the physical log & still no header.  Return
 457          * to caller.  If caller can handle a return of -1, then this routine
 458          * will be called again for the end of the physical log.
 459          */
 460         if (i == -1) {
 461                 error = 1;
 462                 goto out;
 463         }
 464
 465         /*
 466          * We have the final block of the good log (the first block
 467          * of the log record _before_ the head. So we check the uuid.
 468          */
 469         if ((error = xlog_header_check_mount(log->l_mp, head)))
 470                 goto out;
 471
 472         /*
 473          * We may have found a log record header before we expected one.
 474          * last_blk will be the 1st block # with a given cycle #.  We may end
 475          * up reading an entire log record.  In this case, we don't want to
 476          * reset last_blk.  Only when last_blk points in the middle of a log
 477          * record do we update last_blk.
 478          */
 479         xhdrs = xlog_logrec_hblks(log, head);
 480
 481         if (*last_blk - i + extra_bblks !=
 482             BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
 483                 *last_blk = i;
 484
 485 out:
 486         kmem_free(buffer);
 487         return error;
 488 }
 489
/*
 * Find the head of the log.
 *
 * Head is defined to be the point of the log where the next log write
 * could go.  This means that incomplete LR writes at the end are
 * eliminated when calculating the head.  We aren't guaranteed that previous
 * LR have complete transactions.  We only know that a cycle number of
 * current cycle number -1 won't be present in the log if we start writing
 * from our current block number.
 *
 * The search runs in three stages:
 *   1. estimate the head from the cycle numbers of the first and last
 *      blocks of the log (with a binary search when they differ);
 *   2. scan the blocks before the estimate for cycle numbers that show the
 *      estimate is too far forward, and move it back if any are found;
 *   3. make sure the result does not point into the middle of a log record.
 *
 * last_blk contains the block number of the first block with a given
 * cycle number.
 *
 * On success *return_head_blk holds the head block number.
 *
 * Return: zero if normal, non-zero if error.
 */
STATIC int
xlog_find_head(
        struct xlog     *log,
        xfs_daddr_t     *return_head_blk)
{
        char            *buffer;
        char            *offset;
        xfs_daddr_t     new_blk, first_blk, start_blk, last_blk, head_blk;
        int             num_scan_bblks;
        uint            first_half_cycle, last_half_cycle;
        uint            stop_on_cycle;
        int             error, log_bbnum = log->l_logBBsize;

        /*
         * Is the end of the log device zeroed?  A negative return is an
         * error; a return of 1 means first_blk was filled in and is used
         * directly as the head.
         */
        error = xlog_find_zeroed(log, &first_blk);
        if (error < 0) {
                xfs_warn(log->l_mp, "empty log check failed");
                return error;
        }
        if (error == 1) {
                *return_head_blk = first_blk;

                /* Is the whole log zeroed? */
                if (!first_blk) {
                        /* Linux XFS shouldn't generate totally zeroed logs -
                         * mkfs etc write a dummy unmount record to a fresh
                         * log so we can store the uuid in there
                         */
                        xfs_warn(log->l_mp, "totally zeroed log");
                }

                return 0;
        }

        first_blk = 0;                  /* get cycle # of 1st block */
        buffer = xlog_alloc_buffer(log, 1);
        if (!buffer)
                return -ENOMEM;

        error = xlog_bread(log, 0, 1, buffer, &offset);
        if (error)
                goto out_free_buffer;

        first_half_cycle = xlog_get_cycle(offset);

        last_blk = head_blk = log_bbnum - 1;    /* get cycle # of last block */
        error = xlog_bread(log, last_blk, 1, buffer, &offset);
        if (error)
                goto out_free_buffer;

        last_half_cycle = xlog_get_cycle(offset);
        ASSERT(last_half_cycle != 0);

        /*
         * If the 1st half cycle number is equal to the last half cycle number,
         * then the entire log is stamped with the same cycle number.  In this
         * case, head_blk can't be set to zero (which makes sense).  The below
         * math doesn't work out properly with head_blk equal to zero.  Instead,
         * we set it to log_bbnum which is an invalid block number, but this
         * value makes the math correct.  If head_blk doesn't change through
         * all the tests below, *return_head_blk is set to zero at the very end
         * rather than log_bbnum.  In a sense, log_bbnum and zero are the same
         * block in a circular file.
         */
        if (first_half_cycle == last_half_cycle) {
                /*
                 * In this case we believe that the entire log should have
                 * cycle number last_half_cycle.  We need to scan backwards
                 * from the end verifying that there are no holes still
                 * containing last_half_cycle - 1.  If we find such a hole,
                 * then the start of that hole will be the new head.  The
                 * simple case looks like
                 *        x | x ... | x - 1 | x
                 * Another case that fits this picture would be
                 *        x | x + 1 | x ... | x
                 * In this case the head really is somewhere at the end of the
                 * log, as one of the latest writes at the beginning was
                 * incomplete.
                 * One more case is
                 *        x | x + 1 | x ... | x - 1 | x
                 * This is really the combination of the above two cases, and
                 * the head has to end up at the start of the x-1 hole at the
                 * end of the log.
                 *
                 * In the 256k log case, we will read from the beginning to the
                 * end of the log and search for cycle numbers equal to x-1.
                 * We don't worry about the x+1 blocks that we encounter,
                 * because we know that they cannot be the head since the log
                 * started with x.
                 */
                head_blk = log_bbnum;
                stop_on_cycle = last_half_cycle - 1;
        } else {
                /*
                 * In this case we want to find the first block with cycle
                 * number matching last_half_cycle.  We expect the log to be
                 * some variation on
                 *        x + 1 ... | x ... | x
                 * The first block with cycle number x (last_half_cycle) will
                 * be where the new head belongs.  First we do a binary search
                 * for the first occurrence of last_half_cycle.  The binary
                 * search may not be totally accurate, so then we scan back
                 * from there looking for occurrences of last_half_cycle before
                 * us.  If that backwards scan wraps around the beginning of
                 * the log, then we look for occurrences of last_half_cycle - 1
                 * at the end of the log.  The cases we're looking for look
                 * like
                 *                               v binary search stopped here
                 *        x + 1 ... | x | x + 1 | x ... | x
                 *                   ^ but we want to locate this spot
                 * or
                 *        <---------> less than scan distance
                 *        x + 1 ... | x ... | x - 1 | x
                 *                           ^ we want to locate this spot
                 */
                stop_on_cycle = last_half_cycle;
                error = xlog_find_cycle_start(log, buffer, first_blk, &head_blk,
                                last_half_cycle);
                if (error)
                        goto out_free_buffer;
        }

        /*
         * Now validate the answer.  Scan back some number of maximum possible
         * blocks and make sure each one has the expected cycle number.  The
         * maximum is determined by the total possible amount of buffering
         * in the in-core log.  The following number can be made tighter if
         * we actually look at the block size of the filesystem.
         */
        num_scan_bblks = min_t(int, log_bbnum, XLOG_TOTAL_REC_SHIFT(log));
        if (head_blk >= num_scan_bblks) {
                /*
                 * The whole scan range lies before head_blk without wrapping
                 * around the start of the log, so a single
                 * xlog_find_verify_cycle() call covers it.
                 */
                start_blk = head_blk - num_scan_bblks;
                if ((error = xlog_find_verify_cycle(log,
                                                start_blk, num_scan_bblks,
                                                stop_on_cycle, &new_blk)))
                        goto out_free_buffer;
                /* -1 means no block in the range had stop_on_cycle. */
                if (new_blk != -1)
                        head_blk = new_blk;
        } else {                /* need to read 2 parts of log */
                /*
                 * We are going to scan backwards in the log in two parts.
                 * First we scan the physical end of the log.  In this part
                 * of the log, we are looking for blocks with cycle number
                 * last_half_cycle - 1.
                 * If we find one, then we know that the log starts there, as
                 * we've found a hole that didn't get written in going around
                 * the end of the physical log.  The simple case for this is
                 *        x + 1 ... | x ... | x - 1 | x
                 *        <---------> less than scan distance
                 * If all of the blocks at the end of the log have cycle number
                 * last_half_cycle, then we check the blocks at the start of
                 * the log looking for occurrences of last_half_cycle.  If we
                 * find one, then our current estimate for the location of the
                 * first occurrence of last_half_cycle is wrong and we move
                 * back to the hole we've found.  This case looks like
                 *        x + 1 ... | x | x + 1 | x ...
                 *                               ^ binary search stopped here
                 * Another case we need to handle that only occurs in 256k
                 * logs is
                 *        x + 1 ... | x ... | x+1 | x ...
                 *                   ^ binary search stops here
                 * In a 256k log, the scan at the end of the log will see the
                 * x + 1 blocks.  We need to skip past those since that is
                 * certainly not the head of the log.  By searching for
                 * last_half_cycle-1 we accomplish that.
                 */
                ASSERT(head_blk <= INT_MAX &&
                        (xfs_daddr_t) num_scan_bblks >= head_blk);
                /* The part of the scan range that wraps to the log's end. */
                start_blk = log_bbnum - (num_scan_bblks - head_blk);
                if ((error = xlog_find_verify_cycle(log, start_blk,
                                        num_scan_bblks - (int)head_blk,
                                        (stop_on_cycle - 1), &new_blk)))
                        goto out_free_buffer;
                if (new_blk != -1) {
                        /* Hole found at the end of the log; skip part two. */
                        head_blk = new_blk;
                        goto validate_head;
                }

                /*
                 * Scan beginning of log now.  The last part of the physical
                 * log is good.  This scan needs to verify that it doesn't find
                 * the last_half_cycle.
                 */
                start_blk = 0;
                ASSERT(head_blk <= INT_MAX);
                if ((error = xlog_find_verify_cycle(log,
                                        start_blk, (int)head_blk,
                                        stop_on_cycle, &new_blk)))
                        goto out_free_buffer;
                if (new_blk != -1)
                        head_blk = new_blk;
        }

validate_head:
        /*
         * Now we need to make sure head_blk is not pointing to a block in
         * the middle of a log record.
         */
        num_scan_bblks = XLOG_REC_SHIFT(log);
        if (head_blk >= num_scan_bblks) {
                start_blk = head_blk - num_scan_bblks; /* don't read head_blk */

                /*
                 * Start ptr at last block ptr before head_blk.  The search
                 * range does not reach block 0 here, so a return of 1 (ran
                 * past block 0 without finding a header) is reported as an
                 * I/O error.
                 */
                error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
                if (error == 1)
                        error = -EIO;
                if (error)
                        goto out_free_buffer;
        } else {
                start_blk = 0;
                ASSERT(head_blk <= INT_MAX);
                error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
                if (error < 0)
                        goto out_free_buffer;
                if (error == 1) {
                        /*
                         * We hit the beginning of the log during our search.
                         * Continue from the physical end of the log, passing
                         * head_blk as the count of blocks already covered.
                         */
                        start_blk = log_bbnum - (num_scan_bblks - head_blk);
                        new_blk = log_bbnum;
                        ASSERT(start_blk <= INT_MAX &&
                                (xfs_daddr_t) log_bbnum-start_blk >= 0);
                        ASSERT(head_blk <= INT_MAX);
                        error = xlog_find_verify_log_record(log, start_blk,
                                                        &new_blk, (int)head_blk);
                        if (error == 1)
                                error = -EIO;
                        if (error)
                                goto out_free_buffer;
                        /* Only move the head if the second pass changed it. */
                        if (new_blk != log_bbnum)
                                head_blk = new_blk;
                } else if (error)
                        /*
                         * NOTE(review): negative values and 1 were handled
                         * above, so this only triggers for some other
                         * positive return value.
                         */
                        goto out_free_buffer;
        }

        kmem_free(buffer);
        /* log_bbnum and block 0 are the same place in the circular log. */
        if (head_blk == log_bbnum)
                *return_head_blk = 0;
        else
                *return_head_blk = head_blk;
        /*
         * When returning here, we have a good block number.  Bad block
         * means that during a previous crash, we didn't have a clean break
         * from cycle number N to cycle number N-1.  In this case, we need
         * to find the first block with cycle number N-1.
         */
        return 0;

        /* Common error exit: release the one-block buffer and report. */
out_free_buffer:
        kmem_free(buffer);
        if (error)
                xfs_warn(log->l_mp, "failed to find log head");
        return error;
}
 759
 760 /*
 761  * Seek backwards in the log for log record headers.
 762  *
 763  * Given a starting log block, walk backwards until we find the provided number
 764  * of records or hit the provided tail block. The return value is the number of
 765  * records encountered or a negative error code. The log block and buffer
 766  * pointer of the last record seen are returned in rblk and rhead respectively.
 767  */
 768 STATIC int
 769 xlog_rseek_logrec_hdr(
 770         struct xlog             *log,
 771         xfs_daddr_t             head_blk,
 772         xfs_daddr_t             tail_blk,
 773         int                     count,
 774         char                    *buffer,
 775         xfs_daddr_t             *rblk,
 776         struct xlog_rec_header  **rhead,
 777         bool                    *wrapped)
 778 {
 779         int                     i;
 780         int                     error;
 781         int                     found = 0;
 782         char                    *offset = NULL;
 783         xfs_daddr_t             end_blk;
 784
 785         *wrapped = false;
 786
 787         /*
 788          * Walk backwards from the head block until we hit the tail or the first
 789          * block in the log.
 790          */
 791         end_blk = head_blk > tail_blk ? tail_blk : 0;
 792         for (i = (int) head_blk - 1; i >= end_blk; i--) {
 793                 error = xlog_bread(log, i, 1, buffer, &offset);
 794                 if (error)
 795                         goto out_error;
 796
 797                 if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
 798                         *rblk = i;
 799                         *rhead = (struct xlog_rec_header *) offset;
 800                         if (++found == count)
 801                                 break;
 802                 }
 803         }
 804
 805         /*
 806          * If we haven't hit the tail block or the log record header count,
 807          * start looking again from the end of the physical log. Note that
 808          * callers can pass head == tail if the tail is not yet known.
 809          */
 810         if (tail_blk >= head_blk && found != count) {
 811                 for (i = log->l_logBBsize - 1; i >= (int) tail_blk; i--) {
 812                         error = xlog_bread(log, i, 1, buffer, &offset);
 813                         if (error)
 814                                 goto out_error;
 815
 816                         if (*(__be32 *)offset ==
 817                             cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
 818                                 *wrapped = true;
 819                                 *rblk = i;
 820                                 *rhead = (struct xlog_rec_header *) offset;
 821                                 if (++found == count)
 822                                         break;
 823                         }
 824                 }
 825         }
 826
 827         return found;
 828
 829 out_error:
 830         return error;
 831 }
 832
 833 /*
 834  * Seek forward in the log for log record headers.
 835  *
 836  * Given head and tail blocks, walk forward from the tail block until we find
 837  * the provided number of records or hit the head block. The return value is the
 838  * number of records encountered or a negative error code. The log block and
 839  * buffer pointer of the last record seen are returned in rblk and rhead
 840  * respectively.
 841  */
 842 STATIC int
 843 xlog_seek_logrec_hdr(
 844         struct xlog             *log,
 845         xfs_daddr_t             head_blk,
 846         xfs_daddr_t             tail_blk,
 847         int                     count,
 848         char                    *buffer,
 849         xfs_daddr_t             *rblk,
 850         struct xlog_rec_header  **rhead,
 851         bool                    *wrapped)
 852 {
 853         int                     i;
 854         int                     error;
 855         int                     found = 0;
 856         char                    *offset = NULL;
 857         xfs_daddr_t             end_blk;
 858
 859         *wrapped = false;
 860
 861         /*
 862          * Walk forward from the tail block until we hit the head or the last
 863          * block in the log.
 864          */
 865         end_blk = head_blk > tail_blk ? head_blk : log->l_logBBsize - 1;
 866         for (i = (int) tail_blk; i <= end_blk; i++) {
 867                 error = xlog_bread(log, i, 1, buffer, &offset);
 868                 if (error)
 869                         goto out_error;
 870
 871                 if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
 872                         *rblk = i;
 873                         *rhead = (struct xlog_rec_header *) offset;
 874                         if (++found == count)
 875                                 break;
 876                 }
 877         }
 878
 879         /*
 880          * If we haven't hit the head block or the log record header count,
 881          * start looking again from the start of the physical log.
 882          */
 883         if (tail_blk > head_blk && found != count) {
 884                 for (i = 0; i < (int) head_blk; i++) {
 885                         error = xlog_bread(log, i, 1, buffer, &offset);
 886                         if (error)
 887                                 goto out_error;
 888
 889                         if (*(__be32 *)offset ==
 890                             cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
 891                                 *wrapped = true;
 892                                 *rblk = i;
 893                                 *rhead = (struct xlog_rec_header *) offset;
 894                                 if (++found == count)
 895                                         break;
 896                         }
 897                 }
 898         }
 899
 900         return found;
 901
 902 out_error:
 903         return error;
 904 }
 905
 906 /*
 907  * Calculate distance from head to tail (i.e., unused space in the log).
 908  */
 909 static inline int
 910 xlog_tail_distance(
 911         struct xlog     *log,
 912         xfs_daddr_t     head_blk,
 913         xfs_daddr_t     tail_blk)
 914 {
 915         if (head_blk < tail_blk)
 916                 return tail_blk - head_blk;
 917
 918         return tail_blk + (log->l_logBBsize - head_blk);
 919 }
 920
 921 /*
 922  * Verify the log tail. This is particularly important when torn or incomplete
 923  * writes have been detected near the front of the log and the head has been
 924  * walked back accordingly.
 925  *
 926  * We also have to handle the case where the tail was pinned and the head
 927  * blocked behind the tail right before a crash. If the tail had been pushed
 928  * immediately prior to the crash and the subsequent checkpoint was only
 929  * partially written, it's possible it overwrote the last referenced tail in the
 930  * log with garbage. This is not a coherency problem because the tail must have
 931  * been pushed before it can be overwritten, but appears as log corruption to
 932  * recovery because we have no way to know the tail was updated if the
 933  * subsequent checkpoint didn't write successfully.
 934  *
 935  * Therefore, CRC check the log from tail to head. If a failure occurs and the
 936  * offending record is within max iclog bufs from the head, walk the tail
 937  * forward and retry until a valid tail is found or corruption is detected out
 938  * of the range of a possible overwrite.
 939  */
 940 STATIC int
 941 xlog_verify_tail(
 942         struct xlog             *log,
 943         xfs_daddr_t             head_blk,
 944         xfs_daddr_t             *tail_blk,
 945         int                     hsize)
 946 {
 947         struct xlog_rec_header  *thead;
 948         char                    *buffer;
 949         xfs_daddr_t             first_bad;
 950         int                     error = 0;
 951         bool                    wrapped;
 952         xfs_daddr_t             tmp_tail;
 953         xfs_daddr_t             orig_tail = *tail_blk;
 954
 955         buffer = xlog_alloc_buffer(log, 1);
 956         if (!buffer)
 957                 return -ENOMEM;
 958
 959         /*
 960          * Make sure the tail points to a record (returns positive count on
 961          * success).
 962          */
 963         error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, buffer,
 964                         &tmp_tail, &thead, &wrapped);
 965         if (error < 0)
 966                 goto out;
 967         if (*tail_blk != tmp_tail)
 968                 *tail_blk = tmp_tail;
 969
 970         /*
 971          * Run a CRC check from the tail to the head. We can't just check
 972          * MAX_ICLOGS records past the tail because the tail may point to stale
 973          * blocks cleared during the search for the head/tail. These blocks are
 974          * overwritten with zero-length records and thus record count is not a
 975          * reliable indicator of the iclog state before a crash.
 976          */
 977         first_bad = 0;
 978         error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
 979                                       XLOG_RECOVER_CRCPASS, &first_bad);
 980         while ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
 981                 int     tail_distance;
 982
 983                 /*
 984                  * Is corruption within range of the head? If so, retry from
 985                  * the next record. Otherwise return an error.
 986                  */
 987                 tail_distance = xlog_tail_distance(log, head_blk, first_bad);
 988                 if (tail_distance > BTOBB(XLOG_MAX_ICLOGS * hsize))
 989                         break;
 990
 991                 /* skip to the next record; returns positive count on success */
 992                 error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2,
 993                                 buffer, &tmp_tail, &thead, &wrapped);
 994                 if (error < 0)
 995                         goto out;
 996
 997                 *tail_blk = tmp_tail;
 998                 first_bad = 0;
 999                 error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
1000                                               XLOG_RECOVER_CRCPASS, &first_bad);
1001         }
1002
1003         if (!error && *tail_blk != orig_tail)
1004                 xfs_warn(log->l_mp,
1005                 "Tail block (0x%llx) overwrite detected. Updated to 0x%llx",
1006                          orig_tail, *tail_blk);
1007 out:
1008         kmem_free(buffer);
1009         return error;
1010 }
1011
1012 /*
1013  * Detect and trim torn writes from the head of the log.
1014  *
1015  * Storage without sector atomicity guarantees can result in torn writes in the
1016  * log in the event of a crash. Our only means to detect this scenario is via
1017  * CRC verification. While we can't always be certain that CRC verification
1018  * failure is due to a torn write vs. an unrelated corruption, we do know that
1019  * only a certain number (XLOG_MAX_ICLOGS) of log records can be written out at
1020  * one time. Therefore, CRC verify up to XLOG_MAX_ICLOGS records at the head of
1021  * the log and treat failures in this range as torn writes as a matter of
1022  * policy. In the event of CRC failure, the head is walked back to the last good
1023  * record in the log and the tail is updated from that record and verified.
1024  */
1025 STATIC int
1026 xlog_verify_head(
1027         struct xlog             *log,
1028         xfs_daddr_t             *head_blk,      /* in/out: unverified head */
1029         xfs_daddr_t             *tail_blk,      /* out: tail block */
1030         char                    *buffer,
1031         xfs_daddr_t             *rhead_blk,     /* start blk of last record */
1032         struct xlog_rec_header  **rhead,        /* ptr to last record */
1033         bool                    *wrapped)       /* last rec. wraps phys. log */
1034 {
1035         struct xlog_rec_header  *tmp_rhead;
1036         char                    *tmp_buffer;
1037         xfs_daddr_t             first_bad;
1038         xfs_daddr_t             tmp_rhead_blk;
1039         int                     found;
1040         int                     error;
1041         bool                    tmp_wrapped;
1042
1043         /*
1044          * Check the head of the log for torn writes. Search backwards from the
1045          * head until we hit the tail or the maximum number of log record I/Os
1046          * that could have been in flight at one time. Use a temporary buffer so
1047          * we don't trash the rhead/buffer pointers from the caller.
1048          */
1049         tmp_buffer = xlog_alloc_buffer(log, 1);
1050         if (!tmp_buffer)
1051                 return -ENOMEM;
1052         error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk,
1053                                       XLOG_MAX_ICLOGS, tmp_buffer,
1054                                       &tmp_rhead_blk, &tmp_rhead, &tmp_wrapped);
1055         kmem_free(tmp_buffer);
1056         if (error < 0)
1057                 return error;
1058
1059         /*
1060          * Now run a CRC verification pass over the records starting at the
1061          * block found above to the current head. If a CRC failure occurs, the
1062          * log block of the first bad record is saved in first_bad.
1063          */
1064         error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk,
1065                                       XLOG_RECOVER_CRCPASS, &first_bad);
1066         if ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
1067                 /*
1068                  * We've hit a potential torn write. Reset the error and warn
1069                  * about it.
1070                  */
1071                 error = 0;
1072                 xfs_warn(log->l_mp,
1073 "Torn write (CRC failure) detected at log block 0x%llx. Truncating head block from 0x%llx.",
1074                          first_bad, *head_blk);
1075
1076                 /*
1077                  * Get the header block and buffer pointer for the last good
1078                  * record before the bad record.
1079                  *
1080                  * Note that xlog_find_tail() clears the blocks at the new head
1081                  * (i.e., the records with invalid CRC) if the cycle number
1082                  * matches the current cycle.
1083                  */
1084                 found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1,
1085                                 buffer, rhead_blk, rhead, wrapped);
1086                 if (found < 0)
1087                         return found;
1088                 if (found == 0)         /* XXX: right thing to do here? */
1089                         return -EIO;
1090
1091                 /*
1092                  * Reset the head block to the starting block of the first bad
1093                  * log record and set the tail block based on the last good
1094                  * record.
1095                  *
1096                  * Bail out if the updated head/tail match as this indicates
1097                  * possible corruption outside of the acceptable
1098                  * (XLOG_MAX_ICLOGS) range. This is a job for xfs_repair...
1099                  */
1100                 *head_blk = first_bad;
1101                 *tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn));
1102                 if (*head_blk == *tail_blk) {
1103                         ASSERT(0);
1104                         return 0;
1105                 }
1106         }
1107         if (error)
1108                 return error;
1109
1110         return xlog_verify_tail(log, *head_blk, tail_blk,
1111                                 be32_to_cpu((*rhead)->h_size));
1112 }
1113
1114 /*
1115  * We need to make sure we handle log wrapping properly, so we can't use the
1116  * calculated logbno directly. Make sure it wraps to the correct bno inside the
1117  * log.
1118  *
1119  * The log is limited to 32 bit sizes, so we use the appropriate modulus
1120  * operation here and cast it back to a 64 bit daddr on return.
1121  */
1122 static inline xfs_daddr_t
1123 xlog_wrap_logbno(
1124         struct xlog             *log,
1125         xfs_daddr_t             bno)
1126 {
1127         int                     mod;
1128
1129         div_s64_rem(bno, log->l_logBBsize, &mod);
1130         return mod;
1131 }
1132
1133 /*
1134  * Check whether the head of the log points to an unmount record. In other
1135  * words, determine whether the log is clean. If so, update the in-core state
1136  * appropriately.
1137  */
1138 static int
1139 xlog_check_unmount_rec(
1140         struct xlog             *log,
1141         xfs_daddr_t             *head_blk,
1142         xfs_daddr_t             *tail_blk,
1143         struct xlog_rec_header  *rhead,
1144         xfs_daddr_t             rhead_blk,
1145         char                    *buffer,
1146         bool                    *clean)
1147 {
1148         struct xlog_op_header   *op_head;
1149         xfs_daddr_t             umount_data_blk;
1150         xfs_daddr_t             after_umount_blk;
1151         int                     hblks;
1152         int                     error;
1153         char                    *offset;
1154
1155         *clean = false;
1156
1157         /*
1158          * Look for unmount record. If we find it, then we know there was a
1159          * clean unmount. Since 'i' could be the last block in the physical
1160          * log, we convert to a log block before comparing to the head_blk.
1161          *
1162          * Save the current tail lsn to use to pass to xlog_clear_stale_blocks()
1163          * below. We won't want to clear the unmount record if there is one, so
1164          * we pass the lsn of the unmount record rather than the block after it.
1165          */
1166         hblks = xlog_logrec_hblks(log, rhead);
1167         after_umount_blk = xlog_wrap_logbno(log,
1168                         rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len)));
1169
1170         if (*head_blk == after_umount_blk &&
1171             be32_to_cpu(rhead->h_num_logops) == 1) {
1172                 umount_data_blk = xlog_wrap_logbno(log, rhead_blk + hblks);
1173                 error = xlog_bread(log, umount_data_blk, 1, buffer, &offset);
1174                 if (error)
1175                         return error;
1176
1177                 op_head = (struct xlog_op_header *)offset;
1178                 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
1179                         /*
1180                          * Set tail and last sync so that newly written log
1181                          * records will point recovery to after the current
1182                          * unmount record.
1183                          */
1184                         xlog_assign_atomic_lsn(&log->l_tail_lsn,
1185                                         log->l_curr_cycle, after_umount_blk);
1186                         xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
1187                                         log->l_curr_cycle, after_umount_blk);
1188                         *tail_blk = after_umount_blk;
1189
1190                         *clean = true;
1191                 }
1192         }
1193
1194         return 0;
1195 }
1196
1197 static void
1198 xlog_set_state(
1199         struct xlog             *log,
1200         xfs_daddr_t             head_blk,
1201         struct xlog_rec_header  *rhead,
1202         xfs_daddr_t             rhead_blk,
1203         bool                    bump_cycle)
1204 {
1205         /*
1206          * Reset log values according to the state of the log when we
1207          * crashed.  In the case where head_blk == 0, we bump curr_cycle
1208          * one because the next write starts a new cycle rather than
1209          * continuing the cycle of the last good log record.  At this
1210          * point we have guaranteed that all partial log records have been
1211          * accounted for.  Therefore, we know that the last good log record
1212          * written was complete and ended exactly on the end boundary
1213          * of the physical log.
1214          */
1215         log->l_prev_block = rhead_blk;
1216         log->l_curr_block = (int)head_blk;
1217         log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
1218         if (bump_cycle)
1219                 log->l_curr_cycle++;
1220         atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
1221         atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
1222         xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
1223                                         BBTOB(log->l_curr_block));
1224         xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
1225                                         BBTOB(log->l_curr_block));
1226 }
1227
1228 /*
1229  * Find the sync block number or the tail of the log.
1230  *
1231  * This will be the block number of the last record to have its
1232  * associated buffers synced to disk.  Every log record header has
1233  * a sync lsn embedded in it.  LSNs hold block numbers, so it is easy
1234  * to get a sync block number.  The only concern is to figure out which
1235  * log record header to believe.
1236  *
1237  * The following algorithm uses the log record header with the largest
1238  * lsn.  The entire log record does not need to be valid.  We only care
1239  * that the header is valid.
1240  *
1241  * We could speed up search by using current head_blk buffer, but it is not
1242  * available.
1243  */
1244 STATIC int
1245 xlog_find_tail(
1246         struct xlog             *log,
1247         xfs_daddr_t             *head_blk,
1248         xfs_daddr_t             *tail_blk)
1249 {
1250         xlog_rec_header_t       *rhead;
1251         char                    *offset = NULL;
1252         char                    *buffer;
1253         int                     error;
1254         xfs_daddr_t             rhead_blk;
1255         xfs_lsn_t               tail_lsn;
1256         bool                    wrapped = false;
1257         bool                    clean = false;
1258
1259         /*
1260          * Find previous log record
1261          */
1262         if ((error = xlog_find_head(log, head_blk)))
1263                 return error;
1264         ASSERT(*head_blk < INT_MAX);
1265
1266         buffer = xlog_alloc_buffer(log, 1);
1267         if (!buffer)
1268                 return -ENOMEM;
1269         if (*head_blk == 0) {                           /* special case */
1270                 error = xlog_bread(log, 0, 1, buffer, &offset);
1271                 if (error)
1272                         goto done;
1273
1274                 if (xlog_get_cycle(offset) == 0) {
1275                         *tail_blk = 0;
1276                         /* leave all other log inited values alone */
1277                         goto done;
1278                 }
1279         }
1280
1281         /*
1282          * Search backwards through the log looking for the log record header
1283          * block. This wraps all the way back around to the head so something is
1284          * seriously wrong if we can't find it.
1285          */
1286         error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, buffer,
1287                                       &rhead_blk, &rhead, &wrapped);
1288         if (error < 0)
1289                 goto done;
1290         if (!error) {
1291                 xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
1292                 error = -EFSCORRUPTED;
1293                 goto done;
1294         }
1295         *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
1296
1297         /*
1298          * Set the log state based on the current head record.
1299          */
1300         xlog_set_state(log, *head_blk, rhead, rhead_blk, wrapped);
1301         tail_lsn = atomic64_read(&log->l_tail_lsn);
1302
1303         /*
1304          * Look for an unmount record at the head of the log. This sets the log
1305          * state to determine whether recovery is necessary.
1306          */
1307         error = xlog_check_unmount_rec(log, head_blk, tail_blk, rhead,
1308                                        rhead_blk, buffer, &clean);
1309         if (error)
1310                 goto done;
1311
1312         /*
1313          * Verify the log head if the log is not clean (e.g., we have anything
1314          * but an unmount record at the head). This uses CRC verification to
1315          * detect and trim torn writes. If discovered, CRC failures are
1316          * considered torn writes and the log head is trimmed accordingly.
1317          *
1318          * Note that we can only run CRC verification when the log is dirty
1319          * because there's no guarantee that the log data behind an unmount
1320          * record is compatible with the current architecture.
1321          */
1322         if (!clean) {
1323                 xfs_daddr_t     orig_head = *head_blk;
1324
1325                 error = xlog_verify_head(log, head_blk, tail_blk, buffer,
1326                                          &rhead_blk, &rhead, &wrapped);
1327                 if (error)
1328                         goto done;
1329
1330                 /* update in-core state again if the head changed */
1331                 if (*head_blk != orig_head) {
1332                         xlog_set_state(log, *head_blk, rhead, rhead_blk,
1333                                        wrapped);
1334                         tail_lsn = atomic64_read(&log->l_tail_lsn);
1335                         error = xlog_check_unmount_rec(log, head_blk, tail_blk,
1336                                                        rhead, rhead_blk, buffer,
1337                                                        &clean);
1338                         if (error)
1339                                 goto done;
1340                 }
1341         }
1342
1343         /*
1344          * Note that the unmount was clean. If the unmount was not clean, we
1345          * need to know this to rebuild the superblock counters from the perag
1346          * headers if we have a filesystem using non-persistent counters.
1347          */
1348         if (clean)
1349                 log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
1350
1351         /*
1352          * Make sure that there are no blocks in front of the head
1353          * with the same cycle number as the head.  This can happen
1354          * because we allow multiple outstanding log writes concurrently,
1355          * and the later writes might make it out before earlier ones.
1356          *
1357          * We use the lsn from before modifying it so that we'll never
1358          * overwrite the unmount record after a clean unmount.
1359          *
1360          * Do this only if we are going to recover the filesystem
1361          *
1362          * NOTE: This used to say "if (!readonly)"
1363          * However on Linux, we can & do recover a read-only filesystem.
1364          * We only skip recovery if NORECOVERY is specified on mount,
1365          * in which case we would not be here.
1366          *
1367          * But... if the -device- itself is readonly, just skip this.
1368          * We can't recover this device anyway, so it won't matter.
1369          */
1370         if (!xfs_readonly_buftarg(log->l_targ))
1371                 error = xlog_clear_stale_blocks(log, tail_lsn);
1372
1373 done:
1374         kmem_free(buffer);
1375
1376         if (error)
1377                 xfs_warn(log->l_mp, "failed to locate log tail");
1378         return error;
1379 }
1380
1381 /*
1382  * Is the log zeroed at all?
1383  *
1384  * The last binary search should be changed to perform an X block read
1385  * once X becomes small enough.  You can then search linearly through
1386  * the X blocks.  This will cut down on the number of reads we need to do.
1387  *
1388  * If the log is partially zeroed, this routine will pass back the blkno
1389  * of the first block with cycle number 0.  It won't have a complete LR
1390  * preceding it.
1391  *
1392  * Return:
1393  *      0  => the log is completely written to
1394  *      1 => use *blk_no as the first block of the log
1395  *      <0 => error has occurred
1396  */
1397 STATIC int
1398 xlog_find_zeroed(
1399         struct xlog     *log,
1400         xfs_daddr_t     *blk_no)
1401 {
1402         char            *buffer;
1403         char            *offset;
1404         uint            first_cycle, last_cycle;
1405         xfs_daddr_t     new_blk, last_blk, start_blk;
1406         xfs_daddr_t     num_scan_bblks;
1407         int             error, log_bbnum = log->l_logBBsize;
1408
1409         *blk_no = 0;
1410
1411         /* check totally zeroed log */
1412         buffer = xlog_alloc_buffer(log, 1);
1413         if (!buffer)
1414                 return -ENOMEM;
1415         error = xlog_bread(log, 0, 1, buffer, &offset);
1416         if (error)
1417                 goto out_free_buffer;
1418
1419         first_cycle = xlog_get_cycle(offset);
1420         if (first_cycle == 0) {         /* completely zeroed log */
1421                 *blk_no = 0;
1422                 kmem_free(buffer);
1423                 return 1;
1424         }
1425
1426         /* check partially zeroed log */
1427         error = xlog_bread(log, log_bbnum-1, 1, buffer, &offset);
1428         if (error)
1429                 goto out_free_buffer;
1430
1431         last_cycle = xlog_get_cycle(offset);
1432         if (last_cycle != 0) {          /* log completely written to */
1433                 kmem_free(buffer);
1434                 return 0;
1435         }
1436
1437         /* we have a partially zeroed log */
1438         last_blk = log_bbnum-1;
1439         error = xlog_find_cycle_start(log, buffer, 0, &last_blk, 0);
1440         if (error)
1441                 goto out_free_buffer;
1442
1443         /*
1444          * Validate the answer.  Because there is no way to guarantee that
1445          * the entire log is made up of log records which are the same size,
1446          * we scan over the defined maximum blocks.  At this point, the maximum
1447          * is not chosen to mean anything special.   XXXmiken
1448          */
1449         num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
1450         ASSERT(num_scan_bblks <= INT_MAX);
1451
1452         if (last_blk < num_scan_bblks)
1453                 num_scan_bblks = last_blk;
1454         start_blk = last_blk - num_scan_bblks;
1455
1456         /*
1457          * We search for any instances of cycle number 0 that occur before
1458          * our current estimate of the head.  What we're trying to detect is
1459          *        1 ... | 0 | 1 | 0...
1460          *                       ^ binary search ends here
1461          */
1462         if ((error = xlog_find_verify_cycle(log, start_blk,
1463                                          (int)num_scan_bblks, 0, &new_blk)))
1464                 goto out_free_buffer;
1465         if (new_blk != -1)
1466                 last_blk = new_blk;
1467
1468         /*
1469          * Potentially backup over partial log record write.  We don't need
1470          * to search the end of the log because we know it is zero.
1471          */
1472         error = xlog_find_verify_log_record(log, start_blk, &last_blk, 0);
1473         if (error == 1)
1474                 error = -EIO;
1475         if (error)
1476                 goto out_free_buffer;
1477
1478         *blk_no = last_blk;
1479 out_free_buffer:
1480         kmem_free(buffer);
1481         if (error)
1482                 return error;
1483         return 1;
1484 }
1485
1486 /*
1487  * These are simple subroutines used by xlog_clear_stale_blocks() below
1488  * to initialize a buffer full of empty log record headers and write
1489  * them into the log.
1490  */
1491 STATIC void
1492 xlog_add_record(
1493         struct xlog             *log,
1494         char                    *buf,
1495         int                     cycle,
1496         int                     block,
1497         int                     tail_cycle,
1498         int                     tail_block)
1499 {
1500         xlog_rec_header_t       *recp = (xlog_rec_header_t *)buf;
1501
1502         memset(buf, 0, BBSIZE);
1503         recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
1504         recp->h_cycle = cpu_to_be32(cycle);
1505         recp->h_version = cpu_to_be32(
1506                         xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1);
1507         recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block));
1508         recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block));
1509         recp->h_fmt = cpu_to_be32(XLOG_FMT);
1510         memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t));
1511 }
1512
1513 STATIC int
1514 xlog_write_log_records(
1515         struct xlog     *log,
1516         int             cycle,
1517         int             start_block,
1518         int             blocks,
1519         int             tail_cycle,
1520         int             tail_block)
1521 {
1522         char            *offset;
1523         char            *buffer;
1524         int             balign, ealign;
1525         int             sectbb = log->l_sectBBsize;
1526         int             end_block = start_block + blocks;
1527         int             bufblks;
1528         int             error = 0;
1529         int             i, j = 0;
1530
1531         /*
1532          * Greedily allocate a buffer big enough to handle the full
1533          * range of basic blocks to be written.  If that fails, try
1534          * a smaller size.  We need to be able to write at least a
1535          * log sector, or we're out of luck.
1536          */
1537         bufblks = 1 << ffs(blocks);
1538         while (bufblks > log->l_logBBsize)
1539                 bufblks >>= 1;
1540         while (!(buffer = xlog_alloc_buffer(log, bufblks))) {
1541                 bufblks >>= 1;
1542                 if (bufblks < sectbb)
1543                         return -ENOMEM;
1544         }
1545
1546         /* We may need to do a read at the start to fill in part of
1547          * the buffer in the starting sector not covered by the first
1548          * write below.
1549          */
1550         balign = round_down(start_block, sectbb);
1551         if (balign != start_block) {
1552                 error = xlog_bread_noalign(log, start_block, 1, buffer);
1553                 if (error)
1554                         goto out_free_buffer;
1555
1556                 j = start_block - balign;
1557         }
1558
1559         for (i = start_block; i < end_block; i += bufblks) {
1560                 int             bcount, endcount;
1561
1562                 bcount = min(bufblks, end_block - start_block);
1563                 endcount = bcount - j;
1564
1565                 /* We may need to do a read at the end to fill in part of
1566                  * the buffer in the final sector not covered by the write.
1567                  * If this is the same sector as the above read, skip it.
1568                  */
1569                 ealign = round_down(end_block, sectbb);
1570                 if (j == 0 && (start_block + endcount > ealign)) {
1571                         error = xlog_bread_noalign(log, ealign, sectbb,
1572                                         buffer + BBTOB(ealign - start_block));
1573                         if (error)
1574                                 break;
1575
1576                 }
1577
1578                 offset = buffer + xlog_align(log, start_block);
1579                 for (; j < endcount; j++) {
1580                         xlog_add_record(log, offset, cycle, i+j,
1581                                         tail_cycle, tail_block);
1582                         offset += BBSIZE;
1583                 }
1584                 error = xlog_bwrite(log, start_block, endcount, buffer);
1585                 if (error)
1586                         break;
1587                 start_block += endcount;
1588                 j = 0;
1589         }
1590
1591 out_free_buffer:
1592         kmem_free(buffer);
1593         return error;
1594 }
1595
1596 /*
1597  * This routine is called to blow away any incomplete log writes out
1598  * in front of the log head.  We do this so that we won't become confused
1599  * if we come up, write only a little bit more, and then crash again.
1600  * If we leave the partial log records out there, this situation could
1601  * cause us to think those partial writes are valid blocks since they
1602  * have the current cycle number.  We get rid of them by overwriting them
1603  * with empty log records with the old cycle number rather than the
1604  * current one.
1605  *
1606  * The tail lsn is passed in rather than taken from
1607  * the log so that we will not write over the unmount record after a
1608  * clean unmount in a 512 block log.  Doing so would leave the log without
1609  * any valid log records in it until a new one was written.  If we crashed
1610  * during that time we would not be able to recover.
1611  */
1612 STATIC int
1613 xlog_clear_stale_blocks(
1614         struct xlog     *log,
1615         xfs_lsn_t       tail_lsn)
1616 {
1617         int             tail_cycle, head_cycle;
1618         int             tail_block, head_block;
1619         int             tail_distance, max_distance;
1620         int             distance;
1621         int             error;
1622
1623         tail_cycle = CYCLE_LSN(tail_lsn);
1624         tail_block = BLOCK_LSN(tail_lsn);
1625         head_cycle = log->l_curr_cycle;
1626         head_block = log->l_curr_block;
1627
1628         /*
1629          * Figure out the distance between the new head of the log
1630          * and the tail.  We want to write over any blocks beyond the
1631          * head that we may have written just before the crash, but
1632          * we don't want to overwrite the tail of the log.
1633          */
1634         if (head_cycle == tail_cycle) {
1635                 /*
1636                  * The tail is behind the head in the physical log,
1637                  * so the distance from the head to the tail is the
1638                  * distance from the head to the end of the log plus
1639                  * the distance from the beginning of the log to the
1640                  * tail.
1641                  */
1642                 if (XFS_IS_CORRUPT(log->l_mp,
1643                                    head_block < tail_block ||
1644                                    head_block >= log->l_logBBsize))
1645                         return -EFSCORRUPTED;
1646                 tail_distance = tail_block + (log->l_logBBsize - head_block);
1647         } else {
1648                 /*
1649                  * The head is behind the tail in the physical log,
1650                  * so the distance from the head to the tail is just
1651                  * the tail block minus the head block.
1652                  */
1653                 if (XFS_IS_CORRUPT(log->l_mp,
1654                                    head_block >= tail_block ||
1655                                    head_cycle != tail_cycle + 1))
1656                         return -EFSCORRUPTED;
1657                 tail_distance = tail_block - head_block;
1658         }
1659
1660         /*
1661          * If the head is right up against the tail, we can't clear
1662          * anything.
1663          */
1664         if (tail_distance <= 0) {
1665                 ASSERT(tail_distance == 0);
1666                 return 0;
1667         }
1668
1669         max_distance = XLOG_TOTAL_REC_SHIFT(log);
1670         /*
1671          * Take the smaller of the maximum amount of outstanding I/O
1672          * we could have and the distance to the tail to clear out.
1673          * We take the smaller so that we don't overwrite the tail and
1674          * we don't waste all day writing from the head to the tail
1675          * for no reason.
1676          */
1677         max_distance = min(max_distance, tail_distance);
1678
1679         if ((head_block + max_distance) <= log->l_logBBsize) {
1680                 /*
1681                  * We can stomp all the blocks we need to without
1682                  * wrapping around the end of the log.  Just do it
1683                  * in a single write.  Use the cycle number of the
1684                  * current cycle minus one so that the log will look like:
1685                  *     n ... | n - 1 ...
1686                  */
1687                 error = xlog_write_log_records(log, (head_cycle - 1),
1688                                 head_block, max_distance, tail_cycle,
1689                                 tail_block);
1690                 if (error)
1691                         return error;
1692         } else {
1693                 /*
1694                  * We need to wrap around the end of the physical log in
1695                  * order to clear all the blocks.  Do it in two separate
1696                  * I/Os.  The first write should be from the head to the
1697                  * end of the physical log, and it should use the current
1698                  * cycle number minus one just like above.
1699                  */
1700                 distance = log->l_logBBsize - head_block;
1701                 error = xlog_write_log_records(log, (head_cycle - 1),
1702                                 head_block, distance, tail_cycle,
1703                                 tail_block);
1704
1705                 if (error)
1706                         return error;
1707
1708                 /*
1709                  * Now write the blocks at the start of the physical log.
1710                  * This writes the remainder of the blocks we want to clear.
1711                  * It uses the current cycle number since we're now on the
1712                  * same cycle as the head so that we get:
1713                  *    n ... n ... | n - 1 ...
1714                  *    ^^^^^ blocks we're writing
1715                  */
1716                 distance = max_distance - (log->l_logBBsize - head_block);
1717                 error = xlog_write_log_records(log, head_cycle, 0, distance,
1718                                 tail_cycle, tail_block);
1719                 if (error)
1720                         return error;
1721         }
1722
1723         return 0;
1724 }
1725
1726 /*
1727  * Release the recovered intent item in the AIL that matches the given intent
1728  * type and intent id.
1729  */
1730 void
1731 xlog_recover_release_intent(
1732         struct xlog             *log,
1733         unsigned short          intent_type,
1734         uint64_t                intent_id)
1735 {
1736         struct xfs_ail_cursor   cur;
1737         struct xfs_log_item     *lip;
1738         struct xfs_ail          *ailp = log->l_ailp;
1739
1740         spin_lock(&ailp->ail_lock);
1741         for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); lip != NULL;
1742              lip = xfs_trans_ail_cursor_next(ailp, &cur)) {
1743                 if (lip->li_type != intent_type)
1744                         continue;
1745                 if (!lip->li_ops->iop_match(lip, intent_id))
1746                         continue;
1747
1748                 spin_unlock(&ailp->ail_lock);
1749                 lip->li_ops->iop_release(lip);
1750                 spin_lock(&ailp->ail_lock);
1751                 break;
1752         }
1753
1754         xfs_trans_ail_cursor_done(&cur);
1755         spin_unlock(&ailp->ail_lock);
1756 }
1757
1758 /******************************************************************************
1759  *
1760  *              Log recover routines
1761  *
1762  ******************************************************************************
1763  */
/*
 * Recovery operation tables for every log item type recognised by log
 * recovery.  xlog_find_item_ops() walks this array linearly and returns the
 * first entry whose ->item_type equals the type of the recovered item; an
 * item whose type has no entry here is reported as an unrecognized log
 * operation by xlog_recover_reorder_trans().
 */
static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = {
        &xlog_buf_item_ops,
        &xlog_inode_item_ops,
        &xlog_dquot_item_ops,
        &xlog_quotaoff_item_ops,
        &xlog_icreate_item_ops,
        &xlog_efi_item_ops,
        &xlog_efd_item_ops,
        &xlog_rui_item_ops,
        &xlog_rud_item_ops,
        &xlog_cui_item_ops,
        &xlog_cud_item_ops,
        &xlog_bui_item_ops,
        &xlog_bud_item_ops,
};
1779
1780 static const struct xlog_recover_item_ops *
1781 xlog_find_item_ops(
1782         struct xlog_recover_item                *item)
1783 {
1784         unsigned int                            i;
1785
1786         for (i = 0; i < ARRAY_SIZE(xlog_recover_item_ops); i++)
1787                 if (ITEM_TYPE(item) == xlog_recover_item_ops[i]->item_type)
1788                         return xlog_recover_item_ops[i];
1789
1790         return NULL;
1791 }
1792
1793 /*
1794  * Sort the log items in the transaction.
1795  *
1796  * The ordering constraints are defined by the inode allocation and unlink
1797  * behaviour. The rules are:
1798  *
1799  *      1. Every item is only logged once in a given transaction. Hence it
1800  *         represents the last logged state of the item. Hence ordering is
1801  *         dependent on the order in which operations need to be performed so
1802  *         required initial conditions are always met.
1803  *
1804  *      2. Cancelled buffers are recorded in pass 1 in a separate table and
1805  *         there's nothing to replay from them so we can simply cull them
1806  *         from the transaction. However, we can't do that until after we've
1807  *         replayed all the other items because they may be dependent on the
1808  *         cancelled buffer and replaying the cancelled buffer can remove it
1809  *         form the cancelled buffer table. Hence they have tobe done last.
1810  *
1811  *      3. Inode allocation buffers must be replayed before inode items that
1812  *         read the buffer and replay changes into it. For filesystems using the
1813  *         ICREATE transactions, this means XFS_LI_ICREATE objects need to get
1814  *         treated the same as inode allocation buffers as they create and
1815  *         initialise the buffers directly.
1816  *
1817  *      4. Inode unlink buffers must be replayed after inode items are replayed.
1818  *         This ensures that inodes are completely flushed to the inode buffer
1819  *         in a "free" state before we remove the unlinked inode list pointer.
1820  *
1821  * Hence the ordering needs to be inode allocation buffers first, inode items
1822  * second, inode unlink buffers third and cancelled buffers last.
1823  *
1824  * But there's a problem with that - we can't tell an inode allocation buffer
1825  * apart from a regular buffer, so we can't separate them. We can, however,
1826  * tell an inode unlink buffer from the others, and so we can separate them out
1827  * from all the other buffers and move them to last.
1828  *
1829  * Hence, 4 lists, in order from head to tail:
1830  *      - buffer_list for all buffers except cancelled/inode unlink buffers
1831  *      - item_list for all non-buffer items
1832  *      - inode_buffer_list for inode unlink buffers
1833  *      - cancel_list for the cancelled buffers
1834  *
1835  * Note that we add objects to the tail of the lists so that first-to-last
1836  * ordering is preserved within the lists. Adding objects to the head of the
1837  * list means when we traverse from the head we walk them in last-to-first
1838  * order. For cancelled buffers and inode unlink buffers this doesn't matter,
1839  * but for all other items there may be specific ordering that we need to
1840  * preserve.
1841  */
1842 STATIC int
1843 xlog_recover_reorder_trans(
1844         struct xlog             *log,
1845         struct xlog_recover     *trans,
1846         int                     pass)
1847 {
1848         struct xlog_recover_item *item, *n;
1849         int                     error = 0;
1850         LIST_HEAD(sort_list);
1851         LIST_HEAD(cancel_list);
1852         LIST_HEAD(buffer_list);
1853         LIST_HEAD(inode_buffer_list);
1854         LIST_HEAD(item_list);
1855
1856         list_splice_init(&trans->r_itemq, &sort_list);
1857         list_for_each_entry_safe(item, n, &sort_list, ri_list) {
1858                 enum xlog_recover_reorder       fate = XLOG_REORDER_ITEM_LIST;
1859
1860                 item->ri_ops = xlog_find_item_ops(item);
1861                 if (!item->ri_ops) {
1862                         xfs_warn(log->l_mp,
1863                                 "%s: unrecognized type of log operation (%d)",
1864                                 __func__, ITEM_TYPE(item));
1865                         ASSERT(0);
1866                         /*
1867                          * return the remaining items back to the transaction
1868                          * item list so they can be freed in caller.
1869                          */
1870                         if (!list_empty(&sort_list))
1871                                 list_splice_init(&sort_list, &trans->r_itemq);
1872                         error = -EFSCORRUPTED;
1873                         break;
1874                 }
1875
1876                 if (item->ri_ops->reorder)
1877                         fate = item->ri_ops->reorder(item);
1878
1879                 switch (fate) {
1880                 case XLOG_REORDER_BUFFER_LIST:
1881                         list_move_tail(&item->ri_list, &buffer_list);
1882                         break;
1883                 case XLOG_REORDER_CANCEL_LIST:
1884                         trace_xfs_log_recover_item_reorder_head(log,
1885                                         trans, item, pass);
1886                         list_move(&item->ri_list, &cancel_list);
1887                         break;
1888                 case XLOG_REORDER_INODE_BUFFER_LIST:
1889                         list_move(&item->ri_list, &inode_buffer_list);
1890                         break;
1891                 case XLOG_REORDER_ITEM_LIST:
1892                         trace_xfs_log_recover_item_reorder_tail(log,
1893                                                         trans, item, pass);
1894                         list_move_tail(&item->ri_list, &item_list);
1895                         break;
1896                 }
1897         }
1898
1899         ASSERT(list_empty(&sort_list));
1900         if (!list_empty(&buffer_list))
1901                 list_splice(&buffer_list, &trans->r_itemq);
1902         if (!list_empty(&item_list))
1903                 list_splice_tail(&item_list, &trans->r_itemq);
1904         if (!list_empty(&inode_buffer_list))
1905                 list_splice_tail(&inode_buffer_list, &trans->r_itemq);
1906         if (!list_empty(&cancel_list))
1907                 list_splice_tail(&cancel_list, &trans->r_itemq);
1908         return error;
1909 }
1910
1911 void
1912 xlog_buf_readahead(
1913         struct xlog             *log,
1914         xfs_daddr_t             blkno,
1915         uint                    len,
1916         const struct xfs_buf_ops *ops)
1917 {
1918         if (!xlog_is_buffer_cancelled(log, blkno, len))
1919                 xfs_buf_readahead(log->l_mp->m_ddev_targp, blkno, len, ops);
1920 }
1921
1922 STATIC int
1923 xlog_recover_items_pass2(
1924         struct xlog                     *log,
1925         struct xlog_recover             *trans,
1926         struct list_head                *buffer_list,
1927         struct list_head                *item_list)
1928 {
1929         struct xlog_recover_item        *item;
1930         int                             error = 0;
1931
1932         list_for_each_entry(item, item_list, ri_list) {
1933                 trace_xfs_log_recover_item_recover(log, trans, item,
1934                                 XLOG_RECOVER_PASS2);
1935
1936                 if (item->ri_ops->commit_pass2)
1937                         error = item->ri_ops->commit_pass2(log, buffer_list,
1938                                         item, trans->r_lsn);
1939                 if (error)
1940                         return error;
1941         }
1942
1943         return error;
1944 }
1945
1946 /*
1947  * Perform the transaction.
1948  *
1949  * If the transaction modifies a buffer or inode, do it now.  Otherwise,
1950  * EFIs and EFDs get queued up by adding entries into the AIL for them.
1951  */
1952 STATIC int
1953 xlog_recover_commit_trans(
1954         struct xlog             *log,
1955         struct xlog_recover     *trans,
1956         int                     pass,
1957         struct list_head        *buffer_list)
1958 {
1959         int                             error = 0;
1960         int                             items_queued = 0;
1961         struct xlog_recover_item        *item;
1962         struct xlog_recover_item        *next;
1963         LIST_HEAD                       (ra_list);
1964         LIST_HEAD                       (done_list);
1965
1966         #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100
1967
1968         hlist_del_init(&trans->r_list);
1969
1970         error = xlog_recover_reorder_trans(log, trans, pass);
1971         if (error)
1972                 return error;
1973
1974         list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) {
1975                 trace_xfs_log_recover_item_recover(log, trans, item, pass);
1976
1977                 switch (pass) {
1978                 case XLOG_RECOVER_PASS1:
1979                         if (item->ri_ops->commit_pass1)
1980                                 error = item->ri_ops->commit_pass1(log, item);
1981                         break;
1982                 case XLOG_RECOVER_PASS2:
1983                         if (item->ri_ops->ra_pass2)
1984                                 item->ri_ops->ra_pass2(log, item);
1985                         list_move_tail(&item->ri_list, &ra_list);
1986                         items_queued++;
1987                         if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) {
1988                                 error = xlog_recover_items_pass2(log, trans,
1989                                                 buffer_list, &ra_list);
1990                                 list_splice_tail_init(&ra_list, &done_list);
1991                                 items_queued = 0;
1992                         }
1993
1994                         break;
1995                 default:
1996                         ASSERT(0);
1997                 }
1998
1999                 if (error)
2000                         goto out;
2001         }
2002
2003 out:
2004         if (!list_empty(&ra_list)) {
2005                 if (!error)
2006                         error = xlog_recover_items_pass2(log, trans,
2007                                         buffer_list, &ra_list);
2008                 list_splice_tail_init(&ra_list, &done_list);
2009         }
2010
2011         if (!list_empty(&done_list))
2012                 list_splice_init(&done_list, &trans->r_itemq);
2013
2014         return error;
2015 }
2016
2017 STATIC void
2018 xlog_recover_add_item(
2019         struct list_head        *head)
2020 {
2021         struct xlog_recover_item *item;
2022
2023         item = kmem_zalloc(sizeof(struct xlog_recover_item), 0);
2024         INIT_LIST_HEAD(&item->ri_list);
2025         list_add_tail(&item->ri_list, head);
2026 }
2027
2028 STATIC int
2029 xlog_recover_add_to_cont_trans(
2030         struct xlog             *log,
2031         struct xlog_recover     *trans,
2032         char                    *dp,
2033         int                     len)
2034 {
2035         struct xlog_recover_item *item;
2036         char                    *ptr, *old_ptr;
2037         int                     old_len;
2038
2039         /*
2040          * If the transaction is empty, the header was split across this and the
2041          * previous record. Copy the rest of the header.
2042          */
2043         if (list_empty(&trans->r_itemq)) {
2044                 ASSERT(len <= sizeof(struct xfs_trans_header));
2045                 if (len > sizeof(struct xfs_trans_header)) {
2046                         xfs_warn(log->l_mp, "%s: bad header length", __func__);
2047                         return -EFSCORRUPTED;
2048                 }
2049
2050                 xlog_recover_add_item(&trans->r_itemq);
2051                 ptr = (char *)&trans->r_theader +
2052                                 sizeof(struct xfs_trans_header) - len;
2053                 memcpy(ptr, dp, len);
2054                 return 0;
2055         }
2056
2057         /* take the tail entry */
2058         item = list_entry(trans->r_itemq.prev, struct xlog_recover_item,
2059                           ri_list);
2060
2061         old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
2062         old_len = item->ri_buf[item->ri_cnt-1].i_len;
2063
2064         ptr = krealloc(old_ptr, len + old_len, GFP_KERNEL | __GFP_NOFAIL);
2065         memcpy(&ptr[old_len], dp, len);
2066         item->ri_buf[item->ri_cnt-1].i_len += len;
2067         item->ri_buf[item->ri_cnt-1].i_addr = ptr;
2068         trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
2069         return 0;
2070 }
2071
2072 /*
2073  * The next region to add is the start of a new region.  It could be
2074  * a whole region or it could be the first part of a new region.  Because
2075  * of this, the assumption here is that the type and size fields of all
2076  * format structures fit into the first 32 bits of the structure.
2077  *
2078  * This works because all regions must be 32 bit aligned.  Therefore, we
2079  * either have both fields or we have neither field.  In the case we have
2080  * neither field, the data part of the region is zero length.  We only have
2081  * a log_op_header and can throw away the header since a new one will appear
2082  * later.  If we have at least 4 bytes, then we can determine how many regions
2083  * will appear in the current log item.
2084  */
2085 STATIC int
2086 xlog_recover_add_to_trans(
2087         struct xlog             *log,
2088         struct xlog_recover     *trans,
2089         char                    *dp,
2090         int                     len)
2091 {
2092         struct xfs_inode_log_format     *in_f;                  /* any will do */
2093         struct xlog_recover_item *item;
2094         char                    *ptr;
2095
2096         if (!len)
2097                 return 0;
2098         if (list_empty(&trans->r_itemq)) {
2099                 /* we need to catch log corruptions here */
2100                 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
2101                         xfs_warn(log->l_mp, "%s: bad header magic number",
2102                                 __func__);
2103                         ASSERT(0);
2104                         return -EFSCORRUPTED;
2105                 }
2106
2107                 if (len > sizeof(struct xfs_trans_header)) {
2108                         xfs_warn(log->l_mp, "%s: bad header length", __func__);
2109                         ASSERT(0);
2110                         return -EFSCORRUPTED;
2111                 }
2112
2113                 /*
2114                  * The transaction header can be arbitrarily split across op
2115                  * records. If we don't have the whole thing here, copy what we
2116                  * do have and handle the rest in the next record.
2117                  */
2118                 if (len == sizeof(struct xfs_trans_header))
2119                         xlog_recover_add_item(&trans->r_itemq);
2120                 memcpy(&trans->r_theader, dp, len);
2121                 return 0;
2122         }
2123
2124         ptr = kmem_alloc(len, 0);
2125         memcpy(ptr, dp, len);
2126         in_f = (struct xfs_inode_log_format *)ptr;
2127
2128         /* take the tail entry */
2129         item = list_entry(trans->r_itemq.prev, struct xlog_recover_item,
2130                           ri_list);
2131         if (item->ri_total != 0 &&
2132              item->ri_total == item->ri_cnt) {
2133                 /* tail item is in use, get a new one */
2134                 xlog_recover_add_item(&trans->r_itemq);
2135                 item = list_entry(trans->r_itemq.prev,
2136                                         struct xlog_recover_item, ri_list);
2137         }
2138
2139         if (item->ri_total == 0) {              /* first region to be added */
2140                 if (in_f->ilf_size == 0 ||
2141                     in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
2142                         xfs_warn(log->l_mp,
2143                 "bad number of regions (%d) in inode log format",
2144                                   in_f->ilf_size);
2145                         ASSERT(0);
2146                         kmem_free(ptr);
2147                         return -EFSCORRUPTED;
2148                 }
2149
2150                 item->ri_total = in_f->ilf_size;
2151                 item->ri_buf =
2152                         kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
2153                                     0);
2154         }
2155
2156         if (item->ri_total <= item->ri_cnt) {
2157                 xfs_warn(log->l_mp,
2158         "log item region count (%d) overflowed size (%d)",
2159                                 item->ri_cnt, item->ri_total);
2160                 ASSERT(0);
2161                 kmem_free(ptr);
2162                 return -EFSCORRUPTED;
2163         }
2164
2165         /* Description region is ri_buf[0] */
2166         item->ri_buf[item->ri_cnt].i_addr = ptr;
2167         item->ri_buf[item->ri_cnt].i_len  = len;
2168         item->ri_cnt++;
2169         trace_xfs_log_recover_item_add(log, trans, item, 0);
2170         return 0;
2171 }
2172
2173 /*
2174  * Free up any resources allocated by the transaction
2175  *
2176  * Remember that EFIs, EFDs, and IUNLINKs are handled later.
2177  */
2178 STATIC void
2179 xlog_recover_free_trans(
2180         struct xlog_recover     *trans)
2181 {
2182         struct xlog_recover_item *item, *n;
2183         int                     i;
2184
2185         hlist_del_init(&trans->r_list);
2186
2187         list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
2188                 /* Free the regions in the item. */
2189                 list_del(&item->ri_list);
2190                 for (i = 0; i < item->ri_cnt; i++)
2191                         kmem_free(item->ri_buf[i].i_addr);
2192                 /* Free the item itself */
2193                 kmem_free(item->ri_buf);
2194                 kmem_free(item);
2195         }
2196         /* Free the transaction recover structure */
2197         kmem_free(trans);
2198 }
2199
2200 /*
2201  * On error or completion, trans is freed.
2202  */
2203 STATIC int
2204 xlog_recovery_process_trans(
2205         struct xlog             *log,
2206         struct xlog_recover     *trans,
2207         char                    *dp,
2208         unsigned int            len,
2209         unsigned int            flags,
2210         int                     pass,
2211         struct list_head        *buffer_list)
2212 {
2213         int                     error = 0;
2214         bool                    freeit = false;
2215
2216         /* mask off ophdr transaction container flags */
2217         flags &= ~XLOG_END_TRANS;
2218         if (flags & XLOG_WAS_CONT_TRANS)
2219                 flags &= ~XLOG_CONTINUE_TRANS;
2220
2221         /*
2222          * Callees must not free the trans structure. We'll decide if we need to
2223          * free it or not based on the operation being done and it's result.
2224          */
2225         switch (flags) {
2226         /* expected flag values */
2227         case 0:
2228         case XLOG_CONTINUE_TRANS:
2229                 error = xlog_recover_add_to_trans(log, trans, dp, len);
2230                 break;
2231         case XLOG_WAS_CONT_TRANS:
2232                 error = xlog_recover_add_to_cont_trans(log, trans, dp, len);
2233                 break;
2234         case XLOG_COMMIT_TRANS:
2235                 error = xlog_recover_commit_trans(log, trans, pass,
2236                                                   buffer_list);
2237                 /* success or fail, we are now done with this transaction. */
2238                 freeit = true;
2239                 break;
2240
2241         /* unexpected flag values */
2242         case XLOG_UNMOUNT_TRANS:
2243                 /* just skip trans */
2244                 xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
2245                 freeit = true;
2246                 break;
2247         case XLOG_START_TRANS:
2248         default:
2249                 xfs_warn(log->l_mp, "%s: bad flag 0x%x", __func__, flags);
2250                 ASSERT(0);
2251                 error = -EFSCORRUPTED;
2252                 break;
2253         }
2254         if (error || freeit)
2255                 xlog_recover_free_trans(trans);
2256         return error;
2257 }
2258
2259 /*
2260  * Lookup the transaction recovery structure associated with the ID in the
2261  * current ophdr. If the transaction doesn't exist and the start flag is set in
2262  * the ophdr, then allocate a new transaction for future ID matches to find.
2263  * Either way, return what we found during the lookup - an existing transaction
2264  * or nothing.
2265  */
2266 STATIC struct xlog_recover *
2267 xlog_recover_ophdr_to_trans(
2268         struct hlist_head       rhash[],
2269         struct xlog_rec_header  *rhead,
2270         struct xlog_op_header   *ohead)
2271 {
2272         struct xlog_recover     *trans;
2273         xlog_tid_t              tid;
2274         struct hlist_head       *rhp;
2275
2276         tid = be32_to_cpu(ohead->oh_tid);
2277         rhp = &rhash[XLOG_RHASH(tid)];
2278         hlist_for_each_entry(trans, rhp, r_list) {
2279                 if (trans->r_log_tid == tid)
2280                         return trans;
2281         }
2282
2283         /*
2284          * skip over non-start transaction headers - we could be
2285          * processing slack space before the next transaction starts
2286          */
2287         if (!(ohead->oh_flags & XLOG_START_TRANS))
2288                 return NULL;
2289
2290         ASSERT(be32_to_cpu(ohead->oh_len) == 0);
2291
2292         /*
2293          * This is a new transaction so allocate a new recovery container to
2294          * hold the recovery ops that will follow.
2295          */
2296         trans = kmem_zalloc(sizeof(struct xlog_recover), 0);
2297         trans->r_log_tid = tid;
2298         trans->r_lsn = be64_to_cpu(rhead->h_lsn);
2299         INIT_LIST_HEAD(&trans->r_itemq);
2300         INIT_HLIST_NODE(&trans->r_list);
2301         hlist_add_head(&trans->r_list, rhp);
2302
2303         /*
2304          * Nothing more to do for this ophdr. Items to be added to this new
2305          * transaction will be in subsequent ophdr containers.
2306          */
2307         return NULL;
2308 }
2309
2310 STATIC int
2311 xlog_recover_process_ophdr(
2312         struct xlog             *log,
2313         struct hlist_head       rhash[],
2314         struct xlog_rec_header  *rhead,
2315         struct xlog_op_header   *ohead,
2316         char                    *dp,
2317         char                    *end,
2318         int                     pass,
2319         struct list_head        *buffer_list)
2320 {
2321         struct xlog_recover     *trans;
2322         unsigned int            len;
2323         int                     error;
2324
2325         /* Do we understand who wrote this op? */
2326         if (ohead->oh_clientid != XFS_TRANSACTION &&
2327             ohead->oh_clientid != XFS_LOG) {
2328                 xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
2329                         __func__, ohead->oh_clientid);
2330                 ASSERT(0);
2331                 return -EFSCORRUPTED;
2332         }
2333
2334         /*
2335          * Check the ophdr contains all the data it is supposed to contain.
2336          */
2337         len = be32_to_cpu(ohead->oh_len);
2338         if (dp + len > end) {
2339                 xfs_warn(log->l_mp, "%s: bad length 0x%x", __func__, len);
2340                 WARN_ON(1);
2341                 return -EFSCORRUPTED;
2342         }
2343
2344         trans = xlog_recover_ophdr_to_trans(rhash, rhead, ohead);
2345         if (!trans) {
2346                 /* nothing to do, so skip over this ophdr */
2347                 return 0;
2348         }
2349
2350         /*
2351          * The recovered buffer queue is drained only once we know that all
2352          * recovery items for the current LSN have been processed. This is
2353          * required because:
2354          *
2355          * - Buffer write submission updates the metadata LSN of the buffer.
2356          * - Log recovery skips items with a metadata LSN >= the current LSN of
2357          *   the recovery item.
2358          * - Separate recovery items against the same metadata buffer can share
2359          *   a current LSN. I.e., consider that the LSN of a recovery item is
2360          *   defined as the starting LSN of the first record in which its
2361          *   transaction appears, that a record can hold multiple transactions,
2362          *   and/or that a transaction can span multiple records.
2363          *
2364          * In other words, we are allowed to submit a buffer from log recovery
2365          * once per current LSN. Otherwise, we may incorrectly skip recovery
2366          * items and cause corruption.
2367          *
2368          * We don't know up front whether buffers are updated multiple times per
2369          * LSN. Therefore, track the current LSN of each commit log record as it
2370          * is processed and drain the queue when it changes. Use commit records
2371          * because they are ordered correctly by the logging code.
2372          */
2373         if (log->l_recovery_lsn != trans->r_lsn &&
2374             ohead->oh_flags & XLOG_COMMIT_TRANS) {
2375                 error = xfs_buf_delwri_submit(buffer_list);
2376                 if (error)
2377                         return error;
2378                 log->l_recovery_lsn = trans->r_lsn;
2379         }
2380
2381         return xlog_recovery_process_trans(log, trans, dp, len,
2382                                            ohead->oh_flags, pass, buffer_list);
2383 }
2384
2385 /*
2386  * There are two valid states of the r_state field.  0 indicates that the
2387  * transaction structure is in a normal state.  We have either seen the
2388  * start of the transaction or the last operation we added was not a partial
2389  * operation.  If the last operation we added to the transaction was a
2390  * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
2391  *
2392  * NOTE: skip LRs with 0 data length.
2393  */
2394 STATIC int
2395 xlog_recover_process_data(
2396         struct xlog             *log,
2397         struct hlist_head       rhash[],
2398         struct xlog_rec_header  *rhead,
2399         char                    *dp,
2400         int                     pass,
2401         struct list_head        *buffer_list)
2402 {
2403         struct xlog_op_header   *ohead;
2404         char                    *end;
2405         int                     num_logops;
2406         int                     error;
2407
2408         end = dp + be32_to_cpu(rhead->h_len);
2409         num_logops = be32_to_cpu(rhead->h_num_logops);
2410
2411         /* check the log format matches our own - else we can't recover */
2412         if (xlog_header_check_recover(log->l_mp, rhead))
2413                 return -EIO;
2414
2415         trace_xfs_log_recover_record(log, rhead, pass);
2416         while ((dp < end) && num_logops) {
2417
2418                 ohead = (struct xlog_op_header *)dp;
2419                 dp += sizeof(*ohead);
2420                 ASSERT(dp <= end);
2421
2422                 /* errors will abort recovery */
2423                 error = xlog_recover_process_ophdr(log, rhash, rhead, ohead,
2424                                                    dp, end, pass, buffer_list);
2425                 if (error)
2426                         return error;
2427
2428                 dp += be32_to_cpu(ohead->oh_len);
2429                 num_logops--;
2430         }
2431         return 0;
2432 }
2433
2434 /* Take all the collected deferred ops and finish them in order. */
2435 static int
2436 xlog_finish_defer_ops(
2437         struct xfs_mount        *mp,
2438         struct list_head        *capture_list)
2439 {
2440         struct xfs_defer_capture *dfc, *next;
2441         struct xfs_trans        *tp;
2442         struct xfs_inode        *ip;
2443         int                     error = 0;
2444
2445         list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
2446                 struct xfs_trans_res    resv;
2447
2448                 /*
2449                  * Create a new transaction reservation from the captured
2450                  * information.  Set logcount to 1 to force the new transaction
2451                  * to regrant every roll so that we can make forward progress
2452                  * in recovery no matter how full the log might be.
2453                  */
2454                 resv.tr_logres = dfc->dfc_logres;
2455                 resv.tr_logcount = 1;
2456                 resv.tr_logflags = XFS_TRANS_PERM_LOG_RES;
2457
2458                 error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres,
2459                                 dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp);
2460                 if (error)
2461                         return error;
2462
2463                 /*
2464                  * Transfer to this new transaction all the dfops we captured
2465                  * from recovering a single intent item.
2466                  */
2467                 list_del_init(&dfc->dfc_list);
2468                 xfs_defer_ops_continue(dfc, tp, &ip);
2469
2470                 error = xfs_trans_commit(tp);
2471                 if (ip) {
2472                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
2473                         xfs_irele(ip);
2474                 }
2475                 if (error)
2476                         return error;
2477         }
2478
2479         ASSERT(list_empty(capture_list));
2480         return 0;
2481 }
2482
2483 /* Release all the captured defer ops and capture structures in this list. */
2484 static void
2485 xlog_abort_defer_ops(
2486         struct xfs_mount                *mp,
2487         struct list_head                *capture_list)
2488 {
2489         struct xfs_defer_capture        *dfc;
2490         struct xfs_defer_capture        *next;
2491
2492         list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
2493                 list_del_init(&dfc->dfc_list);
2494                 xfs_defer_ops_release(mp, dfc);
2495         }
2496 }
2497 /*
2498  * When this is called, all of the log intent items which did not have
2499  * corresponding log done items should be in the AIL.  What we do now
2500  * is update the data structures associated with each one.
2501  *
2502  * Since we process the log intent items in normal transactions, they
2503  * will be removed at some point after the commit.  This prevents us
2504  * from just walking down the list processing each one.  We'll use a
2505  * flag in the intent item to skip those that we've already processed
2506  * and use the AIL iteration mechanism's generation count to try to
2507  * speed this up at least a bit.
2508  *
2509  * When we start, we know that the intents are the only things in the
2510  * AIL.  As we process them, however, other items are added to the
2511  * AIL.
2512  */
2513 STATIC int
2514 xlog_recover_process_intents(
2515         struct xlog             *log)
2516 {
2517         LIST_HEAD(capture_list);
2518         struct xfs_ail_cursor   cur;
2519         struct xfs_log_item     *lip;
2520         struct xfs_ail          *ailp;
2521         int                     error = 0;
2522 #if defined(DEBUG) || defined(XFS_WARN)
2523         xfs_lsn_t               last_lsn;
2524 #endif
2525
2526         ailp = log->l_ailp;
2527         spin_lock(&ailp->ail_lock);
2528 #if defined(DEBUG) || defined(XFS_WARN)
2529         last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
2530 #endif
2531         for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
2532              lip != NULL;
2533              lip = xfs_trans_ail_cursor_next(ailp, &cur)) {
2534                 /*
2535                  * We're done when we see something other than an intent.
2536                  * There should be no intents left in the AIL now.
2537                  */
2538                 if (!xlog_item_is_intent(lip)) {
2539 #ifdef DEBUG
2540                         for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
2541                                 ASSERT(!xlog_item_is_intent(lip));
2542 #endif
2543                         break;
2544                 }
2545
2546                 /*
2547                  * We should never see a redo item with a LSN higher than
2548                  * the last transaction we found in the log at the start
2549                  * of recovery.
2550                  */
2551                 ASSERT(XFS_LSN_CMP(last_lsn, lip->li_lsn) >= 0);
2552
2553                 /*
2554                  * NOTE: If your intent processing routine can create more
2555                  * deferred ops, you /must/ attach them to the capture list in
2556                  * the recover routine or else those subsequent intents will be
2557                  * replayed in the wrong order!
2558                  */
2559                 spin_unlock(&ailp->ail_lock);
2560                 error = lip->li_ops->iop_recover(lip, &capture_list);
2561                 spin_lock(&ailp->ail_lock);
2562                 if (error) {
2563                         trace_xlog_intent_recovery_failed(log->l_mp, error,
2564                                         lip->li_ops->iop_recover);
2565                         break;
2566                 }
2567         }
2568
2569         xfs_trans_ail_cursor_done(&cur);
2570         spin_unlock(&ailp->ail_lock);
2571         if (error)
2572                 goto err;
2573
2574         error = xlog_finish_defer_ops(log->l_mp, &capture_list);
2575         if (error)
2576                 goto err;
2577
2578         return 0;
2579 err:
2580         xlog_abort_defer_ops(log->l_mp, &capture_list);
2581         return error;
2582 }
2583
2584 /*
2585  * A cancel occurs when the mount has failed and we're bailing out.
2586  * Release all pending log intent items so they don't pin the AIL.
2587  */
2588 STATIC void
2589 xlog_recover_cancel_intents(
2590         struct xlog             *log)
2591 {
2592         struct xfs_log_item     *lip;
2593         struct xfs_ail_cursor   cur;
2594         struct xfs_ail          *ailp;
2595
2596         ailp = log->l_ailp;
2597         spin_lock(&ailp->ail_lock);
2598         lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
2599         while (lip != NULL) {
2600                 /*
2601                  * We're done when we see something other than an intent.
2602                  * There should be no intents left in the AIL now.
2603                  */
2604                 if (!xlog_item_is_intent(lip)) {
2605 #ifdef DEBUG
2606                         for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
2607                                 ASSERT(!xlog_item_is_intent(lip));
2608 #endif
2609                         break;
2610                 }
2611
2612                 spin_unlock(&ailp->ail_lock);
2613                 lip->li_ops->iop_release(lip);
2614                 spin_lock(&ailp->ail_lock);
2615                 lip = xfs_trans_ail_cursor_next(ailp, &cur);
2616         }
2617
2618         xfs_trans_ail_cursor_done(&cur);
2619         spin_unlock(&ailp->ail_lock);
2620 }
2621
2622 /*
2623  * This routine performs a transaction to null out a bad inode pointer
2624  * in an agi unlinked inode hash bucket.
2625  */
2626 STATIC void
2627 xlog_recover_clear_agi_bucket(
2628         xfs_mount_t     *mp,
2629         xfs_agnumber_t  agno,
2630         int             bucket)
2631 {
2632         xfs_trans_t     *tp;
2633         xfs_agi_t       *agi;
2634         struct xfs_buf  *agibp;
2635         int             offset;
2636         int             error;
2637
2638         error = xfs_trans_alloc(mp, &M_RES(mp)->tr_clearagi, 0, 0, 0, &tp);
2639         if (error)
2640                 goto out_error;
2641
2642         error = xfs_read_agi(mp, tp, agno, &agibp);
2643         if (error)
2644                 goto out_abort;
2645
2646         agi = agibp->b_addr;
2647         agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
2648         offset = offsetof(xfs_agi_t, agi_unlinked) +
2649                  (sizeof(xfs_agino_t) * bucket);
2650         xfs_trans_log_buf(tp, agibp, offset,
2651                           (offset + sizeof(xfs_agino_t) - 1));
2652
2653         error = xfs_trans_commit(tp);
2654         if (error)
2655                 goto out_error;
2656         return;
2657
2658 out_abort:
2659         xfs_trans_cancel(tp);
2660 out_error:
2661         xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno);
2662         return;
2663 }
2664
2665 STATIC xfs_agino_t
2666 xlog_recover_process_one_iunlink(
2667         struct xfs_mount                *mp,
2668         xfs_agnumber_t                  agno,
2669         xfs_agino_t                     agino,
2670         int                             bucket)
2671 {
2672         struct xfs_buf                  *ibp;
2673         struct xfs_dinode               *dip;
2674         struct xfs_inode                *ip;
2675         xfs_ino_t                       ino;
2676         int                             error;
2677
2678         ino = XFS_AGINO_TO_INO(mp, agno, agino);
2679         error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
2680         if (error)
2681                 goto fail;
2682
2683         /*
2684          * Get the on disk inode to find the next inode in the bucket.
2685          */
2686         error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0);
2687         if (error)
2688                 goto fail_iput;
2689
2690         xfs_iflags_clear(ip, XFS_IRECOVERY);
2691         ASSERT(VFS_I(ip)->i_nlink == 0);
2692         ASSERT(VFS_I(ip)->i_mode != 0);
2693
2694         /* setup for the next pass */
2695         agino = be32_to_cpu(dip->di_next_unlinked);
2696         xfs_buf_relse(ibp);
2697
2698         /*
2699          * Prevent any DMAPI event from being sent when the reference on
2700          * the inode is dropped.
2701          */
2702         ip->i_d.di_dmevmask = 0;
2703
2704         xfs_irele(ip);
2705         return agino;
2706
2707  fail_iput:
2708         xfs_irele(ip);
2709  fail:
2710         /*
2711          * We can't read in the inode this bucket points to, or this inode
2712          * is messed up.  Just ditch this bucket of inodes.  We will lose
2713          * some inodes and space, but at least we won't hang.
2714          *
2715          * Call xlog_recover_clear_agi_bucket() to perform a transaction to
2716          * clear the inode pointer in the bucket.
2717          */
2718         xlog_recover_clear_agi_bucket(mp, agno, bucket);
2719         return NULLAGINO;
2720 }
2721
2722 /*
2723  * Recover AGI unlinked lists
2724  *
2725  * This is called during recovery to process any inodes which we unlinked but
2726  * not freed when the system crashed.  These inodes will be on the lists in the
2727  * AGI blocks. What we do here is scan all the AGIs and fully truncate and free
2728  * any inodes found on the lists. Each inode is removed from the lists when it
2729  * has been fully truncated and is freed. The freeing of the inode and its
2730  * removal from the list must be atomic.
2731  *
2732  * If everything we touch in the agi processing loop is already in memory, this
2733  * loop can hold the cpu for a long time. It runs without lock contention,
2734  * memory allocation contention, the need wait for IO, etc, and so will run
2735  * until we either run out of inodes to process, run low on memory or we run out
2736  * of log space.
2737  *
2738  * This behaviour is bad for latency on single CPU and non-preemptible kernels,
2739  * and can prevent other filesytem work (such as CIL pushes) from running. This
2740  * can lead to deadlocks if the recovery process runs out of log reservation
2741  * space. Hence we need to yield the CPU when there is other kernel work
2742  * scheduled on this CPU to ensure other scheduled work can run without undue
2743  * latency.
2744  */
2745 STATIC void
2746 xlog_recover_process_iunlinks(
2747         struct xlog     *log)
2748 {
2749         xfs_mount_t     *mp;
2750         xfs_agnumber_t  agno;
2751         xfs_agi_t       *agi;
2752         struct xfs_buf  *agibp;
2753         xfs_agino_t     agino;
2754         int             bucket;
2755         int             error;
2756
2757         mp = log->l_mp;
2758
2759         for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
2760                 /*
2761                  * Find the agi for this ag.
2762                  */
2763                 error = xfs_read_agi(mp, NULL, agno, &agibp);
2764                 if (error) {
2765                         /*
2766                          * AGI is b0rked. Don't process it.
2767                          *
2768                          * We should probably mark the filesystem as corrupt
2769                          * after we've recovered all the ag's we can....
2770                          */
2771                         continue;
2772                 }
2773                 /*
2774                  * Unlock the buffer so that it can be acquired in the normal
2775                  * course of the transaction to truncate and free each inode.
2776                  * Because we are not racing with anyone else here for the AGI
2777                  * buffer, we don't even need to hold it locked to read the
2778                  * initial unlinked bucket entries out of the buffer. We keep
2779                  * buffer reference though, so that it stays pinned in memory
2780                  * while we need the buffer.
2781                  */
2782                 agi = agibp->b_addr;
2783                 xfs_buf_unlock(agibp);
2784
2785                 for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
2786                         agino = be32_to_cpu(agi->agi_unlinked[bucket]);
2787                         while (agino != NULLAGINO) {
2788                                 agino = xlog_recover_process_one_iunlink(mp,
2789                                                         agno, agino, bucket);
2790                                 cond_resched();
2791                         }
2792                 }
2793                 xfs_buf_rele(agibp);
2794         }
2795 }
2796
2797 STATIC void
2798 xlog_unpack_data(
2799         struct xlog_rec_header  *rhead,
2800         char                    *dp,
2801         struct xlog             *log)
2802 {
2803         int                     i, j, k;
2804
2805         for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
2806                   i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
2807                 *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
2808                 dp += BBSIZE;
2809         }
2810
2811         if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
2812                 xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
2813                 for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
2814                         j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
2815                         k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
2816                         *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
2817                         dp += BBSIZE;
2818                 }
2819         }
2820 }
2821
2822 /*
2823  * CRC check, unpack and process a log record.
2824  */
2825 STATIC int
2826 xlog_recover_process(
2827         struct xlog             *log,
2828         struct hlist_head       rhash[],
2829         struct xlog_rec_header  *rhead,
2830         char                    *dp,
2831         int                     pass,
2832         struct list_head        *buffer_list)
2833 {
2834         __le32                  old_crc = rhead->h_crc;
2835         __le32                  crc;
2836
2837         crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
2838
2839         /*
2840          * Nothing else to do if this is a CRC verification pass. Just return
2841          * if this a record with a non-zero crc. Unfortunately, mkfs always
2842          * sets old_crc to 0 so we must consider this valid even on v5 supers.
2843          * Otherwise, return EFSBADCRC on failure so the callers up the stack
2844          * know precisely what failed.
2845          */
2846         if (pass == XLOG_RECOVER_CRCPASS) {
2847                 if (old_crc && crc != old_crc)
2848                         return -EFSBADCRC;
2849                 return 0;
2850         }
2851
2852         /*
2853          * We're in the normal recovery path. Issue a warning if and only if the
2854          * CRC in the header is non-zero. This is an advisory warning and the
2855          * zero CRC check prevents warnings from being emitted when upgrading
2856          * the kernel from one that does not add CRCs by default.
2857          */
2858         if (crc != old_crc) {
2859                 if (old_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
2860                         xfs_alert(log->l_mp,
2861                 "log record CRC mismatch: found 0x%x, expected 0x%x.",
2862                                         le32_to_cpu(old_crc),
2863                                         le32_to_cpu(crc));
2864                         xfs_hex_dump(dp, 32);
2865                 }
2866
2867                 /*
2868                  * If the filesystem is CRC enabled, this mismatch becomes a
2869                  * fatal log corruption failure.
2870                  */
2871                 if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
2872                         XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
2873                         return -EFSCORRUPTED;
2874                 }
2875         }
2876
2877         xlog_unpack_data(rhead, dp, log);
2878
2879         return xlog_recover_process_data(log, rhash, rhead, dp, pass,
2880                                          buffer_list);
2881 }
2882
/*
 * Sanity check a log record header that was read from log block @blkno.
 *
 * Returns 0 when the magic number, version bits, record body length and
 * block number all pass the checks below, -EFSCORRUPTED otherwise.  @bufsize
 * is the upper bound enforced on the record body length (h_len).
 */
STATIC int
xlog_valid_rec_header(
        struct xlog             *log,
        struct xlog_rec_header  *rhead,
        xfs_daddr_t             blkno,
        int                     bufsize)
{
        int                     hlen;

        /* The header must carry the log record magic number. */
        if (XFS_IS_CORRUPT(log->l_mp,
                           rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM)))
                return -EFSCORRUPTED;
        /* A zero version or any bit outside XLOG_VERSION_OKBITS is rejected. */
        if (XFS_IS_CORRUPT(log->l_mp,
                           (!rhead->h_version ||
                           (be32_to_cpu(rhead->h_version) &
                            (~XLOG_VERSION_OKBITS))))) {
                xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
                        __func__, be32_to_cpu(rhead->h_version));
                return -EFSCORRUPTED;
        }

        /*
         * LR body must have data (or it wouldn't have been written)
         * and h_len must not be greater than LR buffer size.
         */
        hlen = be32_to_cpu(rhead->h_len);
        if (XFS_IS_CORRUPT(log->l_mp, hlen <= 0 || hlen > bufsize))
                return -EFSCORRUPTED;

        /*
         * The block number must not exceed the log size or INT_MAX.
         *
         * NOTE(review): blkno == log->l_logBBsize passes this check, whereas
         * xlog_verify_bno() treats blk_no >= l_logBBsize as invalid - confirm
         * that '>' rather than '>=' is intended here.
         */
        if (XFS_IS_CORRUPT(log->l_mp,
                           blkno > log->l_logBBsize || blkno > INT_MAX))
                return -EFSCORRUPTED;
        return 0;
}
2917
2918 /*
2919  * Read the log from tail to head and process the log records found.
2920  * Handle the two cases where the tail and head are in the same cycle
2921  * and where the active portion of the log wraps around the end of
2922  * the physical log separately.  The pass parameter is passed through
2923  * to the routines called to process the data and is not looked at
2924  * here.
2925  */
2926 STATIC int
2927 xlog_do_recovery_pass(
2928         struct xlog             *log,
2929         xfs_daddr_t             head_blk,
2930         xfs_daddr_t             tail_blk,
2931         int                     pass,
2932         xfs_daddr_t             *first_bad)     /* out: first bad log rec */
2933 {
2934         xlog_rec_header_t       *rhead;
2935         xfs_daddr_t             blk_no, rblk_no;
2936         xfs_daddr_t             rhead_blk;
2937         char                    *offset;
2938         char                    *hbp, *dbp;
2939         int                     error = 0, h_size, h_len;
2940         int                     error2 = 0;
2941         int                     bblks, split_bblks;
2942         int                     hblks, split_hblks, wrapped_hblks;
2943         int                     i;
2944         struct hlist_head       rhash[XLOG_RHASH_SIZE];
2945         LIST_HEAD               (buffer_list);
2946
2947         ASSERT(head_blk != tail_blk);
2948         blk_no = rhead_blk = tail_blk;
2949
2950         for (i = 0; i < XLOG_RHASH_SIZE; i++)
2951                 INIT_HLIST_HEAD(&rhash[i]);
2952
2953         /*
2954          * Read the header of the tail block and get the iclog buffer size from
2955          * h_size.  Use this to tell how many sectors make up the log header.
2956          */
2957         if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
2958                 /*
2959                  * When using variable length iclogs, read first sector of
2960                  * iclog header and extract the header size from it.  Get a
2961                  * new hbp that is the correct size.
2962                  */
2963                 hbp = xlog_alloc_buffer(log, 1);
2964                 if (!hbp)
2965                         return -ENOMEM;
2966
2967                 error = xlog_bread(log, tail_blk, 1, hbp, &offset);
2968                 if (error)
2969                         goto bread_err1;
2970
2971                 rhead = (xlog_rec_header_t *)offset;
2972
2973                 /*
2974                  * xfsprogs has a bug where record length is based on lsunit but
2975                  * h_size (iclog size) is hardcoded to 32k. Now that we
2976                  * unconditionally CRC verify the unmount record, this means the
2977                  * log buffer can be too small for the record and cause an
2978                  * overrun.
2979                  *
2980                  * Detect this condition here. Use lsunit for the buffer size as
2981                  * long as this looks like the mkfs case. Otherwise, return an
2982                  * error to avoid a buffer overrun.
2983                  */
2984                 h_size = be32_to_cpu(rhead->h_size);
2985                 h_len = be32_to_cpu(rhead->h_len);
2986                 if (h_len > h_size && h_len <= log->l_mp->m_logbsize &&
2987                     rhead->h_num_logops == cpu_to_be32(1)) {
2988                         xfs_warn(log->l_mp,
2989                 "invalid iclog size (%d bytes), using lsunit (%d bytes)",
2990                                  h_size, log->l_mp->m_logbsize);
2991                         h_size = log->l_mp->m_logbsize;
2992                 }
2993
2994                 error = xlog_valid_rec_header(log, rhead, tail_blk, h_size);
2995                 if (error)
2996                         goto bread_err1;
2997
2998                 hblks = xlog_logrec_hblks(log, rhead);
2999                 if (hblks != 1) {
3000                         kmem_free(hbp);
3001                         hbp = xlog_alloc_buffer(log, hblks);
3002                 }
3003         } else {
3004                 ASSERT(log->l_sectBBsize == 1);
3005                 hblks = 1;
3006                 hbp = xlog_alloc_buffer(log, 1);
3007                 h_size = XLOG_BIG_RECORD_BSIZE;
3008         }
3009
3010         if (!hbp)
3011                 return -ENOMEM;
3012         dbp = xlog_alloc_buffer(log, BTOBB(h_size));
3013         if (!dbp) {
3014                 kmem_free(hbp);
3015                 return -ENOMEM;
3016         }
3017
3018         memset(rhash, 0, sizeof(rhash));
3019         if (tail_blk > head_blk) {
3020                 /*
3021                  * Perform recovery around the end of the physical log.
3022                  * When the head is not on the same cycle number as the tail,
3023                  * we can't do a sequential recovery.
3024                  */
3025                 while (blk_no < log->l_logBBsize) {
3026                         /*
3027                          * Check for header wrapping around physical end-of-log
3028                          */
3029                         offset = hbp;
3030                         split_hblks = 0;
3031                         wrapped_hblks = 0;
3032                         if (blk_no + hblks <= log->l_logBBsize) {
3033                                 /* Read header in one read */
3034                                 error = xlog_bread(log, blk_no, hblks, hbp,
3035                                                    &offset);
3036                                 if (error)
3037                                         goto bread_err2;
3038                         } else {
3039                                 /* This LR is split across physical log end */
3040                                 if (blk_no != log->l_logBBsize) {
3041                                         /* some data before physical log end */
3042                                         ASSERT(blk_no <= INT_MAX);
3043                                         split_hblks = log->l_logBBsize - (int)blk_no;
3044                                         ASSERT(split_hblks > 0);
3045                                         error = xlog_bread(log, blk_no,
3046                                                            split_hblks, hbp,
3047                                                            &offset);
3048                                         if (error)
3049                                                 goto bread_err2;
3050                                 }
3051
3052                                 /*
3053                                  * Note: this black magic still works with
3054                                  * large sector sizes (non-512) only because:
3055                                  * - we increased the buffer size originally
3056                                  *   by 1 sector giving us enough extra space
3057                                  *   for the second read;
3058                                  * - the log start is guaranteed to be sector
3059                                  *   aligned;
3060                                  * - we read the log end (LR header start)
3061                                  *   _first_, then the log start (LR header end)
3062                                  *   - order is important.
3063                                  */
3064                                 wrapped_hblks = hblks - split_hblks;
3065                                 error = xlog_bread_noalign(log, 0,
3066                                                 wrapped_hblks,
3067                                                 offset + BBTOB(split_hblks));
3068                                 if (error)
3069                                         goto bread_err2;
3070                         }
3071                         rhead = (xlog_rec_header_t *)offset;
3072                         error = xlog_valid_rec_header(log, rhead,
3073                                         split_hblks ? blk_no : 0, h_size);
3074                         if (error)
3075                                 goto bread_err2;
3076
3077                         bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3078                         blk_no += hblks;
3079
3080                         /*
3081                          * Read the log record data in multiple reads if it
3082                          * wraps around the end of the log. Note that if the
3083                          * header already wrapped, blk_no could point past the
3084                          * end of the log. The record data is contiguous in
3085                          * that case.
3086                          */
3087                         if (blk_no + bblks <= log->l_logBBsize ||
3088                             blk_no >= log->l_logBBsize) {
3089                                 rblk_no = xlog_wrap_logbno(log, blk_no);
3090                                 error = xlog_bread(log, rblk_no, bblks, dbp,
3091                                                    &offset);
3092                                 if (error)
3093                                         goto bread_err2;
3094                         } else {
3095                                 /* This log record is split across the
3096                                  * physical end of log */
3097                                 offset = dbp;
3098                                 split_bblks = 0;
3099                                 if (blk_no != log->l_logBBsize) {
3100                                         /* some data is before the physical
3101                                          * end of log */
3102                                         ASSERT(!wrapped_hblks);
3103                                         ASSERT(blk_no <= INT_MAX);
3104                                         split_bblks =
3105                                                 log->l_logBBsize - (int)blk_no;
3106                                         ASSERT(split_bblks > 0);
3107                                         error = xlog_bread(log, blk_no,
3108                                                         split_bblks, dbp,
3109                                                         &offset);
3110                                         if (error)
3111                                                 goto bread_err2;
3112                                 }
3113
3114                                 /*
3115                                  * Note: this black magic still works with
3116                                  * large sector sizes (non-512) only because:
3117                                  * - we increased the buffer size originally
3118                                  *   by 1 sector giving us enough extra space
3119                                  *   for the second read;
3120                                  * - the log start is guaranteed to be sector
3121                                  *   aligned;
3122                                  * - we read the log end (LR header start)
3123                                  *   _first_, then the log start (LR header end)
3124                                  *   - order is important.
3125                                  */
3126                                 error = xlog_bread_noalign(log, 0,
3127                                                 bblks - split_bblks,
3128                                                 offset + BBTOB(split_bblks));
3129                                 if (error)
3130                                         goto bread_err2;
3131                         }
3132
3133                         error = xlog_recover_process(log, rhash, rhead, offset,
3134                                                      pass, &buffer_list);
3135                         if (error)
3136                                 goto bread_err2;
3137
3138                         blk_no += bblks;
3139                         rhead_blk = blk_no;
3140                 }
3141
3142                 ASSERT(blk_no >= log->l_logBBsize);
3143                 blk_no -= log->l_logBBsize;
3144                 rhead_blk = blk_no;
3145         }
3146
3147         /* read first part of physical log */
3148         while (blk_no < head_blk) {
3149                 error = xlog_bread(log, blk_no, hblks, hbp, &offset);
3150                 if (error)
3151                         goto bread_err2;
3152
3153                 rhead = (xlog_rec_header_t *)offset;
3154                 error = xlog_valid_rec_header(log, rhead, blk_no, h_size);
3155                 if (error)
3156                         goto bread_err2;
3157
3158                 /* blocks in data section */
3159                 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3160                 error = xlog_bread(log, blk_no+hblks, bblks, dbp,
3161                                    &offset);
3162                 if (error)
3163                         goto bread_err2;
3164
3165                 error = xlog_recover_process(log, rhash, rhead, offset, pass,
3166                                              &buffer_list);
3167                 if (error)
3168                         goto bread_err2;
3169
3170                 blk_no += bblks + hblks;
3171                 rhead_blk = blk_no;
3172         }
3173
3174  bread_err2:
3175         kmem_free(dbp);
3176  bread_err1:
3177         kmem_free(hbp);
3178
3179         /*
3180          * Submit buffers that have been added from the last record processed,
3181          * regardless of error status.
3182          */
3183         if (!list_empty(&buffer_list))
3184                 error2 = xfs_buf_delwri_submit(&buffer_list);
3185
3186         if (error && first_bad)
3187                 *first_bad = rhead_blk;
3188
3189         /*
3190          * Transactions are freed at commit time but transactions without commit
3191          * records on disk are never committed. Free any that may be left in the
3192          * hash table.
3193          */
3194         for (i = 0; i < XLOG_RHASH_SIZE; i++) {
3195                 struct hlist_node       *tmp;
3196                 struct xlog_recover     *trans;
3197
3198                 hlist_for_each_entry_safe(trans, tmp, &rhash[i], r_list)
3199                         xlog_recover_free_trans(trans);
3200         }
3201
3202         return error ? error : error2;
3203 }
3204
3205 /*
3206  * Do the recovery of the log.  We actually do this in two phases.
3207  * The two passes are necessary in order to implement the function
3208  * of cancelling a record written into the log.  The first pass
3209  * determines those things which have been cancelled, and the
3210  * second pass replays log items normally except for those which
3211  * have been cancelled.  The handling of the replay and cancellations
3212  * takes place in the log item type specific routines.
3213  *
3214  * The table of items which have cancel records in the log is allocated
3215  * and freed at this level, since only here do we know when all of
3216  * the log recovery has been completed.
3217  */
3218 STATIC int
3219 xlog_do_log_recovery(
3220         struct xlog     *log,
3221         xfs_daddr_t     head_blk,
3222         xfs_daddr_t     tail_blk)
3223 {
3224         int             error, i;
3225
3226         ASSERT(head_blk != tail_blk);
3227
3228         /*
3229          * First do a pass to find all of the cancelled buf log items.
3230          * Store them in the buf_cancel_table for use in the second pass.
3231          */
3232         log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
3233                                                  sizeof(struct list_head),
3234                                                  0);
3235         for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3236                 INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
3237
3238         error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3239                                       XLOG_RECOVER_PASS1, NULL);
3240         if (error != 0) {
3241                 kmem_free(log->l_buf_cancel_table);
3242                 log->l_buf_cancel_table = NULL;
3243                 return error;
3244         }
3245         /*
3246          * Then do a second pass to actually recover the items in the log.
3247          * When it is complete free the table of buf cancel items.
3248          */
3249         error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3250                                       XLOG_RECOVER_PASS2, NULL);
3251 #ifdef DEBUG
3252         if (!error) {
3253                 int     i;
3254
3255                 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3256                         ASSERT(list_empty(&log->l_buf_cancel_table[i]));
3257         }
3258 #endif  /* DEBUG */
3259
3260         kmem_free(log->l_buf_cancel_table);
3261         log->l_buf_cancel_table = NULL;
3262
3263         return error;
3264 }
3265
3266 /*
3267  * Do the actual recovery
3268  */
3269 STATIC int
3270 xlog_do_recover(
3271         struct xlog             *log,
3272         xfs_daddr_t             head_blk,
3273         xfs_daddr_t             tail_blk)
3274 {
3275         struct xfs_mount        *mp = log->l_mp;
3276         struct xfs_buf          *bp = mp->m_sb_bp;
3277         struct xfs_sb           *sbp = &mp->m_sb;
3278         int                     error;
3279
3280         trace_xfs_log_recover(log, head_blk, tail_blk);
3281
3282         /*
3283          * First replay the images in the log.
3284          */
3285         error = xlog_do_log_recovery(log, head_blk, tail_blk);
3286         if (error)
3287                 return error;
3288
3289         /*
3290          * If IO errors happened during recovery, bail out.
3291          */
3292         if (XFS_FORCED_SHUTDOWN(mp))
3293                 return -EIO;
3294
3295         /*
3296          * We now update the tail_lsn since much of the recovery has completed
3297          * and there may be space available to use.  If there were no extent
3298          * or iunlinks, we can free up the entire log and set the tail_lsn to
3299          * be the last_sync_lsn.  This was set in xlog_find_tail to be the
3300          * lsn of the last known good LR on disk.  If there are extent frees
3301          * or iunlinks they will have some entries in the AIL; so we look at
3302          * the AIL to determine how to set the tail_lsn.
3303          */
3304         xlog_assign_tail_lsn(mp);
3305
3306         /*
3307          * Now that we've finished replaying all buffer and inode updates,
3308          * re-read the superblock and reverify it.
3309          */
3310         xfs_buf_lock(bp);
3311         xfs_buf_hold(bp);
3312         error = _xfs_buf_read(bp, XBF_READ);
3313         if (error) {
3314                 if (!XFS_FORCED_SHUTDOWN(mp)) {
3315                         xfs_buf_ioerror_alert(bp, __this_address);
3316                         ASSERT(0);
3317                 }
3318                 xfs_buf_relse(bp);
3319                 return error;
3320         }
3321
3322         /* Convert superblock from on-disk format */
3323         xfs_sb_from_disk(sbp, bp->b_addr);
3324         xfs_buf_relse(bp);
3325
3326         /* re-initialise in-core superblock and geometry structures */
3327         xfs_reinit_percpu_counters(mp);
3328         error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
3329         if (error) {
3330                 xfs_warn(mp, "Failed post-recovery per-ag init: %d", error);
3331                 return error;
3332         }
3333         mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
3334
3335         xlog_recover_check_summary(log);
3336
3337         /* Normal transactions can now occur */
3338         log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
3339         return 0;
3340 }
3341
3342 /*
3343  * Perform recovery and re-initialize some log variables in xlog_find_tail.
3344  *
3345  * Return error or zero.
3346  */
3347 int
3348 xlog_recover(
3349         struct xlog     *log)
3350 {
3351         xfs_daddr_t     head_blk, tail_blk;
3352         int             error;
3353
3354         /* find the tail of the log */
3355         error = xlog_find_tail(log, &head_blk, &tail_blk);
3356         if (error)
3357                 return error;
3358
3359         /*
3360          * The superblock was read before the log was available and thus the LSN
3361          * could not be verified. Check the superblock LSN against the current
3362          * LSN now that it's known.
3363          */
3364         if (xfs_sb_version_hascrc(&log->l_mp->m_sb) &&
3365             !xfs_log_check_lsn(log->l_mp, log->l_mp->m_sb.sb_lsn))
3366                 return -EINVAL;
3367
3368         if (tail_blk != head_blk) {
3369                 /* There used to be a comment here:
3370                  *
3371                  * disallow recovery on read-only mounts.  note -- mount
3372                  * checks for ENOSPC and turns it into an intelligent
3373                  * error message.
3374                  * ...but this is no longer true.  Now, unless you specify
3375                  * NORECOVERY (in which case this function would never be
3376                  * called), we just go ahead and recover.  We do this all
3377                  * under the vfs layer, so we can get away with it unless
3378                  * the device itself is read-only, in which case we fail.
3379                  */
3380                 if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) {
3381                         return error;
3382                 }
3383
3384                 /*
3385                  * Version 5 superblock log feature mask validation. We know the
3386                  * log is dirty so check if there are any unknown log features
3387                  * in what we need to recover. If there are unknown features
3388                  * (e.g. unsupported transactions, then simply reject the
3389                  * attempt at recovery before touching anything.
3390                  */
3391                 if (XFS_SB_VERSION_NUM(&log->l_mp->m_sb) == XFS_SB_VERSION_5 &&
3392                     xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb,
3393                                         XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) {
3394                         xfs_warn(log->l_mp,
3395 "Superblock has unknown incompatible log features (0x%x) enabled.",
3396                                 (log->l_mp->m_sb.sb_features_log_incompat &
3397                                         XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
3398                         xfs_warn(log->l_mp,
3399 "The log can not be fully and/or safely recovered by this kernel.");
3400                         xfs_warn(log->l_mp,
3401 "Please recover the log on a kernel that supports the unknown features.");
3402                         return -EINVAL;
3403                 }
3404
3405                 /*
3406                  * Delay log recovery if the debug hook is set. This is debug
3407                  * instrumention to coordinate simulation of I/O failures with
3408                  * log recovery.
3409                  */
3410                 if (xfs_globals.log_recovery_delay) {
3411                         xfs_notice(log->l_mp,
3412                                 "Delaying log recovery for %d seconds.",
3413                                 xfs_globals.log_recovery_delay);
3414                         msleep(xfs_globals.log_recovery_delay * 1000);
3415                 }
3416
3417                 xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
3418                                 log->l_mp->m_logname ? log->l_mp->m_logname
3419                                                      : "internal");
3420
3421                 error = xlog_do_recover(log, head_blk, tail_blk);
3422                 log->l_flags |= XLOG_RECOVERY_NEEDED;
3423         }
3424         return error;
3425 }
3426
3427 /*
3428  * In the first part of recovery we replay inodes and buffers and build
3429  * up the list of extent free items which need to be processed.  Here
3430  * we process the extent free items and clean up the on disk unlinked
3431  * inode lists.  This is separated from the first part of recovery so
3432  * that the root and real-time bitmap inodes can be read in from disk in
3433  * between the two stages.  This is necessary so that we can free space
3434  * in the real-time portion of the file system.
3435  */
3436 int
3437 xlog_recover_finish(
3438         struct xlog     *log)
3439 {
3440         /*
3441          * Now we're ready to do the transactions needed for the
3442          * rest of recovery.  Start with completing all the extent
3443          * free intent records and then process the unlinked inode
3444          * lists.  At this point, we essentially run in normal mode
3445          * except that we're still performing recovery actions
3446          * rather than accepting new requests.
3447          */
3448         if (log->l_flags & XLOG_RECOVERY_NEEDED) {
3449                 int     error;
3450                 error = xlog_recover_process_intents(log);
3451                 if (error) {
3452                         /*
3453                          * Cancel all the unprocessed intent items now so that
3454                          * we don't leave them pinned in the AIL.  This can
3455                          * cause the AIL to livelock on the pinned item if
3456                          * anyone tries to push the AIL (inode reclaim does
3457                          * this) before we get around to xfs_log_mount_cancel.
3458                          */
3459                         xlog_recover_cancel_intents(log);
3460                         xfs_alert(log->l_mp, "Failed to recover intents");
3461                         return error;
3462                 }
3463
3464                 /*
3465                  * Sync the log to get all the intents out of the AIL.
3466                  * This isn't absolutely necessary, but it helps in
3467                  * case the unlink transactions would have problems
3468                  * pushing the intents out of the way.
3469                  */
3470                 xfs_log_force(log->l_mp, XFS_LOG_SYNC);
3471
3472                 xlog_recover_process_iunlinks(log);
3473
3474                 xlog_recover_check_summary(log);
3475
3476                 xfs_notice(log->l_mp, "Ending recovery (logdev: %s)",
3477                                 log->l_mp->m_logname ? log->l_mp->m_logname
3478                                                      : "internal");
3479                 log->l_flags &= ~XLOG_RECOVERY_NEEDED;
3480         } else {
3481                 xfs_info(log->l_mp, "Ending clean mount");
3482         }
3483         return 0;
3484 }
3485
3486 void
3487 xlog_recover_cancel(
3488         struct xlog     *log)
3489 {
3490         if (log->l_flags & XLOG_RECOVERY_NEEDED)
3491                 xlog_recover_cancel_intents(log);
3492 }
3493
3494 #if defined(DEBUG)
3495 /*
3496  * Read all of the agf and agi counters and check that they
3497  * are consistent with the superblock counters.
3498  */
3499 STATIC void
3500 xlog_recover_check_summary(
3501         struct xlog     *log)
3502 {
3503         xfs_mount_t     *mp;
3504         struct xfs_buf  *agfbp;
3505         struct xfs_buf  *agibp;
3506         xfs_agnumber_t  agno;
3507         uint64_t        freeblks;
3508         uint64_t        itotal;
3509         uint64_t        ifree;
3510         int             error;
3511
3512         mp = log->l_mp;
3513
3514         freeblks = 0LL;
3515         itotal = 0LL;
3516         ifree = 0LL;
3517         for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
3518                 error = xfs_read_agf(mp, NULL, agno, 0, &agfbp);
3519                 if (error) {
3520                         xfs_alert(mp, "%s agf read failed agno %d error %d",
3521                                                 __func__, agno, error);
3522                 } else {
3523                         struct xfs_agf  *agfp = agfbp->b_addr;
3524
3525                         freeblks += be32_to_cpu(agfp->agf_freeblks) +
3526                                     be32_to_cpu(agfp->agf_flcount);
3527                         xfs_buf_relse(agfbp);
3528                 }
3529
3530                 error = xfs_read_agi(mp, NULL, agno, &agibp);
3531                 if (error) {
3532                         xfs_alert(mp, "%s agi read failed agno %d error %d",
3533                                                 __func__, agno, error);
3534                 } else {
3535                         struct xfs_agi  *agi = agibp->b_addr;
3536
3537                         itotal += be32_to_cpu(agi->agi_count);
3538                         ifree += be32_to_cpu(agi->agi_freecount);
3539                         xfs_buf_relse(agibp);
3540                 }
3541         }
3542 }
3543 #endif /* DEBUG */