fs/ext4/fast_commit.c

   1 // SPDX-License-Identifier: GPL-2.0
   2
   3 /*
   4  * fs/ext4/fast_commit.c
   5  *
   6  * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
   7  *
   8  * Ext4 fast commits routines.
   9  */
  10 #include "ext4.h"
  11 #include "ext4_jbd2.h"
  12 #include "ext4_extents.h"
  13 #include "mballoc.h"
  14
  15 /*
  16  * Ext4 Fast Commits
  17  * -----------------
  18  *
  19  * Ext4 fast commits implement fine grained journalling for Ext4.
  20  *
  21  * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
  22  * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
  23  * TLV during the recovery phase. For the scenarios for which we currently
  24  * don't have replay code, fast commit falls back to full commits.
  25  * Fast commits record delta in one of the following three categories.
  26  *
  27  * (A) Directory entry updates:
  28  *
  29  * - EXT4_FC_TAG_UNLINK         - records directory entry unlink
  30  * - EXT4_FC_TAG_LINK           - records directory entry link
  31  * - EXT4_FC_TAG_CREAT          - records inode and directory entry creation
  32  *
  33  * (B) File specific data range updates:
  34  *
  35  * - EXT4_FC_TAG_ADD_RANGE      - records addition of new blocks to an inode
  36  * - EXT4_FC_TAG_DEL_RANGE      - records deletion of blocks from an inode
  37  *
  38  * (C) Inode metadata (mtime / ctime etc):
  39  *
  40  * - EXT4_FC_TAG_INODE          - record the inode that should be replayed
  41  *                                during recovery. Note that iblocks field is
  42  *                                not replayed and instead derived during
  43  *                                replay.
  44  * Commit Operation
  45  * ----------------
  46  * With fast commits, we maintain all the directory entry operations in the
  47  * order in which they are issued in an in-memory queue. This queue is flushed
  48  * to disk during the commit operation. We also maintain a list of inodes
  49  * that need to be committed during a fast commit in another in memory queue of
  50  * inodes. During the commit operation, we commit in the following order:
  51  *
  52  * [1] Lock inodes for any further data updates by setting COMMITTING state
  53  * [2] Submit data buffers of all the inodes
  54  * [3] Wait for [2] to complete
  55  * [4] Commit all the directory entry updates in the fast commit space
  56  * [5] Commit all the changed inode structures
  57  * [6] Write tail tag (this tag ensures the atomicity, please read the following
  58  *     section for more details).
  59  * [7] Wait for [4], [5] and [6] to complete.
  60  *
  61  * All the inode updates must call ext4_fc_start_update() before starting an
  62  * update. If such an ongoing update is present, fast commit waits for it to
  63  * complete. The completion of such an update is marked by
  64  * ext4_fc_stop_update().
  65  *
  66  * Fast Commit Ineligibility
  67  * -------------------------
 * Not all operations are supported by fast commits today (e.g. extended
  69  * attributes). Fast commit ineligibility is marked by calling one of the
  70  * two following functions:
  71  *
  72  * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall
  73  *   back to full commit. This is useful in case of transient errors.
  74  *
  75  * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
  76  *   the fast commits happening between ext4_fc_start_ineligible() and
  77  *   ext4_fc_stop_ineligible() and one fast commit after the call to
  78  *   ext4_fc_stop_ineligible() to fall back to full commits. It is important to
  79  *   make one more fast commit to fall back to full commit after stop call so
 *   that it is guaranteed that the fast commit ineligible operation contained
  81  *   within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is
  82  *   followed by at least 1 full commit.
  83  *
  84  * Atomicity of commits
  85  * --------------------
  86  * In order to guarantee atomicity during the commit operation, fast commit
  87  * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
  88  * tag contains CRC of the contents and TID of the transaction after which
  89  * this fast commit should be applied. Recovery code replays fast commit
  90  * logs only if there's at least 1 valid tail present. For every fast commit
  91  * operation, there is 1 tail. This means, we may end up with multiple tails
  92  * in the fast commit space. Here's an example:
  93  *
  94  * - Create a new file A and remove existing file B
  95  * - fsync()
  96  * - Append contents to file A
  97  * - Truncate file A
  98  * - fsync()
  99  *
 100  * The fast commit space at the end of above operations would look like this:
 101  *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 102  *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
 103  *
 104  * Replay code should thus check for all the valid tails in the FC area.
 105  *
 106  * Fast Commit Replay Idempotence
 107  * ------------------------------
 108  *
 109  * Fast commits tags are idempotent in nature provided the recovery code follows
 110  * certain rules. The guiding principle that the commit path follows while
 111  * committing is that it stores the result of a particular operation instead of
 112  * storing the procedure.
 113  *
 114  * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 115  * was associated with inode 10. During fast commit, instead of storing this
 116  * operation as a procedure "rename a to b", we store the resulting file system
 117  * state as a "series" of outcomes:
 118  *
 119  * - Link dirent b to inode 10
 120  * - Unlink dirent a
 121  * - Inode <10> with valid refcount
 122  *
 * Now when recovery code runs, it needs to "enforce" this state on the file
 124  * system. This is what guarantees idempotence of fast commit replay.
 125  *
 126  * Let's take an example of a procedure that is not idempotent and see how fast
 127  * commits make it idempotent. Consider following sequence of operations:
 128  *
 129  *     rm A;    mv B A;    read A
 130  *  (x)     (y)        (z)
 131  *
 132  * (x), (y) and (z) are the points at which we can crash. If we store this
 133  * sequence of operations as is then the replay is not idempotent. Let's say
 134  * while in replay, we crash at (z). During the second replay, file A (which was
 135  * actually created as a result of "mv B A" operation) would get deleted. Thus,
 136  * file named A would be absent when we try to read A. So, this sequence of
 137  * operations is not idempotent. However, as mentioned above, instead of storing
 138  * the procedure fast commits store the outcome of each procedure. Thus the fast
 139  * commit log for above procedure would be as follows:
 140  *
 141  * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 142  * inode 11 before the replay)
 143  *
 144  *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 145  * (w)          (x)                    (y)          (z)
 146  *
 147  * If we crash at (z), we will have file A linked to inode 11. During the second
 148  * replay, we will remove file A (inode 11). But we will create it back and make
 149  * it point to inode 11. We won't find B, so we'll just skip that step. At this
 150  * point, the refcount for inode 11 is not reliable, but that gets fixed by the
 151  * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
 152  * similarly. Thus, by converting a non-idempotent procedure into a series of
 153  * idempotent outcomes, fast commits ensured idempotence during the replay.
 154  *
 155  * TODOs
 156  * -----
 157  *
 158  * 0) Fast commit replay path hardening: Fast commit replay code should use
 159  *    journal handles to make sure all the updates it does during the replay
 160  *    path are atomic. With that if we crash during fast commit replay, after
 161  *    trying to do recovery again, we will find a file system where fast commit
 162  *    area is invalid (because new full commit would be found). In order to deal
 163  *    with that, fast commit replay code should ensure that the "FC_REPLAY"
 164  *    superblock state is persisted before starting the replay, so that after
 165  *    the crash, fast commit recovery code can look at that flag and perform
 166  *    fast commit recovery even if that area is invalidated by later full
 167  *    commits.
 168  *
 169  * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
 170  *    eligible update must be protected within ext4_fc_start_update() and
 *    ext4_fc_stop_update(). These routines are called from much higher
 *    level routines. This can be made more fine grained by combining with
 173  *    ext4_journal_start().
 174  *
 * 2) Same as above for ext4_fc_start_ineligible() and
 *    ext4_fc_stop_ineligible()
 176  *
 177  * 3) Handle more ineligible cases.
 178  */
 179
 180 #include <trace/events/ext4.h>
 181 static struct kmem_cache *ext4_fc_dentry_cachep;
 182
 183 static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 184 {
 185         BUFFER_TRACE(bh, "");
 186         if (uptodate) {
 187                 ext4_debug("%s: Block %lld up-to-date",
 188                            __func__, bh->b_blocknr);
 189                 set_buffer_uptodate(bh);
 190         } else {
 191                 ext4_debug("%s: Block %lld not up-to-date",
 192                            __func__, bh->b_blocknr);
 193                 clear_buffer_uptodate(bh);
 194         }
 195
 196         unlock_buffer(bh);
 197 }
 198
 199 static inline void ext4_fc_reset_inode(struct inode *inode)
 200 {
 201         struct ext4_inode_info *ei = EXT4_I(inode);
 202
 203         ei->i_fc_lblk_start = 0;
 204         ei->i_fc_lblk_len = 0;
 205 }
 206
 207 void ext4_fc_init_inode(struct inode *inode)
 208 {
 209         struct ext4_inode_info *ei = EXT4_I(inode);
 210
 211         ext4_fc_reset_inode(inode);
 212         ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
 213         INIT_LIST_HEAD(&ei->i_fc_list);
 214         init_waitqueue_head(&ei->i_fc_wait);
 215         atomic_set(&ei->i_fc_updates, 0);
 216 }
 217
/*
 * Sleep on the bit wait queue associated with EXT4_STATE_FC_COMMITTING of
 * @inode.
 *
 * This function must be called with sbi->s_fc_lock held. The lock is dropped
 * before sleeping and is NOT re-acquired on return, so callers have to take
 * it again and re-check the inode state themselves.
 */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
        wait_queue_head_t *wq;
        struct ext4_inode_info *ei = EXT4_I(inode);

        /*
         * The wait entry and wait queue are keyed on the word that holds the
         * EXT4_STATE_FC_COMMITTING bit; which field that is depends on
         * BITS_PER_LONG.
         */
#if (BITS_PER_LONG < 64)
        DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
                        EXT4_STATE_FC_COMMITTING);
        wq = bit_waitqueue(&ei->i_state_flags,
                                EXT4_STATE_FC_COMMITTING);
#else
        DEFINE_WAIT_BIT(wait, &ei->i_flags,
                        EXT4_STATE_FC_COMMITTING);
        wq = bit_waitqueue(&ei->i_flags,
                                EXT4_STATE_FC_COMMITTING);
#endif
        lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
        /* Queue ourselves before dropping the lock, then go to sleep. */
        prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
        spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
        schedule();
        finish_wait(wq, &wait.wq_entry);
}
 242
 243 /*
 244  * Inform Ext4's fast about start of an inode update
 245  *
 246  * This function is called by the high level call VFS callbacks before
 247  * performing any inode update. This function blocks if there's an ongoing
 248  * fast commit on the inode in question.
 249  */
 250 void ext4_fc_start_update(struct inode *inode)
 251 {
 252         struct ext4_inode_info *ei = EXT4_I(inode);
 253
 254         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 255             (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 256                 return;
 257
 258 restart:
 259         spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 260         if (list_empty(&ei->i_fc_list))
 261                 goto out;
 262
 263         if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
 264                 ext4_fc_wait_committing_inode(inode);
 265                 goto restart;
 266         }
 267 out:
 268         atomic_inc(&ei->i_fc_updates);
 269         spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 270 }
 271
 272 /*
 273  * Stop inode update and wake up waiting fast commits if any.
 274  */
 275 void ext4_fc_stop_update(struct inode *inode)
 276 {
 277         struct ext4_inode_info *ei = EXT4_I(inode);
 278
 279         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 280             (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 281                 return;
 282
 283         if (atomic_dec_and_test(&ei->i_fc_updates))
 284                 wake_up_all(&ei->i_fc_wait);
 285 }
 286
 287 /*
 288  * Remove inode from fast commit list. If the inode is being committed
 289  * we wait until inode commit is done.
 290  */
 291 void ext4_fc_del(struct inode *inode)
 292 {
 293         struct ext4_inode_info *ei = EXT4_I(inode);
 294
 295         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 296             (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 297                 return;
 298
 299 restart:
 300         spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 301         if (list_empty(&ei->i_fc_list)) {
 302                 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 303                 return;
 304         }
 305
 306         if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
 307                 ext4_fc_wait_committing_inode(inode);
 308                 goto restart;
 309         }
 310         list_del_init(&ei->i_fc_list);
 311         spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 312 }
 313
 314 /*
 315  * Mark file system as fast commit ineligible. This means that next commit
 316  * operation would result in a full jbd2 commit.
 317  */
 318 void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
 319 {
 320         struct ext4_sb_info *sbi = EXT4_SB(sb);
 321
 322         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 323             (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
 324                 return;
 325
 326         ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
 327         WARN_ON(reason >= EXT4_FC_REASON_MAX);
 328         sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
 329 }
 330
 331 /*
 332  * Start a fast commit ineligible update. Any commits that happen while
 333  * such an operation is in progress fall back to full commits.
 334  */
 335 void ext4_fc_start_ineligible(struct super_block *sb, int reason)
 336 {
 337         struct ext4_sb_info *sbi = EXT4_SB(sb);
 338
 339         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 340             (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
 341                 return;
 342
 343         WARN_ON(reason >= EXT4_FC_REASON_MAX);
 344         sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
 345         atomic_inc(&sbi->s_fc_ineligible_updates);
 346 }
 347
 348 /*
 349  * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
 350  * to ensure that after stopping the ineligible update, at least one full
 351  * commit takes place.
 352  */
 353 void ext4_fc_stop_ineligible(struct super_block *sb)
 354 {
 355         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 356             (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
 357                 return;
 358
 359         ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
 360         atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
 361 }
 362
 363 static inline int ext4_fc_is_ineligible(struct super_block *sb)
 364 {
 365         return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
 366                 atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
 367 }
 368
 369 /*
 370  * Generic fast commit tracking function. If this is the first time this we are
 371  * called after a full commit, we initialize fast commit fields and then call
 372  * __fc_track_fn() with update = 0. If we have already been called after a full
 373  * commit, we pass update = 1. Based on that, the track function can determine
 374  * if it needs to track a field for the first time or if it needs to just
 375  * update the previously tracked value.
 376  *
 377  * If enqueue is set, this function enqueues the inode in fast commit list.
 378  */
 379 static int ext4_fc_track_template(
 380         handle_t *handle, struct inode *inode,
 381         int (*__fc_track_fn)(struct inode *, void *, bool),
 382         void *args, int enqueue)
 383 {
 384         bool update = false;
 385         struct ext4_inode_info *ei = EXT4_I(inode);
 386         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 387         tid_t tid = 0;
 388         int ret;
 389
 390         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 391             (sbi->s_mount_state & EXT4_FC_REPLAY))
 392                 return -EOPNOTSUPP;
 393
 394         if (ext4_fc_is_ineligible(inode->i_sb))
 395                 return -EINVAL;
 396
 397         tid = handle->h_transaction->t_tid;
 398         mutex_lock(&ei->i_fc_lock);
 399         if (tid == ei->i_sync_tid) {
 400                 update = true;
 401         } else {
 402                 ext4_fc_reset_inode(inode);
 403                 ei->i_sync_tid = tid;
 404         }
 405         ret = __fc_track_fn(inode, args, update);
 406         mutex_unlock(&ei->i_fc_lock);
 407
 408         if (!enqueue)
 409                 return ret;
 410
 411         spin_lock(&sbi->s_fc_lock);
 412         if (list_empty(&EXT4_I(inode)->i_fc_list))
 413                 list_add_tail(&EXT4_I(inode)->i_fc_list,
 414                                 (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
 415                                 &sbi->s_fc_q[FC_Q_STAGING] :
 416                                 &sbi->s_fc_q[FC_Q_MAIN]);
 417         spin_unlock(&sbi->s_fc_lock);
 418
 419         return ret;
 420 }
 421
/* Argument bundle handed to __track_dentry_update() through a void pointer. */
struct __track_dentry_update_args {
        /* Directory entry the operation applies to. */
        struct dentry *dentry;
        /* Fast commit tag for the operation (EXT4_FC_TAG_UNLINK/LINK/CREAT). */
        int op;
};
 426
 427 /* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
 428 static int __track_dentry_update(struct inode *inode, void *arg, bool update)
 429 {
 430         struct ext4_fc_dentry_update *node;
 431         struct ext4_inode_info *ei = EXT4_I(inode);
 432         struct __track_dentry_update_args *dentry_update =
 433                 (struct __track_dentry_update_args *)arg;
 434         struct dentry *dentry = dentry_update->dentry;
 435         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 436
 437         mutex_unlock(&ei->i_fc_lock);
 438         node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
 439         if (!node) {
 440                 ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
 441                 mutex_lock(&ei->i_fc_lock);
 442                 return -ENOMEM;
 443         }
 444
 445         node->fcd_op = dentry_update->op;
 446         node->fcd_parent = dentry->d_parent->d_inode->i_ino;
 447         node->fcd_ino = inode->i_ino;
 448         if (dentry->d_name.len > DNAME_INLINE_LEN) {
 449                 node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
 450                 if (!node->fcd_name.name) {
 451                         kmem_cache_free(ext4_fc_dentry_cachep, node);
 452                         ext4_fc_mark_ineligible(inode->i_sb,
 453                                 EXT4_FC_REASON_NOMEM);
 454                         mutex_lock(&ei->i_fc_lock);
 455                         return -ENOMEM;
 456                 }
 457                 memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
 458                         dentry->d_name.len);
 459         } else {
 460                 memcpy(node->fcd_iname, dentry->d_name.name,
 461                         dentry->d_name.len);
 462                 node->fcd_name.name = node->fcd_iname;
 463         }
 464         node->fcd_name.len = dentry->d_name.len;
 465
 466         spin_lock(&sbi->s_fc_lock);
 467         if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
 468                 list_add_tail(&node->fcd_list,
 469                                 &sbi->s_fc_dentry_q[FC_Q_STAGING]);
 470         else
 471                 list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
 472         spin_unlock(&sbi->s_fc_lock);
 473         mutex_lock(&ei->i_fc_lock);
 474
 475         return 0;
 476 }
 477
 478 void __ext4_fc_track_unlink(handle_t *handle,
 479                 struct inode *inode, struct dentry *dentry)
 480 {
 481         struct __track_dentry_update_args args;
 482         int ret;
 483
 484         args.dentry = dentry;
 485         args.op = EXT4_FC_TAG_UNLINK;
 486
 487         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 488                                         (void *)&args, 0);
 489         trace_ext4_fc_track_unlink(inode, dentry, ret);
 490 }
 491
 492 void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
 493 {
 494         __ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
 495 }
 496
 497 void __ext4_fc_track_link(handle_t *handle,
 498         struct inode *inode, struct dentry *dentry)
 499 {
 500         struct __track_dentry_update_args args;
 501         int ret;
 502
 503         args.dentry = dentry;
 504         args.op = EXT4_FC_TAG_LINK;
 505
 506         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 507                                         (void *)&args, 0);
 508         trace_ext4_fc_track_link(inode, dentry, ret);
 509 }
 510
 511 void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
 512 {
 513         __ext4_fc_track_link(handle, d_inode(dentry), dentry);
 514 }
 515
 516 void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
 517                           struct dentry *dentry)
 518 {
 519         struct __track_dentry_update_args args;
 520         int ret;
 521
 522         args.dentry = dentry;
 523         args.op = EXT4_FC_TAG_CREAT;
 524
 525         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 526                                         (void *)&args, 0);
 527         trace_ext4_fc_track_create(inode, dentry, ret);
 528 }
 529
 530 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
 531 {
 532         __ext4_fc_track_create(handle, d_inode(dentry), dentry);
 533 }
 534
 535 /* __track_fn for inode tracking */
 536 static int __track_inode(struct inode *inode, void *arg, bool update)
 537 {
 538         if (update)
 539                 return -EEXIST;
 540
 541         EXT4_I(inode)->i_fc_lblk_len = 0;
 542
 543         return 0;
 544 }
 545
 546 void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
 547 {
 548         int ret;
 549
 550         if (S_ISDIR(inode->i_mode))
 551                 return;
 552
 553         if (ext4_should_journal_data(inode)) {
 554                 ext4_fc_mark_ineligible(inode->i_sb,
 555                                         EXT4_FC_REASON_INODE_JOURNAL_DATA);
 556                 return;
 557         }
 558
 559         ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
 560         trace_ext4_fc_track_inode(inode, ret);
 561 }
 562
 563 struct __track_range_args {
 564         ext4_lblk_t start, end;
 565 };
 566
 567 /* __track_fn for tracking data updates */
 568 static int __track_range(struct inode *inode, void *arg, bool update)
 569 {
 570         struct ext4_inode_info *ei = EXT4_I(inode);
 571         ext4_lblk_t oldstart;
 572         struct __track_range_args *__arg =
 573                 (struct __track_range_args *)arg;
 574
 575         if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
 576                 ext4_debug("Special inode %ld being modified\n", inode->i_ino);
 577                 return -ECANCELED;
 578         }
 579
 580         oldstart = ei->i_fc_lblk_start;
 581
 582         if (update && ei->i_fc_lblk_len > 0) {
 583                 ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
 584                 ei->i_fc_lblk_len =
 585                         max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
 586                                 ei->i_fc_lblk_start + 1;
 587         } else {
 588                 ei->i_fc_lblk_start = __arg->start;
 589                 ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
 590         }
 591
 592         return 0;
 593 }
 594
 595 void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
 596                          ext4_lblk_t end)
 597 {
 598         struct __track_range_args args;
 599         int ret;
 600
 601         if (S_ISDIR(inode->i_mode))
 602                 return;
 603
 604         args.start = start;
 605         args.end = end;
 606
 607         ret = ext4_fc_track_template(handle, inode,  __track_range, &args, 1);
 608
 609         trace_ext4_fc_track_range(inode, start, end, ret);
 610 }
 611
 612 static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
 613 {
 614         int write_flags = REQ_SYNC;
 615         struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
 616
 617         /* Add REQ_FUA | REQ_PREFLUSH only its tail */
 618         if (test_opt(sb, BARRIER) && is_tail)
 619                 write_flags |= REQ_FUA | REQ_PREFLUSH;
 620         lock_buffer(bh);
 621         set_buffer_dirty(bh);
 622         set_buffer_uptodate(bh);
 623         bh->b_end_io = ext4_end_buffer_io_sync;
 624         submit_bh(REQ_OP_WRITE, write_flags, bh);
 625         EXT4_SB(sb)->s_fc_bh = NULL;
 626 }
 627
 628 /* Ext4 commit path routines */
 629
 630 /* memzero and update CRC */
 631 static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
 632                                 u32 *crc)
 633 {
 634         void *ret;
 635
 636         ret = memset(dst, 0, len);
 637         if (crc)
 638                 *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
 639         return ret;
 640 }
 641
 642 /*
 643  * Allocate len bytes on a fast commit buffer.
 644  *
 645  * During the commit time this function is used to manage fast commit
 646  * block space. We don't split a fast commit log onto different
 647  * blocks. So this function makes sure that if there's not enough space
 648  * on the current block, the remaining space in the current block is
 649  * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
 650  * new block is from jbd2 and CRC is updated to reflect the padding
 651  * we added.
 652  */
 653 static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
 654 {
 655         struct ext4_fc_tl *tl;
 656         struct ext4_sb_info *sbi = EXT4_SB(sb);
 657         struct buffer_head *bh;
 658         int bsize = sbi->s_journal->j_blocksize;
 659         int ret, off = sbi->s_fc_bytes % bsize;
 660         int pad_len;
 661
 662         /*
 663          * After allocating len, we should have space at least for a 0 byte
 664          * padding.
 665          */
 666         if (len + sizeof(struct ext4_fc_tl) > bsize)
 667                 return NULL;
 668
 669         if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
 670                 /*
 671                  * Only allocate from current buffer if we have enough space for
 672                  * this request AND we have space to add a zero byte padding.
 673                  */
 674                 if (!sbi->s_fc_bh) {
 675                         ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
 676                         if (ret)
 677                                 return NULL;
 678                         sbi->s_fc_bh = bh;
 679                 }
 680                 sbi->s_fc_bytes += len;
 681                 return sbi->s_fc_bh->b_data + off;
 682         }
 683         /* Need to add PAD tag */
 684         tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
 685         tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
 686         pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
 687         tl->fc_len = cpu_to_le16(pad_len);
 688         if (crc)
 689                 *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
 690         if (pad_len > 0)
 691                 ext4_fc_memzero(sb, tl + 1, pad_len, crc);
 692         ext4_fc_submit_bh(sb, false);
 693
 694         ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
 695         if (ret)
 696                 return NULL;
 697         sbi->s_fc_bh = bh;
 698         sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
 699         return sbi->s_fc_bh->b_data;
 700 }
 701
 702 /* memcpy to fc reserved space and update CRC */
 703 static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
 704                                 int len, u32 *crc)
 705 {
 706         if (crc)
 707                 *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
 708         return memcpy(dst, src, len);
 709 }
 710
/*
 * Complete a fast commit by writing tail tag.
 *
 * Writing tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing tail tag, even if there's space remaining
 * in the block, next commit shouldn't use it. That's why tail tag
 * has the length as that of the remaining space on the block.
 *
 * @sb:  super block of the file system
 * @crc: checksum accumulated over the fast commit written so far
 *
 * Returns 0 on success and -ENOSPC if space for the tail could not be
 * reserved.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_fc_tl tl;
        struct ext4_fc_tail tail;
        int off, bsize = sbi->s_journal->j_blocksize;
        u8 *dst;

        /*
         * ext4_fc_reserve_space takes care of allocating an extra block if
         * there's not enough space on this block for accommodating this tail.
         */
        dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
        if (!dst)
                return -ENOSPC;

        /* In-block offset just past the bytes reserved for the tail. */
        off = sbi->s_fc_bytes % bsize;

        tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
        tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
        /* The rest of this block is consumed by the tail. */
        sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

        ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
        dst += sizeof(tl);
        tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
        ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
        dst += sizeof(tail.fc_tid);
        /* The stored CRC covers everything up to and including fc_tid. */
        tail.fc_crc = cpu_to_le32(crc);
        ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);

        ext4_fc_submit_bh(sb, true);

        return 0;
}
 753
 754 /*
 755  * Adds tag, length, value and updates CRC. Returns true if tlv was added.
 756  * Returns false if there's not enough space.
 757  */
 758 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
 759                            u32 *crc)
 760 {
 761         struct ext4_fc_tl tl;
 762         u8 *dst;
 763
 764         dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
 765         if (!dst)
 766                 return false;
 767
 768         tl.fc_tag = cpu_to_le16(tag);
 769         tl.fc_len = cpu_to_le16(len);
 770
 771         ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
 772         ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
 773
 774         return true;
 775 }
 776
 777 /* Same as above, but adds dentry tlv. */
 778 static  bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
 779                                         int parent_ino, int ino, int dlen,
 780                                         const unsigned char *dname,
 781                                         u32 *crc)
 782 {
 783         struct ext4_fc_dentry_info fcd;
 784         struct ext4_fc_tl tl;
 785         u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
 786                                         crc);
 787
 788         if (!dst)
 789                 return false;
 790
 791         fcd.fc_parent_ino = cpu_to_le32(parent_ino);
 792         fcd.fc_ino = cpu_to_le32(ino);
 793         tl.fc_tag = cpu_to_le16(tag);
 794         tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
 795         ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
 796         dst += sizeof(tl);
 797         ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
 798         dst += sizeof(fcd);
 799         ext4_fc_memcpy(sb, dst, dname, dlen, crc);
 800         dst += dlen;
 801
 802         return true;
 803 }
 804
 805 /*
 806  * Writes inode in the fast commit space under TLV with tag @tag.
 807  * Returns 0 on success, error on failure.
 808  */
 809 static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
 810 {
 811         struct ext4_inode_info *ei = EXT4_I(inode);
 812         int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
 813         int ret;
 814         struct ext4_iloc iloc;
 815         struct ext4_fc_inode fc_inode;
 816         struct ext4_fc_tl tl;
 817         u8 *dst;
 818
 819         ret = ext4_get_inode_loc(inode, &iloc);
 820         if (ret)
 821                 return ret;
 822
 823         if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
 824                 inode_len += ei->i_extra_isize;
 825
 826         fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
 827         tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
 828         tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
 829
 830         dst = ext4_fc_reserve_space(inode->i_sb,
 831                         sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
 832         if (!dst)
 833                 return -ECANCELED;
 834
 835         if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
 836                 return -ECANCELED;
 837         dst += sizeof(tl);
 838         if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
 839                 return -ECANCELED;
 840         dst += sizeof(fc_inode);
 841         if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
 842                                         inode_len, crc))
 843                 return -ECANCELED;
 844
 845         return 0;
 846 }
 847
 848 /*
 849  * Writes updated data ranges for the inode in question. Updates CRC.
 850  * Returns 0 on success, error otherwise.
 851  */
 852 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
 853 {
 854         ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
 855         struct ext4_inode_info *ei = EXT4_I(inode);
 856         struct ext4_map_blocks map;
 857         struct ext4_fc_add_range fc_ext;
 858         struct ext4_fc_del_range lrange;
 859         struct ext4_extent *ex;
 860         int ret;
 861
 862         mutex_lock(&ei->i_fc_lock);
 863         if (ei->i_fc_lblk_len == 0) {
 864                 mutex_unlock(&ei->i_fc_lock);
 865                 return 0;
 866         }
 867         old_blk_size = ei->i_fc_lblk_start;
 868         new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
 869         ei->i_fc_lblk_len = 0;
 870         mutex_unlock(&ei->i_fc_lock);
 871
 872         cur_lblk_off = old_blk_size;
 873         jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
 874                   __func__, cur_lblk_off, new_blk_size, inode->i_ino);
 875
 876         while (cur_lblk_off <= new_blk_size) {
 877                 map.m_lblk = cur_lblk_off;
 878                 map.m_len = new_blk_size - cur_lblk_off + 1;
 879                 ret = ext4_map_blocks(NULL, inode, &map, 0);
 880                 if (ret < 0)
 881                         return -ECANCELED;
 882
 883                 if (map.m_len == 0) {
 884                         cur_lblk_off++;
 885                         continue;
 886                 }
 887
 888                 if (ret == 0) {
 889                         lrange.fc_ino = cpu_to_le32(inode->i_ino);
 890                         lrange.fc_lblk = cpu_to_le32(map.m_lblk);
 891                         lrange.fc_len = cpu_to_le32(map.m_len);
 892                         if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
 893                                             sizeof(lrange), (u8 *)&lrange, crc))
 894                                 return -ENOSPC;
 895                 } else {
 896                         fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
 897                         ex = (struct ext4_extent *)&fc_ext.fc_ex;
 898                         ex->ee_block = cpu_to_le32(map.m_lblk);
 899                         ex->ee_len = cpu_to_le16(map.m_len);
 900                         ext4_ext_store_pblock(ex, map.m_pblk);
 901                         if (map.m_flags & EXT4_MAP_UNWRITTEN)
 902                                 ext4_ext_mark_unwritten(ex);
 903                         else
 904                                 ext4_ext_mark_initialized(ex);
 905                         if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
 906                                             sizeof(fc_ext), (u8 *)&fc_ext, crc))
 907                                 return -ENOSPC;
 908                 }
 909
 910                 cur_lblk_off += map.m_len;
 911         }
 912
 913         return 0;
 914 }
 915
 916
 917 /* Submit data for all the fast commit inodes */
 918 static int ext4_fc_submit_inode_data_all(journal_t *journal)
 919 {
 920         struct super_block *sb = (struct super_block *)(journal->j_private);
 921         struct ext4_sb_info *sbi = EXT4_SB(sb);
 922         struct ext4_inode_info *ei;
 923         int ret = 0;
 924
 925         spin_lock(&sbi->s_fc_lock);
 926         ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
 927         list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
 928                 ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
 929                 while (atomic_read(&ei->i_fc_updates)) {
 930                         DEFINE_WAIT(wait);
 931
 932                         prepare_to_wait(&ei->i_fc_wait, &wait,
 933                                                 TASK_UNINTERRUPTIBLE);
 934                         if (atomic_read(&ei->i_fc_updates)) {
 935                                 spin_unlock(&sbi->s_fc_lock);
 936                                 schedule();
 937                                 spin_lock(&sbi->s_fc_lock);
 938                         }
 939                         finish_wait(&ei->i_fc_wait, &wait);
 940                 }
 941                 spin_unlock(&sbi->s_fc_lock);
 942                 ret = jbd2_submit_inode_data(ei->jinode);
 943                 if (ret)
 944                         return ret;
 945                 spin_lock(&sbi->s_fc_lock);
 946         }
 947         spin_unlock(&sbi->s_fc_lock);
 948
 949         return ret;
 950 }
 951
 952 /* Wait for completion of data for all the fast commit inodes */
 953 static int ext4_fc_wait_inode_data_all(journal_t *journal)
 954 {
 955         struct super_block *sb = (struct super_block *)(journal->j_private);
 956         struct ext4_sb_info *sbi = EXT4_SB(sb);
 957         struct ext4_inode_info *pos, *n;
 958         int ret = 0;
 959
 960         spin_lock(&sbi->s_fc_lock);
 961         list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
 962                 if (!ext4_test_inode_state(&pos->vfs_inode,
 963                                            EXT4_STATE_FC_COMMITTING))
 964                         continue;
 965                 spin_unlock(&sbi->s_fc_lock);
 966
 967                 ret = jbd2_wait_inode_data(journal, pos->jinode);
 968                 if (ret)
 969                         return ret;
 970                 spin_lock(&sbi->s_fc_lock);
 971         }
 972         spin_unlock(&sbi->s_fc_lock);
 973
 974         return 0;
 975 }
 976
 977 /* Commit all the directory entry updates */
 978 static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
 979 __acquires(&sbi->s_fc_lock)
 980 __releases(&sbi->s_fc_lock)
 981 {
 982         struct super_block *sb = (struct super_block *)(journal->j_private);
 983         struct ext4_sb_info *sbi = EXT4_SB(sb);
 984         struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
 985         struct inode *inode;
 986         struct ext4_inode_info *ei, *ei_n;
 987         int ret;
 988
 989         if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
 990                 return 0;
 991         list_for_each_entry_safe(fc_dentry, fc_dentry_n,
 992                                  &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
 993                 if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
 994                         spin_unlock(&sbi->s_fc_lock);
 995                         if (!ext4_fc_add_dentry_tlv(
 996                                 sb, fc_dentry->fcd_op,
 997                                 fc_dentry->fcd_parent, fc_dentry->fcd_ino,
 998                                 fc_dentry->fcd_name.len,
 999                                 fc_dentry->fcd_name.name, crc)) {
1000                                 ret = -ENOSPC;
1001                                 goto lock_and_exit;
1002                         }
1003                         spin_lock(&sbi->s_fc_lock);
1004                         continue;
1005                 }
1006
1007                 inode = NULL;
1008                 list_for_each_entry_safe(ei, ei_n, &sbi->s_fc_q[FC_Q_MAIN],
1009                                          i_fc_list) {
1010                         if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
1011                                 inode = &ei->vfs_inode;
1012                                 break;
1013                         }
1014                 }
1015                 /*
1016                  * If we don't find inode in our list, then it was deleted,
1017                  * in which case, we don't need to record it's create tag.
1018                  */
1019                 if (!inode)
1020                         continue;
1021                 spin_unlock(&sbi->s_fc_lock);
1022
1023                 /*
1024                  * We first write the inode and then the create dirent. This
1025                  * allows the recovery code to create an unnamed inode first
1026                  * and then link it to a directory entry. This allows us
1027                  * to use namei.c routines almost as is and simplifies
1028                  * the recovery code.
1029                  */
1030                 ret = ext4_fc_write_inode(inode, crc);
1031                 if (ret)
1032                         goto lock_and_exit;
1033
1034                 ret = ext4_fc_write_inode_data(inode, crc);
1035                 if (ret)
1036                         goto lock_and_exit;
1037
1038                 if (!ext4_fc_add_dentry_tlv(
1039                         sb, fc_dentry->fcd_op,
1040                         fc_dentry->fcd_parent, fc_dentry->fcd_ino,
1041                         fc_dentry->fcd_name.len,
1042                         fc_dentry->fcd_name.name, crc)) {
1043                         ret = -ENOSPC;
1044                         goto lock_and_exit;
1045                 }
1046
1047                 spin_lock(&sbi->s_fc_lock);
1048         }
1049         return 0;
1050 lock_and_exit:
1051         spin_lock(&sbi->s_fc_lock);
1052         return ret;
1053 }
1054
1055 static int ext4_fc_perform_commit(journal_t *journal)
1056 {
1057         struct super_block *sb = (struct super_block *)(journal->j_private);
1058         struct ext4_sb_info *sbi = EXT4_SB(sb);
1059         struct ext4_inode_info *iter;
1060         struct ext4_fc_head head;
1061         struct inode *inode;
1062         struct blk_plug plug;
1063         int ret = 0;
1064         u32 crc = 0;
1065
1066         ret = ext4_fc_submit_inode_data_all(journal);
1067         if (ret)
1068                 return ret;
1069
1070         ret = ext4_fc_wait_inode_data_all(journal);
1071         if (ret)
1072                 return ret;
1073
1074         /*
1075          * If file system device is different from journal device, issue a cache
1076          * flush before we start writing fast commit blocks.
1077          */
1078         if (journal->j_fs_dev != journal->j_dev)
1079                 blkdev_issue_flush(journal->j_fs_dev);
1080
1081         blk_start_plug(&plug);
1082         if (sbi->s_fc_bytes == 0) {
1083                 /*
1084                  * Add a head tag only if this is the first fast commit
1085                  * in this TID.
1086                  */
1087                 head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1088                 head.fc_tid = cpu_to_le32(
1089                         sbi->s_journal->j_running_transaction->t_tid);
1090                 if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1091                         (u8 *)&head, &crc)) {
1092                         ret = -ENOSPC;
1093                         goto out;
1094                 }
1095         }
1096
1097         spin_lock(&sbi->s_fc_lock);
1098         ret = ext4_fc_commit_dentry_updates(journal, &crc);
1099         if (ret) {
1100                 spin_unlock(&sbi->s_fc_lock);
1101                 goto out;
1102         }
1103
1104         list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1105                 inode = &iter->vfs_inode;
1106                 if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1107                         continue;
1108
1109                 spin_unlock(&sbi->s_fc_lock);
1110                 ret = ext4_fc_write_inode_data(inode, &crc);
1111                 if (ret)
1112                         goto out;
1113                 ret = ext4_fc_write_inode(inode, &crc);
1114                 if (ret)
1115                         goto out;
1116                 spin_lock(&sbi->s_fc_lock);
1117         }
1118         spin_unlock(&sbi->s_fc_lock);
1119
1120         ret = ext4_fc_write_tail(sb, crc);
1121
1122 out:
1123         blk_finish_plug(&plug);
1124         return ret;
1125 }
1126
1127 /*
1128  * The main commit entry point. Performs a fast commit for transaction
1129  * commit_tid if needed. If it's not possible to perform a fast commit
1130  * due to various reasons, we fall back to full commit. Returns 0
1131  * on success, error otherwise.
1132  */
1133 int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1134 {
1135         struct super_block *sb = (struct super_block *)(journal->j_private);
1136         struct ext4_sb_info *sbi = EXT4_SB(sb);
1137         int nblks = 0, ret, bsize = journal->j_blocksize;
1138         int subtid = atomic_read(&sbi->s_fc_subtid);
1139         int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
1140         ktime_t start_time, commit_time;
1141
1142         trace_ext4_fc_commit_start(sb);
1143
1144         start_time = ktime_get();
1145
1146         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
1147                 (ext4_fc_is_ineligible(sb))) {
1148                 reason = EXT4_FC_REASON_INELIGIBLE;
1149                 goto out;
1150         }
1151
1152 restart_fc:
1153         ret = jbd2_fc_begin_commit(journal, commit_tid);
1154         if (ret == -EALREADY) {
1155                 /* There was an ongoing commit, check if we need to restart */
1156                 if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1157                         commit_tid > journal->j_commit_sequence)
1158                         goto restart_fc;
1159                 reason = EXT4_FC_REASON_ALREADY_COMMITTED;
1160                 goto out;
1161         } else if (ret) {
1162                 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1163                 reason = EXT4_FC_REASON_FC_START_FAILED;
1164                 goto out;
1165         }
1166
1167         fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1168         ret = ext4_fc_perform_commit(journal);
1169         if (ret < 0) {
1170                 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1171                 reason = EXT4_FC_REASON_FC_FAILED;
1172                 goto out;
1173         }
1174         nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1175         ret = jbd2_fc_wait_bufs(journal, nblks);
1176         if (ret < 0) {
1177                 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1178                 reason = EXT4_FC_REASON_FC_FAILED;
1179                 goto out;
1180         }
1181         atomic_inc(&sbi->s_fc_subtid);
1182         jbd2_fc_end_commit(journal);
1183 out:
1184         /* Has any ineligible update happened since we started? */
1185         if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
1186                 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1187                 reason = EXT4_FC_REASON_INELIGIBLE;
1188         }
1189
1190         spin_lock(&sbi->s_fc_lock);
1191         if (reason != EXT4_FC_REASON_OK &&
1192                 reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
1193                 sbi->s_fc_stats.fc_ineligible_commits++;
1194         } else {
1195                 sbi->s_fc_stats.fc_num_commits++;
1196                 sbi->s_fc_stats.fc_numblks += nblks;
1197         }
1198         spin_unlock(&sbi->s_fc_lock);
1199         nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
1200         trace_ext4_fc_commit_stop(sb, nblks, reason);
1201         commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1202         /*
1203          * weight the commit time higher than the average time so we don't
1204          * react too strongly to vast changes in the commit time
1205          */
1206         if (likely(sbi->s_fc_avg_commit_time))
1207                 sbi->s_fc_avg_commit_time = (commit_time +
1208                                 sbi->s_fc_avg_commit_time * 3) / 4;
1209         else
1210                 sbi->s_fc_avg_commit_time = commit_time;
1211         jbd_debug(1,
1212                 "Fast commit ended with blks = %d, reason = %d, subtid - %d",
1213                 nblks, reason, subtid);
1214         if (reason == EXT4_FC_REASON_FC_FAILED)
1215                 return jbd2_fc_end_commit_fallback(journal);
1216         if (reason == EXT4_FC_REASON_FC_START_FAILED ||
1217                 reason == EXT4_FC_REASON_INELIGIBLE)
1218                 return jbd2_complete_transaction(journal, commit_tid);
1219         return 0;
1220 }
1221
1222 /*
1223  * Fast commit cleanup routine. This is called after every fast commit and
1224  * full commit. full is true if we are called after a full commit.
1225  */
1226 static void ext4_fc_cleanup(journal_t *journal, int full)
1227 {
1228         struct super_block *sb = journal->j_private;
1229         struct ext4_sb_info *sbi = EXT4_SB(sb);
1230         struct ext4_inode_info *iter, *iter_n;
1231         struct ext4_fc_dentry_update *fc_dentry;
1232
1233         if (full && sbi->s_fc_bh)
1234                 sbi->s_fc_bh = NULL;
1235
1236         jbd2_fc_release_bufs(journal);
1237
1238         spin_lock(&sbi->s_fc_lock);
1239         list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
1240                                  i_fc_list) {
1241                 list_del_init(&iter->i_fc_list);
1242                 ext4_clear_inode_state(&iter->vfs_inode,
1243                                        EXT4_STATE_FC_COMMITTING);
1244                 ext4_fc_reset_inode(&iter->vfs_inode);
1245                 /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1246                 smp_mb();
1247 #if (BITS_PER_LONG < 64)
1248                 wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1249 #else
1250                 wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1251 #endif
1252         }
1253
1254         while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1255                 fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1256                                              struct ext4_fc_dentry_update,
1257                                              fcd_list);
1258                 list_del_init(&fc_dentry->fcd_list);
1259                 spin_unlock(&sbi->s_fc_lock);
1260
1261                 if (fc_dentry->fcd_name.name &&
1262                         fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1263                         kfree(fc_dentry->fcd_name.name);
1264                 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1265                 spin_lock(&sbi->s_fc_lock);
1266         }
1267
1268         list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1269                                 &sbi->s_fc_dentry_q[FC_Q_MAIN]);
1270         list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1271                                 &sbi->s_fc_q[FC_Q_MAIN]);
1272
1273         ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
1274         ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
1275
1276         if (full)
1277                 sbi->s_fc_bytes = 0;
1278         spin_unlock(&sbi->s_fc_lock);
1279         trace_ext4_fc_stats(sb);
1280 }
1281
1282 /* Ext4 Replay Path Routines */
1283
/* Decoded dentry TLV, as consumed by the dentry replay routines */
struct dentry_info_args {
	int parent_ino;		/* inode number of the parent directory */
	int dname_len;		/* number of bytes in dname */
	int ino;		/* inode number the dentry refers to */
	int inode_len;		/* not filled in by tl_to_darg() */
	char *dname;		/* name bytes; points into the TLV value */
};
1289
1290 static inline void tl_to_darg(struct dentry_info_args *darg,
1291                               struct  ext4_fc_tl *tl, u8 *val)
1292 {
1293         struct ext4_fc_dentry_info fcd;
1294
1295         memcpy(&fcd, val, sizeof(fcd));
1296
1297         darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
1298         darg->ino = le32_to_cpu(fcd.fc_ino);
1299         darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
1300         darg->dname_len = le16_to_cpu(tl->fc_len) -
1301                 sizeof(struct ext4_fc_dentry_info);
1302 }
1303
1304 /* Unlink replay function */
1305 static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
1306                                  u8 *val)
1307 {
1308         struct inode *inode, *old_parent;
1309         struct qstr entry;
1310         struct dentry_info_args darg;
1311         int ret = 0;
1312
1313         tl_to_darg(&darg, tl, val);
1314
1315         trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1316                         darg.parent_ino, darg.dname_len);
1317
1318         entry.name = darg.dname;
1319         entry.len = darg.dname_len;
1320         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1321
1322         if (IS_ERR(inode)) {
1323                 jbd_debug(1, "Inode %d not found", darg.ino);
1324                 return 0;
1325         }
1326
1327         old_parent = ext4_iget(sb, darg.parent_ino,
1328                                 EXT4_IGET_NORMAL);
1329         if (IS_ERR(old_parent)) {
1330                 jbd_debug(1, "Dir with inode  %d not found", darg.parent_ino);
1331                 iput(inode);
1332                 return 0;
1333         }
1334
1335         ret = __ext4_unlink(NULL, old_parent, &entry, inode);
1336         /* -ENOENT ok coz it might not exist anymore. */
1337         if (ret == -ENOENT)
1338                 ret = 0;
1339         iput(old_parent);
1340         iput(inode);
1341         return ret;
1342 }
1343
1344 static int ext4_fc_replay_link_internal(struct super_block *sb,
1345                                 struct dentry_info_args *darg,
1346                                 struct inode *inode)
1347 {
1348         struct inode *dir = NULL;
1349         struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1350         struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1351         int ret = 0;
1352
1353         dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1354         if (IS_ERR(dir)) {
1355                 jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
1356                 dir = NULL;
1357                 goto out;
1358         }
1359
1360         dentry_dir = d_obtain_alias(dir);
1361         if (IS_ERR(dentry_dir)) {
1362                 jbd_debug(1, "Failed to obtain dentry");
1363                 dentry_dir = NULL;
1364                 goto out;
1365         }
1366
1367         dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1368         if (!dentry_inode) {
1369                 jbd_debug(1, "Inode dentry not created.");
1370                 ret = -ENOMEM;
1371                 goto out;
1372         }
1373
1374         ret = __ext4_link(dir, inode, dentry_inode);
1375         /*
1376          * It's possible that link already existed since data blocks
1377          * for the dir in question got persisted before we crashed OR
1378          * we replayed this tag and crashed before the entire replay
1379          * could complete.
1380          */
1381         if (ret && ret != -EEXIST) {
1382                 jbd_debug(1, "Failed to link\n");
1383                 goto out;
1384         }
1385
1386         ret = 0;
1387 out:
1388         if (dentry_dir) {
1389                 d_drop(dentry_dir);
1390                 dput(dentry_dir);
1391         } else if (dir) {
1392                 iput(dir);
1393         }
1394         if (dentry_inode) {
1395                 d_drop(dentry_inode);
1396                 dput(dentry_inode);
1397         }
1398
1399         return ret;
1400 }
1401
1402 /* Link replay function */
1403 static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
1404                                u8 *val)
1405 {
1406         struct inode *inode;
1407         struct dentry_info_args darg;
1408         int ret = 0;
1409
1410         tl_to_darg(&darg, tl, val);
1411         trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1412                         darg.parent_ino, darg.dname_len);
1413
1414         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1415         if (IS_ERR(inode)) {
1416                 jbd_debug(1, "Inode not found.");
1417                 return 0;
1418         }
1419
1420         ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1421         iput(inode);
1422         return ret;
1423 }
1424
1425 /*
1426  * Record all the modified inodes during replay. We use this later to setup
1427  * block bitmaps correctly.
1428  */
1429 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1430 {
1431         struct ext4_fc_replay_state *state;
1432         int i;
1433
1434         state = &EXT4_SB(sb)->s_fc_replay_state;
1435         for (i = 0; i < state->fc_modified_inodes_used; i++)
1436                 if (state->fc_modified_inodes[i] == ino)
1437                         return 0;
1438         if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1439                 state->fc_modified_inodes_size +=
1440                         EXT4_FC_REPLAY_REALLOC_INCREMENT;
1441                 state->fc_modified_inodes = krealloc(
1442                                         state->fc_modified_inodes, sizeof(int) *
1443                                         state->fc_modified_inodes_size,
1444                                         GFP_KERNEL);
1445                 if (!state->fc_modified_inodes)
1446                         return -ENOMEM;
1447         }
1448         state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1449         return 0;
1450 }
1451
1452 /*
1453  * Inode replay function
1454  */
1455 static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
1456                                 u8 *val)
1457 {
1458         struct ext4_fc_inode fc_inode;
1459         struct ext4_inode *raw_inode;
1460         struct ext4_inode *raw_fc_inode;
1461         struct inode *inode = NULL;
1462         struct ext4_iloc iloc;
1463         int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1464         struct ext4_extent_header *eh;
1465
1466         memcpy(&fc_inode, val, sizeof(fc_inode));
1467
1468         ino = le32_to_cpu(fc_inode.fc_ino);
1469         trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1470
1471         inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1472         if (!IS_ERR(inode)) {
1473                 ext4_ext_clear_bb(inode);
1474                 iput(inode);
1475         }
1476         inode = NULL;
1477
1478         ext4_fc_record_modified_inode(sb, ino);
1479
1480         raw_fc_inode = (struct ext4_inode *)
1481                 (val + offsetof(struct ext4_fc_inode, fc_raw_inode));
1482         ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1483         if (ret)
1484                 goto out;
1485
1486         inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
1487         raw_inode = ext4_raw_inode(&iloc);
1488
1489         memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1490         memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1491                 inode_len - offsetof(struct ext4_inode, i_generation));
1492         if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1493                 eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1494                 if (eh->eh_magic != EXT4_EXT_MAGIC) {
1495                         memset(eh, 0, sizeof(*eh));
1496                         eh->eh_magic = EXT4_EXT_MAGIC;
1497                         eh->eh_max = cpu_to_le16(
1498                                 (sizeof(raw_inode->i_block) -
1499                                  sizeof(struct ext4_extent_header))
1500                                  / sizeof(struct ext4_extent));
1501                 }
1502         } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1503                 memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1504                         sizeof(raw_inode->i_block));
1505         }
1506
1507         /* Immediately update the inode on disk. */
1508         ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1509         if (ret)
1510                 goto out;
1511         ret = sync_dirty_buffer(iloc.bh);
1512         if (ret)
1513                 goto out;
1514         ret = ext4_mark_inode_used(sb, ino);
1515         if (ret)
1516                 goto out;
1517
1518         /* Given that we just wrote the inode on disk, this SHOULD succeed. */
1519         inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1520         if (IS_ERR(inode)) {
1521                 jbd_debug(1, "Inode not found.");
1522                 return -EFSCORRUPTED;
1523         }
1524
1525         /*
1526          * Our allocator could have made different decisions than before
1527          * crashing. This should be fixed but until then, we calculate
1528          * the number of blocks the inode.
1529          */
1530         ext4_ext_replay_set_iblocks(inode);
1531
1532         inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1533         ext4_reset_inode_seed(inode);
1534
1535         ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1536         ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1537         sync_dirty_buffer(iloc.bh);
1538         brelse(iloc.bh);
1539 out:
1540         iput(inode);
1541         if (!ret)
1542                 blkdev_issue_flush(sb->s_bdev);
1543
1544         return 0;
1545 }
1546
1547 /*
1548  * Dentry create replay function.
1549  *
1550  * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
1551  * inode for which we are trying to create a dentry here, should already have
1552  * been replayed before we start here.
1553  */
1554 static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
1555                                  u8 *val)
1556 {
1557         int ret = 0;
1558         struct inode *inode = NULL;
1559         struct inode *dir = NULL;
1560         struct dentry_info_args darg;
1561
1562         tl_to_darg(&darg, tl, val);
1563
1564         trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1565                         darg.parent_ino, darg.dname_len);
1566
1567         /* This takes care of update group descriptor and other metadata */
1568         ret = ext4_mark_inode_used(sb, darg.ino);
1569         if (ret)
1570                 goto out;
1571
1572         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1573         if (IS_ERR(inode)) {
1574                 jbd_debug(1, "inode %d not found.", darg.ino);
1575                 inode = NULL;
1576                 ret = -EINVAL;
1577                 goto out;
1578         }
1579
1580         if (S_ISDIR(inode->i_mode)) {
1581                 /*
1582                  * If we are creating a directory, we need to make sure that the
1583                  * dot and dot dot dirents are setup properly.
1584                  */
1585                 dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1586                 if (IS_ERR(dir)) {
1587                         jbd_debug(1, "Dir %d not found.", darg.ino);
1588                         goto out;
1589                 }
1590                 ret = ext4_init_new_dir(NULL, dir, inode);
1591                 iput(dir);
1592                 if (ret) {
1593                         ret = 0;
1594                         goto out;
1595                 }
1596         }
1597         ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1598         if (ret)
1599                 goto out;
1600         set_nlink(inode, 1);
1601         ext4_mark_inode_dirty(NULL, inode);
1602 out:
1603         if (inode)
1604                 iput(inode);
1605         return ret;
1606 }
1607
1608 /*
1609  * Record physical disk regions which are in use as per fast commit area. Our
1610  * simple replay phase allocator excludes these regions from allocation.
1611  */
1612 static int ext4_fc_record_regions(struct super_block *sb, int ino,
1613                 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
1614 {
1615         struct ext4_fc_replay_state *state;
1616         struct ext4_fc_alloc_region *region;
1617
1618         state = &EXT4_SB(sb)->s_fc_replay_state;
1619         if (state->fc_regions_used == state->fc_regions_size) {
1620                 state->fc_regions_size +=
1621                         EXT4_FC_REPLAY_REALLOC_INCREMENT;
1622                 state->fc_regions = krealloc(
1623                                         state->fc_regions,
1624                                         state->fc_regions_size *
1625                                         sizeof(struct ext4_fc_alloc_region),
1626                                         GFP_KERNEL);
1627                 if (!state->fc_regions)
1628                         return -ENOMEM;
1629         }
1630         region = &state->fc_regions[state->fc_regions_used++];
1631         region->ino = ino;
1632         region->lblk = lblk;
1633         region->pblk = pblk;
1634         region->len = len;
1635
1636         return 0;
1637 }
1638
1639 /* Replay add range tag */
1640 static int ext4_fc_replay_add_range(struct super_block *sb,
1641                                     struct ext4_fc_tl *tl, u8 *val)
1642 {
1643         struct ext4_fc_add_range fc_add_ex;
1644         struct ext4_extent newex, *ex;
1645         struct inode *inode;
1646         ext4_lblk_t start, cur;
1647         int remaining, len;
1648         ext4_fsblk_t start_pblk;
1649         struct ext4_map_blocks map;
1650         struct ext4_ext_path *path = NULL;
1651         int ret;
1652
1653         memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
1654         ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
1655
1656         trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1657                 le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
1658                 ext4_ext_get_actual_len(ex));
1659
1660         inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
1661         if (IS_ERR(inode)) {
1662                 jbd_debug(1, "Inode not found.");
1663                 return 0;
1664         }
1665
1666         ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1667
1668         start = le32_to_cpu(ex->ee_block);
1669         start_pblk = ext4_ext_pblock(ex);
1670         len = ext4_ext_get_actual_len(ex);
1671
1672         cur = start;
1673         remaining = len;
1674         jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1675                   start, start_pblk, len, ext4_ext_is_unwritten(ex),
1676                   inode->i_ino);
1677
1678         while (remaining > 0) {
1679                 map.m_lblk = cur;
1680                 map.m_len = remaining;
1681                 map.m_pblk = 0;
1682                 ret = ext4_map_blocks(NULL, inode, &map, 0);
1683
1684                 if (ret < 0) {
1685                         iput(inode);
1686                         return 0;
1687                 }
1688
1689                 if (ret == 0) {
1690                         /* Range is not mapped */
1691                         path = ext4_find_extent(inode, cur, NULL, 0);
1692                         if (IS_ERR(path)) {
1693                                 iput(inode);
1694                                 return 0;
1695                         }
1696                         memset(&newex, 0, sizeof(newex));
1697                         newex.ee_block = cpu_to_le32(cur);
1698                         ext4_ext_store_pblock(
1699                                 &newex, start_pblk + cur - start);
1700                         newex.ee_len = cpu_to_le16(map.m_len);
1701                         if (ext4_ext_is_unwritten(ex))
1702                                 ext4_ext_mark_unwritten(&newex);
1703                         down_write(&EXT4_I(inode)->i_data_sem);
1704                         ret = ext4_ext_insert_extent(
1705                                 NULL, inode, &path, &newex, 0);
1706                         up_write((&EXT4_I(inode)->i_data_sem));
1707                         ext4_ext_drop_refs(path);
1708                         kfree(path);
1709                         if (ret) {
1710                                 iput(inode);
1711                                 return 0;
1712                         }
1713                         goto next;
1714                 }
1715
1716                 if (start_pblk + cur - start != map.m_pblk) {
1717                         /*
1718                          * Logical to physical mapping changed. This can happen
1719                          * if this range was removed and then reallocated to
1720                          * map to new physical blocks during a fast commit.
1721                          */
1722                         ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1723                                         ext4_ext_is_unwritten(ex),
1724                                         start_pblk + cur - start);
1725                         if (ret) {
1726                                 iput(inode);
1727                                 return 0;
1728                         }
1729                         /*
1730                          * Mark the old blocks as free since they aren't used
1731                          * anymore. We maintain an array of all the modified
1732                          * inodes. In case these blocks are still used at either
1733                          * a different logical range in the same inode or in
1734                          * some different inode, we will mark them as allocated
1735                          * at the end of the FC replay using our array of
1736                          * modified inodes.
1737                          */
1738                         ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1739                         goto next;
1740                 }
1741
1742                 /* Range is mapped and needs a state change */
1743                 jbd_debug(1, "Converting from %ld to %d %lld",
1744                                 map.m_flags & EXT4_MAP_UNWRITTEN,
1745                         ext4_ext_is_unwritten(ex), map.m_pblk);
1746                 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1747                                         ext4_ext_is_unwritten(ex), map.m_pblk);
1748                 if (ret) {
1749                         iput(inode);
1750                         return 0;
1751                 }
1752                 /*
1753                  * We may have split the extent tree while toggling the state.
1754                  * Try to shrink the extent tree now.
1755                  */
1756                 ext4_ext_replay_shrink_inode(inode, start + len);
1757 next:
1758                 cur += map.m_len;
1759                 remaining -= map.m_len;
1760         }
1761         ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1762                                         sb->s_blocksize_bits);
1763         iput(inode);
1764         return 0;
1765 }
1766
1767 /* Replay DEL_RANGE tag */
1768 static int
1769 ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
1770                          u8 *val)
1771 {
1772         struct inode *inode;
1773         struct ext4_fc_del_range lrange;
1774         struct ext4_map_blocks map;
1775         ext4_lblk_t cur, remaining;
1776         int ret;
1777
1778         memcpy(&lrange, val, sizeof(lrange));
1779         cur = le32_to_cpu(lrange.fc_lblk);
1780         remaining = le32_to_cpu(lrange.fc_len);
1781
1782         trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1783                 le32_to_cpu(lrange.fc_ino), cur, remaining);
1784
1785         inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
1786         if (IS_ERR(inode)) {
1787                 jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
1788                 return 0;
1789         }
1790
1791         ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1792
1793         jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1794                         inode->i_ino, le32_to_cpu(lrange.fc_lblk),
1795                         le32_to_cpu(lrange.fc_len));
1796         while (remaining > 0) {
1797                 map.m_lblk = cur;
1798                 map.m_len = remaining;
1799
1800                 ret = ext4_map_blocks(NULL, inode, &map, 0);
1801                 if (ret < 0) {
1802                         iput(inode);
1803                         return 0;
1804                 }
1805                 if (ret > 0) {
1806                         remaining -= ret;
1807                         cur += ret;
1808                         ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1809                 } else {
1810                         remaining -= map.m_len;
1811                         cur += map.m_len;
1812                 }
1813         }
1814
1815         ret = ext4_punch_hole(inode,
1816                 le32_to_cpu(lrange.fc_lblk) << sb->s_blocksize_bits,
1817                 le32_to_cpu(lrange.fc_len) <<  sb->s_blocksize_bits);
1818         if (ret)
1819                 jbd_debug(1, "ext4_punch_hole returned %d", ret);
1820         ext4_ext_replay_shrink_inode(inode,
1821                 i_size_read(inode) >> sb->s_blocksize_bits);
1822         ext4_mark_inode_dirty(NULL, inode);
1823         iput(inode);
1824
1825         return 0;
1826 }
1827
1828 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1829 {
1830         struct ext4_fc_replay_state *state;
1831         struct inode *inode;
1832         struct ext4_ext_path *path = NULL;
1833         struct ext4_map_blocks map;
1834         int i, ret, j;
1835         ext4_lblk_t cur, end;
1836
1837         state = &EXT4_SB(sb)->s_fc_replay_state;
1838         for (i = 0; i < state->fc_modified_inodes_used; i++) {
1839                 inode = ext4_iget(sb, state->fc_modified_inodes[i],
1840                         EXT4_IGET_NORMAL);
1841                 if (IS_ERR(inode)) {
1842                         jbd_debug(1, "Inode %d not found.",
1843                                 state->fc_modified_inodes[i]);
1844                         continue;
1845                 }
1846                 cur = 0;
1847                 end = EXT_MAX_BLOCKS;
1848                 while (cur < end) {
1849                         map.m_lblk = cur;
1850                         map.m_len = end - cur;
1851
1852                         ret = ext4_map_blocks(NULL, inode, &map, 0);
1853                         if (ret < 0)
1854                                 break;
1855
1856                         if (ret > 0) {
1857                                 path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1858                                 if (!IS_ERR(path)) {
1859                                         for (j = 0; j < path->p_depth; j++)
1860                                                 ext4_mb_mark_bb(inode->i_sb,
1861                                                         path[j].p_block, 1, 1);
1862                                         ext4_ext_drop_refs(path);
1863                                         kfree(path);
1864                                 }
1865                                 cur += ret;
1866                                 ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1867                                                         map.m_len, 1);
1868                         } else {
1869                                 cur = cur + (map.m_len ? map.m_len : 1);
1870                         }
1871                 }
1872                 iput(inode);
1873         }
1874 }
1875
1876 /*
1877  * Check if block is in excluded regions for block allocation. The simple
1878  * allocator that runs during replay phase is calls this function to see
1879  * if it is okay to use a block.
1880  */
1881 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1882 {
1883         int i;
1884         struct ext4_fc_replay_state *state;
1885
1886         state = &EXT4_SB(sb)->s_fc_replay_state;
1887         for (i = 0; i < state->fc_regions_valid; i++) {
1888                 if (state->fc_regions[i].ino == 0 ||
1889                         state->fc_regions[i].len == 0)
1890                         continue;
1891                 if (blk >= state->fc_regions[i].pblk &&
1892                     blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1893                         return true;
1894         }
1895         return false;
1896 }
1897
1898 /* Cleanup function called after replay */
1899 void ext4_fc_replay_cleanup(struct super_block *sb)
1900 {
1901         struct ext4_sb_info *sbi = EXT4_SB(sb);
1902
1903         sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1904         kfree(sbi->s_fc_replay_state.fc_regions);
1905         kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1906 }
1907
1908 /*
1909  * Recovery Scan phase handler
1910  *
1911  * This function is called during the scan phase and is responsible
1912  * for doing following things:
1913  * - Make sure the fast commit area has valid tags for replay
1914  * - Count number of tags that need to be replayed by the replay handler
1915  * - Verify CRC
1916  * - Create a list of excluded blocks for allocation during replay phase
1917  *
1918  * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1919  * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1920  * to indicate that scan has finished and JBD2 can now start replay phase.
1921  * It returns a negative error to indicate that there was an error. At the end
1922  * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
1923  * to indicate the number of tags that need to replayed during the replay phase.
1924  */
1925 static int ext4_fc_replay_scan(journal_t *journal,
1926                                 struct buffer_head *bh, int off,
1927                                 tid_t expected_tid)
1928 {
1929         struct super_block *sb = journal->j_private;
1930         struct ext4_sb_info *sbi = EXT4_SB(sb);
1931         struct ext4_fc_replay_state *state;
1932         int ret = JBD2_FC_REPLAY_CONTINUE;
1933         struct ext4_fc_add_range ext;
1934         struct ext4_fc_tl tl;
1935         struct ext4_fc_tail tail;
1936         __u8 *start, *end, *cur, *val;
1937         struct ext4_fc_head head;
1938         struct ext4_extent *ex;
1939
1940         state = &sbi->s_fc_replay_state;
1941
1942         start = (u8 *)bh->b_data;
1943         end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
1944
1945         if (state->fc_replay_expected_off == 0) {
1946                 state->fc_cur_tag = 0;
1947                 state->fc_replay_num_tags = 0;
1948                 state->fc_crc = 0;
1949                 state->fc_regions = NULL;
1950                 state->fc_regions_valid = state->fc_regions_used =
1951                         state->fc_regions_size = 0;
1952                 /* Check if we can stop early */
1953                 if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
1954                         != EXT4_FC_TAG_HEAD)
1955                         return 0;
1956         }
1957
1958         if (off != state->fc_replay_expected_off) {
1959                 ret = -EFSCORRUPTED;
1960                 goto out_err;
1961         }
1962
1963         state->fc_replay_expected_off++;
1964         for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
1965                 memcpy(&tl, cur, sizeof(tl));
1966                 val = cur + sizeof(tl);
1967                 jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
1968                           tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
1969                 switch (le16_to_cpu(tl.fc_tag)) {
1970                 case EXT4_FC_TAG_ADD_RANGE:
1971                         memcpy(&ext, val, sizeof(ext));
1972                         ex = (struct ext4_extent *)&ext.fc_ex;
1973                         ret = ext4_fc_record_regions(sb,
1974                                 le32_to_cpu(ext.fc_ino),
1975                                 le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
1976                                 ext4_ext_get_actual_len(ex));
1977                         if (ret < 0)
1978                                 break;
1979                         ret = JBD2_FC_REPLAY_CONTINUE;
1980                         fallthrough;
1981                 case EXT4_FC_TAG_DEL_RANGE:
1982                 case EXT4_FC_TAG_LINK:
1983                 case EXT4_FC_TAG_UNLINK:
1984                 case EXT4_FC_TAG_CREAT:
1985                 case EXT4_FC_TAG_INODE:
1986                 case EXT4_FC_TAG_PAD:
1987                         state->fc_cur_tag++;
1988                         state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
1989                                         sizeof(tl) + le16_to_cpu(tl.fc_len));
1990                         break;
1991                 case EXT4_FC_TAG_TAIL:
1992                         state->fc_cur_tag++;
1993                         memcpy(&tail, val, sizeof(tail));
1994                         state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
1995                                                 sizeof(tl) +
1996                                                 offsetof(struct ext4_fc_tail,
1997                                                 fc_crc));
1998                         if (le32_to_cpu(tail.fc_tid) == expected_tid &&
1999                                 le32_to_cpu(tail.fc_crc) == state->fc_crc) {
2000                                 state->fc_replay_num_tags = state->fc_cur_tag;
2001                                 state->fc_regions_valid =
2002                                         state->fc_regions_used;
2003                         } else {
2004                                 ret = state->fc_replay_num_tags ?
2005                                         JBD2_FC_REPLAY_STOP : -EFSBADCRC;
2006                         }
2007                         state->fc_crc = 0;
2008                         break;
2009                 case EXT4_FC_TAG_HEAD:
2010                         memcpy(&head, val, sizeof(head));
2011                         if (le32_to_cpu(head.fc_features) &
2012                                 ~EXT4_FC_SUPPORTED_FEATURES) {
2013                                 ret = -EOPNOTSUPP;
2014                                 break;
2015                         }
2016                         if (le32_to_cpu(head.fc_tid) != expected_tid) {
2017                                 ret = JBD2_FC_REPLAY_STOP;
2018                                 break;
2019                         }
2020                         state->fc_cur_tag++;
2021                         state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2022                                             sizeof(tl) + le16_to_cpu(tl.fc_len));
2023                         break;
2024                 default:
2025                         ret = state->fc_replay_num_tags ?
2026                                 JBD2_FC_REPLAY_STOP : -ECANCELED;
2027                 }
2028                 if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2029                         break;
2030         }
2031
2032 out_err:
2033         trace_ext4_fc_replay_scan(sb, ret, off);
2034         return ret;
2035 }
2036
2037 /*
2038  * Main recovery path entry point.
2039  * The meaning of return codes is similar as above.
2040  */
2041 static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2042                                 enum passtype pass, int off, tid_t expected_tid)
2043 {
2044         struct super_block *sb = journal->j_private;
2045         struct ext4_sb_info *sbi = EXT4_SB(sb);
2046         struct ext4_fc_tl tl;
2047         __u8 *start, *end, *cur, *val;
2048         int ret = JBD2_FC_REPLAY_CONTINUE;
2049         struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2050         struct ext4_fc_tail tail;
2051
2052         if (pass == PASS_SCAN) {
2053                 state->fc_current_pass = PASS_SCAN;
2054                 return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2055         }
2056
2057         if (state->fc_current_pass != pass) {
2058                 state->fc_current_pass = pass;
2059                 sbi->s_mount_state |= EXT4_FC_REPLAY;
2060         }
2061         if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2062                 jbd_debug(1, "Replay stops\n");
2063                 ext4_fc_set_bitmaps_and_counters(sb);
2064                 return 0;
2065         }
2066
2067 #ifdef CONFIG_EXT4_DEBUG
2068         if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2069                 pr_warn("Dropping fc block %d because max_replay set\n", off);
2070                 return JBD2_FC_REPLAY_STOP;
2071         }
2072 #endif
2073
2074         start = (u8 *)bh->b_data;
2075         end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2076
2077         for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
2078                 memcpy(&tl, cur, sizeof(tl));
2079                 val = cur + sizeof(tl);
2080
2081                 if (state->fc_replay_num_tags == 0) {
2082                         ret = JBD2_FC_REPLAY_STOP;
2083                         ext4_fc_set_bitmaps_and_counters(sb);
2084                         break;
2085                 }
2086                 jbd_debug(3, "Replay phase, tag:%s\n",
2087                                 tag2str(le16_to_cpu(tl.fc_tag)));
2088                 state->fc_replay_num_tags--;
2089                 switch (le16_to_cpu(tl.fc_tag)) {
2090                 case EXT4_FC_TAG_LINK:
2091                         ret = ext4_fc_replay_link(sb, &tl, val);
2092                         break;
2093                 case EXT4_FC_TAG_UNLINK:
2094                         ret = ext4_fc_replay_unlink(sb, &tl, val);
2095                         break;
2096                 case EXT4_FC_TAG_ADD_RANGE:
2097                         ret = ext4_fc_replay_add_range(sb, &tl, val);
2098                         break;
2099                 case EXT4_FC_TAG_CREAT:
2100                         ret = ext4_fc_replay_create(sb, &tl, val);
2101                         break;
2102                 case EXT4_FC_TAG_DEL_RANGE:
2103                         ret = ext4_fc_replay_del_range(sb, &tl, val);
2104                         break;
2105                 case EXT4_FC_TAG_INODE:
2106                         ret = ext4_fc_replay_inode(sb, &tl, val);
2107                         break;
2108                 case EXT4_FC_TAG_PAD:
2109                         trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2110                                              le16_to_cpu(tl.fc_len), 0);
2111                         break;
2112                 case EXT4_FC_TAG_TAIL:
2113                         trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2114                                              le16_to_cpu(tl.fc_len), 0);
2115                         memcpy(&tail, val, sizeof(tail));
2116                         WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
2117                         break;
2118                 case EXT4_FC_TAG_HEAD:
2119                         break;
2120                 default:
2121                         trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
2122                                              le16_to_cpu(tl.fc_len), 0);
2123                         ret = -ECANCELED;
2124                         break;
2125                 }
2126                 if (ret < 0)
2127                         break;
2128                 ret = JBD2_FC_REPLAY_CONTINUE;
2129         }
2130         return ret;
2131 }
2132
2133 void ext4_fc_init(struct super_block *sb, journal_t *journal)
2134 {
2135         /*
2136          * We set replay callback even if fast commit disabled because we may
2137          * could still have fast commit blocks that need to be replayed even if
2138          * fast commit has now been turned off.
2139          */
2140         journal->j_fc_replay_callback = ext4_fc_replay;
2141         if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2142                 return;
2143         journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2144 }
2145
/*
 * Human readable names for the reasons a commit was ineligible for fast
 * commit. ext4_fc_info_show() indexes this table with every value below
 * EXT4_FC_REASON_MAX, so it needs one entry per reason, in the same order.
 * The table is never written, hence the pointers themselves are const too.
 */
static const char * const fc_ineligible_reasons[] = {
        "Extended attributes changed",
        "Cross rename",
        "Journal flag changed",
        "Insufficient memory",
        "Swap boot",
        "Resize",
        "Dir renamed",
        "Falloc range op",
        "Data journalling",
        "FC Commit Failed"
};
2158
2159 int ext4_fc_info_show(struct seq_file *seq, void *v)
2160 {
2161         struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2162         struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2163         int i;
2164
2165         if (v != SEQ_START_TOKEN)
2166                 return 0;
2167
2168         seq_printf(seq,
2169                 "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2170                    stats->fc_num_commits, stats->fc_ineligible_commits,
2171                    stats->fc_numblks,
2172                    div_u64(sbi->s_fc_avg_commit_time, 1000));
2173         seq_puts(seq, "Ineligible reasons:\n");
2174         for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2175                 seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2176                         stats->fc_ineligible_reason_count[i]);
2177
2178         return 0;
2179 }
2180
2181 int __init ext4_fc_init_dentry_cache(void)
2182 {
2183         ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2184                                            SLAB_RECLAIM_ACCOUNT);
2185
2186         if (ext4_fc_dentry_cachep == NULL)
2187                 return -ENOMEM;
2188
2189         return 0;
2190 }