fs/ext4/fast_commit.c

   1 // SPDX-License-Identifier: GPL-2.0
   2
   3 /*
   4  * fs/ext4/fast_commit.c
   5  *
   6  * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
   7  *
   8  * Ext4 fast commits routines.
   9  */
  10 #include "ext4.h"
  11 #include "ext4_jbd2.h"
  12 #include "ext4_extents.h"
  13 #include "mballoc.h"
  14
  15 /*
  16  * Ext4 Fast Commits
  17  * -----------------
  18  *
  19  * Ext4 fast commits implement fine grained journalling for Ext4.
  20  *
  21  * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
  22  * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
  23  * TLV during the recovery phase. For the scenarios for which we currently
  24  * don't have replay code, fast commit falls back to full commits.
  25  * Fast commits record delta in one of the following three categories.
  26  *
  27  * (A) Directory entry updates:
  28  *
  29  * - EXT4_FC_TAG_UNLINK         - records directory entry unlink
  30  * - EXT4_FC_TAG_LINK           - records directory entry link
  31  * - EXT4_FC_TAG_CREAT          - records inode and directory entry creation
  32  *
  33  * (B) File specific data range updates:
  34  *
  35  * - EXT4_FC_TAG_ADD_RANGE      - records addition of new blocks to an inode
  36  * - EXT4_FC_TAG_DEL_RANGE      - records deletion of blocks from an inode
  37  *
  38  * (C) Inode metadata (mtime / ctime etc):
  39  *
  40  * - EXT4_FC_TAG_INODE          - record the inode that should be replayed
  41  *                                during recovery. Note that iblocks field is
  42  *                                not replayed and instead derived during
  43  *                                replay.
  44  * Commit Operation
  45  * ----------------
  46  * With fast commits, we maintain all the directory entry operations in the
  47  * order in which they are issued in an in-memory queue. This queue is flushed
  48  * to disk during the commit operation. We also maintain a list of inodes
  49  * that need to be committed during a fast commit in another in memory queue of
  50  * inodes. During the commit operation, we commit in the following order:
  51  *
  52  * [1] Lock inodes for any further data updates by setting COMMITTING state
  53  * [2] Submit data buffers of all the inodes
  54  * [3] Wait for [2] to complete
  55  * [4] Commit all the directory entry updates in the fast commit space
  56  * [5] Commit all the changed inode structures
  57  * [6] Write tail tag (this tag ensures the atomicity, please read the following
  58  *     section for more details).
  59  * [7] Wait for [4], [5] and [6] to complete.
  60  *
  61  * All the inode updates must call ext4_fc_start_update() before starting an
  62  * update. If such an ongoing update is present, fast commit waits for it to
  63  * complete. The completion of such an update is marked by
  64  * ext4_fc_stop_update().
  65  *
  66  * Fast Commit Ineligibility
  67  * -------------------------
  68  * Not all operations are supported by fast commits today (e.g. extended
  69  * attributes). Fast commit ineligibility is marked by calling one of the
  70  * following two functions:
  71  *
  72  * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall
  73  *   back to full commit. This is useful in case of transient errors.
  74  *
  75  * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
  76  *   the fast commits happening between ext4_fc_start_ineligible() and
  77  *   ext4_fc_stop_ineligible() and one fast commit after the call to
  78  *   ext4_fc_stop_ineligible() to fall back to full commits. It is important to
  79  *   make one more fast commit to fall back to full commit after stop call so
  80  *   that it is guaranteed that the fast commit ineligible operation contained
  81  *   within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is
  82  *   followed by at least 1 full commit.
  83  *
  84  * Atomicity of commits
  85  * --------------------
  86  * In order to guarantee atomicity during the commit operation, fast commit
  87  * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
  88  * tag contains CRC of the contents and TID of the transaction after which
  89  * this fast commit should be applied. Recovery code replays fast commit
  90  * logs only if there's at least 1 valid tail present. For every fast commit
  91  * operation, there is 1 tail. This means, we may end up with multiple tails
  92  * in the fast commit space. Here's an example:
  93  *
  94  * - Create a new file A and remove existing file B
  95  * - fsync()
  96  * - Append contents to file A
  97  * - Truncate file A
  98  * - fsync()
  99  *
 100  * The fast commit space at the end of above operations would look like this:
 101  *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 102  *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
 103  *
 104  * Replay code should thus check for all the valid tails in the FC area.
 105  *
 106  * TODOs
 107  * -----
 108  * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
 109  *    eligible update must be protected within ext4_fc_start_update() and
 110  *    ext4_fc_stop_update(). These routines are called from much
 111  *    higher-level routines. This can be made more fine grained by
 112  *    combining with ext4_journal_start().
 113  *
 114  * 2) Same as above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
 115  *
 116  * 3) Handle more ineligible cases.
 117  */
 118
 119 #include <trace/events/ext4.h>
/*
 * Slab cache from which __track_dentry_update() allocates its
 * struct ext4_fc_dentry_update nodes.
 */
static struct kmem_cache *ext4_fc_dentry_cachep;
 121
 122 static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 123 {
 124         BUFFER_TRACE(bh, "");
 125         if (uptodate) {
 126                 ext4_debug("%s: Block %lld up-to-date",
 127                            __func__, bh->b_blocknr);
 128                 set_buffer_uptodate(bh);
 129         } else {
 130                 ext4_debug("%s: Block %lld not up-to-date",
 131                            __func__, bh->b_blocknr);
 132                 clear_buffer_uptodate(bh);
 133         }
 134
 135         unlock_buffer(bh);
 136 }
 137
 138 static inline void ext4_fc_reset_inode(struct inode *inode)
 139 {
 140         struct ext4_inode_info *ei = EXT4_I(inode);
 141
 142         ei->i_fc_lblk_start = 0;
 143         ei->i_fc_lblk_len = 0;
 144 }
 145
 146 void ext4_fc_init_inode(struct inode *inode)
 147 {
 148         struct ext4_inode_info *ei = EXT4_I(inode);
 149
 150         ext4_fc_reset_inode(inode);
 151         ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
 152         INIT_LIST_HEAD(&ei->i_fc_list);
 153         init_waitqueue_head(&ei->i_fc_wait);
 154         atomic_set(&ei->i_fc_updates, 0);
 155 }
 156
 157 /* This function must be called with sbi->s_fc_lock held. */
 158 static void ext4_fc_wait_committing_inode(struct inode *inode)
 159 {
 160         wait_queue_head_t *wq;
 161         struct ext4_inode_info *ei = EXT4_I(inode);
 162
 163 #if (BITS_PER_LONG < 64)
 164         DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
 165                         EXT4_STATE_FC_COMMITTING);
 166         wq = bit_waitqueue(&ei->i_state_flags,
 167                                 EXT4_STATE_FC_COMMITTING);
 168 #else
 169         DEFINE_WAIT_BIT(wait, &ei->i_flags,
 170                         EXT4_STATE_FC_COMMITTING);
 171         wq = bit_waitqueue(&ei->i_flags,
 172                                 EXT4_STATE_FC_COMMITTING);
 173 #endif
 174         lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
 175         prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
 176         spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 177         schedule();
 178         finish_wait(wq, &wait.wq_entry);
 179 }
 180
 181 /*
 182  * Inform Ext4's fast about start of an inode update
 183  *
 184  * This function is called by the high level call VFS callbacks before
 185  * performing any inode update. This function blocks if there's an ongoing
 186  * fast commit on the inode in question.
 187  */
 188 void ext4_fc_start_update(struct inode *inode)
 189 {
 190         struct ext4_inode_info *ei = EXT4_I(inode);
 191
 192         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 193             (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 194                 return;
 195
 196 restart:
 197         spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 198         if (list_empty(&ei->i_fc_list))
 199                 goto out;
 200
 201         if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
 202                 ext4_fc_wait_committing_inode(inode);
 203                 goto restart;
 204         }
 205 out:
 206         atomic_inc(&ei->i_fc_updates);
 207         spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 208 }
 209
 210 /*
 211  * Stop inode update and wake up waiting fast commits if any.
 212  */
 213 void ext4_fc_stop_update(struct inode *inode)
 214 {
 215         struct ext4_inode_info *ei = EXT4_I(inode);
 216
 217         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 218             (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 219                 return;
 220
 221         if (atomic_dec_and_test(&ei->i_fc_updates))
 222                 wake_up_all(&ei->i_fc_wait);
 223 }
 224
 225 /*
 226  * Remove inode from fast commit list. If the inode is being committed
 227  * we wait until inode commit is done.
 228  */
 229 void ext4_fc_del(struct inode *inode)
 230 {
 231         struct ext4_inode_info *ei = EXT4_I(inode);
 232
 233         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 234             (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 235                 return;
 236
 237 restart:
 238         spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 239         if (list_empty(&ei->i_fc_list)) {
 240                 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 241                 return;
 242         }
 243
 244         if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
 245                 ext4_fc_wait_committing_inode(inode);
 246                 goto restart;
 247         }
 248         list_del_init(&ei->i_fc_list);
 249         spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 250 }
 251
 252 /*
 253  * Mark file system as fast commit ineligible. This means that next commit
 254  * operation would result in a full jbd2 commit.
 255  */
 256 void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
 257 {
 258         struct ext4_sb_info *sbi = EXT4_SB(sb);
 259
 260         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 261             (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
 262                 return;
 263
 264         sbi->s_mount_flags |= EXT4_MF_FC_INELIGIBLE;
 265         WARN_ON(reason >= EXT4_FC_REASON_MAX);
 266         sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
 267 }
 268
 269 /*
 270  * Start a fast commit ineligible update. Any commits that happen while
 271  * such an operation is in progress fall back to full commits.
 272  */
 273 void ext4_fc_start_ineligible(struct super_block *sb, int reason)
 274 {
 275         struct ext4_sb_info *sbi = EXT4_SB(sb);
 276
 277         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 278             (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
 279                 return;
 280
 281         WARN_ON(reason >= EXT4_FC_REASON_MAX);
 282         sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
 283         atomic_inc(&sbi->s_fc_ineligible_updates);
 284 }
 285
 286 /*
 287  * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
 288  * to ensure that after stopping the ineligible update, at least one full
 289  * commit takes place.
 290  */
 291 void ext4_fc_stop_ineligible(struct super_block *sb)
 292 {
 293         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 294             (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
 295                 return;
 296
 297         EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FC_INELIGIBLE;
 298         atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
 299 }
 300
 301 static inline int ext4_fc_is_ineligible(struct super_block *sb)
 302 {
 303         return (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FC_INELIGIBLE) ||
 304                 atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates);
 305 }
 306
 307 /*
 308  * Generic fast commit tracking function. If this is the first time this we are
 309  * called after a full commit, we initialize fast commit fields and then call
 310  * __fc_track_fn() with update = 0. If we have already been called after a full
 311  * commit, we pass update = 1. Based on that, the track function can determine
 312  * if it needs to track a field for the first time or if it needs to just
 313  * update the previously tracked value.
 314  *
 315  * If enqueue is set, this function enqueues the inode in fast commit list.
 316  */
 317 static int ext4_fc_track_template(
 318         handle_t *handle, struct inode *inode,
 319         int (*__fc_track_fn)(struct inode *, void *, bool),
 320         void *args, int enqueue)
 321 {
 322         bool update = false;
 323         struct ext4_inode_info *ei = EXT4_I(inode);
 324         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 325         tid_t tid = 0;
 326         int ret;
 327
 328         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 329             (sbi->s_mount_state & EXT4_FC_REPLAY))
 330                 return -EOPNOTSUPP;
 331
 332         if (ext4_fc_is_ineligible(inode->i_sb))
 333                 return -EINVAL;
 334
 335         tid = handle->h_transaction->t_tid;
 336         mutex_lock(&ei->i_fc_lock);
 337         if (tid == ei->i_sync_tid) {
 338                 update = true;
 339         } else {
 340                 ext4_fc_reset_inode(inode);
 341                 ei->i_sync_tid = tid;
 342         }
 343         ret = __fc_track_fn(inode, args, update);
 344         mutex_unlock(&ei->i_fc_lock);
 345
 346         if (!enqueue)
 347                 return ret;
 348
 349         spin_lock(&sbi->s_fc_lock);
 350         if (list_empty(&EXT4_I(inode)->i_fc_list))
 351                 list_add_tail(&EXT4_I(inode)->i_fc_list,
 352                                 (sbi->s_mount_flags & EXT4_MF_FC_COMMITTING) ?
 353                                 &sbi->s_fc_q[FC_Q_STAGING] :
 354                                 &sbi->s_fc_q[FC_Q_MAIN]);
 355         spin_unlock(&sbi->s_fc_lock);
 356
 357         return ret;
 358 }
 359
 360 struct __track_dentry_update_args {
 361         struct dentry *dentry;
 362         int op;
 363 };
 364
 365 /* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
 366 static int __track_dentry_update(struct inode *inode, void *arg, bool update)
 367 {
 368         struct ext4_fc_dentry_update *node;
 369         struct ext4_inode_info *ei = EXT4_I(inode);
 370         struct __track_dentry_update_args *dentry_update =
 371                 (struct __track_dentry_update_args *)arg;
 372         struct dentry *dentry = dentry_update->dentry;
 373         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 374
 375         mutex_unlock(&ei->i_fc_lock);
 376         node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
 377         if (!node) {
 378                 ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
 379                 mutex_lock(&ei->i_fc_lock);
 380                 return -ENOMEM;
 381         }
 382
 383         node->fcd_op = dentry_update->op;
 384         node->fcd_parent = dentry->d_parent->d_inode->i_ino;
 385         node->fcd_ino = inode->i_ino;
 386         if (dentry->d_name.len > DNAME_INLINE_LEN) {
 387                 node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
 388                 if (!node->fcd_name.name) {
 389                         kmem_cache_free(ext4_fc_dentry_cachep, node);
 390                         ext4_fc_mark_ineligible(inode->i_sb,
 391                                 EXT4_FC_REASON_NOMEM);
 392                         mutex_lock(&ei->i_fc_lock);
 393                         return -ENOMEM;
 394                 }
 395                 memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
 396                         dentry->d_name.len);
 397         } else {
 398                 memcpy(node->fcd_iname, dentry->d_name.name,
 399                         dentry->d_name.len);
 400                 node->fcd_name.name = node->fcd_iname;
 401         }
 402         node->fcd_name.len = dentry->d_name.len;
 403
 404         spin_lock(&sbi->s_fc_lock);
 405         if (sbi->s_mount_flags & EXT4_MF_FC_COMMITTING)
 406                 list_add_tail(&node->fcd_list,
 407                                 &sbi->s_fc_dentry_q[FC_Q_STAGING]);
 408         else
 409                 list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
 410         spin_unlock(&sbi->s_fc_lock);
 411         mutex_lock(&ei->i_fc_lock);
 412
 413         return 0;
 414 }
 415
 416 void __ext4_fc_track_unlink(handle_t *handle,
 417                 struct inode *inode, struct dentry *dentry)
 418 {
 419         struct __track_dentry_update_args args;
 420         int ret;
 421
 422         args.dentry = dentry;
 423         args.op = EXT4_FC_TAG_UNLINK;
 424
 425         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 426                                         (void *)&args, 0);
 427         trace_ext4_fc_track_unlink(inode, dentry, ret);
 428 }
 429
 430 void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
 431 {
 432         __ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
 433 }
 434
 435 void __ext4_fc_track_link(handle_t *handle,
 436         struct inode *inode, struct dentry *dentry)
 437 {
 438         struct __track_dentry_update_args args;
 439         int ret;
 440
 441         args.dentry = dentry;
 442         args.op = EXT4_FC_TAG_LINK;
 443
 444         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 445                                         (void *)&args, 0);
 446         trace_ext4_fc_track_link(inode, dentry, ret);
 447 }
 448
 449 void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
 450 {
 451         __ext4_fc_track_link(handle, d_inode(dentry), dentry);
 452 }
 453
 454 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
 455 {
 456         struct __track_dentry_update_args args;
 457         struct inode *inode = d_inode(dentry);
 458         int ret;
 459
 460         args.dentry = dentry;
 461         args.op = EXT4_FC_TAG_CREAT;
 462
 463         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 464                                         (void *)&args, 0);
 465         trace_ext4_fc_track_create(inode, dentry, ret);
 466 }
 467
 468 /* __track_fn for inode tracking */
 469 static int __track_inode(struct inode *inode, void *arg, bool update)
 470 {
 471         if (update)
 472                 return -EEXIST;
 473
 474         EXT4_I(inode)->i_fc_lblk_len = 0;
 475
 476         return 0;
 477 }
 478
 479 void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
 480 {
 481         int ret;
 482
 483         if (S_ISDIR(inode->i_mode))
 484                 return;
 485
 486         if (ext4_should_journal_data(inode)) {
 487                 ext4_fc_mark_ineligible(inode->i_sb,
 488                                         EXT4_FC_REASON_INODE_JOURNAL_DATA);
 489                 return;
 490         }
 491
 492         ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
 493         trace_ext4_fc_track_inode(inode, ret);
 494 }
 495
 496 struct __track_range_args {
 497         ext4_lblk_t start, end;
 498 };
 499
 500 /* __track_fn for tracking data updates */
 501 static int __track_range(struct inode *inode, void *arg, bool update)
 502 {
 503         struct ext4_inode_info *ei = EXT4_I(inode);
 504         ext4_lblk_t oldstart;
 505         struct __track_range_args *__arg =
 506                 (struct __track_range_args *)arg;
 507
 508         if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
 509                 ext4_debug("Special inode %ld being modified\n", inode->i_ino);
 510                 return -ECANCELED;
 511         }
 512
 513         oldstart = ei->i_fc_lblk_start;
 514
 515         if (update && ei->i_fc_lblk_len > 0) {
 516                 ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
 517                 ei->i_fc_lblk_len =
 518                         max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
 519                                 ei->i_fc_lblk_start + 1;
 520         } else {
 521                 ei->i_fc_lblk_start = __arg->start;
 522                 ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
 523         }
 524
 525         return 0;
 526 }
 527
 528 void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
 529                          ext4_lblk_t end)
 530 {
 531         struct __track_range_args args;
 532         int ret;
 533
 534         if (S_ISDIR(inode->i_mode))
 535                 return;
 536
 537         args.start = start;
 538         args.end = end;
 539
 540         ret = ext4_fc_track_template(handle, inode,  __track_range, &args, 1);
 541
 542         trace_ext4_fc_track_range(inode, start, end, ret);
 543 }
 544
 545 static void ext4_fc_submit_bh(struct super_block *sb)
 546 {
 547         int write_flags = REQ_SYNC;
 548         struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
 549
 550         /* TODO: REQ_FUA | REQ_PREFLUSH is unnecessarily expensive. */
 551         if (test_opt(sb, BARRIER))
 552                 write_flags |= REQ_FUA | REQ_PREFLUSH;
 553         lock_buffer(bh);
 554         set_buffer_dirty(bh);
 555         set_buffer_uptodate(bh);
 556         bh->b_end_io = ext4_end_buffer_io_sync;
 557         submit_bh(REQ_OP_WRITE, write_flags, bh);
 558         EXT4_SB(sb)->s_fc_bh = NULL;
 559 }
 560
 561 /* Ext4 commit path routines */
 562
 563 /* memzero and update CRC */
 564 static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
 565                                 u32 *crc)
 566 {
 567         void *ret;
 568
 569         ret = memset(dst, 0, len);
 570         if (crc)
 571                 *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
 572         return ret;
 573 }
 574
 575 /*
 576  * Allocate len bytes on a fast commit buffer.
 577  *
 578  * During the commit time this function is used to manage fast commit
 579  * block space. We don't split a fast commit log onto different
 580  * blocks. So this function makes sure that if there's not enough space
 581  * on the current block, the remaining space in the current block is
 582  * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
 583  * new block is from jbd2 and CRC is updated to reflect the padding
 584  * we added.
 585  */
 586 static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
 587 {
 588         struct ext4_fc_tl *tl;
 589         struct ext4_sb_info *sbi = EXT4_SB(sb);
 590         struct buffer_head *bh;
 591         int bsize = sbi->s_journal->j_blocksize;
 592         int ret, off = sbi->s_fc_bytes % bsize;
 593         int pad_len;
 594
 595         /*
 596          * After allocating len, we should have space at least for a 0 byte
 597          * padding.
 598          */
 599         if (len + sizeof(struct ext4_fc_tl) > bsize)
 600                 return NULL;
 601
 602         if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
 603                 /*
 604                  * Only allocate from current buffer if we have enough space for
 605                  * this request AND we have space to add a zero byte padding.
 606                  */
 607                 if (!sbi->s_fc_bh) {
 608                         ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
 609                         if (ret)
 610                                 return NULL;
 611                         sbi->s_fc_bh = bh;
 612                 }
 613                 sbi->s_fc_bytes += len;
 614                 return sbi->s_fc_bh->b_data + off;
 615         }
 616         /* Need to add PAD tag */
 617         tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
 618         tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
 619         pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
 620         tl->fc_len = cpu_to_le16(pad_len);
 621         if (crc)
 622                 *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
 623         if (pad_len > 0)
 624                 ext4_fc_memzero(sb, tl + 1, pad_len, crc);
 625         ext4_fc_submit_bh(sb);
 626
 627         ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
 628         if (ret)
 629                 return NULL;
 630         sbi->s_fc_bh = bh;
 631         sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
 632         return sbi->s_fc_bh->b_data;
 633 }
 634
 635 /* memcpy to fc reserved space and update CRC */
 636 static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
 637                                 int len, u32 *crc)
 638 {
 639         if (crc)
 640                 *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
 641         return memcpy(dst, src, len);
 642 }
 643
 644 /*
 645  * Complete a fast commit by writing tail tag.
 646  *
 647  * Writing tail tag marks the end of a fast commit. In order to guarantee
 648  * atomicity, after writing tail tag, even if there's space remaining
 649  * in the block, next commit shouldn't use it. That's why tail tag
 650  * has the length as that of the remaining space on the block.
 651  */
 652 static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
 653 {
 654         struct ext4_sb_info *sbi = EXT4_SB(sb);
 655         struct ext4_fc_tl tl;
 656         struct ext4_fc_tail tail;
 657         int off, bsize = sbi->s_journal->j_blocksize;
 658         u8 *dst;
 659
 660         /*
 661          * ext4_fc_reserve_space takes care of allocating an extra block if
 662          * there's no enough space on this block for accommodating this tail.
 663          */
 664         dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
 665         if (!dst)
 666                 return -ENOSPC;
 667
 668         off = sbi->s_fc_bytes % bsize;
 669
 670         tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
 671         tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
 672         sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
 673
 674         ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
 675         dst += sizeof(tl);
 676         tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
 677         ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
 678         dst += sizeof(tail.fc_tid);
 679         tail.fc_crc = cpu_to_le32(crc);
 680         ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
 681
 682         ext4_fc_submit_bh(sb);
 683
 684         return 0;
 685 }
 686
 687 /*
 688  * Adds tag, length, value and updates CRC. Returns true if tlv was added.
 689  * Returns false if there's not enough space.
 690  */
 691 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
 692                            u32 *crc)
 693 {
 694         struct ext4_fc_tl tl;
 695         u8 *dst;
 696
 697         dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
 698         if (!dst)
 699                 return false;
 700
 701         tl.fc_tag = cpu_to_le16(tag);
 702         tl.fc_len = cpu_to_le16(len);
 703
 704         ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
 705         ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
 706
 707         return true;
 708 }
 709
 710 /* Same as above, but adds dentry tlv. */
 711 static  bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
 712                                         int parent_ino, int ino, int dlen,
 713                                         const unsigned char *dname,
 714                                         u32 *crc)
 715 {
 716         struct ext4_fc_dentry_info fcd;
 717         struct ext4_fc_tl tl;
 718         u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
 719                                         crc);
 720
 721         if (!dst)
 722                 return false;
 723
 724         fcd.fc_parent_ino = cpu_to_le32(parent_ino);
 725         fcd.fc_ino = cpu_to_le32(ino);
 726         tl.fc_tag = cpu_to_le16(tag);
 727         tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
 728         ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
 729         dst += sizeof(tl);
 730         ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
 731         dst += sizeof(fcd);
 732         ext4_fc_memcpy(sb, dst, dname, dlen, crc);
 733         dst += dlen;
 734
 735         return true;
 736 }
 737
 738 /*
 739  * Writes inode in the fast commit space under TLV with tag @tag.
 740  * Returns 0 on success, error on failure.
 741  */
 742 static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
 743 {
 744         struct ext4_inode_info *ei = EXT4_I(inode);
 745         int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
 746         int ret;
 747         struct ext4_iloc iloc;
 748         struct ext4_fc_inode fc_inode;
 749         struct ext4_fc_tl tl;
 750         u8 *dst;
 751
 752         ret = ext4_get_inode_loc(inode, &iloc);
 753         if (ret)
 754                 return ret;
 755
 756         if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
 757                 inode_len += ei->i_extra_isize;
 758
 759         fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
 760         tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
 761         tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
 762
 763         dst = ext4_fc_reserve_space(inode->i_sb,
 764                         sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
 765         if (!dst)
 766                 return -ECANCELED;
 767
 768         if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
 769                 return -ECANCELED;
 770         dst += sizeof(tl);
 771         if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
 772                 return -ECANCELED;
 773         dst += sizeof(fc_inode);
 774         if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
 775                                         inode_len, crc))
 776                 return -ECANCELED;
 777
 778         return 0;
 779 }
 780
 781 /*
 782  * Writes updated data ranges for the inode in question. Updates CRC.
 783  * Returns 0 on success, error otherwise.
 784  */
 785 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
 786 {
 787         ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
 788         struct ext4_inode_info *ei = EXT4_I(inode);
 789         struct ext4_map_blocks map;
 790         struct ext4_fc_add_range fc_ext;
 791         struct ext4_fc_del_range lrange;
 792         struct ext4_extent *ex;
 793         int ret;
 794
 795         mutex_lock(&ei->i_fc_lock);
 796         if (ei->i_fc_lblk_len == 0) {
 797                 mutex_unlock(&ei->i_fc_lock);
 798                 return 0;
 799         }
 800         old_blk_size = ei->i_fc_lblk_start;
 801         new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
 802         ei->i_fc_lblk_len = 0;
 803         mutex_unlock(&ei->i_fc_lock);
 804
 805         cur_lblk_off = old_blk_size;
 806         jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
 807                   __func__, cur_lblk_off, new_blk_size, inode->i_ino);
 808
 809         while (cur_lblk_off <= new_blk_size) {
 810                 map.m_lblk = cur_lblk_off;
 811                 map.m_len = new_blk_size - cur_lblk_off + 1;
 812                 ret = ext4_map_blocks(NULL, inode, &map, 0);
 813                 if (ret < 0)
 814                         return -ECANCELED;
 815
 816                 if (map.m_len == 0) {
 817                         cur_lblk_off++;
 818                         continue;
 819                 }
 820
 821                 if (ret == 0) {
 822                         lrange.fc_ino = cpu_to_le32(inode->i_ino);
 823                         lrange.fc_lblk = cpu_to_le32(map.m_lblk);
 824                         lrange.fc_len = cpu_to_le32(map.m_len);
 825                         if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
 826                                             sizeof(lrange), (u8 *)&lrange, crc))
 827                                 return -ENOSPC;
 828                 } else {
 829                         fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
 830                         ex = (struct ext4_extent *)&fc_ext.fc_ex;
 831                         ex->ee_block = cpu_to_le32(map.m_lblk);
 832                         ex->ee_len = cpu_to_le16(map.m_len);
 833                         ext4_ext_store_pblock(ex, map.m_pblk);
 834                         if (map.m_flags & EXT4_MAP_UNWRITTEN)
 835                                 ext4_ext_mark_unwritten(ex);
 836                         else
 837                                 ext4_ext_mark_initialized(ex);
 838                         if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
 839                                             sizeof(fc_ext), (u8 *)&fc_ext, crc))
 840                                 return -ENOSPC;
 841                 }
 842
 843                 cur_lblk_off += map.m_len;
 844         }
 845
 846         return 0;
 847 }
 848
 849
/*
 * Submit data for all the fast commit inodes.
 *
 * Sets EXT4_MF_FC_COMMITTING in sbi->s_mount_flags and then, for every inode
 * on the FC_Q_MAIN queue, marks the inode EXT4_STATE_FC_COMMITTING, waits
 * until its i_fc_updates counter reads zero and submits its data through
 * jbd2_submit_inode_data().
 *
 * Returns 0 on success or the first non-zero value returned by
 * jbd2_submit_inode_data(). s_fc_lock is not held on return in either case.
 */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
        struct super_block *sb = (struct super_block *)(journal->j_private);
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_inode_info *ei;
        struct list_head *pos;
        int ret = 0;

        spin_lock(&sbi->s_fc_lock);
        sbi->s_mount_flags |= EXT4_MF_FC_COMMITTING;
        list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
                ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
                ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
                /* Wait until no update is in flight on this inode */
                while (atomic_read(&ei->i_fc_updates)) {
                        DEFINE_WAIT(wait);

                        prepare_to_wait(&ei->i_fc_wait, &wait,
                                                TASK_UNINTERRUPTIBLE);
                        /*
                         * Re-check after queueing on i_fc_wait and drop
                         * s_fc_lock while sleeping.
                         */
                        if (atomic_read(&ei->i_fc_updates)) {
                                spin_unlock(&sbi->s_fc_lock);
                                schedule();
                                spin_lock(&sbi->s_fc_lock);
                        }
                        finish_wait(&ei->i_fc_wait, &wait);
                }
                /* Data submission is done without s_fc_lock held */
                spin_unlock(&sbi->s_fc_lock);
                ret = jbd2_submit_inode_data(ei->jinode);
                if (ret)
                        return ret;
                spin_lock(&sbi->s_fc_lock);
        }
        spin_unlock(&sbi->s_fc_lock);

        return ret;
}
 886
 887 /* Wait for completion of data for all the fast commit inodes */
 888 static int ext4_fc_wait_inode_data_all(journal_t *journal)
 889 {
 890         struct super_block *sb = (struct super_block *)(journal->j_private);
 891         struct ext4_sb_info *sbi = EXT4_SB(sb);
 892         struct ext4_inode_info *pos, *n;
 893         int ret = 0;
 894
 895         spin_lock(&sbi->s_fc_lock);
 896         list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
 897                 if (!ext4_test_inode_state(&pos->vfs_inode,
 898                                            EXT4_STATE_FC_COMMITTING))
 899                         continue;
 900                 spin_unlock(&sbi->s_fc_lock);
 901
 902                 ret = jbd2_wait_inode_data(journal, pos->jinode);
 903                 if (ret)
 904                         return ret;
 905                 spin_lock(&sbi->s_fc_lock);
 906         }
 907         spin_unlock(&sbi->s_fc_lock);
 908
 909         return 0;
 910 }
 911
/*
 * Commit all the directory entry updates.
 *
 * Walks the FC_Q_MAIN dentry update queue and emits one dentry TLV per entry.
 * For EXT4_FC_TAG_CREAT entries the matching inode is looked up on the
 * FC_Q_MAIN inode queue and its inode and data range TLVs are written before
 * the dentry TLV.
 *
 * The caller in this file invokes this with sbi->s_fc_lock held; the lock is
 * dropped around every TLV write and is held again on return, on both the
 * success and the error path.
 *
 * Returns 0 on success, -ENOSPC if a dentry TLV could not be added, or the
 * error returned by ext4_fc_write_inode() / ext4_fc_write_inode_data().
 */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
{
        struct super_block *sb = (struct super_block *)(journal->j_private);
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_fc_dentry_update *fc_dentry;
        struct inode *inode;
        struct list_head *pos, *n, *fcd_pos, *fcd_n;
        struct ext4_inode_info *ei;
        int ret;

        if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
                return 0;
        list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) {
                fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update,
                                        fcd_list);
                /* Link / unlink style updates only need the dentry TLV */
                if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
                        spin_unlock(&sbi->s_fc_lock);
                        if (!ext4_fc_add_dentry_tlv(
                                sb, fc_dentry->fcd_op,
                                fc_dentry->fcd_parent, fc_dentry->fcd_ino,
                                fc_dentry->fcd_name.len,
                                fc_dentry->fcd_name.name, crc)) {
                                ret = -ENOSPC;
                                goto lock_and_exit;
                        }
                        spin_lock(&sbi->s_fc_lock);
                        continue;
                }

                /* Find the in-memory inode this create refers to */
                inode = NULL;
                list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
                        ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
                        if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
                                inode = &ei->vfs_inode;
                                break;
                        }
                }
                /*
                 * If we don't find inode in our list, then it was deleted,
                 * in which case, we don't need to record its create tag.
                 */
                if (!inode)
                        continue;
                spin_unlock(&sbi->s_fc_lock);

                /*
                 * We first write the inode and then the create dirent. This
                 * allows the recovery code to create an unnamed inode first
                 * and then link it to a directory entry. This allows us
                 * to use namei.c routines almost as is and simplifies
                 * the recovery code.
                 */
                ret = ext4_fc_write_inode(inode, crc);
                if (ret)
                        goto lock_and_exit;

                ret = ext4_fc_write_inode_data(inode, crc);
                if (ret)
                        goto lock_and_exit;

                if (!ext4_fc_add_dentry_tlv(
                        sb, fc_dentry->fcd_op,
                        fc_dentry->fcd_parent, fc_dentry->fcd_ino,
                        fc_dentry->fcd_name.len,
                        fc_dentry->fcd_name.name, crc)) {
                        ret = -ENOSPC;
                        goto lock_and_exit;
                }

                spin_lock(&sbi->s_fc_lock);
        }
        return 0;
lock_and_exit:
        /* Every error path arrives here unlocked; re-take the lock */
        spin_lock(&sbi->s_fc_lock);
        return ret;
}
 989
 990 static int ext4_fc_perform_commit(journal_t *journal)
 991 {
 992         struct super_block *sb = (struct super_block *)(journal->j_private);
 993         struct ext4_sb_info *sbi = EXT4_SB(sb);
 994         struct ext4_inode_info *iter;
 995         struct ext4_fc_head head;
 996         struct list_head *pos;
 997         struct inode *inode;
 998         struct blk_plug plug;
 999         int ret = 0;
1000         u32 crc = 0;
1001
1002         ret = ext4_fc_submit_inode_data_all(journal);
1003         if (ret)
1004                 return ret;
1005
1006         ret = ext4_fc_wait_inode_data_all(journal);
1007         if (ret)
1008                 return ret;
1009
1010         blk_start_plug(&plug);
1011         if (sbi->s_fc_bytes == 0) {
1012                 /*
1013                  * Add a head tag only if this is the first fast commit
1014                  * in this TID.
1015                  */
1016                 head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1017                 head.fc_tid = cpu_to_le32(
1018                         sbi->s_journal->j_running_transaction->t_tid);
1019                 if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1020                         (u8 *)&head, &crc))
1021                         goto out;
1022         }
1023
1024         spin_lock(&sbi->s_fc_lock);
1025         ret = ext4_fc_commit_dentry_updates(journal, &crc);
1026         if (ret) {
1027                 spin_unlock(&sbi->s_fc_lock);
1028                 goto out;
1029         }
1030
1031         list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
1032                 iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
1033                 inode = &iter->vfs_inode;
1034                 if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1035                         continue;
1036
1037                 spin_unlock(&sbi->s_fc_lock);
1038                 ret = ext4_fc_write_inode_data(inode, &crc);
1039                 if (ret)
1040                         goto out;
1041                 ret = ext4_fc_write_inode(inode, &crc);
1042                 if (ret)
1043                         goto out;
1044                 spin_lock(&sbi->s_fc_lock);
1045         }
1046         spin_unlock(&sbi->s_fc_lock);
1047
1048         ret = ext4_fc_write_tail(sb, crc);
1049
1050 out:
1051         blk_finish_plug(&plug);
1052         return ret;
1053 }
1054
/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If it's not possible to perform a fast commit
 * due to various reasons, we fall back to full commit. Returns 0
 * on success, error otherwise.
 *
 * The outcome is tracked in the local "reason":
 *  - EXT4_FC_REASON_FC_FAILED: returns jbd2_fc_end_commit_fallback().
 *  - EXT4_FC_REASON_FC_START_FAILED / EXT4_FC_REASON_INELIGIBLE: returns
 *    jbd2_complete_transaction() for commit_tid.
 *  - anything else: returns 0.
 * Commit statistics and the running average commit time are updated on
 * every call.
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
        struct super_block *sb = (struct super_block *)(journal->j_private);
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        int nblks = 0, ret, bsize = journal->j_blocksize;
        int subtid = atomic_read(&sbi->s_fc_subtid);
        int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
        ktime_t start_time, commit_time;

        trace_ext4_fc_commit_start(sb);

        start_time = ktime_get();

        if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
                (ext4_fc_is_ineligible(sb))) {
                reason = EXT4_FC_REASON_INELIGIBLE;
                goto out;
        }

restart_fc:
        ret = jbd2_fc_begin_commit(journal, commit_tid);
        if (ret == -EALREADY) {
                /* There was an ongoing commit, check if we need to restart */
                if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
                        commit_tid > journal->j_commit_sequence)
                        goto restart_fc;
                reason = EXT4_FC_REASON_ALREADY_COMMITTED;
                goto out;
        } else if (ret) {
                sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
                reason = EXT4_FC_REASON_FC_START_FAILED;
                goto out;
        }

        /* Number of fast commit blocks in use before this commit */
        fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
        ret = ext4_fc_perform_commit(journal);
        if (ret < 0) {
                sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
                reason = EXT4_FC_REASON_FC_FAILED;
                goto out;
        }
        /* Blocks added by this commit; these are the ones to wait on */
        nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
        ret = jbd2_fc_wait_bufs(journal, nblks);
        if (ret < 0) {
                sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
                reason = EXT4_FC_REASON_FC_FAILED;
                goto out;
        }
        atomic_inc(&sbi->s_fc_subtid);
        jbd2_fc_end_commit(journal);
out:
        /* Has any ineligible update happened since we started? */
        if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
                sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
                reason = EXT4_FC_REASON_INELIGIBLE;
        }

        spin_lock(&sbi->s_fc_lock);
        if (reason != EXT4_FC_REASON_OK &&
                reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
                sbi->s_fc_stats.fc_ineligible_commits++;
        } else {
                sbi->s_fc_stats.fc_num_commits++;
                sbi->s_fc_stats.fc_numblks += nblks;
        }
        spin_unlock(&sbi->s_fc_lock);
        /* Only a fully successful fast commit reports a block count */
        nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
        trace_ext4_fc_commit_stop(sb, nblks, reason);
        commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
        /*
         * weight the commit time higher than the average time so we don't
         * react too strongly to vast changes in the commit time
         */
        if (likely(sbi->s_fc_avg_commit_time))
                sbi->s_fc_avg_commit_time = (commit_time +
                                sbi->s_fc_avg_commit_time * 3) / 4;
        else
                sbi->s_fc_avg_commit_time = commit_time;
        jbd_debug(1,
                "Fast commit ended with blks = %d, reason = %d, subtid - %d",
                nblks, reason, subtid);
        if (reason == EXT4_FC_REASON_FC_FAILED)
                return jbd2_fc_end_commit_fallback(journal);
        if (reason == EXT4_FC_REASON_FC_START_FAILED ||
                reason == EXT4_FC_REASON_INELIGIBLE)
                return jbd2_complete_transaction(journal, commit_tid);
        return 0;
}
1149
1150 /*
1151  * Fast commit cleanup routine. This is called after every fast commit and
1152  * full commit. full is true if we are called after a full commit.
1153  */
1154 static void ext4_fc_cleanup(journal_t *journal, int full)
1155 {
1156         struct super_block *sb = journal->j_private;
1157         struct ext4_sb_info *sbi = EXT4_SB(sb);
1158         struct ext4_inode_info *iter;
1159         struct ext4_fc_dentry_update *fc_dentry;
1160         struct list_head *pos, *n;
1161
1162         if (full && sbi->s_fc_bh)
1163                 sbi->s_fc_bh = NULL;
1164
1165         jbd2_fc_release_bufs(journal);
1166
1167         spin_lock(&sbi->s_fc_lock);
1168         list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
1169                 iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
1170                 list_del_init(&iter->i_fc_list);
1171                 ext4_clear_inode_state(&iter->vfs_inode,
1172                                        EXT4_STATE_FC_COMMITTING);
1173                 ext4_fc_reset_inode(&iter->vfs_inode);
1174                 /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1175                 smp_mb();
1176 #if (BITS_PER_LONG < 64)
1177                 wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1178 #else
1179                 wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1180 #endif
1181         }
1182
1183         while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1184                 fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1185                                              struct ext4_fc_dentry_update,
1186                                              fcd_list);
1187                 list_del_init(&fc_dentry->fcd_list);
1188                 spin_unlock(&sbi->s_fc_lock);
1189
1190                 if (fc_dentry->fcd_name.name &&
1191                         fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1192                         kfree(fc_dentry->fcd_name.name);
1193                 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1194                 spin_lock(&sbi->s_fc_lock);
1195         }
1196
1197         list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1198                                 &sbi->s_fc_dentry_q[FC_Q_MAIN]);
1199         list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1200                                 &sbi->s_fc_q[FC_Q_STAGING]);
1201
1202         sbi->s_mount_flags &= ~EXT4_MF_FC_COMMITTING;
1203         sbi->s_mount_flags &= ~EXT4_MF_FC_INELIGIBLE;
1204
1205         if (full)
1206                 sbi->s_fc_bytes = 0;
1207         spin_unlock(&sbi->s_fc_lock);
1208         trace_ext4_fc_stats(sb);
1209 }
1210
1211 /* Ext4 Replay Path Routines */
1212
1213 /* Get length of a particular tlv */
1214 static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl)
1215 {
1216         return le16_to_cpu(tl->fc_len);
1217 }
1218
1219 /* Get a pointer to "value" of a tlv */
1220 static inline u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl)
1221 {
1222         return (u8 *)tl + sizeof(*tl);
1223 }
1224
/*
 * Helper struct for dentry replay routines; holds the fields decoded from a
 * dentry tlv in CPU byte order.
 */
struct dentry_info_args {
        int parent_ino;         /* inode number of the parent directory */
        int dname_len;          /* length of dname in bytes */
        int ino;                /* inode number the entry refers to */
        int inode_len;
        char *dname;            /* entry name, points into the tlv payload */
};
1230
1231 static inline void tl_to_darg(struct dentry_info_args *darg,
1232                                 struct  ext4_fc_tl *tl)
1233 {
1234         struct ext4_fc_dentry_info *fcd;
1235
1236         fcd = (struct ext4_fc_dentry_info *)ext4_fc_tag_val(tl);
1237
1238         darg->parent_ino = le32_to_cpu(fcd->fc_parent_ino);
1239         darg->ino = le32_to_cpu(fcd->fc_ino);
1240         darg->dname = fcd->fc_dname;
1241         darg->dname_len = ext4_fc_tag_len(tl) -
1242                         sizeof(struct ext4_fc_dentry_info);
1243 }
1244
1245 /* Unlink replay function */
1246 static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl)
1247 {
1248         struct inode *inode, *old_parent;
1249         struct qstr entry;
1250         struct dentry_info_args darg;
1251         int ret = 0;
1252
1253         tl_to_darg(&darg, tl);
1254
1255         trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1256                         darg.parent_ino, darg.dname_len);
1257
1258         entry.name = darg.dname;
1259         entry.len = darg.dname_len;
1260         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1261
1262         if (IS_ERR_OR_NULL(inode)) {
1263                 jbd_debug(1, "Inode %d not found", darg.ino);
1264                 return 0;
1265         }
1266
1267         old_parent = ext4_iget(sb, darg.parent_ino,
1268                                 EXT4_IGET_NORMAL);
1269         if (IS_ERR_OR_NULL(old_parent)) {
1270                 jbd_debug(1, "Dir with inode  %d not found", darg.parent_ino);
1271                 iput(inode);
1272                 return 0;
1273         }
1274
1275         ret = __ext4_unlink(NULL, old_parent, &entry, inode);
1276         /* -ENOENT ok coz it might not exist anymore. */
1277         if (ret == -ENOENT)
1278                 ret = 0;
1279         iput(old_parent);
1280         iput(inode);
1281         return ret;
1282 }
1283
1284 static int ext4_fc_replay_link_internal(struct super_block *sb,
1285                                 struct dentry_info_args *darg,
1286                                 struct inode *inode)
1287 {
1288         struct inode *dir = NULL;
1289         struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1290         struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1291         int ret = 0;
1292
1293         dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1294         if (IS_ERR(dir)) {
1295                 jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
1296                 dir = NULL;
1297                 goto out;
1298         }
1299
1300         dentry_dir = d_obtain_alias(dir);
1301         if (IS_ERR(dentry_dir)) {
1302                 jbd_debug(1, "Failed to obtain dentry");
1303                 dentry_dir = NULL;
1304                 goto out;
1305         }
1306
1307         dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1308         if (!dentry_inode) {
1309                 jbd_debug(1, "Inode dentry not created.");
1310                 ret = -ENOMEM;
1311                 goto out;
1312         }
1313
1314         ret = __ext4_link(dir, inode, dentry_inode);
1315         /*
1316          * It's possible that link already existed since data blocks
1317          * for the dir in question got persisted before we crashed OR
1318          * we replayed this tag and crashed before the entire replay
1319          * could complete.
1320          */
1321         if (ret && ret != -EEXIST) {
1322                 jbd_debug(1, "Failed to link\n");
1323                 goto out;
1324         }
1325
1326         ret = 0;
1327 out:
1328         if (dentry_dir) {
1329                 d_drop(dentry_dir);
1330                 dput(dentry_dir);
1331         } else if (dir) {
1332                 iput(dir);
1333         }
1334         if (dentry_inode) {
1335                 d_drop(dentry_inode);
1336                 dput(dentry_inode);
1337         }
1338
1339         return ret;
1340 }
1341
1342 /* Link replay function */
1343 static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl)
1344 {
1345         struct inode *inode;
1346         struct dentry_info_args darg;
1347         int ret = 0;
1348
1349         tl_to_darg(&darg, tl);
1350         trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1351                         darg.parent_ino, darg.dname_len);
1352
1353         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1354         if (IS_ERR_OR_NULL(inode)) {
1355                 jbd_debug(1, "Inode not found.");
1356                 return 0;
1357         }
1358
1359         ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1360         iput(inode);
1361         return ret;
1362 }
1363
1364 /*
1365  * Record all the modified inodes during replay. We use this later to setup
1366  * block bitmaps correctly.
1367  */
1368 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1369 {
1370         struct ext4_fc_replay_state *state;
1371         int i;
1372
1373         state = &EXT4_SB(sb)->s_fc_replay_state;
1374         for (i = 0; i < state->fc_modified_inodes_used; i++)
1375                 if (state->fc_modified_inodes[i] == ino)
1376                         return 0;
1377         if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1378                 state->fc_modified_inodes_size +=
1379                         EXT4_FC_REPLAY_REALLOC_INCREMENT;
1380                 state->fc_modified_inodes = krealloc(
1381                                         state->fc_modified_inodes, sizeof(int) *
1382                                         state->fc_modified_inodes_size,
1383                                         GFP_KERNEL);
1384                 if (!state->fc_modified_inodes)
1385                         return -ENOMEM;
1386         }
1387         state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1388         return 0;
1389 }
1390
1391 /*
1392  * Inode replay function
1393  */
1394 static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl)
1395 {
1396         struct ext4_fc_inode *fc_inode;
1397         struct ext4_inode *raw_inode;
1398         struct ext4_inode *raw_fc_inode;
1399         struct inode *inode = NULL;
1400         struct ext4_iloc iloc;
1401         int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1402         struct ext4_extent_header *eh;
1403
1404         fc_inode = (struct ext4_fc_inode *)ext4_fc_tag_val(tl);
1405
1406         ino = le32_to_cpu(fc_inode->fc_ino);
1407         trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1408
1409         inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1410         if (!IS_ERR_OR_NULL(inode)) {
1411                 ext4_ext_clear_bb(inode);
1412                 iput(inode);
1413         }
1414
1415         ext4_fc_record_modified_inode(sb, ino);
1416
1417         raw_fc_inode = (struct ext4_inode *)fc_inode->fc_raw_inode;
1418         ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1419         if (ret)
1420                 goto out;
1421
1422         inode_len = ext4_fc_tag_len(tl) - sizeof(struct ext4_fc_inode);
1423         raw_inode = ext4_raw_inode(&iloc);
1424
1425         memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1426         memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1427                 inode_len - offsetof(struct ext4_inode, i_generation));
1428         if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1429                 eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1430                 if (eh->eh_magic != EXT4_EXT_MAGIC) {
1431                         memset(eh, 0, sizeof(*eh));
1432                         eh->eh_magic = EXT4_EXT_MAGIC;
1433                         eh->eh_max = cpu_to_le16(
1434                                 (sizeof(raw_inode->i_block) -
1435                                  sizeof(struct ext4_extent_header))
1436                                  / sizeof(struct ext4_extent));
1437                 }
1438         } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1439                 memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1440                         sizeof(raw_inode->i_block));
1441         }
1442
1443         /* Immediately update the inode on disk. */
1444         ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1445         if (ret)
1446                 goto out;
1447         ret = sync_dirty_buffer(iloc.bh);
1448         if (ret)
1449                 goto out;
1450         ret = ext4_mark_inode_used(sb, ino);
1451         if (ret)
1452                 goto out;
1453
1454         /* Given that we just wrote the inode on disk, this SHOULD succeed. */
1455         inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1456         if (IS_ERR_OR_NULL(inode)) {
1457                 jbd_debug(1, "Inode not found.");
1458                 return -EFSCORRUPTED;
1459         }
1460
1461         /*
1462          * Our allocator could have made different decisions than before
1463          * crashing. This should be fixed but until then, we calculate
1464          * the number of blocks the inode.
1465          */
1466         ext4_ext_replay_set_iblocks(inode);
1467
1468         inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1469         ext4_reset_inode_seed(inode);
1470
1471         ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1472         ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1473         sync_dirty_buffer(iloc.bh);
1474         brelse(iloc.bh);
1475 out:
1476         iput(inode);
1477         if (!ret)
1478                 blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
1479
1480         return 0;
1481 }
1482
1483 /*
1484  * Dentry create replay function.
1485  *
1486  * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
1487  * inode for which we are trying to create a dentry here, should already have
1488  * been replayed before we start here.
1489  */
1490 static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl)
1491 {
1492         int ret = 0;
1493         struct inode *inode = NULL;
1494         struct inode *dir = NULL;
1495         struct dentry_info_args darg;
1496
1497         tl_to_darg(&darg, tl);
1498
1499         trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1500                         darg.parent_ino, darg.dname_len);
1501
1502         /* This takes care of update group descriptor and other metadata */
1503         ret = ext4_mark_inode_used(sb, darg.ino);
1504         if (ret)
1505                 goto out;
1506
1507         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1508         if (IS_ERR_OR_NULL(inode)) {
1509                 jbd_debug(1, "inode %d not found.", darg.ino);
1510                 inode = NULL;
1511                 ret = -EINVAL;
1512                 goto out;
1513         }
1514
1515         if (S_ISDIR(inode->i_mode)) {
1516                 /*
1517                  * If we are creating a directory, we need to make sure that the
1518                  * dot and dot dot dirents are setup properly.
1519                  */
1520                 dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1521                 if (IS_ERR_OR_NULL(dir)) {
1522                         jbd_debug(1, "Dir %d not found.", darg.ino);
1523                         goto out;
1524                 }
1525                 ret = ext4_init_new_dir(NULL, dir, inode);
1526                 iput(dir);
1527                 if (ret) {
1528                         ret = 0;
1529                         goto out;
1530                 }
1531         }
1532         ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1533         if (ret)
1534                 goto out;
1535         set_nlink(inode, 1);
1536         ext4_mark_inode_dirty(NULL, inode);
1537 out:
1538         if (inode)
1539                 iput(inode);
1540         return ret;
1541 }
1542
1543 /*
1544  * Record physical disk regions which are in use as per fast commit area. Our
1545  * simple replay phase allocator excludes these regions from allocation.
1546  */
1547 static int ext4_fc_record_regions(struct super_block *sb, int ino,
1548                 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
1549 {
1550         struct ext4_fc_replay_state *state;
1551         struct ext4_fc_alloc_region *region;
1552
1553         state = &EXT4_SB(sb)->s_fc_replay_state;
1554         if (state->fc_regions_used == state->fc_regions_size) {
1555                 state->fc_regions_size +=
1556                         EXT4_FC_REPLAY_REALLOC_INCREMENT;
1557                 state->fc_regions = krealloc(
1558                                         state->fc_regions,
1559                                         state->fc_regions_size *
1560                                         sizeof(struct ext4_fc_alloc_region),
1561                                         GFP_KERNEL);
1562                 if (!state->fc_regions)
1563                         return -ENOMEM;
1564         }
1565         region = &state->fc_regions[state->fc_regions_used++];
1566         region->ino = ino;
1567         region->lblk = lblk;
1568         region->pblk = pblk;
1569         region->len = len;
1570
1571         return 0;
1572 }
1573
1574 /* Replay add range tag */
1575 static int ext4_fc_replay_add_range(struct super_block *sb,
1576                                 struct ext4_fc_tl *tl)
1577 {
1578         struct ext4_fc_add_range *fc_add_ex;
1579         struct ext4_extent newex, *ex;
1580         struct inode *inode;
1581         ext4_lblk_t start, cur;
1582         int remaining, len;
1583         ext4_fsblk_t start_pblk;
1584         struct ext4_map_blocks map;
1585         struct ext4_ext_path *path = NULL;
1586         int ret;
1587
1588         fc_add_ex = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
1589         ex = (struct ext4_extent *)&fc_add_ex->fc_ex;
1590
1591         trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1592                 le32_to_cpu(fc_add_ex->fc_ino), le32_to_cpu(ex->ee_block),
1593                 ext4_ext_get_actual_len(ex));
1594
1595         inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino),
1596                                 EXT4_IGET_NORMAL);
1597         if (IS_ERR_OR_NULL(inode)) {
1598                 jbd_debug(1, "Inode not found.");
1599                 return 0;
1600         }
1601
1602         ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1603
1604         start = le32_to_cpu(ex->ee_block);
1605         start_pblk = ext4_ext_pblock(ex);
1606         len = ext4_ext_get_actual_len(ex);
1607
1608         cur = start;
1609         remaining = len;
1610         jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1611                   start, start_pblk, len, ext4_ext_is_unwritten(ex),
1612                   inode->i_ino);
1613
1614         while (remaining > 0) {
1615                 map.m_lblk = cur;
1616                 map.m_len = remaining;
1617                 map.m_pblk = 0;
1618                 ret = ext4_map_blocks(NULL, inode, &map, 0);
1619
1620                 if (ret < 0) {
1621                         iput(inode);
1622                         return 0;
1623                 }
1624
1625                 if (ret == 0) {
1626                         /* Range is not mapped */
1627                         path = ext4_find_extent(inode, cur, NULL, 0);
1628                         if (IS_ERR(path)) {
1629                                 iput(inode);
1630                                 return 0;
1631                         }
1632                         memset(&newex, 0, sizeof(newex));
1633                         newex.ee_block = cpu_to_le32(cur);
1634                         ext4_ext_store_pblock(
1635                                 &newex, start_pblk + cur - start);
1636                         newex.ee_len = cpu_to_le16(map.m_len);
1637                         if (ext4_ext_is_unwritten(ex))
1638                                 ext4_ext_mark_unwritten(&newex);
1639                         down_write(&EXT4_I(inode)->i_data_sem);
1640                         ret = ext4_ext_insert_extent(
1641                                 NULL, inode, &path, &newex, 0);
1642                         up_write((&EXT4_I(inode)->i_data_sem));
1643                         ext4_ext_drop_refs(path);
1644                         kfree(path);
1645                         if (ret) {
1646                                 iput(inode);
1647                                 return 0;
1648                         }
1649                         goto next;
1650                 }
1651
1652                 if (start_pblk + cur - start != map.m_pblk) {
1653                         /*
1654                          * Logical to physical mapping changed. This can happen
1655                          * if this range was removed and then reallocated to
1656                          * map to new physical blocks during a fast commit.
1657                          */
1658                         ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1659                                         ext4_ext_is_unwritten(ex),
1660                                         start_pblk + cur - start);
1661                         if (ret) {
1662                                 iput(inode);
1663                                 return 0;
1664                         }
1665                         /*
1666                          * Mark the old blocks as free since they aren't used
1667                          * anymore. We maintain an array of all the modified
1668                          * inodes. In case these blocks are still used at either
1669                          * a different logical range in the same inode or in
1670                          * some different inode, we will mark them as allocated
1671                          * at the end of the FC replay using our array of
1672                          * modified inodes.
1673                          */
1674                         ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1675                         goto next;
1676                 }
1677
1678                 /* Range is mapped and needs a state change */
1679                 jbd_debug(1, "Converting from %d to %d %lld",
1680                                 map.m_flags & EXT4_MAP_UNWRITTEN,
1681                         ext4_ext_is_unwritten(ex), map.m_pblk);
1682                 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1683                                         ext4_ext_is_unwritten(ex), map.m_pblk);
1684                 if (ret) {
1685                         iput(inode);
1686                         return 0;
1687                 }
1688                 /*
1689                  * We may have split the extent tree while toggling the state.
1690                  * Try to shrink the extent tree now.
1691                  */
1692                 ext4_ext_replay_shrink_inode(inode, start + len);
1693 next:
1694                 cur += map.m_len;
1695                 remaining -= map.m_len;
1696         }
1697         ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1698                                         sb->s_blocksize_bits);
1699         iput(inode);
1700         return 0;
1701 }
1702
1703 /* Replay DEL_RANGE tag */
1704 static int
1705 ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl)
1706 {
1707         struct inode *inode;
1708         struct ext4_fc_del_range *lrange;
1709         struct ext4_map_blocks map;
1710         ext4_lblk_t cur, remaining;
1711         int ret;
1712
1713         lrange = (struct ext4_fc_del_range *)ext4_fc_tag_val(tl);
1714         cur = le32_to_cpu(lrange->fc_lblk);
1715         remaining = le32_to_cpu(lrange->fc_len);
1716
1717         trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1718                 le32_to_cpu(lrange->fc_ino), cur, remaining);
1719
1720         inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL);
1721         if (IS_ERR_OR_NULL(inode)) {
1722                 jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino));
1723                 return 0;
1724         }
1725
1726         ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1727
1728         jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1729                         inode->i_ino, le32_to_cpu(lrange->fc_lblk),
1730                         le32_to_cpu(lrange->fc_len));
1731         while (remaining > 0) {
1732                 map.m_lblk = cur;
1733                 map.m_len = remaining;
1734
1735                 ret = ext4_map_blocks(NULL, inode, &map, 0);
1736                 if (ret < 0) {
1737                         iput(inode);
1738                         return 0;
1739                 }
1740                 if (ret > 0) {
1741                         remaining -= ret;
1742                         cur += ret;
1743                         ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1744                 } else {
1745                         remaining -= map.m_len;
1746                         cur += map.m_len;
1747                 }
1748         }
1749
1750         ret = ext4_punch_hole(inode,
1751                 le32_to_cpu(lrange->fc_lblk) << sb->s_blocksize_bits,
1752                 le32_to_cpu(lrange->fc_len) <<  sb->s_blocksize_bits);
1753         if (ret)
1754                 jbd_debug(1, "ext4_punch_hole returned %d", ret);
1755         ext4_ext_replay_shrink_inode(inode,
1756                 i_size_read(inode) >> sb->s_blocksize_bits);
1757         ext4_mark_inode_dirty(NULL, inode);
1758         iput(inode);
1759
1760         return 0;
1761 }
1762
1763 static inline const char *tag2str(u16 tag)
1764 {
1765         switch (tag) {
1766         case EXT4_FC_TAG_LINK:
1767                 return "TAG_ADD_ENTRY";
1768         case EXT4_FC_TAG_UNLINK:
1769                 return "TAG_DEL_ENTRY";
1770         case EXT4_FC_TAG_ADD_RANGE:
1771                 return "TAG_ADD_RANGE";
1772         case EXT4_FC_TAG_CREAT:
1773                 return "TAG_CREAT_DENTRY";
1774         case EXT4_FC_TAG_DEL_RANGE:
1775                 return "TAG_DEL_RANGE";
1776         case EXT4_FC_TAG_INODE:
1777                 return "TAG_INODE";
1778         case EXT4_FC_TAG_PAD:
1779                 return "TAG_PAD";
1780         case EXT4_FC_TAG_TAIL:
1781                 return "TAG_TAIL";
1782         case EXT4_FC_TAG_HEAD:
1783                 return "TAG_HEAD";
1784         default:
1785                 return "TAG_ERROR";
1786         }
1787 }
1788
1789 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1790 {
1791         struct ext4_fc_replay_state *state;
1792         struct inode *inode;
1793         struct ext4_ext_path *path = NULL;
1794         struct ext4_map_blocks map;
1795         int i, ret, j;
1796         ext4_lblk_t cur, end;
1797
1798         state = &EXT4_SB(sb)->s_fc_replay_state;
1799         for (i = 0; i < state->fc_modified_inodes_used; i++) {
1800                 inode = ext4_iget(sb, state->fc_modified_inodes[i],
1801                         EXT4_IGET_NORMAL);
1802                 if (IS_ERR_OR_NULL(inode)) {
1803                         jbd_debug(1, "Inode %d not found.",
1804                                 state->fc_modified_inodes[i]);
1805                         continue;
1806                 }
1807                 cur = 0;
1808                 end = EXT_MAX_BLOCKS;
1809                 while (cur < end) {
1810                         map.m_lblk = cur;
1811                         map.m_len = end - cur;
1812
1813                         ret = ext4_map_blocks(NULL, inode, &map, 0);
1814                         if (ret < 0)
1815                                 break;
1816
1817                         if (ret > 0) {
1818                                 path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1819                                 if (!IS_ERR_OR_NULL(path)) {
1820                                         for (j = 0; j < path->p_depth; j++)
1821                                                 ext4_mb_mark_bb(inode->i_sb,
1822                                                         path[j].p_block, 1, 1);
1823                                         ext4_ext_drop_refs(path);
1824                                         kfree(path);
1825                                 }
1826                                 cur += ret;
1827                                 ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1828                                                         map.m_len, 1);
1829                         } else {
1830                                 cur = cur + (map.m_len ? map.m_len : 1);
1831                         }
1832                 }
1833                 iput(inode);
1834         }
1835 }
1836
1837 /*
1838  * Check if block is in excluded regions for block allocation. The simple
1839  * allocator that runs during replay phase is calls this function to see
1840  * if it is okay to use a block.
1841  */
1842 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1843 {
1844         int i;
1845         struct ext4_fc_replay_state *state;
1846
1847         state = &EXT4_SB(sb)->s_fc_replay_state;
1848         for (i = 0; i < state->fc_regions_valid; i++) {
1849                 if (state->fc_regions[i].ino == 0 ||
1850                         state->fc_regions[i].len == 0)
1851                         continue;
1852                 if (blk >= state->fc_regions[i].pblk &&
1853                     blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1854                         return true;
1855         }
1856         return false;
1857 }
1858
1859 /* Cleanup function called after replay */
1860 void ext4_fc_replay_cleanup(struct super_block *sb)
1861 {
1862         struct ext4_sb_info *sbi = EXT4_SB(sb);
1863
1864         sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1865         kfree(sbi->s_fc_replay_state.fc_regions);
1866         kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1867 }
1868
1869 /*
1870  * Recovery Scan phase handler
1871  *
1872  * This function is called during the scan phase and is responsible
1873  * for doing following things:
1874  * - Make sure the fast commit area has valid tags for replay
1875  * - Count number of tags that need to be replayed by the replay handler
1876  * - Verify CRC
1877  * - Create a list of excluded blocks for allocation during replay phase
1878  *
1879  * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1880  * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1881  * to indicate that scan has finished and JBD2 can now start replay phase.
1882  * It returns a negative error to indicate that there was an error. At the end
1883  * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
1884  * to indicate the number of tags that need to replayed during the replay phase.
1885  */
1886 static int ext4_fc_replay_scan(journal_t *journal,
1887                                 struct buffer_head *bh, int off,
1888                                 tid_t expected_tid)
1889 {
1890         struct super_block *sb = journal->j_private;
1891         struct ext4_sb_info *sbi = EXT4_SB(sb);
1892         struct ext4_fc_replay_state *state;
1893         int ret = JBD2_FC_REPLAY_CONTINUE;
1894         struct ext4_fc_add_range *ext;
1895         struct ext4_fc_tl *tl;
1896         struct ext4_fc_tail *tail;
1897         __u8 *start, *end;
1898         struct ext4_fc_head *head;
1899         struct ext4_extent *ex;
1900
1901         state = &sbi->s_fc_replay_state;
1902
1903         start = (u8 *)bh->b_data;
1904         end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
1905
1906         if (state->fc_replay_expected_off == 0) {
1907                 state->fc_cur_tag = 0;
1908                 state->fc_replay_num_tags = 0;
1909                 state->fc_crc = 0;
1910                 state->fc_regions = NULL;
1911                 state->fc_regions_valid = state->fc_regions_used =
1912                         state->fc_regions_size = 0;
1913                 /* Check if we can stop early */
1914                 if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
1915                         != EXT4_FC_TAG_HEAD)
1916                         return 0;
1917         }
1918
1919         if (off != state->fc_replay_expected_off) {
1920                 ret = -EFSCORRUPTED;
1921                 goto out_err;
1922         }
1923
1924         state->fc_replay_expected_off++;
1925         fc_for_each_tl(start, end, tl) {
1926                 jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
1927                           tag2str(le16_to_cpu(tl->fc_tag)), bh->b_blocknr);
1928                 switch (le16_to_cpu(tl->fc_tag)) {
1929                 case EXT4_FC_TAG_ADD_RANGE:
1930                         ext = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
1931                         ex = (struct ext4_extent *)&ext->fc_ex;
1932                         ret = ext4_fc_record_regions(sb,
1933                                 le32_to_cpu(ext->fc_ino),
1934                                 le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
1935                                 ext4_ext_get_actual_len(ex));
1936                         if (ret < 0)
1937                                 break;
1938                         ret = JBD2_FC_REPLAY_CONTINUE;
1939                         fallthrough;
1940                 case EXT4_FC_TAG_DEL_RANGE:
1941                 case EXT4_FC_TAG_LINK:
1942                 case EXT4_FC_TAG_UNLINK:
1943                 case EXT4_FC_TAG_CREAT:
1944                 case EXT4_FC_TAG_INODE:
1945                 case EXT4_FC_TAG_PAD:
1946                         state->fc_cur_tag++;
1947                         state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
1948                                         sizeof(*tl) + ext4_fc_tag_len(tl));
1949                         break;
1950                 case EXT4_FC_TAG_TAIL:
1951                         state->fc_cur_tag++;
1952                         tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
1953                         state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
1954                                                 sizeof(*tl) +
1955                                                 offsetof(struct ext4_fc_tail,
1956                                                 fc_crc));
1957                         if (le32_to_cpu(tail->fc_tid) == expected_tid &&
1958                                 le32_to_cpu(tail->fc_crc) == state->fc_crc) {
1959                                 state->fc_replay_num_tags = state->fc_cur_tag;
1960                                 state->fc_regions_valid =
1961                                         state->fc_regions_used;
1962                         } else {
1963                                 ret = state->fc_replay_num_tags ?
1964                                         JBD2_FC_REPLAY_STOP : -EFSBADCRC;
1965                         }
1966                         state->fc_crc = 0;
1967                         break;
1968                 case EXT4_FC_TAG_HEAD:
1969                         head = (struct ext4_fc_head *)ext4_fc_tag_val(tl);
1970                         if (le32_to_cpu(head->fc_features) &
1971                                 ~EXT4_FC_SUPPORTED_FEATURES) {
1972                                 ret = -EOPNOTSUPP;
1973                                 break;
1974                         }
1975                         if (le32_to_cpu(head->fc_tid) != expected_tid) {
1976                                 ret = JBD2_FC_REPLAY_STOP;
1977                                 break;
1978                         }
1979                         state->fc_cur_tag++;
1980                         state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
1981                                         sizeof(*tl) + ext4_fc_tag_len(tl));
1982                         break;
1983                 default:
1984                         ret = state->fc_replay_num_tags ?
1985                                 JBD2_FC_REPLAY_STOP : -ECANCELED;
1986                 }
1987                 if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
1988                         break;
1989         }
1990
1991 out_err:
1992         trace_ext4_fc_replay_scan(sb, ret, off);
1993         return ret;
1994 }
1995
1996 /*
1997  * Main recovery path entry point.
1998  * The meaning of return codes is similar as above.
1999  */
2000 static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2001                                 enum passtype pass, int off, tid_t expected_tid)
2002 {
2003         struct super_block *sb = journal->j_private;
2004         struct ext4_sb_info *sbi = EXT4_SB(sb);
2005         struct ext4_fc_tl *tl;
2006         __u8 *start, *end;
2007         int ret = JBD2_FC_REPLAY_CONTINUE;
2008         struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2009         struct ext4_fc_tail *tail;
2010
2011         if (pass == PASS_SCAN) {
2012                 state->fc_current_pass = PASS_SCAN;
2013                 return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2014         }
2015
2016         if (state->fc_current_pass != pass) {
2017                 state->fc_current_pass = pass;
2018                 sbi->s_mount_state |= EXT4_FC_REPLAY;
2019         }
2020         if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2021                 jbd_debug(1, "Replay stops\n");
2022                 ext4_fc_set_bitmaps_and_counters(sb);
2023                 return 0;
2024         }
2025
2026 #ifdef CONFIG_EXT4_DEBUG
2027         if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2028                 pr_warn("Dropping fc block %d because max_replay set\n", off);
2029                 return JBD2_FC_REPLAY_STOP;
2030         }
2031 #endif
2032
2033         start = (u8 *)bh->b_data;
2034         end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2035
2036         fc_for_each_tl(start, end, tl) {
2037                 if (state->fc_replay_num_tags == 0) {
2038                         ret = JBD2_FC_REPLAY_STOP;
2039                         ext4_fc_set_bitmaps_and_counters(sb);
2040                         break;
2041                 }
2042                 jbd_debug(3, "Replay phase, tag:%s\n",
2043                                 tag2str(le16_to_cpu(tl->fc_tag)));
2044                 state->fc_replay_num_tags--;
2045                 switch (le16_to_cpu(tl->fc_tag)) {
2046                 case EXT4_FC_TAG_LINK:
2047                         ret = ext4_fc_replay_link(sb, tl);
2048                         break;
2049                 case EXT4_FC_TAG_UNLINK:
2050                         ret = ext4_fc_replay_unlink(sb, tl);
2051                         break;
2052                 case EXT4_FC_TAG_ADD_RANGE:
2053                         ret = ext4_fc_replay_add_range(sb, tl);
2054                         break;
2055                 case EXT4_FC_TAG_CREAT:
2056                         ret = ext4_fc_replay_create(sb, tl);
2057                         break;
2058                 case EXT4_FC_TAG_DEL_RANGE:
2059                         ret = ext4_fc_replay_del_range(sb, tl);
2060                         break;
2061                 case EXT4_FC_TAG_INODE:
2062                         ret = ext4_fc_replay_inode(sb, tl);
2063                         break;
2064                 case EXT4_FC_TAG_PAD:
2065                         trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2066                                 ext4_fc_tag_len(tl), 0);
2067                         break;
2068                 case EXT4_FC_TAG_TAIL:
2069                         trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2070                                 ext4_fc_tag_len(tl), 0);
2071                         tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
2072                         WARN_ON(le32_to_cpu(tail->fc_tid) != expected_tid);
2073                         break;
2074                 case EXT4_FC_TAG_HEAD:
2075                         break;
2076                 default:
2077                         trace_ext4_fc_replay(sb, le16_to_cpu(tl->fc_tag), 0,
2078                                 ext4_fc_tag_len(tl), 0);
2079                         ret = -ECANCELED;
2080                         break;
2081                 }
2082                 if (ret < 0)
2083                         break;
2084                 ret = JBD2_FC_REPLAY_CONTINUE;
2085         }
2086         return ret;
2087 }
2088
2089 void ext4_fc_init(struct super_block *sb, journal_t *journal)
2090 {
2091         /*
2092          * We set replay callback even if fast commit disabled because we may
2093          * could still have fast commit blocks that need to be replayed even if
2094          * fast commit has now been turned off.
2095          */
2096         journal->j_fc_replay_callback = ext4_fc_replay;
2097         if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2098                 return;
2099         journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2100 }
2101
/*
 * Human readable labels for the reasons a fast commit was ineligible.
 * ext4_fc_info_show() prints entry i next to
 * stats->fc_ineligible_reason_count[i] for i below EXT4_FC_REASON_MAX, so
 * the order and number of entries here must match those indices.
 * NOTE(review): presumably the indices are the EXT4_FC_REASON_* values
 * declared in the ext4 headers - confirm when adding a new reason.
 */
const char *fc_ineligible_reasons[] = {
        "Extended attributes changed",
        "Cross rename",
        "Journal flag changed",
        "Insufficient memory",
        "Swap boot",
        "Resize",
        "Dir renamed",
        "Falloc range op",
        "Data journalling",
        "FC Commit Failed"
};
2114
2115 int ext4_fc_info_show(struct seq_file *seq, void *v)
2116 {
2117         struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2118         struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2119         int i;
2120
2121         if (v != SEQ_START_TOKEN)
2122                 return 0;
2123
2124         seq_printf(seq,
2125                 "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2126                    stats->fc_num_commits, stats->fc_ineligible_commits,
2127                    stats->fc_numblks,
2128                    div_u64(sbi->s_fc_avg_commit_time, 1000));
2129         seq_puts(seq, "Ineligible reasons:\n");
2130         for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2131                 seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2132                         stats->fc_ineligible_reason_count[i]);
2133
2134         return 0;
2135 }
2136
2137 int __init ext4_fc_init_dentry_cache(void)
2138 {
2139         ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2140                                            SLAB_RECLAIM_ACCOUNT);
2141
2142         if (ext4_fc_dentry_cachep == NULL)
2143                 return -ENOMEM;
2144
2145         return 0;
2146 }