Merge tag 'ovl-update-5.11' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/vfs
[linux-2.6-microblaze.git] / fs / ext4 / fast_commit.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 /*
4  * fs/ext4/fast_commit.c
5  *
6  * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
7  *
8  * Ext4 fast commits routines.
9  */
10 #include "ext4.h"
11 #include "ext4_jbd2.h"
12 #include "ext4_extents.h"
13 #include "mballoc.h"
14
15 /*
16  * Ext4 Fast Commits
17  * -----------------
18  *
19  * Ext4 fast commits implement fine grained journalling for Ext4.
20  *
21  * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
22  * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
23  * TLV during the recovery phase. For the scenarios for which we currently
24  * don't have replay code, fast commit falls back to full commits.
25  * Fast commits record delta in one of the following three categories.
26  *
27  * (A) Directory entry updates:
28  *
29  * - EXT4_FC_TAG_UNLINK         - records directory entry unlink
30  * - EXT4_FC_TAG_LINK           - records directory entry link
31  * - EXT4_FC_TAG_CREAT          - records inode and directory entry creation
32  *
33  * (B) File specific data range updates:
34  *
35  * - EXT4_FC_TAG_ADD_RANGE      - records addition of new blocks to an inode
36  * - EXT4_FC_TAG_DEL_RANGE      - records deletion of blocks from an inode
37  *
38  * (C) Inode metadata (mtime / ctime etc):
39  *
40  * - EXT4_FC_TAG_INODE          - record the inode that should be replayed
41  *                                during recovery. Note that iblocks field is
42  *                                not replayed and instead derived during
43  *                                replay.
44  * Commit Operation
45  * ----------------
46  * With fast commits, we maintain all the directory entry operations in the
47  * order in which they are issued in an in-memory queue. This queue is flushed
48  * to disk during the commit operation. We also maintain a list of inodes
49  * that need to be committed during a fast commit in another in memory queue of
50  * inodes. During the commit operation, we commit in the following order:
51  *
52  * [1] Lock inodes for any further data updates by setting COMMITTING state
53  * [2] Submit data buffers of all the inodes
54  * [3] Wait for [2] to complete
55  * [4] Commit all the directory entry updates in the fast commit space
56  * [5] Commit all the changed inode structures
57  * [6] Write tail tag (this tag ensures the atomicity, please read the following
58  *     section for more details).
59  * [7] Wait for [4], [5] and [6] to complete.
60  *
61  * All the inode updates must call ext4_fc_start_update() before starting an
62  * update. If such an ongoing update is present, fast commit waits for it to
63  * complete. The completion of such an update is marked by
64  * ext4_fc_stop_update().
65  *
66  * Fast Commit Ineligibility
67  * -------------------------
68  * Not all operations are supported by fast commits today (e.g extended
69  * attributes). Fast commit ineligibility is marked by calling one of the
70  * two following functions:
71  *
72  * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall
73  *   back to full commit. This is useful in case of transient errors.
74  *
75  * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
76  *   the fast commits happening between ext4_fc_start_ineligible() and
77  *   ext4_fc_stop_ineligible() and one fast commit after the call to
78  *   ext4_fc_stop_ineligible() to fall back to full commits. It is important to
79  *   make one more fast commit to fall back to full commit after stop call so
80  *   that it is guaranteed that the fast commit ineligible operation contained
81  *   within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is
82  *   followed by at least 1 full commit.
83  *
84  * Atomicity of commits
85  * --------------------
86  * In order to guarantee atomicity during the commit operation, fast commit
87  * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
88  * tag contains CRC of the contents and TID of the transaction after which
89  * this fast commit should be applied. Recovery code replays fast commit
90  * logs only if there's at least 1 valid tail present. For every fast commit
91  * operation, there is 1 tail. This means, we may end up with multiple tails
92  * in the fast commit space. Here's an example:
93  *
94  * - Create a new file A and remove existing file B
95  * - fsync()
96  * - Append contents to file A
97  * - Truncate file A
98  * - fsync()
99  *
100  * The fast commit space at the end of above operations would look like this:
101  *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
102  *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
103  *
104  * Replay code should thus check for all the valid tails in the FC area.
105  *
106  * TODOs
107  * -----
108  * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
109  *    eligible update must be protected within ext4_fc_start_update() and
110  *    ext4_fc_stop_update(). These routines are called at a much higher
111  *    level than the actual updates. This can be made more fine grained by
112  *    combining with ext4_journal_start().
113  *
114  * 2) Same above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
115  *
116  * 3) Handle more ineligible cases.
117  */
118
119 #include <trace/events/ext4.h>
120 static struct kmem_cache *ext4_fc_dentry_cachep;
121
122 static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
123 {
124         BUFFER_TRACE(bh, "");
125         if (uptodate) {
126                 ext4_debug("%s: Block %lld up-to-date",
127                            __func__, bh->b_blocknr);
128                 set_buffer_uptodate(bh);
129         } else {
130                 ext4_debug("%s: Block %lld not up-to-date",
131                            __func__, bh->b_blocknr);
132                 clear_buffer_uptodate(bh);
133         }
134
135         unlock_buffer(bh);
136 }
137
138 static inline void ext4_fc_reset_inode(struct inode *inode)
139 {
140         struct ext4_inode_info *ei = EXT4_I(inode);
141
142         ei->i_fc_lblk_start = 0;
143         ei->i_fc_lblk_len = 0;
144 }
145
146 void ext4_fc_init_inode(struct inode *inode)
147 {
148         struct ext4_inode_info *ei = EXT4_I(inode);
149
150         ext4_fc_reset_inode(inode);
151         ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
152         INIT_LIST_HEAD(&ei->i_fc_list);
153         init_waitqueue_head(&ei->i_fc_wait);
154         atomic_set(&ei->i_fc_updates, 0);
155 }
156
157 /*
 * Sleep until the committing thread clears EXT4_STATE_FC_COMMITTING on
 * @inode.  Must be called with sbi->s_fc_lock held; the lock is dropped
 * before sleeping and is NOT re-taken on return (see __releases below),
 * so callers restart their locked section afterwards.
 */
158 static void ext4_fc_wait_committing_inode(struct inode *inode)
159 __releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
160 {
161         wait_queue_head_t *wq;
162         struct ext4_inode_info *ei = EXT4_I(inode);
163
        /*
         * The FC_COMMITTING bit lives in i_state_flags on 32-bit builds
         * and is folded into i_flags on 64-bit builds, so the wait-bit
         * setup must follow the same split.
         */
164 #if (BITS_PER_LONG < 64)
165         DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
166                         EXT4_STATE_FC_COMMITTING);
167         wq = bit_waitqueue(&ei->i_state_flags,
168                                 EXT4_STATE_FC_COMMITTING);
169 #else
170         DEFINE_WAIT_BIT(wait, &ei->i_flags,
171                         EXT4_STATE_FC_COMMITTING);
172         wq = bit_waitqueue(&ei->i_flags,
173                                 EXT4_STATE_FC_COMMITTING);
174 #endif
175         lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
        /*
         * Queue ourselves on the waitqueue before dropping s_fc_lock so a
         * concurrent wake-up cannot be missed between unlock and schedule.
         */
176         prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
177         spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
178         schedule();
179         finish_wait(wq, &wait.wq_entry);
180 }
181
182 /*
183  * Inform ext4's fast-commit machinery about the start of an inode update.
184  *
185  * This function is called by the high level call VFS callbacks before
186  * performing any inode update. This function blocks if there's an ongoing
187  * fast commit on the inode in question.
188  */
189 void ext4_fc_start_update(struct inode *inode)
190 {
191         struct ext4_inode_info *ei = EXT4_I(inode);
192
        /* No-op when fast commits are disabled or we are replaying. */
193         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
194             (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
195                 return;
196
197 restart:
198         spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
199         if (list_empty(&ei->i_fc_list))
200                 goto out;
201
202         if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
                /* Drops s_fc_lock while sleeping, hence the restart. */
203                 ext4_fc_wait_committing_inode(inode);
204                 goto restart;
205         }
206 out:
        /*
         * Count this update while still holding s_fc_lock so a racing fast
         * commit either sees the counter or we saw its COMMITTING state.
         */
207         atomic_inc(&ei->i_fc_updates);
208         spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
209 }
210
211 /*
212  * Stop inode update and wake up waiting fast commits if any.
213  */
214 void ext4_fc_stop_update(struct inode *inode)
215 {
216         struct ext4_inode_info *ei = EXT4_I(inode);
217
218         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
219             (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
220                 return;
221
222         if (atomic_dec_and_test(&ei->i_fc_updates))
223                 wake_up_all(&ei->i_fc_wait);
224 }
225
226 /*
227  * Remove inode from fast commit list. If the inode is being committed
228  * we wait until inode commit is done.
229  */
230 void ext4_fc_del(struct inode *inode)
231 {
232         struct ext4_inode_info *ei = EXT4_I(inode);
233
        /* Nothing to do when fast commits are off or during replay. */
234         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
235             (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
236                 return;
237
238 restart:
239         spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
240         if (list_empty(&ei->i_fc_list)) {
241                 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
242                 return;
243         }
244
245         if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
                /* Drops s_fc_lock while sleeping, hence the restart. */
246                 ext4_fc_wait_committing_inode(inode);
247                 goto restart;
248         }
        /* Safe to unlink now: no fast commit is using this inode. */
249         list_del_init(&ei->i_fc_list);
250         spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
251 }
252
253 /*
254  * Mark file system as fast commit ineligible. This means that next commit
255  * operation would result in a full jbd2 commit.
256  */
257 void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
258 {
259         struct ext4_sb_info *sbi = EXT4_SB(sb);
260
261         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
262             (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
263                 return;
264
265         ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
266         WARN_ON(reason >= EXT4_FC_REASON_MAX);
267         sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
268 }
269
270 /*
271  * Start a fast commit ineligible update. Any commits that happen while
272  * such an operation is in progress fall back to full commits.
273  */
274 void ext4_fc_start_ineligible(struct super_block *sb, int reason)
275 {
276         struct ext4_sb_info *sbi = EXT4_SB(sb);
277
278         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
279             (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
280                 return;
281
282         WARN_ON(reason >= EXT4_FC_REASON_MAX);
283         sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
284         atomic_inc(&sbi->s_fc_ineligible_updates);
285 }
286
287 /*
288  * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
289  * to ensure that after stopping the ineligible update, at least one full
290  * commit takes place.
291  */
292 void ext4_fc_stop_ineligible(struct super_block *sb)
293 {
294         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
295             (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
296                 return;
297
298         ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
299         atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
300 }
301
302 static inline int ext4_fc_is_ineligible(struct super_block *sb)
303 {
304         return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
305                 atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
306 }
307
308 /*
309  * Generic fast commit tracking function. If this is the first time we are
310  * called after a full commit, we initialize fast commit fields and then call
311  * __fc_track_fn() with update = 0. If we have already been called after a full
312  * commit, we pass update = 1. Based on that, the track function can determine
313  * if it needs to track a field for the first time or if it needs to just
314  * update the previously tracked value.
315  *
316  * If enqueue is set, this function enqueues the inode in fast commit list.
317  */
318 static int ext4_fc_track_template(
319         handle_t *handle, struct inode *inode,
320         int (*__fc_track_fn)(struct inode *, void *, bool),
321         void *args, int enqueue)
322 {
323         bool update = false;
324         struct ext4_inode_info *ei = EXT4_I(inode);
325         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
326         tid_t tid = 0;
327         int ret;
328
329         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
330             (sbi->s_mount_state & EXT4_FC_REPLAY))
331                 return -EOPNOTSUPP;
332
        /* Ineligible fs: nothing to track, next commit is a full commit. */
333         if (ext4_fc_is_ineligible(inode->i_sb))
334                 return -EINVAL;
335
        /*
         * Same transaction as the last tracked one => this is an update of
         * already-tracked state; a new tid starts tracking from scratch.
         */
336         tid = handle->h_transaction->t_tid;
337         mutex_lock(&ei->i_fc_lock);
338         if (tid == ei->i_sync_tid) {
339                 update = true;
340         } else {
341                 ext4_fc_reset_inode(inode);
342                 ei->i_sync_tid = tid;
343         }
        /* Called with i_fc_lock held; __fc_track_fn may drop and re-take it. */
344         ret = __fc_track_fn(inode, args, update);
345         mutex_unlock(&ei->i_fc_lock);
346
347         if (!enqueue)
348                 return ret;
349
        /*
         * Put the inode on the staging queue if a fast commit is currently
         * running, otherwise on the main queue for the next commit.
         */
350         spin_lock(&sbi->s_fc_lock);
351         if (list_empty(&EXT4_I(inode)->i_fc_list))
352                 list_add_tail(&EXT4_I(inode)->i_fc_list,
353                                 (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
354                                 &sbi->s_fc_q[FC_Q_STAGING] :
355                                 &sbi->s_fc_q[FC_Q_MAIN]);
356         spin_unlock(&sbi->s_fc_lock);
357
358         return ret;
359 }
360
/*
 * Arguments for __track_dentry_update(): the dentry being changed and the
 * EXT4_FC_TAG_* directory operation (UNLINK/LINK/CREAT) being recorded.
 */
361 struct __track_dentry_update_args {
362         struct dentry *dentry;
363         int op;
364 };
365
366 /*
 * __track_fn for directory entry updates. Called with ei->i_fc_lock held;
 * the lock is temporarily dropped around the GFP_NOFS allocations below and
 * is re-acquired before returning, as ext4_fc_track_template() expects.
 * Queues an ext4_fc_dentry_update node describing the operation on the
 * appropriate per-sb dentry queue.  Returns 0 or -ENOMEM (in which case the
 * fs is also marked fast-commit ineligible).
 */
367 static int __track_dentry_update(struct inode *inode, void *arg, bool update)
368 {
369         struct ext4_fc_dentry_update *node;
370         struct ext4_inode_info *ei = EXT4_I(inode);
371         struct __track_dentry_update_args *dentry_update =
372                 (struct __track_dentry_update_args *)arg;
373         struct dentry *dentry = dentry_update->dentry;
374         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
375
        /* Drop the lock so we may block in the allocator. */
376         mutex_unlock(&ei->i_fc_lock);
377         node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
378         if (!node) {
379                 ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
380                 mutex_lock(&ei->i_fc_lock);
381                 return -ENOMEM;
382         }
383
384         node->fcd_op = dentry_update->op;
385         node->fcd_parent = dentry->d_parent->d_inode->i_ino;
386         node->fcd_ino = inode->i_ino;
        /* Short names are stored inline; longer ones need a kmalloc copy. */
387         if (dentry->d_name.len > DNAME_INLINE_LEN) {
388                 node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
389                 if (!node->fcd_name.name) {
390                         kmem_cache_free(ext4_fc_dentry_cachep, node);
391                         ext4_fc_mark_ineligible(inode->i_sb,
392                                 EXT4_FC_REASON_NOMEM);
393                         mutex_lock(&ei->i_fc_lock);
394                         return -ENOMEM;
395                 }
396                 memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
397                         dentry->d_name.len);
398         } else {
399                 memcpy(node->fcd_iname, dentry->d_name.name,
400                         dentry->d_name.len);
401                 node->fcd_name.name = node->fcd_iname;
402         }
403         node->fcd_name.len = dentry->d_name.len;
404
        /* Staging queue while a fast commit runs, main queue otherwise. */
405         spin_lock(&sbi->s_fc_lock);
406         if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
407                 list_add_tail(&node->fcd_list,
408                                 &sbi->s_fc_dentry_q[FC_Q_STAGING]);
409         else
410                 list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
411         spin_unlock(&sbi->s_fc_lock);
        /* Re-take i_fc_lock for the caller. */
412         mutex_lock(&ei->i_fc_lock);
413
414         return 0;
415 }
416
417 void __ext4_fc_track_unlink(handle_t *handle,
418                 struct inode *inode, struct dentry *dentry)
419 {
420         struct __track_dentry_update_args args;
421         int ret;
422
423         args.dentry = dentry;
424         args.op = EXT4_FC_TAG_UNLINK;
425
426         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
427                                         (void *)&args, 0);
428         trace_ext4_fc_track_unlink(inode, dentry, ret);
429 }
430
/* Convenience wrapper: track unlink of @dentry's own inode. */
431 void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
432 {
433         __ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
434 }
435
436 void __ext4_fc_track_link(handle_t *handle,
437         struct inode *inode, struct dentry *dentry)
438 {
439         struct __track_dentry_update_args args;
440         int ret;
441
442         args.dentry = dentry;
443         args.op = EXT4_FC_TAG_LINK;
444
445         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
446                                         (void *)&args, 0);
447         trace_ext4_fc_track_link(inode, dentry, ret);
448 }
449
/* Convenience wrapper: track link of @dentry's own inode. */
450 void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
451 {
452         __ext4_fc_track_link(handle, d_inode(dentry), dentry);
453 }
454
455 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
456 {
457         struct __track_dentry_update_args args;
458         struct inode *inode = d_inode(dentry);
459         int ret;
460
461         args.dentry = dentry;
462         args.op = EXT4_FC_TAG_CREAT;
463
464         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
465                                         (void *)&args, 0);
466         trace_ext4_fc_track_create(inode, dentry, ret);
467 }
468
469 /* __track_fn for inode tracking */
470 static int __track_inode(struct inode *inode, void *arg, bool update)
471 {
472         if (update)
473                 return -EEXIST;
474
475         EXT4_I(inode)->i_fc_lblk_len = 0;
476
477         return 0;
478 }
479
480 void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
481 {
482         int ret;
483
484         if (S_ISDIR(inode->i_mode))
485                 return;
486
487         if (ext4_should_journal_data(inode)) {
488                 ext4_fc_mark_ineligible(inode->i_sb,
489                                         EXT4_FC_REASON_INODE_JOURNAL_DATA);
490                 return;
491         }
492
493         ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
494         trace_ext4_fc_track_inode(inode, ret);
495 }
496
/* Inclusive logical block range [start, end] passed to __track_range(). */
497 struct __track_range_args {
498         ext4_lblk_t start, end;
499 };
500
501 /*
 * __track_fn for tracking data updates.  Grows the inode's tracked
 * [i_fc_lblk_start, i_fc_lblk_start + i_fc_lblk_len - 1] range to cover
 * [__arg->start, __arg->end], or starts a fresh range on the first call.
 * Special (reserved) inodes are refused with -ECANCELED.
 */
502 static int __track_range(struct inode *inode, void *arg, bool update)
503 {
504         struct ext4_inode_info *ei = EXT4_I(inode);
505         ext4_lblk_t oldstart;
506         struct __track_range_args *__arg =
507                 (struct __track_range_args *)arg;
508
509         if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
510                 ext4_debug("Special inode %ld being modified\n", inode->i_ino);
511                 return -ECANCELED;
512         }
513
        /* Save the old start: the min() below may overwrite it before the
         * length computation needs the old range's last block. */
514         oldstart = ei->i_fc_lblk_start;
515
516         if (update && ei->i_fc_lblk_len > 0) {
                /* Merge: new range is the union of old and requested. */
517                 ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
518                 ei->i_fc_lblk_len =
519                         max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
520                                 ei->i_fc_lblk_start + 1;
521         } else {
522                 ei->i_fc_lblk_start = __arg->start;
523                 ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
524         }
525
526         return 0;
527 }
528
529 void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
530                          ext4_lblk_t end)
531 {
532         struct __track_range_args args;
533         int ret;
534
535         if (S_ISDIR(inode->i_mode))
536                 return;
537
538         args.start = start;
539         args.end = end;
540
541         ret = ext4_fc_track_template(handle, inode,  __track_range, &args, 1);
542
543         trace_ext4_fc_track_range(inode, start, end, ret);
544 }
545
/*
 * Submit the current fast-commit buffer (sbi->s_fc_bh) for write and
 * detach it from the sb.  The buffer lock taken here is released by the
 * completion handler, ext4_end_buffer_io_sync().
 */
546 static void ext4_fc_submit_bh(struct super_block *sb)
547 {
548         int write_flags = REQ_SYNC;
549         struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
550
551         /* TODO: REQ_FUA | REQ_PREFLUSH is unnecessarily expensive. */
552         if (test_opt(sb, BARRIER))
553                 write_flags |= REQ_FUA | REQ_PREFLUSH;
554         lock_buffer(bh);
555         set_buffer_dirty(bh);
556         set_buffer_uptodate(bh);
557         bh->b_end_io = ext4_end_buffer_io_sync;
558         submit_bh(REQ_OP_WRITE, write_flags, bh);
        /* The next reservation will request a fresh block from jbd2. */
559         EXT4_SB(sb)->s_fc_bh = NULL;
560 }
561
562 /* Ext4 commit path routines */
563
564 /* memzero and update CRC */
565 static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
566                                 u32 *crc)
567 {
568         void *ret;
569
570         ret = memset(dst, 0, len);
571         if (crc)
572                 *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
573         return ret;
574 }
575
576 /*
577  * Allocate len bytes on a fast commit buffer.
578  *
579  * During the commit time this function is used to manage fast commit
580  * block space. We don't split a fast commit log onto different
581  * blocks. So this function makes sure that if there's not enough space
582  * on the current block, the remaining space in the current block is
583  * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case, a
584  * new block is requested from jbd2 and CRC is updated to reflect the
585  * padding we added.
586  *
 * Returns a pointer to the reserved bytes, or NULL if len can never fit
 * on one block or jbd2 could not provide a buffer.
587  */
588 static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
589 {
590         struct ext4_fc_tl *tl;
591         struct ext4_sb_info *sbi = EXT4_SB(sb);
592         struct buffer_head *bh;
593         int bsize = sbi->s_journal->j_blocksize;
594         int ret, off = sbi->s_fc_bytes % bsize;
595         int pad_len;
596
597         /*
598          * After allocating len, we should have space at least for a 0 byte
599          * padding.
600          */
601         if (len + sizeof(struct ext4_fc_tl) > bsize)
602                 return NULL;
603
604         if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
605                 /*
606                  * Only allocate from current buffer if we have enough space for
607                  * this request AND we have space to add a zero byte padding.
608                  */
609                 if (!sbi->s_fc_bh) {
610                         ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
611                         if (ret)
612                                 return NULL;
613                         sbi->s_fc_bh = bh;
614                 }
615                 sbi->s_fc_bytes += len;
616                 return sbi->s_fc_bh->b_data + off;
617         }
618         /* Need to add PAD tag to fill out the rest of the current block. */
619         tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
620         tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
621         pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
622         tl->fc_len = cpu_to_le16(pad_len);
623         if (crc)
624                 *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
625         if (pad_len > 0)
626                 ext4_fc_memzero(sb, tl + 1, pad_len, crc);
        /* Flush the padded block and start the request on a fresh one. */
627         ext4_fc_submit_bh(sb);
628
629         ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
630         if (ret)
631                 return NULL;
632         sbi->s_fc_bh = bh;
        /* Advance s_fc_bytes to the start of the new block, plus len. */
633         sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
634         return sbi->s_fc_bh->b_data;
635 }
635
636 /* memcpy to fc reserved space and update CRC */
637 static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
638                                 int len, u32 *crc)
639 {
640         if (crc)
641                 *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
642         return memcpy(dst, src, len);
643 }
644
645 /*
646  * Complete a fast commit by writing tail tag.
647  *
648  * Writing tail tag marks the end of a fast commit. In order to guarantee
649  * atomicity, after writing tail tag, even if there's space remaining
650  * in the block, next commit shouldn't use it. That's why tail tag
651  * has the length as that of the remaining space on the block.
652  */
653 static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
654 {
655         struct ext4_sb_info *sbi = EXT4_SB(sb);
656         struct ext4_fc_tl tl;
657         struct ext4_fc_tail tail;
658         int off, bsize = sbi->s_journal->j_blocksize;
659         u8 *dst;
660
661         /*
662          * ext4_fc_reserve_space takes care of allocating an extra block if
663          * there's not enough space on this block for accommodating this tail.
664          */
665         dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
666         if (!dst)
667                 return -ENOSPC;
668
669         off = sbi->s_fc_bytes % bsize;
670
        /* The tail's length claims the whole remainder of the block so the
         * next fast commit starts on a fresh block (see comment above). */
671         tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
672         tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
673         sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
674
675         ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
676         dst += sizeof(tl);
677         tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
678         ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
679         dst += sizeof(tail.fc_tid);
        /* The CRC field itself is excluded from the checksum (crc == NULL). */
680         tail.fc_crc = cpu_to_le32(crc);
681         ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
682
683         ext4_fc_submit_bh(sb);
684
685         return 0;
686 }
687
688 /*
689  * Adds tag, length, value and updates CRC. Returns true if tlv was added.
690  * Returns false if there's not enough space.
691  */
692 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
693                            u32 *crc)
694 {
695         struct ext4_fc_tl tl;
696         u8 *dst;
697
698         dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
699         if (!dst)
700                 return false;
701
702         tl.fc_tag = cpu_to_le16(tag);
703         tl.fc_len = cpu_to_le16(len);
704
705         ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
706         ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
707
708         return true;
709 }
710
711 /* Same as above, but adds dentry tlv: the value is an
 * ext4_fc_dentry_info (parent ino + ino) followed by dlen name bytes. */
712 static  bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
713                                         int parent_ino, int ino, int dlen,
714                                         const unsigned char *dname,
715                                         u32 *crc)
716 {
717         struct ext4_fc_dentry_info fcd;
718         struct ext4_fc_tl tl;
719         u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
720                                         crc);
721
722         if (!dst)
723                 return false;
724
725         fcd.fc_parent_ino = cpu_to_le32(parent_ino);
726         fcd.fc_ino = cpu_to_le32(ino);
727         tl.fc_tag = cpu_to_le16(tag);
728         tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
729         ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
730         dst += sizeof(tl);
731         ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
732         dst += sizeof(fcd);
733         ext4_fc_memcpy(sb, dst, dname, dlen, crc);
734         dst += dlen;
735
736         return true;
737 }
738
739 /*
740  * Writes inode in the fast commit space under TLV with tag
 * EXT4_FC_TAG_INODE: the value is the inode number followed by the raw
 * on-disk inode (base size plus i_extra_isize when present).
741  * Returns 0 on success, error on failure.
742  */
743 static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
744 {
745         struct ext4_inode_info *ei = EXT4_I(inode);
746         int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
747         int ret;
748         struct ext4_iloc iloc;
749         struct ext4_fc_inode fc_inode;
750         struct ext4_fc_tl tl;
751         u8 *dst;
752
753         ret = ext4_get_inode_loc(inode, &iloc);
754         if (ret)
755                 return ret;
756
        /* Large inodes carry the extra fields beyond the base 128 bytes. */
757         if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
758                 inode_len += ei->i_extra_isize;
759
760         fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
761         tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
762         tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
763
764         dst = ext4_fc_reserve_space(inode->i_sb,
765                         sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
766         if (!dst)
767                 return -ECANCELED;
768
769         if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
770                 return -ECANCELED;
771         dst += sizeof(tl);
772         if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
773                 return -ECANCELED;
774         dst += sizeof(fc_inode);
        /* Copy the raw on-disk inode image straight from its buffer. */
775         if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
776                                         inode_len, crc))
777                 return -ECANCELED;
778
779         return 0;
780 }
781
782 /*
783  * Writes updated data ranges for the inode in question. Updates CRC.
 * The tracked logical range is walked with ext4_map_blocks(); mapped
 * pieces are logged as ADD_RANGE extents and holes as DEL_RANGE records.
784  * Returns 0 on success, error otherwise.
785  */
786 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
787 {
788         ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
789         struct ext4_inode_info *ei = EXT4_I(inode);
790         struct ext4_map_blocks map;
791         struct ext4_fc_add_range fc_ext;
792         struct ext4_fc_del_range lrange;
793         struct ext4_extent *ex;
794         int ret;
795
        /* Snapshot and consume the tracked range under i_fc_lock. */
796         mutex_lock(&ei->i_fc_lock);
797         if (ei->i_fc_lblk_len == 0) {
798                 mutex_unlock(&ei->i_fc_lock);
799                 return 0;
800         }
801         old_blk_size = ei->i_fc_lblk_start;
802         new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
803         ei->i_fc_lblk_len = 0;
804         mutex_unlock(&ei->i_fc_lock);
805
806         cur_lblk_off = old_blk_size;
807         jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
808                   __func__, cur_lblk_off, new_blk_size, inode->i_ino);
809
810         while (cur_lblk_off <= new_blk_size) {
811                 map.m_lblk = cur_lblk_off;
812                 map.m_len = new_blk_size - cur_lblk_off + 1;
                /* Lookup only (no handle, no flags): does not allocate. */
813                 ret = ext4_map_blocks(NULL, inode, &map, 0);
814                 if (ret < 0)
815                         return -ECANCELED;
816
817                 if (map.m_len == 0) {
818                         cur_lblk_off++;
819                         continue;
820                 }
821
822                 if (ret == 0) {
                        /* Unmapped range: log it as a deletion. */
823                         lrange.fc_ino = cpu_to_le32(inode->i_ino);
824                         lrange.fc_lblk = cpu_to_le32(map.m_lblk);
825                         lrange.fc_len = cpu_to_le32(map.m_len);
826                         if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
827                                             sizeof(lrange), (u8 *)&lrange, crc))
828                                 return -ENOSPC;
829                 } else {
                        /* Mapped range: log it as an extent addition. */
830                         fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
831                         ex = (struct ext4_extent *)&fc_ext.fc_ex;
832                         ex->ee_block = cpu_to_le32(map.m_lblk);
833                         ex->ee_len = cpu_to_le16(map.m_len);
834                         ext4_ext_store_pblock(ex, map.m_pblk);
835                         if (map.m_flags & EXT4_MAP_UNWRITTEN)
836                                 ext4_ext_mark_unwritten(ex);
837                         else
838                                 ext4_ext_mark_initialized(ex);
839                         if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
840                                             sizeof(fc_ext), (u8 *)&fc_ext, crc))
841                                 return -ENOSPC;
842                 }
843
844                 cur_lblk_off += map.m_len;
845         }
846
847         return 0;
848 }
849
850
/* Submit data for all the fast commit inodes */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	struct list_head *pos;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	/* From here on, concurrent updates see a commit in progress */
	ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
		ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
		/*
		 * Drain in-flight updates on this inode (i_fc_updates).
		 * prepare_to_wait() before the re-check avoids losing a
		 * wakeup on i_fc_wait; the spinlock is dropped only around
		 * the actual sleep.
		 */
		while (atomic_read(&ei->i_fc_updates)) {
			DEFINE_WAIT(wait);

			prepare_to_wait(&ei->i_fc_wait, &wait,
						TASK_UNINTERRUPTIBLE);
			if (atomic_read(&ei->i_fc_updates)) {
				spin_unlock(&sbi->s_fc_lock);
				schedule();
				spin_lock(&sbi->s_fc_lock);
			}
			finish_wait(&ei->i_fc_wait, &wait);
		}
		/* Drop the lock while submitting data; re-take to iterate */
		spin_unlock(&sbi->s_fc_lock);
		ret = jbd2_submit_inode_data(ei->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}
887
/* Wait for completion of data for all the fast commit inodes */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *pos, *n;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		/* Only inodes marked committing had their data submitted */
		if (!ext4_test_inode_state(&pos->vfs_inode,
					   EXT4_STATE_FC_COMMITTING))
			continue;
		/*
		 * Drop the spinlock while sleeping on writeback completion;
		 * the _safe iterator lets the walk resume after re-locking.
		 */
		spin_unlock(&sbi->s_fc_lock);

		ret = jbd2_wait_inode_data(journal, pos->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return 0;
}
912
/*
 * Commit all the directory entry updates.
 *
 * Called with sbi->s_fc_lock held and returns with it held (including on
 * error), as annotated by __acquires/__releases; the lock is dropped
 * internally around journal writes.
 */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_dentry_update *fc_dentry;
	struct inode *inode;
	struct list_head *pos, *n, *fcd_pos, *fcd_n;
	struct ext4_inode_info *ei;
	int ret;

	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
		return 0;
	list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) {
		fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update,
					fcd_list);
		/* Non-CREAT ops (link/unlink) need no inode lookup */
		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
			spin_unlock(&sbi->s_fc_lock);
			if (!ext4_fc_add_dentry_tlv(
				sb, fc_dentry->fcd_op,
				fc_dentry->fcd_parent, fc_dentry->fcd_ino,
				fc_dentry->fcd_name.len,
				fc_dentry->fcd_name.name, crc)) {
				ret = -ENOSPC;
				goto lock_and_exit;
			}
			spin_lock(&sbi->s_fc_lock);
			continue;
		}

		/* CREAT: find the in-memory inode this dentry refers to */
		inode = NULL;
		list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
			ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
				inode = &ei->vfs_inode;
				break;
			}
		}
		/*
		 * If we don't find inode in our list, then it was deleted,
		 * in which case, we don't need to record it's create tag.
		 */
		if (!inode)
			continue;
		spin_unlock(&sbi->s_fc_lock);

		/*
		 * We first write the inode and then the create dirent. This
		 * allows the recovery code to create an unnamed inode first
		 * and then link it to a directory entry. This allows us
		 * to use namei.c routines almost as is and simplifies
		 * the recovery code.
		 */
		ret = ext4_fc_write_inode(inode, crc);
		if (ret)
			goto lock_and_exit;

		ret = ext4_fc_write_inode_data(inode, crc);
		if (ret)
			goto lock_and_exit;

		if (!ext4_fc_add_dentry_tlv(
			sb, fc_dentry->fcd_op,
			fc_dentry->fcd_parent, fc_dentry->fcd_ino,
			fc_dentry->fcd_name.len,
			fc_dentry->fcd_name.name, crc)) {
			ret = -ENOSPC;
			goto lock_and_exit;
		}

		spin_lock(&sbi->s_fc_lock);
	}
	return 0;
lock_and_exit:
	/* Re-take the lock so the caller sees a consistent lock contract */
	spin_lock(&sbi->s_fc_lock);
	return ret;
}
992
993 static int ext4_fc_perform_commit(journal_t *journal)
994 {
995         struct super_block *sb = (struct super_block *)(journal->j_private);
996         struct ext4_sb_info *sbi = EXT4_SB(sb);
997         struct ext4_inode_info *iter;
998         struct ext4_fc_head head;
999         struct list_head *pos;
1000         struct inode *inode;
1001         struct blk_plug plug;
1002         int ret = 0;
1003         u32 crc = 0;
1004
1005         ret = ext4_fc_submit_inode_data_all(journal);
1006         if (ret)
1007                 return ret;
1008
1009         ret = ext4_fc_wait_inode_data_all(journal);
1010         if (ret)
1011                 return ret;
1012
1013         /*
1014          * If file system device is different from journal device, issue a cache
1015          * flush before we start writing fast commit blocks.
1016          */
1017         if (journal->j_fs_dev != journal->j_dev)
1018                 blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS);
1019
1020         blk_start_plug(&plug);
1021         if (sbi->s_fc_bytes == 0) {
1022                 /*
1023                  * Add a head tag only if this is the first fast commit
1024                  * in this TID.
1025                  */
1026                 head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1027                 head.fc_tid = cpu_to_le32(
1028                         sbi->s_journal->j_running_transaction->t_tid);
1029                 if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1030                         (u8 *)&head, &crc))
1031                         goto out;
1032         }
1033
1034         spin_lock(&sbi->s_fc_lock);
1035         ret = ext4_fc_commit_dentry_updates(journal, &crc);
1036         if (ret) {
1037                 spin_unlock(&sbi->s_fc_lock);
1038                 goto out;
1039         }
1040
1041         list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
1042                 iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
1043                 inode = &iter->vfs_inode;
1044                 if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1045                         continue;
1046
1047                 spin_unlock(&sbi->s_fc_lock);
1048                 ret = ext4_fc_write_inode_data(inode, &crc);
1049                 if (ret)
1050                         goto out;
1051                 ret = ext4_fc_write_inode(inode, &crc);
1052                 if (ret)
1053                         goto out;
1054                 spin_lock(&sbi->s_fc_lock);
1055         }
1056         spin_unlock(&sbi->s_fc_lock);
1057
1058         ret = ext4_fc_write_tail(sb, crc);
1059
1060 out:
1061         blk_finish_plug(&plug);
1062         return ret;
1063 }
1064
/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If it's not possible to perform a fast commit
 * due to various reasons, we fall back to full commit. Returns 0
 * on success, error otherwise.
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int nblks = 0, ret, bsize = journal->j_blocksize;
	int subtid = atomic_read(&sbi->s_fc_subtid);
	int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
	ktime_t start_time, commit_time;

	trace_ext4_fc_commit_start(sb);

	start_time = ktime_get();

	/* Fall back right away if fast commits are off or marked ineligible */
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
		(ext4_fc_is_ineligible(sb))) {
		reason = EXT4_FC_REASON_INELIGIBLE;
		goto out;
	}

restart_fc:
	ret = jbd2_fc_begin_commit(journal, commit_tid);
	if (ret == -EALREADY) {
		/* There was an ongoing commit, check if we need to restart */
		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
			commit_tid > journal->j_commit_sequence)
			goto restart_fc;
		reason = EXT4_FC_REASON_ALREADY_COMMITTED;
		goto out;
	} else if (ret) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_START_FAILED;
		goto out;
	}

	/* Blocks used before/after tell us how many this fast commit wrote */
	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
	ret = ext4_fc_perform_commit(journal);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
	ret = jbd2_fc_wait_bufs(journal, nblks);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	atomic_inc(&sbi->s_fc_subtid);
	jbd2_fc_end_commit(journal);
out:
	/* Has any ineligible update happened since we started? */
	if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_INELIGIBLE;
	}

	spin_lock(&sbi->s_fc_lock);
	if (reason != EXT4_FC_REASON_OK &&
		reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
		sbi->s_fc_stats.fc_ineligible_commits++;
	} else {
		sbi->s_fc_stats.fc_num_commits++;
		sbi->s_fc_stats.fc_numblks += nblks;
	}
	spin_unlock(&sbi->s_fc_lock);
	nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
	trace_ext4_fc_commit_stop(sb, nblks, reason);
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	/*
	 * weight the commit time higher than the average time so we don't
	 * react too strongly to vast changes in the commit time
	 */
	if (likely(sbi->s_fc_avg_commit_time))
		sbi->s_fc_avg_commit_time = (commit_time +
				sbi->s_fc_avg_commit_time * 3) / 4;
	else
		sbi->s_fc_avg_commit_time = commit_time;
	jbd_debug(1,
		"Fast commit ended with blks = %d, reason = %d, subtid - %d",
		nblks, reason, subtid);
	/* On failure, fall back to (or wait for) a full jbd2 commit */
	if (reason == EXT4_FC_REASON_FC_FAILED)
		return jbd2_fc_end_commit_fallback(journal);
	if (reason == EXT4_FC_REASON_FC_START_FAILED ||
		reason == EXT4_FC_REASON_INELIGIBLE)
		return jbd2_complete_transaction(journal, commit_tid);
	return 0;
}
1159
1160 /*
1161  * Fast commit cleanup routine. This is called after every fast commit and
1162  * full commit. full is true if we are called after a full commit.
1163  */
1164 static void ext4_fc_cleanup(journal_t *journal, int full)
1165 {
1166         struct super_block *sb = journal->j_private;
1167         struct ext4_sb_info *sbi = EXT4_SB(sb);
1168         struct ext4_inode_info *iter;
1169         struct ext4_fc_dentry_update *fc_dentry;
1170         struct list_head *pos, *n;
1171
1172         if (full && sbi->s_fc_bh)
1173                 sbi->s_fc_bh = NULL;
1174
1175         jbd2_fc_release_bufs(journal);
1176
1177         spin_lock(&sbi->s_fc_lock);
1178         list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
1179                 iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
1180                 list_del_init(&iter->i_fc_list);
1181                 ext4_clear_inode_state(&iter->vfs_inode,
1182                                        EXT4_STATE_FC_COMMITTING);
1183                 ext4_fc_reset_inode(&iter->vfs_inode);
1184                 /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1185                 smp_mb();
1186 #if (BITS_PER_LONG < 64)
1187                 wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1188 #else
1189                 wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1190 #endif
1191         }
1192
1193         while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1194                 fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1195                                              struct ext4_fc_dentry_update,
1196                                              fcd_list);
1197                 list_del_init(&fc_dentry->fcd_list);
1198                 spin_unlock(&sbi->s_fc_lock);
1199
1200                 if (fc_dentry->fcd_name.name &&
1201                         fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1202                         kfree(fc_dentry->fcd_name.name);
1203                 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1204                 spin_lock(&sbi->s_fc_lock);
1205         }
1206
1207         list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1208                                 &sbi->s_fc_dentry_q[FC_Q_MAIN]);
1209         list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1210                                 &sbi->s_fc_q[FC_Q_STAGING]);
1211
1212         ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
1213         ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
1214
1215         if (full)
1216                 sbi->s_fc_bytes = 0;
1217         spin_unlock(&sbi->s_fc_lock);
1218         trace_ext4_fc_stats(sb);
1219 }
1220
1221 /* Ext4 Replay Path Routines */
1222
1223 /* Get length of a particular tlv */
1224 static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl)
1225 {
1226         return le16_to_cpu(tl->fc_len);
1227 }
1228
1229 /* Get a pointer to "value" of a tlv */
1230 static inline u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl)
1231 {
1232         return (u8 *)tl + sizeof(*tl);
1233 }
1234
/*
 * Helper struct for dentry replay routines: a host-endian decoded view of
 * an on-disk struct ext4_fc_dentry_info (see tl_to_darg()).
 */
struct dentry_info_args {
	/* parent dir inode, name length, target inode, inode record length */
	int parent_ino, dname_len, ino, inode_len;
	/* points into the TLV value buffer; not separately allocated */
	char *dname;
};
1240
1241 static inline void tl_to_darg(struct dentry_info_args *darg,
1242                                 struct  ext4_fc_tl *tl)
1243 {
1244         struct ext4_fc_dentry_info *fcd;
1245
1246         fcd = (struct ext4_fc_dentry_info *)ext4_fc_tag_val(tl);
1247
1248         darg->parent_ino = le32_to_cpu(fcd->fc_parent_ino);
1249         darg->ino = le32_to_cpu(fcd->fc_ino);
1250         darg->dname = fcd->fc_dname;
1251         darg->dname_len = ext4_fc_tag_len(tl) -
1252                         sizeof(struct ext4_fc_dentry_info);
1253 }
1254
1255 /* Unlink replay function */
1256 static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl)
1257 {
1258         struct inode *inode, *old_parent;
1259         struct qstr entry;
1260         struct dentry_info_args darg;
1261         int ret = 0;
1262
1263         tl_to_darg(&darg, tl);
1264
1265         trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1266                         darg.parent_ino, darg.dname_len);
1267
1268         entry.name = darg.dname;
1269         entry.len = darg.dname_len;
1270         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1271
1272         if (IS_ERR_OR_NULL(inode)) {
1273                 jbd_debug(1, "Inode %d not found", darg.ino);
1274                 return 0;
1275         }
1276
1277         old_parent = ext4_iget(sb, darg.parent_ino,
1278                                 EXT4_IGET_NORMAL);
1279         if (IS_ERR_OR_NULL(old_parent)) {
1280                 jbd_debug(1, "Dir with inode  %d not found", darg.parent_ino);
1281                 iput(inode);
1282                 return 0;
1283         }
1284
1285         ret = __ext4_unlink(NULL, old_parent, &entry, inode);
1286         /* -ENOENT ok coz it might not exist anymore. */
1287         if (ret == -ENOENT)
1288                 ret = 0;
1289         iput(old_parent);
1290         iput(inode);
1291         return ret;
1292 }
1293
/*
 * Create the directory entry described by @darg pointing at @inode via the
 * regular ext4 link path. Shared by the link and create replay paths.
 * Returns 0 on success; an already-existing entry also counts as success.
 */
static int ext4_fc_replay_link_internal(struct super_block *sb,
				struct dentry_info_args *darg,
				struct inode *inode)
{
	struct inode *dir = NULL;
	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
	int ret = 0;

	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
	if (IS_ERR(dir)) {
		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
		dir = NULL;
		goto out;
	}

	/*
	 * NOTE(review): d_obtain_alias() is documented to consume the inode
	 * reference even on failure; the iput(dir) fallback in the "out"
	 * path below looks like it could over-put on this error path —
	 * confirm against d_obtain_alias() semantics.
	 */
	dentry_dir = d_obtain_alias(dir);
	if (IS_ERR(dentry_dir)) {
		jbd_debug(1, "Failed to obtain dentry");
		dentry_dir = NULL;
		goto out;
	}

	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
	if (!dentry_inode) {
		jbd_debug(1, "Inode dentry not created.");
		ret = -ENOMEM;
		goto out;
	}

	ret = __ext4_link(dir, inode, dentry_inode);
	/*
	 * It's possible that link already existed since data blocks
	 * for the dir in question got persisted before we crashed OR
	 * we replayed this tag and crashed before the entire replay
	 * could complete.
	 */
	if (ret && ret != -EEXIST) {
		jbd_debug(1, "Failed to link\n");
		goto out;
	}

	ret = 0;
out:
	/*
	 * Once dentry_dir exists it holds the dir reference, so release
	 * either the dentry or the raw inode reference, never both.
	 */
	if (dentry_dir) {
		d_drop(dentry_dir);
		dput(dentry_dir);
	} else if (dir) {
		iput(dir);
	}
	if (dentry_inode) {
		d_drop(dentry_inode);
		dput(dentry_inode);
	}

	return ret;
}
1351
1352 /* Link replay function */
1353 static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl)
1354 {
1355         struct inode *inode;
1356         struct dentry_info_args darg;
1357         int ret = 0;
1358
1359         tl_to_darg(&darg, tl);
1360         trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1361                         darg.parent_ino, darg.dname_len);
1362
1363         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1364         if (IS_ERR_OR_NULL(inode)) {
1365                 jbd_debug(1, "Inode not found.");
1366                 return 0;
1367         }
1368
1369         ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1370         iput(inode);
1371         return ret;
1372 }
1373
1374 /*
1375  * Record all the modified inodes during replay. We use this later to setup
1376  * block bitmaps correctly.
1377  */
1378 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1379 {
1380         struct ext4_fc_replay_state *state;
1381         int i;
1382
1383         state = &EXT4_SB(sb)->s_fc_replay_state;
1384         for (i = 0; i < state->fc_modified_inodes_used; i++)
1385                 if (state->fc_modified_inodes[i] == ino)
1386                         return 0;
1387         if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1388                 state->fc_modified_inodes_size +=
1389                         EXT4_FC_REPLAY_REALLOC_INCREMENT;
1390                 state->fc_modified_inodes = krealloc(
1391                                         state->fc_modified_inodes, sizeof(int) *
1392                                         state->fc_modified_inodes_size,
1393                                         GFP_KERNEL);
1394                 if (!state->fc_modified_inodes)
1395                         return -ENOMEM;
1396         }
1397         state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1398         return 0;
1399 }
1400
1401 /*
1402  * Inode replay function
1403  */
1404 static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl)
1405 {
1406         struct ext4_fc_inode *fc_inode;
1407         struct ext4_inode *raw_inode;
1408         struct ext4_inode *raw_fc_inode;
1409         struct inode *inode = NULL;
1410         struct ext4_iloc iloc;
1411         int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1412         struct ext4_extent_header *eh;
1413
1414         fc_inode = (struct ext4_fc_inode *)ext4_fc_tag_val(tl);
1415
1416         ino = le32_to_cpu(fc_inode->fc_ino);
1417         trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1418
1419         inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1420         if (!IS_ERR_OR_NULL(inode)) {
1421                 ext4_ext_clear_bb(inode);
1422                 iput(inode);
1423         }
1424
1425         ext4_fc_record_modified_inode(sb, ino);
1426
1427         raw_fc_inode = (struct ext4_inode *)fc_inode->fc_raw_inode;
1428         ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1429         if (ret)
1430                 goto out;
1431
1432         inode_len = ext4_fc_tag_len(tl) - sizeof(struct ext4_fc_inode);
1433         raw_inode = ext4_raw_inode(&iloc);
1434
1435         memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1436         memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1437                 inode_len - offsetof(struct ext4_inode, i_generation));
1438         if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1439                 eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1440                 if (eh->eh_magic != EXT4_EXT_MAGIC) {
1441                         memset(eh, 0, sizeof(*eh));
1442                         eh->eh_magic = EXT4_EXT_MAGIC;
1443                         eh->eh_max = cpu_to_le16(
1444                                 (sizeof(raw_inode->i_block) -
1445                                  sizeof(struct ext4_extent_header))
1446                                  / sizeof(struct ext4_extent));
1447                 }
1448         } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1449                 memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1450                         sizeof(raw_inode->i_block));
1451         }
1452
1453         /* Immediately update the inode on disk. */
1454         ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1455         if (ret)
1456                 goto out;
1457         ret = sync_dirty_buffer(iloc.bh);
1458         if (ret)
1459                 goto out;
1460         ret = ext4_mark_inode_used(sb, ino);
1461         if (ret)
1462                 goto out;
1463
1464         /* Given that we just wrote the inode on disk, this SHOULD succeed. */
1465         inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1466         if (IS_ERR_OR_NULL(inode)) {
1467                 jbd_debug(1, "Inode not found.");
1468                 return -EFSCORRUPTED;
1469         }
1470
1471         /*
1472          * Our allocator could have made different decisions than before
1473          * crashing. This should be fixed but until then, we calculate
1474          * the number of blocks the inode.
1475          */
1476         ext4_ext_replay_set_iblocks(inode);
1477
1478         inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1479         ext4_reset_inode_seed(inode);
1480
1481         ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1482         ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1483         sync_dirty_buffer(iloc.bh);
1484         brelse(iloc.bh);
1485 out:
1486         iput(inode);
1487         if (!ret)
1488                 blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
1489
1490         return 0;
1491 }
1492
/*
 * Dentry create replay function.
 *
 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
 * inode for which we are trying to create a dentry here, should already have
 * been replayed before we start here.
 */
static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl)
{
	int ret = 0;
	struct inode *inode = NULL;
	struct inode *dir = NULL;
	struct dentry_info_args darg;

	tl_to_darg(&darg, tl);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
			darg.parent_ino, darg.dname_len);

	/* This takes care of update group descriptor and other metadata */
	ret = ext4_mark_inode_used(sb, darg.ino);
	if (ret)
		goto out;

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "inode %d not found.", darg.ino);
		inode = NULL;
		ret = -EINVAL;
		goto out;
	}

	if (S_ISDIR(inode->i_mode)) {
		/*
		 * If we are creating a directory, we need to make sure that the
		 * dot and dot dot dirents are setup properly.
		 */
		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
		if (IS_ERR_OR_NULL(dir)) {
			/*
			 * NOTE(review): the message prints darg.ino although
			 * the lookup above used darg.parent_ino — confirm
			 * which inode number was intended here.
			 */
			jbd_debug(1, "Dir %d not found.", darg.ino);
			goto out;
		}
		ret = ext4_init_new_dir(NULL, dir, inode);
		iput(dir);
		/* Best effort: a dot/dotdot setup failure is not fatal */
		if (ret) {
			ret = 0;
			goto out;
		}
	}
	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	if (ret)
		goto out;
	/* CREAT means a fresh inode, so its link count is exactly one */
	set_nlink(inode, 1);
	ext4_mark_inode_dirty(NULL, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}
1552
1553 /*
1554  * Record physical disk regions which are in use as per fast commit area. Our
1555  * simple replay phase allocator excludes these regions from allocation.
1556  */
1557 static int ext4_fc_record_regions(struct super_block *sb, int ino,
1558                 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
1559 {
1560         struct ext4_fc_replay_state *state;
1561         struct ext4_fc_alloc_region *region;
1562
1563         state = &EXT4_SB(sb)->s_fc_replay_state;
1564         if (state->fc_regions_used == state->fc_regions_size) {
1565                 state->fc_regions_size +=
1566                         EXT4_FC_REPLAY_REALLOC_INCREMENT;
1567                 state->fc_regions = krealloc(
1568                                         state->fc_regions,
1569                                         state->fc_regions_size *
1570                                         sizeof(struct ext4_fc_alloc_region),
1571                                         GFP_KERNEL);
1572                 if (!state->fc_regions)
1573                         return -ENOMEM;
1574         }
1575         region = &state->fc_regions[state->fc_regions_used++];
1576         region->ino = ino;
1577         region->lblk = lblk;
1578         region->pblk = pblk;
1579         region->len = len;
1580
1581         return 0;
1582 }
1583
/*
 * Replay an ADD_RANGE tag: make the inode's extent tree agree with the
 * logical-to-physical mapping recorded in the tag, inserting new extents,
 * remapping relocated ranges, or toggling the written/unwritten state as
 * needed. Always returns 0; failures are logged and the tag is skipped so
 * replay of subsequent tags can continue.
 */
static int ext4_fc_replay_add_range(struct super_block *sb,
				struct ext4_fc_tl *tl)
{
	struct ext4_fc_add_range *fc_add_ex;
	struct ext4_extent newex, *ex;
	struct inode *inode;
	ext4_lblk_t start, cur;
	int remaining, len;
	ext4_fsblk_t start_pblk;
	struct ext4_map_blocks map;
	struct ext4_ext_path *path = NULL;
	int ret;

	fc_add_ex = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
	ex = (struct ext4_extent *)&fc_add_ex->fc_ex;

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
		le32_to_cpu(fc_add_ex->fc_ino), le32_to_cpu(ex->ee_block),
		ext4_ext_get_actual_len(ex));

	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino),
				EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	/*
	 * NOTE(review): the return value of ext4_fc_record_modified_inode()
	 * is ignored and overwritten below; an allocation failure there would
	 * go unnoticed — confirm whether that is intentional.
	 */
	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);

	/* Range described by the tag: [start, start + len) -> start_pblk. */
	start = le32_to_cpu(ex->ee_block);
	start_pblk = ext4_ext_pblock(ex);
	len = ext4_ext_get_actual_len(ex);

	cur = start;
	remaining = len;
	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
		  inode->i_ino);

	/*
	 * Walk the logical range one on-disk mapping at a time; each
	 * iteration handles one of three cases: hole, relocated mapping,
	 * or mapping that only needs a state change.
	 */
	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;
		map.m_pblk = 0;
		ret = ext4_map_blocks(NULL, inode, &map, 0);

		if (ret < 0) {
			iput(inode);
			return 0;
		}

		if (ret == 0) {
			/* Range is not mapped */
			path = ext4_find_extent(inode, cur, NULL, 0);
			if (IS_ERR(path)) {
				iput(inode);
				return 0;
			}
			memset(&newex, 0, sizeof(newex));
			newex.ee_block = cpu_to_le32(cur);
			ext4_ext_store_pblock(
				&newex, start_pblk + cur - start);
			/*
			 * NOTE(review): map.m_len is the hole length reported
			 * by ext4_map_blocks(); presumably it is clamped to
			 * the requested length — confirm it cannot exceed
			 * 'remaining' and overshoot the recorded range.
			 */
			newex.ee_len = cpu_to_le16(map.m_len);
			if (ext4_ext_is_unwritten(ex))
				ext4_ext_mark_unwritten(&newex);
			down_write(&EXT4_I(inode)->i_data_sem);
			ret = ext4_ext_insert_extent(
				NULL, inode, &path, &newex, 0);
			up_write((&EXT4_I(inode)->i_data_sem));
			ext4_ext_drop_refs(path);
			kfree(path);
			if (ret) {
				iput(inode);
				return 0;
			}
			goto next;
		}

		if (start_pblk + cur - start != map.m_pblk) {
			/*
			 * Logical to physical mapping changed. This can happen
			 * if this range was removed and then reallocated to
			 * map to new physical blocks during a fast commit.
			 */
			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex),
					start_pblk + cur - start);
			if (ret) {
				iput(inode);
				return 0;
			}
			/*
			 * Mark the old blocks as free since they aren't used
			 * anymore. We maintain an array of all the modified
			 * inodes. In case these blocks are still used at either
			 * a different logical range in the same inode or in
			 * some different inode, we will mark them as allocated
			 * at the end of the FC replay using our array of
			 * modified inodes.
			 */
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
			goto next;
		}

		/* Range is mapped and needs a state change */
		jbd_debug(1, "Converting from %d to %d %lld",
				map.m_flags & EXT4_MAP_UNWRITTEN,
			ext4_ext_is_unwritten(ex), map.m_pblk);
		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex), map.m_pblk);
		if (ret) {
			iput(inode);
			return 0;
		}
		/*
		 * We may have split the extent tree while toggling the state.
		 * Try to shrink the extent tree now.
		 */
		ext4_ext_replay_shrink_inode(inode, start + len);
next:
		cur += map.m_len;
		remaining -= map.m_len;
	}
	/* Final shrink against the inode's i_size, in filesystem blocks. */
	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
					sb->s_blocksize_bits);
	iput(inode);
	return 0;
}
1712
1713 /* Replay DEL_RANGE tag */
1714 static int
1715 ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl)
1716 {
1717         struct inode *inode;
1718         struct ext4_fc_del_range *lrange;
1719         struct ext4_map_blocks map;
1720         ext4_lblk_t cur, remaining;
1721         int ret;
1722
1723         lrange = (struct ext4_fc_del_range *)ext4_fc_tag_val(tl);
1724         cur = le32_to_cpu(lrange->fc_lblk);
1725         remaining = le32_to_cpu(lrange->fc_len);
1726
1727         trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1728                 le32_to_cpu(lrange->fc_ino), cur, remaining);
1729
1730         inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL);
1731         if (IS_ERR_OR_NULL(inode)) {
1732                 jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino));
1733                 return 0;
1734         }
1735
1736         ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1737
1738         jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1739                         inode->i_ino, le32_to_cpu(lrange->fc_lblk),
1740                         le32_to_cpu(lrange->fc_len));
1741         while (remaining > 0) {
1742                 map.m_lblk = cur;
1743                 map.m_len = remaining;
1744
1745                 ret = ext4_map_blocks(NULL, inode, &map, 0);
1746                 if (ret < 0) {
1747                         iput(inode);
1748                         return 0;
1749                 }
1750                 if (ret > 0) {
1751                         remaining -= ret;
1752                         cur += ret;
1753                         ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1754                 } else {
1755                         remaining -= map.m_len;
1756                         cur += map.m_len;
1757                 }
1758         }
1759
1760         ret = ext4_punch_hole(inode,
1761                 le32_to_cpu(lrange->fc_lblk) << sb->s_blocksize_bits,
1762                 le32_to_cpu(lrange->fc_len) <<  sb->s_blocksize_bits);
1763         if (ret)
1764                 jbd_debug(1, "ext4_punch_hole returned %d", ret);
1765         ext4_ext_replay_shrink_inode(inode,
1766                 i_size_read(inode) >> sb->s_blocksize_bits);
1767         ext4_mark_inode_dirty(NULL, inode);
1768         iput(inode);
1769
1770         return 0;
1771 }
1772
1773 static inline const char *tag2str(u16 tag)
1774 {
1775         switch (tag) {
1776         case EXT4_FC_TAG_LINK:
1777                 return "TAG_ADD_ENTRY";
1778         case EXT4_FC_TAG_UNLINK:
1779                 return "TAG_DEL_ENTRY";
1780         case EXT4_FC_TAG_ADD_RANGE:
1781                 return "TAG_ADD_RANGE";
1782         case EXT4_FC_TAG_CREAT:
1783                 return "TAG_CREAT_DENTRY";
1784         case EXT4_FC_TAG_DEL_RANGE:
1785                 return "TAG_DEL_RANGE";
1786         case EXT4_FC_TAG_INODE:
1787                 return "TAG_INODE";
1788         case EXT4_FC_TAG_PAD:
1789                 return "TAG_PAD";
1790         case EXT4_FC_TAG_TAIL:
1791                 return "TAG_TAIL";
1792         case EXT4_FC_TAG_HEAD:
1793                 return "TAG_HEAD";
1794         default:
1795                 return "TAG_ERROR";
1796         }
1797 }
1798
/*
 * Post-replay pass: for every inode recorded as modified during replay,
 * walk its entire logical address space and mark both the extent tree
 * index blocks and the mapped data blocks as in-use in the block bitmaps.
 * Individual tag replays may mark blocks free (see ADD/DEL_RANGE replay)
 * even though they are still referenced elsewhere; this pass re-asserts
 * every block that is actually in use.
 */
static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
	struct ext4_fc_replay_state *state;
	struct inode *inode;
	struct ext4_ext_path *path = NULL;
	struct ext4_map_blocks map;
	int i, ret, j;
	ext4_lblk_t cur, end;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++) {
		inode = ext4_iget(sb, state->fc_modified_inodes[i],
			EXT4_IGET_NORMAL);
		if (IS_ERR_OR_NULL(inode)) {
			/* Inode may have been deleted during replay; skip. */
			jbd_debug(1, "Inode %d not found.",
				state->fc_modified_inodes[i]);
			continue;
		}
		/* Scan the inode's full logical range, mapping by mapping. */
		cur = 0;
		end = EXT_MAX_BLOCKS;
		while (cur < end) {
			map.m_lblk = cur;
			map.m_len = end - cur;

			ret = ext4_map_blocks(NULL, inode, &map, 0);
			if (ret < 0)
				break;

			if (ret > 0) {
				/*
				 * Mark each extent tree index block on the
				 * path to this extent as allocated ...
				 */
				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
				if (!IS_ERR_OR_NULL(path)) {
					for (j = 0; j < path->p_depth; j++)
						ext4_mb_mark_bb(inode->i_sb,
							path[j].p_block, 1, 1);
					ext4_ext_drop_refs(path);
					kfree(path);
				}
				cur += ret;
				/* ... and the mapped data blocks themselves. */
				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
							map.m_len, 1);
			} else {
				/* Hole: advance by its length, at least one block. */
				cur = cur + (map.m_len ? map.m_len : 1);
			}
		}
		iput(inode);
	}
}
1846
1847 /*
1848  * Check if block is in excluded regions for block allocation. The simple
1849  * allocator that runs during replay phase is calls this function to see
1850  * if it is okay to use a block.
1851  */
1852 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1853 {
1854         int i;
1855         struct ext4_fc_replay_state *state;
1856
1857         state = &EXT4_SB(sb)->s_fc_replay_state;
1858         for (i = 0; i < state->fc_regions_valid; i++) {
1859                 if (state->fc_regions[i].ino == 0 ||
1860                         state->fc_regions[i].len == 0)
1861                         continue;
1862                 if (blk >= state->fc_regions[i].pblk &&
1863                     blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1864                         return true;
1865         }
1866         return false;
1867 }
1868
1869 /* Cleanup function called after replay */
1870 void ext4_fc_replay_cleanup(struct super_block *sb)
1871 {
1872         struct ext4_sb_info *sbi = EXT4_SB(sb);
1873
1874         sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1875         kfree(sbi->s_fc_replay_state.fc_regions);
1876         kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1877 }
1878
/*
 * Recovery Scan phase handler
 *
 * This function is called during the scan phase and is responsible
 * for doing following things:
 * - Make sure the fast commit area has valid tags for replay
 * - Count number of tags that need to be replayed by the replay handler
 * - Verify CRC
 * - Create a list of excluded blocks for allocation during replay phase
 *
 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
 * to indicate that scan has finished and JBD2 can now start replay phase.
 * It returns a negative error to indicate that there was an error. At the end
 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
 * to indicate the number of tags that need to be replayed during the replay
 * phase.
 */
static int ext4_fc_replay_scan(journal_t *journal,
				struct buffer_head *bh, int off,
				tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_replay_state *state;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_add_range *ext;
	struct ext4_fc_tl *tl;
	struct ext4_fc_tail *tail;
	__u8 *start, *end;
	struct ext4_fc_head *head;
	struct ext4_extent *ex;

	state = &sbi->s_fc_replay_state;

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	/* First block of the scan: reset all per-scan state. */
	if (state->fc_replay_expected_off == 0) {
		state->fc_cur_tag = 0;
		state->fc_replay_num_tags = 0;
		state->fc_crc = 0;
		state->fc_regions = NULL;
		state->fc_regions_valid = state->fc_regions_used =
			state->fc_regions_size = 0;
		/* Check if we can stop early */
		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
			!= EXT4_FC_TAG_HEAD)
			return 0;
	}

	/* Blocks must arrive strictly in order; anything else is corruption. */
	if (off != state->fc_replay_expected_off) {
		ret = -EFSCORRUPTED;
		goto out_err;
	}

	state->fc_replay_expected_off++;
	fc_for_each_tl(start, end, tl) {
		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
			  tag2str(le16_to_cpu(tl->fc_tag)), bh->b_blocknr);
		switch (le16_to_cpu(tl->fc_tag)) {
		case EXT4_FC_TAG_ADD_RANGE:
			/*
			 * Remember the physical range so the replay-phase
			 * allocator can exclude it (see
			 * ext4_fc_replay_check_excluded()), then fall through
			 * to the common count-and-checksum path.
			 */
			ext = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
			ex = (struct ext4_extent *)&ext->fc_ex;
			ret = ext4_fc_record_regions(sb,
				le32_to_cpu(ext->fc_ino),
				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
				ext4_ext_get_actual_len(ex));
			if (ret < 0)
				break;
			ret = JBD2_FC_REPLAY_CONTINUE;
			fallthrough;
		case EXT4_FC_TAG_DEL_RANGE:
		case EXT4_FC_TAG_LINK:
		case EXT4_FC_TAG_UNLINK:
		case EXT4_FC_TAG_CREAT:
		case EXT4_FC_TAG_INODE:
		case EXT4_FC_TAG_PAD:
			/* Count the tag and fold it into the running CRC. */
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
					sizeof(*tl) + ext4_fc_tag_len(tl));
			break;
		case EXT4_FC_TAG_TAIL:
			/*
			 * Tail closes one fast commit: the CRC covers the
			 * tail only up to (but excluding) its fc_crc field.
			 * On a tid+CRC match, everything seen so far becomes
			 * valid for replay; otherwise stop (or fail with
			 * -EFSBADCRC if nothing valid was seen yet).
			 */
			state->fc_cur_tag++;
			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
						sizeof(*tl) +
						offsetof(struct ext4_fc_tail,
						fc_crc));
			if (le32_to_cpu(tail->fc_tid) == expected_tid &&
				le32_to_cpu(tail->fc_crc) == state->fc_crc) {
				state->fc_replay_num_tags = state->fc_cur_tag;
				state->fc_regions_valid =
					state->fc_regions_used;
			} else {
				ret = state->fc_replay_num_tags ?
					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
			}
			state->fc_crc = 0;
			break;
		case EXT4_FC_TAG_HEAD:
			/* Head opens a fast commit: validate features and tid. */
			head = (struct ext4_fc_head *)ext4_fc_tag_val(tl);
			if (le32_to_cpu(head->fc_features) &
				~EXT4_FC_SUPPORTED_FEATURES) {
				ret = -EOPNOTSUPP;
				break;
			}
			if (le32_to_cpu(head->fc_tid) != expected_tid) {
				ret = JBD2_FC_REPLAY_STOP;
				break;
			}
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
					sizeof(*tl) + ext4_fc_tag_len(tl));
			break;
		default:
			/* Unknown tag: stop, or fail if nothing valid yet. */
			ret = state->fc_replay_num_tags ?
				JBD2_FC_REPLAY_STOP : -ECANCELED;
		}
		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
			break;
	}

out_err:
	trace_ext4_fc_replay_scan(sb, ret, off);
	return ret;
}
2005
/*
 * Main recovery path entry point.
 * The meaning of return codes is similar as above.
 *
 * Called by JBD2 for each fast commit block: PASS_SCAN invocations are
 * forwarded to ext4_fc_replay_scan(); during the replay pass each TLV in
 * the block is dispatched to its tag-specific replay handler until the
 * tag count established by the scan phase is exhausted.
 */
static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
				enum passtype pass, int off, tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl *tl;
	__u8 *start, *end;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
	struct ext4_fc_tail *tail;

	if (pass == PASS_SCAN) {
		state->fc_current_pass = PASS_SCAN;
		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
	}

	/* First block of a new pass: enter replay mode. */
	if (state->fc_current_pass != pass) {
		state->fc_current_pass = pass;
		sbi->s_mount_state |= EXT4_FC_REPLAY;
	}
	/* Scan phase found nothing valid to replay; just finalize bitmaps. */
	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
		jbd_debug(1, "Replay stops\n");
		ext4_fc_set_bitmaps_and_counters(sb);
		return 0;
	}

#ifdef CONFIG_EXT4_DEBUG
	/* Debug knob: artificially cap how many fc blocks get replayed. */
	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
		pr_warn("Dropping fc block %d because max_replay set\n", off);
		return JBD2_FC_REPLAY_STOP;
	}
#endif

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	fc_for_each_tl(start, end, tl) {
		/* All tags counted by the scan phase have been consumed. */
		if (state->fc_replay_num_tags == 0) {
			ret = JBD2_FC_REPLAY_STOP;
			ext4_fc_set_bitmaps_and_counters(sb);
			break;
		}
		jbd_debug(3, "Replay phase, tag:%s\n",
				tag2str(le16_to_cpu(tl->fc_tag)));
		state->fc_replay_num_tags--;
		switch (le16_to_cpu(tl->fc_tag)) {
		case EXT4_FC_TAG_LINK:
			ret = ext4_fc_replay_link(sb, tl);
			break;
		case EXT4_FC_TAG_UNLINK:
			ret = ext4_fc_replay_unlink(sb, tl);
			break;
		case EXT4_FC_TAG_ADD_RANGE:
			ret = ext4_fc_replay_add_range(sb, tl);
			break;
		case EXT4_FC_TAG_CREAT:
			ret = ext4_fc_replay_create(sb, tl);
			break;
		case EXT4_FC_TAG_DEL_RANGE:
			ret = ext4_fc_replay_del_range(sb, tl);
			break;
		case EXT4_FC_TAG_INODE:
			ret = ext4_fc_replay_inode(sb, tl);
			break;
		case EXT4_FC_TAG_PAD:
			/* Padding carries no state; just emit a trace event. */
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
				ext4_fc_tag_len(tl), 0);
			break;
		case EXT4_FC_TAG_TAIL:
			/* Tail was CRC-verified during scan; only sanity-check tid. */
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
				ext4_fc_tag_len(tl), 0);
			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
			WARN_ON(le32_to_cpu(tail->fc_tid) != expected_tid);
			break;
		case EXT4_FC_TAG_HEAD:
			break;
		default:
			trace_ext4_fc_replay(sb, le16_to_cpu(tl->fc_tag), 0,
				ext4_fc_tag_len(tl), 0);
			ret = -ECANCELED;
			break;
		}
		if (ret < 0)
			break;
		ret = JBD2_FC_REPLAY_CONTINUE;
	}
	return ret;
}
2098
2099 void ext4_fc_init(struct super_block *sb, journal_t *journal)
2100 {
2101         /*
2102          * We set replay callback even if fast commit disabled because we may
2103          * could still have fast commit blocks that need to be replayed even if
2104          * fast commit has now been turned off.
2105          */
2106         journal->j_fc_replay_callback = ext4_fc_replay;
2107         if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2108                 return;
2109         journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2110 }
2111
/*
 * Human readable labels for fast commit ineligibility reasons, printed
 * by ext4_fc_info_show(). NOTE(review): indexed by the EXT4_FC_REASON_*
 * values, so the order here presumably must match that enum — confirm
 * against ext4.h before adding or reordering entries.
 */
static const char *fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"Data journalling",
	"FC Commit Failed"
};
2124
2125 int ext4_fc_info_show(struct seq_file *seq, void *v)
2126 {
2127         struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2128         struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2129         int i;
2130
2131         if (v != SEQ_START_TOKEN)
2132                 return 0;
2133
2134         seq_printf(seq,
2135                 "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2136                    stats->fc_num_commits, stats->fc_ineligible_commits,
2137                    stats->fc_numblks,
2138                    div_u64(sbi->s_fc_avg_commit_time, 1000));
2139         seq_puts(seq, "Ineligible reasons:\n");
2140         for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2141                 seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2142                         stats->fc_ineligible_reason_count[i]);
2143
2144         return 0;
2145 }
2146
2147 int __init ext4_fc_init_dentry_cache(void)
2148 {
2149         ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2150                                            SLAB_RECLAIM_ACCOUNT);
2151
2152         if (ext4_fc_dentry_cachep == NULL)
2153                 return -ENOMEM;
2154
2155         return 0;
2156 }