639b2a308c7ba67914592efae7e6f1b8a5b2103f
[linux-2.6-microblaze.git] / fs / ext4 / fast_commit.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 /*
4  * fs/ext4/fast_commit.c
5  *
6  * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
7  *
8  * Ext4 fast commits routines.
9  */
10 #include "ext4.h"
11 #include "ext4_jbd2.h"
12 #include "ext4_extents.h"
13 #include "mballoc.h"
14
15 /*
16  * Ext4 Fast Commits
17  * -----------------
18  *
19  * Ext4 fast commits implement fine grained journalling for Ext4.
20  *
21  * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
22  * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
23  * TLV during the recovery phase. For the scenarios for which we currently
24  * don't have replay code, fast commit falls back to full commits.
25  * Fast commits record delta in one of the following three categories.
26  *
27  * (A) Directory entry updates:
28  *
29  * - EXT4_FC_TAG_UNLINK         - records directory entry unlink
30  * - EXT4_FC_TAG_LINK           - records directory entry link
31  * - EXT4_FC_TAG_CREAT          - records inode and directory entry creation
32  *
33  * (B) File specific data range updates:
34  *
35  * - EXT4_FC_TAG_ADD_RANGE      - records addition of new blocks to an inode
36  * - EXT4_FC_TAG_DEL_RANGE      - records deletion of blocks from an inode
37  *
38  * (C) Inode metadata (mtime / ctime etc):
39  *
40  * - EXT4_FC_TAG_INODE          - record the inode that should be replayed
41  *                                during recovery. Note that iblocks field is
42  *                                not replayed and instead derived during
43  *                                replay.
44  * Commit Operation
45  * ----------------
46  * With fast commits, we maintain all the directory entry operations in the
47  * order in which they are issued in an in-memory queue. This queue is flushed
48  * to disk during the commit operation. We also maintain a list of inodes
49  * that need to be committed during a fast commit in another in memory queue of
50  * inodes. During the commit operation, we commit in the following order:
51  *
52  * [1] Lock inodes for any further data updates by setting COMMITTING state
53  * [2] Submit data buffers of all the inodes
54  * [3] Wait for [2] to complete
55  * [4] Commit all the directory entry updates in the fast commit space
56  * [5] Commit all the changed inode structures
57  * [6] Write tail tag (this tag ensures the atomicity, please read the following
58  *     section for more details).
59  * [7] Wait for [4], [5] and [6] to complete.
60  *
61  * All the inode updates must call ext4_fc_start_update() before starting an
62  * update. If such an ongoing update is present, fast commit waits for it to
63  * complete. The completion of such an update is marked by
64  * ext4_fc_stop_update().
65  *
66  * Fast Commit Ineligibility
67  * -------------------------
68  * Not all operations are supported by fast commits today (e.g extended
69  * attributes). Fast commit ineligiblity is marked by calling one of the
70  * two following functions:
71  *
72  * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall
73  *   back to full commit. This is useful in case of transient errors.
74  *
75  * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
76  *   the fast commits happening between ext4_fc_start_ineligible() and
77  *   ext4_fc_stop_ineligible() and one fast commit after the call to
78  *   ext4_fc_stop_ineligible() to fall back to full commits. It is important to
79  *   make one more fast commit to fall back to full commit after stop call so
80  *   that it guaranteed that the fast commit ineligible operation contained
81  *   within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is
82  *   followed by at least 1 full commit.
83  *
84  * Atomicity of commits
85  * --------------------
86  * In order to guarantee atomicity during the commit operation, fast commit
87  * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
88  * tag contains CRC of the contents and TID of the transaction after which
89  * this fast commit should be applied. Recovery code replays fast commit
90  * logs only if there's at least 1 valid tail present. For every fast commit
91  * operation, there is 1 tail. This means, we may end up with multiple tails
92  * in the fast commit space. Here's an example:
93  *
94  * - Create a new file A and remove existing file B
95  * - fsync()
96  * - Append contents to file A
97  * - Truncate file A
98  * - fsync()
99  *
100  * The fast commit space at the end of above operations would look like this:
101  *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
102  *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
103  *
104  * Replay code should thus check for all the valid tails in the FC area.
105  *
106  * TODOs
107  * -----
108  * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
109  *    eligible update must be protected within ext4_fc_start_update() and
110  *    ext4_fc_stop_update(). These routines are called at much higher
111  *    routines. This can be made more fine grained by combining with
112  *    ext4_journal_start().
113  *
114  * 2) Same above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
115  *
116  * 3) Handle more ineligible cases.
117  */
118
119 #include <trace/events/ext4.h>
120 static struct kmem_cache *ext4_fc_dentry_cachep;
121
122 static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
123 {
124         BUFFER_TRACE(bh, "");
125         if (uptodate) {
126                 ext4_debug("%s: Block %lld up-to-date",
127                            __func__, bh->b_blocknr);
128                 set_buffer_uptodate(bh);
129         } else {
130                 ext4_debug("%s: Block %lld not up-to-date",
131                            __func__, bh->b_blocknr);
132                 clear_buffer_uptodate(bh);
133         }
134
135         unlock_buffer(bh);
136 }
137
138 static inline void ext4_fc_reset_inode(struct inode *inode)
139 {
140         struct ext4_inode_info *ei = EXT4_I(inode);
141
142         ei->i_fc_lblk_start = 0;
143         ei->i_fc_lblk_len = 0;
144 }
145
146 void ext4_fc_init_inode(struct inode *inode)
147 {
148         struct ext4_inode_info *ei = EXT4_I(inode);
149
150         ext4_fc_reset_inode(inode);
151         ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
152         INIT_LIST_HEAD(&ei->i_fc_list);
153         init_waitqueue_head(&ei->i_fc_wait);
154         atomic_set(&ei->i_fc_updates, 0);
155         ei->i_fc_committed_subtid = 0;
156 }
157
/*
 * Sleep until the inode's EXT4_STATE_FC_COMMITTING bit clears.
 *
 * Must be called with sbi->s_fc_lock held. The lock is dropped before
 * sleeping and is NOT re-acquired on return, so callers must re-take it
 * and re-check state (see the restart loops in ext4_fc_start_update()
 * and ext4_fc_del()).
 */
static void ext4_fc_wait_committing_inode(struct inode *inode)
{
	wait_queue_head_t *wq;
	struct ext4_inode_info *ei = EXT4_I(inode);

	/*
	 * The EXT4_STATE_* bits live in different fields depending on word
	 * size (i_state_flags on 32-bit, folded into i_flags on 64-bit —
	 * presumably mirroring how ext4_{test,set}_inode_state stores them),
	 * so pick the matching word for the wait bit.
	 */
#if (BITS_PER_LONG < 64)
	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_state_flags,
				EXT4_STATE_FC_COMMITTING);
#else
	DEFINE_WAIT_BIT(wait, &ei->i_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_flags,
				EXT4_STATE_FC_COMMITTING);
#endif
	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
	/* Queue ourselves before dropping the lock to avoid a lost wakeup. */
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
}
181
/*
 * Inform Ext4's fast commit subsystem about the start of an inode update.
 *
 * This function is called by the high level VFS callbacks before
 * performing any inode update. It blocks while a fast commit is in
 * progress on the inode in question, then records the update by bumping
 * i_fc_updates (paired with ext4_fc_stop_update()).
 */
void ext4_fc_start_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	/* Nothing to track when fast commits are off or we are replaying. */
	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	/* Not queued for commit: no commit can be racing with us. */
	if (list_empty(&ei->i_fc_list))
		goto out;

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		/* Drops s_fc_lock while sleeping; state must be re-checked. */
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
out:
	atomic_inc(&ei->i_fc_updates);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}
210
211 /*
212  * Stop inode update and wake up waiting fast commits if any.
213  */
214 void ext4_fc_stop_update(struct inode *inode)
215 {
216         struct ext4_inode_info *ei = EXT4_I(inode);
217
218         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
219             (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
220                 return;
221
222         if (atomic_dec_and_test(&ei->i_fc_updates))
223                 wake_up_all(&ei->i_fc_wait);
224 }
225
/*
 * Remove inode from fast commit list. If the inode is being committed
 * we wait until inode commit is done, then remove it.
 */
void ext4_fc_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	/* Not queued anywhere — nothing to remove. */
	if (list_empty(&ei->i_fc_list)) {
		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
		return;
	}

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		/* Drops s_fc_lock while sleeping; re-check from the top. */
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
	list_del_init(&ei->i_fc_list);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}
252
253 /*
254  * Mark file system as fast commit ineligible. This means that next commit
255  * operation would result in a full jbd2 commit.
256  */
257 void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
258 {
259         struct ext4_sb_info *sbi = EXT4_SB(sb);
260
261         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
262             (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
263                 return;
264
265         sbi->s_mount_flags |= EXT4_MF_FC_INELIGIBLE;
266         WARN_ON(reason >= EXT4_FC_REASON_MAX);
267         sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
268 }
269
270 /*
271  * Start a fast commit ineligible update. Any commits that happen while
272  * such an operation is in progress fall back to full commits.
273  */
274 void ext4_fc_start_ineligible(struct super_block *sb, int reason)
275 {
276         struct ext4_sb_info *sbi = EXT4_SB(sb);
277
278         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
279             (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
280                 return;
281
282         WARN_ON(reason >= EXT4_FC_REASON_MAX);
283         sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
284         atomic_inc(&sbi->s_fc_ineligible_updates);
285 }
286
287 /*
288  * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
289  * to ensure that after stopping the ineligible update, at least one full
290  * commit takes place.
291  */
292 void ext4_fc_stop_ineligible(struct super_block *sb)
293 {
294         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
295             (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
296                 return;
297
298         EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FC_INELIGIBLE;
299         atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
300 }
301
302 static inline int ext4_fc_is_ineligible(struct super_block *sb)
303 {
304         return (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FC_INELIGIBLE) ||
305                 atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates);
306 }
307
/*
 * Generic fast commit tracking function. If this is the first time we are
 * called after a full commit, we initialize fast commit fields and then call
 * __fc_track_fn() with update = 0. If we have already been called after a
 * full commit, we pass update = 1. Based on that, the track function can
 * determine if it needs to track a field for the first time or if it needs
 * to just update the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode in fast commit list.
 * Returns 0 on success, -EOPNOTSUPP when fast commits are unavailable,
 * -EINVAL when the fs is currently fast commit ineligible, or whatever
 * __fc_track_fn() returns.
 */
static int ext4_fc_track_template(
	handle_t *handle, struct inode *inode,
	int (*__fc_track_fn)(struct inode *, void *, bool),
	void *args, int enqueue)
{
	bool update = false;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	tid_t tid = 0;
	int ret;

	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (sbi->s_mount_state & EXT4_FC_REPLAY))
		return -EOPNOTSUPP;

	if (ext4_fc_is_ineligible(inode->i_sb))
		return -EINVAL;

	tid = handle->h_transaction->t_tid;
	mutex_lock(&ei->i_fc_lock);
	if (tid == ei->i_sync_tid) {
		/* Same transaction as the last tracked change: update mode. */
		update = true;
	} else {
		/* New transaction: restart per-inode fast commit tracking. */
		ext4_fc_reset_inode(inode);
		ei->i_sync_tid = tid;
	}
	ret = __fc_track_fn(inode, args, update);
	mutex_unlock(&ei->i_fc_lock);

	if (!enqueue)
		return ret;

	spin_lock(&sbi->s_fc_lock);
	if (list_empty(&EXT4_I(inode)->i_fc_list))
		/*
		 * New inodes go on the staging queue while a commit is in
		 * progress, otherwise straight onto the main queue.
		 */
		list_add_tail(&EXT4_I(inode)->i_fc_list,
				(sbi->s_mount_flags & EXT4_MF_FC_COMMITTING) ?
				&sbi->s_fc_q[FC_Q_STAGING] :
				&sbi->s_fc_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}
360
/* Arguments threaded through ext4_fc_track_template() to __track_dentry_update(). */
struct __track_dentry_update_args {
	struct dentry *dentry;	/* dentry being created/linked/unlinked */
	int op;			/* EXT4_FC_TAG_{CREAT,LINK,UNLINK} */
};
365
/*
 * __track_fn for directory entry updates. Called with ei->i_fc_lock held.
 *
 * Allocates a dentry update node describing the operation and queues it
 * on the superblock's dentry queue. Returns 0 on success or -ENOMEM (in
 * which case the fs is also marked fast commit ineligible).
 */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
	struct ext4_fc_dentry_update *node;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct __track_dentry_update_args *dentry_update =
		(struct __track_dentry_update_args *)arg;
	struct dentry *dentry = dentry_update->dentry;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	/*
	 * Drop i_fc_lock around the allocations below; the caller
	 * (ext4_fc_track_template()) expects the lock to be re-held on
	 * return regardless of outcome.
	 */
	mutex_unlock(&ei->i_fc_lock);
	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
	if (!node) {
		/* On ENOMEM, force the next commit to be a full commit. */
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
		mutex_lock(&ei->i_fc_lock);
		return -ENOMEM;
	}

	node->fcd_op = dentry_update->op;
	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
	node->fcd_ino = inode->i_ino;
	/* Long names need a heap copy; short ones fit the inline buffer. */
	if (dentry->d_name.len > DNAME_INLINE_LEN) {
		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
		if (!node->fcd_name.name) {
			kmem_cache_free(ext4_fc_dentry_cachep, node);
			ext4_fc_mark_ineligible(inode->i_sb,
				EXT4_FC_REASON_NOMEM);
			mutex_lock(&ei->i_fc_lock);
			return -ENOMEM;
		}
		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
			dentry->d_name.len);
	} else {
		memcpy(node->fcd_iname, dentry->d_name.name,
			dentry->d_name.len);
		node->fcd_name.name = node->fcd_iname;
	}
	node->fcd_name.len = dentry->d_name.len;

	/* Queue on staging while a commit runs, main queue otherwise. */
	spin_lock(&sbi->s_fc_lock);
	if (sbi->s_mount_flags & EXT4_MF_FC_COMMITTING)
		list_add_tail(&node->fcd_list,
				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
	else
		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);
	mutex_lock(&ei->i_fc_lock);

	return 0;
}
416
417 void __ext4_fc_track_unlink(handle_t *handle,
418                 struct inode *inode, struct dentry *dentry)
419 {
420         struct __track_dentry_update_args args;
421         int ret;
422
423         args.dentry = dentry;
424         args.op = EXT4_FC_TAG_UNLINK;
425
426         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
427                                         (void *)&args, 0);
428         trace_ext4_fc_track_unlink(inode, dentry, ret);
429 }
430
/* Convenience wrapper: track unlink of @dentry against its own inode. */
void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
}
435
436 void __ext4_fc_track_link(handle_t *handle,
437         struct inode *inode, struct dentry *dentry)
438 {
439         struct __track_dentry_update_args args;
440         int ret;
441
442         args.dentry = dentry;
443         args.op = EXT4_FC_TAG_LINK;
444
445         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
446                                         (void *)&args, 0);
447         trace_ext4_fc_track_link(inode, dentry, ret);
448 }
449
/* Convenience wrapper: track link of @dentry against its own inode. */
void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
	__ext4_fc_track_link(handle, d_inode(dentry), dentry);
}
454
455 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
456 {
457         struct __track_dentry_update_args args;
458         struct inode *inode = d_inode(dentry);
459         int ret;
460
461         args.dentry = dentry;
462         args.op = EXT4_FC_TAG_CREAT;
463
464         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
465                                         (void *)&args, 0);
466         trace_ext4_fc_track_create(inode, dentry, ret);
467 }
468
469 /* __track_fn for inode tracking */
470 static int __track_inode(struct inode *inode, void *arg, bool update)
471 {
472         if (update)
473                 return -EEXIST;
474
475         EXT4_I(inode)->i_fc_lblk_len = 0;
476
477         return 0;
478 }
479
480 void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
481 {
482         int ret;
483
484         if (S_ISDIR(inode->i_mode))
485                 return;
486
487         ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
488         trace_ext4_fc_track_inode(inode, ret);
489 }
490
/* Inclusive logical block range [start, end] handed to __track_range(). */
struct __track_range_args {
	ext4_lblk_t start, end;
};
494
495 /* __track_fn for tracking data updates */
496 static int __track_range(struct inode *inode, void *arg, bool update)
497 {
498         struct ext4_inode_info *ei = EXT4_I(inode);
499         ext4_lblk_t oldstart;
500         struct __track_range_args *__arg =
501                 (struct __track_range_args *)arg;
502
503         if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
504                 ext4_debug("Special inode %ld being modified\n", inode->i_ino);
505                 return -ECANCELED;
506         }
507
508         oldstart = ei->i_fc_lblk_start;
509
510         if (update && ei->i_fc_lblk_len > 0) {
511                 ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
512                 ei->i_fc_lblk_len =
513                         max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
514                                 ei->i_fc_lblk_start + 1;
515         } else {
516                 ei->i_fc_lblk_start = __arg->start;
517                 ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
518         }
519
520         return 0;
521 }
522
523 void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
524                          ext4_lblk_t end)
525 {
526         struct __track_range_args args;
527         int ret;
528
529         if (S_ISDIR(inode->i_mode))
530                 return;
531
532         args.start = start;
533         args.end = end;
534
535         ret = ext4_fc_track_template(handle, inode,  __track_range, &args, 1);
536
537         trace_ext4_fc_track_range(inode, start, end, ret);
538 }
539
540 static void ext4_fc_submit_bh(struct super_block *sb)
541 {
542         int write_flags = REQ_SYNC;
543         struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
544
545         /* TODO: REQ_FUA | REQ_PREFLUSH is unnecessarily expensive. */
546         if (test_opt(sb, BARRIER))
547                 write_flags |= REQ_FUA | REQ_PREFLUSH;
548         lock_buffer(bh);
549         clear_buffer_dirty(bh);
550         set_buffer_uptodate(bh);
551         bh->b_end_io = ext4_end_buffer_io_sync;
552         submit_bh(REQ_OP_WRITE, write_flags, bh);
553         EXT4_SB(sb)->s_fc_bh = NULL;
554 }
555
556 /* Ext4 commit path routines */
557
558 /* memzero and update CRC */
559 static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
560                                 u32 *crc)
561 {
562         void *ret;
563
564         ret = memset(dst, 0, len);
565         if (crc)
566                 *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
567         return ret;
568 }
569
/*
 * Allocate len bytes on a fast commit buffer.
 *
 * During the commit time this function is used to manage fast commit
 * block space. We don't split a fast commit log onto different
 * blocks. So this function makes sure that if there's not enough space
 * on the current block, the remaining space in the current block is
 * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case, a
 * new block is obtained from jbd2 and CRC is updated to reflect the
 * padding we added.
 *
 * Returns a pointer to len reserved bytes, or NULL if len cannot fit in
 * a block or a jbd2 buffer could not be obtained.
 */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
	struct ext4_fc_tl *tl;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh;
	int bsize = sbi->s_journal->j_blocksize;
	int ret, off = sbi->s_fc_bytes % bsize;
	int pad_len;

	/*
	 * After allocating len, we should have space at least for a 0 byte
	 * padding.
	 */
	if (len + sizeof(struct ext4_fc_tl) > bsize)
		return NULL;

	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
		/*
		 * Only allocate from current buffer if we have enough space for
		 * this request AND we have space to add a zero byte padding.
		 */
		if (!sbi->s_fc_bh) {
			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
			if (ret)
				return NULL;
			sbi->s_fc_bh = bh;
		}
		sbi->s_fc_bytes += len;
		return sbi->s_fc_bh->b_data + off;
	}
	/* Need to add PAD tag to fill out the rest of the current block. */
	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
	tl->fc_len = cpu_to_le16(pad_len);
	if (crc)
		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
	if (pad_len > 0)
		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
	/* Flush the padded-out block and get a fresh one from jbd2. */
	ext4_fc_submit_bh(sb);

	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
	if (ret)
		return NULL;
	sbi->s_fc_bh = bh;
	/* Account for the skipped tail of the old block plus this request. */
	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
	return sbi->s_fc_bh->b_data;
}
629
630 /* memcpy to fc reserved space and update CRC */
631 static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
632                                 int len, u32 *crc)
633 {
634         if (crc)
635                 *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
636         return memcpy(dst, src, len);
637 }
638
/*
 * Complete a fast commit by writing tail tag.
 *
 * Writing tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing tail tag, even if there's space remaining
 * in the block, next commit shouldn't use it. That's why tail tag
 * has the length as that of the remaining space on the block.
 * Returns 0 on success, -ENOSPC if no space could be reserved.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	int off, bsize = sbi->s_journal->j_blocksize;
	u8 *dst;

	/*
	 * ext4_fc_reserve_space takes care of allocating an extra block if
	 * there's no enough space on this block for accommodating this tail.
	 */
	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
	if (!dst)
		return -ENOSPC;

	off = sbi->s_fc_bytes % bsize;

	/* The tail's length claims everything up to the end of the block. */
	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
	dst += sizeof(tl);
	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
	dst += sizeof(tail.fc_tid);
	/* The CRC is written last and is itself excluded from the checksum. */
	tail.fc_crc = cpu_to_le32(crc);
	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);

	ext4_fc_submit_bh(sb);

	return 0;
}
681
682 /*
683  * Adds tag, length, value and updates CRC. Returns true if tlv was added.
684  * Returns false if there's not enough space.
685  */
686 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
687                            u32 *crc)
688 {
689         struct ext4_fc_tl tl;
690         u8 *dst;
691
692         dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
693         if (!dst)
694                 return false;
695
696         tl.fc_tag = cpu_to_le16(tag);
697         tl.fc_len = cpu_to_le16(len);
698
699         ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
700         ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
701
702         return true;
703 }
704
705 /* Same as above, but adds dentry tlv. */
706 static  bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
707                                         int parent_ino, int ino, int dlen,
708                                         const unsigned char *dname,
709                                         u32 *crc)
710 {
711         struct ext4_fc_dentry_info fcd;
712         struct ext4_fc_tl tl;
713         u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
714                                         crc);
715
716         if (!dst)
717                 return false;
718
719         fcd.fc_parent_ino = cpu_to_le32(parent_ino);
720         fcd.fc_ino = cpu_to_le32(ino);
721         tl.fc_tag = cpu_to_le16(tag);
722         tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
723         ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
724         dst += sizeof(tl);
725         ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
726         dst += sizeof(fcd);
727         ext4_fc_memcpy(sb, dst, dname, dlen, crc);
728         dst += dlen;
729
730         return true;
731 }
732
/*
 * Writes inode in the fast commit space under TLV with tag
 * EXT4_FC_TAG_INODE. The value is the inode number followed by the raw
 * on-disk inode (including i_extra_isize bytes when the format has them).
 * Returns 0 on success, error on failure.
 */
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
	int ret;
	struct ext4_iloc iloc;
	struct ext4_fc_inode fc_inode;
	struct ext4_fc_tl tl;
	u8 *dst;

	ret = ext4_get_inode_loc(inode, &iloc);
	if (ret)
		return ret;

	/* Copy the extra inode space too, when the format carries it. */
	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
		inode_len += ei->i_extra_isize;

	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));

	dst = ext4_fc_reserve_space(inode->i_sb,
			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
	if (!dst)
		return -ECANCELED;

	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
		return -ECANCELED;
	dst += sizeof(tl);
	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
		return -ECANCELED;
	dst += sizeof(fc_inode);
	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
					inode_len, crc))
		return -ECANCELED;

	return 0;
}
775
/*
 * Writes updated data ranges for the inode in question. Updates CRC.
 * Emits one EXT4_FC_TAG_ADD_RANGE or EXT4_FC_TAG_DEL_RANGE TLV per
 * contiguous mapping found in the tracked range.
 * Returns 0 on success, error otherwise.
 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_map_blocks map;
	struct ext4_fc_add_range fc_ext;
	struct ext4_fc_del_range lrange;
	struct ext4_extent *ex;
	int ret;

	/* Snapshot and reset the tracked range under i_fc_lock. */
	mutex_lock(&ei->i_fc_lock);
	if (ei->i_fc_lblk_len == 0) {
		mutex_unlock(&ei->i_fc_lock);
		return 0;
	}
	old_blk_size = ei->i_fc_lblk_start;
	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
	ei->i_fc_lblk_len = 0;
	mutex_unlock(&ei->i_fc_lock);

	cur_lblk_off = old_blk_size;
	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);

	while (cur_lblk_off <= new_blk_size) {
		map.m_lblk = cur_lblk_off;
		map.m_len = new_blk_size - cur_lblk_off + 1;
		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			return -ECANCELED;

		if (map.m_len == 0) {
			cur_lblk_off++;
			continue;
		}

		if (ret == 0) {
			/* Unmapped: record the deleted range. */
			lrange.fc_ino = cpu_to_le32(inode->i_ino);
			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
			lrange.fc_len = cpu_to_le32(map.m_len);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
					    sizeof(lrange), (u8 *)&lrange, crc))
				return -ENOSPC;
		} else {
			/* Mapped: record the extent, keeping unwritten state. */
			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
			ex = (struct ext4_extent *)&fc_ext.fc_ex;
			ex->ee_block = cpu_to_le32(map.m_lblk);
			ex->ee_len = cpu_to_le16(map.m_len);
			ext4_ext_store_pblock(ex, map.m_pblk);
			if (map.m_flags & EXT4_MAP_UNWRITTEN)
				ext4_ext_mark_unwritten(ex);
			else
				ext4_ext_mark_initialized(ex);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
				return -ENOSPC;
		}

		cur_lblk_off += map.m_len;
	}

	return 0;
}
843
844
/*
 * Submit data for all the fast commit inodes.
 *
 * Marks the fast commit as in progress (EXT4_MF_FC_COMMITTING), flags
 * every inode on the main fast commit queue with
 * EXT4_STATE_FC_COMMITTING, waits for each inode's in-flight updates to
 * drain, and submits its data via jbd2_submit_inode_data(). Returns 0 on
 * success or the first error from jbd2_submit_inode_data().
 */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	struct list_head *pos;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	/* Mark the commit as in progress. */
	sbi->s_mount_flags |= EXT4_MF_FC_COMMITTING;
	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
		ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
		/*
		 * Wait for in-flight updates (i_fc_updates) on this inode to
		 * drain. s_fc_lock is dropped around schedule() so updaters
		 * can finish and wake us via i_fc_wait; the counter is
		 * re-checked after prepare_to_wait() to avoid a lost wakeup.
		 */
		while (atomic_read(&ei->i_fc_updates)) {
			DEFINE_WAIT(wait);

			prepare_to_wait(&ei->i_fc_wait, &wait,
						TASK_UNINTERRUPTIBLE);
			if (atomic_read(&ei->i_fc_updates)) {
				spin_unlock(&sbi->s_fc_lock);
				schedule();
				spin_lock(&sbi->s_fc_lock);
			}
			finish_wait(&ei->i_fc_wait, &wait);
		}
		/* Drop the spinlock around the (possibly blocking) submission. */
		spin_unlock(&sbi->s_fc_lock);
		ret = jbd2_submit_inode_data(ei->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}
881
/*
 * Wait for completion of data for all the fast commit inodes.
 *
 * For every inode that ext4_fc_submit_inode_data_all() flagged as
 * EXT4_STATE_FC_COMMITTING, wait for its submitted data via
 * jbd2_wait_inode_data(). Returns 0 on success or the first wait error.
 */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *pos, *n;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		/* Skip inodes that were not part of this commit. */
		if (!ext4_test_inode_state(&pos->vfs_inode,
					   EXT4_STATE_FC_COMMITTING))
			continue;
		/*
		 * NOTE(review): s_fc_lock is dropped around the blocking
		 * wait, so the saved 'n' cursor relies on the main queue not
		 * being reorganized while a commit is running -- confirm
		 * against the queue manipulation rules elsewhere in the file.
		 */
		spin_unlock(&sbi->s_fc_lock);

		ret = jbd2_wait_inode_data(journal, pos->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return 0;
}
906
/*
 * Commit all the directory entry updates.
 *
 * Called with sbi->s_fc_lock held and returns with it held (every exit
 * path, including lock_and_exit, re-takes the lock). The lock is dropped
 * around the actual TLV and inode writes. Returns 0 on success, -ENOSPC
 * if the fast commit area is exhausted, or an error from writing an
 * inode.
 */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_dentry_update *fc_dentry;
	struct inode *inode;
	struct list_head *pos, *n, *fcd_pos, *fcd_n;
	struct ext4_inode_info *ei;
	int ret;

	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
		return 0;
	list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) {
		fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update,
					fcd_list);
		/* Non-create ops need only the dentry tag itself. */
		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
			spin_unlock(&sbi->s_fc_lock);
			if (!ext4_fc_add_dentry_tlv(
				sb, fc_dentry->fcd_op,
				fc_dentry->fcd_parent, fc_dentry->fcd_ino,
				fc_dentry->fcd_name.len,
				fc_dentry->fcd_name.name, crc)) {
				ret = -ENOSPC;
				goto lock_and_exit;
			}
			spin_lock(&sbi->s_fc_lock);
			continue;
		}

		/* For creates, find the newly created inode on the main queue. */
		inode = NULL;
		list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
			ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
				inode = &ei->vfs_inode;
				break;
			}
		}
		/*
		 * If we don't find inode in our list, then it was deleted,
		 * in which case, we don't need to record it's create tag.
		 */
		if (!inode)
			continue;
		spin_unlock(&sbi->s_fc_lock);

		/*
		 * We first write the inode and then the create dirent. This
		 * allows the recovery code to create an unnamed inode first
		 * and then link it to a directory entry. This allows us
		 * to use namei.c routines almost as is and simplifies
		 * the recovery code.
		 */
		ret = ext4_fc_write_inode(inode, crc);
		if (ret)
			goto lock_and_exit;

		ret = ext4_fc_write_inode_data(inode, crc);
		if (ret)
			goto lock_and_exit;

		if (!ext4_fc_add_dentry_tlv(
			sb, fc_dentry->fcd_op,
			fc_dentry->fcd_parent, fc_dentry->fcd_ino,
			fc_dentry->fcd_name.len,
			fc_dentry->fcd_name.name, crc)) {
			ret = -ENOSPC;
			goto lock_and_exit;
		}

		spin_lock(&sbi->s_fc_lock);
	}
	return 0;
lock_and_exit:
	/* Re-take the lock so every exit leaves s_fc_lock held for the caller. */
	spin_lock(&sbi->s_fc_lock);
	return ret;
}
984
985 static int ext4_fc_perform_commit(journal_t *journal)
986 {
987         struct super_block *sb = (struct super_block *)(journal->j_private);
988         struct ext4_sb_info *sbi = EXT4_SB(sb);
989         struct ext4_inode_info *iter;
990         struct ext4_fc_head head;
991         struct list_head *pos;
992         struct inode *inode;
993         struct blk_plug plug;
994         int ret = 0;
995         u32 crc = 0;
996
997         ret = ext4_fc_submit_inode_data_all(journal);
998         if (ret)
999                 return ret;
1000
1001         ret = ext4_fc_wait_inode_data_all(journal);
1002         if (ret)
1003                 return ret;
1004
1005         blk_start_plug(&plug);
1006         if (sbi->s_fc_bytes == 0) {
1007                 /*
1008                  * Add a head tag only if this is the first fast commit
1009                  * in this TID.
1010                  */
1011                 head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1012                 head.fc_tid = cpu_to_le32(
1013                         sbi->s_journal->j_running_transaction->t_tid);
1014                 if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1015                         (u8 *)&head, &crc))
1016                         goto out;
1017         }
1018
1019         spin_lock(&sbi->s_fc_lock);
1020         ret = ext4_fc_commit_dentry_updates(journal, &crc);
1021         if (ret) {
1022                 spin_unlock(&sbi->s_fc_lock);
1023                 goto out;
1024         }
1025
1026         list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
1027                 iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
1028                 inode = &iter->vfs_inode;
1029                 if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1030                         continue;
1031
1032                 spin_unlock(&sbi->s_fc_lock);
1033                 ret = ext4_fc_write_inode_data(inode, &crc);
1034                 if (ret)
1035                         goto out;
1036                 ret = ext4_fc_write_inode(inode, &crc);
1037                 if (ret)
1038                         goto out;
1039                 spin_lock(&sbi->s_fc_lock);
1040                 EXT4_I(inode)->i_fc_committed_subtid =
1041                         atomic_read(&sbi->s_fc_subtid);
1042         }
1043         spin_unlock(&sbi->s_fc_lock);
1044
1045         ret = ext4_fc_write_tail(sb, crc);
1046
1047 out:
1048         blk_finish_plug(&plug);
1049         return ret;
1050 }
1051
1052 /*
1053  * The main commit entry point. Performs a fast commit for transaction
1054  * commit_tid if needed. If it's not possible to perform a fast commit
1055  * due to various reasons, we fall back to full commit. Returns 0
1056  * on success, error otherwise.
1057  */
1058 int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1059 {
1060         struct super_block *sb = (struct super_block *)(journal->j_private);
1061         struct ext4_sb_info *sbi = EXT4_SB(sb);
1062         int nblks = 0, ret, bsize = journal->j_blocksize;
1063         int subtid = atomic_read(&sbi->s_fc_subtid);
1064         int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
1065         ktime_t start_time, commit_time;
1066
1067         trace_ext4_fc_commit_start(sb);
1068
1069         start_time = ktime_get();
1070
1071         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
1072                 (ext4_fc_is_ineligible(sb))) {
1073                 reason = EXT4_FC_REASON_INELIGIBLE;
1074                 goto out;
1075         }
1076
1077 restart_fc:
1078         ret = jbd2_fc_begin_commit(journal, commit_tid);
1079         if (ret == -EALREADY) {
1080                 /* There was an ongoing commit, check if we need to restart */
1081                 if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1082                         commit_tid > journal->j_commit_sequence)
1083                         goto restart_fc;
1084                 reason = EXT4_FC_REASON_ALREADY_COMMITTED;
1085                 goto out;
1086         } else if (ret) {
1087                 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1088                 reason = EXT4_FC_REASON_FC_START_FAILED;
1089                 goto out;
1090         }
1091
1092         fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1093         ret = ext4_fc_perform_commit(journal);
1094         if (ret < 0) {
1095                 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1096                 reason = EXT4_FC_REASON_FC_FAILED;
1097                 goto out;
1098         }
1099         nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1100         ret = jbd2_fc_wait_bufs(journal, nblks);
1101         if (ret < 0) {
1102                 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1103                 reason = EXT4_FC_REASON_FC_FAILED;
1104                 goto out;
1105         }
1106         atomic_inc(&sbi->s_fc_subtid);
1107         jbd2_fc_end_commit(journal);
1108 out:
1109         /* Has any ineligible update happened since we started? */
1110         if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
1111                 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1112                 reason = EXT4_FC_REASON_INELIGIBLE;
1113         }
1114
1115         spin_lock(&sbi->s_fc_lock);
1116         if (reason != EXT4_FC_REASON_OK &&
1117                 reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
1118                 sbi->s_fc_stats.fc_ineligible_commits++;
1119         } else {
1120                 sbi->s_fc_stats.fc_num_commits++;
1121                 sbi->s_fc_stats.fc_numblks += nblks;
1122         }
1123         spin_unlock(&sbi->s_fc_lock);
1124         nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
1125         trace_ext4_fc_commit_stop(sb, nblks, reason);
1126         commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1127         /*
1128          * weight the commit time higher than the average time so we don't
1129          * react too strongly to vast changes in the commit time
1130          */
1131         if (likely(sbi->s_fc_avg_commit_time))
1132                 sbi->s_fc_avg_commit_time = (commit_time +
1133                                 sbi->s_fc_avg_commit_time * 3) / 4;
1134         else
1135                 sbi->s_fc_avg_commit_time = commit_time;
1136         jbd_debug(1,
1137                 "Fast commit ended with blks = %d, reason = %d, subtid - %d",
1138                 nblks, reason, subtid);
1139         if (reason == EXT4_FC_REASON_FC_FAILED)
1140                 return jbd2_fc_end_commit_fallback(journal);
1141         if (reason == EXT4_FC_REASON_FC_START_FAILED ||
1142                 reason == EXT4_FC_REASON_INELIGIBLE)
1143                 return jbd2_complete_transaction(journal, commit_tid);
1144         return 0;
1145 }
1146
1147 /*
1148  * Fast commit cleanup routine. This is called after every fast commit and
1149  * full commit. full is true if we are called after a full commit.
1150  */
1151 static void ext4_fc_cleanup(journal_t *journal, int full)
1152 {
1153         struct super_block *sb = journal->j_private;
1154         struct ext4_sb_info *sbi = EXT4_SB(sb);
1155         struct ext4_inode_info *iter;
1156         struct ext4_fc_dentry_update *fc_dentry;
1157         struct list_head *pos, *n;
1158
1159         if (full && sbi->s_fc_bh)
1160                 sbi->s_fc_bh = NULL;
1161
1162         jbd2_fc_release_bufs(journal);
1163
1164         spin_lock(&sbi->s_fc_lock);
1165         list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
1166                 iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
1167                 list_del_init(&iter->i_fc_list);
1168                 ext4_clear_inode_state(&iter->vfs_inode,
1169                                        EXT4_STATE_FC_COMMITTING);
1170                 ext4_fc_reset_inode(&iter->vfs_inode);
1171                 /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1172                 smp_mb();
1173 #if (BITS_PER_LONG < 64)
1174                 wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1175 #else
1176                 wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1177 #endif
1178         }
1179
1180         while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1181                 fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1182                                              struct ext4_fc_dentry_update,
1183                                              fcd_list);
1184                 list_del_init(&fc_dentry->fcd_list);
1185                 spin_unlock(&sbi->s_fc_lock);
1186
1187                 if (fc_dentry->fcd_name.name &&
1188                         fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1189                         kfree(fc_dentry->fcd_name.name);
1190                 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1191                 spin_lock(&sbi->s_fc_lock);
1192         }
1193
1194         list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1195                                 &sbi->s_fc_dentry_q[FC_Q_MAIN]);
1196         list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1197                                 &sbi->s_fc_q[FC_Q_STAGING]);
1198
1199         sbi->s_mount_flags &= ~EXT4_MF_FC_COMMITTING;
1200         sbi->s_mount_flags &= ~EXT4_MF_FC_INELIGIBLE;
1201
1202         if (full)
1203                 sbi->s_fc_bytes = 0;
1204         spin_unlock(&sbi->s_fc_lock);
1205         trace_ext4_fc_stats(sb);
1206 }
1207
1208 /* Ext4 Replay Path Routines */
1209
1210 /* Get length of a particular tlv */
1211 static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl)
1212 {
1213         return le16_to_cpu(tl->fc_len);
1214 }
1215
1216 /* Get a pointer to "value" of a tlv */
1217 static inline u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl)
1218 {
1219         return (u8 *)tl + sizeof(*tl);
1220 }
1221
/* Helper struct for dentry replay routines */
struct dentry_info_args {
	/*
	 * parent_ino: inode number of the parent directory;
	 * dname_len: entry name length in bytes;
	 * ino: inode number the dentry refers to;
	 * inode_len: not filled in by tl_to_darg() -- set elsewhere.
	 */
	int parent_ino, dname_len, ino, inode_len;
	/* Entry name; points into the TLV value buffer, not a copy. */
	char *dname;
};
1227
1228 static inline void tl_to_darg(struct dentry_info_args *darg,
1229                                 struct  ext4_fc_tl *tl)
1230 {
1231         struct ext4_fc_dentry_info *fcd;
1232
1233         fcd = (struct ext4_fc_dentry_info *)ext4_fc_tag_val(tl);
1234
1235         darg->parent_ino = le32_to_cpu(fcd->fc_parent_ino);
1236         darg->ino = le32_to_cpu(fcd->fc_ino);
1237         darg->dname = fcd->fc_dname;
1238         darg->dname_len = ext4_fc_tag_len(tl) -
1239                         sizeof(struct ext4_fc_dentry_info);
1240 }
1241
1242 /* Unlink replay function */
1243 static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl)
1244 {
1245         struct inode *inode, *old_parent;
1246         struct qstr entry;
1247         struct dentry_info_args darg;
1248         int ret = 0;
1249
1250         tl_to_darg(&darg, tl);
1251
1252         trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1253                         darg.parent_ino, darg.dname_len);
1254
1255         entry.name = darg.dname;
1256         entry.len = darg.dname_len;
1257         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1258
1259         if (IS_ERR_OR_NULL(inode)) {
1260                 jbd_debug(1, "Inode %d not found", darg.ino);
1261                 return 0;
1262         }
1263
1264         old_parent = ext4_iget(sb, darg.parent_ino,
1265                                 EXT4_IGET_NORMAL);
1266         if (IS_ERR_OR_NULL(old_parent)) {
1267                 jbd_debug(1, "Dir with inode  %d not found", darg.parent_ino);
1268                 iput(inode);
1269                 return 0;
1270         }
1271
1272         ret = __ext4_unlink(NULL, old_parent, &entry, inode);
1273         /* -ENOENT ok coz it might not exist anymore. */
1274         if (ret == -ENOENT)
1275                 ret = 0;
1276         iput(old_parent);
1277         iput(inode);
1278         return ret;
1279 }
1280
1281 static int ext4_fc_replay_link_internal(struct super_block *sb,
1282                                 struct dentry_info_args *darg,
1283                                 struct inode *inode)
1284 {
1285         struct inode *dir = NULL;
1286         struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1287         struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1288         int ret = 0;
1289
1290         dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1291         if (IS_ERR(dir)) {
1292                 jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
1293                 dir = NULL;
1294                 goto out;
1295         }
1296
1297         dentry_dir = d_obtain_alias(dir);
1298         if (IS_ERR(dentry_dir)) {
1299                 jbd_debug(1, "Failed to obtain dentry");
1300                 dentry_dir = NULL;
1301                 goto out;
1302         }
1303
1304         dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1305         if (!dentry_inode) {
1306                 jbd_debug(1, "Inode dentry not created.");
1307                 ret = -ENOMEM;
1308                 goto out;
1309         }
1310
1311         ret = __ext4_link(dir, inode, dentry_inode);
1312         /*
1313          * It's possible that link already existed since data blocks
1314          * for the dir in question got persisted before we crashed OR
1315          * we replayed this tag and crashed before the entire replay
1316          * could complete.
1317          */
1318         if (ret && ret != -EEXIST) {
1319                 jbd_debug(1, "Failed to link\n");
1320                 goto out;
1321         }
1322
1323         ret = 0;
1324 out:
1325         if (dentry_dir) {
1326                 d_drop(dentry_dir);
1327                 dput(dentry_dir);
1328         } else if (dir) {
1329                 iput(dir);
1330         }
1331         if (dentry_inode) {
1332                 d_drop(dentry_inode);
1333                 dput(dentry_inode);
1334         }
1335
1336         return ret;
1337 }
1338
1339 /* Link replay function */
1340 static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl)
1341 {
1342         struct inode *inode;
1343         struct dentry_info_args darg;
1344         int ret = 0;
1345
1346         tl_to_darg(&darg, tl);
1347         trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1348                         darg.parent_ino, darg.dname_len);
1349
1350         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1351         if (IS_ERR_OR_NULL(inode)) {
1352                 jbd_debug(1, "Inode not found.");
1353                 return 0;
1354         }
1355
1356         ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1357         iput(inode);
1358         return ret;
1359 }
1360
1361 /*
1362  * Record all the modified inodes during replay. We use this later to setup
1363  * block bitmaps correctly.
1364  */
1365 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1366 {
1367         struct ext4_fc_replay_state *state;
1368         int i;
1369
1370         state = &EXT4_SB(sb)->s_fc_replay_state;
1371         for (i = 0; i < state->fc_modified_inodes_used; i++)
1372                 if (state->fc_modified_inodes[i] == ino)
1373                         return 0;
1374         if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1375                 state->fc_modified_inodes_size +=
1376                         EXT4_FC_REPLAY_REALLOC_INCREMENT;
1377                 state->fc_modified_inodes = krealloc(
1378                                         state->fc_modified_inodes, sizeof(int) *
1379                                         state->fc_modified_inodes_size,
1380                                         GFP_KERNEL);
1381                 if (!state->fc_modified_inodes)
1382                         return -ENOMEM;
1383         }
1384         state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1385         return 0;
1386 }
1387
1388 /*
1389  * Inode replay function
1390  */
1391 static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl)
1392 {
1393         struct ext4_fc_inode *fc_inode;
1394         struct ext4_inode *raw_inode;
1395         struct ext4_inode *raw_fc_inode;
1396         struct inode *inode = NULL;
1397         struct ext4_iloc iloc;
1398         int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1399         struct ext4_extent_header *eh;
1400
1401         fc_inode = (struct ext4_fc_inode *)ext4_fc_tag_val(tl);
1402
1403         ino = le32_to_cpu(fc_inode->fc_ino);
1404         trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1405
1406         inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1407         if (!IS_ERR_OR_NULL(inode)) {
1408                 ext4_ext_clear_bb(inode);
1409                 iput(inode);
1410         }
1411
1412         ext4_fc_record_modified_inode(sb, ino);
1413
1414         raw_fc_inode = (struct ext4_inode *)fc_inode->fc_raw_inode;
1415         ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1416         if (ret)
1417                 goto out;
1418
1419         inode_len = ext4_fc_tag_len(tl) - sizeof(struct ext4_fc_inode);
1420         raw_inode = ext4_raw_inode(&iloc);
1421
1422         memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1423         memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1424                 inode_len - offsetof(struct ext4_inode, i_generation));
1425         if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1426                 eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1427                 if (eh->eh_magic != EXT4_EXT_MAGIC) {
1428                         memset(eh, 0, sizeof(*eh));
1429                         eh->eh_magic = EXT4_EXT_MAGIC;
1430                         eh->eh_max = cpu_to_le16(
1431                                 (sizeof(raw_inode->i_block) -
1432                                  sizeof(struct ext4_extent_header))
1433                                  / sizeof(struct ext4_extent));
1434                 }
1435         } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1436                 memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1437                         sizeof(raw_inode->i_block));
1438         }
1439
1440         /* Immediately update the inode on disk. */
1441         ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1442         if (ret)
1443                 goto out;
1444         ret = sync_dirty_buffer(iloc.bh);
1445         if (ret)
1446                 goto out;
1447         ret = ext4_mark_inode_used(sb, ino);
1448         if (ret)
1449                 goto out;
1450
1451         /* Given that we just wrote the inode on disk, this SHOULD succeed. */
1452         inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1453         if (IS_ERR_OR_NULL(inode)) {
1454                 jbd_debug(1, "Inode not found.");
1455                 return -EFSCORRUPTED;
1456         }
1457
1458         /*
1459          * Our allocator could have made different decisions than before
1460          * crashing. This should be fixed but until then, we calculate
1461          * the number of blocks the inode.
1462          */
1463         ext4_ext_replay_set_iblocks(inode);
1464
1465         inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1466         ext4_reset_inode_seed(inode);
1467
1468         ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1469         ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1470         sync_dirty_buffer(iloc.bh);
1471         brelse(iloc.bh);
1472 out:
1473         iput(inode);
1474         if (!ret)
1475                 blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
1476
1477         return 0;
1478 }
1479
1480 /*
1481  * Dentry create replay function.
1482  *
1483  * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
1484  * inode for which we are trying to create a dentry here, should already have
1485  * been replayed before we start here.
1486  */
1487 static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl)
1488 {
1489         int ret = 0;
1490         struct inode *inode = NULL;
1491         struct inode *dir = NULL;
1492         struct dentry_info_args darg;
1493
1494         tl_to_darg(&darg, tl);
1495
1496         trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1497                         darg.parent_ino, darg.dname_len);
1498
1499         /* This takes care of update group descriptor and other metadata */
1500         ret = ext4_mark_inode_used(sb, darg.ino);
1501         if (ret)
1502                 goto out;
1503
1504         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1505         if (IS_ERR_OR_NULL(inode)) {
1506                 jbd_debug(1, "inode %d not found.", darg.ino);
1507                 inode = NULL;
1508                 ret = -EINVAL;
1509                 goto out;
1510         }
1511
1512         if (S_ISDIR(inode->i_mode)) {
1513                 /*
1514                  * If we are creating a directory, we need to make sure that the
1515                  * dot and dot dot dirents are setup properly.
1516                  */
1517                 dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1518                 if (IS_ERR_OR_NULL(dir)) {
1519                         jbd_debug(1, "Dir %d not found.", darg.ino);
1520                         goto out;
1521                 }
1522                 ret = ext4_init_new_dir(NULL, dir, inode);
1523                 iput(dir);
1524                 if (ret) {
1525                         ret = 0;
1526                         goto out;
1527                 }
1528         }
1529         ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1530         if (ret)
1531                 goto out;
1532         set_nlink(inode, 1);
1533         ext4_mark_inode_dirty(NULL, inode);
1534 out:
1535         if (inode)
1536                 iput(inode);
1537         return ret;
1538 }
1539
1540 /*
1541  * Record physical disk regions which are in use as per fast commit area. Our
1542  * simple replay phase allocator excludes these regions from allocation.
1543  */
1544 static int ext4_fc_record_regions(struct super_block *sb, int ino,
1545                 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
1546 {
1547         struct ext4_fc_replay_state *state;
1548         struct ext4_fc_alloc_region *region;
1549
1550         state = &EXT4_SB(sb)->s_fc_replay_state;
1551         if (state->fc_regions_used == state->fc_regions_size) {
1552                 state->fc_regions_size +=
1553                         EXT4_FC_REPLAY_REALLOC_INCREMENT;
1554                 state->fc_regions = krealloc(
1555                                         state->fc_regions,
1556                                         state->fc_regions_size *
1557                                         sizeof(struct ext4_fc_alloc_region),
1558                                         GFP_KERNEL);
1559                 if (!state->fc_regions)
1560                         return -ENOMEM;
1561         }
1562         region = &state->fc_regions[state->fc_regions_used++];
1563         region->ino = ino;
1564         region->lblk = lblk;
1565         region->pblk = pblk;
1566         region->len = len;
1567
1568         return 0;
1569 }
1570
/*
 * Replay an EXT4_FC_TAG_ADD_RANGE tag: bring the inode's extent tree in
 * sync with the logical-to-physical mapping that was recorded at commit
 * time. Depending on the current on-disk state each chunk of the range is
 * either inserted as a new extent, remapped to the recorded physical
 * blocks, or has its written/unwritten state toggled. Always returns 0;
 * failures simply stop replay of this tag.
 */
static int ext4_fc_replay_add_range(struct super_block *sb,
				struct ext4_fc_tl *tl)
{
	struct ext4_fc_add_range *fc_add_ex;
	struct ext4_extent newex, *ex;
	struct inode *inode;
	ext4_lblk_t start, cur;
	int remaining, len;
	ext4_fsblk_t start_pblk;
	struct ext4_map_blocks map;
	struct ext4_ext_path *path = NULL;
	int ret;

	fc_add_ex = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
	ex = (struct ext4_extent *)&fc_add_ex->fc_ex;

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
		le32_to_cpu(fc_add_ex->fc_ino), le32_to_cpu(ex->ee_block),
		ext4_ext_get_actual_len(ex));

	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino),
				EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	/*
	 * Remember this inode so the post-replay pass can rebuild its block
	 * bitmaps/counters. NOTE(review): the return value (-ENOMEM on
	 * allocation failure) is ignored here.
	 */
	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);

	start = le32_to_cpu(ex->ee_block);
	start_pblk = ext4_ext_pblock(ex);
	len = ext4_ext_get_actual_len(ex);

	cur = start;
	remaining = len;
	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
		  inode->i_ino);

	/* Walk the logical range, handling one contiguous chunk per pass. */
	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;
		map.m_pblk = 0;
		ret = ext4_map_blocks(NULL, inode, &map, 0);

		if (ret < 0) {
			iput(inode);
			return 0;
		}

		if (ret == 0) {
			/* Range is not mapped */
			path = ext4_find_extent(inode, cur, NULL, 0);
			if (IS_ERR(path)) {
				iput(inode);
				return 0;
			}
			memset(&newex, 0, sizeof(newex));
			newex.ee_block = cpu_to_le32(cur);
			ext4_ext_store_pblock(
				&newex, start_pblk + cur - start);
			newex.ee_len = cpu_to_le16(map.m_len);
			if (ext4_ext_is_unwritten(ex))
				ext4_ext_mark_unwritten(&newex);
			down_write(&EXT4_I(inode)->i_data_sem);
			ret = ext4_ext_insert_extent(
				NULL, inode, &path, &newex, 0);
			up_write((&EXT4_I(inode)->i_data_sem));
			ext4_ext_drop_refs(path);
			kfree(path);
			if (ret) {
				iput(inode);
				return 0;
			}
			goto next;
		}

		if (start_pblk + cur - start != map.m_pblk) {
			/*
			 * Logical to physical mapping changed. This can happen
			 * if this range was removed and then reallocated to
			 * map to new physical blocks during a fast commit.
			 */
			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex),
					start_pblk + cur - start);
			if (ret) {
				iput(inode);
				return 0;
			}
			/*
			 * Mark the old blocks as free since they aren't used
			 * anymore. We maintain an array of all the modified
			 * inodes. In case these blocks are still used at either
			 * a different logical range in the same inode or in
			 * some different inode, we will mark them as allocated
			 * at the end of the FC replay using our array of
			 * modified inodes.
			 */
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
			goto next;
		}

		/* Range is mapped and needs a state change */
		jbd_debug(1, "Converting from %d to %d %lld",
				map.m_flags & EXT4_MAP_UNWRITTEN,
			ext4_ext_is_unwritten(ex), map.m_pblk);
		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex), map.m_pblk);
		if (ret) {
			iput(inode);
			return 0;
		}
		/*
		 * We may have split the extent tree while toggling the state.
		 * Try to shrink the extent tree now.
		 */
		ext4_ext_replay_shrink_inode(inode, start + len);
next:
		cur += map.m_len;
		remaining -= map.m_len;
	}
	/* Drop any leftover extents beyond i_size created by replay splits. */
	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
					sb->s_blocksize_bits);
	iput(inode);
	return 0;
}
1699
1700 /* Replay DEL_RANGE tag */
1701 static int
1702 ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl)
1703 {
1704         struct inode *inode;
1705         struct ext4_fc_del_range *lrange;
1706         struct ext4_map_blocks map;
1707         ext4_lblk_t cur, remaining;
1708         int ret;
1709
1710         lrange = (struct ext4_fc_del_range *)ext4_fc_tag_val(tl);
1711         cur = le32_to_cpu(lrange->fc_lblk);
1712         remaining = le32_to_cpu(lrange->fc_len);
1713
1714         trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1715                 le32_to_cpu(lrange->fc_ino), cur, remaining);
1716
1717         inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL);
1718         if (IS_ERR_OR_NULL(inode)) {
1719                 jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino));
1720                 return 0;
1721         }
1722
1723         ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1724
1725         jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1726                         inode->i_ino, le32_to_cpu(lrange->fc_lblk),
1727                         le32_to_cpu(lrange->fc_len));
1728         while (remaining > 0) {
1729                 map.m_lblk = cur;
1730                 map.m_len = remaining;
1731
1732                 ret = ext4_map_blocks(NULL, inode, &map, 0);
1733                 if (ret < 0) {
1734                         iput(inode);
1735                         return 0;
1736                 }
1737                 if (ret > 0) {
1738                         remaining -= ret;
1739                         cur += ret;
1740                         ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1741                 } else {
1742                         remaining -= map.m_len;
1743                         cur += map.m_len;
1744                 }
1745         }
1746
1747         ret = ext4_punch_hole(inode,
1748                 le32_to_cpu(lrange->fc_lblk) << sb->s_blocksize_bits,
1749                 le32_to_cpu(lrange->fc_len) <<  sb->s_blocksize_bits);
1750         if (ret)
1751                 jbd_debug(1, "ext4_punch_hole returned %d", ret);
1752         ext4_ext_replay_shrink_inode(inode,
1753                 i_size_read(inode) >> sb->s_blocksize_bits);
1754         ext4_mark_inode_dirty(NULL, inode);
1755         iput(inode);
1756
1757         return 0;
1758 }
1759
1760 static inline const char *tag2str(u16 tag)
1761 {
1762         switch (tag) {
1763         case EXT4_FC_TAG_LINK:
1764                 return "TAG_ADD_ENTRY";
1765         case EXT4_FC_TAG_UNLINK:
1766                 return "TAG_DEL_ENTRY";
1767         case EXT4_FC_TAG_ADD_RANGE:
1768                 return "TAG_ADD_RANGE";
1769         case EXT4_FC_TAG_CREAT:
1770                 return "TAG_CREAT_DENTRY";
1771         case EXT4_FC_TAG_DEL_RANGE:
1772                 return "TAG_DEL_RANGE";
1773         case EXT4_FC_TAG_INODE:
1774                 return "TAG_INODE";
1775         case EXT4_FC_TAG_PAD:
1776                 return "TAG_PAD";
1777         case EXT4_FC_TAG_TAIL:
1778                 return "TAG_TAIL";
1779         case EXT4_FC_TAG_HEAD:
1780                 return "TAG_HEAD";
1781         default:
1782                 return "TAG_ERROR";
1783         }
1784 }
1785
/*
 * Post-replay pass: for every inode recorded as modified during replay,
 * walk its mapped extents and mark both the data blocks and the extent
 * tree's internal (index) blocks as in-use in the block bitmaps, so the
 * allocation state matches the rebuilt extent trees.
 */
static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
	struct ext4_fc_replay_state *state;
	struct inode *inode;
	struct ext4_ext_path *path = NULL;
	struct ext4_map_blocks map;
	int i, ret, j;
	ext4_lblk_t cur, end;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++) {
		inode = ext4_iget(sb, state->fc_modified_inodes[i],
			EXT4_IGET_NORMAL);
		if (IS_ERR_OR_NULL(inode)) {
			jbd_debug(1, "Inode %d not found.",
				state->fc_modified_inodes[i]);
			continue;
		}
		cur = 0;
		end = EXT_MAX_BLOCKS;
		/* Scan the whole logical range of the inode. */
		while (cur < end) {
			map.m_lblk = cur;
			map.m_len = end - cur;

			ret = ext4_map_blocks(NULL, inode, &map, 0);
			if (ret < 0)
				break;

			if (ret > 0) {
				/* Mark the extent tree's index blocks in use. */
				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
				if (!IS_ERR_OR_NULL(path)) {
					for (j = 0; j < path->p_depth; j++)
						ext4_mb_mark_bb(inode->i_sb,
							path[j].p_block, 1, 1);
					ext4_ext_drop_refs(path);
					kfree(path);
				}
				cur += ret;
				/* ...and the mapped data blocks themselves. */
				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
							map.m_len, 1);
			} else {
				/* Hole: advance by at least one block. */
				cur = cur + (map.m_len ? map.m_len : 1);
			}
		}
		iput(inode);
	}
}
1833
/*
 * Check if a block is in the excluded regions for block allocation. The
 * simple allocator that runs during the replay phase calls this function
 * to see if it is okay to use a block.
 */
1839 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1840 {
1841         int i;
1842         struct ext4_fc_replay_state *state;
1843
1844         state = &EXT4_SB(sb)->s_fc_replay_state;
1845         for (i = 0; i < state->fc_regions_valid; i++) {
1846                 if (state->fc_regions[i].ino == 0 ||
1847                         state->fc_regions[i].len == 0)
1848                         continue;
1849                 if (blk >= state->fc_regions[i].pblk &&
1850                     blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1851                         return true;
1852         }
1853         return false;
1854 }
1855
1856 /* Cleanup function called after replay */
1857 void ext4_fc_replay_cleanup(struct super_block *sb)
1858 {
1859         struct ext4_sb_info *sbi = EXT4_SB(sb);
1860
1861         sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1862         kfree(sbi->s_fc_replay_state.fc_regions);
1863         kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1864 }
1865
/*
 * Recovery Scan phase handler
 *
 * This function is called during the scan phase and is responsible
 * for doing the following things:
 * - Make sure the fast commit area has valid tags for replay
 * - Count the number of tags that need to be replayed by the replay handler
 * - Verify the CRC
 * - Create a list of excluded blocks for allocation during the replay phase
 *
 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
 * to indicate that scan has finished and JBD2 can now start the replay phase.
 * It returns a negative error to indicate that there was an error. At the end
 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
 * to indicate the number of tags that need to be replayed during the replay
 * phase.
 */
static int ext4_fc_replay_scan(journal_t *journal,
				struct buffer_head *bh, int off,
				tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_replay_state *state;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_add_range *ext;
	struct ext4_fc_tl *tl;
	struct ext4_fc_tail *tail;
	__u8 *start, *end;
	struct ext4_fc_head *head;
	struct ext4_extent *ex;

	state = &sbi->s_fc_replay_state;

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	/* First block of this scan: reset all accumulated scan state. */
	if (state->fc_replay_expected_off == 0) {
		state->fc_cur_tag = 0;
		state->fc_replay_num_tags = 0;
		state->fc_crc = 0;
		state->fc_regions = NULL;
		state->fc_regions_valid = state->fc_regions_used =
			state->fc_regions_size = 0;
		/* Check if we can stop early */
		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
			!= EXT4_FC_TAG_HEAD)
			return 0;
	}

	/* Blocks must arrive in order; anything else means corruption. */
	if (off != state->fc_replay_expected_off) {
		ret = -EFSCORRUPTED;
		goto out_err;
	}

	state->fc_replay_expected_off++;
	fc_for_each_tl(start, end, tl) {
		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
			  tag2str(le16_to_cpu(tl->fc_tag)), bh->b_blocknr);
		switch (le16_to_cpu(tl->fc_tag)) {
		case EXT4_FC_TAG_ADD_RANGE:
			ext = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
			ex = (struct ext4_extent *)&ext->fc_ex;
			/*
			 * Record the physical range so the replay-phase
			 * allocator treats it as excluded.
			 */
			ret = ext4_fc_record_regions(sb,
				le32_to_cpu(ext->fc_ino),
				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
				ext4_ext_get_actual_len(ex));
			if (ret < 0)
				break;
			ret = JBD2_FC_REPLAY_CONTINUE;
			fallthrough;
		case EXT4_FC_TAG_DEL_RANGE:
		case EXT4_FC_TAG_LINK:
		case EXT4_FC_TAG_UNLINK:
		case EXT4_FC_TAG_CREAT:
		case EXT4_FC_TAG_INODE:
		case EXT4_FC_TAG_PAD:
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
					sizeof(*tl) + ext4_fc_tag_len(tl));
			break;
		case EXT4_FC_TAG_TAIL:
			state->fc_cur_tag++;
			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
			/* The running CRC covers the tail up to its fc_crc. */
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
						sizeof(*tl) +
						offsetof(struct ext4_fc_tail,
						fc_crc));
			/* Valid tail: everything up to here can be replayed. */
			if (le32_to_cpu(tail->fc_tid) == expected_tid &&
				le32_to_cpu(tail->fc_crc) == state->fc_crc) {
				state->fc_replay_num_tags = state->fc_cur_tag;
				state->fc_regions_valid =
					state->fc_regions_used;
			} else {
				ret = state->fc_replay_num_tags ?
					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
			}
			state->fc_crc = 0;
			break;
		case EXT4_FC_TAG_HEAD:
			head = (struct ext4_fc_head *)ext4_fc_tag_val(tl);
			if (le32_to_cpu(head->fc_features) &
				~EXT4_FC_SUPPORTED_FEATURES) {
				ret = -EOPNOTSUPP;
				break;
			}
			if (le32_to_cpu(head->fc_tid) != expected_tid) {
				ret = JBD2_FC_REPLAY_STOP;
				break;
			}
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
					sizeof(*tl) + ext4_fc_tag_len(tl));
			break;
		default:
			ret = state->fc_replay_num_tags ?
				JBD2_FC_REPLAY_STOP : -ECANCELED;
		}
		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
			break;
	}

out_err:
	trace_ext4_fc_replay_scan(sb, ret, off);
	return ret;
}
1992
/*
 * Main recovery path entry point.
 * The meaning of the return codes is similar to the above
 * (ext4_fc_replay_scan).
 */
static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
				enum passtype pass, int off, tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl *tl;
	__u8 *start, *end;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
	struct ext4_fc_tail *tail;

	/* The scan pass has its own handler. */
	if (pass == PASS_SCAN) {
		state->fc_current_pass = PASS_SCAN;
		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
	}

	/* First block of a new pass: enter replay mode. */
	if (state->fc_current_pass != pass) {
		state->fc_current_pass = pass;
		sbi->s_mount_state |= EXT4_FC_REPLAY;
	}
	/* Nothing validated by the scan phase: finish up immediately. */
	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
		jbd_debug(1, "Replay stops\n");
		ext4_fc_set_bitmaps_and_counters(sb);
		return 0;
	}

#ifdef CONFIG_EXT4_DEBUG
	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
		pr_warn("Dropping fc block %d because max_replay set\n", off);
		return JBD2_FC_REPLAY_STOP;
	}
#endif

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	/* Dispatch each TLV in this block to its tag-specific handler. */
	fc_for_each_tl(start, end, tl) {
		if (state->fc_replay_num_tags == 0) {
			/* All tags counted by the scan phase are done. */
			ret = JBD2_FC_REPLAY_STOP;
			ext4_fc_set_bitmaps_and_counters(sb);
			break;
		}
		jbd_debug(3, "Replay phase, tag:%s\n",
				tag2str(le16_to_cpu(tl->fc_tag)));
		state->fc_replay_num_tags--;
		switch (le16_to_cpu(tl->fc_tag)) {
		case EXT4_FC_TAG_LINK:
			ret = ext4_fc_replay_link(sb, tl);
			break;
		case EXT4_FC_TAG_UNLINK:
			ret = ext4_fc_replay_unlink(sb, tl);
			break;
		case EXT4_FC_TAG_ADD_RANGE:
			ret = ext4_fc_replay_add_range(sb, tl);
			break;
		case EXT4_FC_TAG_CREAT:
			ret = ext4_fc_replay_create(sb, tl);
			break;
		case EXT4_FC_TAG_DEL_RANGE:
			ret = ext4_fc_replay_del_range(sb, tl);
			break;
		case EXT4_FC_TAG_INODE:
			ret = ext4_fc_replay_inode(sb, tl);
			break;
		case EXT4_FC_TAG_PAD:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
				ext4_fc_tag_len(tl), 0);
			break;
		case EXT4_FC_TAG_TAIL:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
				ext4_fc_tag_len(tl), 0);
			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
			WARN_ON(le32_to_cpu(tail->fc_tid) != expected_tid);
			break;
		case EXT4_FC_TAG_HEAD:
			break;
		default:
			trace_ext4_fc_replay(sb, le16_to_cpu(tl->fc_tag), 0,
				ext4_fc_tag_len(tl), 0);
			ret = -ECANCELED;
			break;
		}
		if (ret < 0)
			break;
		ret = JBD2_FC_REPLAY_CONTINUE;
	}
	return ret;
}
2085
/* Hook up the fast-commit journal callbacks at mount time. */
void ext4_fc_init(struct super_block *sb, journal_t *journal)
{
	/*
	 * We set the replay callback even if fast commit is disabled,
	 * because we could still have fast commit blocks that need to be
	 * replayed even if fast commit has now been turned off.
	 */
	journal->j_fc_replay_callback = ext4_fc_replay;
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
		return;
	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
}
2098
/*
 * Human-readable strings for the fast-commit ineligibility reasons,
 * printed by ext4_fc_info_show(). The table is iterated up to
 * EXT4_FC_REASON_MAX, so its order and entry count presumably must stay
 * in sync with the EXT4_FC_REASON_* values — verify against ext4.h when
 * adding a reason.
 */
const char *fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"FC Commit Failed"
};
2110
2111 int ext4_fc_info_show(struct seq_file *seq, void *v)
2112 {
2113         struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2114         struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2115         int i;
2116
2117         if (v != SEQ_START_TOKEN)
2118                 return 0;
2119
2120         seq_printf(seq,
2121                 "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2122                    stats->fc_num_commits, stats->fc_ineligible_commits,
2123                    stats->fc_numblks,
2124                    div_u64(sbi->s_fc_avg_commit_time, 1000));
2125         seq_puts(seq, "Ineligible reasons:\n");
2126         for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2127                 seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2128                         stats->fc_ineligible_reason_count[i]);
2129
2130         return 0;
2131 }
2132
2133 int __init ext4_fc_init_dentry_cache(void)
2134 {
2135         ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2136                                            SLAB_RECLAIM_ACCOUNT);
2137
2138         if (ext4_fc_dentry_cachep == NULL)
2139                 return -ENOMEM;
2140
2141         return 0;
2142 }