ext4: deduplicate the code to wait on inode that's being committed
[linux-2.6-microblaze.git] / fs / ext4 / fast_commit.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 /*
4  * fs/ext4/fast_commit.c
5  *
6  * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
7  *
8  * Ext4 fast commits routines.
9  */
10 #include "ext4.h"
11 #include "ext4_jbd2.h"
12 #include "ext4_extents.h"
13 #include "mballoc.h"
14
15 /*
16  * Ext4 Fast Commits
17  * -----------------
18  *
19  * Ext4 fast commits implement fine grained journalling for Ext4.
20  *
21  * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
22  * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
23  * TLV during the recovery phase. For the scenarios for which we currently
24  * don't have replay code, fast commit falls back to full commits.
25  * Fast commits record delta in one of the following three categories.
26  *
27  * (A) Directory entry updates:
28  *
29  * - EXT4_FC_TAG_UNLINK         - records directory entry unlink
30  * - EXT4_FC_TAG_LINK           - records directory entry link
31  * - EXT4_FC_TAG_CREAT          - records inode and directory entry creation
32  *
33  * (B) File specific data range updates:
34  *
35  * - EXT4_FC_TAG_ADD_RANGE      - records addition of new blocks to an inode
36  * - EXT4_FC_TAG_DEL_RANGE      - records deletion of blocks from an inode
37  *
38  * (C) Inode metadata (mtime / ctime etc):
39  *
40  * - EXT4_FC_TAG_INODE          - record the inode that should be replayed
41  *                                during recovery. Note that iblocks field is
42  *                                not replayed and instead derived during
43  *                                replay.
44  * Commit Operation
45  * ----------------
46  * With fast commits, we maintain all the directory entry operations in the
47  * order in which they are issued in an in-memory queue. This queue is flushed
48  * to disk during the commit operation. We also maintain a list of inodes
49  * that need to be committed during a fast commit in another in memory queue of
50  * inodes. During the commit operation, we commit in the following order:
51  *
52  * [1] Lock inodes for any further data updates by setting COMMITTING state
53  * [2] Submit data buffers of all the inodes
54  * [3] Wait for [2] to complete
55  * [4] Commit all the directory entry updates in the fast commit space
56  * [5] Commit all the changed inode structures
57  * [6] Write tail tag (this tag ensures the atomicity, please read the following
58  *     section for more details).
59  * [7] Wait for [4], [5] and [6] to complete.
60  *
61  * All the inode updates must call ext4_fc_start_update() before starting an
62  * update. If such an ongoing update is present, fast commit waits for it to
63  * complete. The completion of such an update is marked by
64  * ext4_fc_stop_update().
65  *
66  * Fast Commit Ineligibility
67  * -------------------------
68  * Not all operations are supported by fast commits today (e.g extended
 * attributes). Fast commit ineligibility is marked by calling one of the
70  * two following functions:
71  *
72  * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall
73  *   back to full commit. This is useful in case of transient errors.
74  *
75  * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
76  *   the fast commits happening between ext4_fc_start_ineligible() and
77  *   ext4_fc_stop_ineligible() and one fast commit after the call to
78  *   ext4_fc_stop_ineligible() to fall back to full commits. It is important to
79  *   make one more fast commit to fall back to full commit after stop call so
 *   that it is guaranteed that the fast commit ineligible operation contained
81  *   within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is
82  *   followed by at least 1 full commit.
83  *
84  * Atomicity of commits
85  * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
87  * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
88  * tag contains CRC of the contents and TID of the transaction after which
89  * this fast commit should be applied. Recovery code replays fast commit
90  * logs only if there's at least 1 valid tail present. For every fast commit
91  * operation, there is 1 tail. This means, we may end up with multiple tails
92  * in the fast commit space. Here's an example:
93  *
94  * - Create a new file A and remove existing file B
95  * - fsync()
96  * - Append contents to file A
97  * - Truncate file A
98  * - fsync()
99  *
100  * The fast commit space at the end of above operations would look like this:
101  *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
102  *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
103  *
104  * Replay code should thus check for all the valid tails in the FC area.
105  *
106  * TODOs
107  * -----
108  * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
109  *    eligible update must be protected within ext4_fc_start_update() and
110  *    ext4_fc_stop_update(). These routines are called at much higher
111  *    routines. This can be made more fine grained by combining with
112  *    ext4_journal_start().
113  *
114  * 2) Same above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
115  *
116  * 3) Handle more ineligible cases.
117  */
118
119 #include <trace/events/ext4.h>
120 static struct kmem_cache *ext4_fc_dentry_cachep;
121
122 static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
123 {
124         BUFFER_TRACE(bh, "");
125         if (uptodate) {
126                 ext4_debug("%s: Block %lld up-to-date",
127                            __func__, bh->b_blocknr);
128                 set_buffer_uptodate(bh);
129         } else {
130                 ext4_debug("%s: Block %lld not up-to-date",
131                            __func__, bh->b_blocknr);
132                 clear_buffer_uptodate(bh);
133         }
134
135         unlock_buffer(bh);
136 }
137
138 static inline void ext4_fc_reset_inode(struct inode *inode)
139 {
140         struct ext4_inode_info *ei = EXT4_I(inode);
141
142         ei->i_fc_lblk_start = 0;
143         ei->i_fc_lblk_len = 0;
144 }
145
146 void ext4_fc_init_inode(struct inode *inode)
147 {
148         struct ext4_inode_info *ei = EXT4_I(inode);
149
150         ext4_fc_reset_inode(inode);
151         ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
152         INIT_LIST_HEAD(&ei->i_fc_list);
153         init_waitqueue_head(&ei->i_fc_wait);
154         atomic_set(&ei->i_fc_updates, 0);
155         ei->i_fc_committed_subtid = 0;
156 }
157
/*
 * Wait until an inode with EXT4_STATE_FC_COMMITTING set has finished
 * committing.
 *
 * This function must be called with sbi->s_fc_lock held. The lock is
 * dropped before sleeping and is NOT re-acquired on return, so callers
 * must restart their locked section afterwards.
 */
static void ext4_fc_wait_committing_inode(struct inode *inode)
{
	wait_queue_head_t *wq;
	struct ext4_inode_info *ei = EXT4_I(inode);

#if (BITS_PER_LONG < 64)
	/* On 32-bit, inode state bits live in the separate i_state_flags */
	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_state_flags,
				EXT4_STATE_FC_COMMITTING);
#else
	/* On 64-bit, state bits are kept inside i_flags (see ext4.h) */
	DEFINE_WAIT_BIT(wait, &ei->i_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_flags,
				EXT4_STATE_FC_COMMITTING);
#endif
	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
	/* Queue ourselves before dropping the lock to avoid a lost wakeup */
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
}
181
182 /*
 * Inform Ext4's fast commit subsystem about the start of an inode update
184  *
185  * This function is called by the high level call VFS callbacks before
186  * performing any inode update. This function blocks if there's an ongoing
187  * fast commit on the inode in question.
188  */
189 void ext4_fc_start_update(struct inode *inode)
190 {
191         struct ext4_inode_info *ei = EXT4_I(inode);
192
193         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
194             (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
195                 return;
196
197 restart:
198         spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
199         if (list_empty(&ei->i_fc_list))
200                 goto out;
201
202         if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
203                 ext4_fc_wait_committing_inode(inode);
204                 goto restart;
205         }
206 out:
207         atomic_inc(&ei->i_fc_updates);
208         spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
209 }
210
211 /*
212  * Stop inode update and wake up waiting fast commits if any.
213  */
214 void ext4_fc_stop_update(struct inode *inode)
215 {
216         struct ext4_inode_info *ei = EXT4_I(inode);
217
218         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
219             (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
220                 return;
221
222         if (atomic_dec_and_test(&ei->i_fc_updates))
223                 wake_up_all(&ei->i_fc_wait);
224 }
225
226 /*
227  * Remove inode from fast commit list. If the inode is being committed
228  * we wait until inode commit is done.
229  */
230 void ext4_fc_del(struct inode *inode)
231 {
232         struct ext4_inode_info *ei = EXT4_I(inode);
233
234         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
235             (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
236                 return;
237
238 restart:
239         spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
240         if (list_empty(&ei->i_fc_list)) {
241                 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
242                 return;
243         }
244
245         if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
246                 ext4_fc_wait_committing_inode(inode);
247                 goto restart;
248         }
249         list_del_init(&ei->i_fc_list);
250         spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
251 }
252
253 /*
254  * Mark file system as fast commit ineligible. This means that next commit
255  * operation would result in a full jbd2 commit.
256  */
257 void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
258 {
259         struct ext4_sb_info *sbi = EXT4_SB(sb);
260
261         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
262             (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
263                 return;
264
265         sbi->s_mount_flags |= EXT4_MF_FC_INELIGIBLE;
266         WARN_ON(reason >= EXT4_FC_REASON_MAX);
267         sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
268 }
269
270 /*
271  * Start a fast commit ineligible update. Any commits that happen while
272  * such an operation is in progress fall back to full commits.
273  */
274 void ext4_fc_start_ineligible(struct super_block *sb, int reason)
275 {
276         struct ext4_sb_info *sbi = EXT4_SB(sb);
277
278         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
279             (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
280                 return;
281
282         WARN_ON(reason >= EXT4_FC_REASON_MAX);
283         sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
284         atomic_inc(&sbi->s_fc_ineligible_updates);
285 }
286
287 /*
288  * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
289  * to ensure that after stopping the ineligible update, at least one full
290  * commit takes place.
291  */
292 void ext4_fc_stop_ineligible(struct super_block *sb)
293 {
294         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
295             (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
296                 return;
297
298         EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FC_INELIGIBLE;
299         atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
300 }
301
302 static inline int ext4_fc_is_ineligible(struct super_block *sb)
303 {
304         return (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FC_INELIGIBLE) ||
305                 atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates);
306 }
307
308 /*
309  * Generic fast commit tracking function. If this is the first time this we are
310  * called after a full commit, we initialize fast commit fields and then call
311  * __fc_track_fn() with update = 0. If we have already been called after a full
312  * commit, we pass update = 1. Based on that, the track function can determine
313  * if it needs to track a field for the first time or if it needs to just
314  * update the previously tracked value.
315  *
316  * If enqueue is set, this function enqueues the inode in fast commit list.
317  */
static int ext4_fc_track_template(
	handle_t *handle, struct inode *inode,
	int (*__fc_track_fn)(struct inode *, void *, bool),
	void *args, int enqueue)
{
	bool update = false;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	tid_t tid = 0;
	int ret;

	/* Nothing to track if fast commits are off or we are replaying */
	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (sbi->s_mount_state & EXT4_FC_REPLAY))
		return -EOPNOTSUPP;

	if (ext4_fc_is_ineligible(inode->i_sb))
		return -EINVAL;

	tid = handle->h_transaction->t_tid;
	mutex_lock(&ei->i_fc_lock);
	if (tid == ei->i_sync_tid) {
		/* Already tracked in this transaction: just update */
		update = true;
	} else {
		/* First tracking call since the last commit: start afresh */
		ext4_fc_reset_inode(inode);
		ei->i_sync_tid = tid;
	}
	ret = __fc_track_fn(inode, args, update);
	mutex_unlock(&ei->i_fc_lock);

	if (!enqueue)
		return ret;

	/*
	 * Queue the inode for the next fast commit. If a fast commit is
	 * already in progress, new inodes go to the staging queue instead
	 * of the main one.
	 */
	spin_lock(&sbi->s_fc_lock);
	if (list_empty(&EXT4_I(inode)->i_fc_list))
		list_add_tail(&EXT4_I(inode)->i_fc_list,
				(sbi->s_mount_flags & EXT4_MF_FC_COMMITTING) ?
				&sbi->s_fc_q[FC_Q_STAGING] :
				&sbi->s_fc_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}
360
/* Arguments passed to __track_dentry_update() via ext4_fc_track_template() */
struct __track_dentry_update_args {
	struct dentry *dentry;	/* dentry being created / linked / unlinked */
	int op;			/* EXT4_FC_TAG_{CREAT,LINK,UNLINK} */
};
365
/*
 * __track_fn for directory entry updates. Called with ei->i_fc_lock held.
 *
 * Allocates an ext4_fc_dentry_update node describing the operation and
 * queues it on the main (or, while a commit is running, staging) dentry
 * queue. On allocation failure the file system is marked fast commit
 * ineligible. The i_fc_lock is dropped around the allocations (they may
 * sleep) and re-taken before returning.
 */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
	struct ext4_fc_dentry_update *node;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct __track_dentry_update_args *dentry_update =
		(struct __track_dentry_update_args *)arg;
	struct dentry *dentry = dentry_update->dentry;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	/* Drop i_fc_lock: the allocations below may sleep */
	mutex_unlock(&ei->i_fc_lock);
	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
	if (!node) {
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
		mutex_lock(&ei->i_fc_lock);
		return -ENOMEM;
	}

	node->fcd_op = dentry_update->op;
	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
	node->fcd_ino = inode->i_ino;
	if (dentry->d_name.len > DNAME_INLINE_LEN) {
		/* Long name: needs a separate allocation */
		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
		if (!node->fcd_name.name) {
			kmem_cache_free(ext4_fc_dentry_cachep, node);
			ext4_fc_mark_ineligible(inode->i_sb,
				EXT4_FC_REASON_NOMEM);
			mutex_lock(&ei->i_fc_lock);
			return -ENOMEM;
		}
		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
			dentry->d_name.len);
	} else {
		/* Short name fits into the node's inline buffer */
		memcpy(node->fcd_iname, dentry->d_name.name,
			dentry->d_name.len);
		node->fcd_name.name = node->fcd_iname;
	}
	node->fcd_name.len = dentry->d_name.len;

	spin_lock(&sbi->s_fc_lock);
	if (sbi->s_mount_flags & EXT4_MF_FC_COMMITTING)
		list_add_tail(&node->fcd_list,
				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
	else
		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);
	mutex_lock(&ei->i_fc_lock);

	return 0;
}
416
417 void __ext4_fc_track_unlink(handle_t *handle,
418                 struct inode *inode, struct dentry *dentry)
419 {
420         struct __track_dentry_update_args args;
421         int ret;
422
423         args.dentry = dentry;
424         args.op = EXT4_FC_TAG_UNLINK;
425
426         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
427                                         (void *)&args, 0);
428         trace_ext4_fc_track_unlink(inode, dentry, ret);
429 }
430
431 void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
432 {
433         __ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
434 }
435
436 void __ext4_fc_track_link(handle_t *handle,
437         struct inode *inode, struct dentry *dentry)
438 {
439         struct __track_dentry_update_args args;
440         int ret;
441
442         args.dentry = dentry;
443         args.op = EXT4_FC_TAG_LINK;
444
445         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
446                                         (void *)&args, 0);
447         trace_ext4_fc_track_link(inode, dentry, ret);
448 }
449
450 void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
451 {
452         __ext4_fc_track_link(handle, d_inode(dentry), dentry);
453 }
454
455 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
456 {
457         struct __track_dentry_update_args args;
458         struct inode *inode = d_inode(dentry);
459         int ret;
460
461         args.dentry = dentry;
462         args.op = EXT4_FC_TAG_CREAT;
463
464         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
465                                         (void *)&args, 0);
466         trace_ext4_fc_track_create(inode, dentry, ret);
467 }
468
469 /* __track_fn for inode tracking */
470 static int __track_inode(struct inode *inode, void *arg, bool update)
471 {
472         if (update)
473                 return -EEXIST;
474
475         EXT4_I(inode)->i_fc_lblk_len = 0;
476
477         return 0;
478 }
479
480 void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
481 {
482         int ret;
483
484         if (S_ISDIR(inode->i_mode))
485                 return;
486
487         ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
488         trace_ext4_fc_track_inode(inode, ret);
489 }
490
/* Arguments passed to __track_range() via ext4_fc_track_template() */
struct __track_range_args {
	ext4_lblk_t start, end;	/* inclusive logical block range */
};
494
/*
 * __track_fn for tracking data updates. Called with ei->i_fc_lock held.
 *
 * Records the dirty logical block range [start, end] on the inode. On a
 * repeat call within the same transaction (update == true and a range is
 * already recorded), the stored range is widened to the union of the old
 * and new ranges; otherwise the stored range is simply replaced.
 */
static int __track_range(struct inode *inode, void *arg, bool update)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_lblk_t oldstart;
	struct __track_range_args *__arg =
		(struct __track_range_args *)arg;

	/* Special (reserved) inodes cannot be fast-committed */
	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
		return -ECANCELED;
	}

	oldstart = ei->i_fc_lblk_start;

	if (update && ei->i_fc_lblk_len > 0) {
		/* Widen to the union: new start first, then new length */
		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
		ei->i_fc_lblk_len =
			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
				ei->i_fc_lblk_start + 1;
	} else {
		ei->i_fc_lblk_start = __arg->start;
		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
	}

	return 0;
}
522
523 void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
524                          ext4_lblk_t end)
525 {
526         struct __track_range_args args;
527         int ret;
528
529         if (S_ISDIR(inode->i_mode))
530                 return;
531
532         args.start = start;
533         args.end = end;
534
535         ret = ext4_fc_track_template(handle, inode,  __track_range, &args, 1);
536
537         trace_ext4_fc_track_range(inode, start, end, ret);
538 }
539
540 static void ext4_fc_submit_bh(struct super_block *sb)
541 {
542         int write_flags = REQ_SYNC;
543         struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
544
545         if (test_opt(sb, BARRIER))
546                 write_flags |= REQ_FUA | REQ_PREFLUSH;
547         lock_buffer(bh);
548         clear_buffer_dirty(bh);
549         set_buffer_uptodate(bh);
550         bh->b_end_io = ext4_end_buffer_io_sync;
551         submit_bh(REQ_OP_WRITE, write_flags, bh);
552         EXT4_SB(sb)->s_fc_bh = NULL;
553 }
554
555 /* Ext4 commit path routines */
556
/*
 * Zero @len bytes at @dst and, if @crc is non-NULL, fold the zeroed bytes
 * into the running checksum. Returns @dst (memset's return value).
 */
static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
				u32 *crc)
{
	void *ret;

	ret = memset(dst, 0, len);
	if (crc)
		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
	return ret;
}
568
569 /*
570  * Allocate len bytes on a fast commit buffer.
571  *
572  * During the commit time this function is used to manage fast commit
573  * block space. We don't split a fast commit log onto different
574  * blocks. So this function makes sure that if there's not enough space
575  * on the current block, the remaining space in the current block is
576  * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
577  * new block is from jbd2 and CRC is updated to reflect the padding
578  * we added.
579  */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
	struct ext4_fc_tl *tl;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh;
	int bsize = sbi->s_journal->j_blocksize;
	int ret, off = sbi->s_fc_bytes % bsize;
	int pad_len;

	/*
	 * After allocating len, we should have space at least for a 0 byte
	 * padding.
	 */
	if (len + sizeof(struct ext4_fc_tl) > bsize)
		return NULL;

	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
		/*
		 * Only allocate from current buffer if we have enough space for
		 * this request AND we have space to add a zero byte padding.
		 */
		if (!sbi->s_fc_bh) {
			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
			if (ret)
				return NULL;
			sbi->s_fc_bh = bh;
		}
		sbi->s_fc_bytes += len;
		return sbi->s_fc_bh->b_data + off;
	}
	/* Need to add PAD tag to mark the rest of this block as unused */
	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
	tl->fc_len = cpu_to_le16(pad_len);
	if (crc)
		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
	if (pad_len > 0)
		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
	/* Flush the padded block and start a fresh one for this request */
	ext4_fc_submit_bh(sb);

	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
	if (ret)
		return NULL;
	sbi->s_fc_bh = bh;
	/* Account for the skipped tail of the old block plus this request */
	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
	return sbi->s_fc_bh->b_data;
}
628
629 /* memcpy to fc reserved space and update CRC */
630 static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
631                                 int len, u32 *crc)
632 {
633         if (crc)
634                 *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
635         return memcpy(dst, src, len);
636 }
637
638 /*
639  * Complete a fast commit by writing tail tag.
640  *
641  * Writing tail tag marks the end of a fast commit. In order to guarantee
642  * atomicity, after writing tail tag, even if there's space remaining
643  * in the block, next commit shouldn't use it. That's why tail tag
644  * has the length as that of the remaining space on the block.
645  */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	int off, bsize = sbi->s_journal->j_blocksize;
	u8 *dst;

	/*
	 * ext4_fc_reserve_space takes care of allocating an extra block if
	 * there's not enough space on this block for accommodating this tail.
	 */
	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
	if (!dst)
		return -ENOSPC;

	off = sbi->s_fc_bytes % bsize;

	/* The tail's length covers everything up to the end of this block */
	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
	dst += sizeof(tl);
	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
	dst += sizeof(tail.fc_tid);
	/* The CRC covers everything written so far except itself */
	tail.fc_crc = cpu_to_le32(crc);
	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);

	ext4_fc_submit_bh(sb);

	return 0;
}
680
681 /*
682  * Adds tag, length, value and updates CRC. Returns true if tlv was added.
683  * Returns false if there's not enough space.
684  */
685 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
686                            u32 *crc)
687 {
688         struct ext4_fc_tl tl;
689         u8 *dst;
690
691         dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
692         if (!dst)
693                 return false;
694
695         tl.fc_tag = cpu_to_le16(tag);
696         tl.fc_len = cpu_to_le16(len);
697
698         ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
699         ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
700
701         return true;
702 }
703
704 /* Same as above, but adds dentry tlv. */
705 static  bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
706                                         int parent_ino, int ino, int dlen,
707                                         const unsigned char *dname,
708                                         u32 *crc)
709 {
710         struct ext4_fc_dentry_info fcd;
711         struct ext4_fc_tl tl;
712         u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
713                                         crc);
714
715         if (!dst)
716                 return false;
717
718         fcd.fc_parent_ino = cpu_to_le32(parent_ino);
719         fcd.fc_ino = cpu_to_le32(ino);
720         tl.fc_tag = cpu_to_le16(tag);
721         tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
722         ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
723         dst += sizeof(tl);
724         ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
725         dst += sizeof(fcd);
726         ext4_fc_memcpy(sb, dst, dname, dlen, crc);
727         dst += dlen;
728
729         return true;
730 }
731
/*
 * Writes the inode in the fast commit space under a TLV with tag
 * EXT4_FC_TAG_INODE, followed by the raw on-disk inode bytes.
 * Returns 0 on success, error on failure.
 */
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
	int ret;
	struct ext4_iloc iloc;
	struct ext4_fc_inode fc_inode;
	struct ext4_fc_tl tl;
	u8 *dst;

	ret = ext4_get_inode_loc(inode, &iloc);
	if (ret)
		return ret;

	/* Include the extra isize area when the on-disk inode is large */
	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
		inode_len += ei->i_extra_isize;

	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));

	dst = ext4_fc_reserve_space(inode->i_sb,
			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
	if (!dst)
		return -ECANCELED;

	/*
	 * NOTE(review): ext4_fc_memcpy() returns memcpy()'s result (the
	 * dst pointer), which is never NULL here, so these error branches
	 * appear unreachable — harmless, but worth confirming upstream.
	 */
	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
		return -ECANCELED;
	dst += sizeof(tl);
	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
		return -ECANCELED;
	dst += sizeof(fc_inode);
	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
					inode_len, crc))
		return -ECANCELED;

	return 0;
}
774
/*
 * Writes updated data ranges for the inode in question. Updates CRC.
 * Returns 0 on success, error otherwise.
 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_map_blocks map;
	struct ext4_fc_add_range fc_ext;
	struct ext4_fc_del_range lrange;
	struct ext4_extent *ex;
	int ret;

	/* Snapshot and clear the tracked range under i_fc_lock */
	mutex_lock(&ei->i_fc_lock);
	if (ei->i_fc_lblk_len == 0) {
		mutex_unlock(&ei->i_fc_lock);
		return 0;
	}
	old_blk_size = ei->i_fc_lblk_start;
	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
	ei->i_fc_lblk_len = 0;
	mutex_unlock(&ei->i_fc_lock);

	cur_lblk_off = old_blk_size;
	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);

	/* Walk the range, emitting one TLV per contiguous mapping */
	while (cur_lblk_off <= new_blk_size) {
		map.m_lblk = cur_lblk_off;
		map.m_len = new_blk_size - cur_lblk_off + 1;
		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			return -ECANCELED;

		if (map.m_len == 0) {
			cur_lblk_off++;
			continue;
		}

		if (ret == 0) {
			/* Unmapped (hole): record a deleted range */
			lrange.fc_ino = cpu_to_le32(inode->i_ino);
			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
			lrange.fc_len = cpu_to_le32(map.m_len);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
					    sizeof(lrange), (u8 *)&lrange, crc))
				return -ENOSPC;
		} else {
			/* Mapped extent: record it as an added range */
			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
			ex = (struct ext4_extent *)&fc_ext.fc_ex;
			ex->ee_block = cpu_to_le32(map.m_lblk);
			ex->ee_len = cpu_to_le16(map.m_len);
			ext4_ext_store_pblock(ex, map.m_pblk);
			if (map.m_flags & EXT4_MAP_UNWRITTEN)
				ext4_ext_mark_unwritten(ex);
			else
				ext4_ext_mark_initialized(ex);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
				return -ENOSPC;
		}

		cur_lblk_off += map.m_len;
	}

	return 0;
}
842
843
/*
 * Submit data for all the fast commit inodes.
 *
 * Marks the filesystem as committing (EXT4_MF_FC_COMMITTING) and each queued
 * inode as EXT4_STATE_FC_COMMITTING, then kicks off writeback of its dirty
 * data via jbd2. Returns 0 on success or the first jbd2 submission error.
 */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	struct list_head *pos;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	/* From here on, new updates for these inodes go to the staging queue. */
	sbi->s_mount_flags |= EXT4_MF_FC_COMMITTING;
	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
		ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
		/*
		 * Open-coded wait (classic prepare_to_wait pattern) for all
		 * in-flight updates on this inode to drain. The spinlock is
		 * dropped around schedule() since we may sleep.
		 */
		while (atomic_read(&ei->i_fc_updates)) {
			DEFINE_WAIT(wait);

			prepare_to_wait(&ei->i_fc_wait, &wait,
						TASK_UNINTERRUPTIBLE);
			/* Re-check after queuing to avoid a missed wakeup. */
			if (atomic_read(&ei->i_fc_updates)) {
				spin_unlock(&sbi->s_fc_lock);
				schedule();
				spin_lock(&sbi->s_fc_lock);
			}
			finish_wait(&ei->i_fc_wait, &wait);
		}
		/*
		 * Drop the lock to submit I/O (may sleep).
		 * NOTE(review): iteration continues from "pos" after the
		 * drop; assumes main-queue membership is stable while
		 * EXT4_MF_FC_COMMITTING is set — confirm against trackers.
		 */
		spin_unlock(&sbi->s_fc_lock);
		ret = jbd2_submit_inode_data(ei->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}
880
/*
 * Wait for completion of data for all the fast commit inodes.
 *
 * Companion to ext4_fc_submit_inode_data_all(): for every main-queue inode
 * that was marked EXT4_STATE_FC_COMMITTING, wait for its submitted data
 * writeback to finish. Returns 0 on success or the first jbd2 wait error.
 */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *pos, *n;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		/* Only inodes whose data we actually submitted. */
		if (!ext4_test_inode_state(&pos->vfs_inode,
					   EXT4_STATE_FC_COMMITTING))
			continue;
		/* Drop the spinlock: jbd2_wait_inode_data() can sleep. */
		spin_unlock(&sbi->s_fc_lock);

		ret = jbd2_wait_inode_data(journal, pos->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return 0;
}
905
/*
 * Commit all the directory entry updates.
 *
 * Walks the main dentry-update queue and emits one dentry TLV per entry;
 * for CREAT entries the inode (metadata then data ranges) is written first
 * so that replay can create an unnamed inode and then link it.
 *
 * Locking: called with sbi->s_fc_lock held and returns (on both success and
 * error) with it held; the lock is dropped around TLV/inode writes, which
 * may block.
 */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_dentry_update *fc_dentry;
	struct inode *inode;
	struct list_head *pos, *n, *fcd_pos, *fcd_n;
	struct ext4_inode_info *ei;
	int ret;

	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
		return 0;
	/*
	 * NOTE(review): the lock is dropped inside this loop while fcd_pos /
	 * fcd_n still point into the list — assumes nothing removes main
	 * queue entries during a commit; confirm against the cleanup path.
	 */
	list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) {
		fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update,
					fcd_list);
		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
			/* Link/unlink: just log the dentry operation itself. */
			spin_unlock(&sbi->s_fc_lock);
			if (!ext4_fc_add_dentry_tlv(
				sb, fc_dentry->fcd_op,
				fc_dentry->fcd_parent, fc_dentry->fcd_ino,
				fc_dentry->fcd_name.len,
				fc_dentry->fcd_name.name, crc)) {
				ret = -ENOSPC;
				goto lock_and_exit;
			}
			spin_lock(&sbi->s_fc_lock);
			continue;
		}

		/* CREAT: locate the freshly created inode on the main queue. */
		inode = NULL;
		list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
			ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
				inode = &ei->vfs_inode;
				break;
			}
		}
		/*
		 * If we don't find inode in our list, then it was deleted,
		 * in which case, we don't need to record it's create tag.
		 */
		if (!inode)
			continue;
		spin_unlock(&sbi->s_fc_lock);

		/*
		 * We first write the inode and then the create dirent. This
		 * allows the recovery code to create an unnamed inode first
		 * and then link it to a directory entry. This allows us
		 * to use namei.c routines almost as is and simplifies
		 * the recovery code.
		 */
		ret = ext4_fc_write_inode(inode, crc);
		if (ret)
			goto lock_and_exit;

		ret = ext4_fc_write_inode_data(inode, crc);
		if (ret)
			goto lock_and_exit;

		if (!ext4_fc_add_dentry_tlv(
			sb, fc_dentry->fcd_op,
			fc_dentry->fcd_parent, fc_dentry->fcd_ino,
			fc_dentry->fcd_name.len,
			fc_dentry->fcd_name.name, crc)) {
			ret = -ENOSPC;
			goto lock_and_exit;
		}

		spin_lock(&sbi->s_fc_lock);
	}
	return 0;
lock_and_exit:
	/* Re-take the lock so the caller's unlock stays balanced. */
	spin_lock(&sbi->s_fc_lock);
	return ret;
}
983
984 static int ext4_fc_perform_commit(journal_t *journal)
985 {
986         struct super_block *sb = (struct super_block *)(journal->j_private);
987         struct ext4_sb_info *sbi = EXT4_SB(sb);
988         struct ext4_inode_info *iter;
989         struct ext4_fc_head head;
990         struct list_head *pos;
991         struct inode *inode;
992         struct blk_plug plug;
993         int ret = 0;
994         u32 crc = 0;
995
996         ret = ext4_fc_submit_inode_data_all(journal);
997         if (ret)
998                 return ret;
999
1000         ret = ext4_fc_wait_inode_data_all(journal);
1001         if (ret)
1002                 return ret;
1003
1004         blk_start_plug(&plug);
1005         if (sbi->s_fc_bytes == 0) {
1006                 /*
1007                  * Add a head tag only if this is the first fast commit
1008                  * in this TID.
1009                  */
1010                 head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1011                 head.fc_tid = cpu_to_le32(
1012                         sbi->s_journal->j_running_transaction->t_tid);
1013                 if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1014                         (u8 *)&head, &crc))
1015                         goto out;
1016         }
1017
1018         spin_lock(&sbi->s_fc_lock);
1019         ret = ext4_fc_commit_dentry_updates(journal, &crc);
1020         if (ret) {
1021                 spin_unlock(&sbi->s_fc_lock);
1022                 goto out;
1023         }
1024
1025         list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
1026                 iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
1027                 inode = &iter->vfs_inode;
1028                 if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1029                         continue;
1030
1031                 spin_unlock(&sbi->s_fc_lock);
1032                 ret = ext4_fc_write_inode_data(inode, &crc);
1033                 if (ret)
1034                         goto out;
1035                 ret = ext4_fc_write_inode(inode, &crc);
1036                 if (ret)
1037                         goto out;
1038                 spin_lock(&sbi->s_fc_lock);
1039                 EXT4_I(inode)->i_fc_committed_subtid =
1040                         atomic_read(&sbi->s_fc_subtid);
1041         }
1042         spin_unlock(&sbi->s_fc_lock);
1043
1044         ret = ext4_fc_write_tail(sb, crc);
1045
1046 out:
1047         blk_finish_plug(&plug);
1048         return ret;
1049 }
1050
1051 /*
1052  * The main commit entry point. Performs a fast commit for transaction
1053  * commit_tid if needed. If it's not possible to perform a fast commit
1054  * due to various reasons, we fall back to full commit. Returns 0
1055  * on success, error otherwise.
1056  */
1057 int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1058 {
1059         struct super_block *sb = (struct super_block *)(journal->j_private);
1060         struct ext4_sb_info *sbi = EXT4_SB(sb);
1061         int nblks = 0, ret, bsize = journal->j_blocksize;
1062         int subtid = atomic_read(&sbi->s_fc_subtid);
1063         int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
1064         ktime_t start_time, commit_time;
1065
1066         trace_ext4_fc_commit_start(sb);
1067
1068         start_time = ktime_get();
1069
1070         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
1071                 (ext4_fc_is_ineligible(sb))) {
1072                 reason = EXT4_FC_REASON_INELIGIBLE;
1073                 goto out;
1074         }
1075
1076 restart_fc:
1077         ret = jbd2_fc_begin_commit(journal, commit_tid);
1078         if (ret == -EALREADY) {
1079                 /* There was an ongoing commit, check if we need to restart */
1080                 if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1081                         commit_tid > journal->j_commit_sequence)
1082                         goto restart_fc;
1083                 reason = EXT4_FC_REASON_ALREADY_COMMITTED;
1084                 goto out;
1085         } else if (ret) {
1086                 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1087                 reason = EXT4_FC_REASON_FC_START_FAILED;
1088                 goto out;
1089         }
1090
1091         fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1092         ret = ext4_fc_perform_commit(journal);
1093         if (ret < 0) {
1094                 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1095                 reason = EXT4_FC_REASON_FC_FAILED;
1096                 goto out;
1097         }
1098         nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1099         ret = jbd2_fc_wait_bufs(journal, nblks);
1100         if (ret < 0) {
1101                 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1102                 reason = EXT4_FC_REASON_FC_FAILED;
1103                 goto out;
1104         }
1105         atomic_inc(&sbi->s_fc_subtid);
1106         jbd2_fc_end_commit(journal);
1107 out:
1108         /* Has any ineligible update happened since we started? */
1109         if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
1110                 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1111                 reason = EXT4_FC_REASON_INELIGIBLE;
1112         }
1113
1114         spin_lock(&sbi->s_fc_lock);
1115         if (reason != EXT4_FC_REASON_OK &&
1116                 reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
1117                 sbi->s_fc_stats.fc_ineligible_commits++;
1118         } else {
1119                 sbi->s_fc_stats.fc_num_commits++;
1120                 sbi->s_fc_stats.fc_numblks += nblks;
1121         }
1122         spin_unlock(&sbi->s_fc_lock);
1123         nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
1124         trace_ext4_fc_commit_stop(sb, nblks, reason);
1125         commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1126         /*
1127          * weight the commit time higher than the average time so we don't
1128          * react too strongly to vast changes in the commit time
1129          */
1130         if (likely(sbi->s_fc_avg_commit_time))
1131                 sbi->s_fc_avg_commit_time = (commit_time +
1132                                 sbi->s_fc_avg_commit_time * 3) / 4;
1133         else
1134                 sbi->s_fc_avg_commit_time = commit_time;
1135         jbd_debug(1,
1136                 "Fast commit ended with blks = %d, reason = %d, subtid - %d",
1137                 nblks, reason, subtid);
1138         if (reason == EXT4_FC_REASON_FC_FAILED)
1139                 return jbd2_fc_end_commit_fallback(journal);
1140         if (reason == EXT4_FC_REASON_FC_START_FAILED ||
1141                 reason == EXT4_FC_REASON_INELIGIBLE)
1142                 return jbd2_complete_transaction(journal, commit_tid);
1143         return 0;
1144 }
1145
1146 /*
1147  * Fast commit cleanup routine. This is called after every fast commit and
1148  * full commit. full is true if we are called after a full commit.
1149  */
1150 static void ext4_fc_cleanup(journal_t *journal, int full)
1151 {
1152         struct super_block *sb = journal->j_private;
1153         struct ext4_sb_info *sbi = EXT4_SB(sb);
1154         struct ext4_inode_info *iter;
1155         struct ext4_fc_dentry_update *fc_dentry;
1156         struct list_head *pos, *n;
1157
1158         if (full && sbi->s_fc_bh)
1159                 sbi->s_fc_bh = NULL;
1160
1161         jbd2_fc_release_bufs(journal);
1162
1163         spin_lock(&sbi->s_fc_lock);
1164         list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
1165                 iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
1166                 list_del_init(&iter->i_fc_list);
1167                 ext4_clear_inode_state(&iter->vfs_inode,
1168                                        EXT4_STATE_FC_COMMITTING);
1169                 ext4_fc_reset_inode(&iter->vfs_inode);
1170                 /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1171                 smp_mb();
1172 #if (BITS_PER_LONG < 64)
1173                 wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1174 #else
1175                 wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1176 #endif
1177         }
1178
1179         while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1180                 fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1181                                              struct ext4_fc_dentry_update,
1182                                              fcd_list);
1183                 list_del_init(&fc_dentry->fcd_list);
1184                 spin_unlock(&sbi->s_fc_lock);
1185
1186                 if (fc_dentry->fcd_name.name &&
1187                         fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1188                         kfree(fc_dentry->fcd_name.name);
1189                 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1190                 spin_lock(&sbi->s_fc_lock);
1191         }
1192
1193         list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1194                                 &sbi->s_fc_dentry_q[FC_Q_MAIN]);
1195         list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1196                                 &sbi->s_fc_q[FC_Q_STAGING]);
1197
1198         sbi->s_mount_flags &= ~EXT4_MF_FC_COMMITTING;
1199         sbi->s_mount_flags &= ~EXT4_MF_FC_INELIGIBLE;
1200
1201         if (full)
1202                 sbi->s_fc_bytes = 0;
1203         spin_unlock(&sbi->s_fc_lock);
1204         trace_ext4_fc_stats(sb);
1205 }
1206
1207 /* Ext4 Replay Path Routines */
1208
1209 /* Get length of a particular tlv */
1210 static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl)
1211 {
1212         return le16_to_cpu(tl->fc_len);
1213 }
1214
1215 /* Get a pointer to "value" of a tlv */
1216 static inline u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl)
1217 {
1218         return (u8 *)tl + sizeof(*tl);
1219 }
1220
/* Helper struct for dentry replay routines */
struct dentry_info_args {
	/*
	 * parent_ino: inode number of the parent directory
	 * dname_len:  length of dname in bytes
	 * ino:        inode number the dentry refers to
	 * inode_len:  NOTE(review): not set by tl_to_darg() in this file —
	 *             presumably filled by inode-tag handling; verify users.
	 */
	int parent_ino, dname_len, ino, inode_len;
	char *dname;	/* dentry name; points into the TLV buffer, not owned */
};
1226
1227 static inline void tl_to_darg(struct dentry_info_args *darg,
1228                                 struct  ext4_fc_tl *tl)
1229 {
1230         struct ext4_fc_dentry_info *fcd;
1231
1232         fcd = (struct ext4_fc_dentry_info *)ext4_fc_tag_val(tl);
1233
1234         darg->parent_ino = le32_to_cpu(fcd->fc_parent_ino);
1235         darg->ino = le32_to_cpu(fcd->fc_ino);
1236         darg->dname = fcd->fc_dname;
1237         darg->dname_len = ext4_fc_tag_len(tl) -
1238                         sizeof(struct ext4_fc_dentry_info);
1239 }
1240
1241 /* Unlink replay function */
1242 static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl)
1243 {
1244         struct inode *inode, *old_parent;
1245         struct qstr entry;
1246         struct dentry_info_args darg;
1247         int ret = 0;
1248
1249         tl_to_darg(&darg, tl);
1250
1251         trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1252                         darg.parent_ino, darg.dname_len);
1253
1254         entry.name = darg.dname;
1255         entry.len = darg.dname_len;
1256         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1257
1258         if (IS_ERR_OR_NULL(inode)) {
1259                 jbd_debug(1, "Inode %d not found", darg.ino);
1260                 return 0;
1261         }
1262
1263         old_parent = ext4_iget(sb, darg.parent_ino,
1264                                 EXT4_IGET_NORMAL);
1265         if (IS_ERR_OR_NULL(old_parent)) {
1266                 jbd_debug(1, "Dir with inode  %d not found", darg.parent_ino);
1267                 iput(inode);
1268                 return 0;
1269         }
1270
1271         ret = __ext4_unlink(NULL, old_parent, &entry, inode);
1272         /* -ENOENT ok coz it might not exist anymore. */
1273         if (ret == -ENOENT)
1274                 ret = 0;
1275         iput(old_parent);
1276         iput(inode);
1277         return ret;
1278 }
1279
/*
 * Replay-time link helper: link @inode into the directory described by
 * @darg (parent inode number + name) using namei machinery.
 *
 * Reference handling is subtle: on success d_obtain_alias() consumes the
 * dir inode reference (so we dput the dentry, not iput the inode); if
 * aliasing failed we still own the inode reference and must iput it.
 */
static int ext4_fc_replay_link_internal(struct super_block *sb,
				struct dentry_info_args *darg,
				struct inode *inode)
{
	struct inode *dir = NULL;
	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
	int ret = 0;

	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
	if (IS_ERR(dir)) {
		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
		/* Missing dir is non-fatal during replay; report success. */
		dir = NULL;
		goto out;
	}

	/* d_obtain_alias() takes over our dir reference on success. */
	dentry_dir = d_obtain_alias(dir);
	if (IS_ERR(dentry_dir)) {
		jbd_debug(1, "Failed to obtain dentry");
		dentry_dir = NULL;
		goto out;
	}

	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
	if (!dentry_inode) {
		jbd_debug(1, "Inode dentry not created.");
		ret = -ENOMEM;
		goto out;
	}

	ret = __ext4_link(dir, inode, dentry_inode);
	/*
	 * It's possible that link already existed since data blocks
	 * for the dir in question got persisted before we crashed OR
	 * we replayed this tag and crashed before the entire replay
	 * could complete.
	 */
	if (ret && ret != -EEXIST) {
		jbd_debug(1, "Failed to link\n");
		goto out;
	}

	ret = 0;
out:
	/* Release whichever reference we still hold: dentry or raw inode. */
	if (dentry_dir) {
		d_drop(dentry_dir);
		dput(dentry_dir);
	} else if (dir) {
		iput(dir);
	}
	if (dentry_inode) {
		d_drop(dentry_inode);
		dput(dentry_inode);
	}

	return ret;
}
1337
1338 /* Link replay function */
1339 static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl)
1340 {
1341         struct inode *inode;
1342         struct dentry_info_args darg;
1343         int ret = 0;
1344
1345         tl_to_darg(&darg, tl);
1346         trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1347                         darg.parent_ino, darg.dname_len);
1348
1349         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1350         if (IS_ERR_OR_NULL(inode)) {
1351                 jbd_debug(1, "Inode not found.");
1352                 return 0;
1353         }
1354
1355         ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1356         iput(inode);
1357         return ret;
1358 }
1359
1360 /*
1361  * Record all the modified inodes during replay. We use this later to setup
1362  * block bitmaps correctly.
1363  */
1364 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1365 {
1366         struct ext4_fc_replay_state *state;
1367         int i;
1368
1369         state = &EXT4_SB(sb)->s_fc_replay_state;
1370         for (i = 0; i < state->fc_modified_inodes_used; i++)
1371                 if (state->fc_modified_inodes[i] == ino)
1372                         return 0;
1373         if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1374                 state->fc_modified_inodes_size +=
1375                         EXT4_FC_REPLAY_REALLOC_INCREMENT;
1376                 state->fc_modified_inodes = krealloc(
1377                                         state->fc_modified_inodes, sizeof(int) *
1378                                         state->fc_modified_inodes_size,
1379                                         GFP_KERNEL);
1380                 if (!state->fc_modified_inodes)
1381                         return -ENOMEM;
1382         }
1383         state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1384         return 0;
1385 }
1386
1387 /*
1388  * Inode replay function
1389  */
1390 static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl)
1391 {
1392         struct ext4_fc_inode *fc_inode;
1393         struct ext4_inode *raw_inode;
1394         struct ext4_inode *raw_fc_inode;
1395         struct inode *inode = NULL;
1396         struct ext4_iloc iloc;
1397         int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1398         struct ext4_extent_header *eh;
1399
1400         fc_inode = (struct ext4_fc_inode *)ext4_fc_tag_val(tl);
1401
1402         ino = le32_to_cpu(fc_inode->fc_ino);
1403         trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1404
1405         inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1406         if (!IS_ERR_OR_NULL(inode)) {
1407                 ext4_ext_clear_bb(inode);
1408                 iput(inode);
1409         }
1410
1411         ext4_fc_record_modified_inode(sb, ino);
1412
1413         raw_fc_inode = (struct ext4_inode *)fc_inode->fc_raw_inode;
1414         ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1415         if (ret)
1416                 goto out;
1417
1418         inode_len = ext4_fc_tag_len(tl) - sizeof(struct ext4_fc_inode);
1419         raw_inode = ext4_raw_inode(&iloc);
1420
1421         memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1422         memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1423                 inode_len - offsetof(struct ext4_inode, i_generation));
1424         if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1425                 eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1426                 if (eh->eh_magic != EXT4_EXT_MAGIC) {
1427                         memset(eh, 0, sizeof(*eh));
1428                         eh->eh_magic = EXT4_EXT_MAGIC;
1429                         eh->eh_max = cpu_to_le16(
1430                                 (sizeof(raw_inode->i_block) -
1431                                  sizeof(struct ext4_extent_header))
1432                                  / sizeof(struct ext4_extent));
1433                 }
1434         } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1435                 memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1436                         sizeof(raw_inode->i_block));
1437         }
1438
1439         /* Immediately update the inode on disk. */
1440         ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1441         if (ret)
1442                 goto out;
1443         ret = sync_dirty_buffer(iloc.bh);
1444         if (ret)
1445                 goto out;
1446         ret = ext4_mark_inode_used(sb, ino);
1447         if (ret)
1448                 goto out;
1449
1450         /* Given that we just wrote the inode on disk, this SHOULD succeed. */
1451         inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1452         if (IS_ERR_OR_NULL(inode)) {
1453                 jbd_debug(1, "Inode not found.");
1454                 return -EFSCORRUPTED;
1455         }
1456
1457         /*
1458          * Our allocator could have made different decisions than before
1459          * crashing. This should be fixed but until then, we calculate
1460          * the number of blocks the inode.
1461          */
1462         ext4_ext_replay_set_iblocks(inode);
1463
1464         inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1465         ext4_reset_inode_seed(inode);
1466
1467         ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1468         ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1469         sync_dirty_buffer(iloc.bh);
1470         brelse(iloc.bh);
1471 out:
1472         iput(inode);
1473         if (!ret)
1474                 blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
1475
1476         return 0;
1477 }
1478
1479 /*
1480  * Dentry create replay function.
1481  *
1482  * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
1483  * inode for which we are trying to create a dentry here, should already have
1484  * been replayed before we start here.
1485  */
1486 static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl)
1487 {
1488         int ret = 0;
1489         struct inode *inode = NULL;
1490         struct inode *dir = NULL;
1491         struct dentry_info_args darg;
1492
1493         tl_to_darg(&darg, tl);
1494
1495         trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1496                         darg.parent_ino, darg.dname_len);
1497
1498         /* This takes care of update group descriptor and other metadata */
1499         ret = ext4_mark_inode_used(sb, darg.ino);
1500         if (ret)
1501                 goto out;
1502
1503         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1504         if (IS_ERR_OR_NULL(inode)) {
1505                 jbd_debug(1, "inode %d not found.", darg.ino);
1506                 inode = NULL;
1507                 ret = -EINVAL;
1508                 goto out;
1509         }
1510
1511         if (S_ISDIR(inode->i_mode)) {
1512                 /*
1513                  * If we are creating a directory, we need to make sure that the
1514                  * dot and dot dot dirents are setup properly.
1515                  */
1516                 dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1517                 if (IS_ERR_OR_NULL(dir)) {
1518                         jbd_debug(1, "Dir %d not found.", darg.ino);
1519                         goto out;
1520                 }
1521                 ret = ext4_init_new_dir(NULL, dir, inode);
1522                 iput(dir);
1523                 if (ret) {
1524                         ret = 0;
1525                         goto out;
1526                 }
1527         }
1528         ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1529         if (ret)
1530                 goto out;
1531         set_nlink(inode, 1);
1532         ext4_mark_inode_dirty(NULL, inode);
1533 out:
1534         if (inode)
1535                 iput(inode);
1536         return ret;
1537 }
1538
1539 /*
1540  * Record physical disk regions which are in use as per fast commit area. Our
1541  * simple replay phase allocator excludes these regions from allocation.
1542  */
1543 static int ext4_fc_record_regions(struct super_block *sb, int ino,
1544                 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
1545 {
1546         struct ext4_fc_replay_state *state;
1547         struct ext4_fc_alloc_region *region;
1548
1549         state = &EXT4_SB(sb)->s_fc_replay_state;
1550         if (state->fc_regions_used == state->fc_regions_size) {
1551                 state->fc_regions_size +=
1552                         EXT4_FC_REPLAY_REALLOC_INCREMENT;
1553                 state->fc_regions = krealloc(
1554                                         state->fc_regions,
1555                                         state->fc_regions_size *
1556                                         sizeof(struct ext4_fc_alloc_region),
1557                                         GFP_KERNEL);
1558                 if (!state->fc_regions)
1559                         return -ENOMEM;
1560         }
1561         region = &state->fc_regions[state->fc_regions_used++];
1562         region->ino = ino;
1563         region->lblk = lblk;
1564         region->pblk = pblk;
1565         region->len = len;
1566
1567         return 0;
1568 }
1569
/*
 * Replay ADD_RANGE tag: make the logical range recorded in the fast commit
 * log map to the recorded physical blocks in the inode's extent tree.
 * Depending on the inode's current state this inserts a new extent,
 * remaps an existing one to new physical blocks, or just toggles the
 * written/unwritten state.
 *
 * Replay is best effort: lookup and update failures drop this tag and
 * return 0 so that replay of the remaining tags can continue.
 */
static int ext4_fc_replay_add_range(struct super_block *sb,
				struct ext4_fc_tl *tl)
{
	struct ext4_fc_add_range *fc_add_ex;
	struct ext4_extent newex, *ex;
	struct inode *inode;
	ext4_lblk_t start, cur;
	int remaining, len;
	ext4_fsblk_t start_pblk;
	struct ext4_map_blocks map;
	struct ext4_ext_path *path = NULL;
	int ret;

	fc_add_ex = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
	ex = (struct ext4_extent *)&fc_add_ex->fc_ex;

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
		le32_to_cpu(fc_add_ex->fc_ino), le32_to_cpu(ex->ee_block),
		ext4_ext_get_actual_len(ex));

	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino),
				EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	/*
	 * NOTE(review): return value is ignored; an allocation failure in
	 * ext4_fc_record_modified_inode() would leave this inode out of the
	 * final bitmap fixup pass — consider propagating the error.
	 */
	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);

	/* The logged extent: logical start, physical start and length */
	start = le32_to_cpu(ex->ee_block);
	start_pblk = ext4_ext_pblock(ex);
	len = ext4_ext_get_actual_len(ex);

	cur = start;
	remaining = len;
	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
		  inode->i_ino);

	/* Walk the logged range piecewise, following the current mapping */
	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;
		map.m_pblk = 0;
		ret = ext4_map_blocks(NULL, inode, &map, 0);

		if (ret < 0) {
			iput(inode);
			return 0;
		}

		if (ret == 0) {
			/* Range is not mapped */
			path = ext4_find_extent(inode, cur, NULL, 0);
			if (IS_ERR(path)) {
				iput(inode);
				return 0;
			}
			/* Build and insert the extent exactly as logged */
			memset(&newex, 0, sizeof(newex));
			newex.ee_block = cpu_to_le32(cur);
			ext4_ext_store_pblock(
				&newex, start_pblk + cur - start);
			newex.ee_len = cpu_to_le16(map.m_len);
			if (ext4_ext_is_unwritten(ex))
				ext4_ext_mark_unwritten(&newex);
			down_write(&EXT4_I(inode)->i_data_sem);
			ret = ext4_ext_insert_extent(
				NULL, inode, &path, &newex, 0);
			up_write((&EXT4_I(inode)->i_data_sem));
			ext4_ext_drop_refs(path);
			kfree(path);
			if (ret) {
				iput(inode);
				return 0;
			}
			goto next;
		}

		if (start_pblk + cur - start != map.m_pblk) {
			/*
			 * Logical to physical mapping changed. This can happen
			 * if this range was removed and then reallocated to
			 * map to new physical blocks during a fast commit.
			 */
			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex),
					start_pblk + cur - start);
			if (ret) {
				iput(inode);
				return 0;
			}
			/*
			 * Mark the old blocks as free since they aren't used
			 * anymore. We maintain an array of all the modified
			 * inodes. In case these blocks are still used at either
			 * a different logical range in the same inode or in
			 * some different inode, we will mark them as allocated
			 * at the end of the FC replay using our array of
			 * modified inodes.
			 */
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
			goto next;
		}

		/* Range is mapped and needs a state change */
		jbd_debug(1, "Converting from %d to %d %lld",
				map.m_flags & EXT4_MAP_UNWRITTEN,
			ext4_ext_is_unwritten(ex), map.m_pblk);
		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex), map.m_pblk);
		if (ret) {
			iput(inode);
			return 0;
		}
		/*
		 * We may have split the extent tree while toggling the state.
		 * Try to shrink the extent tree now.
		 */
		ext4_ext_replay_shrink_inode(inode, start + len);
next:
		cur += map.m_len;
		remaining -= map.m_len;
	}
	/* Final shrink pass, bounded by the inode's current size */
	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
					sb->s_blocksize_bits);
	iput(inode);
	return 0;
}
1698
1699 /* Replay DEL_RANGE tag */
1700 static int
1701 ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl)
1702 {
1703         struct inode *inode;
1704         struct ext4_fc_del_range *lrange;
1705         struct ext4_map_blocks map;
1706         ext4_lblk_t cur, remaining;
1707         int ret;
1708
1709         lrange = (struct ext4_fc_del_range *)ext4_fc_tag_val(tl);
1710         cur = le32_to_cpu(lrange->fc_lblk);
1711         remaining = le32_to_cpu(lrange->fc_len);
1712
1713         trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1714                 le32_to_cpu(lrange->fc_ino), cur, remaining);
1715
1716         inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL);
1717         if (IS_ERR_OR_NULL(inode)) {
1718                 jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino));
1719                 return 0;
1720         }
1721
1722         ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1723
1724         jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1725                         inode->i_ino, le32_to_cpu(lrange->fc_lblk),
1726                         le32_to_cpu(lrange->fc_len));
1727         while (remaining > 0) {
1728                 map.m_lblk = cur;
1729                 map.m_len = remaining;
1730
1731                 ret = ext4_map_blocks(NULL, inode, &map, 0);
1732                 if (ret < 0) {
1733                         iput(inode);
1734                         return 0;
1735                 }
1736                 if (ret > 0) {
1737                         remaining -= ret;
1738                         cur += ret;
1739                         ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1740                 } else {
1741                         remaining -= map.m_len;
1742                         cur += map.m_len;
1743                 }
1744         }
1745
1746         ret = ext4_punch_hole(inode,
1747                 le32_to_cpu(lrange->fc_lblk) << sb->s_blocksize_bits,
1748                 le32_to_cpu(lrange->fc_len) <<  sb->s_blocksize_bits);
1749         if (ret)
1750                 jbd_debug(1, "ext4_punch_hole returned %d", ret);
1751         ext4_ext_replay_shrink_inode(inode,
1752                 i_size_read(inode) >> sb->s_blocksize_bits);
1753         ext4_mark_inode_dirty(NULL, inode);
1754         iput(inode);
1755
1756         return 0;
1757 }
1758
1759 static inline const char *tag2str(u16 tag)
1760 {
1761         switch (tag) {
1762         case EXT4_FC_TAG_LINK:
1763                 return "TAG_ADD_ENTRY";
1764         case EXT4_FC_TAG_UNLINK:
1765                 return "TAG_DEL_ENTRY";
1766         case EXT4_FC_TAG_ADD_RANGE:
1767                 return "TAG_ADD_RANGE";
1768         case EXT4_FC_TAG_CREAT:
1769                 return "TAG_CREAT_DENTRY";
1770         case EXT4_FC_TAG_DEL_RANGE:
1771                 return "TAG_DEL_RANGE";
1772         case EXT4_FC_TAG_INODE:
1773                 return "TAG_INODE";
1774         case EXT4_FC_TAG_PAD:
1775                 return "TAG_PAD";
1776         case EXT4_FC_TAG_TAIL:
1777                 return "TAG_TAIL";
1778         case EXT4_FC_TAG_HEAD:
1779                 return "TAG_HEAD";
1780         default:
1781                 return "TAG_ERROR";
1782         }
1783 }
1784
/*
 * Final fixup pass over every inode recorded as modified during replay:
 * walk each inode's current mappings and mark both the mapped data blocks
 * and the extent tree index/leaf blocks as allocated in the block bitmaps.
 * This corrects blocks that per-tag handlers tentatively freed but that
 * are still referenced somewhere.
 */
static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
	struct ext4_fc_replay_state *state;
	struct inode *inode;
	struct ext4_ext_path *path = NULL;
	struct ext4_map_blocks map;
	int i, ret, j;
	ext4_lblk_t cur, end;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++) {
		inode = ext4_iget(sb, state->fc_modified_inodes[i],
			EXT4_IGET_NORMAL);
		if (IS_ERR_OR_NULL(inode)) {
			jbd_debug(1, "Inode %d not found.",
				state->fc_modified_inodes[i]);
			continue;
		}
		/* Scan the whole logical block space of the inode */
		cur = 0;
		end = EXT_MAX_BLOCKS;
		while (cur < end) {
			map.m_lblk = cur;
			map.m_len = end - cur;

			ret = ext4_map_blocks(NULL, inode, &map, 0);
			if (ret < 0)
				break;

			if (ret > 0) {
				/* Mark the extent tree's own blocks in use */
				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
				if (!IS_ERR_OR_NULL(path)) {
					for (j = 0; j < path->p_depth; j++)
						ext4_mb_mark_bb(inode->i_sb,
							path[j].p_block, 1, 1);
					ext4_ext_drop_refs(path);
					kfree(path);
				}
				cur += ret;
				/* ...and the data blocks themselves */
				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
							map.m_len, 1);
			} else {
				/* Hole: advance by at least one block */
				cur = cur + (map.m_len ? map.m_len : 1);
			}
		}
		iput(inode);
	}
}
1832
1833 /*
1834  * Check if block is in excluded regions for block allocation. The simple
1835  * allocator that runs during replay phase is calls this function to see
1836  * if it is okay to use a block.
1837  */
1838 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1839 {
1840         int i;
1841         struct ext4_fc_replay_state *state;
1842
1843         state = &EXT4_SB(sb)->s_fc_replay_state;
1844         for (i = 0; i < state->fc_regions_valid; i++) {
1845                 if (state->fc_regions[i].ino == 0 ||
1846                         state->fc_regions[i].len == 0)
1847                         continue;
1848                 if (blk >= state->fc_regions[i].pblk &&
1849                     blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1850                         return true;
1851         }
1852         return false;
1853 }
1854
1855 /* Cleanup function called after replay */
1856 void ext4_fc_replay_cleanup(struct super_block *sb)
1857 {
1858         struct ext4_sb_info *sbi = EXT4_SB(sb);
1859
1860         sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1861         kfree(sbi->s_fc_replay_state.fc_regions);
1862         kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1863 }
1864
1865 /*
1866  * Recovery Scan phase handler
1867  *
1868  * This function is called during the scan phase and is responsible
1869  * for doing following things:
1870  * - Make sure the fast commit area has valid tags for replay
1871  * - Count number of tags that need to be replayed by the replay handler
1872  * - Verify CRC
1873  * - Create a list of excluded blocks for allocation during replay phase
1874  *
1875  * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1876  * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1877  * to indicate that scan has finished and JBD2 can now start replay phase.
1878  * It returns a negative error to indicate that there was an error. At the end
1879  * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
1880  * to indicate the number of tags that need to replayed during the replay phase.
1881  */
static int ext4_fc_replay_scan(journal_t *journal,
				struct buffer_head *bh, int off,
				tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_replay_state *state;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_add_range *ext;
	struct ext4_fc_tl *tl;
	struct ext4_fc_tail *tail;
	__u8 *start, *end;
	struct ext4_fc_head *head;
	struct ext4_extent *ex;

	state = &sbi->s_fc_replay_state;

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	/* First fast commit block of this scan: reset all scan state */
	if (state->fc_replay_expected_off == 0) {
		state->fc_cur_tag = 0;
		state->fc_replay_num_tags = 0;
		state->fc_crc = 0;
		state->fc_regions = NULL;
		state->fc_regions_valid = state->fc_regions_used =
			state->fc_regions_size = 0;
		/* Check if we can stop early */
		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
			!= EXT4_FC_TAG_HEAD)
			return 0;
	}

	/* Blocks must arrive strictly in order; anything else is corruption */
	if (off != state->fc_replay_expected_off) {
		ret = -EFSCORRUPTED;
		goto out_err;
	}

	state->fc_replay_expected_off++;
	fc_for_each_tl(start, end, tl) {
		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
			  tag2str(le16_to_cpu(tl->fc_tag)), bh->b_blocknr);
		switch (le16_to_cpu(tl->fc_tag)) {
		case EXT4_FC_TAG_ADD_RANGE:
			ext = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
			ex = (struct ext4_extent *)&ext->fc_ex;
			/*
			 * Remember this physical range so the replay-phase
			 * allocator excludes these blocks from allocation.
			 */
			ret = ext4_fc_record_regions(sb,
				le32_to_cpu(ext->fc_ino),
				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
				ext4_ext_get_actual_len(ex));
			if (ret < 0)
				break;
			ret = JBD2_FC_REPLAY_CONTINUE;
			fallthrough;
		case EXT4_FC_TAG_DEL_RANGE:
		case EXT4_FC_TAG_LINK:
		case EXT4_FC_TAG_UNLINK:
		case EXT4_FC_TAG_CREAT:
		case EXT4_FC_TAG_INODE:
		case EXT4_FC_TAG_PAD:
			/* Count the tag and fold it into the running CRC */
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
					sizeof(*tl) + ext4_fc_tag_len(tl));
			break;
		case EXT4_FC_TAG_TAIL:
			state->fc_cur_tag++;
			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
			/* CRC covers the tail only up to its fc_crc field */
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
						sizeof(*tl) +
						offsetof(struct ext4_fc_tail,
						fc_crc));
			if (le32_to_cpu(tail->fc_tid) == expected_tid &&
				le32_to_cpu(tail->fc_crc) == state->fc_crc) {
				/* Valid commit: all tags so far are replayable */
				state->fc_replay_num_tags = state->fc_cur_tag;
				state->fc_regions_valid =
					state->fc_regions_used;
			} else {
				/*
				 * Bad tail: stop here if at least one earlier
				 * commit validated, else report the bad CRC.
				 */
				ret = state->fc_replay_num_tags ?
					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
			}
			state->fc_crc = 0;
			break;
		case EXT4_FC_TAG_HEAD:
			head = (struct ext4_fc_head *)ext4_fc_tag_val(tl);
			if (le32_to_cpu(head->fc_features) &
				~EXT4_FC_SUPPORTED_FEATURES) {
				ret = -EOPNOTSUPP;
				break;
			}
			/* A head with a stale tid ends the scan cleanly */
			if (le32_to_cpu(head->fc_tid) != expected_tid) {
				ret = JBD2_FC_REPLAY_STOP;
				break;
			}
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
					sizeof(*tl) + ext4_fc_tag_len(tl));
			break;
		default:
			/* Unknown tag: stop or cancel, as for a bad tail */
			ret = state->fc_replay_num_tags ?
				JBD2_FC_REPLAY_STOP : -ECANCELED;
		}
		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
			break;
	}

out_err:
	trace_ext4_fc_replay_scan(sb, ret, off);
	return ret;
}
1991
/*
 * Main recovery path entry point (installed as the jbd2 fast commit
 * replay callback). During PASS_SCAN it forwards to
 * ext4_fc_replay_scan(); during the replay pass it walks the TLVs in @bh
 * and dispatches each tag to its handler.
 * The meaning of return codes is similar as above.
 */
static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
				enum passtype pass, int off, tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl *tl;
	__u8 *start, *end;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
	struct ext4_fc_tail *tail;

	if (pass == PASS_SCAN) {
		state->fc_current_pass = PASS_SCAN;
		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
	}

	/* First block of a new pass: flag that fast commit replay is active */
	if (state->fc_current_pass != pass) {
		state->fc_current_pass = pass;
		sbi->s_mount_state |= EXT4_FC_REPLAY;
	}
	/* Scan found nothing replayable: run the final fixup and finish */
	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
		jbd_debug(1, "Replay stops\n");
		ext4_fc_set_bitmaps_and_counters(sb);
		return 0;
	}

#ifdef CONFIG_EXT4_DEBUG
	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
		pr_warn("Dropping fc block %d because max_replay set\n", off);
		return JBD2_FC_REPLAY_STOP;
	}
#endif

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	fc_for_each_tl(start, end, tl) {
		/* Stop once every tag counted during scan has been replayed */
		if (state->fc_replay_num_tags == 0) {
			ret = JBD2_FC_REPLAY_STOP;
			ext4_fc_set_bitmaps_and_counters(sb);
			break;
		}
		jbd_debug(3, "Replay phase, tag:%s\n",
				tag2str(le16_to_cpu(tl->fc_tag)));
		state->fc_replay_num_tags--;
		switch (le16_to_cpu(tl->fc_tag)) {
		case EXT4_FC_TAG_LINK:
			ret = ext4_fc_replay_link(sb, tl);
			break;
		case EXT4_FC_TAG_UNLINK:
			ret = ext4_fc_replay_unlink(sb, tl);
			break;
		case EXT4_FC_TAG_ADD_RANGE:
			ret = ext4_fc_replay_add_range(sb, tl);
			break;
		case EXT4_FC_TAG_CREAT:
			ret = ext4_fc_replay_create(sb, tl);
			break;
		case EXT4_FC_TAG_DEL_RANGE:
			ret = ext4_fc_replay_del_range(sb, tl);
			break;
		case EXT4_FC_TAG_INODE:
			ret = ext4_fc_replay_inode(sb, tl);
			break;
		case EXT4_FC_TAG_PAD:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
				ext4_fc_tag_len(tl), 0);
			break;
		case EXT4_FC_TAG_TAIL:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
				ext4_fc_tag_len(tl), 0);
			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
			WARN_ON(le32_to_cpu(tail->fc_tid) != expected_tid);
			break;
		case EXT4_FC_TAG_HEAD:
			break;
		default:
			trace_ext4_fc_replay(sb, le16_to_cpu(tl->fc_tag), 0,
				ext4_fc_tag_len(tl), 0);
			ret = -ECANCELED;
			break;
		}
		if (ret < 0)
			break;
		ret = JBD2_FC_REPLAY_CONTINUE;
	}
	return ret;
}
2084
2085 void ext4_fc_init(struct super_block *sb, journal_t *journal)
2086 {
2087         /*
2088          * We set replay callback even if fast commit disabled because we may
2089          * could still have fast commit blocks that need to be replayed even if
2090          * fast commit has now been turned off.
2091          */
2092         journal->j_fc_replay_callback = ext4_fc_replay;
2093         if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2094                 return;
2095         journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2096 }
2097
/*
 * Human readable descriptions of the fast commit ineligibility reasons,
 * printed by ext4_fc_info_show(). Indexed by the same values as
 * stats->fc_ineligible_reason_count[] (0 .. EXT4_FC_REASON_MAX - 1), so
 * the order here must stay in sync with the EXT4_FC_REASON_* constants.
 * NOTE(review): verify ordering against the enum declaration when adding
 * a new reason.
 */
const char *fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"FC Commit Failed"
};
2109
2110 int ext4_fc_info_show(struct seq_file *seq, void *v)
2111 {
2112         struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2113         struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2114         int i;
2115
2116         if (v != SEQ_START_TOKEN)
2117                 return 0;
2118
2119         seq_printf(seq,
2120                 "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2121                    stats->fc_num_commits, stats->fc_ineligible_commits,
2122                    stats->fc_numblks,
2123                    div_u64(sbi->s_fc_avg_commit_time, 1000));
2124         seq_puts(seq, "Ineligible reasons:\n");
2125         for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2126                 seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2127                         stats->fc_ineligible_reason_count[i]);
2128
2129         return 0;
2130 }
2131
2132 int __init ext4_fc_init_dentry_cache(void)
2133 {
2134         ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2135                                            SLAB_RECLAIM_ACCOUNT);
2136
2137         if (ext4_fc_dentry_cachep == NULL)
2138                 return -ENOMEM;
2139
2140         return 0;
2141 }