6b963e09af2c9ac6182fc720b3fff6a0a678eda6
[linux-2.6-microblaze.git] / fs / ext4 / fast_commit.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 /*
4  * fs/ext4/fast_commit.c
5  *
6  * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
7  *
8  * Ext4 fast commits routines.
9  */
10 #include "ext4.h"
11 #include "ext4_jbd2.h"
12 #include "ext4_extents.h"
13 #include "mballoc.h"
14
15 /*
16  * Ext4 Fast Commits
17  * -----------------
18  *
19  * Ext4 fast commits implement fine grained journalling for Ext4.
20  *
21  * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
22  * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
23  * TLV during the recovery phase. For the scenarios for which we currently
24  * don't have replay code, fast commit falls back to full commits.
25  * Fast commits record delta in one of the following three categories.
26  *
27  * (A) Directory entry updates:
28  *
29  * - EXT4_FC_TAG_UNLINK         - records directory entry unlink
30  * - EXT4_FC_TAG_LINK           - records directory entry link
31  * - EXT4_FC_TAG_CREAT          - records inode and directory entry creation
32  *
33  * (B) File specific data range updates:
34  *
35  * - EXT4_FC_TAG_ADD_RANGE      - records addition of new blocks to an inode
36  * - EXT4_FC_TAG_DEL_RANGE      - records deletion of blocks from an inode
37  *
38  * (C) Inode metadata (mtime / ctime etc):
39  *
40  * - EXT4_FC_TAG_INODE          - record the inode that should be replayed
41  *                                during recovery. Note that iblocks field is
42  *                                not replayed and instead derived during
43  *                                replay.
44  * Commit Operation
45  * ----------------
46  * With fast commits, we maintain all the directory entry operations in the
47  * order in which they are issued in an in-memory queue. This queue is flushed
48  * to disk during the commit operation. We also maintain a list of inodes
49  * that need to be committed during a fast commit in another in memory queue of
50  * inodes. During the commit operation, we commit in the following order:
51  *
52  * [1] Lock inodes for any further data updates by setting COMMITTING state
53  * [2] Submit data buffers of all the inodes
54  * [3] Wait for [2] to complete
55  * [4] Commit all the directory entry updates in the fast commit space
56  * [5] Commit all the changed inode structures
57  * [6] Write tail tag (this tag ensures the atomicity, please read the following
58  *     section for more details).
59  * [7] Wait for [4], [5] and [6] to complete.
60  *
61  * All the inode updates must call ext4_fc_start_update() before starting an
62  * update. If such an ongoing update is present, fast commit waits for it to
63  * complete. The completion of such an update is marked by
64  * ext4_fc_stop_update().
65  *
66  * Fast Commit Ineligibility
67  * -------------------------
68  * Not all operations are supported by fast commits today (e.g extended
69  * attributes). Fast commit ineligibility is marked by calling one of the
70  * two following functions:
71  *
72  * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall
73  *   back to full commit. This is useful in case of transient errors.
74  *
75  * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
76  *   the fast commits happening between ext4_fc_start_ineligible() and
77  *   ext4_fc_stop_ineligible() and one fast commit after the call to
78  *   ext4_fc_stop_ineligible() to fall back to full commits. It is important to
79  *   make one more fast commit to fall back to full commit after stop call so
80  *   that it is guaranteed that the fast commit ineligible operation contained
81  *   within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is
82  *   followed by at least 1 full commit.
83  *
84  * Atomicity of commits
85  * --------------------
86  * In order to guarantee atomicity during the commit operation, fast commit
87  * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
88  * tag contains CRC of the contents and TID of the transaction after which
89  * this fast commit should be applied. Recovery code replays fast commit
90  * logs only if there's at least 1 valid tail present. For every fast commit
91  * operation, there is 1 tail. This means, we may end up with multiple tails
92  * in the fast commit space. Here's an example:
93  *
94  * - Create a new file A and remove existing file B
95  * - fsync()
96  * - Append contents to file A
97  * - Truncate file A
98  * - fsync()
99  *
100  * The fast commit space at the end of above operations would look like this:
101  *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
102  *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
103  *
104  * Replay code should thus check for all the valid tails in the FC area.
105  *
106  * TODOs
107  * -----
108  * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
109  *    eligible update must be protected within ext4_fc_start_update() and
110  *    ext4_fc_stop_update(). These routines are called at much higher
111  *    routines. This can be made more fine grained by combining with
112  *    ext4_journal_start().
113  *
114  * 2) Same above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
115  *
116  * 3) Handle more ineligible cases.
117  */
118
119 #include <trace/events/ext4.h>
120 static struct kmem_cache *ext4_fc_dentry_cachep;
121
122 static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
123 {
124         BUFFER_TRACE(bh, "");
125         if (uptodate) {
126                 ext4_debug("%s: Block %lld up-to-date",
127                            __func__, bh->b_blocknr);
128                 set_buffer_uptodate(bh);
129         } else {
130                 ext4_debug("%s: Block %lld not up-to-date",
131                            __func__, bh->b_blocknr);
132                 clear_buffer_uptodate(bh);
133         }
134
135         unlock_buffer(bh);
136 }
137
138 static inline void ext4_fc_reset_inode(struct inode *inode)
139 {
140         struct ext4_inode_info *ei = EXT4_I(inode);
141
142         ei->i_fc_lblk_start = 0;
143         ei->i_fc_lblk_len = 0;
144 }
145
146 void ext4_fc_init_inode(struct inode *inode)
147 {
148         struct ext4_inode_info *ei = EXT4_I(inode);
149
150         ext4_fc_reset_inode(inode);
151         ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
152         INIT_LIST_HEAD(&ei->i_fc_list);
153         init_waitqueue_head(&ei->i_fc_wait);
154         atomic_set(&ei->i_fc_updates, 0);
155 }
156
/*
 * Wait until the EXT4_STATE_FC_COMMITTING flag is cleared on @inode.
 *
 * This function must be called with sbi->s_fc_lock held; it drops that
 * spinlock before sleeping and returns with the lock RELEASED, so callers
 * must re-take it (and typically re-check inode state) after it returns.
 */
static void ext4_fc_wait_committing_inode(struct inode *inode)
{
	wait_queue_head_t *wq;
	struct ext4_inode_info *ei = EXT4_I(inode);

	/*
	 * The storage word holding EXT4_STATE_* differs by word size
	 * (i_state_flags on 32-bit, i_flags on 64-bit), so the wait-bit
	 * address must match whichever word ext4_test_inode_state() uses.
	 */
#if (BITS_PER_LONG < 64)
	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_state_flags,
				EXT4_STATE_FC_COMMITTING);
#else
	DEFINE_WAIT_BIT(wait, &ei->i_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_flags,
				EXT4_STATE_FC_COMMITTING);
#endif
	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
	/* Queue ourselves before dropping the lock to avoid a lost wakeup. */
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
}
180
181 /*
182  * Inform Ext4's fast about start of an inode update
183  *
184  * This function is called by the high level call VFS callbacks before
185  * performing any inode update. This function blocks if there's an ongoing
186  * fast commit on the inode in question.
187  */
188 void ext4_fc_start_update(struct inode *inode)
189 {
190         struct ext4_inode_info *ei = EXT4_I(inode);
191
192         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
193             (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
194                 return;
195
196 restart:
197         spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
198         if (list_empty(&ei->i_fc_list))
199                 goto out;
200
201         if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
202                 ext4_fc_wait_committing_inode(inode);
203                 goto restart;
204         }
205 out:
206         atomic_inc(&ei->i_fc_updates);
207         spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
208 }
209
210 /*
211  * Stop inode update and wake up waiting fast commits if any.
212  */
213 void ext4_fc_stop_update(struct inode *inode)
214 {
215         struct ext4_inode_info *ei = EXT4_I(inode);
216
217         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
218             (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
219                 return;
220
221         if (atomic_dec_and_test(&ei->i_fc_updates))
222                 wake_up_all(&ei->i_fc_wait);
223 }
224
225 /*
226  * Remove inode from fast commit list. If the inode is being committed
227  * we wait until inode commit is done.
228  */
229 void ext4_fc_del(struct inode *inode)
230 {
231         struct ext4_inode_info *ei = EXT4_I(inode);
232
233         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
234             (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
235                 return;
236
237 restart:
238         spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
239         if (list_empty(&ei->i_fc_list)) {
240                 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
241                 return;
242         }
243
244         if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
245                 ext4_fc_wait_committing_inode(inode);
246                 goto restart;
247         }
248         list_del_init(&ei->i_fc_list);
249         spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
250 }
251
252 /*
253  * Mark file system as fast commit ineligible. This means that next commit
254  * operation would result in a full jbd2 commit.
255  */
256 void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
257 {
258         struct ext4_sb_info *sbi = EXT4_SB(sb);
259
260         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
261             (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
262                 return;
263
264         sbi->s_mount_flags |= EXT4_MF_FC_INELIGIBLE;
265         WARN_ON(reason >= EXT4_FC_REASON_MAX);
266         sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
267 }
268
269 /*
270  * Start a fast commit ineligible update. Any commits that happen while
271  * such an operation is in progress fall back to full commits.
272  */
273 void ext4_fc_start_ineligible(struct super_block *sb, int reason)
274 {
275         struct ext4_sb_info *sbi = EXT4_SB(sb);
276
277         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
278             (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
279                 return;
280
281         WARN_ON(reason >= EXT4_FC_REASON_MAX);
282         sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
283         atomic_inc(&sbi->s_fc_ineligible_updates);
284 }
285
286 /*
287  * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
288  * to ensure that after stopping the ineligible update, at least one full
289  * commit takes place.
290  */
291 void ext4_fc_stop_ineligible(struct super_block *sb)
292 {
293         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
294             (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
295                 return;
296
297         EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FC_INELIGIBLE;
298         atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
299 }
300
301 static inline int ext4_fc_is_ineligible(struct super_block *sb)
302 {
303         return (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FC_INELIGIBLE) ||
304                 atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates);
305 }
306
307 /*
308  * Generic fast commit tracking function. If this is the first time we are
309  * called after a full commit, we initialize fast commit fields and then call
310  * __fc_track_fn() with update = 0. If we have already been called after a full
311  * commit, we pass update = 1. Based on that, the track function can determine
312  * if it needs to track a field for the first time or if it needs to just
313  * update the previously tracked value.
314  *
315  * If enqueue is set, this function enqueues the inode in fast commit list.
316  */
static int ext4_fc_track_template(
	handle_t *handle, struct inode *inode,
	int (*__fc_track_fn)(struct inode *, void *, bool),
	void *args, int enqueue)
{
	bool update = false;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	tid_t tid = 0;
	int ret;

	/* Fast commits disabled, or we are replaying one: nothing to track. */
	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
	    (sbi->s_mount_state & EXT4_FC_REPLAY))
		return -EOPNOTSUPP;

	if (ext4_fc_is_ineligible(inode->i_sb))
		return -EINVAL;

	tid = handle->h_transaction->t_tid;
	mutex_lock(&ei->i_fc_lock);
	if (tid == ei->i_sync_tid) {
		/* Same transaction as last call: update tracked state. */
		update = true;
	} else {
		/* First tracking call in this transaction: start afresh. */
		ext4_fc_reset_inode(inode);
		ei->i_sync_tid = tid;
	}
	ret = __fc_track_fn(inode, args, update);
	mutex_unlock(&ei->i_fc_lock);

	if (!enqueue)
		return ret;

	/*
	 * Queue the inode on the staging list while a commit is running,
	 * otherwise on the main list for the next commit.
	 */
	spin_lock(&sbi->s_fc_lock);
	if (list_empty(&EXT4_I(inode)->i_fc_list))
		list_add_tail(&EXT4_I(inode)->i_fc_list,
				(sbi->s_mount_flags & EXT4_MF_FC_COMMITTING) ?
				&sbi->s_fc_q[FC_Q_STAGING] :
				&sbi->s_fc_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}
359
/* Arguments for __track_dentry_update(). */
struct __track_dentry_update_args {
	struct dentry *dentry;	/* dentry being created/linked/unlinked */
	int op;			/* EXT4_FC_TAG_{CREAT,LINK,UNLINK} */
};
364
/*
 * __track_fn for directory entry updates. Called with ei->i_fc_lock held;
 * the lock is dropped around the allocations below and re-taken before
 * returning, so the caller's unlock stays balanced.
 */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
	struct ext4_fc_dentry_update *node;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct __track_dentry_update_args *dentry_update =
		(struct __track_dentry_update_args *)arg;
	struct dentry *dentry = dentry_update->dentry;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	mutex_unlock(&ei->i_fc_lock);
	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
	if (!node) {
		/* Can't record the update: force the next commit to be full. */
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
		mutex_lock(&ei->i_fc_lock);
		return -ENOMEM;
	}

	node->fcd_op = dentry_update->op;
	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
	node->fcd_ino = inode->i_ino;
	if (dentry->d_name.len > DNAME_INLINE_LEN) {
		/* Name too long for the inline buffer: allocate a copy. */
		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
		if (!node->fcd_name.name) {
			kmem_cache_free(ext4_fc_dentry_cachep, node);
			ext4_fc_mark_ineligible(inode->i_sb,
				EXT4_FC_REASON_NOMEM);
			mutex_lock(&ei->i_fc_lock);
			return -ENOMEM;
		}
		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
			dentry->d_name.len);
	} else {
		memcpy(node->fcd_iname, dentry->d_name.name,
			dentry->d_name.len);
		node->fcd_name.name = node->fcd_iname;
	}
	node->fcd_name.len = dentry->d_name.len;

	/* Add to the staging queue if a commit is running, else to main. */
	spin_lock(&sbi->s_fc_lock);
	if (sbi->s_mount_flags & EXT4_MF_FC_COMMITTING)
		list_add_tail(&node->fcd_list,
				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
	else
		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);
	mutex_lock(&ei->i_fc_lock);

	return 0;
}
415
416 void __ext4_fc_track_unlink(handle_t *handle,
417                 struct inode *inode, struct dentry *dentry)
418 {
419         struct __track_dentry_update_args args;
420         int ret;
421
422         args.dentry = dentry;
423         args.op = EXT4_FC_TAG_UNLINK;
424
425         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
426                                         (void *)&args, 0);
427         trace_ext4_fc_track_unlink(inode, dentry, ret);
428 }
429
430 void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
431 {
432         __ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
433 }
434
435 void __ext4_fc_track_link(handle_t *handle,
436         struct inode *inode, struct dentry *dentry)
437 {
438         struct __track_dentry_update_args args;
439         int ret;
440
441         args.dentry = dentry;
442         args.op = EXT4_FC_TAG_LINK;
443
444         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
445                                         (void *)&args, 0);
446         trace_ext4_fc_track_link(inode, dentry, ret);
447 }
448
449 void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
450 {
451         __ext4_fc_track_link(handle, d_inode(dentry), dentry);
452 }
453
454 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
455 {
456         struct __track_dentry_update_args args;
457         struct inode *inode = d_inode(dentry);
458         int ret;
459
460         args.dentry = dentry;
461         args.op = EXT4_FC_TAG_CREAT;
462
463         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
464                                         (void *)&args, 0);
465         trace_ext4_fc_track_create(inode, dentry, ret);
466 }
467
468 /* __track_fn for inode tracking */
469 static int __track_inode(struct inode *inode, void *arg, bool update)
470 {
471         if (update)
472                 return -EEXIST;
473
474         EXT4_I(inode)->i_fc_lblk_len = 0;
475
476         return 0;
477 }
478
479 void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
480 {
481         int ret;
482
483         if (S_ISDIR(inode->i_mode))
484                 return;
485
486         ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
487         trace_ext4_fc_track_inode(inode, ret);
488 }
489
/* Arguments for __track_range(): inclusive logical block range [start, end]. */
struct __track_range_args {
	ext4_lblk_t start, end;
};
493
494 /* __track_fn for tracking data updates */
495 static int __track_range(struct inode *inode, void *arg, bool update)
496 {
497         struct ext4_inode_info *ei = EXT4_I(inode);
498         ext4_lblk_t oldstart;
499         struct __track_range_args *__arg =
500                 (struct __track_range_args *)arg;
501
502         if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
503                 ext4_debug("Special inode %ld being modified\n", inode->i_ino);
504                 return -ECANCELED;
505         }
506
507         oldstart = ei->i_fc_lblk_start;
508
509         if (update && ei->i_fc_lblk_len > 0) {
510                 ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
511                 ei->i_fc_lblk_len =
512                         max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
513                                 ei->i_fc_lblk_start + 1;
514         } else {
515                 ei->i_fc_lblk_start = __arg->start;
516                 ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
517         }
518
519         return 0;
520 }
521
522 void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
523                          ext4_lblk_t end)
524 {
525         struct __track_range_args args;
526         int ret;
527
528         if (S_ISDIR(inode->i_mode))
529                 return;
530
531         args.start = start;
532         args.end = end;
533
534         ret = ext4_fc_track_template(handle, inode,  __track_range, &args, 1);
535
536         trace_ext4_fc_track_range(inode, start, end, ret);
537 }
538
539 static void ext4_fc_submit_bh(struct super_block *sb)
540 {
541         int write_flags = REQ_SYNC;
542         struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
543
544         /* TODO: REQ_FUA | REQ_PREFLUSH is unnecessarily expensive. */
545         if (test_opt(sb, BARRIER))
546                 write_flags |= REQ_FUA | REQ_PREFLUSH;
547         lock_buffer(bh);
548         set_buffer_dirty(bh);
549         set_buffer_uptodate(bh);
550         bh->b_end_io = ext4_end_buffer_io_sync;
551         submit_bh(REQ_OP_WRITE, write_flags, bh);
552         EXT4_SB(sb)->s_fc_bh = NULL;
553 }
554
555 /* Ext4 commit path routines */
556
557 /* memzero and update CRC */
558 static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
559                                 u32 *crc)
560 {
561         void *ret;
562
563         ret = memset(dst, 0, len);
564         if (crc)
565                 *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
566         return ret;
567 }
568
569 /*
570  * Allocate len bytes on a fast commit buffer.
571  *
572  * During the commit time this function is used to manage fast commit
573  * block space. We don't split a fast commit log onto different
574  * blocks. So this function makes sure that if there's not enough space
575  * on the current block, the remaining space in the current block is
576  * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
577  * new block is from jbd2 and CRC is updated to reflect the padding
578  * we added.
579  */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
	struct ext4_fc_tl *tl;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh;
	int bsize = sbi->s_journal->j_blocksize;
	int ret, off = sbi->s_fc_bytes % bsize;	/* offset within current block */
	int pad_len;

	/*
	 * After allocating len, we should have space at least for a 0 byte
	 * padding.
	 */
	if (len + sizeof(struct ext4_fc_tl) > bsize)
		return NULL;

	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
		/*
		 * Only allocate from current buffer if we have enough space for
		 * this request AND we have space to add a zero byte padding.
		 */
		if (!sbi->s_fc_bh) {
			/* Lazily grab a journal block for this commit. */
			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
			if (ret)
				return NULL;
			sbi->s_fc_bh = bh;
		}
		sbi->s_fc_bytes += len;
		return sbi->s_fc_bh->b_data + off;
	}
	/*
	 * Need to add PAD tag: mark the rest of this block as unused, flush
	 * it, and retry the allocation at the start of a fresh block.
	 * NOTE(review): this path dereferences s_fc_bh without a NULL check —
	 * confirm it is unreachable while no buffer is held.
	 */
	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
	tl->fc_len = cpu_to_le16(pad_len);
	/* The CRC covers the pad TLV header and its zeroed payload. */
	if (crc)
		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
	if (pad_len > 0)
		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
	ext4_fc_submit_bh(sb);

	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
	if (ret)
		return NULL;
	sbi->s_fc_bh = bh;
	/* Advance s_fc_bytes to the new block's start plus this request. */
	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
	return sbi->s_fc_bh->b_data;
}
628
629 /* memcpy to fc reserved space and update CRC */
630 static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
631                                 int len, u32 *crc)
632 {
633         if (crc)
634                 *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
635         return memcpy(dst, src, len);
636 }
637
/*
 * Complete a fast commit by writing tail tag.
 *
 * Writing tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing tail tag, even if there's space remaining
 * in the block, next commit shouldn't use it. That's why tail tag
 * has the length as that of the remaining space on the block.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	int off, bsize = sbi->s_journal->j_blocksize;
	u8 *dst;

	/*
	 * ext4_fc_reserve_space takes care of allocating an extra block if
	 * there's not enough space on this block for accommodating this tail.
	 */
	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
	if (!dst)
		return -ENOSPC;

	off = sbi->s_fc_bytes % bsize;

	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
	/* Claim the rest of the block so the next commit cannot reuse it. */
	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
	dst += sizeof(tl);
	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
	dst += sizeof(tail.fc_tid);
	/* The CRC field is written last and is excluded from the CRC itself. */
	tail.fc_crc = cpu_to_le32(crc);
	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);

	ext4_fc_submit_bh(sb);

	return 0;
}
680
681 /*
682  * Adds tag, length, value and updates CRC. Returns true if tlv was added.
683  * Returns false if there's not enough space.
684  */
685 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
686                            u32 *crc)
687 {
688         struct ext4_fc_tl tl;
689         u8 *dst;
690
691         dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
692         if (!dst)
693                 return false;
694
695         tl.fc_tag = cpu_to_le16(tag);
696         tl.fc_len = cpu_to_le16(len);
697
698         ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
699         ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
700
701         return true;
702 }
703
704 /* Same as above, but adds dentry tlv. */
705 static  bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
706                                         int parent_ino, int ino, int dlen,
707                                         const unsigned char *dname,
708                                         u32 *crc)
709 {
710         struct ext4_fc_dentry_info fcd;
711         struct ext4_fc_tl tl;
712         u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
713                                         crc);
714
715         if (!dst)
716                 return false;
717
718         fcd.fc_parent_ino = cpu_to_le32(parent_ino);
719         fcd.fc_ino = cpu_to_le32(ino);
720         tl.fc_tag = cpu_to_le16(tag);
721         tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
722         ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
723         dst += sizeof(tl);
724         ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
725         dst += sizeof(fcd);
726         ext4_fc_memcpy(sb, dst, dname, dlen, crc);
727         dst += dlen;
728
729         return true;
730 }
731
/*
 * Writes the raw on-disk inode of @inode into the fast commit space as an
 * EXT4_FC_TAG_INODE TLV (inode number followed by the raw inode bytes),
 * updating *@crc. Returns 0 on success, error on failure.
 */
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
	int ret;
	struct ext4_iloc iloc;
	struct ext4_fc_inode fc_inode;
	struct ext4_fc_tl tl;
	u8 *dst;

	/* Locate the on-disk copy of the inode. */
	ret = ext4_get_inode_loc(inode, &iloc);
	if (ret)
		return ret;

	/* Include the extra (beyond good-old-size) portion when present. */
	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
		inode_len += ei->i_extra_isize;

	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));

	/* Reserve room for the TLV header, inode number and raw inode. */
	dst = ext4_fc_reserve_space(inode->i_sb,
			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
	if (!dst)
		return -ECANCELED;

	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
		return -ECANCELED;
	dst += sizeof(tl);
	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
		return -ECANCELED;
	dst += sizeof(fc_inode);
	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
					inode_len, crc))
		return -ECANCELED;

	return 0;
}
774
/*
 * Writes updated data ranges for the inode in question. Updates CRC.
 * Returns 0 on success, error otherwise.
 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_map_blocks map;
	struct ext4_fc_add_range fc_ext;
	struct ext4_fc_del_range lrange;
	struct ext4_extent *ex;
	int ret;

	/* Snapshot and clear the tracked range under i_fc_lock. */
	mutex_lock(&ei->i_fc_lock);
	if (ei->i_fc_lblk_len == 0) {
		mutex_unlock(&ei->i_fc_lock);
		return 0;
	}
	old_blk_size = ei->i_fc_lblk_start;
	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
	ei->i_fc_lblk_len = 0;
	mutex_unlock(&ei->i_fc_lock);

	cur_lblk_off = old_blk_size;
	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);

	/* Walk the range, emitting one TLV per contiguous mapping or hole. */
	while (cur_lblk_off <= new_blk_size) {
		map.m_lblk = cur_lblk_off;
		map.m_len = new_blk_size - cur_lblk_off + 1;
		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			return -ECANCELED;

		if (map.m_len == 0) {
			cur_lblk_off++;
			continue;
		}

		if (ret == 0) {
			/* Unmapped region: record it as a deleted range. */
			lrange.fc_ino = cpu_to_le32(inode->i_ino);
			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
			lrange.fc_len = cpu_to_le32(map.m_len);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
					    sizeof(lrange), (u8 *)&lrange, crc))
				return -ENOSPC;
		} else {
			/* Mapped region: record it as an extent to re-add. */
			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
			ex = (struct ext4_extent *)&fc_ext.fc_ex;
			ex->ee_block = cpu_to_le32(map.m_lblk);
			ex->ee_len = cpu_to_le16(map.m_len);
			ext4_ext_store_pblock(ex, map.m_pblk);
			if (map.m_flags & EXT4_MAP_UNWRITTEN)
				ext4_ext_mark_unwritten(ex);
			else
				ext4_ext_mark_initialized(ex);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
				return -ENOSPC;
		}

		cur_lblk_off += map.m_len;
	}

	return 0;
}
842
843
844 /* Submit data for all the fast commit inodes */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	struct list_head *pos;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	/* Mark the fs as committing so new updates get redirected/staged. */
	sbi->s_mount_flags |= EXT4_MF_FC_COMMITTING;
	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
		ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
		/*
		 * Wait for in-flight update sections on this inode to
		 * drain (i_fc_updates counts them). s_fc_lock must be
		 * dropped around schedule(); the re-check of i_fc_updates
		 * after prepare_to_wait() avoids a lost wakeup.
		 * NOTE(review): iteration continues via 'pos' after the
		 * lock is dropped below — assumes main-queue entries are
		 * not freed while EXT4_MF_FC_COMMITTING is set; confirm.
		 */
		while (atomic_read(&ei->i_fc_updates)) {
			DEFINE_WAIT(wait);

			prepare_to_wait(&ei->i_fc_wait, &wait,
						TASK_UNINTERRUPTIBLE);
			if (atomic_read(&ei->i_fc_updates)) {
				spin_unlock(&sbi->s_fc_lock);
				schedule();
				spin_lock(&sbi->s_fc_lock);
			}
			finish_wait(&ei->i_fc_wait, &wait);
		}
		spin_unlock(&sbi->s_fc_lock);
		/* Kick off writeback of this inode's dirty data pages. */
		ret = jbd2_submit_inode_data(ei->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}
880
881 /* Wait for completion of data for all the fast commit inodes */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *pos, *n;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		/* Only inodes tagged by ext4_fc_submit_inode_data_all(). */
		if (!ext4_test_inode_state(&pos->vfs_inode,
					   EXT4_STATE_FC_COMMITTING))
			continue;
		/*
		 * jbd2_wait_inode_data() can sleep, so drop the spinlock.
		 * NOTE(review): the _safe iterator's cached 'n' is relied
		 * on across the unlock — assumes entries are not freed
		 * during the commit; confirm.
		 */
		spin_unlock(&sbi->s_fc_lock);

		ret = jbd2_wait_inode_data(journal, pos->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return 0;
}
905
906 /* Commit all the directory entry updates */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_dentry_update *fc_dentry;
	struct inode *inode;
	struct list_head *pos, *n, *fcd_pos, *fcd_n;
	struct ext4_inode_info *ei;
	int ret;

	/*
	 * Called from ext4_fc_perform_commit() with sbi->s_fc_lock held and
	 * returns with it held (including on error, via lock_and_exit).
	 * The lock is dropped around every TLV write since those can block.
	 */
	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
		return 0;
	list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) {
		fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update,
					fcd_list);
		/* Link/unlink tags need only the dentry record itself. */
		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
			spin_unlock(&sbi->s_fc_lock);
			if (!ext4_fc_add_dentry_tlv(
				sb, fc_dentry->fcd_op,
				fc_dentry->fcd_parent, fc_dentry->fcd_ino,
				fc_dentry->fcd_name.len,
				fc_dentry->fcd_name.name, crc)) {
				ret = -ENOSPC;
				goto lock_and_exit;
			}
			spin_lock(&sbi->s_fc_lock);
			continue;
		}

		/* For creates, look up the in-memory inode for this dentry. */
		inode = NULL;
		list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
			ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
				inode = &ei->vfs_inode;
				break;
			}
		}
		/*
		 * If we don't find inode in our list, then it was deleted,
		 * in which case, we don't need to record it's create tag.
		 */
		if (!inode)
			continue;
		spin_unlock(&sbi->s_fc_lock);

		/*
		 * We first write the inode and then the create dirent. This
		 * allows the recovery code to create an unnamed inode first
		 * and then link it to a directory entry. This allows us
		 * to use namei.c routines almost as is and simplifies
		 * the recovery code.
		 */
		ret = ext4_fc_write_inode(inode, crc);
		if (ret)
			goto lock_and_exit;

		ret = ext4_fc_write_inode_data(inode, crc);
		if (ret)
			goto lock_and_exit;

		if (!ext4_fc_add_dentry_tlv(
			sb, fc_dentry->fcd_op,
			fc_dentry->fcd_parent, fc_dentry->fcd_ino,
			fc_dentry->fcd_name.len,
			fc_dentry->fcd_name.name, crc)) {
			ret = -ENOSPC;
			goto lock_and_exit;
		}

		spin_lock(&sbi->s_fc_lock);
	}
	return 0;
lock_and_exit:
	/* Error path: re-take the lock the caller expects us to hold. */
	spin_lock(&sbi->s_fc_lock);
	return ret;
}
983
984 static int ext4_fc_perform_commit(journal_t *journal)
985 {
986         struct super_block *sb = (struct super_block *)(journal->j_private);
987         struct ext4_sb_info *sbi = EXT4_SB(sb);
988         struct ext4_inode_info *iter;
989         struct ext4_fc_head head;
990         struct list_head *pos;
991         struct inode *inode;
992         struct blk_plug plug;
993         int ret = 0;
994         u32 crc = 0;
995
996         ret = ext4_fc_submit_inode_data_all(journal);
997         if (ret)
998                 return ret;
999
1000         ret = ext4_fc_wait_inode_data_all(journal);
1001         if (ret)
1002                 return ret;
1003
1004         blk_start_plug(&plug);
1005         if (sbi->s_fc_bytes == 0) {
1006                 /*
1007                  * Add a head tag only if this is the first fast commit
1008                  * in this TID.
1009                  */
1010                 head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1011                 head.fc_tid = cpu_to_le32(
1012                         sbi->s_journal->j_running_transaction->t_tid);
1013                 if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1014                         (u8 *)&head, &crc))
1015                         goto out;
1016         }
1017
1018         spin_lock(&sbi->s_fc_lock);
1019         ret = ext4_fc_commit_dentry_updates(journal, &crc);
1020         if (ret) {
1021                 spin_unlock(&sbi->s_fc_lock);
1022                 goto out;
1023         }
1024
1025         list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
1026                 iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
1027                 inode = &iter->vfs_inode;
1028                 if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1029                         continue;
1030
1031                 spin_unlock(&sbi->s_fc_lock);
1032                 ret = ext4_fc_write_inode_data(inode, &crc);
1033                 if (ret)
1034                         goto out;
1035                 ret = ext4_fc_write_inode(inode, &crc);
1036                 if (ret)
1037                         goto out;
1038                 spin_lock(&sbi->s_fc_lock);
1039         }
1040         spin_unlock(&sbi->s_fc_lock);
1041
1042         ret = ext4_fc_write_tail(sb, crc);
1043
1044 out:
1045         blk_finish_plug(&plug);
1046         return ret;
1047 }
1048
1049 /*
1050  * The main commit entry point. Performs a fast commit for transaction
1051  * commit_tid if needed. If it's not possible to perform a fast commit
1052  * due to various reasons, we fall back to full commit. Returns 0
1053  * on success, error otherwise.
1054  */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
	struct super_block *sb = (struct super_block *)(journal->j_private);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int nblks = 0, ret, bsize = journal->j_blocksize;
	int subtid = atomic_read(&sbi->s_fc_subtid);
	int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
	ktime_t start_time, commit_time;

	trace_ext4_fc_commit_start(sb);

	start_time = ktime_get();

	/* Bail out to a full commit if fast commits are off or ineligible. */
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
		(ext4_fc_is_ineligible(sb))) {
		reason = EXT4_FC_REASON_INELIGIBLE;
		goto out;
	}

restart_fc:
	ret = jbd2_fc_begin_commit(journal, commit_tid);
	if (ret == -EALREADY) {
		/* There was an ongoing commit, check if we need to restart */
		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
			commit_tid > journal->j_commit_sequence)
			goto restart_fc;
		reason = EXT4_FC_REASON_ALREADY_COMMITTED;
		goto out;
	} else if (ret) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_START_FAILED;
		goto out;
	}

	/* Snapshot fc-area usage so we can count the blocks added below. */
	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
	ret = ext4_fc_perform_commit(journal);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
	ret = jbd2_fc_wait_bufs(journal, nblks);
	if (ret < 0) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_FC_FAILED;
		goto out;
	}
	atomic_inc(&sbi->s_fc_subtid);
	jbd2_fc_end_commit(journal);
out:
	/* Has any ineligible update happened since we started? */
	if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
		reason = EXT4_FC_REASON_INELIGIBLE;
	}

	/* Update commit statistics under s_fc_lock. */
	spin_lock(&sbi->s_fc_lock);
	if (reason != EXT4_FC_REASON_OK &&
		reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
		sbi->s_fc_stats.fc_ineligible_commits++;
	} else {
		sbi->s_fc_stats.fc_num_commits++;
		sbi->s_fc_stats.fc_numblks += nblks;
	}
	spin_unlock(&sbi->s_fc_lock);
	nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
	trace_ext4_fc_commit_stop(sb, nblks, reason);
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	/*
	 * weight the commit time higher than the average time so we don't
	 * react too strongly to vast changes in the commit time
	 */
	if (likely(sbi->s_fc_avg_commit_time))
		sbi->s_fc_avg_commit_time = (commit_time +
				sbi->s_fc_avg_commit_time * 3) / 4;
	else
		sbi->s_fc_avg_commit_time = commit_time;
	jbd_debug(1,
		"Fast commit ended with blks = %d, reason = %d, subtid - %d",
		nblks, reason, subtid);
	/* On failure/ineligibility, fall back to a full jbd2 commit. */
	if (reason == EXT4_FC_REASON_FC_FAILED)
		return jbd2_fc_end_commit_fallback(journal);
	if (reason == EXT4_FC_REASON_FC_START_FAILED ||
		reason == EXT4_FC_REASON_INELIGIBLE)
		return jbd2_complete_transaction(journal, commit_tid);
	return 0;
}
1143
1144 /*
1145  * Fast commit cleanup routine. This is called after every fast commit and
1146  * full commit. full is true if we are called after a full commit.
1147  */
1148 static void ext4_fc_cleanup(journal_t *journal, int full)
1149 {
1150         struct super_block *sb = journal->j_private;
1151         struct ext4_sb_info *sbi = EXT4_SB(sb);
1152         struct ext4_inode_info *iter;
1153         struct ext4_fc_dentry_update *fc_dentry;
1154         struct list_head *pos, *n;
1155
1156         if (full && sbi->s_fc_bh)
1157                 sbi->s_fc_bh = NULL;
1158
1159         jbd2_fc_release_bufs(journal);
1160
1161         spin_lock(&sbi->s_fc_lock);
1162         list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
1163                 iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
1164                 list_del_init(&iter->i_fc_list);
1165                 ext4_clear_inode_state(&iter->vfs_inode,
1166                                        EXT4_STATE_FC_COMMITTING);
1167                 ext4_fc_reset_inode(&iter->vfs_inode);
1168                 /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1169                 smp_mb();
1170 #if (BITS_PER_LONG < 64)
1171                 wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1172 #else
1173                 wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1174 #endif
1175         }
1176
1177         while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1178                 fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1179                                              struct ext4_fc_dentry_update,
1180                                              fcd_list);
1181                 list_del_init(&fc_dentry->fcd_list);
1182                 spin_unlock(&sbi->s_fc_lock);
1183
1184                 if (fc_dentry->fcd_name.name &&
1185                         fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1186                         kfree(fc_dentry->fcd_name.name);
1187                 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1188                 spin_lock(&sbi->s_fc_lock);
1189         }
1190
1191         list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1192                                 &sbi->s_fc_dentry_q[FC_Q_MAIN]);
1193         list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1194                                 &sbi->s_fc_q[FC_Q_STAGING]);
1195
1196         sbi->s_mount_flags &= ~EXT4_MF_FC_COMMITTING;
1197         sbi->s_mount_flags &= ~EXT4_MF_FC_INELIGIBLE;
1198
1199         if (full)
1200                 sbi->s_fc_bytes = 0;
1201         spin_unlock(&sbi->s_fc_lock);
1202         trace_ext4_fc_stats(sb);
1203 }
1204
1205 /* Ext4 Replay Path Routines */
1206
1207 /* Get length of a particular tlv */
1208 static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl)
1209 {
1210         return le16_to_cpu(tl->fc_len);
1211 }
1212
1213 /* Get a pointer to "value" of a tlv */
1214 static inline u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl)
1215 {
1216         return (u8 *)tl + sizeof(*tl);
1217 }
1218
1219 /* Helper struct for dentry replay routines */
struct dentry_info_args {
	/*
	 * parent_ino/ino: parent directory and target inode numbers.
	 * dname_len: length of dname, which points into the fast commit
	 * TLV buffer (see tl_to_darg()) and is not NUL-terminated.
	 * inode_len: NOTE(review): not set by tl_to_darg() — confirm
	 * which callers populate it before relying on its value.
	 */
	int parent_ino, dname_len, ino, inode_len;
	char *dname;
};
1224
1225 static inline void tl_to_darg(struct dentry_info_args *darg,
1226                                 struct  ext4_fc_tl *tl)
1227 {
1228         struct ext4_fc_dentry_info *fcd;
1229
1230         fcd = (struct ext4_fc_dentry_info *)ext4_fc_tag_val(tl);
1231
1232         darg->parent_ino = le32_to_cpu(fcd->fc_parent_ino);
1233         darg->ino = le32_to_cpu(fcd->fc_ino);
1234         darg->dname = fcd->fc_dname;
1235         darg->dname_len = ext4_fc_tag_len(tl) -
1236                         sizeof(struct ext4_fc_dentry_info);
1237 }
1238
1239 /* Unlink replay function */
1240 static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl)
1241 {
1242         struct inode *inode, *old_parent;
1243         struct qstr entry;
1244         struct dentry_info_args darg;
1245         int ret = 0;
1246
1247         tl_to_darg(&darg, tl);
1248
1249         trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1250                         darg.parent_ino, darg.dname_len);
1251
1252         entry.name = darg.dname;
1253         entry.len = darg.dname_len;
1254         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1255
1256         if (IS_ERR_OR_NULL(inode)) {
1257                 jbd_debug(1, "Inode %d not found", darg.ino);
1258                 return 0;
1259         }
1260
1261         old_parent = ext4_iget(sb, darg.parent_ino,
1262                                 EXT4_IGET_NORMAL);
1263         if (IS_ERR_OR_NULL(old_parent)) {
1264                 jbd_debug(1, "Dir with inode  %d not found", darg.parent_ino);
1265                 iput(inode);
1266                 return 0;
1267         }
1268
1269         ret = __ext4_unlink(NULL, old_parent, &entry, inode);
1270         /* -ENOENT ok coz it might not exist anymore. */
1271         if (ret == -ENOENT)
1272                 ret = 0;
1273         iput(old_parent);
1274         iput(inode);
1275         return ret;
1276 }
1277
1278 static int ext4_fc_replay_link_internal(struct super_block *sb,
1279                                 struct dentry_info_args *darg,
1280                                 struct inode *inode)
1281 {
1282         struct inode *dir = NULL;
1283         struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1284         struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1285         int ret = 0;
1286
1287         dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1288         if (IS_ERR(dir)) {
1289                 jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
1290                 dir = NULL;
1291                 goto out;
1292         }
1293
1294         dentry_dir = d_obtain_alias(dir);
1295         if (IS_ERR(dentry_dir)) {
1296                 jbd_debug(1, "Failed to obtain dentry");
1297                 dentry_dir = NULL;
1298                 goto out;
1299         }
1300
1301         dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1302         if (!dentry_inode) {
1303                 jbd_debug(1, "Inode dentry not created.");
1304                 ret = -ENOMEM;
1305                 goto out;
1306         }
1307
1308         ret = __ext4_link(dir, inode, dentry_inode);
1309         /*
1310          * It's possible that link already existed since data blocks
1311          * for the dir in question got persisted before we crashed OR
1312          * we replayed this tag and crashed before the entire replay
1313          * could complete.
1314          */
1315         if (ret && ret != -EEXIST) {
1316                 jbd_debug(1, "Failed to link\n");
1317                 goto out;
1318         }
1319
1320         ret = 0;
1321 out:
1322         if (dentry_dir) {
1323                 d_drop(dentry_dir);
1324                 dput(dentry_dir);
1325         } else if (dir) {
1326                 iput(dir);
1327         }
1328         if (dentry_inode) {
1329                 d_drop(dentry_inode);
1330                 dput(dentry_inode);
1331         }
1332
1333         return ret;
1334 }
1335
1336 /* Link replay function */
1337 static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl)
1338 {
1339         struct inode *inode;
1340         struct dentry_info_args darg;
1341         int ret = 0;
1342
1343         tl_to_darg(&darg, tl);
1344         trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1345                         darg.parent_ino, darg.dname_len);
1346
1347         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1348         if (IS_ERR_OR_NULL(inode)) {
1349                 jbd_debug(1, "Inode not found.");
1350                 return 0;
1351         }
1352
1353         ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1354         iput(inode);
1355         return ret;
1356 }
1357
1358 /*
1359  * Record all the modified inodes during replay. We use this later to setup
1360  * block bitmaps correctly.
1361  */
1362 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1363 {
1364         struct ext4_fc_replay_state *state;
1365         int i;
1366
1367         state = &EXT4_SB(sb)->s_fc_replay_state;
1368         for (i = 0; i < state->fc_modified_inodes_used; i++)
1369                 if (state->fc_modified_inodes[i] == ino)
1370                         return 0;
1371         if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1372                 state->fc_modified_inodes_size +=
1373                         EXT4_FC_REPLAY_REALLOC_INCREMENT;
1374                 state->fc_modified_inodes = krealloc(
1375                                         state->fc_modified_inodes, sizeof(int) *
1376                                         state->fc_modified_inodes_size,
1377                                         GFP_KERNEL);
1378                 if (!state->fc_modified_inodes)
1379                         return -ENOMEM;
1380         }
1381         state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1382         return 0;
1383 }
1384
1385 /*
1386  * Inode replay function
1387  */
1388 static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl)
1389 {
1390         struct ext4_fc_inode *fc_inode;
1391         struct ext4_inode *raw_inode;
1392         struct ext4_inode *raw_fc_inode;
1393         struct inode *inode = NULL;
1394         struct ext4_iloc iloc;
1395         int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1396         struct ext4_extent_header *eh;
1397
1398         fc_inode = (struct ext4_fc_inode *)ext4_fc_tag_val(tl);
1399
1400         ino = le32_to_cpu(fc_inode->fc_ino);
1401         trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1402
1403         inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1404         if (!IS_ERR_OR_NULL(inode)) {
1405                 ext4_ext_clear_bb(inode);
1406                 iput(inode);
1407         }
1408
1409         ext4_fc_record_modified_inode(sb, ino);
1410
1411         raw_fc_inode = (struct ext4_inode *)fc_inode->fc_raw_inode;
1412         ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1413         if (ret)
1414                 goto out;
1415
1416         inode_len = ext4_fc_tag_len(tl) - sizeof(struct ext4_fc_inode);
1417         raw_inode = ext4_raw_inode(&iloc);
1418
1419         memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1420         memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1421                 inode_len - offsetof(struct ext4_inode, i_generation));
1422         if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1423                 eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1424                 if (eh->eh_magic != EXT4_EXT_MAGIC) {
1425                         memset(eh, 0, sizeof(*eh));
1426                         eh->eh_magic = EXT4_EXT_MAGIC;
1427                         eh->eh_max = cpu_to_le16(
1428                                 (sizeof(raw_inode->i_block) -
1429                                  sizeof(struct ext4_extent_header))
1430                                  / sizeof(struct ext4_extent));
1431                 }
1432         } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1433                 memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1434                         sizeof(raw_inode->i_block));
1435         }
1436
1437         /* Immediately update the inode on disk. */
1438         ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1439         if (ret)
1440                 goto out;
1441         ret = sync_dirty_buffer(iloc.bh);
1442         if (ret)
1443                 goto out;
1444         ret = ext4_mark_inode_used(sb, ino);
1445         if (ret)
1446                 goto out;
1447
1448         /* Given that we just wrote the inode on disk, this SHOULD succeed. */
1449         inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1450         if (IS_ERR_OR_NULL(inode)) {
1451                 jbd_debug(1, "Inode not found.");
1452                 return -EFSCORRUPTED;
1453         }
1454
1455         /*
1456          * Our allocator could have made different decisions than before
1457          * crashing. This should be fixed but until then, we calculate
1458          * the number of blocks the inode.
1459          */
1460         ext4_ext_replay_set_iblocks(inode);
1461
1462         inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1463         ext4_reset_inode_seed(inode);
1464
1465         ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1466         ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1467         sync_dirty_buffer(iloc.bh);
1468         brelse(iloc.bh);
1469 out:
1470         iput(inode);
1471         if (!ret)
1472                 blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
1473
1474         return 0;
1475 }
1476
1477 /*
1478  * Dentry create replay function.
1479  *
1480  * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
1481  * inode for which we are trying to create a dentry here, should already have
1482  * been replayed before we start here.
1483  */
1484 static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl)
1485 {
1486         int ret = 0;
1487         struct inode *inode = NULL;
1488         struct inode *dir = NULL;
1489         struct dentry_info_args darg;
1490
1491         tl_to_darg(&darg, tl);
1492
1493         trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1494                         darg.parent_ino, darg.dname_len);
1495
1496         /* This takes care of update group descriptor and other metadata */
1497         ret = ext4_mark_inode_used(sb, darg.ino);
1498         if (ret)
1499                 goto out;
1500
1501         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1502         if (IS_ERR_OR_NULL(inode)) {
1503                 jbd_debug(1, "inode %d not found.", darg.ino);
1504                 inode = NULL;
1505                 ret = -EINVAL;
1506                 goto out;
1507         }
1508
1509         if (S_ISDIR(inode->i_mode)) {
1510                 /*
1511                  * If we are creating a directory, we need to make sure that the
1512                  * dot and dot dot dirents are setup properly.
1513                  */
1514                 dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1515                 if (IS_ERR_OR_NULL(dir)) {
1516                         jbd_debug(1, "Dir %d not found.", darg.ino);
1517                         goto out;
1518                 }
1519                 ret = ext4_init_new_dir(NULL, dir, inode);
1520                 iput(dir);
1521                 if (ret) {
1522                         ret = 0;
1523                         goto out;
1524                 }
1525         }
1526         ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1527         if (ret)
1528                 goto out;
1529         set_nlink(inode, 1);
1530         ext4_mark_inode_dirty(NULL, inode);
1531 out:
1532         if (inode)
1533                 iput(inode);
1534         return ret;
1535 }
1536
1537 /*
1538  * Record physical disk regions which are in use as per fast commit area. Our
1539  * simple replay phase allocator excludes these regions from allocation.
1540  */
1541 static int ext4_fc_record_regions(struct super_block *sb, int ino,
1542                 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
1543 {
1544         struct ext4_fc_replay_state *state;
1545         struct ext4_fc_alloc_region *region;
1546
1547         state = &EXT4_SB(sb)->s_fc_replay_state;
1548         if (state->fc_regions_used == state->fc_regions_size) {
1549                 state->fc_regions_size +=
1550                         EXT4_FC_REPLAY_REALLOC_INCREMENT;
1551                 state->fc_regions = krealloc(
1552                                         state->fc_regions,
1553                                         state->fc_regions_size *
1554                                         sizeof(struct ext4_fc_alloc_region),
1555                                         GFP_KERNEL);
1556                 if (!state->fc_regions)
1557                         return -ENOMEM;
1558         }
1559         region = &state->fc_regions[state->fc_regions_used++];
1560         region->ino = ino;
1561         region->lblk = lblk;
1562         region->pblk = pblk;
1563         region->len = len;
1564
1565         return 0;
1566 }
1567
/*
 * Replay an ADD_RANGE tag: ensure the logical block range recorded in
 * the TLV maps to the recorded physical blocks (and unwritten state),
 * inserting, remapping or converting extents as required.
 *
 * All failure paths return 0 so replay of subsequent tags can proceed;
 * the inode is simply left as-is for this tag.
 */
static int ext4_fc_replay_add_range(struct super_block *sb,
				struct ext4_fc_tl *tl)
{
	struct ext4_fc_add_range *fc_add_ex;
	struct ext4_extent newex, *ex;
	struct inode *inode;
	ext4_lblk_t start, cur;
	int remaining, len;
	ext4_fsblk_t start_pblk;
	struct ext4_map_blocks map;
	struct ext4_ext_path *path = NULL;
	int ret;

	fc_add_ex = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
	ex = (struct ext4_extent *)&fc_add_ex->fc_ex;

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
		le32_to_cpu(fc_add_ex->fc_ino), le32_to_cpu(ex->ee_block),
		ext4_ext_get_actual_len(ex));

	/* A missing inode is not fatal: skip the tag and continue replay. */
	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino),
				EXT4_IGET_NORMAL);
	if (IS_ERR_OR_NULL(inode)) {
		jbd_debug(1, "Inode not found.");
		return 0;
	}

	/* NOTE(review): return value ignored here -- TODO confirm intended. */
	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);

	start = le32_to_cpu(ex->ee_block);
	start_pblk = ext4_ext_pblock(ex);
	len = ext4_ext_get_actual_len(ex);

	cur = start;
	remaining = len;
	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
		  inode->i_ino);

	/*
	 * Walk the recorded range chunk by chunk; ext4_map_blocks() reports
	 * how the next chunk is currently mapped (if at all), and each
	 * iteration reconciles one chunk with the recorded mapping.
	 */
	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;
		map.m_pblk = 0;
		ret = ext4_map_blocks(NULL, inode, &map, 0);

		if (ret < 0) {
			iput(inode);
			return 0;
		}

		if (ret == 0) {
			/* Range is not mapped: insert a fresh extent for it. */
			path = ext4_find_extent(inode, cur, NULL, 0);
			if (IS_ERR(path)) {
				iput(inode);
				return 0;
			}
			memset(&newex, 0, sizeof(newex));
			newex.ee_block = cpu_to_le32(cur);
			ext4_ext_store_pblock(
				&newex, start_pblk + cur - start);
			newex.ee_len = cpu_to_le16(map.m_len);
			if (ext4_ext_is_unwritten(ex))
				ext4_ext_mark_unwritten(&newex);
			down_write(&EXT4_I(inode)->i_data_sem);
			ret = ext4_ext_insert_extent(
				NULL, inode, &path, &newex, 0);
			up_write((&EXT4_I(inode)->i_data_sem));
			ext4_ext_drop_refs(path);
			kfree(path);
			if (ret) {
				iput(inode);
				return 0;
			}
			goto next;
		}

		if (start_pblk + cur - start != map.m_pblk) {
			/*
			 * Logical to physical mapping changed. This can happen
			 * if this range was removed and then reallocated to
			 * map to new physical blocks during a fast commit.
			 */
			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex),
					start_pblk + cur - start);
			if (ret) {
				iput(inode);
				return 0;
			}
			/*
			 * Mark the old blocks as free since they aren't used
			 * anymore. We maintain an array of all the modified
			 * inodes. In case these blocks are still used at either
			 * a different logical range in the same inode or in
			 * some different inode, we will mark them as allocated
			 * at the end of the FC replay using our array of
			 * modified inodes.
			 */
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
			goto next;
		}

		/* Range is mapped and needs a state change (written/unwritten). */
		jbd_debug(1, "Converting from %d to %d %lld",
				map.m_flags & EXT4_MAP_UNWRITTEN,
			ext4_ext_is_unwritten(ex), map.m_pblk);
		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex), map.m_pblk);
		if (ret) {
			iput(inode);
			return 0;
		}
		/*
		 * We may have split the extent tree while toggling the state.
		 * Try to shrink the extent tree now.
		 */
		ext4_ext_replay_shrink_inode(inode, start + len);
next:
		cur += map.m_len;
		remaining -= map.m_len;
	}
	/* Trim the extent tree back to i_size now that the range is mapped. */
	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
					sb->s_blocksize_bits);
	iput(inode);
	return 0;
}
1696
1697 /* Replay DEL_RANGE tag */
1698 static int
1699 ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl)
1700 {
1701         struct inode *inode;
1702         struct ext4_fc_del_range *lrange;
1703         struct ext4_map_blocks map;
1704         ext4_lblk_t cur, remaining;
1705         int ret;
1706
1707         lrange = (struct ext4_fc_del_range *)ext4_fc_tag_val(tl);
1708         cur = le32_to_cpu(lrange->fc_lblk);
1709         remaining = le32_to_cpu(lrange->fc_len);
1710
1711         trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1712                 le32_to_cpu(lrange->fc_ino), cur, remaining);
1713
1714         inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL);
1715         if (IS_ERR_OR_NULL(inode)) {
1716                 jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino));
1717                 return 0;
1718         }
1719
1720         ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1721
1722         jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1723                         inode->i_ino, le32_to_cpu(lrange->fc_lblk),
1724                         le32_to_cpu(lrange->fc_len));
1725         while (remaining > 0) {
1726                 map.m_lblk = cur;
1727                 map.m_len = remaining;
1728
1729                 ret = ext4_map_blocks(NULL, inode, &map, 0);
1730                 if (ret < 0) {
1731                         iput(inode);
1732                         return 0;
1733                 }
1734                 if (ret > 0) {
1735                         remaining -= ret;
1736                         cur += ret;
1737                         ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1738                 } else {
1739                         remaining -= map.m_len;
1740                         cur += map.m_len;
1741                 }
1742         }
1743
1744         ret = ext4_punch_hole(inode,
1745                 le32_to_cpu(lrange->fc_lblk) << sb->s_blocksize_bits,
1746                 le32_to_cpu(lrange->fc_len) <<  sb->s_blocksize_bits);
1747         if (ret)
1748                 jbd_debug(1, "ext4_punch_hole returned %d", ret);
1749         ext4_ext_replay_shrink_inode(inode,
1750                 i_size_read(inode) >> sb->s_blocksize_bits);
1751         ext4_mark_inode_dirty(NULL, inode);
1752         iput(inode);
1753
1754         return 0;
1755 }
1756
1757 static inline const char *tag2str(u16 tag)
1758 {
1759         switch (tag) {
1760         case EXT4_FC_TAG_LINK:
1761                 return "TAG_ADD_ENTRY";
1762         case EXT4_FC_TAG_UNLINK:
1763                 return "TAG_DEL_ENTRY";
1764         case EXT4_FC_TAG_ADD_RANGE:
1765                 return "TAG_ADD_RANGE";
1766         case EXT4_FC_TAG_CREAT:
1767                 return "TAG_CREAT_DENTRY";
1768         case EXT4_FC_TAG_DEL_RANGE:
1769                 return "TAG_DEL_RANGE";
1770         case EXT4_FC_TAG_INODE:
1771                 return "TAG_INODE";
1772         case EXT4_FC_TAG_PAD:
1773                 return "TAG_PAD";
1774         case EXT4_FC_TAG_TAIL:
1775                 return "TAG_TAIL";
1776         case EXT4_FC_TAG_HEAD:
1777                 return "TAG_HEAD";
1778         default:
1779                 return "TAG_ERROR";
1780         }
1781 }
1782
/*
 * Final accounting pass of the replay: for every inode recorded in the
 * modified-inodes array, walk its mapped ranges and mark both the data
 * blocks and the extent tree index blocks on the path to each range as
 * allocated in the block bitmaps and counters.
 */
static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
	struct ext4_fc_replay_state *state;
	struct inode *inode;
	struct ext4_ext_path *path = NULL;
	struct ext4_map_blocks map;
	int i, ret, j;
	ext4_lblk_t cur, end;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++) {
		inode = ext4_iget(sb, state->fc_modified_inodes[i],
			EXT4_IGET_NORMAL);
		if (IS_ERR_OR_NULL(inode)) {
			jbd_debug(1, "Inode %d not found.",
				state->fc_modified_inodes[i]);
			continue;
		}
		/* Scan the inode's entire logical block address space. */
		cur = 0;
		end = EXT_MAX_BLOCKS;
		while (cur < end) {
			map.m_lblk = cur;
			map.m_len = end - cur;

			ret = ext4_map_blocks(NULL, inode, &map, 0);
			if (ret < 0)
				break;

			if (ret > 0) {
				/*
				 * Mark each extent tree index block on the
				 * path to this range, then the mapped data
				 * blocks themselves.
				 */
				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
				if (!IS_ERR_OR_NULL(path)) {
					for (j = 0; j < path->p_depth; j++)
						ext4_mb_mark_bb(inode->i_sb,
							path[j].p_block, 1, 1);
					ext4_ext_drop_refs(path);
					kfree(path);
				}
				cur += ret;
				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
							map.m_len, 1);
			} else {
				/* Hole: advance by at least one block. */
				cur = cur + (map.m_len ? map.m_len : 1);
			}
		}
		iput(inode);
	}
}
1830
1831 /*
1832  * Check if block is in excluded regions for block allocation. The simple
1833  * allocator that runs during replay phase is calls this function to see
1834  * if it is okay to use a block.
1835  */
1836 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1837 {
1838         int i;
1839         struct ext4_fc_replay_state *state;
1840
1841         state = &EXT4_SB(sb)->s_fc_replay_state;
1842         for (i = 0; i < state->fc_regions_valid; i++) {
1843                 if (state->fc_regions[i].ino == 0 ||
1844                         state->fc_regions[i].len == 0)
1845                         continue;
1846                 if (blk >= state->fc_regions[i].pblk &&
1847                     blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1848                         return true;
1849         }
1850         return false;
1851 }
1852
1853 /* Cleanup function called after replay */
1854 void ext4_fc_replay_cleanup(struct super_block *sb)
1855 {
1856         struct ext4_sb_info *sbi = EXT4_SB(sb);
1857
1858         sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1859         kfree(sbi->s_fc_replay_state.fc_regions);
1860         kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1861 }
1862
1863 /*
1864  * Recovery Scan phase handler
1865  *
1866  * This function is called during the scan phase and is responsible
1867  * for doing following things:
1868  * - Make sure the fast commit area has valid tags for replay
1869  * - Count number of tags that need to be replayed by the replay handler
1870  * - Verify CRC
1871  * - Create a list of excluded blocks for allocation during replay phase
1872  *
1873  * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1874  * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1875  * to indicate that scan has finished and JBD2 can now start replay phase.
1876  * It returns a negative error to indicate that there was an error. At the end
1877  * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
 * to indicate the number of tags that need to be replayed during the replay
 * phase.
1879  */
static int ext4_fc_replay_scan(journal_t *journal,
				struct buffer_head *bh, int off,
				tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_replay_state *state;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_add_range *ext;
	struct ext4_fc_tl *tl;
	struct ext4_fc_tail *tail;
	__u8 *start, *end;
	struct ext4_fc_head *head;
	struct ext4_extent *ex;

	state = &sbi->s_fc_replay_state;

	/* TLVs are parsed out of this journal block's data. */
	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	if (state->fc_replay_expected_off == 0) {
		/* First fast commit block: reset the whole scan state. */
		state->fc_cur_tag = 0;
		state->fc_replay_num_tags = 0;
		state->fc_crc = 0;
		state->fc_regions = NULL;
		state->fc_regions_valid = state->fc_regions_used =
			state->fc_regions_size = 0;
		/* Check if we can stop early */
		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
			!= EXT4_FC_TAG_HEAD)
			return 0;
	}

	/* Blocks must arrive in order; anything else means corruption. */
	if (off != state->fc_replay_expected_off) {
		ret = -EFSCORRUPTED;
		goto out_err;
	}

	state->fc_replay_expected_off++;
	fc_for_each_tl(start, end, tl) {
		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
			  tag2str(le16_to_cpu(tl->fc_tag)), bh->b_blocknr);
		switch (le16_to_cpu(tl->fc_tag)) {
		case EXT4_FC_TAG_ADD_RANGE:
			/*
			 * Record the physical range as excluded from the
			 * replay-phase allocator, then fall through to the
			 * common tag counting / CRC accumulation below.
			 */
			ext = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
			ex = (struct ext4_extent *)&ext->fc_ex;
			ret = ext4_fc_record_regions(sb,
				le32_to_cpu(ext->fc_ino),
				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
				ext4_ext_get_actual_len(ex));
			if (ret < 0)
				break;
			ret = JBD2_FC_REPLAY_CONTINUE;
			fallthrough;
		case EXT4_FC_TAG_DEL_RANGE:
		case EXT4_FC_TAG_LINK:
		case EXT4_FC_TAG_UNLINK:
		case EXT4_FC_TAG_CREAT:
		case EXT4_FC_TAG_INODE:
		case EXT4_FC_TAG_PAD:
			/* Count the tag and fold it into the running CRC. */
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
					sizeof(*tl) + ext4_fc_tag_len(tl));
			break;
		case EXT4_FC_TAG_TAIL:
			/*
			 * The tail carries the tid and the CRC of everything
			 * up to (but excluding) its fc_crc field. Only if
			 * both match do the tags scanned so far become valid
			 * for replay.
			 */
			state->fc_cur_tag++;
			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
						sizeof(*tl) +
						offsetof(struct ext4_fc_tail,
						fc_crc));
			if (le32_to_cpu(tail->fc_tid) == expected_tid &&
				le32_to_cpu(tail->fc_crc) == state->fc_crc) {
				state->fc_replay_num_tags = state->fc_cur_tag;
				state->fc_regions_valid =
					state->fc_regions_used;
			} else {
				ret = state->fc_replay_num_tags ?
					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
			}
			state->fc_crc = 0;
			break;
		case EXT4_FC_TAG_HEAD:
			/* Validate feature flags and the expected tid. */
			head = (struct ext4_fc_head *)ext4_fc_tag_val(tl);
			if (le32_to_cpu(head->fc_features) &
				~EXT4_FC_SUPPORTED_FEATURES) {
				ret = -EOPNOTSUPP;
				break;
			}
			if (le32_to_cpu(head->fc_tid) != expected_tid) {
				ret = JBD2_FC_REPLAY_STOP;
				break;
			}
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
					sizeof(*tl) + ext4_fc_tag_len(tl));
			break;
		default:
			/*
			 * Unknown tag: stop if a valid commit was already
			 * scanned, otherwise fail the scan.
			 */
			ret = state->fc_replay_num_tags ?
				JBD2_FC_REPLAY_STOP : -ECANCELED;
		}
		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
			break;
	}

out_err:
	trace_ext4_fc_replay_scan(sb, ret, off);
	return ret;
}
1989
1990 /*
1991  * Main recovery path entry point.
1992  * The meaning of return codes is similar as above.
1993  */
static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
				enum passtype pass, int off, tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl *tl;
	__u8 *start, *end;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
	struct ext4_fc_tail *tail;

	/* The scan pass is handled entirely by ext4_fc_replay_scan(). */
	if (pass == PASS_SCAN) {
		state->fc_current_pass = PASS_SCAN;
		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
	}

	/* First block of the replay pass: flag that replay is in progress. */
	if (state->fc_current_pass != pass) {
		state->fc_current_pass = pass;
		sbi->s_mount_state |= EXT4_FC_REPLAY;
	}
	/* Nothing valid to replay: finalize bitmaps/counters and stop. */
	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
		jbd_debug(1, "Replay stops\n");
		ext4_fc_set_bitmaps_and_counters(sb);
		return 0;
	}

#ifdef CONFIG_EXT4_DEBUG
	/* Debug knob: artificially cap how many fc blocks get replayed. */
	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
		pr_warn("Dropping fc block %d because max_replay set\n", off);
		return JBD2_FC_REPLAY_STOP;
	}
#endif

	start = (u8 *)bh->b_data;
	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;

	/*
	 * Dispatch each TLV to its replay handler, consuming the tag budget
	 * established by the scan pass.
	 */
	fc_for_each_tl(start, end, tl) {
		if (state->fc_replay_num_tags == 0) {
			ret = JBD2_FC_REPLAY_STOP;
			ext4_fc_set_bitmaps_and_counters(sb);
			break;
		}
		jbd_debug(3, "Replay phase, tag:%s\n",
				tag2str(le16_to_cpu(tl->fc_tag)));
		state->fc_replay_num_tags--;
		switch (le16_to_cpu(tl->fc_tag)) {
		case EXT4_FC_TAG_LINK:
			ret = ext4_fc_replay_link(sb, tl);
			break;
		case EXT4_FC_TAG_UNLINK:
			ret = ext4_fc_replay_unlink(sb, tl);
			break;
		case EXT4_FC_TAG_ADD_RANGE:
			ret = ext4_fc_replay_add_range(sb, tl);
			break;
		case EXT4_FC_TAG_CREAT:
			ret = ext4_fc_replay_create(sb, tl);
			break;
		case EXT4_FC_TAG_DEL_RANGE:
			ret = ext4_fc_replay_del_range(sb, tl);
			break;
		case EXT4_FC_TAG_INODE:
			ret = ext4_fc_replay_inode(sb, tl);
			break;
		case EXT4_FC_TAG_PAD:
			/* Padding carries no state; just trace it. */
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
				ext4_fc_tag_len(tl), 0);
			break;
		case EXT4_FC_TAG_TAIL:
			/* Tail tid was validated during scan; warn if off. */
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
				ext4_fc_tag_len(tl), 0);
			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
			WARN_ON(le32_to_cpu(tail->fc_tid) != expected_tid);
			break;
		case EXT4_FC_TAG_HEAD:
			break;
		default:
			trace_ext4_fc_replay(sb, le16_to_cpu(tl->fc_tag), 0,
				ext4_fc_tag_len(tl), 0);
			ret = -ECANCELED;
			break;
		}
		if (ret < 0)
			break;
		ret = JBD2_FC_REPLAY_CONTINUE;
	}
	return ret;
}
2082
2083 void ext4_fc_init(struct super_block *sb, journal_t *journal)
2084 {
2085         /*
2086          * We set replay callback even if fast commit disabled because we may
2087          * could still have fast commit blocks that need to be replayed even if
2088          * fast commit has now been turned off.
2089          */
2090         journal->j_fc_replay_callback = ext4_fc_replay;
2091         if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2092                 return;
2093         journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2094 }
2095
/*
 * Human readable strings printed by ext4_fc_info_show() for each fast
 * commit ineligibility reason. NOTE(review): the array is indexed up to
 * EXT4_FC_REASON_MAX, so its order must stay in sync with the
 * EXT4_FC_REASON_* values -- keep both in step when adding a reason.
 */
const char *fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"FC Commit Failed"
};
2107
2108 int ext4_fc_info_show(struct seq_file *seq, void *v)
2109 {
2110         struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2111         struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2112         int i;
2113
2114         if (v != SEQ_START_TOKEN)
2115                 return 0;
2116
2117         seq_printf(seq,
2118                 "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2119                    stats->fc_num_commits, stats->fc_ineligible_commits,
2120                    stats->fc_numblks,
2121                    div_u64(sbi->s_fc_avg_commit_time, 1000));
2122         seq_puts(seq, "Ineligible reasons:\n");
2123         for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2124                 seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2125                         stats->fc_ineligible_reason_count[i]);
2126
2127         return 0;
2128 }
2129
2130 int __init ext4_fc_init_dentry_cache(void)
2131 {
2132         ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2133                                            SLAB_RECLAIM_ACCOUNT);
2134
2135         if (ext4_fc_dentry_cachep == NULL)
2136                 return -ENOMEM;
2137
2138         return 0;
2139 }