Merge tag 'for-5.20/block-2022-08-04' of git://git.kernel.dk/linux-block
[linux-2.6-microblaze.git] / fs / ext4 / fast_commit.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 /*
4  * fs/ext4/fast_commit.c
5  *
6  * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
7  *
8  * Ext4 fast commits routines.
9  */
10 #include "ext4.h"
11 #include "ext4_jbd2.h"
12 #include "ext4_extents.h"
13 #include "mballoc.h"
14
15 /*
16  * Ext4 Fast Commits
17  * -----------------
18  *
19  * Ext4 fast commits implement fine grained journalling for Ext4.
20  *
21  * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
22  * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
23  * TLV during the recovery phase. For the scenarios for which we currently
24  * don't have replay code, fast commit falls back to full commits.
25  * Fast commits record delta in one of the following three categories.
26  *
27  * (A) Directory entry updates:
28  *
29  * - EXT4_FC_TAG_UNLINK         - records directory entry unlink
30  * - EXT4_FC_TAG_LINK           - records directory entry link
31  * - EXT4_FC_TAG_CREAT          - records inode and directory entry creation
32  *
33  * (B) File specific data range updates:
34  *
35  * - EXT4_FC_TAG_ADD_RANGE      - records addition of new blocks to an inode
36  * - EXT4_FC_TAG_DEL_RANGE      - records deletion of blocks from an inode
37  *
38  * (C) Inode metadata (mtime / ctime etc):
39  *
40  * - EXT4_FC_TAG_INODE          - record the inode that should be replayed
41  *                                during recovery. Note that iblocks field is
42  *                                not replayed and instead derived during
43  *                                replay.
44  * Commit Operation
45  * ----------------
46  * With fast commits, we maintain all the directory entry operations in the
47  * order in which they are issued in an in-memory queue. This queue is flushed
48  * to disk during the commit operation. We also maintain a list of inodes
49  * that need to be committed during a fast commit in another in memory queue of
50  * inodes. During the commit operation, we commit in the following order:
51  *
52  * [1] Lock inodes for any further data updates by setting COMMITTING state
53  * [2] Submit data buffers of all the inodes
54  * [3] Wait for [2] to complete
55  * [4] Commit all the directory entry updates in the fast commit space
56  * [5] Commit all the changed inode structures
57  * [6] Write tail tag (this tag ensures the atomicity, please read the following
58  *     section for more details).
59  * [7] Wait for [4], [5] and [6] to complete.
60  *
61  * All the inode updates must call ext4_fc_start_update() before starting an
62  * update. If such an ongoing update is present, fast commit waits for it to
63  * complete. The completion of such an update is marked by
64  * ext4_fc_stop_update().
65  *
66  * Fast Commit Ineligibility
67  * -------------------------
68  *
69  * Not all operations are supported by fast commits today (e.g. extended
70  * attributes). Fast commit ineligibility is marked by calling
71  * ext4_fc_mark_ineligible(). This makes the next fast commit operation fall
72  * back to a full commit.
73  *
74  * Atomicity of commits
75  * --------------------
76  * In order to guarantee atomicity during the commit operation, fast commit
77  * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
78  * tag contains CRC of the contents and TID of the transaction after which
79  * this fast commit should be applied. Recovery code replays fast commit
80  * logs only if there's at least 1 valid tail present. For every fast commit
81  * operation, there is 1 tail. This means, we may end up with multiple tails
82  * in the fast commit space. Here's an example:
83  *
84  * - Create a new file A and remove existing file B
85  * - fsync()
86  * - Append contents to file A
87  * - Truncate file A
88  * - fsync()
89  *
90  * The fast commit space at the end of above operations would look like this:
91  *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
92  *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
93  *
94  * Replay code should thus check for all the valid tails in the FC area.
95  *
96  * Fast Commit Replay Idempotence
97  * ------------------------------
98  *
99  * Fast commit tags are idempotent in nature provided the recovery code follows
100  * certain rules. The guiding principle that the commit path follows while
101  * committing is that it stores the result of a particular operation instead of
102  * storing the procedure.
103  *
104  * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
105  * was associated with inode 10. During fast commit, instead of storing this
106  * operation as a procedure "rename a to b", we store the resulting file system
107  * state as a "series" of outcomes:
108  *
109  * - Link dirent b to inode 10
110  * - Unlink dirent a
111  * - Inode <10> with valid refcount
112  *
113  * Now when recovery code runs, it needs to "enforce" this state on the file
114  * system. This is what guarantees idempotence of fast commit replay.
115  *
116  * Let's take an example of a procedure that is not idempotent and see how fast
117  * commits make it idempotent. Consider following sequence of operations:
118  *
119  *     rm A;    mv B A;    read A
120  *  (x)     (y)        (z)
121  *
122  * (x), (y) and (z) are the points at which we can crash. If we store this
123  * sequence of operations as is then the replay is not idempotent. Let's say
124  * while in replay, we crash at (z). During the second replay, file A (which was
125  * actually created as a result of "mv B A" operation) would get deleted. Thus,
126  * file named A would be absent when we try to read A. So, this sequence of
127  * operations is not idempotent. However, as mentioned above, instead of storing
128  * the procedure fast commits store the outcome of each procedure. Thus the fast
129  * commit log for above procedure would be as follows:
130  *
131  * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
132  * inode 11 before the replay)
133  *
134  *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
135  * (w)          (x)                    (y)          (z)
136  *
137  * If we crash at (z), we will have file A linked to inode 11. During the second
138  * replay, we will remove file A (inode 11). But we will create it back and make
139  * it point to inode 11. We won't find B, so we'll just skip that step. At this
140  * point, the refcount for inode 11 is not reliable, but that gets fixed by the
141  * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
142  * similarly. Thus, by converting a non-idempotent procedure into a series of
143  * idempotent outcomes, fast commits ensured idempotence during the replay.
144  *
145  * TODOs
146  * -----
147  *
148  * 0) Fast commit replay path hardening: Fast commit replay code should use
149  *    journal handles to make sure all the updates it does during the replay
150  *    path are atomic. With that if we crash during fast commit replay, after
151  *    trying to do recovery again, we will find a file system where fast commit
152  *    area is invalid (because new full commit would be found). In order to deal
153  *    with that, fast commit replay code should ensure that the "FC_REPLAY"
154  *    superblock state is persisted before starting the replay, so that after
155  *    the crash, fast commit recovery code can look at that flag and perform
156  *    fast commit recovery even if that area is invalidated by later full
157  *    commits.
158  *
159  * 1) Fast commit's commit path locks the entire file system during fast
160  *    commit. This has significant performance penalty. Instead of that, we
161  *    should use ext4_fc_start/stop_update functions to start inode level
162  *    updates from ext4_journal_start/stop. Once we do that we can drop file
163  *    system locking during commit path.
164  *
165  * 2) Handle more ineligible cases.
166  */
167
168 #include <trace/events/ext4.h>
169 static struct kmem_cache *ext4_fc_dentry_cachep;
170
171 static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
172 {
173         BUFFER_TRACE(bh, "");
174         if (uptodate) {
175                 ext4_debug("%s: Block %lld up-to-date",
176                            __func__, bh->b_blocknr);
177                 set_buffer_uptodate(bh);
178         } else {
179                 ext4_debug("%s: Block %lld not up-to-date",
180                            __func__, bh->b_blocknr);
181                 clear_buffer_uptodate(bh);
182         }
183
184         unlock_buffer(bh);
185 }
186
187 static inline void ext4_fc_reset_inode(struct inode *inode)
188 {
189         struct ext4_inode_info *ei = EXT4_I(inode);
190
191         ei->i_fc_lblk_start = 0;
192         ei->i_fc_lblk_len = 0;
193 }
194
195 void ext4_fc_init_inode(struct inode *inode)
196 {
197         struct ext4_inode_info *ei = EXT4_I(inode);
198
199         ext4_fc_reset_inode(inode);
200         ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
201         INIT_LIST_HEAD(&ei->i_fc_list);
202         INIT_LIST_HEAD(&ei->i_fc_dilist);
203         init_waitqueue_head(&ei->i_fc_wait);
204         atomic_set(&ei->i_fc_updates, 0);
205 }
206
207 /* This function must be called with sbi->s_fc_lock held. */
208 static void ext4_fc_wait_committing_inode(struct inode *inode)
209 __releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
210 {
211         wait_queue_head_t *wq;
212         struct ext4_inode_info *ei = EXT4_I(inode);
213
214 #if (BITS_PER_LONG < 64)
215         DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
216                         EXT4_STATE_FC_COMMITTING);
217         wq = bit_waitqueue(&ei->i_state_flags,
218                                 EXT4_STATE_FC_COMMITTING);
219 #else
220         DEFINE_WAIT_BIT(wait, &ei->i_flags,
221                         EXT4_STATE_FC_COMMITTING);
222         wq = bit_waitqueue(&ei->i_flags,
223                                 EXT4_STATE_FC_COMMITTING);
224 #endif
225         lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
226         prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
227         spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
228         schedule();
229         finish_wait(wq, &wait.wq_entry);
230 }
231
232 /*
233  * Inform Ext4's fast about start of an inode update
234  *
235  * This function is called by the high level call VFS callbacks before
236  * performing any inode update. This function blocks if there's an ongoing
237  * fast commit on the inode in question.
238  */
239 void ext4_fc_start_update(struct inode *inode)
240 {
241         struct ext4_inode_info *ei = EXT4_I(inode);
242
243         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
244             (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
245                 return;
246
247 restart:
248         spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
249         if (list_empty(&ei->i_fc_list))
250                 goto out;
251
252         if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
253                 ext4_fc_wait_committing_inode(inode);
254                 goto restart;
255         }
256 out:
257         atomic_inc(&ei->i_fc_updates);
258         spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
259 }
260
261 /*
262  * Stop inode update and wake up waiting fast commits if any.
263  */
264 void ext4_fc_stop_update(struct inode *inode)
265 {
266         struct ext4_inode_info *ei = EXT4_I(inode);
267
268         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
269             (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
270                 return;
271
272         if (atomic_dec_and_test(&ei->i_fc_updates))
273                 wake_up_all(&ei->i_fc_wait);
274 }
275
276 /*
277  * Remove inode from fast commit list. If the inode is being committed
278  * we wait until inode commit is done.
279  */
280 void ext4_fc_del(struct inode *inode)
281 {
282         struct ext4_inode_info *ei = EXT4_I(inode);
283         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
284         struct ext4_fc_dentry_update *fc_dentry;
285
286         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
287             (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
288                 return;
289
290 restart:
291         spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
292         if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
293                 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
294                 return;
295         }
296
297         if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
298                 ext4_fc_wait_committing_inode(inode);
299                 goto restart;
300         }
301
302         if (!list_empty(&ei->i_fc_list))
303                 list_del_init(&ei->i_fc_list);
304
305         /*
306          * Since this inode is getting removed, let's also remove all FC
307          * dentry create references, since it is not needed to log it anyways.
308          */
309         if (list_empty(&ei->i_fc_dilist)) {
310                 spin_unlock(&sbi->s_fc_lock);
311                 return;
312         }
313
314         fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist);
315         WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT);
316         list_del_init(&fc_dentry->fcd_list);
317         list_del_init(&fc_dentry->fcd_dilist);
318
319         WARN_ON(!list_empty(&ei->i_fc_dilist));
320         spin_unlock(&sbi->s_fc_lock);
321
322         if (fc_dentry->fcd_name.name &&
323                 fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
324                 kfree(fc_dentry->fcd_name.name);
325         kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
326
327         return;
328 }
329
330 /*
331  * Mark file system as fast commit ineligible, and record latest
332  * ineligible transaction tid. This means until the recorded
333  * transaction, commit operation would result in a full jbd2 commit.
334  */
335 void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
336 {
337         struct ext4_sb_info *sbi = EXT4_SB(sb);
338         tid_t tid;
339
340         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
341             (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
342                 return;
343
344         ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
345         if (handle && !IS_ERR(handle))
346                 tid = handle->h_transaction->t_tid;
347         else {
348                 read_lock(&sbi->s_journal->j_state_lock);
349                 tid = sbi->s_journal->j_running_transaction ?
350                                 sbi->s_journal->j_running_transaction->t_tid : 0;
351                 read_unlock(&sbi->s_journal->j_state_lock);
352         }
353         spin_lock(&sbi->s_fc_lock);
354         if (sbi->s_fc_ineligible_tid < tid)
355                 sbi->s_fc_ineligible_tid = tid;
356         spin_unlock(&sbi->s_fc_lock);
357         WARN_ON(reason >= EXT4_FC_REASON_MAX);
358         sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
359 }
360
361 /*
362  * Generic fast commit tracking function. If this is the first time this we are
363  * called after a full commit, we initialize fast commit fields and then call
364  * __fc_track_fn() with update = 0. If we have already been called after a full
365  * commit, we pass update = 1. Based on that, the track function can determine
366  * if it needs to track a field for the first time or if it needs to just
367  * update the previously tracked value.
368  *
369  * If enqueue is set, this function enqueues the inode in fast commit list.
370  */
371 static int ext4_fc_track_template(
372         handle_t *handle, struct inode *inode,
373         int (*__fc_track_fn)(struct inode *, void *, bool),
374         void *args, int enqueue)
375 {
376         bool update = false;
377         struct ext4_inode_info *ei = EXT4_I(inode);
378         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
379         tid_t tid = 0;
380         int ret;
381
382         tid = handle->h_transaction->t_tid;
383         mutex_lock(&ei->i_fc_lock);
384         if (tid == ei->i_sync_tid) {
385                 update = true;
386         } else {
387                 ext4_fc_reset_inode(inode);
388                 ei->i_sync_tid = tid;
389         }
390         ret = __fc_track_fn(inode, args, update);
391         mutex_unlock(&ei->i_fc_lock);
392
393         if (!enqueue)
394                 return ret;
395
396         spin_lock(&sbi->s_fc_lock);
397         if (list_empty(&EXT4_I(inode)->i_fc_list))
398                 list_add_tail(&EXT4_I(inode)->i_fc_list,
399                                 (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
400                                  sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
401                                 &sbi->s_fc_q[FC_Q_STAGING] :
402                                 &sbi->s_fc_q[FC_Q_MAIN]);
403         spin_unlock(&sbi->s_fc_lock);
404
405         return ret;
406 }
407
/* Argument bundle handed to __track_dentry_update() through a void pointer. */
struct __track_dentry_update_args {
	/* directory entry the update is about */
	struct dentry *dentry;
	/* tag stored as fcd_op of the queued update (EXT4_FC_TAG_*) */
	int op;
};
412
413 /* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
414 static int __track_dentry_update(struct inode *inode, void *arg, bool update)
415 {
416         struct ext4_fc_dentry_update *node;
417         struct ext4_inode_info *ei = EXT4_I(inode);
418         struct __track_dentry_update_args *dentry_update =
419                 (struct __track_dentry_update_args *)arg;
420         struct dentry *dentry = dentry_update->dentry;
421         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
422
423         mutex_unlock(&ei->i_fc_lock);
424         node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
425         if (!node) {
426                 ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
427                 mutex_lock(&ei->i_fc_lock);
428                 return -ENOMEM;
429         }
430
431         node->fcd_op = dentry_update->op;
432         node->fcd_parent = dentry->d_parent->d_inode->i_ino;
433         node->fcd_ino = inode->i_ino;
434         if (dentry->d_name.len > DNAME_INLINE_LEN) {
435                 node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
436                 if (!node->fcd_name.name) {
437                         kmem_cache_free(ext4_fc_dentry_cachep, node);
438                         ext4_fc_mark_ineligible(inode->i_sb,
439                                 EXT4_FC_REASON_NOMEM, NULL);
440                         mutex_lock(&ei->i_fc_lock);
441                         return -ENOMEM;
442                 }
443                 memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
444                         dentry->d_name.len);
445         } else {
446                 memcpy(node->fcd_iname, dentry->d_name.name,
447                         dentry->d_name.len);
448                 node->fcd_name.name = node->fcd_iname;
449         }
450         node->fcd_name.len = dentry->d_name.len;
451         INIT_LIST_HEAD(&node->fcd_dilist);
452         spin_lock(&sbi->s_fc_lock);
453         if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
454                 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
455                 list_add_tail(&node->fcd_list,
456                                 &sbi->s_fc_dentry_q[FC_Q_STAGING]);
457         else
458                 list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
459
460         /*
461          * This helps us keep a track of all fc_dentry updates which is part of
462          * this ext4 inode. So in case the inode is getting unlinked, before
463          * even we get a chance to fsync, we could remove all fc_dentry
464          * references while evicting the inode in ext4_fc_del().
465          * Also with this, we don't need to loop over all the inodes in
466          * sbi->s_fc_q to get the corresponding inode in
467          * ext4_fc_commit_dentry_updates().
468          */
469         if (dentry_update->op == EXT4_FC_TAG_CREAT) {
470                 WARN_ON(!list_empty(&ei->i_fc_dilist));
471                 list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
472         }
473         spin_unlock(&sbi->s_fc_lock);
474         mutex_lock(&ei->i_fc_lock);
475
476         return 0;
477 }
478
479 void __ext4_fc_track_unlink(handle_t *handle,
480                 struct inode *inode, struct dentry *dentry)
481 {
482         struct __track_dentry_update_args args;
483         int ret;
484
485         args.dentry = dentry;
486         args.op = EXT4_FC_TAG_UNLINK;
487
488         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
489                                         (void *)&args, 0);
490         trace_ext4_fc_track_unlink(handle, inode, dentry, ret);
491 }
492
493 void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
494 {
495         struct inode *inode = d_inode(dentry);
496         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
497
498         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
499             (sbi->s_mount_state & EXT4_FC_REPLAY))
500                 return;
501
502         if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
503                 return;
504
505         __ext4_fc_track_unlink(handle, inode, dentry);
506 }
507
508 void __ext4_fc_track_link(handle_t *handle,
509         struct inode *inode, struct dentry *dentry)
510 {
511         struct __track_dentry_update_args args;
512         int ret;
513
514         args.dentry = dentry;
515         args.op = EXT4_FC_TAG_LINK;
516
517         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
518                                         (void *)&args, 0);
519         trace_ext4_fc_track_link(handle, inode, dentry, ret);
520 }
521
522 void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
523 {
524         struct inode *inode = d_inode(dentry);
525         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
526
527         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
528             (sbi->s_mount_state & EXT4_FC_REPLAY))
529                 return;
530
531         if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
532                 return;
533
534         __ext4_fc_track_link(handle, inode, dentry);
535 }
536
537 void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
538                           struct dentry *dentry)
539 {
540         struct __track_dentry_update_args args;
541         int ret;
542
543         args.dentry = dentry;
544         args.op = EXT4_FC_TAG_CREAT;
545
546         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
547                                         (void *)&args, 0);
548         trace_ext4_fc_track_create(handle, inode, dentry, ret);
549 }
550
551 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
552 {
553         struct inode *inode = d_inode(dentry);
554         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
555
556         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
557             (sbi->s_mount_state & EXT4_FC_REPLAY))
558                 return;
559
560         if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
561                 return;
562
563         __ext4_fc_track_create(handle, inode, dentry);
564 }
565
566 /* __track_fn for inode tracking */
567 static int __track_inode(struct inode *inode, void *arg, bool update)
568 {
569         if (update)
570                 return -EEXIST;
571
572         EXT4_I(inode)->i_fc_lblk_len = 0;
573
574         return 0;
575 }
576
577 void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
578 {
579         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
580         int ret;
581
582         if (S_ISDIR(inode->i_mode))
583                 return;
584
585         if (ext4_should_journal_data(inode)) {
586                 ext4_fc_mark_ineligible(inode->i_sb,
587                                         EXT4_FC_REASON_INODE_JOURNAL_DATA, handle);
588                 return;
589         }
590
591         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
592             (sbi->s_mount_state & EXT4_FC_REPLAY))
593                 return;
594
595         if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
596                 return;
597
598         ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
599         trace_ext4_fc_track_inode(handle, inode, ret);
600 }
601
602 struct __track_range_args {
603         ext4_lblk_t start, end;
604 };
605
606 /* __track_fn for tracking data updates */
607 static int __track_range(struct inode *inode, void *arg, bool update)
608 {
609         struct ext4_inode_info *ei = EXT4_I(inode);
610         ext4_lblk_t oldstart;
611         struct __track_range_args *__arg =
612                 (struct __track_range_args *)arg;
613
614         if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
615                 ext4_debug("Special inode %ld being modified\n", inode->i_ino);
616                 return -ECANCELED;
617         }
618
619         oldstart = ei->i_fc_lblk_start;
620
621         if (update && ei->i_fc_lblk_len > 0) {
622                 ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
623                 ei->i_fc_lblk_len =
624                         max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
625                                 ei->i_fc_lblk_start + 1;
626         } else {
627                 ei->i_fc_lblk_start = __arg->start;
628                 ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
629         }
630
631         return 0;
632 }
633
634 void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
635                          ext4_lblk_t end)
636 {
637         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
638         struct __track_range_args args;
639         int ret;
640
641         if (S_ISDIR(inode->i_mode))
642                 return;
643
644         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
645             (sbi->s_mount_state & EXT4_FC_REPLAY))
646                 return;
647
648         if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
649                 return;
650
651         args.start = start;
652         args.end = end;
653
654         ret = ext4_fc_track_template(handle, inode,  __track_range, &args, 1);
655
656         trace_ext4_fc_track_range(handle, inode, start, end, ret);
657 }
658
659 static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
660 {
661         blk_opf_t write_flags = REQ_SYNC;
662         struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
663
664         /* Add REQ_FUA | REQ_PREFLUSH only its tail */
665         if (test_opt(sb, BARRIER) && is_tail)
666                 write_flags |= REQ_FUA | REQ_PREFLUSH;
667         lock_buffer(bh);
668         set_buffer_dirty(bh);
669         set_buffer_uptodate(bh);
670         bh->b_end_io = ext4_end_buffer_io_sync;
671         submit_bh(REQ_OP_WRITE | write_flags, bh);
672         EXT4_SB(sb)->s_fc_bh = NULL;
673 }
674
675 /* Ext4 commit path routines */
676
677 /* memzero and update CRC */
678 static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
679                                 u32 *crc)
680 {
681         void *ret;
682
683         ret = memset(dst, 0, len);
684         if (crc)
685                 *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
686         return ret;
687 }
688
689 /*
690  * Allocate len bytes on a fast commit buffer.
691  *
692  * During the commit time this function is used to manage fast commit
693  * block space. We don't split a fast commit log onto different
694  * blocks. So this function makes sure that if there's not enough space
695  * on the current block, the remaining space in the current block is
696  * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
697  * new block is from jbd2 and CRC is updated to reflect the padding
698  * we added.
699  */
700 static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
701 {
702         struct ext4_fc_tl *tl;
703         struct ext4_sb_info *sbi = EXT4_SB(sb);
704         struct buffer_head *bh;
705         int bsize = sbi->s_journal->j_blocksize;
706         int ret, off = sbi->s_fc_bytes % bsize;
707         int pad_len;
708
709         /*
710          * After allocating len, we should have space at least for a 0 byte
711          * padding.
712          */
713         if (len + sizeof(struct ext4_fc_tl) > bsize)
714                 return NULL;
715
716         if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
717                 /*
718                  * Only allocate from current buffer if we have enough space for
719                  * this request AND we have space to add a zero byte padding.
720                  */
721                 if (!sbi->s_fc_bh) {
722                         ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
723                         if (ret)
724                                 return NULL;
725                         sbi->s_fc_bh = bh;
726                 }
727                 sbi->s_fc_bytes += len;
728                 return sbi->s_fc_bh->b_data + off;
729         }
730         /* Need to add PAD tag */
731         tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
732         tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
733         pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
734         tl->fc_len = cpu_to_le16(pad_len);
735         if (crc)
736                 *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
737         if (pad_len > 0)
738                 ext4_fc_memzero(sb, tl + 1, pad_len, crc);
739         ext4_fc_submit_bh(sb, false);
740
741         ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
742         if (ret)
743                 return NULL;
744         sbi->s_fc_bh = bh;
745         sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
746         return sbi->s_fc_bh->b_data;
747 }
748
749 /* memcpy to fc reserved space and update CRC */
750 static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
751                                 int len, u32 *crc)
752 {
753         if (crc)
754                 *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
755         return memcpy(dst, src, len);
756 }
757
758 /*
759  * Complete a fast commit by writing tail tag.
760  *
761  * Writing tail tag marks the end of a fast commit. In order to guarantee
762  * atomicity, after writing tail tag, even if there's space remaining
763  * in the block, next commit shouldn't use it. That's why tail tag
764  * has the length as that of the remaining space on the block.
765  */
766 static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
767 {
768         struct ext4_sb_info *sbi = EXT4_SB(sb);
769         struct ext4_fc_tl tl;
770         struct ext4_fc_tail tail;
771         int off, bsize = sbi->s_journal->j_blocksize;
772         u8 *dst;
773
774         /*
775          * ext4_fc_reserve_space takes care of allocating an extra block if
776          * there's no enough space on this block for accommodating this tail.
777          */
778         dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
779         if (!dst)
780                 return -ENOSPC;
781
782         off = sbi->s_fc_bytes % bsize;
783
784         tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
785         tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
786         sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
787
788         ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
789         dst += sizeof(tl);
790         tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
791         ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
792         dst += sizeof(tail.fc_tid);
793         tail.fc_crc = cpu_to_le32(crc);
794         ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
795
796         ext4_fc_submit_bh(sb, true);
797
798         return 0;
799 }
800
801 /*
802  * Adds tag, length, value and updates CRC. Returns true if tlv was added.
803  * Returns false if there's not enough space.
804  */
805 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
806                            u32 *crc)
807 {
808         struct ext4_fc_tl tl;
809         u8 *dst;
810
811         dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
812         if (!dst)
813                 return false;
814
815         tl.fc_tag = cpu_to_le16(tag);
816         tl.fc_len = cpu_to_le16(len);
817
818         ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
819         ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
820
821         return true;
822 }
823
824 /* Same as above, but adds dentry tlv. */
825 static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
826                                    struct ext4_fc_dentry_update *fc_dentry)
827 {
828         struct ext4_fc_dentry_info fcd;
829         struct ext4_fc_tl tl;
830         int dlen = fc_dentry->fcd_name.len;
831         u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
832                                         crc);
833
834         if (!dst)
835                 return false;
836
837         fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
838         fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
839         tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
840         tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
841         ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
842         dst += sizeof(tl);
843         ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
844         dst += sizeof(fcd);
845         ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc);
846
847         return true;
848 }
849
850 /*
851  * Writes inode in the fast commit space under TLV with tag @tag.
852  * Returns 0 on success, error on failure.
853  */
854 static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
855 {
856         struct ext4_inode_info *ei = EXT4_I(inode);
857         int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
858         int ret;
859         struct ext4_iloc iloc;
860         struct ext4_fc_inode fc_inode;
861         struct ext4_fc_tl tl;
862         u8 *dst;
863
864         ret = ext4_get_inode_loc(inode, &iloc);
865         if (ret)
866                 return ret;
867
868         if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
869                 inode_len = EXT4_INODE_SIZE(inode->i_sb);
870         else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
871                 inode_len += ei->i_extra_isize;
872
873         fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
874         tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
875         tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
876
877         dst = ext4_fc_reserve_space(inode->i_sb,
878                         sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
879         if (!dst)
880                 return -ECANCELED;
881
882         if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
883                 return -ECANCELED;
884         dst += sizeof(tl);
885         if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
886                 return -ECANCELED;
887         dst += sizeof(fc_inode);
888         if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
889                                         inode_len, crc))
890                 return -ECANCELED;
891
892         return 0;
893 }
894
895 /*
896  * Writes updated data ranges for the inode in question. Updates CRC.
897  * Returns 0 on success, error otherwise.
898  */
899 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
900 {
901         ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
902         struct ext4_inode_info *ei = EXT4_I(inode);
903         struct ext4_map_blocks map;
904         struct ext4_fc_add_range fc_ext;
905         struct ext4_fc_del_range lrange;
906         struct ext4_extent *ex;
907         int ret;
908
909         mutex_lock(&ei->i_fc_lock);
910         if (ei->i_fc_lblk_len == 0) {
911                 mutex_unlock(&ei->i_fc_lock);
912                 return 0;
913         }
914         old_blk_size = ei->i_fc_lblk_start;
915         new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
916         ei->i_fc_lblk_len = 0;
917         mutex_unlock(&ei->i_fc_lock);
918
919         cur_lblk_off = old_blk_size;
920         jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
921                   __func__, cur_lblk_off, new_blk_size, inode->i_ino);
922
923         while (cur_lblk_off <= new_blk_size) {
924                 map.m_lblk = cur_lblk_off;
925                 map.m_len = new_blk_size - cur_lblk_off + 1;
926                 ret = ext4_map_blocks(NULL, inode, &map, 0);
927                 if (ret < 0)
928                         return -ECANCELED;
929
930                 if (map.m_len == 0) {
931                         cur_lblk_off++;
932                         continue;
933                 }
934
935                 if (ret == 0) {
936                         lrange.fc_ino = cpu_to_le32(inode->i_ino);
937                         lrange.fc_lblk = cpu_to_le32(map.m_lblk);
938                         lrange.fc_len = cpu_to_le32(map.m_len);
939                         if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
940                                             sizeof(lrange), (u8 *)&lrange, crc))
941                                 return -ENOSPC;
942                 } else {
943                         unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
944                                 EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;
945
946                         /* Limit the number of blocks in one extent */
947                         map.m_len = min(max, map.m_len);
948
949                         fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
950                         ex = (struct ext4_extent *)&fc_ext.fc_ex;
951                         ex->ee_block = cpu_to_le32(map.m_lblk);
952                         ex->ee_len = cpu_to_le16(map.m_len);
953                         ext4_ext_store_pblock(ex, map.m_pblk);
954                         if (map.m_flags & EXT4_MAP_UNWRITTEN)
955                                 ext4_ext_mark_unwritten(ex);
956                         else
957                                 ext4_ext_mark_initialized(ex);
958                         if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
959                                             sizeof(fc_ext), (u8 *)&fc_ext, crc))
960                                 return -ENOSPC;
961                 }
962
963                 cur_lblk_off += map.m_len;
964         }
965
966         return 0;
967 }
968
969
970 /* Submit data for all the fast commit inodes */
971 static int ext4_fc_submit_inode_data_all(journal_t *journal)
972 {
973         struct super_block *sb = journal->j_private;
974         struct ext4_sb_info *sbi = EXT4_SB(sb);
975         struct ext4_inode_info *ei;
976         int ret = 0;
977
978         spin_lock(&sbi->s_fc_lock);
979         list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
980                 ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
981                 while (atomic_read(&ei->i_fc_updates)) {
982                         DEFINE_WAIT(wait);
983
984                         prepare_to_wait(&ei->i_fc_wait, &wait,
985                                                 TASK_UNINTERRUPTIBLE);
986                         if (atomic_read(&ei->i_fc_updates)) {
987                                 spin_unlock(&sbi->s_fc_lock);
988                                 schedule();
989                                 spin_lock(&sbi->s_fc_lock);
990                         }
991                         finish_wait(&ei->i_fc_wait, &wait);
992                 }
993                 spin_unlock(&sbi->s_fc_lock);
994                 ret = jbd2_submit_inode_data(ei->jinode);
995                 if (ret)
996                         return ret;
997                 spin_lock(&sbi->s_fc_lock);
998         }
999         spin_unlock(&sbi->s_fc_lock);
1000
1001         return ret;
1002 }
1003
1004 /* Wait for completion of data for all the fast commit inodes */
1005 static int ext4_fc_wait_inode_data_all(journal_t *journal)
1006 {
1007         struct super_block *sb = journal->j_private;
1008         struct ext4_sb_info *sbi = EXT4_SB(sb);
1009         struct ext4_inode_info *pos, *n;
1010         int ret = 0;
1011
1012         spin_lock(&sbi->s_fc_lock);
1013         list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1014                 if (!ext4_test_inode_state(&pos->vfs_inode,
1015                                            EXT4_STATE_FC_COMMITTING))
1016                         continue;
1017                 spin_unlock(&sbi->s_fc_lock);
1018
1019                 ret = jbd2_wait_inode_data(journal, pos->jinode);
1020                 if (ret)
1021                         return ret;
1022                 spin_lock(&sbi->s_fc_lock);
1023         }
1024         spin_unlock(&sbi->s_fc_lock);
1025
1026         return 0;
1027 }
1028
1029 /* Commit all the directory entry updates */
1030 static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
1031 __acquires(&sbi->s_fc_lock)
1032 __releases(&sbi->s_fc_lock)
1033 {
1034         struct super_block *sb = journal->j_private;
1035         struct ext4_sb_info *sbi = EXT4_SB(sb);
1036         struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
1037         struct inode *inode;
1038         struct ext4_inode_info *ei;
1039         int ret;
1040
1041         if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
1042                 return 0;
1043         list_for_each_entry_safe(fc_dentry, fc_dentry_n,
1044                                  &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
1045                 if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
1046                         spin_unlock(&sbi->s_fc_lock);
1047                         if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
1048                                 ret = -ENOSPC;
1049                                 goto lock_and_exit;
1050                         }
1051                         spin_lock(&sbi->s_fc_lock);
1052                         continue;
1053                 }
1054                 /*
1055                  * With fcd_dilist we need not loop in sbi->s_fc_q to get the
1056                  * corresponding inode pointer
1057                  */
1058                 WARN_ON(list_empty(&fc_dentry->fcd_dilist));
1059                 ei = list_first_entry(&fc_dentry->fcd_dilist,
1060                                 struct ext4_inode_info, i_fc_dilist);
1061                 inode = &ei->vfs_inode;
1062                 WARN_ON(inode->i_ino != fc_dentry->fcd_ino);
1063
1064                 spin_unlock(&sbi->s_fc_lock);
1065
1066                 /*
1067                  * We first write the inode and then the create dirent. This
1068                  * allows the recovery code to create an unnamed inode first
1069                  * and then link it to a directory entry. This allows us
1070                  * to use namei.c routines almost as is and simplifies
1071                  * the recovery code.
1072                  */
1073                 ret = ext4_fc_write_inode(inode, crc);
1074                 if (ret)
1075                         goto lock_and_exit;
1076
1077                 ret = ext4_fc_write_inode_data(inode, crc);
1078                 if (ret)
1079                         goto lock_and_exit;
1080
1081                 if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
1082                         ret = -ENOSPC;
1083                         goto lock_and_exit;
1084                 }
1085
1086                 spin_lock(&sbi->s_fc_lock);
1087         }
1088         return 0;
1089 lock_and_exit:
1090         spin_lock(&sbi->s_fc_lock);
1091         return ret;
1092 }
1093
1094 static int ext4_fc_perform_commit(journal_t *journal)
1095 {
1096         struct super_block *sb = journal->j_private;
1097         struct ext4_sb_info *sbi = EXT4_SB(sb);
1098         struct ext4_inode_info *iter;
1099         struct ext4_fc_head head;
1100         struct inode *inode;
1101         struct blk_plug plug;
1102         int ret = 0;
1103         u32 crc = 0;
1104
1105         ret = ext4_fc_submit_inode_data_all(journal);
1106         if (ret)
1107                 return ret;
1108
1109         ret = ext4_fc_wait_inode_data_all(journal);
1110         if (ret)
1111                 return ret;
1112
1113         /*
1114          * If file system device is different from journal device, issue a cache
1115          * flush before we start writing fast commit blocks.
1116          */
1117         if (journal->j_fs_dev != journal->j_dev)
1118                 blkdev_issue_flush(journal->j_fs_dev);
1119
1120         blk_start_plug(&plug);
1121         if (sbi->s_fc_bytes == 0) {
1122                 /*
1123                  * Add a head tag only if this is the first fast commit
1124                  * in this TID.
1125                  */
1126                 head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1127                 head.fc_tid = cpu_to_le32(
1128                         sbi->s_journal->j_running_transaction->t_tid);
1129                 if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1130                         (u8 *)&head, &crc)) {
1131                         ret = -ENOSPC;
1132                         goto out;
1133                 }
1134         }
1135
1136         spin_lock(&sbi->s_fc_lock);
1137         ret = ext4_fc_commit_dentry_updates(journal, &crc);
1138         if (ret) {
1139                 spin_unlock(&sbi->s_fc_lock);
1140                 goto out;
1141         }
1142
1143         list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1144                 inode = &iter->vfs_inode;
1145                 if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1146                         continue;
1147
1148                 spin_unlock(&sbi->s_fc_lock);
1149                 ret = ext4_fc_write_inode_data(inode, &crc);
1150                 if (ret)
1151                         goto out;
1152                 ret = ext4_fc_write_inode(inode, &crc);
1153                 if (ret)
1154                         goto out;
1155                 spin_lock(&sbi->s_fc_lock);
1156         }
1157         spin_unlock(&sbi->s_fc_lock);
1158
1159         ret = ext4_fc_write_tail(sb, crc);
1160
1161 out:
1162         blk_finish_plug(&plug);
1163         return ret;
1164 }
1165
1166 static void ext4_fc_update_stats(struct super_block *sb, int status,
1167                                  u64 commit_time, int nblks, tid_t commit_tid)
1168 {
1169         struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;
1170
1171         jbd_debug(1, "Fast commit ended with status = %d for tid %u",
1172                         status, commit_tid);
1173         if (status == EXT4_FC_STATUS_OK) {
1174                 stats->fc_num_commits++;
1175                 stats->fc_numblks += nblks;
1176                 if (likely(stats->s_fc_avg_commit_time))
1177                         stats->s_fc_avg_commit_time =
1178                                 (commit_time +
1179                                  stats->s_fc_avg_commit_time * 3) / 4;
1180                 else
1181                         stats->s_fc_avg_commit_time = commit_time;
1182         } else if (status == EXT4_FC_STATUS_FAILED ||
1183                    status == EXT4_FC_STATUS_INELIGIBLE) {
1184                 if (status == EXT4_FC_STATUS_FAILED)
1185                         stats->fc_failed_commits++;
1186                 stats->fc_ineligible_commits++;
1187         } else {
1188                 stats->fc_skipped_commits++;
1189         }
1190         trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid);
1191 }
1192
1193 /*
1194  * The main commit entry point. Performs a fast commit for transaction
1195  * commit_tid if needed. If it's not possible to perform a fast commit
1196  * due to various reasons, we fall back to full commit. Returns 0
1197  * on success, error otherwise.
1198  */
1199 int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1200 {
1201         struct super_block *sb = journal->j_private;
1202         struct ext4_sb_info *sbi = EXT4_SB(sb);
1203         int nblks = 0, ret, bsize = journal->j_blocksize;
1204         int subtid = atomic_read(&sbi->s_fc_subtid);
1205         int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
1206         ktime_t start_time, commit_time;
1207
1208         if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
1209                 return jbd2_complete_transaction(journal, commit_tid);
1210
1211         trace_ext4_fc_commit_start(sb, commit_tid);
1212
1213         start_time = ktime_get();
1214
1215 restart_fc:
1216         ret = jbd2_fc_begin_commit(journal, commit_tid);
1217         if (ret == -EALREADY) {
1218                 /* There was an ongoing commit, check if we need to restart */
1219                 if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1220                         commit_tid > journal->j_commit_sequence)
1221                         goto restart_fc;
1222                 ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0,
1223                                 commit_tid);
1224                 return 0;
1225         } else if (ret) {
1226                 /*
1227                  * Commit couldn't start. Just update stats and perform a
1228                  * full commit.
1229                  */
1230                 ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0,
1231                                 commit_tid);
1232                 return jbd2_complete_transaction(journal, commit_tid);
1233         }
1234
1235         /*
1236          * After establishing journal barrier via jbd2_fc_begin_commit(), check
1237          * if we are fast commit ineligible.
1238          */
1239         if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) {
1240                 status = EXT4_FC_STATUS_INELIGIBLE;
1241                 goto fallback;
1242         }
1243
1244         fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1245         ret = ext4_fc_perform_commit(journal);
1246         if (ret < 0) {
1247                 status = EXT4_FC_STATUS_FAILED;
1248                 goto fallback;
1249         }
1250         nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1251         ret = jbd2_fc_wait_bufs(journal, nblks);
1252         if (ret < 0) {
1253                 status = EXT4_FC_STATUS_FAILED;
1254                 goto fallback;
1255         }
1256         atomic_inc(&sbi->s_fc_subtid);
1257         ret = jbd2_fc_end_commit(journal);
1258         /*
1259          * weight the commit time higher than the average time so we
1260          * don't react too strongly to vast changes in the commit time
1261          */
1262         commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1263         ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid);
1264         return ret;
1265
1266 fallback:
1267         ret = jbd2_fc_end_commit_fallback(journal);
1268         ext4_fc_update_stats(sb, status, 0, 0, commit_tid);
1269         return ret;
1270 }
1271
1272 /*
1273  * Fast commit cleanup routine. This is called after every fast commit and
1274  * full commit. full is true if we are called after a full commit.
1275  */
1276 static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
1277 {
1278         struct super_block *sb = journal->j_private;
1279         struct ext4_sb_info *sbi = EXT4_SB(sb);
1280         struct ext4_inode_info *iter, *iter_n;
1281         struct ext4_fc_dentry_update *fc_dentry;
1282
1283         if (full && sbi->s_fc_bh)
1284                 sbi->s_fc_bh = NULL;
1285
1286         trace_ext4_fc_cleanup(journal, full, tid);
1287         jbd2_fc_release_bufs(journal);
1288
1289         spin_lock(&sbi->s_fc_lock);
1290         list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
1291                                  i_fc_list) {
1292                 list_del_init(&iter->i_fc_list);
1293                 ext4_clear_inode_state(&iter->vfs_inode,
1294                                        EXT4_STATE_FC_COMMITTING);
1295                 if (iter->i_sync_tid <= tid)
1296                         ext4_fc_reset_inode(&iter->vfs_inode);
1297                 /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1298                 smp_mb();
1299 #if (BITS_PER_LONG < 64)
1300                 wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1301 #else
1302                 wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1303 #endif
1304         }
1305
1306         while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1307                 fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1308                                              struct ext4_fc_dentry_update,
1309                                              fcd_list);
1310                 list_del_init(&fc_dentry->fcd_list);
1311                 list_del_init(&fc_dentry->fcd_dilist);
1312                 spin_unlock(&sbi->s_fc_lock);
1313
1314                 if (fc_dentry->fcd_name.name &&
1315                         fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1316                         kfree(fc_dentry->fcd_name.name);
1317                 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1318                 spin_lock(&sbi->s_fc_lock);
1319         }
1320
1321         list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1322                                 &sbi->s_fc_dentry_q[FC_Q_MAIN]);
1323         list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1324                                 &sbi->s_fc_q[FC_Q_MAIN]);
1325
1326         if (tid >= sbi->s_fc_ineligible_tid) {
1327                 sbi->s_fc_ineligible_tid = 0;
1328                 ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
1329         }
1330
1331         if (full)
1332                 sbi->s_fc_bytes = 0;
1333         spin_unlock(&sbi->s_fc_lock);
1334         trace_ext4_fc_stats(sb);
1335 }
1336
1337 /* Ext4 Replay Path Routines */
1338
/*
 * Helper struct for dentry replay routines: the decoded contents of a
 * directory entry record (see tl_to_darg()).
 */
struct dentry_info_args {
	int parent_ino;		/* inode number of the parent directory */
	int dname_len;		/* length of the name at @dname, in bytes */
	int ino;		/* inode number the entry refers to */
	int inode_len;		/* not filled in by tl_to_darg() */
	char *dname;		/* entry name, pointing into the record */
};
1344
1345 static inline void tl_to_darg(struct dentry_info_args *darg,
1346                               struct  ext4_fc_tl *tl, u8 *val)
1347 {
1348         struct ext4_fc_dentry_info fcd;
1349
1350         memcpy(&fcd, val, sizeof(fcd));
1351
1352         darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
1353         darg->ino = le32_to_cpu(fcd.fc_ino);
1354         darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
1355         darg->dname_len = le16_to_cpu(tl->fc_len) -
1356                 sizeof(struct ext4_fc_dentry_info);
1357 }
1358
1359 /* Unlink replay function */
1360 static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
1361                                  u8 *val)
1362 {
1363         struct inode *inode, *old_parent;
1364         struct qstr entry;
1365         struct dentry_info_args darg;
1366         int ret = 0;
1367
1368         tl_to_darg(&darg, tl, val);
1369
1370         trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1371                         darg.parent_ino, darg.dname_len);
1372
1373         entry.name = darg.dname;
1374         entry.len = darg.dname_len;
1375         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1376
1377         if (IS_ERR(inode)) {
1378                 jbd_debug(1, "Inode %d not found", darg.ino);
1379                 return 0;
1380         }
1381
1382         old_parent = ext4_iget(sb, darg.parent_ino,
1383                                 EXT4_IGET_NORMAL);
1384         if (IS_ERR(old_parent)) {
1385                 jbd_debug(1, "Dir with inode  %d not found", darg.parent_ino);
1386                 iput(inode);
1387                 return 0;
1388         }
1389
1390         ret = __ext4_unlink(NULL, old_parent, &entry, inode);
1391         /* -ENOENT ok coz it might not exist anymore. */
1392         if (ret == -ENOENT)
1393                 ret = 0;
1394         iput(old_parent);
1395         iput(inode);
1396         return ret;
1397 }
1398
1399 static int ext4_fc_replay_link_internal(struct super_block *sb,
1400                                 struct dentry_info_args *darg,
1401                                 struct inode *inode)
1402 {
1403         struct inode *dir = NULL;
1404         struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1405         struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1406         int ret = 0;
1407
1408         dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1409         if (IS_ERR(dir)) {
1410                 jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
1411                 dir = NULL;
1412                 goto out;
1413         }
1414
1415         dentry_dir = d_obtain_alias(dir);
1416         if (IS_ERR(dentry_dir)) {
1417                 jbd_debug(1, "Failed to obtain dentry");
1418                 dentry_dir = NULL;
1419                 goto out;
1420         }
1421
1422         dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1423         if (!dentry_inode) {
1424                 jbd_debug(1, "Inode dentry not created.");
1425                 ret = -ENOMEM;
1426                 goto out;
1427         }
1428
1429         ret = __ext4_link(dir, inode, dentry_inode);
1430         /*
1431          * It's possible that link already existed since data blocks
1432          * for the dir in question got persisted before we crashed OR
1433          * we replayed this tag and crashed before the entire replay
1434          * could complete.
1435          */
1436         if (ret && ret != -EEXIST) {
1437                 jbd_debug(1, "Failed to link\n");
1438                 goto out;
1439         }
1440
1441         ret = 0;
1442 out:
1443         if (dentry_dir) {
1444                 d_drop(dentry_dir);
1445                 dput(dentry_dir);
1446         } else if (dir) {
1447                 iput(dir);
1448         }
1449         if (dentry_inode) {
1450                 d_drop(dentry_inode);
1451                 dput(dentry_inode);
1452         }
1453
1454         return ret;
1455 }
1456
1457 /* Link replay function */
1458 static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
1459                                u8 *val)
1460 {
1461         struct inode *inode;
1462         struct dentry_info_args darg;
1463         int ret = 0;
1464
1465         tl_to_darg(&darg, tl, val);
1466         trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1467                         darg.parent_ino, darg.dname_len);
1468
1469         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1470         if (IS_ERR(inode)) {
1471                 jbd_debug(1, "Inode not found.");
1472                 return 0;
1473         }
1474
1475         ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1476         iput(inode);
1477         return ret;
1478 }
1479
1480 /*
1481  * Record all the modified inodes during replay. We use this later to setup
1482  * block bitmaps correctly.
1483  */
1484 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1485 {
1486         struct ext4_fc_replay_state *state;
1487         int i;
1488
1489         state = &EXT4_SB(sb)->s_fc_replay_state;
1490         for (i = 0; i < state->fc_modified_inodes_used; i++)
1491                 if (state->fc_modified_inodes[i] == ino)
1492                         return 0;
1493         if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1494                 state->fc_modified_inodes = krealloc(
1495                                 state->fc_modified_inodes,
1496                                 sizeof(int) * (state->fc_modified_inodes_size +
1497                                 EXT4_FC_REPLAY_REALLOC_INCREMENT),
1498                                 GFP_KERNEL);
1499                 if (!state->fc_modified_inodes)
1500                         return -ENOMEM;
1501                 state->fc_modified_inodes_size +=
1502                         EXT4_FC_REPLAY_REALLOC_INCREMENT;
1503         }
1504         state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1505         return 0;
1506 }
1507
1508 /*
1509  * Inode replay function
1510  */
1511 static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
1512                                 u8 *val)
1513 {
1514         struct ext4_fc_inode fc_inode;
1515         struct ext4_inode *raw_inode;
1516         struct ext4_inode *raw_fc_inode;
1517         struct inode *inode = NULL;
1518         struct ext4_iloc iloc;
1519         int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1520         struct ext4_extent_header *eh;
1521
1522         memcpy(&fc_inode, val, sizeof(fc_inode));
1523
1524         ino = le32_to_cpu(fc_inode.fc_ino);
1525         trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1526
1527         inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1528         if (!IS_ERR(inode)) {
1529                 ext4_ext_clear_bb(inode);
1530                 iput(inode);
1531         }
1532         inode = NULL;
1533
1534         ret = ext4_fc_record_modified_inode(sb, ino);
1535         if (ret)
1536                 goto out;
1537
1538         raw_fc_inode = (struct ext4_inode *)
1539                 (val + offsetof(struct ext4_fc_inode, fc_raw_inode));
1540         ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1541         if (ret)
1542                 goto out;
1543
1544         inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
1545         raw_inode = ext4_raw_inode(&iloc);
1546
1547         memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1548         memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1549                 inode_len - offsetof(struct ext4_inode, i_generation));
1550         if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1551                 eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1552                 if (eh->eh_magic != EXT4_EXT_MAGIC) {
1553                         memset(eh, 0, sizeof(*eh));
1554                         eh->eh_magic = EXT4_EXT_MAGIC;
1555                         eh->eh_max = cpu_to_le16(
1556                                 (sizeof(raw_inode->i_block) -
1557                                  sizeof(struct ext4_extent_header))
1558                                  / sizeof(struct ext4_extent));
1559                 }
1560         } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1561                 memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1562                         sizeof(raw_inode->i_block));
1563         }
1564
1565         /* Immediately update the inode on disk. */
1566         ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1567         if (ret)
1568                 goto out;
1569         ret = sync_dirty_buffer(iloc.bh);
1570         if (ret)
1571                 goto out;
1572         ret = ext4_mark_inode_used(sb, ino);
1573         if (ret)
1574                 goto out;
1575
1576         /* Given that we just wrote the inode on disk, this SHOULD succeed. */
1577         inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1578         if (IS_ERR(inode)) {
1579                 jbd_debug(1, "Inode not found.");
1580                 return -EFSCORRUPTED;
1581         }
1582
1583         /*
1584          * Our allocator could have made different decisions than before
1585          * crashing. This should be fixed but until then, we calculate
1586          * the number of blocks the inode.
1587          */
1588         if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
1589                 ext4_ext_replay_set_iblocks(inode);
1590
1591         inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1592         ext4_reset_inode_seed(inode);
1593
1594         ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1595         ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1596         sync_dirty_buffer(iloc.bh);
1597         brelse(iloc.bh);
1598 out:
1599         iput(inode);
1600         if (!ret)
1601                 blkdev_issue_flush(sb->s_bdev);
1602
1603         return 0;
1604 }
1605
1606 /*
1607  * Dentry create replay function.
1608  *
1609  * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
1610  * inode for which we are trying to create a dentry here, should already have
1611  * been replayed before we start here.
1612  */
1613 static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
1614                                  u8 *val)
1615 {
1616         int ret = 0;
1617         struct inode *inode = NULL;
1618         struct inode *dir = NULL;
1619         struct dentry_info_args darg;
1620
1621         tl_to_darg(&darg, tl, val);
1622
1623         trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1624                         darg.parent_ino, darg.dname_len);
1625
1626         /* This takes care of update group descriptor and other metadata */
1627         ret = ext4_mark_inode_used(sb, darg.ino);
1628         if (ret)
1629                 goto out;
1630
1631         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1632         if (IS_ERR(inode)) {
1633                 jbd_debug(1, "inode %d not found.", darg.ino);
1634                 inode = NULL;
1635                 ret = -EINVAL;
1636                 goto out;
1637         }
1638
1639         if (S_ISDIR(inode->i_mode)) {
1640                 /*
1641                  * If we are creating a directory, we need to make sure that the
1642                  * dot and dot dot dirents are setup properly.
1643                  */
1644                 dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1645                 if (IS_ERR(dir)) {
1646                         jbd_debug(1, "Dir %d not found.", darg.ino);
1647                         goto out;
1648                 }
1649                 ret = ext4_init_new_dir(NULL, dir, inode);
1650                 iput(dir);
1651                 if (ret) {
1652                         ret = 0;
1653                         goto out;
1654                 }
1655         }
1656         ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1657         if (ret)
1658                 goto out;
1659         set_nlink(inode, 1);
1660         ext4_mark_inode_dirty(NULL, inode);
1661 out:
1662         iput(inode);
1663         return ret;
1664 }
1665
1666 /*
1667  * Record physical disk regions which are in use as per fast commit area,
1668  * and used by inodes during replay phase. Our simple replay phase
1669  * allocator excludes these regions from allocation.
1670  */
1671 int ext4_fc_record_regions(struct super_block *sb, int ino,
1672                 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
1673 {
1674         struct ext4_fc_replay_state *state;
1675         struct ext4_fc_alloc_region *region;
1676
1677         state = &EXT4_SB(sb)->s_fc_replay_state;
1678         /*
1679          * during replay phase, the fc_regions_valid may not same as
1680          * fc_regions_used, update it when do new additions.
1681          */
1682         if (replay && state->fc_regions_used != state->fc_regions_valid)
1683                 state->fc_regions_used = state->fc_regions_valid;
1684         if (state->fc_regions_used == state->fc_regions_size) {
1685                 state->fc_regions_size +=
1686                         EXT4_FC_REPLAY_REALLOC_INCREMENT;
1687                 state->fc_regions = krealloc(
1688                                         state->fc_regions,
1689                                         state->fc_regions_size *
1690                                         sizeof(struct ext4_fc_alloc_region),
1691                                         GFP_KERNEL);
1692                 if (!state->fc_regions)
1693                         return -ENOMEM;
1694         }
1695         region = &state->fc_regions[state->fc_regions_used++];
1696         region->ino = ino;
1697         region->lblk = lblk;
1698         region->pblk = pblk;
1699         region->len = len;
1700
1701         if (replay)
1702                 state->fc_regions_valid++;
1703
1704         return 0;
1705 }
1706
1707 /* Replay add range tag */
1708 static int ext4_fc_replay_add_range(struct super_block *sb,
1709                                     struct ext4_fc_tl *tl, u8 *val)
1710 {
1711         struct ext4_fc_add_range fc_add_ex;
1712         struct ext4_extent newex, *ex;
1713         struct inode *inode;
1714         ext4_lblk_t start, cur;
1715         int remaining, len;
1716         ext4_fsblk_t start_pblk;
1717         struct ext4_map_blocks map;
1718         struct ext4_ext_path *path = NULL;
1719         int ret;
1720
1721         memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
1722         ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
1723
1724         trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1725                 le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
1726                 ext4_ext_get_actual_len(ex));
1727
1728         inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
1729         if (IS_ERR(inode)) {
1730                 jbd_debug(1, "Inode not found.");
1731                 return 0;
1732         }
1733
1734         ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1735         if (ret)
1736                 goto out;
1737
1738         start = le32_to_cpu(ex->ee_block);
1739         start_pblk = ext4_ext_pblock(ex);
1740         len = ext4_ext_get_actual_len(ex);
1741
1742         cur = start;
1743         remaining = len;
1744         jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1745                   start, start_pblk, len, ext4_ext_is_unwritten(ex),
1746                   inode->i_ino);
1747
1748         while (remaining > 0) {
1749                 map.m_lblk = cur;
1750                 map.m_len = remaining;
1751                 map.m_pblk = 0;
1752                 ret = ext4_map_blocks(NULL, inode, &map, 0);
1753
1754                 if (ret < 0)
1755                         goto out;
1756
1757                 if (ret == 0) {
1758                         /* Range is not mapped */
1759                         path = ext4_find_extent(inode, cur, NULL, 0);
1760                         if (IS_ERR(path))
1761                                 goto out;
1762                         memset(&newex, 0, sizeof(newex));
1763                         newex.ee_block = cpu_to_le32(cur);
1764                         ext4_ext_store_pblock(
1765                                 &newex, start_pblk + cur - start);
1766                         newex.ee_len = cpu_to_le16(map.m_len);
1767                         if (ext4_ext_is_unwritten(ex))
1768                                 ext4_ext_mark_unwritten(&newex);
1769                         down_write(&EXT4_I(inode)->i_data_sem);
1770                         ret = ext4_ext_insert_extent(
1771                                 NULL, inode, &path, &newex, 0);
1772                         up_write((&EXT4_I(inode)->i_data_sem));
1773                         ext4_ext_drop_refs(path);
1774                         kfree(path);
1775                         if (ret)
1776                                 goto out;
1777                         goto next;
1778                 }
1779
1780                 if (start_pblk + cur - start != map.m_pblk) {
1781                         /*
1782                          * Logical to physical mapping changed. This can happen
1783                          * if this range was removed and then reallocated to
1784                          * map to new physical blocks during a fast commit.
1785                          */
1786                         ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1787                                         ext4_ext_is_unwritten(ex),
1788                                         start_pblk + cur - start);
1789                         if (ret)
1790                                 goto out;
1791                         /*
1792                          * Mark the old blocks as free since they aren't used
1793                          * anymore. We maintain an array of all the modified
1794                          * inodes. In case these blocks are still used at either
1795                          * a different logical range in the same inode or in
1796                          * some different inode, we will mark them as allocated
1797                          * at the end of the FC replay using our array of
1798                          * modified inodes.
1799                          */
1800                         ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1801                         goto next;
1802                 }
1803
1804                 /* Range is mapped and needs a state change */
1805                 jbd_debug(1, "Converting from %ld to %d %lld",
1806                                 map.m_flags & EXT4_MAP_UNWRITTEN,
1807                         ext4_ext_is_unwritten(ex), map.m_pblk);
1808                 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1809                                         ext4_ext_is_unwritten(ex), map.m_pblk);
1810                 if (ret)
1811                         goto out;
1812                 /*
1813                  * We may have split the extent tree while toggling the state.
1814                  * Try to shrink the extent tree now.
1815                  */
1816                 ext4_ext_replay_shrink_inode(inode, start + len);
1817 next:
1818                 cur += map.m_len;
1819                 remaining -= map.m_len;
1820         }
1821         ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1822                                         sb->s_blocksize_bits);
1823 out:
1824         iput(inode);
1825         return 0;
1826 }
1827
1828 /* Replay DEL_RANGE tag */
1829 static int
1830 ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
1831                          u8 *val)
1832 {
1833         struct inode *inode;
1834         struct ext4_fc_del_range lrange;
1835         struct ext4_map_blocks map;
1836         ext4_lblk_t cur, remaining;
1837         int ret;
1838
1839         memcpy(&lrange, val, sizeof(lrange));
1840         cur = le32_to_cpu(lrange.fc_lblk);
1841         remaining = le32_to_cpu(lrange.fc_len);
1842
1843         trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1844                 le32_to_cpu(lrange.fc_ino), cur, remaining);
1845
1846         inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
1847         if (IS_ERR(inode)) {
1848                 jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
1849                 return 0;
1850         }
1851
1852         ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1853         if (ret)
1854                 goto out;
1855
1856         jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1857                         inode->i_ino, le32_to_cpu(lrange.fc_lblk),
1858                         le32_to_cpu(lrange.fc_len));
1859         while (remaining > 0) {
1860                 map.m_lblk = cur;
1861                 map.m_len = remaining;
1862
1863                 ret = ext4_map_blocks(NULL, inode, &map, 0);
1864                 if (ret < 0)
1865                         goto out;
1866                 if (ret > 0) {
1867                         remaining -= ret;
1868                         cur += ret;
1869                         ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1870                 } else {
1871                         remaining -= map.m_len;
1872                         cur += map.m_len;
1873                 }
1874         }
1875
1876         down_write(&EXT4_I(inode)->i_data_sem);
1877         ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
1878                                 le32_to_cpu(lrange.fc_lblk) +
1879                                 le32_to_cpu(lrange.fc_len) - 1);
1880         up_write(&EXT4_I(inode)->i_data_sem);
1881         if (ret)
1882                 goto out;
1883         ext4_ext_replay_shrink_inode(inode,
1884                 i_size_read(inode) >> sb->s_blocksize_bits);
1885         ext4_mark_inode_dirty(NULL, inode);
1886 out:
1887         iput(inode);
1888         return 0;
1889 }
1890
1891 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1892 {
1893         struct ext4_fc_replay_state *state;
1894         struct inode *inode;
1895         struct ext4_ext_path *path = NULL;
1896         struct ext4_map_blocks map;
1897         int i, ret, j;
1898         ext4_lblk_t cur, end;
1899
1900         state = &EXT4_SB(sb)->s_fc_replay_state;
1901         for (i = 0; i < state->fc_modified_inodes_used; i++) {
1902                 inode = ext4_iget(sb, state->fc_modified_inodes[i],
1903                         EXT4_IGET_NORMAL);
1904                 if (IS_ERR(inode)) {
1905                         jbd_debug(1, "Inode %d not found.",
1906                                 state->fc_modified_inodes[i]);
1907                         continue;
1908                 }
1909                 cur = 0;
1910                 end = EXT_MAX_BLOCKS;
1911                 if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
1912                         iput(inode);
1913                         continue;
1914                 }
1915                 while (cur < end) {
1916                         map.m_lblk = cur;
1917                         map.m_len = end - cur;
1918
1919                         ret = ext4_map_blocks(NULL, inode, &map, 0);
1920                         if (ret < 0)
1921                                 break;
1922
1923                         if (ret > 0) {
1924                                 path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1925                                 if (!IS_ERR(path)) {
1926                                         for (j = 0; j < path->p_depth; j++)
1927                                                 ext4_mb_mark_bb(inode->i_sb,
1928                                                         path[j].p_block, 1, 1);
1929                                         ext4_ext_drop_refs(path);
1930                                         kfree(path);
1931                                 }
1932                                 cur += ret;
1933                                 ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1934                                                         map.m_len, 1);
1935                         } else {
1936                                 cur = cur + (map.m_len ? map.m_len : 1);
1937                         }
1938                 }
1939                 iput(inode);
1940         }
1941 }
1942
1943 /*
1944  * Check if block is in excluded regions for block allocation. The simple
1945  * allocator that runs during replay phase is calls this function to see
1946  * if it is okay to use a block.
1947  */
1948 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1949 {
1950         int i;
1951         struct ext4_fc_replay_state *state;
1952
1953         state = &EXT4_SB(sb)->s_fc_replay_state;
1954         for (i = 0; i < state->fc_regions_valid; i++) {
1955                 if (state->fc_regions[i].ino == 0 ||
1956                         state->fc_regions[i].len == 0)
1957                         continue;
1958                 if (in_range(blk, state->fc_regions[i].pblk,
1959                                         state->fc_regions[i].len))
1960                         return true;
1961         }
1962         return false;
1963 }
1964
1965 /* Cleanup function called after replay */
1966 void ext4_fc_replay_cleanup(struct super_block *sb)
1967 {
1968         struct ext4_sb_info *sbi = EXT4_SB(sb);
1969
1970         sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1971         kfree(sbi->s_fc_replay_state.fc_regions);
1972         kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1973 }
1974
1975 /*
1976  * Recovery Scan phase handler
1977  *
1978  * This function is called during the scan phase and is responsible
1979  * for doing following things:
1980  * - Make sure the fast commit area has valid tags for replay
1981  * - Count number of tags that need to be replayed by the replay handler
1982  * - Verify CRC
1983  * - Create a list of excluded blocks for allocation during replay phase
1984  *
1985  * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1986  * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1987  * to indicate that scan has finished and JBD2 can now start replay phase.
1988  * It returns a negative error to indicate that there was an error. At the end
1989  * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
1990  * to indicate the number of tags that need to replayed during the replay phase.
1991  */
1992 static int ext4_fc_replay_scan(journal_t *journal,
1993                                 struct buffer_head *bh, int off,
1994                                 tid_t expected_tid)
1995 {
1996         struct super_block *sb = journal->j_private;
1997         struct ext4_sb_info *sbi = EXT4_SB(sb);
1998         struct ext4_fc_replay_state *state;
1999         int ret = JBD2_FC_REPLAY_CONTINUE;
2000         struct ext4_fc_add_range ext;
2001         struct ext4_fc_tl tl;
2002         struct ext4_fc_tail tail;
2003         __u8 *start, *end, *cur, *val;
2004         struct ext4_fc_head head;
2005         struct ext4_extent *ex;
2006
2007         state = &sbi->s_fc_replay_state;
2008
2009         start = (u8 *)bh->b_data;
2010         end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2011
2012         if (state->fc_replay_expected_off == 0) {
2013                 state->fc_cur_tag = 0;
2014                 state->fc_replay_num_tags = 0;
2015                 state->fc_crc = 0;
2016                 state->fc_regions = NULL;
2017                 state->fc_regions_valid = state->fc_regions_used =
2018                         state->fc_regions_size = 0;
2019                 /* Check if we can stop early */
2020                 if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
2021                         != EXT4_FC_TAG_HEAD)
2022                         return 0;
2023         }
2024
2025         if (off != state->fc_replay_expected_off) {
2026                 ret = -EFSCORRUPTED;
2027                 goto out_err;
2028         }
2029
2030         state->fc_replay_expected_off++;
2031         for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
2032                 memcpy(&tl, cur, sizeof(tl));
2033                 val = cur + sizeof(tl);
2034                 jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
2035                           tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
2036                 switch (le16_to_cpu(tl.fc_tag)) {
2037                 case EXT4_FC_TAG_ADD_RANGE:
2038                         memcpy(&ext, val, sizeof(ext));
2039                         ex = (struct ext4_extent *)&ext.fc_ex;
2040                         ret = ext4_fc_record_regions(sb,
2041                                 le32_to_cpu(ext.fc_ino),
2042                                 le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
2043                                 ext4_ext_get_actual_len(ex), 0);
2044                         if (ret < 0)
2045                                 break;
2046                         ret = JBD2_FC_REPLAY_CONTINUE;
2047                         fallthrough;
2048                 case EXT4_FC_TAG_DEL_RANGE:
2049                 case EXT4_FC_TAG_LINK:
2050                 case EXT4_FC_TAG_UNLINK:
2051                 case EXT4_FC_TAG_CREAT:
2052                 case EXT4_FC_TAG_INODE:
2053                 case EXT4_FC_TAG_PAD:
2054                         state->fc_cur_tag++;
2055                         state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2056                                         sizeof(tl) + le16_to_cpu(tl.fc_len));
2057                         break;
2058                 case EXT4_FC_TAG_TAIL:
2059                         state->fc_cur_tag++;
2060                         memcpy(&tail, val, sizeof(tail));
2061                         state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2062                                                 sizeof(tl) +
2063                                                 offsetof(struct ext4_fc_tail,
2064                                                 fc_crc));
2065                         if (le32_to_cpu(tail.fc_tid) == expected_tid &&
2066                                 le32_to_cpu(tail.fc_crc) == state->fc_crc) {
2067                                 state->fc_replay_num_tags = state->fc_cur_tag;
2068                                 state->fc_regions_valid =
2069                                         state->fc_regions_used;
2070                         } else {
2071                                 ret = state->fc_replay_num_tags ?
2072                                         JBD2_FC_REPLAY_STOP : -EFSBADCRC;
2073                         }
2074                         state->fc_crc = 0;
2075                         break;
2076                 case EXT4_FC_TAG_HEAD:
2077                         memcpy(&head, val, sizeof(head));
2078                         if (le32_to_cpu(head.fc_features) &
2079                                 ~EXT4_FC_SUPPORTED_FEATURES) {
2080                                 ret = -EOPNOTSUPP;
2081                                 break;
2082                         }
2083                         if (le32_to_cpu(head.fc_tid) != expected_tid) {
2084                                 ret = JBD2_FC_REPLAY_STOP;
2085                                 break;
2086                         }
2087                         state->fc_cur_tag++;
2088                         state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2089                                             sizeof(tl) + le16_to_cpu(tl.fc_len));
2090                         break;
2091                 default:
2092                         ret = state->fc_replay_num_tags ?
2093                                 JBD2_FC_REPLAY_STOP : -ECANCELED;
2094                 }
2095                 if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2096                         break;
2097         }
2098
2099 out_err:
2100         trace_ext4_fc_replay_scan(sb, ret, off);
2101         return ret;
2102 }
2103
2104 /*
2105  * Main recovery path entry point.
2106  * The meaning of return codes is similar as above.
2107  */
2108 static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2109                                 enum passtype pass, int off, tid_t expected_tid)
2110 {
2111         struct super_block *sb = journal->j_private;
2112         struct ext4_sb_info *sbi = EXT4_SB(sb);
2113         struct ext4_fc_tl tl;
2114         __u8 *start, *end, *cur, *val;
2115         int ret = JBD2_FC_REPLAY_CONTINUE;
2116         struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2117         struct ext4_fc_tail tail;
2118
2119         if (pass == PASS_SCAN) {
2120                 state->fc_current_pass = PASS_SCAN;
2121                 return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2122         }
2123
2124         if (state->fc_current_pass != pass) {
2125                 state->fc_current_pass = pass;
2126                 sbi->s_mount_state |= EXT4_FC_REPLAY;
2127         }
2128         if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2129                 jbd_debug(1, "Replay stops\n");
2130                 ext4_fc_set_bitmaps_and_counters(sb);
2131                 return 0;
2132         }
2133
2134 #ifdef CONFIG_EXT4_DEBUG
2135         if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2136                 pr_warn("Dropping fc block %d because max_replay set\n", off);
2137                 return JBD2_FC_REPLAY_STOP;
2138         }
2139 #endif
2140
2141         start = (u8 *)bh->b_data;
2142         end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2143
2144         for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
2145                 memcpy(&tl, cur, sizeof(tl));
2146                 val = cur + sizeof(tl);
2147
2148                 if (state->fc_replay_num_tags == 0) {
2149                         ret = JBD2_FC_REPLAY_STOP;
2150                         ext4_fc_set_bitmaps_and_counters(sb);
2151                         break;
2152                 }
2153                 jbd_debug(3, "Replay phase, tag:%s\n",
2154                                 tag2str(le16_to_cpu(tl.fc_tag)));
2155                 state->fc_replay_num_tags--;
2156                 switch (le16_to_cpu(tl.fc_tag)) {
2157                 case EXT4_FC_TAG_LINK:
2158                         ret = ext4_fc_replay_link(sb, &tl, val);
2159                         break;
2160                 case EXT4_FC_TAG_UNLINK:
2161                         ret = ext4_fc_replay_unlink(sb, &tl, val);
2162                         break;
2163                 case EXT4_FC_TAG_ADD_RANGE:
2164                         ret = ext4_fc_replay_add_range(sb, &tl, val);
2165                         break;
2166                 case EXT4_FC_TAG_CREAT:
2167                         ret = ext4_fc_replay_create(sb, &tl, val);
2168                         break;
2169                 case EXT4_FC_TAG_DEL_RANGE:
2170                         ret = ext4_fc_replay_del_range(sb, &tl, val);
2171                         break;
2172                 case EXT4_FC_TAG_INODE:
2173                         ret = ext4_fc_replay_inode(sb, &tl, val);
2174                         break;
2175                 case EXT4_FC_TAG_PAD:
2176                         trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2177                                              le16_to_cpu(tl.fc_len), 0);
2178                         break;
2179                 case EXT4_FC_TAG_TAIL:
2180                         trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2181                                              le16_to_cpu(tl.fc_len), 0);
2182                         memcpy(&tail, val, sizeof(tail));
2183                         WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
2184                         break;
2185                 case EXT4_FC_TAG_HEAD:
2186                         break;
2187                 default:
2188                         trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
2189                                              le16_to_cpu(tl.fc_len), 0);
2190                         ret = -ECANCELED;
2191                         break;
2192                 }
2193                 if (ret < 0)
2194                         break;
2195                 ret = JBD2_FC_REPLAY_CONTINUE;
2196         }
2197         return ret;
2198 }
2199
2200 void ext4_fc_init(struct super_block *sb, journal_t *journal)
2201 {
2202         /*
2203          * We set replay callback even if fast commit disabled because we may
2204          * could still have fast commit blocks that need to be replayed even if
2205          * fast commit has now been turned off.
2206          */
2207         journal->j_fc_replay_callback = ext4_fc_replay;
2208         if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2209                 return;
2210         journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2211 }
2212
/*
 * Human readable names for the fast commit ineligibility reasons, indexed
 * by reason number when printed by ext4_fc_info_show(). Both the strings
 * and the pointer table itself are read-only.
 */
static const char * const fc_ineligible_reasons[] = {
        "Extended attributes changed",
        "Cross rename",
        "Journal flag changed",
        "Insufficient memory",
        "Swap boot",
        "Resize",
        "Dir renamed",
        "Falloc range op",
        "Data journalling",
        "FC Commit Failed"
};
2225
2226 int ext4_fc_info_show(struct seq_file *seq, void *v)
2227 {
2228         struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2229         struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2230         int i;
2231
2232         if (v != SEQ_START_TOKEN)
2233                 return 0;
2234
2235         seq_printf(seq,
2236                 "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2237                    stats->fc_num_commits, stats->fc_ineligible_commits,
2238                    stats->fc_numblks,
2239                    div_u64(stats->s_fc_avg_commit_time, 1000));
2240         seq_puts(seq, "Ineligible reasons:\n");
2241         for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2242                 seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2243                         stats->fc_ineligible_reason_count[i]);
2244
2245         return 0;
2246 }
2247
2248 int __init ext4_fc_init_dentry_cache(void)
2249 {
2250         ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2251                                            SLAB_RECLAIM_ACCOUNT);
2252
2253         if (ext4_fc_dentry_cachep == NULL)
2254                 return -ENOMEM;
2255
2256         return 0;
2257 }
2258
2259 void ext4_fc_destroy_dentry_cache(void)
2260 {
2261         kmem_cache_destroy(ext4_fc_dentry_cachep);
2262 }