Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso...
authorLinus Torvalds <torvalds@linux-foundation.org>
Thu, 24 Dec 2020 22:16:02 +0000 (14:16 -0800)
committerLinus Torvalds <torvalds@linux-foundation.org>
Thu, 24 Dec 2020 22:16:02 +0000 (14:16 -0800)
Pull ext4 updates from Ted Ts'o:
 "Various bug fixes and cleanups for ext4; no new features this cycle"

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (29 commits)
  ext4: remove unnecessary wbc parameter from ext4_bio_write_page
  ext4: avoid s_mb_prefetch to be zero in individual scenarios
  ext4: defer saving error info from atomic context
  ext4: simplify ext4 error translation
  ext4: move functions in super.c
  ext4: make ext4_abort() use __ext4_error()
  ext4: standardize error message in ext4_protect_reserved_inode()
  ext4: remove redundant sb checksum recomputation
  ext4: don't remount read-only with errors=continue on reboot
  ext4: fix deadlock with fs freezing and EA inodes
  jbd2: add a helper to find out number of fast commit blocks
  ext4: make fast_commit.h byte identical with e2fsprogs/fast_commit.h
  ext4: fix fall-through warnings for Clang
  ext4: add docs about fast commit idempotence
  ext4: remove the unused EXT4_CURRENT_REV macro
  ext4: fix an IS_ERR() vs NULL check
  ext4: check for invalid block size early when mounting a file system
  ext4: fix a memory leak of ext4_free_data
  ext4: delete nonsensical (commented-out) code inside ext4_xattr_block_set()
  ext4: update ext4_data_block_valid related comments
  ...

19 files changed:
Documentation/filesystems/ext4/journal.rst
fs/ext4/balloc.c
fs/ext4/block_validity.c
fs/ext4/ext4.h
fs/ext4/ext4_jbd2.c
fs/ext4/ext4_jbd2.h
fs/ext4/extents.c
fs/ext4/fast_commit.c
fs/ext4/fast_commit.h
fs/ext4/fsync.c
fs/ext4/indirect.c
fs/ext4/inode.c
fs/ext4/mballoc.c
fs/ext4/namei.c
fs/ext4/page-io.c
fs/ext4/super.c
fs/ext4/xattr.c
fs/jbd2/journal.c
include/linux/jbd2.h

index 849d5b1..cdbfec4 100644 (file)
@@ -681,3 +681,53 @@ Here is the list of supported tags and their meanings:
      - Stores the TID of the commit, CRC of the fast commit of which this tag
        represents the end of
 
+Fast Commit Replay Idempotence
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Fast commit tags are idempotent in nature provided the recovery code follows
+certain rules. The guiding principle that the commit path follows while
+committing is that it stores the result of a particular operation instead of
+storing the procedure.
+
+Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
+was associated with inode 10. During fast commit, instead of storing this
+operation as a procedure "rename a to b", we store the resulting file system
+state as a "series" of outcomes:
+
+- Link dirent b to inode 10
+- Unlink dirent a
+- Inode 10 with valid refcount
+
+Now when recovery code runs, it needs to "enforce" this state on the file
+system. This is what guarantees idempotence of fast commit replay.
+
+Let's take an example of a procedure that is not idempotent and see how fast
+commits make it idempotent. Consider the following sequence of operations:
+
+1) rm A
+2) mv B A
+3) read A
+
+If we store this sequence of operations as is then the replay is not idempotent.
+Let's say while in replay, we crash after (2). During the second replay,
+file A (which was actually created as a result of "mv B A" operation) would get
+deleted. Thus, file named A would be absent when we try to read A. So, this
+sequence of operations is not idempotent. However, as mentioned above, instead
+of storing the procedure, fast commits store the outcome of each procedure.
+Thus the fast commit log for the above procedure would be as follows:
+
+(Let's assume dirent A was linked to inode 10 and dirent B was linked to
+inode 11 before the replay)
+
+1) Unlink A
+2) Link A to inode 11
+3) Unlink B
+4) Inode 11
+
+If we crash after (3) we will have file A linked to inode 11. During the second
+replay, we will remove file A (inode 11). But we will create it back and make
+it point to inode 11. We won't find B, so we'll just skip that step. At this
+point, the refcount for inode 11 is not reliable, but that gets fixed by the
+replay of the last inode 11 tag. Thus, by converting a non-idempotent procedure
+into a series of idempotent outcomes, fast commits ensure idempotence during
+the replay.
index 1d640b1..f45f9fe 100644 (file)
@@ -185,7 +185,7 @@ static int ext4_init_block_bitmap(struct super_block *sb,
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_fsblk_t start, tmp;
 
-       J_ASSERT_BH(bh, buffer_locked(bh));
+       ASSERT(buffer_locked(bh));
 
        /* If checksum is bad mark all blocks used to prevent allocation
         * essentially implementing a per-group read-only flag. */
index 8e6ca23..4666b55 100644 (file)
@@ -176,12 +176,10 @@ static int ext4_protect_reserved_inode(struct super_block *sb,
                        err = add_system_zone(system_blks, map.m_pblk, n, ino);
                        if (err < 0) {
                                if (err == -EFSCORRUPTED) {
-                                       __ext4_error(sb, __func__, __LINE__,
-                                                    -err, map.m_pblk,
-                                                    "blocks %llu-%llu from inode %u overlap system zone",
-                                                    map.m_pblk,
-                                                    map.m_pblk + map.m_len - 1,
-                                                    ino);
+                                       EXT4_ERROR_INODE_ERR(inode, -err,
+                                               "blocks %llu-%llu from inode overlap system zone",
+                                               map.m_pblk,
+                                               map.m_pblk + map.m_len - 1);
                                }
                                break;
                        }
@@ -206,7 +204,7 @@ static void ext4_destroy_system_zone(struct rcu_head *rcu)
  *
  * The update of system_blks pointer in this function is protected by
  * sb->s_umount semaphore. However we have to be careful as we can be
- * racing with ext4_data_block_valid() calls reading system_blks rbtree
+ * racing with ext4_inode_block_valid() calls reading system_blks rbtree
  * protected only by RCU. That's why we first build the rbtree and then
  * swap it in place.
  */
@@ -258,7 +256,7 @@ int ext4_setup_system_zone(struct super_block *sb)
 
        /*
         * System blks rbtree complete, announce it once to prevent racing
-        * with ext4_data_block_valid() accessing the rbtree at the same
+        * with ext4_inode_block_valid() accessing the rbtree at the same
         * time.
         */
        rcu_assign_pointer(sbi->s_system_blks, system_blks);
@@ -278,7 +276,7 @@ err:
  *
  * The update of system_blks pointer in this function is protected by
  * sb->s_umount semaphore. However we have to be careful as we can be
- * racing with ext4_data_block_valid() calls reading system_blks rbtree
+ * racing with ext4_inode_block_valid() calls reading system_blks rbtree
  * protected only by RCU. So we first clear the system_blks pointer and
  * then free the rbtree only after RCU grace period expires.
  */
index c64ea8f..2866d24 100644 (file)
 #define ext_debug(ino, fmt, ...)       no_printk(fmt, ##__VA_ARGS__)
 #endif
 
+#define ASSERT(assert)                                         \
+do {                                                                   \
+       if (unlikely(!(assert))) {                                      \
+               printk(KERN_EMERG                                       \
+                      "Assertion failure in %s() at %s:%d: '%s'\n",    \
+                      __func__, __FILE__, __LINE__, #assert);          \
+               BUG();                                                  \
+       }                                                               \
+} while (0)
+
 /* data type for block offset of block group */
 typedef int ext4_grpblk_t;
 
@@ -1619,6 +1629,27 @@ struct ext4_sb_info {
        errseq_t s_bdev_wb_err;
        spinlock_t s_bdev_wb_lock;
 
+       /* Information about errors that happened during this mount */
+       spinlock_t s_error_lock;
+       int s_add_error_count;
+       int s_first_error_code;
+       __u32 s_first_error_line;
+       __u32 s_first_error_ino;
+       __u64 s_first_error_block;
+       const char *s_first_error_func;
+       time64_t s_first_error_time;
+       int s_last_error_code;
+       __u32 s_last_error_line;
+       __u32 s_last_error_ino;
+       __u64 s_last_error_block;
+       const char *s_last_error_func;
+       time64_t s_last_error_time;
+       /*
+        * If we are in a context where we cannot update error information in
+        * the on-disk superblock, we queue this work to do it.
+        */
+       struct work_struct s_error_work;
+
        /* Ext4 fast commit stuff */
        atomic_t s_fc_subtid;
        atomic_t s_fc_ineligible_updates;
@@ -1858,7 +1889,6 @@ static inline bool ext4_verity_in_progress(struct inode *inode)
 #define EXT4_GOOD_OLD_REV      0       /* The good old (original) format */
 #define EXT4_DYNAMIC_REV       1       /* V2 format w/ dynamic inode sizes */
 
-#define EXT4_CURRENT_REV       EXT4_GOOD_OLD_REV
 #define EXT4_MAX_SUPP_REV      EXT4_DYNAMIC_REV
 
 #define EXT4_GOOD_OLD_INODE_SIZE 128
@@ -2952,9 +2982,9 @@ extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
                                             ext4_group_t block_group,
                                             unsigned int flags);
 
-extern __printf(6, 7)
-void __ext4_error(struct super_block *, const char *, unsigned int, int, __u64,
-                 const char *, ...);
+extern __printf(7, 8)
+void __ext4_error(struct super_block *, const char *, unsigned int, bool,
+                 int, __u64, const char *, ...);
 extern __printf(6, 7)
 void __ext4_error_inode(struct inode *, const char *, unsigned int,
                        ext4_fsblk_t, int, const char *, ...);
@@ -2963,9 +2993,6 @@ void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
                     const char *, ...);
 extern void __ext4_std_error(struct super_block *, const char *,
                             unsigned int, int);
-extern __printf(5, 6)
-void __ext4_abort(struct super_block *, const char *, unsigned int, int,
-                 const char *, ...);
 extern __printf(4, 5)
 void __ext4_warning(struct super_block *, const char *, unsigned int,
                    const char *, ...);
@@ -2995,6 +3022,9 @@ void __ext4_grp_locked_error(const char *, unsigned int,
 #define EXT4_ERROR_FILE(file, block, fmt, a...)                                \
        ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a)
 
+#define ext4_abort(sb, err, fmt, a...)                                 \
+       __ext4_error((sb), __func__, __LINE__, true, (err), 0, (fmt), ## a)
+
 #ifdef CONFIG_PRINTK
 
 #define ext4_error_inode(inode, func, line, block, fmt, ...)           \
@@ -3005,11 +3035,11 @@ void __ext4_grp_locked_error(const char *, unsigned int,
 #define ext4_error_file(file, func, line, block, fmt, ...)             \
        __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__)
 #define ext4_error(sb, fmt, ...)                                       \
-       __ext4_error((sb), __func__, __LINE__, 0, 0, (fmt), ##__VA_ARGS__)
+       __ext4_error((sb), __func__, __LINE__, false, 0, 0, (fmt),      \
+               ##__VA_ARGS__)
 #define ext4_error_err(sb, err, fmt, ...)                              \
-       __ext4_error((sb), __func__, __LINE__, (err), 0, (fmt), ##__VA_ARGS__)
-#define ext4_abort(sb, err, fmt, ...)                                  \
-       __ext4_abort((sb), __func__, __LINE__, (err), (fmt), ##__VA_ARGS__)
+       __ext4_error((sb), __func__, __LINE__, false, (err), 0, (fmt),  \
+               ##__VA_ARGS__)
 #define ext4_warning(sb, fmt, ...)                                     \
        __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
 #define ext4_warning_inode(inode, fmt, ...)                            \
@@ -3042,17 +3072,12 @@ do {                                                                    \
 #define ext4_error(sb, fmt, ...)                                       \
 do {                                                                   \
        no_printk(fmt, ##__VA_ARGS__);                                  \
-       __ext4_error(sb, "", 0, 0, 0, " ");                             \
+       __ext4_error(sb, "", 0, false, 0, 0, " ");                      \
 } while (0)
 #define ext4_error_err(sb, err, fmt, ...)                              \
 do {                                                                   \
        no_printk(fmt, ##__VA_ARGS__);                                  \
-       __ext4_error(sb, "", 0, err, 0, " ");                           \
-} while (0)
-#define ext4_abort(sb, err, fmt, ...)                                  \
-do {                                                                   \
-       no_printk(fmt, ##__VA_ARGS__);                                  \
-       __ext4_abort(sb, "", 0, err, " ");                              \
+       __ext4_error(sb, "", 0, false, err, 0, " ");                    \
 } while (0)
 #define ext4_warning(sb, fmt, ...)                                     \
 do {                                                                   \
@@ -3361,6 +3386,21 @@ static inline void ext4_unlock_group(struct super_block *sb,
        spin_unlock(ext4_group_lock_ptr(sb, group));
 }
 
+#ifdef CONFIG_QUOTA
+static inline bool ext4_quota_capable(struct super_block *sb)
+{
+       return (test_opt(sb, QUOTA) || ext4_has_feature_quota(sb));
+}
+
+static inline bool ext4_is_quota_journalled(struct super_block *sb)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+       return (ext4_has_feature_quota(sb) ||
+               sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]);
+}
+#endif
+
 /*
  * Block validity checking
  */
@@ -3609,7 +3649,6 @@ extern void ext4_io_submit(struct ext4_io_submit *io);
 extern int ext4_bio_write_page(struct ext4_io_submit *io,
                               struct page *page,
                               int len,
-                              struct writeback_control *wbc,
                               bool keep_towrite);
 extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end);
 extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);
index 0fd0c42..1a0a827 100644 (file)
@@ -296,8 +296,8 @@ int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
        if (err) {
                ext4_journal_abort_handle(where, line, __func__,
                                          bh, handle, err);
-               __ext4_abort(inode->i_sb, where, line, -err,
-                          "error %d when attempting revoke", err);
+               __ext4_error(inode->i_sb, where, line, true, -err, 0,
+                            "error %d when attempting revoke", err);
        }
        BUFFER_TRACE(bh, "exit");
        return err;
index 00dc668..a124c68 100644 (file)
 #ifdef CONFIG_QUOTA
 /* Amount of blocks needed for quota update - we know that the structure was
  * allocated so we need to update only data block */
-#define EXT4_QUOTA_TRANS_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
-               ext4_has_feature_quota(sb)) ? 1 : 0)
+#define EXT4_QUOTA_TRANS_BLOCKS(sb) ((ext4_quota_capable(sb)) ? 1 : 0)
 /* Amount of blocks needed for quota insert/delete - we do some block writes
  * but inode, sb and group updates are done only once */
-#define EXT4_QUOTA_INIT_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
-               ext4_has_feature_quota(sb)) ?\
+#define EXT4_QUOTA_INIT_BLOCKS(sb) ((ext4_quota_capable(sb)) ?\
                (DQUOT_INIT_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\
                 +3+DQUOT_INIT_REWRITE) : 0)
 
-#define EXT4_QUOTA_DEL_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
-               ext4_has_feature_quota(sb)) ?\
+#define EXT4_QUOTA_DEL_BLOCKS(sb) ((ext4_quota_capable(sb)) ?\
                (DQUOT_DEL_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\
                 +3+DQUOT_DEL_REWRITE) : 0)
 #else
index 17d7096..3960b7e 100644 (file)
@@ -5815,8 +5815,8 @@ int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start,
        int ret;
 
        path = ext4_find_extent(inode, start, NULL, 0);
-       if (!path)
-               return -EINVAL;
+       if (IS_ERR(path))
+               return PTR_ERR(path);
        ex = path[path->p_depth].p_ext;
        if (!ex) {
                ret = -EFSCORRUPTED;
@@ -5988,7 +5988,6 @@ int ext4_ext_replay_set_iblocks(struct inode *inode)
                        kfree(path);
                        break;
                }
-               ex = path2[path2->p_depth].p_ext;
                for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) {
                        cmp1 = cmp2 = 0;
                        if (i <= path->p_depth)
index f2033e1..4fcc21c 100644 (file)
  *
  * Replay code should thus check for all the valid tails in the FC area.
  *
+ * Fast Commit Replay Idempotence
+ * ------------------------------
+ *
+ * Fast commit tags are idempotent in nature provided the recovery code follows
+ * certain rules. The guiding principle that the commit path follows while
+ * committing is that it stores the result of a particular operation instead of
+ * storing the procedure.
+ *
+ * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
+ * was associated with inode 10. During fast commit, instead of storing this
+ * operation as a procedure "rename a to b", we store the resulting file system
+ * state as a "series" of outcomes:
+ *
+ * - Link dirent b to inode 10
+ * - Unlink dirent a
+ * - Inode <10> with valid refcount
+ *
+ * Now when recovery code runs, it needs to "enforce" this state on the file
+ * system. This is what guarantees idempotence of fast commit replay.
+ *
+ * Let's take an example of a procedure that is not idempotent and see how fast
+ * commits make it idempotent. Consider the following sequence of operations:
+ *
+ *     rm A;    mv B A;    read A
+ *  (x)     (y)        (z)
+ *
+ * (x), (y) and (z) are the points at which we can crash. If we store this
+ * sequence of operations as is then the replay is not idempotent. Let's say
+ * while in replay, we crash at (z). During the second replay, file A (which was
+ * actually created as a result of "mv B A" operation) would get deleted. Thus,
+ * file named A would be absent when we try to read A. So, this sequence of
+ * operations is not idempotent. However, as mentioned above, instead of storing
+ * the procedure, fast commits store the outcome of each procedure. Thus the
+ * fast commit log for the above procedure would be as follows:
+ *
+ * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
+ * inode 11 before the replay)
+ *
+ *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
+ * (w)          (x)                    (y)          (z)
+ *
+ * If we crash at (z), we will have file A linked to inode 11. During the second
+ * replay, we will remove file A (inode 11). But we will create it back and make
+ * it point to inode 11. We won't find B, so we'll just skip that step. At this
+ * point, the refcount for inode 11 is not reliable, but that gets fixed by the
+ * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
+ * similarly. Thus, by converting a non-idempotent procedure into a series of
+ * idempotent outcomes, fast commits ensure idempotence during the replay.
+ *
  * TODOs
  * -----
+ *
+ * 0) Fast commit replay path hardening: Fast commit replay code should use
+ *    journal handles to make sure all the updates it does during the replay
+ *    path are atomic. With that if we crash during fast commit replay, after
+ *    trying to do recovery again, we will find a file system where fast commit
+ *    area is invalid (because new full commit would be found). In order to deal
+ *    with that, fast commit replay code should ensure that the "FC_REPLAY"
+ *    superblock state is persisted before starting the replay, so that after
+ *    the crash, fast commit recovery code can look at that flag and perform
+ *    fast commit recovery even if that area is invalidated by later full
+ *    commits.
+ *
  * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
  *    eligible update must be protected within ext4_fc_start_update() and
  *    ext4_fc_stop_update(). These routines are called at much higher
@@ -1220,18 +1281,6 @@ static void ext4_fc_cleanup(journal_t *journal, int full)
 
 /* Ext4 Replay Path Routines */
 
-/* Get length of a particular tlv */
-static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl)
-{
-       return le16_to_cpu(tl->fc_len);
-}
-
-/* Get a pointer to "value" of a tlv */
-static inline u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl)
-{
-       return (u8 *)tl + sizeof(*tl);
-}
-
 /* Helper struct for dentry replay routines */
 struct dentry_info_args {
        int parent_ino, dname_len, ino, inode_len;
@@ -1770,32 +1819,6 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl)
        return 0;
 }
 
-static inline const char *tag2str(u16 tag)
-{
-       switch (tag) {
-       case EXT4_FC_TAG_LINK:
-               return "TAG_ADD_ENTRY";
-       case EXT4_FC_TAG_UNLINK:
-               return "TAG_DEL_ENTRY";
-       case EXT4_FC_TAG_ADD_RANGE:
-               return "TAG_ADD_RANGE";
-       case EXT4_FC_TAG_CREAT:
-               return "TAG_CREAT_DENTRY";
-       case EXT4_FC_TAG_DEL_RANGE:
-               return "TAG_DEL_RANGE";
-       case EXT4_FC_TAG_INODE:
-               return "TAG_INODE";
-       case EXT4_FC_TAG_PAD:
-               return "TAG_PAD";
-       case EXT4_FC_TAG_TAIL:
-               return "TAG_TAIL";
-       case EXT4_FC_TAG_HEAD:
-               return "TAG_HEAD";
-       default:
-               return "TAG_ERROR";
-       }
-}
-
 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
 {
        struct ext4_fc_replay_state *state;
index 3a6e5a1..b77f70f 100644 (file)
@@ -3,6 +3,11 @@
 #ifndef __FAST_COMMIT_H__
 #define __FAST_COMMIT_H__
 
+/*
+ * Note this file is present in e2fsprogs/lib/ext2fs/fast_commit.h and
+ * linux/fs/ext4/fast_commit.h. These files should always be byte identical.
+ */
+
 /* Fast commit tags */
 #define EXT4_FC_TAG_ADD_RANGE          0x0001
 #define EXT4_FC_TAG_DEL_RANGE          0x0002
@@ -50,7 +55,7 @@ struct ext4_fc_del_range {
 struct ext4_fc_dentry_info {
        __le32 fc_parent_ino;
        __le32 fc_ino;
-       u8 fc_dname[0];
+       __u8 fc_dname[0];
 };
 
 /* Value structure for EXT4_FC_TAG_INODE and EXT4_FC_TAG_INODE_PARTIAL. */
@@ -65,19 +70,6 @@ struct ext4_fc_tail {
        __le32 fc_crc;
 };
 
-/*
- * In memory list of dentry updates that are performed on the file
- * system used by fast commit code.
- */
-struct ext4_fc_dentry_update {
-       int fcd_op;             /* Type of update create / unlink / link */
-       int fcd_parent;         /* Parent inode number */
-       int fcd_ino;            /* Inode number */
-       struct qstr fcd_name;   /* Dirent name */
-       unsigned char fcd_iname[DNAME_INLINE_LEN];      /* Dirent name string */
-       struct list_head fcd_list;
-};
-
 /*
  * Fast commit reason codes
  */
@@ -107,6 +99,20 @@ enum {
        EXT4_FC_REASON_MAX
 };
 
+#ifdef __KERNEL__
+/*
+ * In memory list of dentry updates that are performed on the file
+ * system used by fast commit code.
+ */
+struct ext4_fc_dentry_update {
+       int fcd_op;             /* Type of update create / unlink / link */
+       int fcd_parent;         /* Parent inode number */
+       int fcd_ino;            /* Inode number */
+       struct qstr fcd_name;   /* Dirent name */
+       unsigned char fcd_iname[DNAME_INLINE_LEN];      /* Dirent name string */
+       struct list_head fcd_list;
+};
+
 struct ext4_fc_stats {
        unsigned int fc_ineligible_reason_count[EXT4_FC_REASON_MAX];
        unsigned long fc_num_commits;
@@ -145,13 +151,51 @@ struct ext4_fc_replay_state {
 };
 
 #define region_last(__region) (((__region)->lblk) + ((__region)->len) - 1)
+#endif
 
 #define fc_for_each_tl(__start, __end, __tl)                           \
-       for (tl = (struct ext4_fc_tl *)start;                           \
-               (u8 *)tl < (u8 *)end;                                   \
-               tl = (struct ext4_fc_tl *)((u8 *)tl +                   \
+       for (tl = (struct ext4_fc_tl *)(__start);                       \
+            (__u8 *)tl < (__u8 *)(__end);                              \
+               tl = (struct ext4_fc_tl *)((__u8 *)tl +                 \
                                        sizeof(struct ext4_fc_tl) +     \
                                        + le16_to_cpu(tl->fc_len)))
 
+static inline const char *tag2str(__u16 tag)
+{
+       switch (tag) {
+       case EXT4_FC_TAG_LINK:
+               return "ADD_ENTRY";
+       case EXT4_FC_TAG_UNLINK:
+               return "DEL_ENTRY";
+       case EXT4_FC_TAG_ADD_RANGE:
+               return "ADD_RANGE";
+       case EXT4_FC_TAG_CREAT:
+               return "CREAT_DENTRY";
+       case EXT4_FC_TAG_DEL_RANGE:
+               return "DEL_RANGE";
+       case EXT4_FC_TAG_INODE:
+               return "INODE";
+       case EXT4_FC_TAG_PAD:
+               return "PAD";
+       case EXT4_FC_TAG_TAIL:
+               return "TAIL";
+       case EXT4_FC_TAG_HEAD:
+               return "HEAD";
+       default:
+               return "ERROR";
+       }
+}
+
+/* Get length of a particular tlv */
+static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl)
+{
+       return le16_to_cpu(tl->fc_len);
+}
+
+/* Get a pointer to "value" of a tlv */
+static inline __u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl)
+{
+       return (__u8 *)tl + sizeof(*tl);
+}
 
 #endif /* __FAST_COMMIT_H__ */
index a42ca95..113bfb0 100644 (file)
@@ -136,7 +136,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
        if (unlikely(ext4_forced_shutdown(sbi)))
                return -EIO;
 
-       J_ASSERT(ext4_journal_current_handle() == NULL);
+       ASSERT(ext4_journal_current_handle() == NULL);
 
        trace_ext4_sync_file_enter(file, datasync);
 
index 05efa68..1223a18 100644 (file)
@@ -534,8 +534,8 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
        ext4_fsblk_t first_block = 0;
 
        trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
-       J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
-       J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
+       ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
+       ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
        depth = ext4_block_to_path(inode, map->m_lblk, offsets,
                                   &blocks_to_boundary);
 
index 0d8385a..2794688 100644 (file)
@@ -175,6 +175,7 @@ void ext4_evict_inode(struct inode *inode)
         */
        int extra_credits = 6;
        struct ext4_xattr_inode_array *ea_inode_array = NULL;
+       bool freeze_protected = false;
 
        trace_ext4_evict_inode(inode);
 
@@ -232,9 +233,14 @@ void ext4_evict_inode(struct inode *inode)
 
        /*
         * Protect us against freezing - iput() caller didn't have to have any
-        * protection against it
+        * protection against it. When we are in a running transaction though,
+        * we are already protected against freezing and we cannot grab further
+        * protection due to lock ordering constraints.
         */
-       sb_start_intwrite(inode->i_sb);
+       if (!ext4_journal_current_handle()) {
+               sb_start_intwrite(inode->i_sb);
+               freeze_protected = true;
+       }
 
        if (!IS_NOQUOTA(inode))
                extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb);
@@ -253,7 +259,8 @@ void ext4_evict_inode(struct inode *inode)
                 * cleaned up.
                 */
                ext4_orphan_del(NULL, inode);
-               sb_end_intwrite(inode->i_sb);
+               if (freeze_protected)
+                       sb_end_intwrite(inode->i_sb);
                goto no_delete;
        }
 
@@ -294,7 +301,8 @@ void ext4_evict_inode(struct inode *inode)
 stop_handle:
                ext4_journal_stop(handle);
                ext4_orphan_del(NULL, inode);
-               sb_end_intwrite(inode->i_sb);
+               if (freeze_protected)
+                       sb_end_intwrite(inode->i_sb);
                ext4_xattr_inode_array_free(ea_inode_array);
                goto no_delete;
        }
@@ -323,7 +331,8 @@ stop_handle:
        else
                ext4_free_inode(handle, inode);
        ext4_journal_stop(handle);
-       sb_end_intwrite(inode->i_sb);
+       if (freeze_protected)
+               sb_end_intwrite(inode->i_sb);
        ext4_xattr_inode_array_free(ea_inode_array);
        return;
 no_delete:
@@ -830,8 +839,8 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
        int create = map_flags & EXT4_GET_BLOCKS_CREATE;
        int err;
 
-       J_ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
-                || handle != NULL || create == 0);
+       ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+                   || handle != NULL || create == 0);
 
        map.m_lblk = block;
        map.m_len = 1;
@@ -846,9 +855,9 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
        if (unlikely(!bh))
                return ERR_PTR(-ENOMEM);
        if (map.m_flags & EXT4_MAP_NEW) {
-               J_ASSERT(create != 0);
-               J_ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
-                        || (handle != NULL));
+               ASSERT(create != 0);
+               ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+                           || (handle != NULL));
 
                /*
                 * Now that we do not always journal data, we should
@@ -2055,7 +2064,7 @@ static int ext4_writepage(struct page *page,
                unlock_page(page);
                return -ENOMEM;
        }
-       ret = ext4_bio_write_page(&io_submit, page, len, wbc, keep_towrite);
+       ret = ext4_bio_write_page(&io_submit, page, len, keep_towrite);
        ext4_io_submit(&io_submit);
        /* Drop io_end reference we got from init */
        ext4_put_io_end_defer(io_submit.io_end);
@@ -2089,7 +2098,7 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
                len = size & ~PAGE_MASK;
        else
                len = PAGE_SIZE;
-       err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false);
+       err = ext4_bio_write_page(&mpd->io_submit, page, len, false);
        if (!err)
                mpd->wbc->nr_to_write--;
        mpd->first_page++;
@@ -4610,7 +4619,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
            (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) {
                if (flags & EXT4_IGET_HANDLE)
                        return ERR_PTR(-ESTALE);
-               __ext4_error(sb, function, line, EFSCORRUPTED, 0,
+               __ext4_error(sb, function, line, false, EFSCORRUPTED, 0,
                             "inode #%lu: comm %s: iget: illegal inode #",
                             ino, current->comm);
                return ERR_PTR(-EFSCORRUPTED);
index 24af9ed..99bf091 100644 (file)
@@ -822,24 +822,6 @@ void ext4_mb_generate_buddy(struct super_block *sb,
        spin_unlock(&sbi->s_bal_lock);
 }
 
-static void mb_regenerate_buddy(struct ext4_buddy *e4b)
-{
-       int count;
-       int order = 1;
-       void *buddy;
-
-       while ((buddy = mb_find_buddy(e4b, order++, &count))) {
-               ext4_set_bits(buddy, 0, count);
-       }
-       e4b->bd_info->bb_fragments = 0;
-       memset(e4b->bd_info->bb_counters, 0,
-               sizeof(*e4b->bd_info->bb_counters) *
-               (e4b->bd_sb->s_blocksize_bits + 2));
-
-       ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
-               e4b->bd_bitmap, e4b->bd_group);
-}
-
 /* The buddy information is attached the buddy cache inode
  * for convenience. The information regarding each group
  * is loaded via ext4_mb_load_buddy. The information involve
@@ -1307,22 +1289,18 @@ static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
 
 static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
 {
-       int order = 1;
-       int bb_incr = 1 << (e4b->bd_blkbits - 1);
+       int order = 1, max;
        void *bb;
 
        BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
        BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
 
-       bb = e4b->bd_buddy;
        while (order <= e4b->bd_blkbits + 1) {
-               block = block >> 1;
-               if (!mb_test_bit(block, bb)) {
+               bb = mb_find_buddy(e4b, order, &max);
+               if (!mb_test_bit(block >> order, bb)) {
                        /* this block is part of buddy of order 'order' */
                        return order;
                }
-               bb += bb_incr;
-               bb_incr >>= 1;
                order++;
        }
        return 0;
@@ -1512,7 +1490,6 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
                                sb, e4b->bd_group,
                                EXT4_GROUP_INFO_BBITMAP_CORRUPT);
                }
-               mb_regenerate_buddy(e4b);
                goto done;
        }
 
@@ -2395,9 +2372,9 @@ repeat:
 
                                nr = sbi->s_mb_prefetch;
                                if (ext4_has_feature_flex_bg(sb)) {
-                                       nr = (group / sbi->s_mb_prefetch) *
-                                               sbi->s_mb_prefetch;
-                                       nr = nr + sbi->s_mb_prefetch - group;
+                                       nr = 1 << sbi->s_log_groups_per_flex;
+                                       nr -= group & (nr - 1);
+                                       nr = min(nr, sbi->s_mb_prefetch);
                                }
                                prefetch_grp = ext4_mb_prefetch(sb, group,
                                                        nr, &prefetch_ios);
@@ -2733,7 +2710,8 @@ static int ext4_mb_init_backend(struct super_block *sb)
 
        if (ext4_has_feature_flex_bg(sb)) {
                /* a single flex group is supposed to be read by a single IO */
-               sbi->s_mb_prefetch = 1 << sbi->s_es->s_log_groups_per_flex;
+               sbi->s_mb_prefetch = min(1 << sbi->s_es->s_log_groups_per_flex,
+                       BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9));
                sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
        } else {
                sbi->s_mb_prefetch = 32;
@@ -5126,6 +5104,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
                                ext4_group_first_block_no(sb, group) +
                                EXT4_C2B(sbi, cluster),
                                "Block already on to-be-freed list");
+                       kmem_cache_free(ext4_free_data_cachep, new_entry);
                        return 0;
                }
        }
index 326fe40..b17a082 100644 (file)
@@ -182,10 +182,6 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
        return bh;
 }
 
-#ifndef assert
-#define assert(test) J_ASSERT(test)
-#endif
-
 #ifdef DX_DEBUG
 #define dxtrace(command) command
 #else
@@ -843,7 +839,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
                                        break;
                                }
                        }
-                       assert (at == p - 1);
+                       ASSERT(at == p - 1);
                }
 
                at = p - 1;
@@ -1259,8 +1255,8 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
        struct dx_entry *old = frame->at, *new = old + 1;
        int count = dx_get_count(entries);
 
-       assert(count < dx_get_limit(entries));
-       assert(old < entries + count);
+       ASSERT(count < dx_get_limit(entries));
+       ASSERT(old < entries + count);
        memmove(new + 1, new, (char *)(entries + count) - (char *)(new));
        dx_set_hash(new, hash);
        dx_set_block(new, block);
@@ -2959,7 +2955,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
         * hold i_mutex, or the inode can not be referenced from outside,
         * so i_nlink should not be bumped due to race
         */
-       J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+       ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
                  S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
 
        BUFFER_TRACE(sbi->s_sbh, "get_write_access");
index defd2e1..03a44a0 100644 (file)
@@ -111,9 +111,6 @@ static void ext4_finish_bio(struct bio *bio)
                unsigned under_io = 0;
                unsigned long flags;
 
-               if (!page)
-                       continue;
-
                if (fscrypt_is_bounce_page(page)) {
                        bounce_page = page;
                        page = fscrypt_pagecache_page(bounce_page);
@@ -438,7 +435,6 @@ submit_and_retry:
 int ext4_bio_write_page(struct ext4_io_submit *io,
                        struct page *page,
                        int len,
-                       struct writeback_control *wbc,
                        bool keep_towrite)
 {
        struct page *bounce_page = NULL;
@@ -448,6 +444,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
        int ret = 0;
        int nr_submitted = 0;
        int nr_to_submit = 0;
+       struct writeback_control *wbc = io->io_wbc;
 
        BUG_ON(!PageLocked(page));
        BUG_ON(PageWriteback(page));
index 830c196..2112178 100644 (file)
@@ -404,10 +404,8 @@ void ext4_itable_unused_set(struct super_block *sb,
                bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
 }
 
-static void __ext4_update_tstamp(__le32 *lo, __u8 *hi)
+static void __ext4_update_tstamp(__le32 *lo, __u8 *hi, time64_t now)
 {
-       time64_t now = ktime_get_real_seconds();
-
        now = clamp_val(now, 0, (1ull << 40) - 1);
 
        *lo = cpu_to_le32(lower_32_bits(now));
@@ -419,108 +417,11 @@ static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi)
        return ((time64_t)(*hi) << 32) + le32_to_cpu(*lo);
 }
 #define ext4_update_tstamp(es, tstamp) \
-       __ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)
+       __ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi, \
+                            ktime_get_real_seconds())
 #define ext4_get_tstamp(es, tstamp) \
        __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)
 
-static void __save_error_info(struct super_block *sb, int error,
-                             __u32 ino, __u64 block,
-                             const char *func, unsigned int line)
-{
-       struct ext4_super_block *es = EXT4_SB(sb)->s_es;
-       int err;
-
-       EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
-       if (bdev_read_only(sb->s_bdev))
-               return;
-       es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
-       ext4_update_tstamp(es, s_last_error_time);
-       strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
-       es->s_last_error_line = cpu_to_le32(line);
-       es->s_last_error_ino = cpu_to_le32(ino);
-       es->s_last_error_block = cpu_to_le64(block);
-       switch (error) {
-       case EIO:
-               err = EXT4_ERR_EIO;
-               break;
-       case ENOMEM:
-               err = EXT4_ERR_ENOMEM;
-               break;
-       case EFSBADCRC:
-               err = EXT4_ERR_EFSBADCRC;
-               break;
-       case 0:
-       case EFSCORRUPTED:
-               err = EXT4_ERR_EFSCORRUPTED;
-               break;
-       case ENOSPC:
-               err = EXT4_ERR_ENOSPC;
-               break;
-       case ENOKEY:
-               err = EXT4_ERR_ENOKEY;
-               break;
-       case EROFS:
-               err = EXT4_ERR_EROFS;
-               break;
-       case EFBIG:
-               err = EXT4_ERR_EFBIG;
-               break;
-       case EEXIST:
-               err = EXT4_ERR_EEXIST;
-               break;
-       case ERANGE:
-               err = EXT4_ERR_ERANGE;
-               break;
-       case EOVERFLOW:
-               err = EXT4_ERR_EOVERFLOW;
-               break;
-       case EBUSY:
-               err = EXT4_ERR_EBUSY;
-               break;
-       case ENOTDIR:
-               err = EXT4_ERR_ENOTDIR;
-               break;
-       case ENOTEMPTY:
-               err = EXT4_ERR_ENOTEMPTY;
-               break;
-       case ESHUTDOWN:
-               err = EXT4_ERR_ESHUTDOWN;
-               break;
-       case EFAULT:
-               err = EXT4_ERR_EFAULT;
-               break;
-       default:
-               err = EXT4_ERR_UNKNOWN;
-       }
-       es->s_last_error_errcode = err;
-       if (!es->s_first_error_time) {
-               es->s_first_error_time = es->s_last_error_time;
-               es->s_first_error_time_hi = es->s_last_error_time_hi;
-               strncpy(es->s_first_error_func, func,
-                       sizeof(es->s_first_error_func));
-               es->s_first_error_line = cpu_to_le32(line);
-               es->s_first_error_ino = es->s_last_error_ino;
-               es->s_first_error_block = es->s_last_error_block;
-               es->s_first_error_errcode = es->s_last_error_errcode;
-       }
-       /*
-        * Start the daily error reporting function if it hasn't been
-        * started already
-        */
-       if (!es->s_error_count)
-               mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ);
-       le32_add_cpu(&es->s_error_count, 1);
-}
-
-static void save_error_info(struct super_block *sb, int error,
-                           __u32 ino, __u64 block,
-                           const char *func, unsigned int line)
-{
-       __save_error_info(sb, error, ino, block, func, line);
-       if (!bdev_read_only(sb->s_bdev))
-               ext4_commit_super(sb, 1);
-}
-
 /*
  * The del_gendisk() function uninitializes the disk-specific data
  * structures, including the bdi structure, without telling anyone
@@ -649,6 +550,83 @@ static bool system_going_down(void)
                || system_state == SYSTEM_RESTART;
 }
 
+struct ext4_err_translation {
+       int code;       /* EXT4_ERR_* value stored in the superblock */
+       int errno;      /* errno constant it corresponds to */
+};
+
+#define EXT4_ERR_TRANSLATE(err) { .code = EXT4_ERR_##err, .errno = err }
+
+static const struct ext4_err_translation err_translation[] = {
+       EXT4_ERR_TRANSLATE(EIO),
+       EXT4_ERR_TRANSLATE(ENOMEM),
+       EXT4_ERR_TRANSLATE(EFSBADCRC),
+       EXT4_ERR_TRANSLATE(EFSCORRUPTED),
+       EXT4_ERR_TRANSLATE(ENOSPC),
+       EXT4_ERR_TRANSLATE(ENOKEY),
+       EXT4_ERR_TRANSLATE(EROFS),
+       EXT4_ERR_TRANSLATE(EFBIG),
+       EXT4_ERR_TRANSLATE(EEXIST),
+       EXT4_ERR_TRANSLATE(ERANGE),
+       EXT4_ERR_TRANSLATE(EOVERFLOW),
+       EXT4_ERR_TRANSLATE(EBUSY),
+       EXT4_ERR_TRANSLATE(ENOTDIR),
+       EXT4_ERR_TRANSLATE(ENOTEMPTY),
+       EXT4_ERR_TRANSLATE(ESHUTDOWN),
+       EXT4_ERR_TRANSLATE(EFAULT),
+};
+
+static int ext4_errno_to_code(int errno)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(err_translation); i++)
+               if (err_translation[i].errno == errno)
+                       return err_translation[i].code;
+       return EXT4_ERR_UNKNOWN; /* errno has no entry in err_translation */
+}
+
+static void __save_error_info(struct super_block *sb, int error,
+                             __u32 ino, __u64 block,
+                             const char *func, unsigned int line)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+       sbi->s_mount_state |= EXT4_ERROR_FS;
+       if (bdev_read_only(sb->s_bdev))
+               return;
+       /* An error of 0 is recorded as EFSCORRUPTED */
+       if (error == 0)
+               error = EFSCORRUPTED;
+
+       spin_lock(&sbi->s_error_lock);
+       sbi->s_add_error_count++; /* drained by ext4_commit_super() */
+       sbi->s_last_error_code = error;
+       sbi->s_last_error_line = line;
+       sbi->s_last_error_ino = ino;
+       sbi->s_last_error_block = block;
+       sbi->s_last_error_func = func; /* pointer is kept, not copied */
+       sbi->s_last_error_time = ktime_get_real_seconds();
+       if (!sbi->s_first_error_time) { /* no first error stashed yet */
+               sbi->s_first_error_code = error;
+               sbi->s_first_error_line = line;
+               sbi->s_first_error_ino = ino;
+               sbi->s_first_error_block = block;
+               sbi->s_first_error_func = func;
+               sbi->s_first_error_time = sbi->s_last_error_time;
+       }
+       spin_unlock(&sbi->s_error_lock);
+}
+
+static void save_error_info(struct super_block *sb, int error,
+                           __u32 ino, __u64 block,
+                           const char *func, unsigned int line)
+{
+       __save_error_info(sb, error, ino, block, func, line);
+       if (!bdev_read_only(sb->s_bdev))
+               ext4_commit_super(sb, 1); /* sync=1; writes the stash */
+}
+
 /* Deal with the reporting of failure conditions on a filesystem such as
  * inconsistencies detected or read IO failures.
  *
@@ -662,40 +640,50 @@ static bool system_going_down(void)
  * We'll just use the jbd2_journal_abort() error code to record an error in
  * the journal instead.  On recovery, the journal will complain about
  * that error until we've noted it down and cleared it.
+ *
+ * If force_ro is set, we unconditionally force the filesystem into an
+ * ABORT|READONLY state, unless the error response on the fs has been set to
+ * panic in which case we take the easy way out and panic immediately. This is
+ * used to deal with unrecoverable failures such as journal IO errors or ENOMEM
+ * at a critical moment in log management.
  */
-
-static void ext4_handle_error(struct super_block *sb)
+static void ext4_handle_error(struct super_block *sb, bool force_ro)
 {
+       journal_t *journal = EXT4_SB(sb)->s_journal;
+
        if (test_opt(sb, WARN_ON_ERROR))
                WARN_ON_ONCE(1);
 
-       if (sb_rdonly(sb))
+       if (sb_rdonly(sb) || (!force_ro && test_opt(sb, ERRORS_CONT)))
                return;
 
-       if (!test_opt(sb, ERRORS_CONT)) {
-               journal_t *journal = EXT4_SB(sb)->s_journal;
-
-               ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED);
-               if (journal)
-                       jbd2_journal_abort(journal, -EIO);
-       }
+       ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED);
+       if (journal)
+               jbd2_journal_abort(journal, -EIO);
        /*
         * We force ERRORS_RO behavior when system is rebooting. Otherwise we
         * could panic during 'reboot -f' as the underlying device got already
         * disabled.
         */
-       if (test_opt(sb, ERRORS_RO) || system_going_down()) {
-               ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
-               /*
-                * Make sure updated value of ->s_mount_flags will be visible
-                * before ->s_flags update
-                */
-               smp_wmb();
-               sb->s_flags |= SB_RDONLY;
-       } else if (test_opt(sb, ERRORS_PANIC)) {
+       if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) {
                panic("EXT4-fs (device %s): panic forced after error\n",
                        sb->s_id);
        }
+       ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
+       /*
+        * Make sure updated value of ->s_mount_flags will be visible before
+        * ->s_flags update
+        */
+       smp_wmb();
+       sb->s_flags |= SB_RDONLY;
+}
+
+static void flush_stashed_error_work(struct work_struct *work)
+{
+       struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info,
+                                               s_error_work);
+
+       ext4_commit_super(sbi->s_sb, 1); /* sync=1; writes stashed errors */
+}
 
 #define ext4_error_ratelimit(sb)                                       \
@@ -703,7 +691,7 @@ static void ext4_handle_error(struct super_block *sb)
                             "EXT4-fs error")
 
 void __ext4_error(struct super_block *sb, const char *function,
-                 unsigned int line, int error, __u64 block,
+                 unsigned int line, bool force_ro, int error, __u64 block,
                  const char *fmt, ...)
 {
        struct va_format vaf;
@@ -723,7 +711,7 @@ void __ext4_error(struct super_block *sb, const char *function,
                va_end(args);
        }
        save_error_info(sb, error, 0, block, function, line);
-       ext4_handle_error(sb);
+       ext4_handle_error(sb, force_ro);
 }
 
 void __ext4_error_inode(struct inode *inode, const char *function,
@@ -755,7 +743,7 @@ void __ext4_error_inode(struct inode *inode, const char *function,
        }
        save_error_info(inode->i_sb, error, inode->i_ino, block,
                        function, line);
-       ext4_handle_error(inode->i_sb);
+       ext4_handle_error(inode->i_sb, false);
 }
 
 void __ext4_error_file(struct file *file, const char *function,
@@ -794,7 +782,7 @@ void __ext4_error_file(struct file *file, const char *function,
        }
        save_error_info(inode->i_sb, EFSCORRUPTED, inode->i_ino, block,
                        function, line);
-       ext4_handle_error(inode->i_sb);
+       ext4_handle_error(inode->i_sb, false);
 }
 
 const char *ext4_decode_error(struct super_block *sb, int errno,
@@ -862,51 +850,7 @@ void __ext4_std_error(struct super_block *sb, const char *function,
        }
 
        save_error_info(sb, -errno, 0, 0, function, line);
-       ext4_handle_error(sb);
-}
-
-/*
- * ext4_abort is a much stronger failure handler than ext4_error.  The
- * abort function may be used to deal with unrecoverable failures such
- * as journal IO errors or ENOMEM at a critical moment in log management.
- *
- * We unconditionally force the filesystem into an ABORT|READONLY state,
- * unless the error response on the fs has been set to panic in which
- * case we take the easy way out and panic immediately.
- */
-
-void __ext4_abort(struct super_block *sb, const char *function,
-                 unsigned int line, int error, const char *fmt, ...)
-{
-       struct va_format vaf;
-       va_list args;
-
-       if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
-               return;
-
-       save_error_info(sb, error, 0, 0, function, line);
-       va_start(args, fmt);
-       vaf.fmt = fmt;
-       vaf.va = &args;
-       printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: %pV\n",
-              sb->s_id, function, line, &vaf);
-       va_end(args);
-
-       if (sb_rdonly(sb) == 0) {
-               ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED);
-               if (EXT4_SB(sb)->s_journal)
-                       jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
-
-               ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
-               /*
-                * Make sure updated value of ->s_mount_flags will be visible
-                * before ->s_flags update
-                */
-               smp_wmb();
-               sb->s_flags |= SB_RDONLY;
-       }
-       if (test_opt(sb, ERRORS_PANIC) && !system_going_down())
-               panic("EXT4-fs panic from previous error\n");
+       ext4_handle_error(sb, false);
 }
 
 void __ext4_msg(struct super_block *sb,
@@ -982,8 +926,6 @@ __acquires(bitlock)
                return;
 
        trace_ext4_error(sb, function, line);
-       __save_error_info(sb, EFSCORRUPTED, ino, block, function, line);
-
        if (ext4_error_ratelimit(sb)) {
                va_start(args, fmt);
                vaf.fmt = fmt;
@@ -999,17 +941,16 @@ __acquires(bitlock)
                va_end(args);
        }
 
-       if (test_opt(sb, WARN_ON_ERROR))
-               WARN_ON_ONCE(1);
-
        if (test_opt(sb, ERRORS_CONT)) {
-               ext4_commit_super(sb, 0);
+               if (test_opt(sb, WARN_ON_ERROR))
+                       WARN_ON_ONCE(1);
+               __save_error_info(sb, EFSCORRUPTED, ino, block, function, line);
+               schedule_work(&EXT4_SB(sb)->s_error_work);
                return;
        }
-
        ext4_unlock_group(sb, grp);
-       ext4_commit_super(sb, 1);
-       ext4_handle_error(sb);
+       save_error_info(sb, EFSCORRUPTED, ino, block, function, line);
+       ext4_handle_error(sb, false);
        /*
         * We only get here in the ERRORS_RO case; relocking the group
         * may be dangerous, but nothing bad will happen since the
@@ -1181,6 +1122,7 @@ static void ext4_put_super(struct super_block *sb)
        ext4_unregister_li_request(sb);
        ext4_quota_off_umount(sb);
 
+       flush_work(&sbi->s_error_work);
        destroy_workqueue(sbi->rsv_conversion_wq);
 
        /*
@@ -1240,7 +1182,7 @@ static void ext4_put_super(struct super_block *sb)
         * in-memory list had better be clean by this point. */
        if (!list_empty(&sbi->s_orphan))
                dump_orphan_list(sb, sbi);
-       J_ASSERT(list_empty(&sbi->s_orphan));
+       ASSERT(list_empty(&sbi->s_orphan));
 
        sync_blockdev(sb->s_bdev);
        invalidate_bdev(sb->s_bdev);
@@ -4005,6 +3947,21 @@ static void ext4_set_resv_clusters(struct super_block *sb)
        atomic64_set(&sbi->s_resv_clusters, resv_clusters);
 }
 
+static const char *ext4_quota_mode(struct super_block *sb)
+{
+#ifndef CONFIG_QUOTA
+       return "disabled";
+#else
+       /* Quota support is built in; classify this mount. */
+       if (!ext4_quota_capable(sb))
+               return "none";
+       /* "journalled" needs both a journal and journalled quota. */
+       if (!EXT4_SB(sb)->s_journal || !ext4_is_quota_journalled(sb))
+               return "writeback";
+       return "journalled";
+#endif
+}
+
 static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 {
        struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
@@ -4073,7 +4030,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        if (IS_ERR(bh)) {
                ext4_msg(sb, KERN_ERR, "unable to read superblock");
                ret = PTR_ERR(bh);
-               bh = NULL;
                goto out_fail;
        }
        /*
@@ -4187,19 +4143,26 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
         */
        sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
 
-       blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
-
-       if (blocksize == PAGE_SIZE)
-               set_opt(sb, DIOREAD_NOLOCK);
-
-       if (blocksize < EXT4_MIN_BLOCK_SIZE ||
-           blocksize > EXT4_MAX_BLOCK_SIZE) {
+       if (le32_to_cpu(es->s_log_block_size) >
+           (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
+               ext4_msg(sb, KERN_ERR,
+                        "Invalid log block size: %u",
+                        le32_to_cpu(es->s_log_block_size));
+               goto failed_mount;
+       }
+       if (le32_to_cpu(es->s_log_cluster_size) >
+           (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
                ext4_msg(sb, KERN_ERR,
-                      "Unsupported filesystem blocksize %d (%d log_block_size)",
-                        blocksize, le32_to_cpu(es->s_log_block_size));
+                        "Invalid log cluster size: %u",
+                        le32_to_cpu(es->s_log_cluster_size));
                goto failed_mount;
        }
 
+       blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
+
+       if (blocksize == PAGE_SIZE)
+               set_opt(sb, DIOREAD_NOLOCK);
+
        if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
                sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
                sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO;
@@ -4417,21 +4380,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        if (!ext4_feature_set_ok(sb, (sb_rdonly(sb))))
                goto failed_mount;
 
-       if (le32_to_cpu(es->s_log_block_size) >
-           (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
-               ext4_msg(sb, KERN_ERR,
-                        "Invalid log block size: %u",
-                        le32_to_cpu(es->s_log_block_size));
-               goto failed_mount;
-       }
-       if (le32_to_cpu(es->s_log_cluster_size) >
-           (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
-               ext4_msg(sb, KERN_ERR,
-                        "Invalid log cluster size: %u",
-                        le32_to_cpu(es->s_log_cluster_size));
-               goto failed_mount;
-       }
-
        if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (blocksize / 4)) {
                ext4_msg(sb, KERN_ERR,
                         "Number of reserved GDT blocks insanely large: %d",
@@ -4702,7 +4650,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                               "can't read group descriptor %d", i);
                        db_count = i;
                        ret = PTR_ERR(bh);
-                       bh = NULL;
                        goto failed_mount2;
                }
                rcu_read_lock();
@@ -4717,6 +4664,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        }
 
        timer_setup(&sbi->s_err_report, print_daily_error_info, 0);
+       spin_lock_init(&sbi->s_error_lock);
+       INIT_WORK(&sbi->s_error_work, flush_stashed_error_work);
 
        /* Register extent status tree shrinker */
        if (ext4_es_register_shrinker(sbi))
@@ -4872,6 +4821,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                               "requested data journaling mode");
                        goto failed_mount_wq;
                }
+               break;
        default:
                break;
        }
@@ -5000,13 +4950,11 @@ no_journal:
        block = ext4_count_free_clusters(sb);
        ext4_free_blocks_count_set(sbi->s_es, 
                                   EXT4_C2B(sbi, block));
-       ext4_superblock_csum_set(sb);
        err = percpu_counter_init(&sbi->s_freeclusters_counter, block,
                                  GFP_KERNEL);
        if (!err) {
                unsigned long freei = ext4_count_free_inodes(sb);
                sbi->s_es->s_free_inodes_count = cpu_to_le32(freei);
-               ext4_superblock_csum_set(sb);
                err = percpu_counter_init(&sbi->s_freeinodes_counter, freei,
                                          GFP_KERNEL);
        }
@@ -5086,10 +5034,11 @@ no_journal:
 
        if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount"))
                ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
-                        "Opts: %.*s%s%s", descr,
+                        "Opts: %.*s%s%s. Quota mode: %s.", descr,
                         (int) sizeof(sbi->s_es->s_mount_opts),
                         sbi->s_es->s_mount_opts,
-                        *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
+                        *sbi->s_es->s_mount_opts ? "; " : "", orig_data,
+                        ext4_quota_mode(sb));
 
        if (es->s_error_count)
                mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
@@ -5154,6 +5103,7 @@ failed_mount3a:
        ext4_es_unregister_shrinker(sbi);
 failed_mount3:
+       flush_work(&sbi->s_error_work); /* may re-arm s_err_report */
         del_timer_sync(&sbi->s_err_report);
        if (sbi->s_mmp_tsk)
                kthread_stop(sbi->s_mmp_tsk);
 failed_mount2:
@@ -5480,6 +5430,7 @@ err_out:
 
 static int ext4_commit_super(struct super_block *sb, int sync)
 {
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = EXT4_SB(sb)->s_es;
        struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
        int error = 0;
@@ -5511,6 +5462,46 @@ static int ext4_commit_super(struct super_block *sb, int sync)
                es->s_free_inodes_count =
                        cpu_to_le32(percpu_counter_sum_positive(
                                &EXT4_SB(sb)->s_freeinodes_counter));
+       /* Copy error information to the on-disk superblock */
+       spin_lock(&sbi->s_error_lock);
+       if (sbi->s_add_error_count > 0) {
+               es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
+               if (!es->s_first_error_time && !es->s_first_error_time_hi) {
+                       __ext4_update_tstamp(&es->s_first_error_time,
+                                            &es->s_first_error_time_hi,
+                                            sbi->s_first_error_time);
+                       strncpy(es->s_first_error_func, sbi->s_first_error_func,
+                               sizeof(es->s_first_error_func));
+                       es->s_first_error_line =
+                               cpu_to_le32(sbi->s_first_error_line);
+                       es->s_first_error_ino =
+                               cpu_to_le32(sbi->s_first_error_ino);
+                       es->s_first_error_block =
+                               cpu_to_le64(sbi->s_first_error_block);
+                       es->s_first_error_errcode =
+                               ext4_errno_to_code(sbi->s_first_error_code);
+               }
+               __ext4_update_tstamp(&es->s_last_error_time,
+                                    &es->s_last_error_time_hi,
+                                    sbi->s_last_error_time);
+               strncpy(es->s_last_error_func, sbi->s_last_error_func,
+                       sizeof(es->s_last_error_func));
+               es->s_last_error_line = cpu_to_le32(sbi->s_last_error_line);
+               es->s_last_error_ino = cpu_to_le32(sbi->s_last_error_ino);
+               es->s_last_error_block = cpu_to_le64(sbi->s_last_error_block);
+               es->s_last_error_errcode =
+                               ext4_errno_to_code(sbi->s_last_error_code);
+               /*
+                * Start the daily error reporting function if it hasn't been
+                * started already
+                */
+               if (!es->s_error_count)
+                       mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);
+               le32_add_cpu(&es->s_error_count, sbi->s_add_error_count);
+               sbi->s_add_error_count = 0;
+       }
+       spin_unlock(&sbi->s_error_lock);
+
        BUFFER_TRACE(sbh, "marking dirty");
        ext4_superblock_csum_set(sb);
        if (sync)
@@ -5864,6 +5855,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
                set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
        }
 
+       /* Flush outstanding errors before changing fs state */
+       flush_work(&sbi->s_error_work);
+
        if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) {
                if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) {
                        err = -EROFS;
@@ -6022,7 +6016,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
         */
        *flags = (*flags & ~vfs_flags) | (sb->s_flags & vfs_flags);
 
-       ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
+       ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s. Quota mode: %s.",
+                orig_data, ext4_quota_mode(sb));
        kfree(orig_data);
        return 0;
 
@@ -6201,11 +6196,8 @@ static int ext4_release_dquot(struct dquot *dquot)
 static int ext4_mark_dquot_dirty(struct dquot *dquot)
 {
        struct super_block *sb = dquot->dq_sb;
-       struct ext4_sb_info *sbi = EXT4_SB(sb);
 
-       /* Are we journaling quotas? */
-       if (ext4_has_feature_quota(sb) ||
-           sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
+       if (ext4_is_quota_journalled(sb)) {
                dquot_mark_dquot_dirty(dquot);
                return ext4_write_dquot(dquot);
        } else {
index 6127e94..4e3b1f8 100644 (file)
@@ -1927,7 +1927,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
        } else {
                /* Allocate a buffer where we construct the new block. */
                s->base = kzalloc(sb->s_blocksize, GFP_NOFS);
-               /* assert(header == s->base) */
                error = -ENOMEM;
                if (s->base == NULL)
                        goto cleanup;
index 188f79d..2dc9444 100644 (file)
@@ -1869,9 +1869,7 @@ static int load_superblock(journal_t *journal)
 
        if (jbd2_has_feature_fast_commit(journal)) {
                journal->j_fc_last = be32_to_cpu(sb->s_maxlen);
-               num_fc_blocks = be32_to_cpu(sb->s_num_fc_blks);
-               if (!num_fc_blocks)
-                       num_fc_blocks = JBD2_MIN_FC_BLOCKS;
+               num_fc_blocks = jbd2_journal_get_num_fc_blks(sb);
                if (journal->j_last - num_fc_blocks >= JBD2_MIN_JOURNAL_BLOCKS)
                        journal->j_last = journal->j_fc_last - num_fc_blocks;
                journal->j_fc_first = journal->j_last + 1;
@@ -2102,9 +2100,7 @@ jbd2_journal_initialize_fast_commit(journal_t *journal)
        journal_superblock_t *sb = journal->j_superblock;
        unsigned long long num_fc_blks;
 
-       num_fc_blks = be32_to_cpu(sb->s_num_fc_blks);
-       if (num_fc_blks == 0)
-               num_fc_blks = JBD2_MIN_FC_BLOCKS;
+       num_fc_blks = jbd2_journal_get_num_fc_blks(sb);
        if (journal->j_last - num_fc_blks < JBD2_MIN_JOURNAL_BLOCKS)
                return -ENOSPC;
 
index 578ff19..99d3cd0 100644 (file)
@@ -68,7 +68,7 @@ extern void *jbd2_alloc(size_t size, gfp_t flags);
 extern void jbd2_free(void *ptr, size_t size);
 
 #define JBD2_MIN_JOURNAL_BLOCKS 1024
-#define JBD2_MIN_FC_BLOCKS     256
+#define JBD2_DEFAULT_FAST_COMMIT_BLOCKS 256
 
 #ifdef __KERNEL__
 
@@ -538,6 +538,7 @@ struct transaction_chp_stats_s {
  * The transaction keeps track of all of the buffers modified by a
  * running transaction, and all of the buffers committed but not yet
  * flushed to home for finished transactions.
+ * (Locking Documentation improved by LockDoc)
  */
 
 /*
@@ -658,12 +659,12 @@ struct transaction_s
        unsigned long           t_start;
 
        /*
-        * When commit was requested
+        * When commit was requested [j_state_lock]
         */
        unsigned long           t_requested;
 
        /*
-        * Checkpointing stats [j_checkpoint_sem]
+        * Checkpointing stats [j_list_lock]
         */
        struct transaction_chp_stats_s t_chp_stats;
 
@@ -1691,6 +1692,13 @@ static inline int jbd2_journal_has_csum_v2or3(journal_t *journal)
        return journal->j_chksum_driver != NULL;
 }
 
+/*
+ * Number of fast commit blocks for a journal, taken from the big-endian
+ * s_num_fc_blks field of the journal superblock. A stored value of zero
+ * selects JBD2_DEFAULT_FAST_COMMIT_BLOCKS instead.
+ */
+static inline int jbd2_journal_get_num_fc_blks(journal_superblock_t *jsb)
+{
+       int nr_blks = be32_to_cpu(jsb->s_num_fc_blks);
+
+       if (!nr_blks)
+               return JBD2_DEFAULT_FAST_COMMIT_BLOCKS;
+       return nr_blks;
+}
+
 /*
  * Return number of free blocks in the log. Must be called under j_state_lock.
  */