diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c0a331e..132c118 100644
@@ -922,7 +922,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
                        bh[i] = NULL;
                        continue;
                }
-               bh[i] = ext4_read_block_bitmap_nowait(sb, group);
+               bh[i] = ext4_read_block_bitmap_nowait(sb, group, false);
                if (IS_ERR(bh[i])) {
                        err = PTR_ERR(bh[i]);
                        bh[i] = NULL;
@@ -1279,9 +1279,6 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
        e4b->bd_buddy_page = page;
        e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
 
-       BUG_ON(e4b->bd_bitmap_page == NULL);
-       BUG_ON(e4b->bd_buddy_page == NULL);
-
        return 0;
 
 err:
@@ -1743,10 +1740,6 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
 
 }
 
-/*
- * regular allocator, for general purposes allocation
- */
-
 static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
                                        struct ext4_buddy *e4b,
                                        int finish_group)
@@ -2119,13 +2112,11 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
 
        BUG_ON(cr < 0 || cr >= 4);
 
-       free = grp->bb_free;
-       if (free == 0)
-               return false;
-       if (cr <= 2 && free < ac->ac_g_ex.fe_len)
+       if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
                return false;
 
-       if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
+       free = grp->bb_free;
+       if (free == 0)
                return false;
 
        fragments = grp->bb_fragments;
@@ -2142,8 +2133,10 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
                    ((group % flex_size) == 0))
                        return false;
 
-               if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) ||
-                   (free / fragments) >= ac->ac_g_ex.fe_len)
+               if (free < ac->ac_g_ex.fe_len)
+                       return false;
+
+               if (ac->ac_2order > ac->ac_sb->s_blocksize_bits+1)
                        return true;
 
                if (grp->bb_largest_free_order < ac->ac_2order)
@@ -2177,6 +2170,7 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
 {
        struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
        struct super_block *sb = ac->ac_sb;
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
        bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK;
        ext4_grpblk_t free;
        int ret = 0;
@@ -2195,7 +2189,25 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
 
        /* We only do this if the grp has never been initialized */
        if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
-               ret = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS);
+               struct ext4_group_desc *gdp =
+                       ext4_get_group_desc(sb, group, NULL);
+               int ret;
+
+               /* cr=0/1 is a very optimistic search to find large
+                * good chunks almost for free.  If buddy data is not
+                * ready, then this optimization makes no sense.  But
+                * we never skip the first block group in a flex_bg,
+                * since this gets used for metadata block allocation,
+                * and we want to make sure we locate metadata blocks
+                * in the first block group in the flex_bg if possible.
+                */
+               if (cr < 2 &&
+                   (!sbi->s_log_groups_per_flex ||
+                    ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) &&
+                   !(ext4_has_group_desc_csum(sb) &&
+                     (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))))
+                       return 0;
+               ret = ext4_mb_init_group(sb, group, GFP_NOFS);
                if (ret)
                        return ret;
        }
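
The mask test in the cr < 2 shortcut above is compact, so here is a
standalone sketch of what it selects (plain C; the value 4 for
s_log_groups_per_flex, i.e. 16 groups per flex_bg, is hypothetical):

#include <stdio.h>

int main(void)
{
	unsigned int log_groups_per_flex = 4;	/* hypothetical value */
	unsigned int mask = (1U << log_groups_per_flex) - 1;

	for (unsigned int group = 0; group < 48; group++) {
		/* (group & mask) == 0 iff group is first in its flex_bg;
		 * the cr=0/1 shortcut never skips those groups, since
		 * metadata allocation prefers them.
		 */
		if ((group & mask) == 0)
			printf("group %u is first in its flex_bg\n", group);
	}
	return 0;
}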
@@ -2209,15 +2221,95 @@ out:
        return ret;
 }
 
+/*
+ * Start prefetching @nr block bitmaps starting at @group.
+ * Return the next group which needs to be prefetched.
+ */
+ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
+                             unsigned int nr, int *cnt)
+{
+       ext4_group_t ngroups = ext4_get_groups_count(sb);
+       struct buffer_head *bh;
+       struct blk_plug plug;
+
+       blk_start_plug(&plug);
+       while (nr-- > 0) {
+               struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
+                                                                 NULL);
+               struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+
+               /*
+                * Prefetch block groups with free blocks; but don't
+                * bother if a group is marked uninitialized on disk, since
+                * it won't require I/O to read.  Also, only try to
+                * prefetch once, so that we avoid the getblk() call,
+                * which can be expensive.
+                */
+               if (!EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
+                   EXT4_MB_GRP_NEED_INIT(grp) &&
+                   ext4_free_group_clusters(sb, gdp) > 0 &&
+                   !(ext4_has_group_desc_csum(sb) &&
+                     (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
+                       bh = ext4_read_block_bitmap_nowait(sb, group, true);
+                       if (bh && !IS_ERR(bh)) {
+                               if (!buffer_uptodate(bh) && cnt)
+                                       (*cnt)++;
+                               brelse(bh);
+                       }
+               }
+               if (++group >= ngroups)
+                       group = 0;
+       }
+       blk_finish_plug(&plug);
+       return group;
+}
+
+/*
+ * Prefetching reads the block bitmap into the buffer cache; but we
+ * need to make sure that the buddy bitmap in the page cache has been
+ * initialized.  Note that ext4_mb_init_group() will block if the I/O
+ * is not yet completed, or will submit the read itself if
+ * ext4_mb_prefetch did not start the I/O.
+ *
+ * TODO: We should actually kick off the buddy bitmap setup in a work
+ * queue when the buffer I/O is completed, so that we don't block
+ * waiting for the block allocation bitmap read to finish when
+ * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator().
+ */
+void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
+                          unsigned int nr)
+{
+       while (nr-- > 0) {
+               struct ext4_group_desc *gdp;
+               struct ext4_group_info *grp;
+
+               if (!group)
+                       group = ext4_get_groups_count(sb);
+               group--;
+               gdp = ext4_get_group_desc(sb, group, NULL);
+               grp = ext4_get_group_info(sb, group);
+
+               if (EXT4_MB_GRP_NEED_INIT(grp) &&
+                   ext4_free_group_clusters(sb, gdp) > 0 &&
+                   !(ext4_has_group_desc_csum(sb) &&
+                     (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
+                       if (ext4_mb_init_group(sb, group, GFP_NOFS))
+                               break;
+               }
+       }
+}
+
 static noinline_for_stack int
 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 {
-       ext4_group_t ngroups, group, i;
+       ext4_group_t prefetch_grp = 0, ngroups, group, i;
        int cr = -1;
        int err = 0, first_err = 0;
+       unsigned int nr = 0, prefetch_ios = 0;
        struct ext4_sb_info *sbi;
        struct super_block *sb;
        struct ext4_buddy e4b;
+       int lost;
 
        sb = ac->ac_sb;
        sbi = EXT4_SB(sb);
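
The backward walk in ext4_mb_prefetch_fini() above uses a
decrement-with-wraparound idiom that is easy to misread. A runnable
sketch with small hypothetical numbers: starting just past the prefetch
window, it revisits the previous nr groups, wrapping past group 0 to
the last group:

#include <stdio.h>

int main(void)
{
	unsigned int ngroups = 8;	/* hypothetical group count */
	unsigned int group = 2;		/* group returned by the prefetch pass */
	unsigned int nr = 4;		/* groups to backfill */

	while (nr-- > 0) {
		if (!group)
			group = ngroups;
		group--;
		/* visits groups 1, 0, 7, 6 in that order */
		printf("init buddy data for group %u\n", group);
	}
	return 0;
}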
@@ -2237,8 +2329,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
                goto out;
 
        /*
-        * ac->ac2_order is set only if the fe_len is a power of 2
-        * if ac2_order is set we also set criteria to 0 so that we
+        * ac->ac_2order is set only if the fe_len is a power of 2;
+        * if ac->ac_2order is set, we also set criteria to 0 so that we
         * try exact allocation using buddy.
         */
        i = fls(ac->ac_g_ex.fe_len);
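
The comment above leans on fe_len being a power of 2; a simplified,
runnable sketch of that test follows (the kernel's fls()-based version
additionally bounds the order by s_mb_order2_reqs and the block size,
which this sketch omits):

#include <stdio.h>

/* Highest set bit, 1-based: a userspace stand-in for the kernel's fls(). */
static int fls_u32(unsigned int x)
{
	int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

int main(void)
{
	unsigned int lens[] = { 1, 7, 8, 12, 16 };

	for (int i = 0; i < 5; i++) {
		unsigned int len = lens[i];

		/* exactly one bit set <=> power of two */
		if (len && (len & (len - 1)) == 0)
			printf("fe_len=%u: power of 2, order %d\n",
			       len, fls_u32(len) - 1);
		else
			printf("fe_len=%u: not a power of 2\n", len);
	}
	return 0;
}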
@@ -2282,6 +2374,7 @@ repeat:
                 * from the goal value specified
                 */
                group = ac->ac_g_ex.fe_group;
+               prefetch_grp = group;
 
                for (i = 0; i < ngroups; group++, i++) {
                        int ret = 0;
@@ -2293,6 +2386,29 @@ repeat:
                        if (group >= ngroups)
                                group = 0;
 
+                       /*
+                        * Batch reads of the block allocation bitmaps
+                        * to get multiple READs in flight; limit
+                        * prefetching at cr=0/1, otherwise mballoc can
+                        * spend a lot of time loading imperfect groups
+                        */
+                       if ((prefetch_grp == group) &&
+                           (cr > 1 ||
+                            prefetch_ios < sbi->s_mb_prefetch_limit)) {
+                               unsigned int curr_ios = prefetch_ios;
+
+                               nr = sbi->s_mb_prefetch;
+                               if (ext4_has_feature_flex_bg(sb)) {
+                                       nr = (group / sbi->s_mb_prefetch) *
+                                               sbi->s_mb_prefetch;
+                                       nr = nr + sbi->s_mb_prefetch - group;
+                               }
+                               prefetch_grp = ext4_mb_prefetch(sb, group,
+                                                       nr, &prefetch_ios);
+                               if (prefetch_ios == curr_ios)
+                                       nr = 0;
+                       }
+
                        /* This now checks without needing the buddy page */
                        ret = ext4_mb_good_group_nolock(ac, group, cr);
                        if (ret <= 0) {
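
The two-step nr computation in the flex_bg branch above reduces to
"distance to the next s_mb_prefetch-aligned group". This sketch checks
the equivalence (the value 128 for s_mb_prefetch is hypothetical):

#include <assert.h>

int main(void)
{
	unsigned int p = 128;	/* hypothetical s_mb_prefetch */

	for (unsigned int group = 0; group < 1024; group++) {
		unsigned int nr = (group / p) * p + p - group;

		assert(nr == p - (group % p));	/* same quantity */
		assert((group + nr) % p == 0);	/* lands on a boundary */
		assert(nr >= 1 && nr <= p);
	}
	return 0;
}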
@@ -2341,22 +2457,24 @@ repeat:
                 * We've been searching too long. Let's try to allocate
                 * the best chunk we've found so far
                 */
-
                ext4_mb_try_best_found(ac, &e4b);
                if (ac->ac_status != AC_STATUS_FOUND) {
                        /*
                         * Someone luckier has already allocated it.
                         * The only thing we can do is just take the
                         * first found block(s)
-                       printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n");
                         */
+                       lost = atomic_inc_return(&sbi->s_mb_lost_chunks);
+                       mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n",
+                                ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start,
+                                ac->ac_b_ex.fe_len, lost);
+
                        ac->ac_b_ex.fe_group = 0;
                        ac->ac_b_ex.fe_start = 0;
                        ac->ac_b_ex.fe_len = 0;
                        ac->ac_status = AC_STATUS_CONTINUE;
                        ac->ac_flags |= EXT4_MB_HINT_FIRST;
                        cr = 3;
-                       atomic_inc(&sbi->s_mb_lost_chunks);
                        goto repeat;
                }
        }
@@ -2367,6 +2485,10 @@ out:
        mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n",
                 ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
                 ac->ac_flags, cr, err);
+
+       if (nr)
+               ext4_mb_prefetch_fini(sb, prefetch_grp, nr);
+
        return err;
 }
 
@@ -2439,7 +2561,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
        for (i = 0; i <= 13; i++)
                seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ?
                                sg.info.bb_counters[i] : 0);
-       seq_printf(seq, " ]\n");
+       seq_puts(seq, " ]\n");
 
        return 0;
 }
@@ -2613,6 +2735,26 @@ static int ext4_mb_init_backend(struct super_block *sb)
                        goto err_freebuddy;
        }
 
+       if (ext4_has_feature_flex_bg(sb)) {
+               /* a single flex group is supposed to be read by a single IO */
+               sbi->s_mb_prefetch = 1 << sbi->s_es->s_log_groups_per_flex;
+               sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
+       } else {
+               sbi->s_mb_prefetch = 32;
+       }
+       if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
+               sbi->s_mb_prefetch = ext4_get_groups_count(sb);
+       /* How many real IOs to prefetch within a single allocation at cr=0:
+        * given that cr=0 is a CPU-related optimization we shouldn't try to
+        * load too many groups; at some point we should start to use what
+        * we've got in memory.
+        * With an average random access time of 5ms, it'd take a second to
+        * get 200 groups (* N with flex_bg), so let's make this limit 4
+        */
+       sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4;
+       if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb))
+               sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb);
+
        return 0;
 
 err_freebuddy:
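
Worked numbers for the sizing above, assuming a hypothetical filesystem
with s_log_groups_per_flex = 4 and 4096 block groups:

#include <stdio.h>

int main(void)
{
	unsigned int log_groups_per_flex = 4;	/* hypothetical on-disk value */
	unsigned int ngroups = 4096;		/* hypothetical group count */

	/* one flex group per IO, at most 8 prefetch IOs in flight */
	unsigned int prefetch = (1U << log_groups_per_flex) * 8;	/* 128 */

	if (prefetch > ngroups)
		prefetch = ngroups;

	/* allow roughly 4 such batches of real IO per allocation at cr=0 */
	unsigned int limit = prefetch * 4;				/* 512 */

	if (limit > ngroups)
		limit = ngroups;

	printf("s_mb_prefetch=%u, s_mb_prefetch_limit=%u\n", prefetch, limit);
	return 0;
}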
@@ -2736,6 +2878,7 @@ int ext4_mb_init(struct super_block *sb)
        sbi->s_mb_stats = MB_DEFAULT_STATS;
        sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
        sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
+       sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC;
        /*
         * The default group preallocation is 512, which for 4k block
         * sizes translates to 2 megabytes.  However for bigalloc file
@@ -3090,7 +3233,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
        block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
 
        len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
-       if (!ext4_data_block_valid(sbi, block, len)) {
+       if (!ext4_inode_block_valid(ac->ac_inode, block, len)) {
                ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
                           "fs metadata", block, block+len);
                /* File system mounted not to panic on error
@@ -3674,6 +3817,26 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
        mb_debug(sb, "preallocated %d for group %u\n", preallocated, group);
 }
 
+static void ext4_mb_mark_pa_deleted(struct super_block *sb,
+                                   struct ext4_prealloc_space *pa)
+{
+       struct ext4_inode_info *ei;
+
+       if (pa->pa_deleted) {
+               ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n",
+                            pa->pa_type, pa->pa_pstart, pa->pa_lstart,
+                            pa->pa_len);
+               return;
+       }
+
+       pa->pa_deleted = 1;
+
+       if (pa->pa_type == MB_INODE_PA) {
+               ei = EXT4_I(pa->pa_inode);
+               atomic_dec(&ei->i_prealloc_active);
+       }
+}
+
 static void ext4_mb_pa_callback(struct rcu_head *head)
 {
        struct ext4_prealloc_space *pa;
@@ -3706,7 +3869,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
                return;
        }
 
-       pa->pa_deleted = 1;
+       ext4_mb_mark_pa_deleted(sb, pa);
        spin_unlock(&pa->pa_lock);
 
        grp_blk = pa->pa_pstart;
@@ -3830,6 +3993,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
        spin_lock(pa->pa_obj_lock);
        list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
        spin_unlock(pa->pa_obj_lock);
+       atomic_inc(&ei->i_prealloc_active);
 }
 
 /*
@@ -4040,7 +4204,7 @@ repeat:
                }
 
                /* seems this one can be freed ... */
-               pa->pa_deleted = 1;
+               ext4_mb_mark_pa_deleted(sb, pa);
 
                /* we can trust pa_free ... */
                free += pa->pa_free;
@@ -4103,7 +4267,7 @@ out_dbg:
  *
  * FIXME!! Make sure it is valid at all the call sites
  */
-void ext4_discard_preallocations(struct inode *inode)
+void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
 {
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct super_block *sb = inode->i_sb;
@@ -4121,15 +4285,19 @@ void ext4_discard_preallocations(struct inode *inode)
 
        mb_debug(sb, "discard preallocation for inode %lu\n",
                 inode->i_ino);
-       trace_ext4_discard_preallocations(inode);
+       trace_ext4_discard_preallocations(inode,
+                       atomic_read(&ei->i_prealloc_active), needed);
 
        INIT_LIST_HEAD(&list);
 
+       if (needed == 0)
+               needed = UINT_MAX;
+
 repeat:
        /* first, collect all pa's in the inode */
        spin_lock(&ei->i_prealloc_lock);
-       while (!list_empty(&ei->i_prealloc_list)) {
-               pa = list_entry(ei->i_prealloc_list.next,
+       while (!list_empty(&ei->i_prealloc_list) && needed) {
+               pa = list_entry(ei->i_prealloc_list.prev,
                                struct ext4_prealloc_space, pa_inode_list);
                BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
                spin_lock(&pa->pa_lock);
@@ -4146,10 +4314,11 @@ repeat:
 
                }
                if (pa->pa_deleted == 0) {
-                       pa->pa_deleted = 1;
+                       ext4_mb_mark_pa_deleted(sb, pa);
                        spin_unlock(&pa->pa_lock);
                        list_del_rcu(&pa->pa_inode_list);
                        list_add(&pa->u.pa_tmp_list, &list);
+                       needed--;
                        continue;
                }
 
@@ -4399,7 +4568,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
        ac->ac_g_ex = ac->ac_o_ex;
        ac->ac_flags = ar->flags;
 
-       /* we have to define context: we'll we work with a file or
+       /* we have to define context: we'll work with a file or
         * locality group. this is a policy, actually */
        ext4_mb_group_or_file(ac);
 
@@ -4450,7 +4619,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
                BUG_ON(pa->pa_type != MB_GROUP_PA);
 
                /* seems this one can be freed ... */
-               pa->pa_deleted = 1;
+               ext4_mb_mark_pa_deleted(sb, pa);
                spin_unlock(&pa->pa_lock);
 
                list_del_rcu(&pa->pa_inode_list);
@@ -4548,11 +4717,30 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
        return ;
 }
 
+/*
+ * If the per-inode prealloc list is too long, trim some PAs.
+ */
+static void ext4_mb_trim_inode_pa(struct inode *inode)
+{
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+       int count, delta;
+
+       count = atomic_read(&ei->i_prealloc_active);
+       delta = (sbi->s_mb_max_inode_prealloc >> 2) + 1;
+       if (count > sbi->s_mb_max_inode_prealloc + delta) {
+               count -= sbi->s_mb_max_inode_prealloc;
+               ext4_discard_preallocations(inode, count);
+       }
+}
+
 /*
  * release all resource we used in allocation
  */
 static int ext4_mb_release_context(struct ext4_allocation_context *ac)
 {
+       struct inode *inode = ac->ac_inode;
+       struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
        struct ext4_prealloc_space *pa = ac->ac_pa;
        if (pa) {
@@ -4564,21 +4752,31 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
                        pa->pa_free -= ac->ac_b_ex.fe_len;
                        pa->pa_len -= ac->ac_b_ex.fe_len;
                        spin_unlock(&pa->pa_lock);
+
+                       /*
+                        * We want to add the pa to the right bucket.
+                        * Remove it from the list and while adding
+                        * make sure the list to which we are adding
+                        * doesn't grow big.
+                        */
+                       if (likely(pa->pa_free)) {
+                               spin_lock(pa->pa_obj_lock);
+                               list_del_rcu(&pa->pa_inode_list);
+                               spin_unlock(pa->pa_obj_lock);
+                               ext4_mb_add_n_trim(ac);
+                       }
                }
-       }
-       if (pa) {
-               /*
-                * We want to add the pa to the right bucket.
-                * Remove it from the list and while adding
-                * make sure the list to which we are adding
-                * doesn't grow big.
-                */
-               if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
+
+               if (pa->pa_type == MB_INODE_PA) {
+                       /*
+                        * Treat the per-inode prealloc list as an LRU list,
+                        * then try to trim the least recently used PA.
+                        */
                        spin_lock(pa->pa_obj_lock);
-                       list_del_rcu(&pa->pa_inode_list);
+                       list_move(&pa->pa_inode_list, &ei->i_prealloc_list);
                        spin_unlock(pa->pa_obj_lock);
-                       ext4_mb_add_n_trim(ac);
                }
+
                ext4_mb_put_pa(ac, ac->ac_sb, pa);
        }
        if (ac->ac_bitmap_page)
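
The trim policy in ext4_mb_trim_inode_pa() above has built-in
hysteresis: nothing is discarded until the list overshoots the cap by a
quarter plus one, and then it is cut back to roughly the cap, least
recently used PAs first. A runnable sketch of that arithmetic (assuming
the default cap, MB_DEFAULT_MAX_INODE_PREALLOC, is 512; the real value
lives in mballoc.h):

#include <stdio.h>

int main(void)
{
	unsigned int max = 512;			/* assumed default cap */
	unsigned int delta = (max >> 2) + 1;	/* 129 entries of slack */

	for (unsigned int count = 600; count <= 700; count += 50) {
		if (count > max + delta)
			printf("count=%u: discard %u LRU PAs\n",
			       count, count - max);
		else
			printf("count=%u: below threshold, keep all\n", count);
	}
	return 0;
}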
@@ -4588,6 +4786,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
        if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
                mutex_unlock(&ac->ac_lg->lg_mutex);
        ext4_mb_collect_stats(ac);
+       ext4_mb_trim_inode_pa(inode);
        return 0;
 }
 
@@ -4915,7 +5114,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 
        sbi = EXT4_SB(sb);
        if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
-           !ext4_data_block_valid(sbi, block, count)) {
+           !ext4_inode_block_valid(inode, block, count)) {
                ext4_error(sb, "Freeing blocks not in datazone - "
                           "block = %llu, count = %lu", block, count);
                goto error_return;