Merge remote-tracking branch 'torvalds/master' into perf/core
[linux-2.6-microblaze.git] / fs / ext4 / mballoc.c
index a02fadf..c2c22c2 100644
  * smallest multiple of the stripe value (sbi->s_stripe) which is
  * greater than the default mb_group_prealloc.
  *
+ * If "mb_optimize_scan" mount option is set, we maintain in memory group info
+ * structures in two data structures:
+ *
+ * 1) Array of largest free order lists (sbi->s_mb_largest_free_orders)
+ *
+ *    Locking: sbi->s_mb_largest_free_orders_locks (array of rw locks)
+ *
+ *    This is an array of lists where the array index represents the largest
+ *    free order in the buddy bitmap of the group infos placed on that list.
+ *    So there are exactly MB_NUM_ORDERS(sb) lists, one for each possible buddy
+ *    bitmap order, and each group info is kept on the list matching its
+ *    largest free order.
+ *
+ * 2) Average fragment size rb tree (sbi->s_mb_avg_fragment_size_root)
+ *
+ *    Locking: sbi->s_mb_rb_lock (rwlock)
+ *
+ *    This is a red black tree consisting of group infos and the tree is sorted
+ *    by average fragment sizes (which is calculated as ext4_group_info->bb_free
+ *    / ext4_group_info->bb_fragments).
+ *
+ * When "mb_optimize_scan" mount option is set, mballoc consults the above data
+ * structures to decide the order in which groups are to be traversed for
+ * fulfilling an allocation request.
+ *
+ * At CR = 0, we look for groups which have the largest_free_order >= the order
+ * of the request. We directly look at the largest free order list in data
+ * structure (1) above where largest_free_order = order of the request. If that
+ * list is empty, we look at the remaining lists in increasing order of
+ * largest_free_order. This allows us to perform the CR = 0 lookup in O(1) time.
+ *
+ * At CR = 1, we only consider groups whose average fragment size is at least
+ * the request size. So, we look up a group whose average fragment size is just
+ * above or equal to the request size using our rb tree (data structure 2) in
+ * O(log N) time.
+ *
+ * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in
+ * linear order which requires O(N) search time for each CR 0 and CR 1 phase.
+ *
  * The regular allocator (using the buddy cache) supports a few tunables.
  *
  * /sys/fs/ext4/<partition>/mb_min_to_scan
  * /sys/fs/ext4/<partition>/mb_max_to_scan
  * /sys/fs/ext4/<partition>/mb_order2_req
+ * /sys/fs/ext4/<partition>/mb_linear_limit
  *
  * The regular allocator uses buddy scan only if the request len is power of
  * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
  * can be used for allocation. ext4_mb_good_group explains how the groups are
  * checked.
  *
+ * When "mb_optimize_scan" is turned on, as mentioned above, the groups may not
+ * get traversed linearly. That may result in subsequent allocations being not
+ * close to each other. And so, the underlying device may get filled up in a
+ * non-linear fashion. While that may not matter on non-rotational devices, for
+ * rotational devices that may result in higher seek times. "mb_linear_limit"
+ * tells mballoc how many groups mballoc should search linearly before
+ * performing consulting above data structures for more efficient lookups. For
+ * non rotational devices, this value defaults to 0 and for rotational devices
+ * this is set to MB_DEFAULT_LINEAR_LIMIT.
+ *
  * Both the prealloc spaces are getting populated as above. So for the first
  * request we will hit the buddy cache which will result in this prealloc
  * space getting filled. The prealloc space is then later used for the
  *  - bitlock on a group       (group)
  *  - object (inode/locality)  (object)
  *  - per-pa lock              (pa)
+ *  - cr0 lists lock           (cr0)
+ *  - cr1 tree lock            (cr1)
  *
  * Paths:
  *  - new pa
  *    group
  *        object
  *
+ *  - allocation path (ext4_mb_regular_allocator)
+ *    group
+ *    cr0/cr1
  */
 static struct kmem_cache *ext4_pspace_cachep;
 static struct kmem_cache *ext4_ac_cachep;
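
The CR 0 lookup described in the header comment can be modeled outside the
kernel. The sketch below is illustrative only: NUM_ORDERS, struct group and
choose_group_cr0 are invented stand-ins for MB_NUM_ORDERS(sb),
struct ext4_group_info and ext4_mb_choose_next_group_cr0, and the kernel's
list_head/rwlock handling is omitted.

#include <stdio.h>

#define NUM_ORDERS 14                   /* stand-in for MB_NUM_ORDERS(sb) */

struct group {                          /* simplified ext4_group_info */
        int group_no;
        struct group *next;             /* next group on the same order list */
};

/* One list per largest free order, like sbi->s_mb_largest_free_orders[]. */
static struct group *largest_free_orders[NUM_ORDERS];

/*
 * CR 0 style lookup: start at the list whose index equals the request order
 * and walk towards larger orders until a non-empty list is found.  Any group
 * on such a list has a free buddy of at least the requested order.
 */
static struct group *choose_group_cr0(int request_order)
{
        for (int i = request_order; i < NUM_ORDERS; i++)
                if (largest_free_orders[i])
                        return largest_free_orders[i];
        return NULL;                    /* caller would bump CR to 1 */
}

int main(void)
{
        static struct group g = { .group_no = 42 };

        largest_free_orders[7] = &g;            /* group 42 has a free 2^7 buddy */
        struct group *pick = choose_group_cr0(6);       /* request is 2^6 blocks */
        printf("picked group %d\n", pick ? pick->group_no : -1);
        return 0;
}
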
@@ -351,6 +405,9 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
                                                ext4_group_t group);
 static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
 
+static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
+                              ext4_group_t group, int cr);
+
 /*
  * The algorithm using this percpu seq counter goes below:
  * 1. We sample the percpu discard_pa_seq counter before trying for block
@@ -744,6 +801,269 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
        }
 }
 
+static void ext4_mb_rb_insert(struct rb_root *root, struct rb_node *new,
+                       int (*cmp)(struct rb_node *, struct rb_node *))
+{
+       struct rb_node **iter = &root->rb_node, *parent = NULL;
+
+       while (*iter) {
+               parent = *iter;
+               if (cmp(new, *iter) > 0)
+                       iter = &((*iter)->rb_left);
+               else
+                       iter = &((*iter)->rb_right);
+       }
+
+       rb_link_node(new, parent, iter);
+       rb_insert_color(new, root);
+}
+
+static int
+ext4_mb_avg_fragment_size_cmp(struct rb_node *rb1, struct rb_node *rb2)
+{
+       struct ext4_group_info *grp1 = rb_entry(rb1,
+                                               struct ext4_group_info,
+                                               bb_avg_fragment_size_rb);
+       struct ext4_group_info *grp2 = rb_entry(rb2,
+                                               struct ext4_group_info,
+                                               bb_avg_fragment_size_rb);
+       int num_frags_1, num_frags_2;
+
+       num_frags_1 = grp1->bb_fragments ?
+               grp1->bb_free / grp1->bb_fragments : 0;
+       num_frags_2 = grp2->bb_fragments ?
+               grp2->bb_free / grp2->bb_fragments : 0;
+
+       return (num_frags_2 - num_frags_1);
+}
+
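The comparator above keys the tree on bb_free / bb_fragments, i.e. the group's
average free fragment size. A minimal stand-alone illustration of that quantity
(all values invented):

#include <stdio.h>

/* Average fragment size as used by the comparator: free blocks divided by
 * the number of free fragments, or 0 when there are no fragments. */
static int avg_fragment_size(int bb_free, int bb_fragments)
{
        return bb_fragments ? bb_free / bb_fragments : 0;
}

int main(void)
{
        printf("%d\n", avg_fragment_size(512, 4));      /* 128: lightly fragmented */
        printf("%d\n", avg_fragment_size(512, 64));     /* 8: same free space, heavily fragmented */
        return 0;
}
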
+/*
+ * Reinsert grpinfo into the avg_fragment_size tree with new average
+ * fragment size.
+ */
+static void
+mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+       if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0)
+               return;
+
+       write_lock(&sbi->s_mb_rb_lock);
+       if (!RB_EMPTY_NODE(&grp->bb_avg_fragment_size_rb)) {
+               rb_erase(&grp->bb_avg_fragment_size_rb,
+                               &sbi->s_mb_avg_fragment_size_root);
+               RB_CLEAR_NODE(&grp->bb_avg_fragment_size_rb);
+       }
+
+       ext4_mb_rb_insert(&sbi->s_mb_avg_fragment_size_root,
+               &grp->bb_avg_fragment_size_rb,
+               ext4_mb_avg_fragment_size_cmp);
+       write_unlock(&sbi->s_mb_rb_lock);
+}
+
+/*
+ * Choose next group by traversing largest_free_order lists. Updates *new_cr if
+ * cr level needs an update.
+ */
+static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac,
+                       int *new_cr, ext4_group_t *group, ext4_group_t ngroups)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+       struct ext4_group_info *iter, *grp;
+       int i;
+
+       if (ac->ac_status == AC_STATUS_FOUND)
+               return;
+
+       if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR0_OPTIMIZED))
+               atomic_inc(&sbi->s_bal_cr0_bad_suggestions);
+
+       grp = NULL;
+       for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) {
+               if (list_empty(&sbi->s_mb_largest_free_orders[i]))
+                       continue;
+               read_lock(&sbi->s_mb_largest_free_orders_locks[i]);
+               if (list_empty(&sbi->s_mb_largest_free_orders[i])) {
+                       read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
+                       continue;
+               }
+               grp = NULL;
+               list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i],
+                                   bb_largest_free_order_node) {
+                       if (sbi->s_mb_stats)
+                               atomic64_inc(&sbi->s_bal_cX_groups_considered[0]);
+                       if (likely(ext4_mb_good_group(ac, iter->bb_group, 0))) {
+                               grp = iter;
+                               break;
+                       }
+               }
+               read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
+               if (grp)
+                       break;
+       }
+
+       if (!grp) {
+               /* Increment cr and search again */
+               *new_cr = 1;
+       } else {
+               *group = grp->bb_group;
+               ac->ac_last_optimal_group = *group;
+               ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED;
+       }
+}
+
+/*
+ * Choose next group by traversing average fragment size tree. Updates *new_cr
+ * if cr level needs an update. Sets EXT4_MB_SEARCH_NEXT_LINEAR to indicate that
+ * the linear search should continue for one iteration since there's lock
+ * contention on the rb tree lock.
+ */
+static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac,
+               int *new_cr, ext4_group_t *group, ext4_group_t ngroups)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+       int avg_fragment_size, best_so_far;
+       struct rb_node *node, *found;
+       struct ext4_group_info *grp;
+
+       /*
+        * If there is contention on the lock, instead of waiting for the lock
+        * to become available, just continue searching linearly. We'll resume
+        * our rb tree search later starting at ac->ac_last_optimal_group.
+        */
+       if (!read_trylock(&sbi->s_mb_rb_lock)) {
+               ac->ac_flags |= EXT4_MB_SEARCH_NEXT_LINEAR;
+               return;
+       }
+
+       if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) {
+               if (sbi->s_mb_stats)
+                       atomic_inc(&sbi->s_bal_cr1_bad_suggestions);
+               /* We have found something at CR 1 in the past */
+               grp = ext4_get_group_info(ac->ac_sb, ac->ac_last_optimal_group);
+               for (found = rb_next(&grp->bb_avg_fragment_size_rb); found != NULL;
+                    found = rb_next(found)) {
+                       grp = rb_entry(found, struct ext4_group_info,
+                                      bb_avg_fragment_size_rb);
+                       if (sbi->s_mb_stats)
+                               atomic64_inc(&sbi->s_bal_cX_groups_considered[1]);
+                       if (likely(ext4_mb_good_group(ac, grp->bb_group, 1)))
+                               break;
+               }
+               goto done;
+       }
+
+       node = sbi->s_mb_avg_fragment_size_root.rb_node;
+       best_so_far = 0;
+       found = NULL;
+
+       while (node) {
+               grp = rb_entry(node, struct ext4_group_info,
+                              bb_avg_fragment_size_rb);
+               avg_fragment_size = 0;
+               if (ext4_mb_good_group(ac, grp->bb_group, 1)) {
+                       avg_fragment_size = grp->bb_fragments ?
+                               grp->bb_free / grp->bb_fragments : 0;
+                       if (!best_so_far || avg_fragment_size < best_so_far) {
+                               best_so_far = avg_fragment_size;
+                               found = node;
+                       }
+               }
+               if (avg_fragment_size > ac->ac_g_ex.fe_len)
+                       node = node->rb_right;
+               else
+                       node = node->rb_left;
+       }
+
+done:
+       if (found) {
+               grp = rb_entry(found, struct ext4_group_info,
+                              bb_avg_fragment_size_rb);
+               *group = grp->bb_group;
+               ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED;
+       } else {
+               *new_cr = 2;
+       }
+
+       read_unlock(&sbi->s_mb_rb_lock);
+       ac->ac_last_optimal_group = *group;
+}
+
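Conceptually, the CR 1 walk above is a best-fit search: among groups whose
average fragment size is large enough for the request, prefer the smallest such
average. A user-space sketch of that decision over a sorted array (the rb tree,
locking and the ext4_mb_good_group() checks are left out; names and values are
invented):

#include <stdio.h>

/*
 * Best fit over an ascending array of average fragment sizes: return the
 * index of the smallest value >= goal, or -1 if no group qualifies.
 */
static int best_fit(const int *avg, int n, int goal)
{
        int lo = 0, hi = n - 1, found = -1;

        while (lo <= hi) {
                int mid = lo + (hi - lo) / 2;

                if (avg[mid] >= goal) {
                        found = mid;    /* candidate, try to find a tighter fit */
                        hi = mid - 1;
                } else {
                        lo = mid + 1;
                }
        }
        return found;
}

int main(void)
{
        int avg[] = { 4, 16, 64, 200, 1024 };

        /* A 100 block request picks the group averaging 200 block fragments. */
        printf("index %d\n", best_fit(avg, 5, 100));    /* prints "index 3" */
        return 0;
}
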
+static inline int should_optimize_scan(struct ext4_allocation_context *ac)
+{
+       if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN)))
+               return 0;
+       if (ac->ac_criteria >= 2)
+               return 0;
+       if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))
+               return 0;
+       return 1;
+}
+
+/*
+ * Return next linear group for allocation. If linear traversal should not be
+ * performed, this function just returns the same group.
+ */
+static int
+next_linear_group(struct ext4_allocation_context *ac, int group, int ngroups)
+{
+       if (!should_optimize_scan(ac))
+               goto inc_and_return;
+
+       if (ac->ac_groups_linear_remaining) {
+               ac->ac_groups_linear_remaining--;
+               goto inc_and_return;
+       }
+
+       if (ac->ac_flags & EXT4_MB_SEARCH_NEXT_LINEAR) {
+               ac->ac_flags &= ~EXT4_MB_SEARCH_NEXT_LINEAR;
+               goto inc_and_return;
+       }
+
+       return group;
+inc_and_return:
+       /*
+        * Artificially restricted ngroups for non-extent
+        * files makes group > ngroups possible on first loop.
+        */
+       return group + 1 >= ngroups ? 0 : group + 1;
+}
+
+/*
+ * ext4_mb_choose_next_group: choose next group for allocation.
+ *
+ * @ac        Allocation Context
+ * @new_cr    This is an output parameter. If there is no good group available
+ *            at the current CR level, this field is updated to indicate the new
+ *            cr level that should be used.
+ * @group     This is an input / output parameter. As an input it indicates the
+ *            next group that the allocator intends to use for allocation. As
+ *            output, this field indicates the next group that should be used as
+ *            determined by the optimization functions.
+ * @ngroups   Total number of groups
+ */
+static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
+               int *new_cr, ext4_group_t *group, ext4_group_t ngroups)
+{
+       *new_cr = ac->ac_criteria;
+
+       if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining)
+               return;
+
+       if (*new_cr == 0) {
+               ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups);
+       } else if (*new_cr == 1) {
+               ext4_mb_choose_next_group_cr1(ac, new_cr, group, ngroups);
+       } else {
+               /*
+                * TODO: For CR=2, we can arrange groups in an rb tree sorted by
+                * bb_free. But until that happens, we should never come here.
+                */
+               WARN_ON(1);
+       }
+}
+
 /*
  * Cache the order of the largest free extent we have available in this block
  * group.
@@ -751,18 +1071,33 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
 static void
 mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
 {
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
        int i;
-       int bits;
 
+       if (test_opt2(sb, MB_OPTIMIZE_SCAN) && grp->bb_largest_free_order >= 0) {
+               write_lock(&sbi->s_mb_largest_free_orders_locks[
+                                             grp->bb_largest_free_order]);
+               list_del_init(&grp->bb_largest_free_order_node);
+               write_unlock(&sbi->s_mb_largest_free_orders_locks[
+                                             grp->bb_largest_free_order]);
+       }
        grp->bb_largest_free_order = -1; /* uninit */
 
-       bits = sb->s_blocksize_bits + 1;
-       for (i = bits; i >= 0; i--) {
+       for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) {
                if (grp->bb_counters[i] > 0) {
                        grp->bb_largest_free_order = i;
                        break;
                }
        }
+       if (test_opt2(sb, MB_OPTIMIZE_SCAN) &&
+           grp->bb_largest_free_order >= 0 && grp->bb_free) {
+               write_lock(&sbi->s_mb_largest_free_orders_locks[
+                                             grp->bb_largest_free_order]);
+               list_add_tail(&grp->bb_largest_free_order_node,
+                     &sbi->s_mb_largest_free_orders[grp->bb_largest_free_order]);
+               write_unlock(&sbi->s_mb_largest_free_orders_locks[
+                                             grp->bb_largest_free_order]);
+       }
 }
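
mb_set_largest_free_order() recomputes the cached order by scanning
bb_counters[] from the highest order down and, when mb_optimize_scan is set,
moves the group onto the matching per-order list. The scan itself, in
isolation (a rough sketch; array contents invented):

#include <stdio.h>

#define NUM_ORDERS 14           /* stand-in for MB_NUM_ORDERS(sb) */

/* Highest order with at least one free buddy, or -1 if nothing is free. */
static int largest_free_order(const int counters[NUM_ORDERS])
{
        for (int i = NUM_ORDERS - 1; i >= 0; i--)
                if (counters[i] > 0)
                        return i;
        return -1;
}

int main(void)
{
        int counters[NUM_ORDERS] = { 0 };

        counters[3] = 2;        /* two free 2^3 extents */
        counters[9] = 1;        /* one free 2^9 extent  */
        printf("%d\n", largest_free_order(counters));   /* prints 9 */
        return 0;
}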
 
 static noinline_for_stack
@@ -816,10 +1151,9 @@ void ext4_mb_generate_buddy(struct super_block *sb,
        clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
 
        period = get_cycles() - period;
-       spin_lock(&sbi->s_bal_lock);
-       sbi->s_mb_buddies_generated++;
-       sbi->s_mb_generation_time += period;
-       spin_unlock(&sbi->s_bal_lock);
+       atomic_inc(&sbi->s_mb_buddies_generated);
+       atomic64_add(period, &sbi->s_mb_generation_time);
+       mb_update_avg_fragment_size(sb, grp);
 }
 
 /* The buddy information is attached the buddy cache inode
@@ -959,7 +1293,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
                        grinfo->bb_fragments = 0;
                        memset(grinfo->bb_counters, 0,
                               sizeof(*grinfo->bb_counters) *
-                               (sb->s_blocksize_bits+2));
+                              (MB_NUM_ORDERS(sb)));
                        /*
                         * incore got set to the group block bitmap below
                         */
@@ -1519,6 +1853,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
 
 done:
        mb_set_largest_free_order(sb, e4b->bd_info);
+       mb_update_avg_fragment_size(sb, e4b->bd_info);
        mb_check_buddy(e4b);
 }
 
@@ -1655,6 +1990,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
        }
        mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
 
+       mb_update_avg_fragment_size(e4b->bd_sb, e4b->bd_info);
        ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
        mb_check_buddy(e4b);
 
@@ -1930,7 +2266,7 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
        int max;
 
        BUG_ON(ac->ac_2order <= 0);
-       for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) {
+       for (i = ac->ac_2order; i < MB_NUM_ORDERS(sb); i++) {
                if (grp->bb_counters[i] == 0)
                        continue;
 
@@ -2109,7 +2445,7 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
                if (free < ac->ac_g_ex.fe_len)
                        return false;
 
-               if (ac->ac_2order > ac->ac_sb->s_blocksize_bits+1)
+               if (ac->ac_2order >= MB_NUM_ORDERS(ac->ac_sb))
                        return true;
 
                if (grp->bb_largest_free_order < ac->ac_2order)
@@ -2148,6 +2484,8 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
        ext4_grpblk_t free;
        int ret = 0;
 
+       if (sbi->s_mb_stats)
+               atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]);
        if (should_lock)
                ext4_lock_group(sb, group);
        free = grp->bb_free;
@@ -2315,13 +2653,13 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
         * We also support searching for power-of-two requests only for
         * requests upto maximum buddy size we have constructed.
         */
-       if (i >= sbi->s_mb_order2_reqs && i <= sb->s_blocksize_bits + 2) {
+       if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) {
                /*
                 * This should tell if fe_len is exactly power of 2
                 */
                if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
                        ac->ac_2order = array_index_nospec(i - 1,
-                                                          sb->s_blocksize_bits + 2);
+                                                          MB_NUM_ORDERS(sb));
        }
 
        /* if stream allocation is enabled, use global goal */
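
The hunks above replace the open-coded "s_blocksize_bits + 2" bounds with
MB_NUM_ORDERS(sb); presumably the macro expands to the same value in mballoc.h
as part of this series (not visible in this diff). The power-of-two test that
feeds ac_2order can be exercised in isolation; fls_sketch() below is a
stand-in for the kernel's fls(), and taking i from fls(fe_len) is assumed from
the surrounding code rather than shown in this hunk:

#include <stdio.h>

/* Position of the highest set bit, 1-based, mimicking the kernel's fls(). */
static int fls_sketch(unsigned int x)
{
        int i = 0;

        while (x) {
                i++;
                x >>= 1;
        }
        return i;
}

int main(void)
{
        unsigned int fe_len = 64;               /* invented request length */
        int i = fls_sketch(fe_len);             /* 7 for fe_len == 64 */

        /* Exactly one bit set -> power of two -> eligible for order-2 scan. */
        if ((fe_len & ~(1u << (i - 1))) == 0)
                printf("power of two, ac_2order = %d\n", i - 1);        /* 6 */
        else
                printf("not a power of two\n");
        return 0;
}
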
@@ -2347,17 +2685,21 @@ repeat:
                 * from the goal value specified
                 */
                group = ac->ac_g_ex.fe_group;
+               ac->ac_last_optimal_group = group;
+               ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups;
                prefetch_grp = group;
 
-               for (i = 0; i < ngroups; group++, i++) {
-                       int ret = 0;
+               for (i = 0; i < ngroups; group = next_linear_group(ac, group, ngroups),
+                            i++) {
+                       int ret = 0, new_cr;
+
                        cond_resched();
-                       /*
-                        * Artificially restricted ngroups for non-extent
-                        * files makes group > ngroups possible on first loop.
-                        */
-                       if (group >= ngroups)
-                               group = 0;
+
+                       ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups);
+                       if (new_cr != cr) {
+                               cr = new_cr;
+                               goto repeat;
+                       }
 
                        /*
                         * Batch reads of the block allocation bitmaps
@@ -2422,6 +2764,9 @@ repeat:
                        if (ac->ac_status != AC_STATUS_CONTINUE)
                                break;
                }
+               /* Processed all groups and haven't found blocks */
+               if (sbi->s_mb_stats && i == ngroups)
+                       atomic64_inc(&sbi->s_bal_cX_failed[cr]);
        }
 
        if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
@@ -2451,6 +2796,9 @@ repeat:
                        goto repeat;
                }
        }
+
+       if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND)
+               atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]);
 out:
        if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
                err = first_err;
@@ -2550,6 +2898,157 @@ const struct seq_operations ext4_mb_seq_groups_ops = {
        .show   = ext4_mb_seq_groups_show,
 };
 
+int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
+{
+       struct super_block *sb = (struct super_block *)seq->private;
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+       seq_puts(seq, "mballoc:\n");
+       if (!sbi->s_mb_stats) {
+               seq_puts(seq, "\tmb stats collection turned off.\n");
+               seq_puts(seq, "\tTo enable, please write \"1\" to sysfs file mb_stats.\n");
+               return 0;
+       }
+       seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs));
+       seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success));
+
+       seq_printf(seq, "\tgroups_scanned: %u\n",  atomic_read(&sbi->s_bal_groups_scanned));
+
+       seq_puts(seq, "\tcr0_stats:\n");
+       seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[0]));
+       seq_printf(seq, "\t\tgroups_considered: %llu\n",
+                  atomic64_read(&sbi->s_bal_cX_groups_considered[0]));
+       seq_printf(seq, "\t\tuseless_loops: %llu\n",
+                  atomic64_read(&sbi->s_bal_cX_failed[0]));
+       seq_printf(seq, "\t\tbad_suggestions: %u\n",
+                  atomic_read(&sbi->s_bal_cr0_bad_suggestions));
+
+       seq_puts(seq, "\tcr1_stats:\n");
+       seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[1]));
+       seq_printf(seq, "\t\tgroups_considered: %llu\n",
+                  atomic64_read(&sbi->s_bal_cX_groups_considered[1]));
+       seq_printf(seq, "\t\tuseless_loops: %llu\n",
+                  atomic64_read(&sbi->s_bal_cX_failed[1]));
+       seq_printf(seq, "\t\tbad_suggestions: %u\n",
+                  atomic_read(&sbi->s_bal_cr1_bad_suggestions));
+
+       seq_puts(seq, "\tcr2_stats:\n");
+       seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[2]));
+       seq_printf(seq, "\t\tgroups_considered: %llu\n",
+                  atomic64_read(&sbi->s_bal_cX_groups_considered[2]));
+       seq_printf(seq, "\t\tuseless_loops: %llu\n",
+                  atomic64_read(&sbi->s_bal_cX_failed[2]));
+
+       seq_puts(seq, "\tcr3_stats:\n");
+       seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[3]));
+       seq_printf(seq, "\t\tgroups_considered: %llu\n",
+                  atomic64_read(&sbi->s_bal_cX_groups_considered[3]));
+       seq_printf(seq, "\t\tuseless_loops: %llu\n",
+                  atomic64_read(&sbi->s_bal_cX_failed[3]));
+       seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned));
+       seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals));
+       seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders));
+       seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks));
+       seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks));
+
+       seq_printf(seq, "\tbuddies_generated: %u/%u\n",
+                  atomic_read(&sbi->s_mb_buddies_generated),
+                  ext4_get_groups_count(sb));
+       seq_printf(seq, "\tbuddies_time_used: %llu\n",
+                  atomic64_read(&sbi->s_mb_generation_time));
+       seq_printf(seq, "\tpreallocated: %u\n",
+                  atomic_read(&sbi->s_mb_preallocated));
+       seq_printf(seq, "\tdiscarded: %u\n",
+                  atomic_read(&sbi->s_mb_discarded));
+       return 0;
+}
+
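For reference, the seq_printf() calls in ext4_seq_mb_stats_show() produce
output shaped roughly like the sample below. Every number is invented, tabs are
shown as plain indentation, and only the cr0 block is included since the
cr1..cr3 blocks repeat the same fields:

mballoc:
        reqs: 4096
        success: 3988
        groups_scanned: 9210
        cr0_stats:
                hits: 1024
                groups_considered: 2311
                useless_loops: 3
                bad_suggestions: 7
        extents_scanned: 6144
                goal_hits: 512
                2^n_hits: 128
                breaks: 9
                lost: 0
        buddies_generated: 64/64
        buddies_time_used: 1381204
        preallocated: 2048
        discarded: 128
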
+static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos)
+{
+       struct super_block *sb = PDE_DATA(file_inode(seq->file));
+       unsigned long position;
+
+       read_lock(&EXT4_SB(sb)->s_mb_rb_lock);
+
+       if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1)
+               return NULL;
+       position = *pos + 1;
+       return (void *) ((unsigned long) position);
+}
+
+static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+       struct super_block *sb = PDE_DATA(file_inode(seq->file));
+       unsigned long position;
+
+       ++*pos;
+       if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1)
+               return NULL;
+       position = *pos + 1;
+       return (void *) ((unsigned long) position);
+}
+
+static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v)
+{
+       struct super_block *sb = PDE_DATA(file_inode(seq->file));
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       unsigned long position = ((unsigned long) v);
+       struct ext4_group_info *grp;
+       struct rb_node *n;
+       unsigned int count, min, max;
+
+       position--;
+       if (position >= MB_NUM_ORDERS(sb)) {
+               seq_puts(seq, "fragment_size_tree:\n");
+               n = rb_first(&sbi->s_mb_avg_fragment_size_root);
+               if (!n) {
+                       seq_puts(seq, "\ttree_min: 0\n\ttree_max: 0\n\ttree_nodes: 0\n");
+                       return 0;
+               }
+               grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb);
+               min = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0;
+               count = 1;
+               while (rb_next(n)) {
+                       count++;
+                       n = rb_next(n);
+               }
+               grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb);
+               max = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0;
+
+               seq_printf(seq, "\ttree_min: %u\n\ttree_max: %u\n\ttree_nodes: %u\n",
+                          min, max, count);
+               return 0;
+       }
+
+       if (position == 0) {
+               seq_printf(seq, "optimize_scan: %d\n",
+                          test_opt2(sb, MB_OPTIMIZE_SCAN) ? 1 : 0);
+               seq_puts(seq, "max_free_order_lists:\n");
+       }
+       count = 0;
+       list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position],
+                           bb_largest_free_order_node)
+               count++;
+       seq_printf(seq, "\tlist_order_%u_groups: %u\n",
+                  (unsigned int)position, count);
+
+       return 0;
+}
+
+static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v)
+{
+       struct super_block *sb = PDE_DATA(file_inode(seq->file));
+
+       read_unlock(&EXT4_SB(sb)->s_mb_rb_lock);
+}
+
+const struct seq_operations ext4_mb_seq_structs_summary_ops = {
+       .start  = ext4_mb_seq_structs_summary_start,
+       .next   = ext4_mb_seq_structs_summary_next,
+       .stop   = ext4_mb_seq_structs_summary_stop,
+       .show   = ext4_mb_seq_structs_summary_show,
+};
+
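Similarly, the mb_structs_summary iterator above walks MB_NUM_ORDERS(sb) + 1
positions: one line per largest free order list, then a summary of the average
fragment size tree. Assuming a 4KiB block size (orders 0..13) and invented
counts, with most of the list_order_* lines trimmed, the output looks like:

optimize_scan: 1
max_free_order_lists:
        list_order_0_groups: 2
        list_order_1_groups: 0
        list_order_13_groups: 5
fragment_size_tree:
        tree_min: 8
        tree_max: 30712
        tree_nodes: 64
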
 static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
 {
        int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
@@ -2590,7 +3089,7 @@ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
        sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
        if (old_groupinfo)
                ext4_kvfree_array_rcu(old_groupinfo);
-       ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", 
+       ext4_debug("allocated s_groupinfo array for %d meta_bg's\n",
                   sbi->s_group_info_size);
        return 0;
 }
@@ -2652,7 +3151,10 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
        INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
        init_rwsem(&meta_group_info[i]->alloc_sem);
        meta_group_info[i]->bb_free_root = RB_ROOT;
+       INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node);
+       RB_CLEAR_NODE(&meta_group_info[i]->bb_avg_fragment_size_rb);
        meta_group_info[i]->bb_largest_free_order = -1;  /* uninit */
+       meta_group_info[i]->bb_group = group;
 
        mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group);
        return 0;
@@ -2715,7 +3217,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
                 */
                if (sbi->s_es->s_log_groups_per_flex >= 32) {
                        ext4_msg(sb, KERN_ERR, "too many log groups per flexible block group");
-                       goto err_freesgi;
+                       goto err_freebuddy;
                }
                sbi->s_mb_prefetch = min_t(uint, 1 << sbi->s_es->s_log_groups_per_flex,
                        BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9));
@@ -2813,7 +3315,7 @@ int ext4_mb_init(struct super_block *sb)
        unsigned max;
        int ret;
 
-       i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
+       i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_offsets);
 
        sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
        if (sbi->s_mb_offsets == NULL) {
@@ -2821,7 +3323,7 @@ int ext4_mb_init(struct super_block *sb)
                goto out;
        }
 
-       i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
+       i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_maxs);
        sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
        if (sbi->s_mb_maxs == NULL) {
                ret = -ENOMEM;
@@ -2847,10 +3349,30 @@ int ext4_mb_init(struct super_block *sb)
                offset_incr = offset_incr >> 1;
                max = max >> 1;
                i++;
-       } while (i <= sb->s_blocksize_bits + 1);
+       } while (i < MB_NUM_ORDERS(sb));
+
+       sbi->s_mb_avg_fragment_size_root = RB_ROOT;
+       sbi->s_mb_largest_free_orders =
+               kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
+                       GFP_KERNEL);
+       if (!sbi->s_mb_largest_free_orders) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       sbi->s_mb_largest_free_orders_locks =
+               kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t),
+                       GFP_KERNEL);
+       if (!sbi->s_mb_largest_free_orders_locks) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       for (i = 0; i < MB_NUM_ORDERS(sb); i++) {
+               INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]);
+               rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]);
+       }
+       rwlock_init(&sbi->s_mb_rb_lock);
 
        spin_lock_init(&sbi->s_md_lock);
-       spin_lock_init(&sbi->s_bal_lock);
        sbi->s_mb_free_pending = 0;
        INIT_LIST_HEAD(&sbi->s_freed_data_list);
 
@@ -2901,6 +3423,10 @@ int ext4_mb_init(struct super_block *sb)
                spin_lock_init(&lg->lg_prealloc_lock);
        }
 
+       if (blk_queue_nonrot(bdev_get_queue(sb->s_bdev)))
+               sbi->s_mb_max_linear_groups = 0;
+       else
+               sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT;
        /* init file for buddy data */
        ret = ext4_mb_init_backend(sb);
        if (ret != 0)
@@ -2912,6 +3438,8 @@ out_free_locality_groups:
        free_percpu(sbi->s_locality_groups);
        sbi->s_locality_groups = NULL;
 out:
+       kfree(sbi->s_mb_largest_free_orders);
+       kfree(sbi->s_mb_largest_free_orders_locks);
        kfree(sbi->s_mb_offsets);
        sbi->s_mb_offsets = NULL;
        kfree(sbi->s_mb_maxs);
@@ -2968,6 +3496,8 @@ int ext4_mb_release(struct super_block *sb)
                kvfree(group_info);
                rcu_read_unlock();
        }
+       kfree(sbi->s_mb_largest_free_orders);
+       kfree(sbi->s_mb_largest_free_orders_locks);
        kfree(sbi->s_mb_offsets);
        kfree(sbi->s_mb_maxs);
        iput(sbi->s_buddy_cache);
@@ -2978,17 +3508,18 @@ int ext4_mb_release(struct super_block *sb)
                                atomic_read(&sbi->s_bal_reqs),
                                atomic_read(&sbi->s_bal_success));
                ext4_msg(sb, KERN_INFO,
-                     "mballoc: %u extents scanned, %u goal hits, "
+                     "mballoc: %u extents scanned, %u groups scanned, %u goal hits, "
                                "%u 2^N hits, %u breaks, %u lost",
                                atomic_read(&sbi->s_bal_ex_scanned),
+                               atomic_read(&sbi->s_bal_groups_scanned),
                                atomic_read(&sbi->s_bal_goals),
                                atomic_read(&sbi->s_bal_2orders),
                                atomic_read(&sbi->s_bal_breaks),
                                atomic_read(&sbi->s_mb_lost_chunks));
                ext4_msg(sb, KERN_INFO,
-                      "mballoc: %lu generated and it took %Lu",
-                               sbi->s_mb_buddies_generated,
-                               sbi->s_mb_generation_time);
+                      "mballoc: %u generated and it took %llu",
+                               atomic_read(&sbi->s_mb_buddies_generated),
+                               atomic64_read(&sbi->s_mb_generation_time));
                ext4_msg(sb, KERN_INFO,
                       "mballoc: %u preallocated, %u discarded",
                                atomic_read(&sbi->s_mb_preallocated),
@@ -3583,12 +4114,13 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
 {
        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
 
-       if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
+       if (sbi->s_mb_stats && ac->ac_g_ex.fe_len >= 1) {
                atomic_inc(&sbi->s_bal_reqs);
                atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
                if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
                        atomic_inc(&sbi->s_bal_success);
                atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
+               atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned);
                if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
                                ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
                        atomic_inc(&sbi->s_bal_goals);