- * Return buffer_head on success or an ERR_PTR in case of failure.
+ * Return buffer_head on success, an ERR_PTR in case of failure, or NULL
+ * if ignore_locked is set and the buffer is already under I/O.
  */
 struct buffer_head *
-ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
+ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group,
+                             bool ignore_locked)
 {
        struct ext4_group_desc *desc;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
                return ERR_PTR(-ENOMEM);
        }
 
+       if (ignore_locked && buffer_locked(bh)) {
+               /* buffer is already under I/O; return NULL if called for prefetching */
+               put_bh(bh);
+               return NULL;
+       }
+
        if (bitmap_uptodate(bh))
                goto verify;
 
        trace_ext4_read_block_bitmap_load(sb, block_group);
        bh->b_end_io = ext4_end_bitmap_read;
        get_bh(bh);
-       submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
+       submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO |
+                 (ignore_locked ? REQ_RAHEAD : 0), bh);
        return bh;
 verify:
        err = ext4_validate_block_bitmap(sb, desc, block_group, bh);
        struct buffer_head *bh;
        int err;
 
-       bh = ext4_read_block_bitmap_nowait(sb, block_group);
+       bh = ext4_read_block_bitmap_nowait(sb, block_group, false);
        if (IS_ERR(bh))
                return bh;
        err = ext4_wait_block_bitmap(sb, block_group, bh);
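
With the new ignore_locked argument, ext4_read_block_bitmap_nowait() has a
three-way return contract: an ERR_PTR on hard failure, NULL when the buffer
is already under I/O and the caller asked to skip it (only the prefetch path
passes true), and a valid buffer_head otherwise. A minimal caller sketch,
with a hypothetical function name and kernel context assumed:

	static int example_read_bitmap(struct super_block *sb,
				       ext4_group_t group, bool prefetch)
	{
		struct buffer_head *bh;

		bh = ext4_read_block_bitmap_nowait(sb, group, prefetch);
		if (IS_ERR(bh))
			return PTR_ERR(bh);	/* hard failure */
		if (!bh)
			return 0;		/* already under I/O, skipped */
		/* a synchronous caller would ext4_wait_block_bitmap() here;
		 * a prefetching caller just drops its reference */
		brelse(bh);
		return 0;
	}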
 
        /* where last allocation was done - for stream allocation */
        unsigned long s_mb_last_group;
        unsigned long s_mb_last_start;
+       unsigned int s_mb_prefetch;
+       unsigned int s_mb_prefetch_limit;
 
        /* stats for buddy allocator */
        atomic_t s_bal_reqs;    /* number of reqs with len > 1 */
 extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
 
 extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb,
-                                               ext4_group_t block_group);
+                                               ext4_group_t block_group,
+                                               bool ignore_locked);
 extern int ext4_wait_block_bitmap(struct super_block *sb,
                                  ext4_group_t block_group,
                                  struct buffer_head *bh);
        (1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT)
 #define EXT4_GROUP_INFO_IBITMAP_CORRUPT                \
        (1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT)
+#define EXT4_GROUP_INFO_BBITMAP_READ_BIT       4
 
 #define EXT4_MB_GRP_NEED_INIT(grp)     \
        (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
        (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
 #define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \
        (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GRP_TEST_AND_SET_READ(grp)     \
+       (test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state)))
 
 #define EXT4_MAX_CONTENTION            8
 #define EXT4_CONTENTION_THRESHOLD      2
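
EXT4_GROUP_INFO_BBITMAP_READ_BIT claims bit 4 of bb_state, and the
test-and-set wrapper is what makes the prefetch once-only: test_and_set_bit()
atomically sets the bit and returns its previous value, so exactly one caller
per group observes 0 and issues the read. A standalone userspace model of
that gate, using C11 atomics as a stand-in for the kernel primitive:

	#include <stdatomic.h>
	#include <stdio.h>

	#define EXT4_GROUP_INFO_BBITMAP_READ_BIT 4

	static _Atomic unsigned long bb_state;

	/* models test_and_set_bit(): set the bit, report its old value */
	static int test_and_set_read(void)
	{
		unsigned long bit = 1UL << EXT4_GROUP_INFO_BBITMAP_READ_BIT;

		return !!(atomic_fetch_or(&bb_state, bit) & bit);
	}

	int main(void)
	{
		printf("%d\n", test_and_set_read()); /* 0: first caller prefetches */
		printf("%d\n", test_and_set_read()); /* 1: later callers skip */
		return 0;
	}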
 
                        bh[i] = NULL;
                        continue;
                }
-               bh[i] = ext4_read_block_bitmap_nowait(sb, group);
+               bh[i] = ext4_read_block_bitmap_nowait(sb, group, false);
                if (IS_ERR(bh[i])) {
                        err = PTR_ERR(bh[i]);
                        bh[i] = NULL;
        return ret;
 }
 
+/*
+ * Start prefetching @nr block bitmaps starting at @group.
+ * Return the next group which needs to be prefetched.
+ */
+static ext4_group_t
+ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
+                unsigned int nr, int *cnt)
+{
+       ext4_group_t ngroups = ext4_get_groups_count(sb);
+       struct buffer_head *bh;
+       struct blk_plug plug;
+
+       blk_start_plug(&plug);
+       while (nr-- > 0) {
+               struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
+                                                                 NULL);
+               struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+
+               /*
+                * Prefetch block groups with free blocks; but don't
+                * bother if the group is marked uninitialized on disk,
+                * since building its bitmap won't require I/O.  Also,
+                * only try to prefetch once, so that we avoid the
+                * getblk() call, which can be expensive.
+                */
+               if (!EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
+                   EXT4_MB_GRP_NEED_INIT(grp) &&
+                   ext4_free_group_clusters(sb, gdp) > 0 &&
+                   !(ext4_has_group_desc_csum(sb) &&
+                     (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
+                       bh = ext4_read_block_bitmap_nowait(sb, group, true);
+                       if (bh && !IS_ERR(bh)) {
+                               if (!buffer_uptodate(bh) && cnt)
+                                       (*cnt)++;
+                               brelse(bh);
+                       }
+               }
+               if (++group >= ngroups)
+                       group = 0;
+       }
+       blk_finish_plug(&plug);
+       return group;
+}
+
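
The blk_start_plug()/blk_finish_plug() pair holds the reads submitted in the
loop in a per-task list and dispatches them together when the plug is
released, letting the block layer merge and sort up to @nr bitmap reads into
fewer, larger IOs. The pattern in isolation (kernel context assumed, sketch
only):

	struct blk_plug plug;

	blk_start_plug(&plug);
	/* ... several submit_bh(REQ_OP_READ, ...) calls ... */
	blk_finish_plug(&plug);		/* batch is dispatched here */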
+/*
+ * Prefetching reads the block bitmap into the buffer cache; but we
+ * need to make sure that the buddy bitmap in the page cache has been
+ * initialized.  Note that ext4_mb_init_group() will block if the I/O
+ * is not yet completed, or indeed if the I/O was not initiated by
+ * ext4_mb_prefetch at all.
+ *
+ * TODO: We should actually kick off the buddy bitmap setup in a work
+ * queue when the buffer I/O is completed, so that we don't block
+ * waiting for the block allocation bitmap read to finish when
+ * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator().
+ */
+static void
+ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
+                     unsigned int nr)
+{
+       while (nr-- > 0) {
+               struct ext4_group_desc *gdp;
+               struct ext4_group_info *grp;
+
+               /*
+                * Step back first: @group is the group following the
+                * last one prefetched, so look up the descriptor and
+                * group info only after decrementing.
+                */
+               if (!group)
+                       group = ext4_get_groups_count(sb);
+               group--;
+               gdp = ext4_get_group_desc(sb, group, NULL);
+               grp = ext4_get_group_info(sb, group);
+
+               if (EXT4_MB_GRP_NEED_INIT(grp) &&
+                   ext4_free_group_clusters(sb, gdp) > 0 &&
+                   !(ext4_has_group_desc_csum(sb) &&
+                     (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
+                       if (ext4_mb_init_group(sb, group, GFP_NOFS))
+                               break;
+               }
+       }
+}
+
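
ext4_mb_prefetch() returns the group after the last one it touched, and
ext4_mb_prefetch_fini() walks backwards from that point over the same
groups, wrapping below group 0 to the last group. A standalone userspace
model of the walk, with assumed example values:

	#include <stdio.h>

	int main(void)
	{
		unsigned int ngroups = 16; /* total groups (assumed) */
		unsigned int group = 2;	   /* first group not prefetched */
		unsigned int nr = 4;	   /* groups to walk back over */

		while (nr-- > 0) {
			if (!group)
				group = ngroups;
			group--;
			printf("init group %u\n", group); /* 1, 0, 15, 14 */
		}
		return 0;
	}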
 static noinline_for_stack int
 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 {
-       ext4_group_t ngroups, group, i;
+       ext4_group_t prefetch_grp = 0, ngroups, group, i;
        int cr = -1;
        int err = 0, first_err = 0;
+       unsigned int nr = 0, prefetch_ios = 0;
        struct ext4_sb_info *sbi;
        struct super_block *sb;
        struct ext4_buddy e4b;
                 * from the goal value specified
                 */
                group = ac->ac_g_ex.fe_group;
+               prefetch_grp = group;
 
                for (i = 0; i < ngroups; group++, i++) {
                        int ret = 0;
                        if (group >= ngroups)
                                group = 0;
 
+                       /*
+                        * Batch reads of the block allocation bitmaps
+                        * to get multiple READs in flight; limit
+                        * prefetching at cr=0/1, since otherwise mballoc
+                        * can spend a lot of time loading imperfect
+                        * groups.
+                        */
+                       if ((prefetch_grp == group) &&
+                           (cr > 1 ||
+                            prefetch_ios < sbi->s_mb_prefetch_limit)) {
+                               unsigned int curr_ios = prefetch_ios;
+
+                               nr = sbi->s_mb_prefetch;
+                               if (ext4_has_feature_flex_bg(sb)) {
+                                       nr = (group / sbi->s_mb_prefetch) *
+                                               sbi->s_mb_prefetch;
+                                       nr = nr + sbi->s_mb_prefetch - group;
+                               }
+                               prefetch_grp = ext4_mb_prefetch(sb, group,
+                                                       nr, &prefetch_ios);
+                               if (prefetch_ios == curr_ios)
+                                       nr = 0;
+                       }
+
                        /* This now checks without needing the buddy page */
                        ret = ext4_mb_good_group_nolock(ac, group, cr);
                        if (ret <= 0) {
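
The flex_bg branch above clamps the prefetch window to the end of the
current s_mb_prefetch-aligned chunk, so each flex group's bitmaps are read
together and the next trigger (prefetch_grp == group) fires exactly on a
chunk boundary. A worked example of the arithmetic, with assumed values:

	#include <stdio.h>

	int main(void)
	{
		unsigned int s_mb_prefetch = 16; /* chunk size (assumed) */
		unsigned int group = 35;
		unsigned int nr;

		nr = (group / s_mb_prefetch) * s_mb_prefetch; /* chunk start: 32 */
		nr = nr + s_mb_prefetch - group;	      /* to chunk end: 13 */
		printf("prefetch %u groups from group %u\n", nr, group);
		return 0;
	}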
        mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n",
                 ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
                 ac->ac_flags, cr, err);
+
+       if (nr)
+               ext4_mb_prefetch_fini(sb, prefetch_grp, nr);
+
        return err;
 }
 
                        goto err_freebuddy;
        }
 
+       if (ext4_has_feature_flex_bg(sb)) {
+               /* a single flex group is supposed to be read by a single IO */
+               sbi->s_mb_prefetch = 1 << sbi->s_es->s_log_groups_per_flex;
+               sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
+       } else {
+               sbi->s_mb_prefetch = 32;
+       }
+       if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
+               sbi->s_mb_prefetch = ext4_get_groups_count(sb);
+       /*
+        * How many real IOs to prefetch within a single allocation at
+        * cr=0: given that cr=0 is a CPU-related optimization we
+        * shouldn't try to load too many groups; at some point we
+        * should start to use what we've got in memory.  With an
+        * average random access time of 5ms, it'd take a second to get
+        * 200 groups (* N with flex_bg), so let's make this limit 4.
+        */
+       sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4;
+       if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb))
+               sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb);
+
        return 0;
 
 err_freebuddy:
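
For a filesystem with, say, s_log_groups_per_flex = 4, the sizing above
yields a 128-group window (16 groups per flex group, at most 8 prefetch IOs
in flight) and a cr=0/1 cap of 512 bitmap reads per allocation, each clamped
to the total group count. The arithmetic, standalone:

	#include <stdio.h>

	int main(void)
	{
		unsigned int log_groups_per_flex = 4; /* assumed example */
		unsigned int s_mb_prefetch = (1U << log_groups_per_flex) * 8;
		unsigned int s_mb_prefetch_limit = s_mb_prefetch * 4;

		/* 128-group window, 512-read cap at cr=0/1 */
		printf("prefetch=%u limit=%u\n",
		       s_mb_prefetch, s_mb_prefetch_limit);
		return 0;
	}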