There were a few conflicts that were fairly easy to resolve.
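
Most of the conflicts come from two block-layer API changes that this md work had to be reconciled with: bio completion status now lives in bio->bi_error (bio_endio() lost its error argument and the BIO_UPTODATE flag is gone), and the per-queue merge_bvec_fn hooks were removed in favour of splitting over-sized bios with blk_queue_split() in the make_request path. The hunks below convert the md end_io handlers to the new convention; as a rough sketch only (my_end_io and fail_bio are illustrative names, not part of this merge), the change looks like this:

    #include <linux/bio.h>

    /* New style: bio_endio() takes only the bio; status is in bio->bi_error. */
    static void my_end_io(struct bio *bio)
    {
            if (bio->bi_error)      /* 0 on success, negative errno on failure */
                    printk(KERN_ERR "md: I/O error %d\n", bio->bi_error);
            /* ... driver-specific completion work ... */
    }

    /* To fail a bio, record the errno first, then complete it. */
    static void fail_bio(struct bio *bio, int err)
    {
            bio->bi_error = err;
            bio_endio(bio);
    }

Previously the handler signature was void (*end_io)(struct bio *, int error) and success was tested via BIO_UPTODATE in bio->bi_flags.
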
Signed-off-by: NeilBrown <neilb@suse.com>
unsigned int sectors;
int cpu;
+ blk_queue_split(q, &bio, q->bio_split);
+
if (mddev == NULL || mddev->pers == NULL
|| !mddev->ready) {
bio_io_error(bio);
return;
}
if (mddev->ro == 1 && unlikely(rw == WRITE)) {
- bio_endio(bio, bio_sectors(bio) == 0 ? 0 : -EROFS);
+ if (bio_sectors(bio) != 0)
+ bio->bi_error = -EROFS;
+ bio_endio(bio);
return;
}
smp_rmb(); /* Ensure implications of 'active' are visible */
return mddev_congested(mddev, bits);
}
- static int md_mergeable_bvec(struct request_queue *q,
- struct bvec_merge_data *bvm,
- struct bio_vec *biovec)
- {
- struct mddev *mddev = q->queuedata;
- int ret;
- rcu_read_lock();
- if (mddev->suspended) {
- /* Must always allow one vec */
- if (bvm->bi_size == 0)
- ret = biovec->bv_len;
- else
- ret = 0;
- } else {
- struct md_personality *pers = mddev->pers;
- if (pers && pers->mergeable_bvec)
- ret = pers->mergeable_bvec(mddev, bvm, biovec);
- else
- ret = biovec->bv_len;
- }
- rcu_read_unlock();
- return ret;
- }
/*
* Generic flush handling for md
*/
- static void md_end_flush(struct bio *bio, int err)
+ static void md_end_flush(struct bio *bio)
{
struct md_rdev *rdev = bio->bi_private;
struct mddev *mddev = rdev->mddev;
if (bio->bi_iter.bi_size == 0)
/* an empty barrier - all done */
- bio_endio(bio, 0);
+ bio_endio(bio);
else {
bio->bi_rw &= ~REQ_FLUSH;
mddev->pers->make_request(mddev, bio);
bioset_free(bs);
}
+static void md_safemode_timeout(unsigned long data);
+
void mddev_init(struct mddev *mddev)
{
mutex_init(&mddev->open_mutex);
mutex_init(&mddev->bitmap_info.mutex);
INIT_LIST_HEAD(&mddev->disks);
INIT_LIST_HEAD(&mddev->all_mddevs);
- init_timer(&mddev->safemode_timer);
+ setup_timer(&mddev->safemode_timer, md_safemode_timeout,
+ (unsigned long) mddev);
atomic_set(&mddev->active, 1);
atomic_set(&mddev->openers, 0);
atomic_set(&mddev->active_io, 0);
}
EXPORT_SYMBOL_GPL(md_rdev_clear);
- static void super_written(struct bio *bio, int error)
+ static void super_written(struct bio *bio)
{
struct md_rdev *rdev = bio->bi_private;
struct mddev *mddev = rdev->mddev;
- if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
- printk("md: super_written gets error=%d, uptodate=%d\n",
- error, test_bit(BIO_UPTODATE, &bio->bi_flags));
- WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags));
+ if (bio->bi_error) {
+ printk("md: super_written gets error=%d\n", bio->bi_error);
md_error(mddev, rdev);
}
bio_add_page(bio, page, size, 0);
submit_bio_wait(rw, bio);
- ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ ret = !bio->bi_error;
bio_put(bio);
return ret;
}
return 0;
}
-static void md_safemode_timeout(unsigned long data);
-
static ssize_t
safe_delay_show(struct mddev *mddev, char *page)
{
type = "repair";
} else if (test_bit(MD_RECOVERY_RECOVER, &recovery))
type = "recover";
+ else if (mddev->reshape_position != MaxSector)
+ type = "reshape";
}
return sprintf(page, "%s\n", type);
}
if (mddev->queue) {
mddev->queue->backing_dev_info.congested_data = mddev;
mddev->queue->backing_dev_info.congested_fn = md_congested;
- blk_queue_merge_bvec(mddev->queue, md_mergeable_bvec);
}
if (pers->sync_request) {
if (mddev->kobj.sd &&
atomic_set(&mddev->max_corr_read_errors,
MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
mddev->safemode = 0;
- mddev->safemode_timer.function = md_safemode_timeout;
- mddev->safemode_timer.data = (unsigned long) mddev;
mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
mddev->in_sync = 1;
smp_wmb();
if (sysfs_link_rdev(mddev, rdev))
/* failure here is OK */;
+ if (mddev->degraded && !mddev->ro)
+ /* This ensures that recovering status is reported immediately
+ * via sysfs - until a lack of spares is confirmed.
+ */
+ set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
if (mddev->flags & MD_UPDATE_SB_FLAGS)
mddev->degraded = 0;
mddev->safemode = 0;
mddev->private = NULL;
- mddev->merge_check_needed = 0;
mddev->bitmap_info.offset = 0;
mddev->bitmap_info.default_offset = 0;
mddev->bitmap_info.default_space = 0;
__md_stop_writes(mddev);
__md_stop(mddev);
- mddev->queue->merge_bvec_fn = NULL;
mddev->queue->backing_dev_info.congested_fn = NULL;
/* tell userspace to handle 'inactive' */
err = 0;
spin_lock(&mddev->lock);
- /* bitmap disabled, zero the first byte and copy out */
- if (!mddev->bitmap_info.file)
- file->pathname[0] = '\0';
- else if ((ptr = file_path(mddev->bitmap_info.file,
- file->pathname, sizeof(file->pathname))),
- IS_ERR(ptr))
- err = PTR_ERR(ptr);
- else
- memmove(file->pathname, ptr,
- sizeof(file->pathname)-(ptr-file->pathname));
+ /* bitmap enabled */
+ if (mddev->bitmap_info.file) {
+ ptr = file_path(mddev->bitmap_info.file, file->pathname,
+ sizeof(file->pathname));
+ if (IS_ERR(ptr))
+ err = PTR_ERR(ptr);
+ else
+ memmove(file->pathname, ptr,
+ sizeof(file->pathname)-(ptr-file->pathname));
+ }
spin_unlock(&mddev->lock);
if (err == 0 &&
seq_printf(seq, "\n");
}
-static void status_resync(struct seq_file *seq, struct mddev *mddev)
+static int status_resync(struct seq_file *seq, struct mddev *mddev)
{
sector_t max_sectors, resync, res;
unsigned long dt, db;
int scale;
unsigned int per_milli;
- if (mddev->curr_resync <= 3)
- resync = 0;
- else
- resync = mddev->curr_resync
- - atomic_read(&mddev->recovery_active);
-
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
max_sectors = mddev->resync_max_sectors;
else
max_sectors = mddev->dev_sectors;
+ resync = mddev->curr_resync;
+ if (resync <= 3) {
+ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
+ /* Still cleaning up */
+ resync = max_sectors;
+ } else
+ resync -= atomic_read(&mddev->recovery_active);
+
+ if (resync == 0) {
+ if (mddev->recovery_cp < MaxSector) {
+ seq_printf(seq, "\tresync=PENDING");
+ return 1;
+ }
+ return 0;
+ }
+ if (resync < 3) {
+ seq_printf(seq, "\tresync=DELAYED");
+ return 1;
+ }
+
WARN_ON(max_sectors == 0);
/* Pick 'scale' such that (resync>>scale)*1000 will fit
* in a sector_t, and (max_sectors>>scale) will fit in a
((unsigned long)rt % 60)/6);
seq_printf(seq, " speed=%ldK/sec", db/2/dt);
+ return 1;
}
static void *md_seq_start(struct seq_file *seq, loff_t *pos)
mddev->pers->status(seq, mddev);
seq_printf(seq, "\n ");
if (mddev->pers->sync_request) {
- if (mddev->curr_resync > 2) {
- status_resync(seq, mddev);
+ if (status_resync(seq, mddev))
seq_printf(seq, "\n ");
- } else if (mddev->curr_resync >= 1)
- seq_printf(seq, "\tresync=DELAYED\n ");
- else if (mddev->recovery_cp < MaxSector)
- seq_printf(seq, "\tresync=PENDING\n ");
}
} else
seq_printf(seq, "\n ");
}
EXPORT_SYMBOL(unregister_md_personality);
-int register_md_cluster_operations(struct md_cluster_operations *ops, struct module *module)
+int register_md_cluster_operations(struct md_cluster_operations *ops,
+ struct module *module)
{
- if (md_cluster_ops != NULL)
- return -EALREADY;
+ int ret = 0;
spin_lock(&pers_lock);
- md_cluster_ops = ops;
- md_cluster_mod = module;
+ if (md_cluster_ops != NULL)
+ ret = -EALREADY;
+ else {
+ md_cluster_ops = ops;
+ md_cluster_mod = module;
+ }
spin_unlock(&pers_lock);
- return 0;
+ return ret;
}
EXPORT_SYMBOL(register_md_cluster_operations);
> (max_sectors >> 4)) ||
time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
(j - mddev->curr_resync_completed)*2
- >= mddev->resync_max - mddev->curr_resync_completed
+ >= mddev->resync_max - mddev->curr_resync_completed ||
+ mddev->curr_resync_completed > mddev->resync_max
)) {
/* time to update curr_resync_completed */
wait_event(mddev->recovery_wait,
break;
j += sectors;
+ if (j > max_sectors)
+ /* when skipping, extra large numbers can be returned. */
+ j = max_sectors;
if (j > 2)
mddev->curr_resync = j;
if (mddev_is_clustered(mddev))
blk_finish_plug(&plug);
wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
+ if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+ !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
+ mddev->curr_resync > 2) {
+ mddev->curr_resync_completed = mddev->curr_resync;
+ sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+ }
/* tell personality that we are finished */
mddev->pers->sync_request(mddev, max_sectors, &skipped);
- if (mddev_is_clustered(mddev))
- md_cluster_ops->resync_finish(mddev);
-
if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
mddev->curr_resync > 2) {
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
}
}
skip:
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->resync_finish(mddev);
+
set_bit(MD_CHANGE_DEVS, &mddev->flags);
spin_lock(&mddev->lock);
mddev->resync_max = MaxSector;
} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
mddev->resync_min = mddev->curr_resync_completed;
+ set_bit(MD_RECOVERY_DONE, &mddev->recovery);
mddev->curr_resync = 0;
spin_unlock(&mddev->lock);
wake_up(&resync_wait);
- set_bit(MD_RECOVERY_DONE, &mddev->recovery);
md_wakeup_thread(mddev->thread);
return;
}
*/
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_reap_sync_thread(mddev);
+ clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
goto unlock;
}
/* Make sure they get written out promptly */
sysfs_notify_dirent_safe(rdev->sysfs_state);
set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
+ set_bit(MD_CHANGE_PENDING, &rdev->mddev->flags);
md_wakeup_thread(rdev->mddev->thread);
}
return rv;
char b[BDEVNAME_SIZE];
char b2[BDEVNAME_SIZE];
struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
- bool discard_supported = false;
+ unsigned short blksize = 512;
if (!conf)
return -ENOMEM;
sector_div(sectors, mddev->chunk_sectors);
rdev1->sectors = sectors * mddev->chunk_sectors;
+ blksize = max(blksize, queue_logical_block_size(
+ rdev1->bdev->bd_disk->queue));
+
rdev_for_each(rdev2, mddev) {
pr_debug("md/raid0:%s: comparing %s(%llu)"
" with %s(%llu)\n",
}
pr_debug("md/raid0:%s: FINAL %d zones\n",
mdname(mddev), conf->nr_strip_zones);
+ /*
+ * now since we have the hard sector sizes, we can make sure
+ * chunk size is a multiple of that sector size
+ */
+ if ((mddev->chunk_sectors << 9) % blksize) {
+ printk(KERN_ERR "md/raid0:%s: chunk_size of %d not multiple of block size %d\n",
+ mdname(mddev),
+ mddev->chunk_sectors << 9, blksize);
+ err = -EINVAL;
+ goto abort;
+ }
+
err = -ENOMEM;
conf->strip_zone = kzalloc(sizeof(struct strip_zone)*
conf->nr_strip_zones, GFP_KERNEL);
}
dev[j] = rdev1;
- if (rdev1->bdev->bd_disk->queue->merge_bvec_fn)
- conf->has_merge_bvec = 1;
- if (mddev->queue)
- disk_stack_limits(mddev->gendisk, rdev1->bdev,
- rdev1->data_offset << 9);
--
if (!smallest || (rdev1->sectors < smallest->sectors))
smallest = rdev1;
cnt++;
-
- if (blk_queue_discard(bdev_get_queue(rdev1->bdev)))
- discard_supported = true;
}
if (cnt != mddev->raid_disks) {
printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - "
(unsigned long long)smallest->sectors);
}
- /*
- * now since we have the hard sector sizes, we can make sure
- * chunk size is a multiple of that sector size
- */
- if ((mddev->chunk_sectors << 9) % queue_logical_block_size(mddev->queue)) {
- printk(KERN_ERR "md/raid0:%s: chunk_size of %d not valid\n",
- mdname(mddev),
- mddev->chunk_sectors << 9);
- goto abort;
- }
-
- if (mddev->queue) {
- blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
- blk_queue_io_opt(mddev->queue,
- (mddev->chunk_sectors << 9) * mddev->raid_disks);
-
- if (!discard_supported)
- queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
- else
- queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
- }
-
pr_debug("md/raid0:%s: done.\n", mdname(mddev));
*private_conf = conf;
+ sector_div(sector, zone->nb_dev)];
}
- /**
- * raid0_mergeable_bvec -- tell bio layer if two requests can be merged
- * @mddev: the md device
- * @bvm: properties of new bio
- * @biovec: the request that could be merged to it.
- *
- * Return amount of bytes we can accept at this offset
- */
- static int raid0_mergeable_bvec(struct mddev *mddev,
- struct bvec_merge_data *bvm,
- struct bio_vec *biovec)
- {
- struct r0conf *conf = mddev->private;
- sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
- sector_t sector_offset = sector;
- int max;
- unsigned int chunk_sectors = mddev->chunk_sectors;
- unsigned int bio_sectors = bvm->bi_size >> 9;
- struct strip_zone *zone;
- struct md_rdev *rdev;
- struct request_queue *subq;
-
- if (is_power_of_2(chunk_sectors))
- max = (chunk_sectors - ((sector & (chunk_sectors-1))
- + bio_sectors)) << 9;
- else
- max = (chunk_sectors - (sector_div(sector, chunk_sectors)
- + bio_sectors)) << 9;
- if (max < 0)
- max = 0; /* bio_add cannot handle a negative return */
- if (max <= biovec->bv_len && bio_sectors == 0)
- return biovec->bv_len;
- if (max < biovec->bv_len)
- /* too small already, no need to check further */
- return max;
- if (!conf->has_merge_bvec)
- return max;
-
- /* May need to check subordinate device */
- sector = sector_offset;
- zone = find_zone(mddev->private, &sector_offset);
- rdev = map_sector(mddev, zone, sector, &sector_offset);
- subq = bdev_get_queue(rdev->bdev);
- if (subq->merge_bvec_fn) {
- bvm->bi_bdev = rdev->bdev;
- bvm->bi_sector = sector_offset + zone->dev_start +
- rdev->data_offset;
- return min(max, subq->merge_bvec_fn(subq, bvm, biovec));
- } else
- return max;
- }
-
static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks)
{
sector_t array_sectors = 0;
if (md_check_no_bitmap(mddev))
return -EINVAL;
- if (mddev->queue) {
- blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
- blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
- blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
- }
-
/* if private is not null, we are here after takeover */
if (mddev->private == NULL) {
ret = create_strip_zones(mddev, &conf);
mddev->private = conf;
}
conf = mddev->private;
+ if (mddev->queue) {
+ struct md_rdev *rdev;
+ bool discard_supported = false;
+
+ rdev_for_each(rdev, mddev) {
+ disk_stack_limits(mddev->gendisk, rdev->bdev,
+ rdev->data_offset << 9);
+ if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
+ discard_supported = true;
+ }
+ blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
+ blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
+ blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
+
+ blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
+ blk_queue_io_opt(mddev->queue,
+ (mddev->chunk_sectors << 9) * mddev->raid_disks);
+
+ if (!discard_supported)
+ queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+ else
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+ }
/* calculate array device size */
md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));
if (unlikely((split->bi_rw & REQ_DISCARD) &&
!blk_queue_discard(bdev_get_queue(split->bi_bdev)))) {
/* Just ignore it */
- bio_endio(split, 0);
+ bio_endio(split);
} else
generic_make_request(split);
} while (split != bio);
.takeover = raid0_takeover,
.quiesce = raid0_quiesce,
.congested = raid0_congested,
- .mergeable_bvec = raid0_mergeable_bvec,
};
static int __init raid0_init (void)
done = 1;
if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
- clear_bit(BIO_UPTODATE, &bio->bi_flags);
+ bio->bi_error = -EIO;
+
if (done) {
- bio_endio(bio, 0);
+ bio_endio(bio);
/*
* Wake up any possible resync thread that waits for the device
* to go idle.
return mirror;
}
- static void raid1_end_read_request(struct bio *bio, int error)
+ static void raid1_end_read_request(struct bio *bio)
{
- int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ int uptodate = !bio->bi_error;
struct r1bio *r1_bio = bio->bi_private;
int mirror;
struct r1conf *conf = r1_bio->mddev->private;
}
}
- static void raid1_end_write_request(struct bio *bio, int error)
+ static void raid1_end_write_request(struct bio *bio)
{
- int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct r1bio *r1_bio = bio->bi_private;
int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
struct r1conf *conf = r1_bio->mddev->private;
/*
* 'one mirror IO has finished' event handler:
*/
- if (!uptodate) {
+ if (bio->bi_error) {
set_bit(WriteErrorSeen,
&conf->mirrors[mirror].rdev->flags);
if (!test_and_set_bit(WantReplacement,
rdev = rcu_dereference(conf->mirrors[disk].rdev);
if (r1_bio->bios[disk] == IO_BLOCKED
|| rdev == NULL
- || test_bit(Unmerged, &rdev->flags)
|| test_bit(Faulty, &rdev->flags))
continue;
if (!test_bit(In_sync, &rdev->flags) &&
return best_disk;
}
- static int raid1_mergeable_bvec(struct mddev *mddev,
- struct bvec_merge_data *bvm,
- struct bio_vec *biovec)
- {
- struct r1conf *conf = mddev->private;
- sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
- int max = biovec->bv_len;
-
- if (mddev->merge_check_needed) {
- int disk;
- rcu_read_lock();
- for (disk = 0; disk < conf->raid_disks * 2; disk++) {
- struct md_rdev *rdev = rcu_dereference(
- conf->mirrors[disk].rdev);
- if (rdev && !test_bit(Faulty, &rdev->flags)) {
- struct request_queue *q =
- bdev_get_queue(rdev->bdev);
- if (q->merge_bvec_fn) {
- bvm->bi_sector = sector +
- rdev->data_offset;
- bvm->bi_bdev = rdev->bdev;
- max = min(max, q->merge_bvec_fn(
- q, bvm, biovec));
- }
- }
- }
- rcu_read_unlock();
- }
- return max;
-
- }
-
static int raid1_congested(struct mddev *mddev, int bits)
{
struct r1conf *conf = mddev->private;
if (unlikely((bio->bi_rw & REQ_DISCARD) &&
!blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
/* Just ignore it */
- bio_endio(bio, 0);
+ bio_endio(bio);
else
generic_make_request(bio);
bio = next;
if (unlikely((bio->bi_rw & REQ_DISCARD) &&
!blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
/* Just ignore it */
- bio_endio(bio, 0);
+ bio_endio(bio);
else
generic_make_request(bio);
bio = next;
* non-zero, then it is the number of not-completed requests.
*/
bio->bi_phys_segments = 0;
- clear_bit(BIO_SEG_VALID, &bio->bi_flags);
+ bio_clear_flag(bio, BIO_SEG_VALID);
if (rw == READ) {
/*
break;
}
r1_bio->bios[i] = NULL;
- if (!rdev || test_bit(Faulty, &rdev->flags)
- || test_bit(Unmerged, &rdev->flags)) {
+ if (!rdev || test_bit(Faulty, &rdev->flags)) {
if (i < conf->raid_disks)
set_bit(R1BIO_Degraded, &r1_bio->state);
continue;
*/
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ set_bit(MD_CHANGE_PENDING, &mddev->flags);
printk(KERN_ALERT
"md/raid1:%s: Disk failure on %s, disabling device.\n"
"md/raid1:%s: Operation continuing on %d devices.\n",
struct raid1_info *p;
int first = 0;
int last = conf->raid_disks - 1;
- struct request_queue *q = bdev_get_queue(rdev->bdev);
if (mddev->recovery_disabled == conf->recovery_disabled)
return -EBUSY;
if (rdev->raid_disk >= 0)
first = last = rdev->raid_disk;
- if (q->merge_bvec_fn) {
- set_bit(Unmerged, &rdev->flags);
- mddev->merge_check_needed = 1;
- }
-
for (mirror = first; mirror <= last; mirror++) {
p = conf->mirrors+mirror;
if (!p->rdev) {
break;
}
}
- if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
- /* Some requests might not have seen this new
- * merge_bvec_fn. We must wait for them to complete
- * before merging the device fully.
- * First we make sure any code which has tested
- * our function has submitted the request, then
- * we wait for all outstanding requests to complete.
- */
- synchronize_sched();
- freeze_array(conf, 0);
- unfreeze_array(conf);
- clear_bit(Unmerged, &rdev->flags);
- }
md_integrity_add_rdev(rdev, mddev);
if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
return err;
}
- static void end_sync_read(struct bio *bio, int error)
+ static void end_sync_read(struct bio *bio)
{
struct r1bio *r1_bio = bio->bi_private;
* or re-read if the read failed.
* We don't do much here, just schedule handling by raid1d
*/
- if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+ if (!bio->bi_error)
set_bit(R1BIO_Uptodate, &r1_bio->state);
if (atomic_dec_and_test(&r1_bio->remaining))
reschedule_retry(r1_bio);
}
- static void end_sync_write(struct bio *bio, int error)
+ static void end_sync_write(struct bio *bio)
{
- int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ int uptodate = !bio->bi_error;
struct r1bio *r1_bio = bio->bi_private;
struct mddev *mddev = r1_bio->mddev;
struct r1conf *conf = mddev->private;
idx ++;
}
set_bit(R1BIO_Uptodate, &r1_bio->state);
- set_bit(BIO_UPTODATE, &bio->bi_flags);
+ bio->bi_error = 0;
return 1;
}
for (i = 0; i < conf->raid_disks * 2; i++) {
int j;
int size;
- int uptodate;
+ int error;
struct bio *b = r1_bio->bios[i];
if (b->bi_end_io != end_sync_read)
continue;
- /* fixup the bio for reuse, but preserve BIO_UPTODATE */
- uptodate = test_bit(BIO_UPTODATE, &b->bi_flags);
+ /* fixup the bio for reuse, but preserve errno */
+ error = b->bi_error;
bio_reset(b);
- if (!uptodate)
- clear_bit(BIO_UPTODATE, &b->bi_flags);
+ b->bi_error = error;
b->bi_vcnt = vcnt;
b->bi_iter.bi_size = r1_bio->sectors << 9;
b->bi_iter.bi_sector = r1_bio->sector +
}
for (primary = 0; primary < conf->raid_disks * 2; primary++)
if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
- test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
+ !r1_bio->bios[primary]->bi_error) {
r1_bio->bios[primary]->bi_end_io = NULL;
rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
break;
int j;
struct bio *pbio = r1_bio->bios[primary];
struct bio *sbio = r1_bio->bios[i];
- int uptodate = test_bit(BIO_UPTODATE, &sbio->bi_flags);
+ int error = sbio->bi_error;
if (sbio->bi_end_io != end_sync_read)
continue;
- /* Now we can 'fixup' the BIO_UPTODATE flag */
- set_bit(BIO_UPTODATE, &sbio->bi_flags);
+ /* Now we can 'fixup' the error value */
+ sbio->bi_error = 0;
- if (uptodate) {
+ if (!error) {
for (j = vcnt; j-- ; ) {
struct page *p, *s;
p = pbio->bi_io_vec[j].bv_page;
if (j >= 0)
atomic64_add(r1_bio->sectors, &mddev->resync_mismatches);
if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
- && uptodate)) {
+ && !error)) {
/* No need to write to this device. */
sbio->bi_end_io = NULL;
rdev_dec_pending(conf->mirrors[i].rdev, mddev);
struct bio *bio = r1_bio->bios[m];
if (bio->bi_end_io == NULL)
continue;
- if (test_bit(BIO_UPTODATE, &bio->bi_flags) &&
+ if (!bio->bi_error &&
test_bit(R1BIO_MadeGood, &r1_bio->state)) {
rdev_clear_badblocks(rdev, r1_bio->sector, s, 0);
}
- if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
+ if (bio->bi_error &&
test_bit(R1BIO_WriteError, &r1_bio->state)) {
if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0))
md_error(conf->mddev, rdev);
static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
{
int m;
+ bool fail = false;
for (m = 0; m < conf->raid_disks * 2 ; m++)
if (r1_bio->bios[m] == IO_MADE_GOOD) {
struct md_rdev *rdev = conf->mirrors[m].rdev;
* narrow down and record precise write
* errors.
*/
+ fail = true;
if (!narrow_write_error(r1_bio, m)) {
md_error(conf->mddev,
conf->mirrors[m].rdev);
}
if (test_bit(R1BIO_WriteError, &r1_bio->state))
close_write(r1_bio);
- raid_end_bio_io(r1_bio);
+ if (fail) {
+ spin_lock_irq(&conf->device_lock);
+ list_add(&r1_bio->retry_list, &conf->bio_end_io_list);
+ spin_unlock_irq(&conf->device_lock);
+ md_wakeup_thread(conf->mddev->thread);
+ } else
+ raid_end_bio_io(r1_bio);
}
static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
md_check_recovery(mddev);
+ if (!list_empty_careful(&conf->bio_end_io_list) &&
+ !test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+ LIST_HEAD(tmp);
+ spin_lock_irqsave(&conf->device_lock, flags);
+ if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+ list_add(&tmp, &conf->bio_end_io_list);
+ list_del_init(&conf->bio_end_io_list);
+ }
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ while (!list_empty(&tmp)) {
+ r1_bio = list_first_entry(&conf->bio_end_io_list,
+ struct r1bio, retry_list);
+ list_del(&r1_bio->retry_list);
+ raid_end_bio_io(r1_bio);
+ }
+ }
+
blk_start_plug(&plug);
for (;;) {
/* remove last page from this bio */
bio->bi_vcnt--;
bio->bi_iter.bi_size -= len;
- __clear_bit(BIO_SEG_VALID, &bio->bi_flags);
+ bio_clear_flag(bio, BIO_SEG_VALID);
}
goto bio_full;
}
goto abort;
disk->rdev = rdev;
q = bdev_get_queue(rdev->bdev);
- if (q->merge_bvec_fn)
- mddev->merge_check_needed = 1;
disk->head_position = 0;
disk->seq_start = MaxSector;
conf->raid_disks = mddev->raid_disks;
conf->mddev = mddev;
INIT_LIST_HEAD(&conf->retry_list);
+ INIT_LIST_HEAD(&conf->bio_end_io_list);
spin_lock_init(&conf->resync_lock);
init_waitqueue_head(&conf->wait_barrier);
unfreeze_array(conf);
+ set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
.quiesce = raid1_quiesce,
.takeover = raid1_takeover,
.congested = raid1_congested,
- .mergeable_bvec = raid1_mergeable_bvec,
};
static int __init raid_init(void)
static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
int *skipped);
static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
- static void end_reshape_write(struct bio *bio, int error);
+ static void end_reshape_write(struct bio *bio);
static void end_reshape(struct r10conf *conf);
static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
} else
done = 1;
if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
- clear_bit(BIO_UPTODATE, &bio->bi_flags);
+ bio->bi_error = -EIO;
if (done) {
- bio_endio(bio, 0);
+ bio_endio(bio);
/*
* Wake up any possible resync thread that waits for the device
* to go idle.
return r10_bio->devs[slot].devnum;
}
- static void raid10_end_read_request(struct bio *bio, int error)
+ static void raid10_end_read_request(struct bio *bio)
{
- int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ int uptodate = !bio->bi_error;
struct r10bio *r10_bio = bio->bi_private;
int slot, dev;
struct md_rdev *rdev;
}
}
- static void raid10_end_write_request(struct bio *bio, int error)
+ static void raid10_end_write_request(struct bio *bio)
{
- int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct r10bio *r10_bio = bio->bi_private;
int dev;
int dec_rdev = 1;
/*
* this branch is our 'one mirror IO has finished' event handler:
*/
- if (!uptodate) {
+ if (bio->bi_error) {
if (repl)
/* Never record new bad blocks to replacement,
* just fail it.
return (vchunk << geo->chunk_shift) + offset;
}
- /**
- * raid10_mergeable_bvec -- tell bio layer if a two requests can be merged
- * @mddev: the md device
- * @bvm: properties of new bio
- * @biovec: the request that could be merged to it.
- *
- * Return amount of bytes we can accept at this offset
- * This requires checking for end-of-chunk if near_copies != raid_disks,
- * and for subordinate merge_bvec_fns if merge_check_needed.
- */
- static int raid10_mergeable_bvec(struct mddev *mddev,
- struct bvec_merge_data *bvm,
- struct bio_vec *biovec)
- {
- struct r10conf *conf = mddev->private;
- sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
- int max;
- unsigned int chunk_sectors;
- unsigned int bio_sectors = bvm->bi_size >> 9;
- struct geom *geo = &conf->geo;
-
- chunk_sectors = (conf->geo.chunk_mask & conf->prev.chunk_mask) + 1;
- if (conf->reshape_progress != MaxSector &&
- ((sector >= conf->reshape_progress) !=
- conf->mddev->reshape_backwards))
- geo = &conf->prev;
-
- if (geo->near_copies < geo->raid_disks) {
- max = (chunk_sectors - ((sector & (chunk_sectors - 1))
- + bio_sectors)) << 9;
- if (max < 0)
- /* bio_add cannot handle a negative return */
- max = 0;
- if (max <= biovec->bv_len && bio_sectors == 0)
- return biovec->bv_len;
- } else
- max = biovec->bv_len;
-
- if (mddev->merge_check_needed) {
- struct {
- struct r10bio r10_bio;
- struct r10dev devs[conf->copies];
- } on_stack;
- struct r10bio *r10_bio = &on_stack.r10_bio;
- int s;
- if (conf->reshape_progress != MaxSector) {
- /* Cannot give any guidance during reshape */
- if (max <= biovec->bv_len && bio_sectors == 0)
- return biovec->bv_len;
- return 0;
- }
- r10_bio->sector = sector;
- raid10_find_phys(conf, r10_bio);
- rcu_read_lock();
- for (s = 0; s < conf->copies; s++) {
- int disk = r10_bio->devs[s].devnum;
- struct md_rdev *rdev = rcu_dereference(
- conf->mirrors[disk].rdev);
- if (rdev && !test_bit(Faulty, &rdev->flags)) {
- struct request_queue *q =
- bdev_get_queue(rdev->bdev);
- if (q->merge_bvec_fn) {
- bvm->bi_sector = r10_bio->devs[s].addr
- + rdev->data_offset;
- bvm->bi_bdev = rdev->bdev;
- max = min(max, q->merge_bvec_fn(
- q, bvm, biovec));
- }
- }
- rdev = rcu_dereference(conf->mirrors[disk].replacement);
- if (rdev && !test_bit(Faulty, &rdev->flags)) {
- struct request_queue *q =
- bdev_get_queue(rdev->bdev);
- if (q->merge_bvec_fn) {
- bvm->bi_sector = r10_bio->devs[s].addr
- + rdev->data_offset;
- bvm->bi_bdev = rdev->bdev;
- max = min(max, q->merge_bvec_fn(
- q, bvm, biovec));
- }
- }
- }
- rcu_read_unlock();
- }
- return max;
- }
-
/*
* This routine returns the disk from which the requested read should
* be done. There is a per-array 'next expected sequential IO' sector
disk = r10_bio->devs[slot].devnum;
rdev = rcu_dereference(conf->mirrors[disk].replacement);
if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
- test_bit(Unmerged, &rdev->flags) ||
r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
rdev = rcu_dereference(conf->mirrors[disk].rdev);
if (rdev == NULL ||
- test_bit(Faulty, &rdev->flags) ||
- test_bit(Unmerged, &rdev->flags))
+ test_bit(Faulty, &rdev->flags))
continue;
if (!test_bit(In_sync, &rdev->flags) &&
r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
if (unlikely((bio->bi_rw & REQ_DISCARD) &&
!blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
/* Just ignore it */
- bio_endio(bio, 0);
+ bio_endio(bio);
else
generic_make_request(bio);
bio = next;
if (unlikely((bio->bi_rw & REQ_DISCARD) &&
!blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
/* Just ignore it */
- bio_endio(bio, 0);
+ bio_endio(bio);
else
generic_make_request(bio);
bio = next;
* non-zero, then it is the number of not-completed requests.
*/
bio->bi_phys_segments = 0;
- clear_bit(BIO_SEG_VALID, &bio->bi_flags);
+ bio_clear_flag(bio, BIO_SEG_VALID);
if (rw == READ) {
/*
blocked_rdev = rrdev;
break;
}
- if (rdev && (test_bit(Faulty, &rdev->flags)
- || test_bit(Unmerged, &rdev->flags)))
+ if (rdev && (test_bit(Faulty, &rdev->flags)))
rdev = NULL;
- if (rrdev && (test_bit(Faulty, &rrdev->flags)
- || test_bit(Unmerged, &rrdev->flags)))
+ if (rrdev && (test_bit(Faulty, &rrdev->flags)))
rrdev = NULL;
r10_bio->devs[i].bio = NULL;
set_bit(Blocked, &rdev->flags);
set_bit(Faulty, &rdev->flags);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ set_bit(MD_CHANGE_PENDING, &mddev->flags);
spin_unlock_irqrestore(&conf->device_lock, flags);
printk(KERN_ALERT
"md/raid10:%s: Disk failure on %s, disabling device.\n"
int mirror;
int first = 0;
int last = conf->geo.raid_disks - 1;
- struct request_queue *q = bdev_get_queue(rdev->bdev);
if (mddev->recovery_cp < MaxSector)
/* only hot-add to in-sync arrays, as recovery is
if (rdev->raid_disk >= 0)
first = last = rdev->raid_disk;
- if (q->merge_bvec_fn) {
- set_bit(Unmerged, &rdev->flags);
- mddev->merge_check_needed = 1;
- }
-
if (rdev->saved_raid_disk >= first &&
conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
mirror = rdev->saved_raid_disk;
rcu_assign_pointer(p->rdev, rdev);
break;
}
- if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
- /* Some requests might not have seen this new
- * merge_bvec_fn. We must wait for them to complete
- * before merging the device fully.
- * First we make sure any code which has tested
- * our function has submitted the request, then
- * we wait for all outstanding requests to complete.
- */
- synchronize_sched();
- freeze_array(conf, 0);
- unfreeze_array(conf);
- clear_bit(Unmerged, &rdev->flags);
- }
md_integrity_add_rdev(rdev, mddev);
if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
return err;
}
- static void end_sync_read(struct bio *bio, int error)
+ static void end_sync_read(struct bio *bio)
{
struct r10bio *r10_bio = bio->bi_private;
struct r10conf *conf = r10_bio->mddev->private;
} else
d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
- if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+ if (!bio->bi_error)
set_bit(R10BIO_Uptodate, &r10_bio->state);
else
/* The write handler will notice the lack of
}
}
- static void end_sync_write(struct bio *bio, int error)
+ static void end_sync_write(struct bio *bio)
{
- int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct r10bio *r10_bio = bio->bi_private;
struct mddev *mddev = r10_bio->mddev;
struct r10conf *conf = mddev->private;
else
rdev = conf->mirrors[d].rdev;
- if (!uptodate) {
+ if (bio->bi_error) {
if (repl)
md_error(mddev, rdev);
else {
/* find the first device with a block */
for (i=0; i<conf->copies; i++)
- if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
+ if (!r10_bio->devs[i].bio->bi_error)
break;
if (i == conf->copies)
continue;
if (i == first)
continue;
- if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) {
+ if (!r10_bio->devs[i].bio->bi_error) {
/* We know that the bi_io_vec layout is the same for
* both 'first' and 'i', so we just compare them.
* All vec entries are PAGE_SIZE;
d = r10_bio->devs[sl].devnum;
rdev = rcu_dereference(conf->mirrors[d].rdev);
if (rdev &&
- !test_bit(Unmerged, &rdev->flags) &&
test_bit(In_sync, &rdev->flags) &&
is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
&first_bad, &bad_sectors) == 0) {
d = r10_bio->devs[sl].devnum;
rdev = rcu_dereference(conf->mirrors[d].rdev);
if (!rdev ||
- test_bit(Unmerged, &rdev->flags) ||
!test_bit(In_sync, &rdev->flags))
continue;
rdev = conf->mirrors[dev].rdev;
if (r10_bio->devs[m].bio == NULL)
continue;
- if (test_bit(BIO_UPTODATE,
- &r10_bio->devs[m].bio->bi_flags)) {
+ if (!r10_bio->devs[m].bio->bi_error) {
rdev_clear_badblocks(
rdev,
r10_bio->devs[m].addr,
rdev = conf->mirrors[dev].replacement;
if (r10_bio->devs[m].repl_bio == NULL)
continue;
- if (test_bit(BIO_UPTODATE,
- &r10_bio->devs[m].repl_bio->bi_flags)) {
+
+ if (!r10_bio->devs[m].repl_bio->bi_error) {
rdev_clear_badblocks(
rdev,
r10_bio->devs[m].addr,
}
put_buf(r10_bio);
} else {
+ bool fail = false;
for (m = 0; m < conf->copies; m++) {
int dev = r10_bio->devs[m].devnum;
struct bio *bio = r10_bio->devs[m].bio;
r10_bio->devs[m].addr,
r10_bio->sectors, 0);
rdev_dec_pending(rdev, conf->mddev);
- } else if (bio != NULL &&
- !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+ } else if (bio != NULL && bio->bi_error) {
+ fail = true;
if (!narrow_write_error(r10_bio, m)) {
md_error(conf->mddev, rdev);
set_bit(R10BIO_Degraded,
if (test_bit(R10BIO_WriteError,
&r10_bio->state))
close_write(r10_bio);
- raid_end_bio_io(r10_bio);
+ if (fail) {
+ spin_lock_irq(&conf->device_lock);
+ list_add(&r10_bio->retry_list, &conf->bio_end_io_list);
+ spin_unlock_irq(&conf->device_lock);
+ md_wakeup_thread(conf->mddev->thread);
+ } else
+ raid_end_bio_io(r10_bio);
}
}
md_check_recovery(mddev);
+ if (!list_empty_careful(&conf->bio_end_io_list) &&
+ !test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+ LIST_HEAD(tmp);
+ spin_lock_irqsave(&conf->device_lock, flags);
+ if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+ list_add(&tmp, &conf->bio_end_io_list);
+ list_del_init(&conf->bio_end_io_list);
+ }
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ while (!list_empty(&tmp)) {
+ r10_bio = list_first_entry(&conf->bio_end_io_list,
+ struct r10bio, retry_list);
+ list_del(&r10_bio->retry_list);
+ raid_end_bio_io(r10_bio);
+ }
+ }
+
blk_start_plug(&plug);
for (;;) {
bio = r10_bio->devs[i].bio;
bio_reset(bio);
- clear_bit(BIO_UPTODATE, &bio->bi_flags);
+ bio->bi_error = -EIO;
if (conf->mirrors[d].rdev == NULL ||
test_bit(Faulty, &conf->mirrors[d].rdev->flags))
continue;
/* Need to set up for writing to the replacement */
bio = r10_bio->devs[i].repl_bio;
bio_reset(bio);
- clear_bit(BIO_UPTODATE, &bio->bi_flags);
+ bio->bi_error = -EIO;
sector = r10_bio->devs[i].addr;
atomic_inc(&conf->mirrors[d].rdev->nr_pending);
/* remove last page from this bio */
bio2->bi_vcnt--;
bio2->bi_iter.bi_size -= len;
- __clear_bit(BIO_SEG_VALID, &bio2->bi_flags);
+ bio_clear_flag(bio2, BIO_SEG_VALID);
}
goto bio_full;
}
if (bio->bi_end_io == end_sync_read) {
md_sync_acct(bio->bi_bdev, nr_sectors);
- set_bit(BIO_UPTODATE, &bio->bi_flags);
+ bio->bi_error = 0;
generic_make_request(bio);
}
}
conf->reshape_safe = conf->reshape_progress;
spin_lock_init(&conf->device_lock);
INIT_LIST_HEAD(&conf->retry_list);
+ INIT_LIST_HEAD(&conf->bio_end_io_list);
spin_lock_init(&conf->resync_lock);
init_waitqueue_head(&conf->wait_barrier);
disk->rdev = rdev;
}
q = bdev_get_queue(rdev->bdev);
- if (q->merge_bvec_fn)
- mddev->merge_check_needed = 1;
diff = (rdev->new_data_offset - rdev->data_offset);
if (!mddev->reshape_backwards)
diff = -diff;
* at a time, possibly less if that exceeds RESYNC_PAGES,
* or we hit a bad block or something.
* This might mean we pause for normal IO in the middle of
- * a chunk, but that is not a problem was mddev->reshape_position
+ * a chunk, but that is not a problem as mddev->reshape_position
* can record any location.
*
* If we will want to write to a location that isn't
*
* In all this the minimum difference in data offsets
* (conf->offset_diff - always positive) allows a bit of slack,
- * so next can be after 'safe', but not by more than offset_disk
+ * so next can be after 'safe', but not by more than offset_diff
*
* We need to prepare all the bios here before we start any IO
* to ensure the size we choose is acceptable to all devices.
read_bio->bi_end_io = end_sync_read;
read_bio->bi_rw = READ;
read_bio->bi_flags &= (~0UL << BIO_RESET_BITS);
- __set_bit(BIO_UPTODATE, &read_bio->bi_flags);
+ read_bio->bi_error = 0;
read_bio->bi_vcnt = 0;
read_bio->bi_iter.bi_size = 0;
r10_bio->master_bio = read_bio;
/* Remove last page from this bio */
bio2->bi_vcnt--;
bio2->bi_iter.bi_size -= len;
- __clear_bit(BIO_SEG_VALID, &bio2->bi_flags);
+ bio_clear_flag(bio2, BIO_SEG_VALID);
}
goto bio_full;
}
return 0;
}
- static void end_reshape_write(struct bio *bio, int error)
+ static void end_reshape_write(struct bio *bio)
{
- int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct r10bio *r10_bio = bio->bi_private;
struct mddev *mddev = r10_bio->mddev;
struct r10conf *conf = mddev->private;
rdev = conf->mirrors[d].rdev;
}
- if (!uptodate) {
+ if (bio->bi_error) {
/* FIXME should record badblock */
md_error(mddev, rdev);
}
.start_reshape = raid10_start_reshape,
.finish_reshape = raid10_finish_reshape,
.congested = raid10_congested,
- .mergeable_bvec = raid10_mergeable_bvec,
};
static int __init raid_init(void)
return slot;
}
-static void return_io(struct bio *return_bi)
+static void return_io(struct bio_list *return_bi)
{
- struct bio *bi = return_bi;
- while (bi) {
-
- return_bi = bi->bi_next;
- bi->bi_next = NULL;
+ struct bio *bi;
+ while ((bi = bio_list_pop(return_bi)) != NULL) {
bi->bi_iter.bi_size = 0;
trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
bi, 0);
- bio_endio(bi, 0);
+ bio_endio(bi);
- bi = return_bi;
}
}
}
static void
- raid5_end_read_request(struct bio *bi, int error);
+ raid5_end_read_request(struct bio *bi);
static void
- raid5_end_write_request(struct bio *bi, int error);
+ raid5_end_write_request(struct bio *bi);
static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
{
static void ops_complete_biofill(void *stripe_head_ref)
{
struct stripe_head *sh = stripe_head_ref;
- struct bio *return_bi = NULL;
+ struct bio_list return_bi = BIO_EMPTY_LIST;
int i;
pr_debug("%s: stripe %llu\n", __func__,
while (rbi && rbi->bi_iter.bi_sector <
dev->sector + STRIPE_SECTORS) {
rbi2 = r5_next_bio(rbi, dev->sector);
- if (!raid5_dec_bi_active_stripes(rbi)) {
- rbi->bi_next = return_bi;
- return_bi = rbi;
- }
+ if (!raid5_dec_bi_active_stripes(rbi))
+ bio_list_add(&return_bi, rbi);
rbi = rbi2;
}
}
}
clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
- return_io(return_bi);
+ return_io(&return_bi);
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
conf->slab_cache = NULL;
}
- static void raid5_end_read_request(struct bio * bi, int error)
+ static void raid5_end_read_request(struct bio * bi)
{
struct stripe_head *sh = bi->bi_private;
struct r5conf *conf = sh->raid_conf;
int disks = sh->disks, i;
- int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
char b[BDEVNAME_SIZE];
struct md_rdev *rdev = NULL;
sector_t s;
if (bi == &sh->dev[i].req)
break;
- pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n",
+ pr_debug("end_read_request %llu/%d, count: %d, error %d.\n",
(unsigned long long)sh->sector, i, atomic_read(&sh->count),
- uptodate);
+ bi->bi_error);
if (i == disks) {
BUG();
return;
s = sh->sector + rdev->new_data_offset;
else
s = sh->sector + rdev->data_offset;
- if (uptodate) {
+ if (!bi->bi_error) {
set_bit(R5_UPTODATE, &sh->dev[i].flags);
if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
/* Note that this cannot happen on a
release_stripe(sh);
}
- static void raid5_end_write_request(struct bio *bi, int error)
+ static void raid5_end_write_request(struct bio *bi)
{
struct stripe_head *sh = bi->bi_private;
struct r5conf *conf = sh->raid_conf;
int disks = sh->disks, i;
struct md_rdev *uninitialized_var(rdev);
- int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
sector_t first_bad;
int bad_sectors;
int replacement = 0;
break;
}
}
- pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
+ pr_debug("end_write_request %llu/%d, count %d, error: %d.\n",
(unsigned long long)sh->sector, i, atomic_read(&sh->count),
- uptodate);
+ bi->bi_error);
if (i == disks) {
BUG();
return;
}
if (replacement) {
- if (!uptodate)
+ if (bi->bi_error)
md_error(conf->mddev, rdev);
else if (is_badblock(rdev, sh->sector,
STRIPE_SECTORS,
&first_bad, &bad_sectors))
set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
} else {
- if (!uptodate) {
+ if (bi->bi_error) {
set_bit(STRIPE_DEGRADED, &sh->state);
set_bit(WriteErrorSeen, &rdev->flags);
set_bit(R5_WriteError, &sh->dev[i].flags);
}
rdev_dec_pending(rdev, conf->mddev);
- if (sh->batch_head && !uptodate && !replacement)
+ if (sh->batch_head && bi->bi_error && !replacement)
set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
set_bit(Blocked, &rdev->flags);
set_bit(Faulty, &rdev->flags);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ set_bit(MD_CHANGE_PENDING, &mddev->flags);
printk(KERN_ALERT
"md/raid:%s: Disk failure on %s, disabling device.\n"
"md/raid:%s: Operation continuing on %d devices.\n",
static void
handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
struct stripe_head_state *s, int disks,
- struct bio **return_bi)
+ struct bio_list *return_bi)
{
int i;
BUG_ON(sh->batch_head);
while (bi && bi->bi_iter.bi_sector <
sh->dev[i].sector + STRIPE_SECTORS) {
struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
- clear_bit(BIO_UPTODATE, &bi->bi_flags);
+
+ bi->bi_error = -EIO;
if (!raid5_dec_bi_active_stripes(bi)) {
md_write_end(conf->mddev);
- bi->bi_next = *return_bi;
- *return_bi = bi;
+ bio_list_add(return_bi, bi);
}
bi = nextbi;
}
while (bi && bi->bi_iter.bi_sector <
sh->dev[i].sector + STRIPE_SECTORS) {
struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
- clear_bit(BIO_UPTODATE, &bi->bi_flags);
+
+ bi->bi_error = -EIO;
if (!raid5_dec_bi_active_stripes(bi)) {
md_write_end(conf->mddev);
- bi->bi_next = *return_bi;
- *return_bi = bi;
+ bio_list_add(return_bi, bi);
}
bi = bi2;
}
sh->dev[i].sector + STRIPE_SECTORS) {
struct bio *nextbi =
r5_next_bio(bi, sh->dev[i].sector);
- clear_bit(BIO_UPTODATE, &bi->bi_flags);
+
+ bi->bi_error = -EIO;
- if (!raid5_dec_bi_active_stripes(bi)) {
- bi->bi_next = *return_bi;
- *return_bi = bi;
- }
+ if (!raid5_dec_bi_active_stripes(bi))
+ bio_list_add(return_bi, bi);
bi = nextbi;
}
}
* never LOCKED, so we don't need to test 'failed' directly.
*/
static void handle_stripe_clean_event(struct r5conf *conf,
- struct stripe_head *sh, int disks, struct bio **return_bi)
+ struct stripe_head *sh, int disks, struct bio_list *return_bi)
{
int i;
struct r5dev *dev;
wbi2 = r5_next_bio(wbi, dev->sector);
if (!raid5_dec_bi_active_stripes(wbi)) {
md_write_end(conf->mddev);
- wbi->bi_next = *return_bi;
- *return_bi = wbi;
+ bio_list_add(return_bi, wbi);
}
wbi = wbi2;
}
md_wakeup_thread(conf->mddev->thread);
}
- return_io(s.return_bi);
+ if (!bio_list_empty(&s.return_bi)) {
+ if (test_bit(MD_CHANGE_PENDING, &conf->mddev->flags)) {
+ spin_lock_irq(&conf->device_lock);
+ bio_list_merge(&conf->return_bi, &s.return_bi);
+ spin_unlock_irq(&conf->device_lock);
+ md_wakeup_thread(conf->mddev->thread);
+ } else
+ return_io(&s.return_bi);
+ }
clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
}
return 0;
}
- /* We want read requests to align with chunks where possible,
- * but write requests don't need to.
- */
- static int raid5_mergeable_bvec(struct mddev *mddev,
- struct bvec_merge_data *bvm,
- struct bio_vec *biovec)
- {
- struct r5conf *conf = mddev->private;
- sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
- int max;
- unsigned int chunk_sectors;
- unsigned int bio_sectors = bvm->bi_size >> 9;
-
- /*
- * always allow writes to be mergeable, read as well if array
- * is degraded as we'll go through stripe cache anyway.
- */
- if ((bvm->bi_rw & 1) == WRITE || mddev->degraded)
- return biovec->bv_len;
-
- chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors);
- max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
- if (max < 0) max = 0;
- if (max <= biovec->bv_len && bio_sectors == 0)
- return biovec->bv_len;
- else
- return max;
- }
-
static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
{
+ struct r5conf *conf = mddev->private;
sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev);
- unsigned int chunk_sectors = mddev->chunk_sectors;
+ unsigned int chunk_sectors;
unsigned int bio_sectors = bio_sectors(bio);
- if (mddev->new_chunk_sectors < mddev->chunk_sectors)
- chunk_sectors = mddev->new_chunk_sectors;
+ chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors);
return chunk_sectors >=
((sector & (chunk_sectors - 1)) + bio_sectors);
}
* first).
* If the read failed..
*/
- static void raid5_align_endio(struct bio *bi, int error)
+ static void raid5_align_endio(struct bio *bi)
{
struct bio* raid_bi = bi->bi_private;
struct mddev *mddev;
struct r5conf *conf;
- int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
struct md_rdev *rdev;
+ int error = bi->bi_error;
bio_put(bi);
rdev_dec_pending(rdev, conf->mddev);
- if (!error && uptodate) {
+ if (!error) {
trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev),
raid_bi, 0);
- bio_endio(raid_bi, 0);
+ bio_endio(raid_bi);
if (atomic_dec_and_test(&conf->active_aligned_reads))
wake_up(&conf->wait_for_quiescent);
return;
add_bio_to_retry(raid_bi, conf);
}
- static int bio_fits_rdev(struct bio *bi)
- {
- struct request_queue *q = bdev_get_queue(bi->bi_bdev);
-
- if (bio_sectors(bi) > queue_max_sectors(q))
- return 0;
- blk_recount_segments(q, bi);
- if (bi->bi_phys_segments > queue_max_segments(q))
- return 0;
-
- if (q->merge_bvec_fn)
- /* it's too hard to apply the merge_bvec_fn at this stage,
- * just just give up
- */
- return 0;
-
- return 1;
- }
-
- static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
+ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
{
struct r5conf *conf = mddev->private;
int dd_idx;
sector_t end_sector;
if (!in_chunk_boundary(mddev, raid_bio)) {
- pr_debug("chunk_aligned_read : non aligned\n");
+ pr_debug("%s: non aligned\n", __func__);
return 0;
}
/*
rcu_read_unlock();
raid_bio->bi_next = (void*)rdev;
align_bi->bi_bdev = rdev->bdev;
- __clear_bit(BIO_SEG_VALID, &align_bi->bi_flags);
+ bio_clear_flag(align_bi, BIO_SEG_VALID);
- if (!bio_fits_rdev(align_bi) ||
- is_badblock(rdev, align_bi->bi_iter.bi_sector,
+ if (is_badblock(rdev, align_bi->bi_iter.bi_sector,
bio_sectors(align_bi),
&first_bad, &bad_sectors)) {
- /* too big in some way, or has a known bad block */
bio_put(align_bi);
rdev_dec_pending(rdev, mddev);
return 0;
}
}
+ static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
+ {
+ struct bio *split;
+
+ do {
+ sector_t sector = raid_bio->bi_iter.bi_sector;
+ unsigned chunk_sects = mddev->chunk_sectors;
+ unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
+
+ if (sectors < bio_sectors(raid_bio)) {
+ split = bio_split(raid_bio, sectors, GFP_NOIO, fs_bio_set);
+ bio_chain(split, raid_bio);
+ } else
+ split = raid_bio;
+
+ if (!raid5_read_one_chunk(mddev, split)) {
+ if (split != raid_bio)
+ generic_make_request(raid_bio);
+ return split;
+ }
+ } while (split != raid_bio);
+
+ return NULL;
+ }
+
/* __get_priority_stripe - get the next stripe to process
*
* Full stripe writes are allowed to pass preread active stripes up until
remaining = raid5_dec_bi_active_stripes(bi);
if (remaining == 0) {
md_write_end(mddev);
- bio_endio(bi, 0);
+ bio_endio(bi);
}
}
* data on failed drives.
*/
if (rw == READ && mddev->degraded == 0 &&
- mddev->reshape_position == MaxSector &&
- chunk_aligned_read(mddev,bi))
- return;
+ mddev->reshape_position == MaxSector) {
+ bi = chunk_aligned_read(mddev, bi);
+ if (!bi)
+ return;
+ }
if (unlikely(bi->bi_rw & REQ_DISCARD)) {
make_discard_request(mddev, bi);
release_stripe_plug(mddev, sh);
} else {
/* cannot get stripe for read-ahead, just give-up */
- clear_bit(BIO_UPTODATE, &bi->bi_flags);
+ bi->bi_error = -EIO;
break;
}
}
trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
bi, 0);
- bio_endio(bi, 0);
+ bio_endio(bi);
}
}
sector_t stripe_addr;
int reshape_sectors;
struct list_head stripes;
+ sector_t retn;
if (sector_nr == 0) {
/* If restarting in the middle, skip the initial sectors */
conf->reshape_progress < raid5_size(mddev, 0, 0)) {
sector_nr = raid5_size(mddev, 0, 0)
- conf->reshape_progress;
+ } else if (mddev->reshape_backwards &&
+ conf->reshape_progress == MaxSector) {
+ /* shouldn't happen, but just in case, finish up.*/
+ sector_nr = MaxSector;
} else if (!mddev->reshape_backwards &&
conf->reshape_progress > 0)
sector_nr = conf->reshape_progress;
mddev->curr_resync_completed = sector_nr;
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
*skipped = 1;
- return sector_nr;
+ retn = sector_nr;
+ goto finish;
}
}
* If old and new chunk sizes differ, we need to process the
* largest of these
*/
- if (mddev->new_chunk_sectors > mddev->chunk_sectors)
- reshape_sectors = mddev->new_chunk_sectors;
- else
- reshape_sectors = mddev->chunk_sectors;
+
+ reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors);
/* We update the metadata at least every 10 seconds, or when
* the data about to be copied would over-write the source of
safepos = conf->reshape_safe;
sector_div(safepos, data_disks);
if (mddev->reshape_backwards) {
- writepos -= min_t(sector_t, reshape_sectors, writepos);
+ BUG_ON(writepos < reshape_sectors);
+ writepos -= reshape_sectors;
readpos += reshape_sectors;
safepos += reshape_sectors;
} else {
writepos += reshape_sectors;
+ /* readpos and safepos are worst-case calculations.
+ * A negative number is overly pessimistic, and causes
+ * obvious problems for unsigned storage. So clip to 0.
+ */
readpos -= min_t(sector_t, reshape_sectors, readpos);
safepos -= min_t(sector_t, reshape_sectors, safepos);
}
* then we need to write out the superblock.
*/
sector_nr += reshape_sectors;
- if ((sector_nr - mddev->curr_resync_completed) * 2
+ retn = reshape_sectors;
+finish:
+ if (mddev->curr_resync_completed > mddev->resync_max ||
+ (sector_nr - mddev->curr_resync_completed) * 2
>= mddev->resync_max - mddev->curr_resync_completed) {
/* Cannot proceed until we've updated the superblock... */
wait_event(conf->wait_for_overlap,
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
}
ret:
- return reshape_sectors;
+ return retn;
}
static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
if (remaining == 0) {
trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev),
raid_bio, 0);
- bio_endio(raid_bio, 0);
+ bio_endio(raid_bio);
}
if (atomic_dec_and_test(&conf->active_aligned_reads))
wake_up(&conf->wait_for_quiescent);
md_check_recovery(mddev);
+ if (!bio_list_empty(&conf->return_bi) &&
+ !test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+ struct bio_list tmp = BIO_EMPTY_LIST;
+ spin_lock_irq(&conf->device_lock);
+ if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+ bio_list_merge(&tmp, &conf->return_bi);
+ bio_list_init(&conf->return_bi);
+ }
+ spin_unlock_irq(&conf->device_lock);
+ return_io(&tmp);
+ }
+
blk_start_plug(&plug);
handled = 0;
spin_lock_irq(&conf->device_lock);
/* size is defined by the smallest of previous and new size */
raid_disks = min(conf->raid_disks, conf->previous_raid_disks);
- sectors &= ~((sector_t)mddev->chunk_sectors - 1);
- sectors &= ~((sector_t)mddev->new_chunk_sectors - 1);
+ sectors &= ~((sector_t)conf->chunk_sectors - 1);
+ sectors &= ~((sector_t)conf->prev_chunk_sectors - 1);
return sectors * (raid_disks - conf->max_degraded);
}
INIT_LIST_HEAD(&conf->hold_list);
INIT_LIST_HEAD(&conf->delayed_list);
INIT_LIST_HEAD(&conf->bitmap_list);
+ bio_list_init(&conf->return_bi);
init_llist_head(&conf->released_stripes);
atomic_set(&conf->active_stripes, 0);
atomic_set(&conf->preread_active_stripes, 0);
if (conf->reshape_progress != MaxSector) {
conf->prev_chunk_sectors = mddev->chunk_sectors;
conf->prev_algo = mddev->layout;
+ } else {
+ conf->prev_chunk_sectors = conf->chunk_sectors;
+ conf->prev_algo = conf->algorithm;
}
conf->min_nr_stripes = NR_STRIPES;
sector_t here_new, here_old;
int old_disks;
int max_degraded = (mddev->level == 6 ? 2 : 1);
+ int chunk_sectors;
+ int new_data_disks;
if (mddev->new_level != mddev->level) {
printk(KERN_ERR "md/raid:%s: unsupported reshape "
/* reshape_position must be on a new-stripe boundary, and one
* further up in new geometry must map after here in old
* geometry.
+ * If the chunk sizes are different, then as we perform reshape
+ * in units of the largest of the two, reshape_position needs to
+ * be a multiple of the largest chunk size times new data disks.
*/
here_new = mddev->reshape_position;
- if (sector_div(here_new, mddev->new_chunk_sectors *
- (mddev->raid_disks - max_degraded))) {
+ chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors);
+ new_data_disks = mddev->raid_disks - max_degraded;
+ if (sector_div(here_new, chunk_sectors * new_data_disks)) {
printk(KERN_ERR "md/raid:%s: reshape_position not "
"on a stripe boundary\n", mdname(mddev));
return -EINVAL;
}
- reshape_offset = here_new * mddev->new_chunk_sectors;
+ reshape_offset = here_new * chunk_sectors;
/* here_new is the stripe we will write to */
here_old = mddev->reshape_position;
- sector_div(here_old, mddev->chunk_sectors *
- (old_disks-max_degraded));
+ sector_div(here_old, chunk_sectors * (old_disks-max_degraded));
/* here_old is the first stripe that we might need to read
* from */
if (mddev->delta_disks == 0) {
- if ((here_new * mddev->new_chunk_sectors !=
- here_old * mddev->chunk_sectors)) {
- printk(KERN_ERR "md/raid:%s: reshape position is"
- " confused - aborting\n", mdname(mddev));
- return -EINVAL;
- }
/* We cannot be sure it is safe to start an in-place
* reshape. It is only safe if user-space is monitoring
* and taking constant backups.
return -EINVAL;
}
} else if (mddev->reshape_backwards
- ? (here_new * mddev->new_chunk_sectors + min_offset_diff <=
- here_old * mddev->chunk_sectors)
- : (here_new * mddev->new_chunk_sectors >=
- here_old * mddev->chunk_sectors + (-min_offset_diff))) {
+ ? (here_new * chunk_sectors + min_offset_diff <=
+ here_old * chunk_sectors)
+ : (here_new * chunk_sectors >=
+ here_old * chunk_sectors + (-min_offset_diff))) {
/* Reading from the same stripe as writing to - bad */
printk(KERN_ERR "md/raid:%s: reshape_position too early for "
"auto-recovery - aborting.\n",
int i;
seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
- mddev->chunk_sectors / 2, mddev->layout);
+ conf->chunk_sectors / 2, mddev->layout);
seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
for (i = 0; i < conf->raid_disks; i++)
seq_printf (seq, "%s",
* worth it.
*/
sector_t newsize;
- sectors &= ~((sector_t)mddev->chunk_sectors - 1);
+ struct r5conf *conf = mddev->private;
+
+ sectors &= ~((sector_t)conf->chunk_sectors - 1);
newsize = raid5_size(mddev, sectors, mddev->raid_disks);
if (mddev->external_size &&
mddev->array_sectors > newsize)
rdev->data_offset = rdev->new_data_offset;
smp_wmb();
conf->reshape_progress = MaxSector;
+ conf->mddev->reshape_position = MaxSector;
spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_for_overlap);
.quiesce = raid5_quiesce,
.takeover = raid6_takeover,
.congested = raid5_congested,
- .mergeable_bvec = raid5_mergeable_bvec,
};
static struct md_personality raid5_personality =
{
.quiesce = raid5_quiesce,
.takeover = raid5_takeover,
.congested = raid5_congested,
- .mergeable_bvec = raid5_mergeable_bvec,
};
static struct md_personality raid4_personality =
.quiesce = raid5_quiesce,
.takeover = raid4_takeover,
.congested = raid5_congested,
- .mergeable_bvec = raid5_mergeable_bvec,
};
static int __init raid5_init(void)