Merge tag 'for-5.11/drivers-2020-12-14' of git://git.kernel.dk/linux-block
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 3153183..c5d88ef 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -91,7 +91,7 @@ static inline struct r10bio *get_resync_r10bio(struct bio *bio)
 static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
 {
        struct r10conf *conf = data;
-       int size = offsetof(struct r10bio, devs[conf->geo.raid_disks]);
+       int size = offsetof(struct r10bio, devs[conf->copies]);
 
        /* allocate a r10bio with room for raid_disks entries in the
         * bios array */
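
Note on the hunk above: the pool allocation now sizes the trailing devs[] array by conf->copies rather than conf->geo.raid_disks. A minimal, self-contained sketch of the offsetof()-based flexible-array sizing idiom it uses (standalone userspace C; the structure and field names are invented for the example, not the kernel's):

    #include <stdlib.h>
    #include <stddef.h>

    struct copy_info {
            void *bio;              /* stands in for struct bio * */
            unsigned long addr;
    };

    struct r10bio_like {
            int sectors;
            int copies;
            struct copy_info devs[];        /* flexible array member */
    };

    static struct r10bio_like *alloc_r10bio_like(int copies)
    {
            /* One allocation covers the header plus 'copies' trailing slots,
             * the same offsetof(struct ..., devs[copies]) pattern as above.
             * A variable array index inside offsetof() relies on the
             * GCC/Clang __builtin_offsetof extension, as the kernel does. */
            struct r10bio_like *r =
                    calloc(1, offsetof(struct r10bio_like, devs[copies]));

            if (r)
                    r->copies = copies;
            return r;
    }

    int main(void)
    {
            struct r10bio_like *r = alloc_r10bio_like(2);

            if (r) {
                    r->devs[0].addr = 1024;         /* both slots are usable */
                    r->devs[1].addr = 2048;
                    free(r);
            }
            return 0;
    }
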
@@ -238,7 +238,7 @@ static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
 {
        int i;
 
-       for (i = 0; i < conf->geo.raid_disks; i++) {
+       for (i = 0; i < conf->copies; i++) {
                struct bio **bio = & r10_bio->devs[i].bio;
                if (!BIO_SPECIAL(*bio))
                        bio_put(*bio);
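
put_all_bios() skips bio_put() when BIO_SPECIAL() is true; in raid10.c that macro treats small integer "pointer" values as sentinels rather than real bios. A hedged illustration of the sentinel-pointer idiom only (all names below are invented for the example; see the real IO_BLOCKED/BIO_SPECIAL definitions in raid10.c):

    #include <stdio.h>

    /* Illustrative sentinels: tiny integer values that can never be valid
     * heap pointers, so they can be stored in a "pointer" slot as state. */
    #define SLOT_BLOCKED    ((void *)1)
    #define SLOT_MADE_GOOD  ((void *)2)
    #define SLOT_SPECIAL(p) ((unsigned long)(p) <= 2)   /* NULL counts too */

    static void put_slot(void *p)
    {
            if (SLOT_SPECIAL(p)) {
                    printf("sentinel %lu, nothing to free\n", (unsigned long)p);
                    return;
            }
            /* a real implementation would drop a reference here */
            printf("dropping reference on %p\n", p);
    }

    int main(void)
    {
            int real_object = 0;

            put_slot(NULL);
            put_slot(SLOT_BLOCKED);
            put_slot(&real_object);
            return 0;
    }
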
@@ -327,7 +327,7 @@ static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
        int slot;
        int repl = 0;
 
-       for (slot = 0; slot < conf->geo.raid_disks; slot++) {
+       for (slot = 0; slot < conf->copies; slot++) {
                if (r10_bio->devs[slot].bio == bio)
                        break;
                if (r10_bio->devs[slot].repl_bio == bio) {
@@ -336,6 +336,7 @@ static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
                }
        }
 
+       BUG_ON(slot == conf->copies);
        update_head_pos(slot, r10_bio);
 
        if (slotp)
@@ -1200,8 +1201,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
        read_bio->bi_private = r10_bio;
 
        if (mddev->gendisk)
-               trace_block_bio_remap(read_bio->bi_disk->queue,
-                                     read_bio, disk_devt(mddev->gendisk),
+               trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk),
                                      r10_bio->sector);
        submit_bio_noacct(read_bio);
        return;
@@ -1250,8 +1250,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
        mbio->bi_private = r10_bio;
 
        if (conf->mddev->gendisk)
-               trace_block_bio_remap(mbio->bi_disk->queue,
-                                     mbio, disk_devt(conf->mddev->gendisk),
+               trace_block_bio_remap(mbio, disk_devt(conf->mddev->gendisk),
                                      r10_bio->sector);
        /* flush_pending_writes() needs access to the rdev so...*/
        mbio->bi_disk = (void *)rdev;
@@ -1275,75 +1274,12 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
        }
 }
 
-static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
-{
-       int i;
-       struct r10conf *conf = mddev->private;
-       struct md_rdev *blocked_rdev;
-
-retry_wait:
-       blocked_rdev = NULL;
-       rcu_read_lock();
-       for (i = 0; i < conf->copies; i++) {
-               struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
-               struct md_rdev *rrdev = rcu_dereference(
-                       conf->mirrors[i].replacement);
-               if (rdev == rrdev)
-                       rrdev = NULL;
-               if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
-                       atomic_inc(&rdev->nr_pending);
-                       blocked_rdev = rdev;
-                       break;
-               }
-               if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
-                       atomic_inc(&rrdev->nr_pending);
-                       blocked_rdev = rrdev;
-                       break;
-               }
-
-               if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
-                       sector_t first_bad;
-                       sector_t dev_sector = r10_bio->devs[i].addr;
-                       int bad_sectors;
-                       int is_bad;
-
-                       /* Discard request doesn't care the write result
-                        * so it doesn't need to wait blocked disk here.
-                        */
-                       if (!r10_bio->sectors)
-                               continue;
-
-                       is_bad = is_badblock(rdev, dev_sector, r10_bio->sectors,
-                                            &first_bad, &bad_sectors);
-                       if (is_bad < 0) {
-                               /* Mustn't write here until the bad block
-                                * is acknowledged
-                                */
-                               atomic_inc(&rdev->nr_pending);
-                               set_bit(BlockedBadBlocks, &rdev->flags);
-                               blocked_rdev = rdev;
-                               break;
-                       }
-               }
-       }
-       rcu_read_unlock();
-
-       if (unlikely(blocked_rdev)) {
-               /* Have to wait for this device to get unblocked, then retry */
-               allow_barrier(conf);
-               raid10_log(conf->mddev, "%s wait rdev %d blocked",
-                               __func__, blocked_rdev->raid_disk);
-               md_wait_for_blocked_rdev(blocked_rdev, mddev);
-               wait_barrier(conf);
-               goto retry_wait;
-       }
-}
-
 static void raid10_write_request(struct mddev *mddev, struct bio *bio,
                                 struct r10bio *r10_bio)
 {
        struct r10conf *conf = mddev->private;
        int i;
+       struct md_rdev *blocked_rdev;
        sector_t sectors;
        int max_sectors;
 
@@ -1401,9 +1337,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 
        r10_bio->read_slot = -1; /* make sure repl_bio gets freed */
        raid10_find_phys(conf, r10_bio);
-
-       wait_blocked_dev(mddev, r10_bio);
-
+retry_write:
+       blocked_rdev = NULL;
        rcu_read_lock();
        max_sectors = r10_bio->sectors;
 
@@ -1414,6 +1349,16 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
                        conf->mirrors[d].replacement);
                if (rdev == rrdev)
                        rrdev = NULL;
+               if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
+                       atomic_inc(&rdev->nr_pending);
+                       blocked_rdev = rdev;
+                       break;
+               }
+               if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
+                       atomic_inc(&rrdev->nr_pending);
+                       blocked_rdev = rrdev;
+                       break;
+               }
                if (rdev && (test_bit(Faulty, &rdev->flags)))
                        rdev = NULL;
                if (rrdev && (test_bit(Faulty, &rrdev->flags)))
@@ -1434,6 +1379,15 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 
                        is_bad = is_badblock(rdev, dev_sector, max_sectors,
                                             &first_bad, &bad_sectors);
+                       if (is_bad < 0) {
+                               /* Mustn't write here until the bad block
+                                * is acknowledged
+                                */
+                               atomic_inc(&rdev->nr_pending);
+                               set_bit(BlockedBadBlocks, &rdev->flags);
+                               blocked_rdev = rdev;
+                               break;
+                       }
                        if (is_bad && first_bad <= dev_sector) {
                                /* Cannot write here at all */
                                bad_sectors -= (dev_sector - first_bad);
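
The restored check above depends on is_badblock()'s three-way return: 0 when the range is clean, 1 when it overlaps only acknowledged bad blocks (with first_bad/bad_sectors filled in), and negative when an unacknowledged bad block overlaps, which is the case that must block the write. A tiny standalone sketch of consuming such a three-way result (the helper below is a hard-coded stand-in, not the md implementation):

    #include <stdio.h>

    /* Stand-in for is_badblock(): returns 0 (clean), 1 (acknowledged bad
     * blocks overlap, *first_bad/*bad_sectors describe them) or -1
     * (unacknowledged bad blocks overlap). Hard-coded for the example. */
    static int check_range(unsigned long long start, int len,
                           unsigned long long *first_bad, int *bad_sectors)
    {
            /* pretend a not-yet-acknowledged bad range starts 4 sectors in */
            *first_bad = start + 4;
            *bad_sectors = len < 8 ? len : 8;
            return -1;
    }

    int main(void)
    {
            unsigned long long first_bad;
            int bad_sectors;
            int is_bad = check_range(1000, 64, &first_bad, &bad_sectors);

            if (is_bad < 0)
                    printf("unacknowledged bad block: wait before writing\n");
            else if (is_bad && first_bad <= 1000)
                    printf("cannot write here at all\n");
            else if (is_bad)
                    printf("can write up to sector %llu\n", first_bad);
            else
                    printf("range is clean\n");
            return 0;
    }
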
@@ -1469,6 +1423,35 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
        }
        rcu_read_unlock();
 
+       if (unlikely(blocked_rdev)) {
+               /* Have to wait for this device to get unblocked, then retry */
+               int j;
+               int d;
+
+               for (j = 0; j < i; j++) {
+                       if (r10_bio->devs[j].bio) {
+                               d = r10_bio->devs[j].devnum;
+                               rdev_dec_pending(conf->mirrors[d].rdev, mddev);
+                       }
+                       if (r10_bio->devs[j].repl_bio) {
+                               struct md_rdev *rdev;
+                               d = r10_bio->devs[j].devnum;
+                               rdev = conf->mirrors[d].replacement;
+                               if (!rdev) {
+                                       /* Race with remove_disk */
+                                       smp_mb();
+                                       rdev = conf->mirrors[d].rdev;
+                               }
+                               rdev_dec_pending(rdev, mddev);
+                       }
+               }
+               allow_barrier(conf);
+               raid10_log(conf->mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
+               md_wait_for_blocked_rdev(blocked_rdev, mddev);
+               wait_barrier(conf);
+               goto retry_write;
+       }
+
        if (max_sectors < r10_bio->sectors)
                r10_bio->sectors = max_sectors;
 
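
For readers skimming the restored error path: the shape is to take a reference (nr_pending) on each device as it is claimed and, if one turns out to be Blocked, drop the references taken on the earlier devices, leave the barrier, sleep until the blocked device is usable again, re-enter the barrier, and redo the whole device scan. A compressed, self-contained sketch of that claim/rollback/retry shape (plain C with stand-in types; not the kernel's locking or refcounting primitives):

    #include <stdbool.h>
    #include <stdio.h>

    struct dev {
            int refs;
            bool blocked;
    };

    static bool claim_all(struct dev *devs, int n, int *blocked_idx)
    {
            for (int i = 0; i < n; i++) {
                    devs[i].refs++;                 /* analogous to nr_pending */
                    if (devs[i].blocked) {
                            /* drop the refs taken on earlier devices; the
                             * blocked device keeps its ref until the wait
                             * completes, as in the hunk above */
                            for (int j = 0; j < i; j++)
                                    devs[j].refs--;
                            *blocked_idx = i;
                            return false;
                    }
            }
            return true;
    }

    int main(void)
    {
            struct dev devs[3] = { {0, false}, {0, true}, {0, false} };
            int blocked;

            while (!claim_all(devs, 3, &blocked)) {
                    printf("dev %d blocked: wait, then retry the scan\n", blocked);
                    devs[blocked].blocked = false;  /* stands in for md_wait_for_blocked_rdev() */
                    devs[blocked].refs--;           /* the wait drops the pending ref */
            }
            printf("all devices claimed, proceed with the write\n");
            return 0;
    }
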
@@ -1509,7 +1492,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
        r10_bio->sector = bio->bi_iter.bi_sector;
        r10_bio->state = 0;
        r10_bio->read_slot = -1;
-       memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->geo.raid_disks);
+       memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->copies);
 
        if (bio_data_dir(bio) == READ)
                raid10_read_request(mddev, bio, r10_bio);
@@ -1517,296 +1500,6 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
                raid10_write_request(mddev, bio, r10_bio);
 }
 
-static struct bio *raid10_split_bio(struct r10conf *conf,
-                       struct bio *bio, sector_t sectors, bool want_first)
-{
-       struct bio *split;
-
-       split = bio_split(bio, sectors, GFP_NOIO, &conf->bio_split);
-       bio_chain(split, bio);
-       allow_barrier(conf);
-       if (want_first) {
-               submit_bio_noacct(bio);
-               bio = split;
-       } else
-               submit_bio_noacct(split);
-       wait_barrier(conf);
-
-       return bio;
-}
-
-static void raid_end_discard_bio(struct r10bio *r10bio)
-{
-       struct r10conf *conf = r10bio->mddev->private;
-       struct r10bio *first_r10bio;
-
-       while (atomic_dec_and_test(&r10bio->remaining)) {
-
-               allow_barrier(conf);
-
-               if (!test_bit(R10BIO_Discard, &r10bio->state)) {
-                       first_r10bio = (struct r10bio *)r10bio->master_bio;
-                       free_r10bio(r10bio);
-                       r10bio = first_r10bio;
-               } else {
-                       md_write_end(r10bio->mddev);
-                       bio_endio(r10bio->master_bio);
-                       free_r10bio(r10bio);
-                       break;
-               }
-       }
-}
-
-static void raid10_end_discard_request(struct bio *bio)
-{
-       struct r10bio *r10_bio = bio->bi_private;
-       struct r10conf *conf = r10_bio->mddev->private;
-       struct md_rdev *rdev = NULL;
-       int dev;
-       int slot, repl;
-
-       /*
-        * We don't care the return value of discard bio
-        */
-       if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
-               set_bit(R10BIO_Uptodate, &r10_bio->state);
-
-       dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
-       if (repl)
-               rdev = conf->mirrors[dev].replacement;
-       if (!rdev) {
-               /* raid10_remove_disk uses smp_mb to make sure rdev is set to
-                * replacement before setting replacement to NULL. It can read
-                * rdev first without barrier protect even replacment is NULL
-                */
-               smp_rmb();
-               rdev = conf->mirrors[dev].rdev;
-       }
-
-       raid_end_discard_bio(r10_bio);
-       rdev_dec_pending(rdev, conf->mddev);
-}
-
-/* There are some limitations to handle discard bio
- * 1st, the discard size is bigger than stripe_size*2.
- * 2st, if the discard bio spans reshape progress, we use the old way to
- * handle discard bio
- */
-static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
-{
-       struct r10conf *conf = mddev->private;
-       struct geom *geo = &conf->geo;
-       struct r10bio *r10_bio, *first_r10bio;
-       int far_copies = geo->far_copies;
-       bool first_copy = true;
-
-       int disk;
-       sector_t chunk;
-       unsigned int stripe_size;
-       sector_t split_size;
-
-       sector_t bio_start, bio_end;
-       sector_t first_stripe_index, last_stripe_index;
-       sector_t start_disk_offset;
-       unsigned int start_disk_index;
-       sector_t end_disk_offset;
-       unsigned int end_disk_index;
-       unsigned int remainder;
-
-       if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
-               return -EAGAIN;
-
-       wait_barrier(conf);
-
-       /* Check reshape again to avoid reshape happens after checking
-        * MD_RECOVERY_RESHAPE and before wait_barrier
-        */
-       if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
-               goto out;
-
-       stripe_size = geo->raid_disks << geo->chunk_shift;
-       bio_start = bio->bi_iter.bi_sector;
-       bio_end = bio_end_sector(bio);
-
-       /* Maybe one discard bio is smaller than strip size or across one stripe
-        * and discard region is larger than one stripe size. For far offset layout,
-        * if the discard region is not aligned with stripe size, there is hole
-        * when we submit discard bio to member disk. For simplicity, we only
-        * handle discard bio which discard region is bigger than stripe_size*2
-        */
-       if (bio_sectors(bio) < stripe_size*2)
-               goto out;
-
-       /* For far and far offset layout, if bio is not aligned with stripe size,
-        * it splits the part that is not aligned with strip size.
-        */
-       div_u64_rem(bio_start, stripe_size, &remainder);
-       if ((far_copies > 1) && remainder) {
-               split_size = stripe_size - remainder;
-               bio = raid10_split_bio(conf, bio, split_size, false);
-       }
-       div_u64_rem(bio_end, stripe_size, &remainder);
-       if ((far_copies > 1) && remainder) {
-               split_size = bio_sectors(bio) - remainder;
-               bio = raid10_split_bio(conf, bio, split_size, true);
-       }
-
-       bio_start = bio->bi_iter.bi_sector;
-       bio_end = bio_end_sector(bio);
-
-       /* raid10 uses chunk as the unit to store data. It's similar like raid0.
-        * One stripe contains the chunks from all member disk (one chunk from
-        * one disk at the same HBA address). For layout detail, see 'man md 4'
-        */
-       chunk = bio_start >> geo->chunk_shift;
-       chunk *= geo->near_copies;
-       first_stripe_index = chunk;
-       start_disk_index = sector_div(first_stripe_index, geo->raid_disks);
-       if (geo->far_offset)
-               first_stripe_index *= geo->far_copies;
-       start_disk_offset = (bio_start & geo->chunk_mask) +
-                               (first_stripe_index << geo->chunk_shift);
-
-       chunk = bio_end >> geo->chunk_shift;
-       chunk *= geo->near_copies;
-       last_stripe_index = chunk;
-       end_disk_index = sector_div(last_stripe_index, geo->raid_disks);
-       if (geo->far_offset)
-               last_stripe_index *= geo->far_copies;
-       end_disk_offset = (bio_end & geo->chunk_mask) +
-                               (last_stripe_index << geo->chunk_shift);
-
-retry_discard:
-       r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
-       r10_bio->mddev = mddev;
-       r10_bio->state = 0;
-       r10_bio->sectors = 0;
-       memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks);
-       wait_blocked_dev(mddev, r10_bio);
-
-       /* For far layout it needs more than one r10bio to cover all regions.
-        * Inspired by raid10_sync_request, we can use the first r10bio->master_bio
-        * to record the discard bio. Other r10bio->master_bio record the first
-        * r10bio. The first r10bio only release after all other r10bios finish.
-        * The discard bio returns only first r10bio finishes
-        */
-       if (first_copy) {
-               r10_bio->master_bio = bio;
-               set_bit(R10BIO_Discard, &r10_bio->state);
-               first_copy = false;
-               first_r10bio = r10_bio;
-       } else
-               r10_bio->master_bio = (struct bio *)first_r10bio;
-
-       rcu_read_lock();
-       for (disk = 0; disk < geo->raid_disks; disk++) {
-               struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
-               struct md_rdev *rrdev = rcu_dereference(
-                       conf->mirrors[disk].replacement);
-
-               r10_bio->devs[disk].bio = NULL;
-               r10_bio->devs[disk].repl_bio = NULL;
-
-               if (rdev && (test_bit(Faulty, &rdev->flags)))
-                       rdev = NULL;
-               if (rrdev && (test_bit(Faulty, &rrdev->flags)))
-                       rrdev = NULL;
-               if (!rdev && !rrdev)
-                       continue;
-
-               if (rdev) {
-                       r10_bio->devs[disk].bio = bio;
-                       atomic_inc(&rdev->nr_pending);
-               }
-               if (rrdev) {
-                       r10_bio->devs[disk].repl_bio = bio;
-                       atomic_inc(&rrdev->nr_pending);
-               }
-       }
-       rcu_read_unlock();
-
-       atomic_set(&r10_bio->remaining, 1);
-       for (disk = 0; disk < geo->raid_disks; disk++) {
-               sector_t dev_start, dev_end;
-               struct bio *mbio, *rbio = NULL;
-               struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
-               struct md_rdev *rrdev = rcu_dereference(
-                       conf->mirrors[disk].replacement);
-
-               /*
-                * Now start to calculate the start and end address for each disk.
-                * The space between dev_start and dev_end is the discard region.
-                *
-                * For dev_start, it needs to consider three conditions:
-                * 1st, the disk is before start_disk, you can imagine the disk in
-                * the next stripe. So the dev_start is the start address of next
-                * stripe.
-                * 2st, the disk is after start_disk, it means the disk is at the
-                * same stripe of first disk
-                * 3st, the first disk itself, we can use start_disk_offset directly
-                */
-               if (disk < start_disk_index)
-                       dev_start = (first_stripe_index + 1) * mddev->chunk_sectors;
-               else if (disk > start_disk_index)
-                       dev_start = first_stripe_index * mddev->chunk_sectors;
-               else
-                       dev_start = start_disk_offset;
-
-               if (disk < end_disk_index)
-                       dev_end = (last_stripe_index + 1) * mddev->chunk_sectors;
-               else if (disk > end_disk_index)
-                       dev_end = last_stripe_index * mddev->chunk_sectors;
-               else
-                       dev_end = end_disk_offset;
-
-               /* It only handles discard bio which size is >= stripe size, so
-                * dev_end > dev_start all the time
-                */
-               if (r10_bio->devs[disk].bio) {
-                       mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
-                       mbio->bi_end_io = raid10_end_discard_request;
-                       mbio->bi_private = r10_bio;
-                       r10_bio->devs[disk].bio = mbio;
-                       r10_bio->devs[disk].devnum = disk;
-                       atomic_inc(&r10_bio->remaining);
-                       md_submit_discard_bio(mddev, rdev, mbio,
-                                       dev_start + choose_data_offset(r10_bio, rdev),
-                                       dev_end - dev_start);
-                       bio_endio(mbio);
-               }
-               if (r10_bio->devs[disk].repl_bio) {
-                       rbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
-                       rbio->bi_end_io = raid10_end_discard_request;
-                       rbio->bi_private = r10_bio;
-                       r10_bio->devs[disk].repl_bio = rbio;
-                       r10_bio->devs[disk].devnum = disk;
-                       atomic_inc(&r10_bio->remaining);
-                       md_submit_discard_bio(mddev, rrdev, rbio,
-                                       dev_start + choose_data_offset(r10_bio, rrdev),
-                                       dev_end - dev_start);
-                       bio_endio(rbio);
-               }
-       }
-
-       if (!geo->far_offset && --far_copies) {
-               first_stripe_index += geo->stride >> geo->chunk_shift;
-               start_disk_offset += geo->stride;
-               last_stripe_index += geo->stride >> geo->chunk_shift;
-               end_disk_offset += geo->stride;
-               atomic_inc(&first_r10bio->remaining);
-               raid_end_discard_bio(r10_bio);
-               wait_barrier(conf);
-               goto retry_discard;
-       }
-
-       raid_end_discard_bio(r10_bio);
-
-       return 0;
-out:
-       allow_barrier(conf);
-       return -EAGAIN;
-}
-
 static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
 {
        struct r10conf *conf = mddev->private;
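
The large removal above takes out raid10_handle_discard(), whose core was the chunk-to-stripe arithmetic: shift the array sector down by chunk_shift, scale by near_copies, then split the result into a stripe index and a starting disk. A standalone illustration of that mapping for one hypothetical near-layout geometry (all numbers are made up for the example; sector_div() is replaced by plain / and %):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            /* hypothetical geometry: 4 disks, near_copies = 2, 512 KiB chunks */
            const unsigned raid_disks = 4;
            const unsigned near_copies = 2;
            const unsigned chunk_shift = 10;            /* 1024 sectors per chunk */
            const uint64_t chunk_mask = (1ULL << chunk_shift) - 1;

            uint64_t sector = 123456789;                /* array sector of the request */

            /* mirrors the removed code: chunk index scaled by near copies, then
             * sector_div() splits it into (stripe index, starting disk index) */
            uint64_t chunk = (sector >> chunk_shift) * near_copies;
            unsigned disk_index   = chunk % raid_disks; /* sector_div() remainder */
            uint64_t stripe_index = chunk / raid_disks; /* sector_div() quotient  */
            uint64_t disk_offset  = (sector & chunk_mask) +
                                    (stripe_index << chunk_shift);

            printf("sector %llu -> stripe %llu, disk %u, offset %llu on that disk\n",
                   (unsigned long long)sector, (unsigned long long)stripe_index,
                   disk_index, (unsigned long long)disk_offset);
            return 0;
    }
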
@@ -1821,10 +1514,6 @@ static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
        if (!md_write_start(mddev, bio))
                return false;
 
-       if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
-               if (!raid10_handle_discard(mddev, bio))
-                       return true;
-
        /*
         * If this request crosses a chunk boundary, we need to split
         * it.
@@ -4064,7 +3753,7 @@ static int raid10_run(struct mddev *mddev)
 
        if (mddev->queue) {
                blk_queue_max_discard_sectors(mddev->queue,
-                                             UINT_MAX);
+                                             mddev->chunk_sectors);
                blk_queue_max_write_same_sectors(mddev->queue, 0);
                blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
                blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
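
The final hunk drops the advertised discard limit back from UINT_MAX to mddev->chunk_sectors, so the block layer again splits large discards into chunk-sized pieces before raid10 sees them. A back-of-the-envelope sketch of what that cap means for a big discard (userspace arithmetic only; the 512 KiB chunk size is an assumption for the example):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t discard_sectors = 2097152;     /* 1 GiB request in 512-byte sectors */
            uint64_t chunk_sectors = 1024;          /* assumed 512 KiB chunk */

            /* With max_discard_sectors == chunk_sectors, the block layer splits
             * the request into ceil(discard / chunk) per-chunk discard bios. */
            uint64_t nr_bios = (discard_sectors + chunk_sectors - 1) / chunk_sectors;

            printf("%llu sectors discarded as %llu bios of <= %llu sectors\n",
                   (unsigned long long)discard_sectors,
                   (unsigned long long)nr_bios,
                   (unsigned long long)chunk_sectors);
            return 0;
    }
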