Merge tag 'for-6.9/block-20240310' of git://git.kernel.dk/linux
[linux-2.6-microblaze.git] / drivers / md / raid5.c
index 6a7a32f..d874abf 100644 (file)
@@ -36,6 +36,7 @@
  */
 
 #include <linux/blkdev.h>
+#include <linux/delay.h>
 #include <linux/kthread.h>
 #include <linux/raid/pq.h>
 #include <linux/async_tx.h>
@@ -760,6 +761,7 @@ enum stripe_result {
        STRIPE_RETRY,
        STRIPE_SCHEDULE_AND_RETRY,
        STRIPE_FAIL,
+       STRIPE_WAIT_RESHAPE,
 };
 
 struct stripe_request_ctx {
@@ -1210,10 +1212,8 @@ again:
                 */
                while (op_is_write(op) && rdev &&
                       test_bit(WriteErrorSeen, &rdev->flags)) {
-                       sector_t first_bad;
-                       int bad_sectors;
-                       int bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
-                                             &first_bad, &bad_sectors);
+                       int bad = rdev_has_badblock(rdev, sh->sector,
+                                                   RAID5_STRIPE_SECTORS(conf));
                        if (!bad)
                                break;
 
@@ -1295,10 +1295,7 @@ again:
                        if (rrdev)
                                set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
 
-                       if (conf->mddev->gendisk)
-                               trace_block_bio_remap(bi,
-                                               disk_devt(conf->mddev->gendisk),
-                                               sh->dev[i].sector);
+                       mddev_trace_remap(conf->mddev, bi, sh->dev[i].sector);
                        if (should_defer && op_is_write(op))
                                bio_list_add(&pending_bios, bi);
                        else
@@ -1342,10 +1339,7 @@ again:
                         */
                        if (op == REQ_OP_DISCARD)
                                rbi->bi_vcnt = 0;
-                       if (conf->mddev->gendisk)
-                               trace_block_bio_remap(rbi,
-                                               disk_devt(conf->mddev->gendisk),
-                                               sh->dev[i].sector);
+                       mddev_trace_remap(conf->mddev, rbi, sh->dev[i].sector);
                        if (should_defer && op_is_write(op))
                                bio_list_add(&pending_bios, rbi);
                        else
@@ -2412,7 +2406,7 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
        atomic_inc(&conf->active_stripes);
 
        raid5_release_stripe(sh);
-       conf->max_nr_stripes++;
+       WRITE_ONCE(conf->max_nr_stripes, conf->max_nr_stripes + 1);
        return 1;
 }
 
@@ -2422,12 +2416,12 @@ static int grow_stripes(struct r5conf *conf, int num)
        size_t namelen = sizeof(conf->cache_name[0]);
        int devs = max(conf->raid_disks, conf->previous_raid_disks);
 
-       if (conf->mddev->gendisk)
+       if (mddev_is_dm(conf->mddev))
                snprintf(conf->cache_name[0], namelen,
-                       "raid%d-%s", conf->level, mdname(conf->mddev));
+                       "raid%d-%p", conf->level, conf->mddev);
        else
                snprintf(conf->cache_name[0], namelen,
-                       "raid%d-%p", conf->level, conf->mddev);
+                       "raid%d-%s", conf->level, mdname(conf->mddev));
        snprintf(conf->cache_name[1], namelen, "%.27s-alt", conf->cache_name[0]);
 
        conf->active_name = 0;
@@ -2707,7 +2701,7 @@ static int drop_one_stripe(struct r5conf *conf)
        shrink_buffers(sh);
        free_stripe(conf->slab_cache, sh);
        atomic_dec(&conf->active_stripes);
-       conf->max_nr_stripes--;
+       WRITE_ONCE(conf->max_nr_stripes, conf->max_nr_stripes - 1);
        return 1;
 }
 
@@ -2855,8 +2849,6 @@ static void raid5_end_write_request(struct bio *bi)
        struct r5conf *conf = sh->raid_conf;
        int disks = sh->disks, i;
        struct md_rdev *rdev;
-       sector_t first_bad;
-       int bad_sectors;
        int replacement = 0;
 
        for (i = 0 ; i < disks; i++) {
@@ -2888,9 +2880,8 @@ static void raid5_end_write_request(struct bio *bi)
        if (replacement) {
                if (bi->bi_status)
                        md_error(conf->mddev, rdev);
-               else if (is_badblock(rdev, sh->sector,
-                                    RAID5_STRIPE_SECTORS(conf),
-                                    &first_bad, &bad_sectors))
+               else if (rdev_has_badblock(rdev, sh->sector,
+                                          RAID5_STRIPE_SECTORS(conf)))
                        set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
        } else {
                if (bi->bi_status) {
@@ -2900,9 +2891,8 @@ static void raid5_end_write_request(struct bio *bi)
                        if (!test_and_set_bit(WantReplacement, &rdev->flags))
                                set_bit(MD_RECOVERY_NEEDED,
                                        &rdev->mddev->recovery);
-               } else if (is_badblock(rdev, sh->sector,
-                                      RAID5_STRIPE_SECTORS(conf),
-                                      &first_bad, &bad_sectors)) {
+               } else if (rdev_has_badblock(rdev, sh->sector,
+                                            RAID5_STRIPE_SECTORS(conf))) {
                        set_bit(R5_MadeGood, &sh->dev[i].flags);
                        if (test_bit(R5_ReadError, &sh->dev[i].flags))
                                /* That was a successful write so make
@@ -4205,10 +4195,9 @@ static int handle_stripe_dirtying(struct r5conf *conf,
        set_bit(STRIPE_HANDLE, &sh->state);
        if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) {
                /* prefer read-modify-write, but need to get some data */
-               if (conf->mddev->queue)
-                       blk_add_trace_msg(conf->mddev->queue,
-                                         "raid5 rmw %llu %d",
-                                         (unsigned long long)sh->sector, rmw);
+               mddev_add_trace_msg(conf->mddev, "raid5 rmw %llu %d",
+                               sh->sector, rmw);
+
                for (i = disks; i--; ) {
                        struct r5dev *dev = &sh->dev[i];
                        if (test_bit(R5_InJournal, &dev->flags) &&
@@ -4285,10 +4274,11 @@ static int handle_stripe_dirtying(struct r5conf *conf,
                                        set_bit(STRIPE_DELAYED, &sh->state);
                        }
                }
-               if (rcw && conf->mddev->queue)
-                       blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d",
-                                         (unsigned long long)sh->sector,
-                                         rcw, qread, test_bit(STRIPE_DELAYED, &sh->state));
+               if (rcw && !mddev_is_dm(conf->mddev))
+                       blk_add_trace_msg(conf->mddev->gendisk->queue,
+                               "raid5 rcw %llu %d %d %d",
+                               (unsigned long long)sh->sector, rcw, qread,
+                               test_bit(STRIPE_DELAYED, &sh->state));
        }
 
        if (rcw > disks && rmw > disks &&
@@ -4674,8 +4664,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
        /* Now to look around and see what can be done */
        for (i=disks; i--; ) {
                struct md_rdev *rdev;
-               sector_t first_bad;
-               int bad_sectors;
                int is_bad = 0;
 
                dev = &sh->dev[i];
@@ -4719,8 +4707,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
                rdev = conf->disks[i].replacement;
                if (rdev && !test_bit(Faulty, &rdev->flags) &&
                    rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) &&
-                   !is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
-                                &first_bad, &bad_sectors))
+                   !rdev_has_badblock(rdev, sh->sector,
+                                      RAID5_STRIPE_SECTORS(conf)))
                        set_bit(R5_ReadRepl, &dev->flags);
                else {
                        if (rdev && !test_bit(Faulty, &rdev->flags))
@@ -4733,8 +4721,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
                if (rdev && test_bit(Faulty, &rdev->flags))
                        rdev = NULL;
                if (rdev) {
-                       is_bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
-                                            &first_bad, &bad_sectors);
+                       is_bad = rdev_has_badblock(rdev, sh->sector,
+                                                  RAID5_STRIPE_SECTORS(conf));
                        if (s->blocked_rdev == NULL
                            && (test_bit(Blocked, &rdev->flags)
                                || is_bad < 0)) {
@@ -5463,8 +5451,8 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
        struct r5conf *conf = mddev->private;
        struct bio *align_bio;
        struct md_rdev *rdev;
-       sector_t sector, end_sector, first_bad;
-       int bad_sectors, dd_idx;
+       sector_t sector, end_sector;
+       int dd_idx;
        bool did_inc;
 
        if (!in_chunk_boundary(mddev, raid_bio)) {
@@ -5493,8 +5481,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
 
        atomic_inc(&rdev->nr_pending);
 
-       if (is_badblock(rdev, sector, bio_sectors(raid_bio), &first_bad,
-                       &bad_sectors)) {
+       if (rdev_has_badblock(rdev, sector, bio_sectors(raid_bio))) {
                rdev_dec_pending(rdev, mddev);
                return 0;
        }
@@ -5530,9 +5517,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
                spin_unlock_irq(&conf->device_lock);
        }
 
-       if (mddev->gendisk)
-               trace_block_bio_remap(align_bio, disk_devt(mddev->gendisk),
-                                     raid_bio->bi_iter.bi_sector);
+       mddev_trace_remap(mddev, align_bio, raid_bio->bi_iter.bi_sector);
        submit_bio_noacct(align_bio);
        return 1;
 }
@@ -5701,8 +5686,8 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
        }
        release_inactive_stripe_list(conf, cb->temp_inactive_list,
                                     NR_STRIPE_HASH_LOCKS);
-       if (mddev->queue)
-               trace_block_unplug(mddev->queue, cnt, !from_schedule);
+       if (!mddev_is_dm(mddev))
+               trace_block_unplug(mddev->gendisk->queue, cnt, !from_schedule);
        kfree(cb);
 }
 
@@ -5946,7 +5931,8 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
                        if (ahead_of_reshape(mddev, logical_sector,
                                             conf->reshape_safe)) {
                                spin_unlock_irq(&conf->device_lock);
-                               return STRIPE_SCHEDULE_AND_RETRY;
+                               ret = STRIPE_SCHEDULE_AND_RETRY;
+                               goto out;
                        }
                }
                spin_unlock_irq(&conf->device_lock);
@@ -6025,6 +6011,12 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
 
 out_release:
        raid5_release_stripe(sh);
+out:
+       if (ret == STRIPE_SCHEDULE_AND_RETRY && reshape_interrupted(mddev)) {
+               bi->bi_status = BLK_STS_RESOURCE;
+               ret = STRIPE_WAIT_RESHAPE;
+               pr_err_ratelimited("dm-raid456: io across reshape position while reshape can't make progress");
+       }
        return ret;
 }
 
@@ -6146,7 +6138,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
        while (1) {
                res = make_stripe_request(mddev, conf, &ctx, logical_sector,
                                          bi);
-               if (res == STRIPE_FAIL)
+               if (res == STRIPE_FAIL || res == STRIPE_WAIT_RESHAPE)
                        break;
 
                if (res == STRIPE_RETRY)
@@ -6184,6 +6176,11 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
 
        if (rw == WRITE)
                md_write_end(mddev);
+       if (res == STRIPE_WAIT_RESHAPE) {
+               md_free_cloned_bio(bi);
+               return false;
+       }
+
        bio_endio(bi);
        return true;
 }
@@ -6773,7 +6770,18 @@ static void raid5d(struct md_thread *thread)
                        spin_unlock_irq(&conf->device_lock);
                        md_check_recovery(mddev);
                        spin_lock_irq(&conf->device_lock);
+
+                       /*
+                        * Waiting on MD_SB_CHANGE_PENDING below may deadlock,
+                        * since md_check_recovery() is needed to clear
+                        * the flag when using mdmon.
+                        */
+                       continue;
                }
+
+               wait_event_lock_irq(mddev->sb_wait,
+                       !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
+                       conf->device_lock);
        }
        pr_debug("%d stripes handled\n", handled);
 
@@ -6820,7 +6828,7 @@ raid5_set_cache_size(struct mddev *mddev, int size)
        if (size <= 16 || size > 32768)
                return -EINVAL;
 
-       conf->min_nr_stripes = size;
+       WRITE_ONCE(conf->min_nr_stripes, size);
        mutex_lock(&conf->cache_size_mutex);
        while (size < conf->max_nr_stripes &&
               drop_one_stripe(conf))
@@ -6832,7 +6840,7 @@ raid5_set_cache_size(struct mddev *mddev, int size)
        mutex_lock(&conf->cache_size_mutex);
        while (size > conf->max_nr_stripes)
                if (!grow_one_stripe(conf, GFP_KERNEL)) {
-                       conf->min_nr_stripes = conf->max_nr_stripes;
+                       WRITE_ONCE(conf->min_nr_stripes, conf->max_nr_stripes);
                        result = -ENOMEM;
                        break;
                }
@@ -6967,10 +6975,8 @@ raid5_store_stripe_size(struct mddev  *mddev, const char *page, size_t len)
        pr_debug("md/raid: change stripe_size from %lu to %lu\n",
                        conf->stripe_size, new);
 
-       if (mddev->sync_thread ||
-               test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
-               mddev->reshape_position != MaxSector ||
-               mddev->sysfs_active) {
+       if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
+           mddev->reshape_position != MaxSector || mddev->sysfs_active) {
                err = -EBUSY;
                goto out_unlock;
        }
@@ -7084,7 +7090,7 @@ raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
        if (!conf)
                err = -ENODEV;
        else if (new != conf->skip_copy) {
-               struct request_queue *q = mddev->queue;
+               struct request_queue *q = mddev->gendisk->queue;
 
                conf->skip_copy = new;
                if (new)
@@ -7390,11 +7396,13 @@ static unsigned long raid5_cache_count(struct shrinker *shrink,
                                       struct shrink_control *sc)
 {
        struct r5conf *conf = shrink->private_data;
+       int max_stripes = READ_ONCE(conf->max_nr_stripes);
+       int min_stripes = READ_ONCE(conf->min_nr_stripes);
 
-       if (conf->max_nr_stripes < conf->min_nr_stripes)
+       if (max_stripes < min_stripes)
                /* unlikely, but not impossible */
                return 0;
-       return conf->max_nr_stripes - conf->min_nr_stripes;
+       return max_stripes - min_stripes;
 }
 
 static struct r5conf *setup_conf(struct mddev *mddev)
@@ -7684,10 +7692,65 @@ static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded
        return 0;
 }
 
-static void raid5_set_io_opt(struct r5conf *conf)
+static int raid5_set_limits(struct mddev *mddev)
 {
-       blk_queue_io_opt(conf->mddev->queue, (conf->chunk_sectors << 9) *
-                        (conf->raid_disks - conf->max_degraded));
+       struct r5conf *conf = mddev->private;
+       struct queue_limits lim;
+       int data_disks, stripe;
+       struct md_rdev *rdev;
+
+       /*
+        * The read-ahead size must cover two whole stripes, which is
+        * 2 * data_disks * chunksize (data_disks = raid disks - max_degraded).
+        */
+       data_disks = conf->previous_raid_disks - conf->max_degraded;
+
+       /*
+        * We can only discard a whole stripe. It doesn't make sense to
+        * discard data disk but write parity disk
+        */
+       stripe = roundup_pow_of_two(data_disks * (mddev->chunk_sectors << 9));
+
+       blk_set_stacking_limits(&lim);
+       lim.io_min = mddev->chunk_sectors << 9;
+       lim.io_opt = lim.io_min * (conf->raid_disks - conf->max_degraded);
+       lim.raid_partial_stripes_expensive = 1;
+       lim.discard_granularity = stripe;
+       lim.max_write_zeroes_sectors = 0;
+       mddev_stack_rdev_limits(mddev, &lim);
+       rdev_for_each(rdev, mddev)
+               queue_limits_stack_bdev(&lim, rdev->bdev, rdev->new_data_offset,
+                               mddev->gendisk->disk_name);
+
+       /*
+        * Zeroing is required for discard, otherwise data could be lost.
+        *
+        * Consider a scenario: discard a stripe (the stripe could be
+        * inconsistent if discard_zeroes_data is 0); write one disk of the
+        * stripe (the stripe could be inconsistent again depending on which
+        * disks are used to calculate parity); the disk is broken; The stripe
+        * data of this disk is lost.
+        *
+        * We only allow DISCARD if the sysadmin has confirmed that only safe
+        * devices are in use by setting a module parameter.  A better idea
+        * might be to turn DISCARD into WRITE_ZEROES requests, as that is
+        * required to be safe.
+        */
+       if (!devices_handle_discard_safely ||
+           lim.max_discard_sectors < (stripe >> 9) ||
+           lim.discard_granularity < stripe)
+               lim.max_hw_discard_sectors = 0;
+
+       /*
+        * Requests require having a bitmap for each stripe.
+        * Limit the max sectors based on this.
+        */
+       lim.max_hw_sectors = RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf);
+
+       /* No restrictions on the number of segments in the request */
+       lim.max_segments = USHRT_MAX;
+
+       return queue_limits_set(mddev->gendisk->queue, &lim);
 }
 
 static int raid5_run(struct mddev *mddev)
@@ -7700,6 +7763,7 @@ static int raid5_run(struct mddev *mddev)
        int i;
        long long min_offset_diff = 0;
        int first = 1;
+       int ret = -EIO;
 
        if (mddev->recovery_cp != MaxSector)
                pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
@@ -7948,66 +8012,10 @@ static int raid5_run(struct mddev *mddev)
                        mdname(mddev));
        md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
 
-       if (mddev->queue) {
-               int chunk_size;
-               /* read-ahead size must cover two whole stripes, which
-                * is 2 * (datadisks) * chunksize where 'n' is the
-                * number of raid devices
-                */
-               int data_disks = conf->previous_raid_disks - conf->max_degraded;
-               int stripe = data_disks *
-                       ((mddev->chunk_sectors << 9) / PAGE_SIZE);
-
-               chunk_size = mddev->chunk_sectors << 9;
-               blk_queue_io_min(mddev->queue, chunk_size);
-               raid5_set_io_opt(conf);
-               mddev->queue->limits.raid_partial_stripes_expensive = 1;
-               /*
-                * We can only discard a whole stripe. It doesn't make sense to
-                * discard data disk but write parity disk
-                */
-               stripe = stripe * PAGE_SIZE;
-               stripe = roundup_pow_of_two(stripe);
-               mddev->queue->limits.discard_granularity = stripe;
-
-               blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
-
-               rdev_for_each(rdev, mddev) {
-                       disk_stack_limits(mddev->gendisk, rdev->bdev,
-                                         rdev->data_offset << 9);
-                       disk_stack_limits(mddev->gendisk, rdev->bdev,
-                                         rdev->new_data_offset << 9);
-               }
-
-               /*
-                * zeroing is required, otherwise data
-                * could be lost. Consider a scenario: discard a stripe
-                * (the stripe could be inconsistent if
-                * discard_zeroes_data is 0); write one disk of the
-                * stripe (the stripe could be inconsistent again
-                * depending on which disks are used to calculate
-                * parity); the disk is broken; The stripe data of this
-                * disk is lost.
-                *
-                * We only allow DISCARD if the sysadmin has confirmed that
-                * only safe devices are in use by setting a module parameter.
-                * A better idea might be to turn DISCARD into WRITE_ZEROES
-                * requests, as that is required to be safe.
-                */
-               if (!devices_handle_discard_safely ||
-                   mddev->queue->limits.max_discard_sectors < (stripe >> 9) ||
-                   mddev->queue->limits.discard_granularity < stripe)
-                       blk_queue_max_discard_sectors(mddev->queue, 0);
-
-               /*
-                * Requests require having a bitmap for each stripe.
-                * Limit the max sectors based on this.
-                */
-               blk_queue_max_hw_sectors(mddev->queue,
-                       RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf));
-
-               /* No restrictions on the number of segments in the request */
-               blk_queue_max_segments(mddev->queue, USHRT_MAX);
+       if (!mddev_is_dm(mddev)) {
+               ret = raid5_set_limits(mddev);
+               if (ret)
+                       goto abort;
        }
 
        if (log_init(conf, journal_dev, raid5_has_ppl(conf)))
@@ -8020,7 +8028,7 @@ abort:
        free_conf(conf);
        mddev->private = NULL;
        pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
-       return -EIO;
+       return ret;
 }
 
 static void raid5_free(struct mddev *mddev, void *priv)
@@ -8531,8 +8539,8 @@ static void end_reshape(struct r5conf *conf)
                spin_unlock_irq(&conf->device_lock);
                wake_up(&conf->wait_for_overlap);
 
-               if (conf->mddev->queue)
-                       raid5_set_io_opt(conf);
+               mddev_update_io_opt(conf->mddev,
+                       conf->raid_disks - conf->max_degraded);
        }
 }
 
@@ -8909,6 +8917,18 @@ static int raid5_start(struct mddev *mddev)
        return r5l_start(conf->log);
 }
 
+/*
+ * This is only used for dm-raid456. The caller has already frozen sync_thread,
+ * so if a reshape is still in progress, IO that is waiting for the reshape can
+ * never complete; wake up the waiters so that such IO can be handled.
+ */
+static void raid5_prepare_suspend(struct mddev *mddev)
+{
+       struct r5conf *conf = mddev->private;
+
+       wake_up(&conf->wait_for_overlap);
+}
+
 static struct md_personality raid6_personality =
 {
        .name           = "raid6",
@@ -8932,6 +8952,7 @@ static struct md_personality raid6_personality =
        .quiesce        = raid5_quiesce,
        .takeover       = raid6_takeover,
        .change_consistency_policy = raid5_change_consistency_policy,
+       .prepare_suspend = raid5_prepare_suspend,
 };
 static struct md_personality raid5_personality =
 {
@@ -8956,6 +8977,7 @@ static struct md_personality raid5_personality =
        .quiesce        = raid5_quiesce,
        .takeover       = raid5_takeover,
        .change_consistency_policy = raid5_change_consistency_policy,
+       .prepare_suspend = raid5_prepare_suspend,
 };
 
 static struct md_personality raid4_personality =
@@ -8981,6 +9003,7 @@ static struct md_personality raid4_personality =
        .quiesce        = raid5_quiesce,
        .takeover       = raid4_takeover,
        .change_consistency_policy = raid5_change_consistency_policy,
+       .prepare_suspend = raid5_prepare_suspend,
 };
 
 static int __init raid5_init(void)