diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index f7810cc..7d3e572 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -64,36 +64,30 @@ struct priority_group {
 
 /* Multipath context */
 struct multipath {
-       struct list_head list;
-       struct dm_target *ti;
-
-       const char *hw_handler_name;
-       char *hw_handler_params;
+       unsigned long flags;            /* Multipath state flags */
 
        spinlock_t lock;
-
-       unsigned nr_priority_groups;
-       struct list_head priority_groups;
-
-       wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */
+       enum dm_queue_mode queue_mode;
 
        struct pgpath *current_pgpath;
        struct priority_group *current_pg;
        struct priority_group *next_pg; /* Switch to this PG if set */
 
-       unsigned long flags;            /* Multipath state flags */
+       atomic_t nr_valid_paths;        /* Total number of usable paths */
+       unsigned nr_priority_groups;
+       struct list_head priority_groups;
 
+       const char *hw_handler_name;
+       char *hw_handler_params;
+       wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */
        unsigned pg_init_retries;       /* Number of times to retry pg_init */
        unsigned pg_init_delay_msecs;   /* Number of msecs before pg_init retry */
-
-       atomic_t nr_valid_paths;        /* Total number of usable paths */
        atomic_t pg_init_in_progress;   /* Only one pg_init allowed at once */
        atomic_t pg_init_count;         /* Number of times pg_init called */
 
-       enum dm_queue_mode queue_mode;
-
        struct mutex work_mutex;
        struct work_struct trigger_event;
+       struct dm_target *ti;
 
        struct work_struct process_queued_bios;
        struct bio_list queued_bios;
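This reordering appears to pull the members the per-I/O fast path touches (flags, lock, queue_mode, and the current/next path pointers) to the front of struct multipath, ahead of configuration-time members like the handler name and pg_init tunables. A minimal userspace sketch, with hypothetical names, of how offsetof() can confirm that the hot members land on the same cache line:

	#include <stddef.h>
	#include <stdio.h>

	/* Hypothetical layout, not the kernel struct: hot members first. */
	struct example_mpath {
		unsigned long flags;	/* tested on every I/O */
		int lock;		/* stand-in for spinlock_t */
		int queue_mode;		/* stand-in for enum dm_queue_mode */
		void *current_pgpath;
		void *current_pg;
		void *next_pg;
		/* colder configuration members would follow */
		const char *hw_handler_name;
	};

	int main(void)
	{
		size_t a = offsetof(struct example_mpath, flags);
		size_t b = offsetof(struct example_mpath, next_pg);

		/* members on one 64-byte line cost one cache miss, not two */
		printf("flags@%zu next_pg@%zu same-line=%d\n",
		       a, b, a / 64 == b / 64);
		return 0;
	}
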
@@ -135,10 +129,10 @@ static struct pgpath *alloc_pgpath(void)
 {
        struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL);
 
-       if (pgpath) {
-               pgpath->is_active = true;
-               INIT_DELAYED_WORK(&pgpath->activate_path, activate_path_work);
-       }
+       if (!pgpath)
+               return NULL;
+
+       pgpath->is_active = true;
 
        return pgpath;
 }
@@ -193,13 +187,8 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
        if (m) {
                INIT_LIST_HEAD(&m->priority_groups);
                spin_lock_init(&m->lock);
-               set_bit(MPATHF_QUEUE_IO, &m->flags);
                atomic_set(&m->nr_valid_paths, 0);
-               atomic_set(&m->pg_init_in_progress, 0);
-               atomic_set(&m->pg_init_count, 0);
-               m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT;
                INIT_WORK(&m->trigger_event, trigger_event);
-               init_waitqueue_head(&m->pg_init_wait);
                mutex_init(&m->work_mutex);
 
                m->queue_mode = DM_TYPE_NONE;
@@ -221,13 +210,26 @@ static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m)
                        m->queue_mode = DM_TYPE_MQ_REQUEST_BASED;
                else
                        m->queue_mode = DM_TYPE_REQUEST_BASED;
-       } else if (m->queue_mode == DM_TYPE_BIO_BASED) {
+
+       } else if (m->queue_mode == DM_TYPE_BIO_BASED ||
+                  m->queue_mode == DM_TYPE_NVME_BIO_BASED) {
                INIT_WORK(&m->process_queued_bios, process_queued_bios);
-               /*
-                * bio-based doesn't support any direct scsi_dh management;
-                * it just discovers if a scsi_dh is attached.
-                */
-               set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags);
+
+               if (m->queue_mode == DM_TYPE_BIO_BASED) {
+                       /*
+                        * bio-based doesn't support any direct scsi_dh management;
+                        * it just discovers if a scsi_dh is attached.
+                        */
+                       set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags);
+               }
+       }
+
+       if (m->queue_mode != DM_TYPE_NVME_BIO_BASED) {
+               set_bit(MPATHF_QUEUE_IO, &m->flags);
+               atomic_set(&m->pg_init_in_progress, 0);
+               atomic_set(&m->pg_init_count, 0);
+               m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT;
+               init_waitqueue_head(&m->pg_init_wait);
        }
 
        dm_table_set_type(ti->table, m->queue_mode);
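The hunk above moves all pg_init related setup under a queue_mode check, since NVMe bio-based devices have no SCSI device handler to initialize. The MPATHF_* state itself lives in a single unsigned long manipulated with atomic bitops, so the fast path can test flags without taking m->lock. A minimal sketch of that idiom using C11 atomics as stand-ins for the kernel's set_bit()/test_bit()/clear_bit(); the bit numbers here are hypothetical, not the kernel's MPATHF_* values:

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	enum { QUEUE_IO = 0, PG_INIT_REQUIRED = 1 };

	static void set_bit_demo(int nr, atomic_ulong *flags)
	{
		atomic_fetch_or(flags, 1UL << nr);
	}

	static void clear_bit_demo(int nr, atomic_ulong *flags)
	{
		atomic_fetch_and(flags, ~(1UL << nr));
	}

	static bool test_bit_demo(int nr, atomic_ulong *flags)
	{
		return atomic_load(flags) & (1UL << nr);
	}

	int main(void)
	{
		atomic_ulong flags = 0;

		set_bit_demo(QUEUE_IO, &flags);
		printf("QUEUE_IO=%d PG_INIT_REQUIRED=%d\n",
		       test_bit_demo(QUEUE_IO, &flags),
		       test_bit_demo(PG_INIT_REQUIRED, &flags));
		clear_bit_demo(QUEUE_IO, &flags);
		return 0;
	}
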
@@ -246,6 +248,7 @@ static void free_multipath(struct multipath *m)
 
        kfree(m->hw_handler_name);
        kfree(m->hw_handler_params);
+       mutex_destroy(&m->work_mutex);
        kfree(m);
 }
 
@@ -264,29 +267,23 @@ static struct dm_mpath_io *get_mpio_from_bio(struct bio *bio)
        return dm_per_bio_data(bio, multipath_per_bio_data_size());
 }
 
-static struct dm_bio_details *get_bio_details_from_bio(struct bio *bio)
+static struct dm_bio_details *get_bio_details_from_mpio(struct dm_mpath_io *mpio)
 {
        /* dm_bio_details is immediately after the dm_mpath_io in bio's per-bio-data */
-       struct dm_mpath_io *mpio = get_mpio_from_bio(bio);
        void *bio_details = mpio + 1;
-
        return bio_details;
 }
 
-static void multipath_init_per_bio_data(struct bio *bio, struct dm_mpath_io **mpio_p,
-                                       struct dm_bio_details **bio_details_p)
+static void multipath_init_per_bio_data(struct bio *bio, struct dm_mpath_io **mpio_p)
 {
        struct dm_mpath_io *mpio = get_mpio_from_bio(bio);
-       struct dm_bio_details *bio_details = get_bio_details_from_bio(bio);
+       struct dm_bio_details *bio_details = get_bio_details_from_mpio(mpio);
 
-       memset(mpio, 0, sizeof(*mpio));
-       memset(bio_details, 0, sizeof(*bio_details));
-       dm_bio_record(bio_details, bio);
+       mpio->nr_bytes = bio->bi_iter.bi_size;
+       mpio->pgpath = NULL;
+       *mpio_p = mpio;
 
-       if (mpio_p)
-               *mpio_p = mpio;
-       if (bio_details_p)
-               *bio_details_p = bio_details;
+       dm_bio_record(bio_details, bio);
 }
 
 /*-----------------------------------------------
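The reworked helper takes the mpio directly and relies on the layout contract stated in the comment: the dm_bio_details sits immediately after the dm_mpath_io inside the bio's per-bio-data, so "mpio + 1" addresses it. A self-contained sketch of that pointer-arithmetic idiom; structure names here are stand-ins, not the kernel types:

	#include <stdio.h>
	#include <stdlib.h>

	struct mpio_demo { void *pgpath; size_t nr_bytes; };
	struct details_demo { unsigned long sector; unsigned int size; };

	int main(void)
	{
		/* one allocation holds both, back to back, like per-bio-data */
		struct mpio_demo *mpio = calloc(1, sizeof(struct mpio_demo) +
						   sizeof(struct details_demo));

		/* "mpio + 1" points just past mpio; the details live there */
		struct details_demo *details = (void *)(mpio + 1);

		details->size = 4096;
		printf("details at byte offset %td\n",
		       (char *)details - (char *)mpio);
		free(mpio);
		return 0;
	}
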
@@ -340,6 +337,9 @@ static void __switch_pg(struct multipath *m, struct priority_group *pg)
 {
        m->current_pg = pg;
 
+       if (m->queue_mode == DM_TYPE_NVME_BIO_BASED)
+               return;
+
        /* Must we initialise the PG first, and queue I/O till it's ready? */
        if (m->hw_handler_name) {
                set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
@@ -385,7 +385,8 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes)
        unsigned bypassed = 1;
 
        if (!atomic_read(&m->nr_valid_paths)) {
-               clear_bit(MPATHF_QUEUE_IO, &m->flags);
+               if (m->queue_mode != DM_TYPE_NVME_BIO_BASED)
+                       clear_bit(MPATHF_QUEUE_IO, &m->flags);
                goto failed;
        }
 
@@ -516,12 +517,10 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
                return DM_MAPIO_KILL;
        } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) ||
                   test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) {
-               if (pg_init_all_paths(m))
-                       return DM_MAPIO_DELAY_REQUEUE;
-               return DM_MAPIO_REQUEUE;
+               pg_init_all_paths(m);
+               return DM_MAPIO_DELAY_REQUEUE;
        }
 
-       memset(mpio, 0, sizeof(*mpio));
        mpio->pgpath = pgpath;
        mpio->nr_bytes = nr_bytes;
 
@@ -530,12 +529,23 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
        clone = blk_get_request(q, rq->cmd_flags | REQ_NOMERGE, GFP_ATOMIC);
        if (IS_ERR(clone)) {
                /* EBUSY, ENODEV or EWOULDBLOCK: requeue */
-               bool queue_dying = blk_queue_dying(q);
-               if (queue_dying) {
+               if (blk_queue_dying(q)) {
                        atomic_inc(&m->pg_init_in_progress);
                        activate_or_offline_path(pgpath);
+                       return DM_MAPIO_DELAY_REQUEUE;
                }
-               return DM_MAPIO_DELAY_REQUEUE;
+
+               /*
+                * blk-mq's SCHED_RESTART can cover this requeue, so we
+                * needn't deal with it by DELAY_REQUEUE. More importantly,
+                * we have to return DM_MAPIO_REQUEUE so that blk-mq can
+                * get the queue busy feedback (via BLK_STS_RESOURCE),
+                * otherwise I/O merging can suffer.
+                */
+               if (q->mq_ops)
+                       return DM_MAPIO_REQUEUE;
+               else
+                       return DM_MAPIO_DELAY_REQUEUE;
        }
        clone->bio = clone->biotail = NULL;
        clone->rq_disk = bdev->bd_disk;
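The comment above carries the key reasoning: under blk-mq, returning DM_MAPIO_REQUEUE propagates BLK_STS_RESOURCE back to the block layer, whose SCHED_RESTART mechanism re-runs the queue once an in-flight request completes, while the request stays visible for merging. A simplified sketch of the caller-side distinction, with stand-in names rather than the actual dm-rq code:

	#include <stdio.h>

	enum map_result { MAP_REQUEUE, MAP_DELAY_REQUEUE };
	enum blk_status { STS_OK, STS_RESOURCE };

	static enum blk_status queue_rq(enum map_result r)
	{
		if (r == MAP_REQUEUE) {
			/* busy feedback: blk-mq keeps the request queued and
			 * SCHED_RESTART re-dispatches it when another request
			 * completes, so new bios can still merge with it */
			return STS_RESOURCE;
		}
		/* delayed requeue: re-inserted when a timer fires; no busy
		 * feedback, which is acceptable for the legacy path */
		return STS_OK;
	}

	int main(void)
	{
		printf("requeue -> busy feedback: %d\n",
		       queue_rq(MAP_REQUEUE) == STS_RESOURCE);
		return 0;
	}
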
@@ -557,9 +567,9 @@ static void multipath_release_clone(struct request *clone)
 /*
  * Map cloned bios (bio-based multipath)
  */
-static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_mpath_io *mpio)
+
+static struct pgpath *__map_bio(struct multipath *m, struct bio *bio)
 {
-       size_t nr_bytes = bio->bi_iter.bi_size;
        struct pgpath *pgpath;
        unsigned long flags;
        bool queue_io;
@@ -568,7 +578,7 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m
        pgpath = READ_ONCE(m->current_pgpath);
        queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags);
        if (!pgpath || !queue_io)
-               pgpath = choose_pgpath(m, nr_bytes);
+               pgpath = choose_pgpath(m, bio->bi_iter.bi_size);
 
        if ((pgpath && queue_io) ||
            (!pgpath && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) {
@@ -576,14 +586,62 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m
                spin_lock_irqsave(&m->lock, flags);
                bio_list_add(&m->queued_bios, bio);
                spin_unlock_irqrestore(&m->lock, flags);
+
                /* PG_INIT_REQUIRED cannot be set without QUEUE_IO */
                if (queue_io || test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
                        pg_init_all_paths(m);
                else if (!queue_io)
                        queue_work(kmultipathd, &m->process_queued_bios);
-               return DM_MAPIO_SUBMITTED;
+
+               return ERR_PTR(-EAGAIN);
+       }
+
+       return pgpath;
+}
+
+static struct pgpath *__map_bio_nvme(struct multipath *m, struct bio *bio)
+{
+       struct pgpath *pgpath;
+       unsigned long flags;
+
+       /* Do we need to select a new pgpath? */
+       /*
+        * FIXME: currently only switching path if no path (due to failure, etc)
+        * - which negates the point of using a path selector
+        */
+       pgpath = READ_ONCE(m->current_pgpath);
+       if (!pgpath)
+               pgpath = choose_pgpath(m, bio->bi_iter.bi_size);
+
+       if (!pgpath) {
+               if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
+                       /* Queue for the daemon to resubmit */
+                       spin_lock_irqsave(&m->lock, flags);
+                       bio_list_add(&m->queued_bios, bio);
+                       spin_unlock_irqrestore(&m->lock, flags);
+                       queue_work(kmultipathd, &m->process_queued_bios);
+
+                       return ERR_PTR(-EAGAIN);
+               }
+               return NULL;
        }
 
+       return pgpath;
+}
+
+static int __multipath_map_bio(struct multipath *m, struct bio *bio,
+                              struct dm_mpath_io *mpio)
+{
+       struct pgpath *pgpath;
+
+       if (m->queue_mode == DM_TYPE_NVME_BIO_BASED)
+               pgpath = __map_bio_nvme(m, bio);
+       else
+               pgpath = __map_bio(m, bio);
+
+       if (IS_ERR(pgpath))
+               return DM_MAPIO_SUBMITTED;
+
        if (!pgpath) {
                if (must_push_back_bio(m))
                        return DM_MAPIO_REQUEUE;
@@ -592,7 +650,6 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m
        }
 
        mpio->pgpath = pgpath;
-       mpio->nr_bytes = nr_bytes;
 
        bio->bi_status = 0;
        bio_set_dev(bio, pgpath->path.dev->bdev);
@@ -601,7 +658,7 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m
        if (pgpath->pg->ps.type->start_io)
                pgpath->pg->ps.type->start_io(&pgpath->pg->ps,
                                              &pgpath->path,
-                                             nr_bytes);
+                                             mpio->nr_bytes);
        return DM_MAPIO_REMAPPED;
 }
 
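__map_bio() now encodes three outcomes in a single pointer: a usable pgpath, NULL when no path is available, or ERR_PTR(-EAGAIN) when the bio was queued internally and the caller should report DM_MAPIO_SUBMITTED. A runnable userspace sketch of the kernel's ERR_PTR()/IS_ERR() encoding, which reserves the top 4095 pointer values for error codes:

	#include <errno.h>
	#include <stdio.h>

	#define MAX_ERRNO 4095

	static void *ERR_PTR(long error) { return (void *)error; }
	static long PTR_ERR(const void *ptr) { return (long)ptr; }
	static int IS_ERR(const void *ptr)
	{
		return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
	}

	/* mimics the three-way result of __map_bio() above */
	static void *pick_path(int have_path, int queued_io)
	{
		static char path0[] = "path0";

		if (queued_io)
			return ERR_PTR(-EAGAIN);	/* bio already queued */
		return have_path ? (void *)path0 : NULL;
	}

	int main(void)
	{
		void *p = pick_path(0, 1);

		if (IS_ERR(p))
			printf("submitted: queued internally (err=%ld)\n",
			       PTR_ERR(p));
		else if (!p)
			printf("no path: fail or push back\n");
		return 0;
	}
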
@@ -610,8 +667,7 @@ static int multipath_map_bio(struct dm_target *ti, struct bio *bio)
        struct multipath *m = ti->private;
        struct dm_mpath_io *mpio = NULL;
 
-       multipath_init_per_bio_data(bio, &mpio, NULL);
-
+       multipath_init_per_bio_data(bio, &mpio);
        return __multipath_map_bio(m, bio, mpio);
 }
 
@@ -619,7 +675,8 @@ static void process_queued_io_list(struct multipath *m)
 {
        if (m->queue_mode == DM_TYPE_MQ_REQUEST_BASED)
                dm_mq_kick_requeue_list(dm_table_get_md(m->ti->table));
-       else if (m->queue_mode == DM_TYPE_BIO_BASED)
+       else if (m->queue_mode == DM_TYPE_BIO_BASED ||
+                m->queue_mode == DM_TYPE_NVME_BIO_BASED)
                queue_work(kmultipathd, &m->process_queued_bios);
 }
 
@@ -649,7 +706,9 @@ static void process_queued_bios(struct work_struct *work)
 
        blk_start_plug(&plug);
        while ((bio = bio_list_pop(&bios))) {
-               r = __multipath_map_bio(m, bio, get_mpio_from_bio(bio));
+               struct dm_mpath_io *mpio = get_mpio_from_bio(bio);
+               dm_bio_restore(get_bio_details_from_mpio(mpio), bio);
+               r = __multipath_map_bio(m, bio, mpio);
                switch (r) {
                case DM_MAPIO_KILL:
                        bio->bi_status = BLK_STS_IOERR;
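With dm_bio_restore() moved here, a requeued bio is reset to its originally recorded state before being remapped, since the end_io path further down no longer restores it. A small sketch of the record/remap/restore pattern; the structures are stand-ins for struct bio and dm_bio_details:

	#include <stdio.h>

	struct demo_bio { int dev; unsigned long sector; };
	struct demo_bio_details { int dev; unsigned long sector; };

	static void bio_record(struct demo_bio_details *d, struct demo_bio *b)
	{
		d->dev = b->dev;
		d->sector = b->sector;
	}

	static void bio_restore(struct demo_bio_details *d, struct demo_bio *b)
	{
		b->dev = d->dev;
		b->sector = d->sector;
	}

	int main(void)
	{
		struct demo_bio bio = { .dev = 1, .sector = 2048 };
		struct demo_bio_details saved;

		bio_record(&saved, &bio);	/* before the first remap */
		bio.dev = 3;			/* remapped to a path that fails */
		bio_restore(&saved, &bio);	/* requeue sees the original bio */
		printf("dev=%d sector=%lu\n", bio.dev, bio.sector);
		return 0;
	}
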
@@ -752,34 +811,11 @@ static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg,
        return 0;
 }
 
-static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps,
-                              struct dm_target *ti)
+static int setup_scsi_dh(struct block_device *bdev, struct multipath *m, char **error)
 {
-       int r;
-       struct pgpath *p;
-       struct multipath *m = ti->private;
-       struct request_queue *q = NULL;
+       struct request_queue *q = bdev_get_queue(bdev);
        const char *attached_handler_name;
-
-       /* we need at least a path arg */
-       if (as->argc < 1) {
-               ti->error = "no device given";
-               return ERR_PTR(-EINVAL);
-       }
-
-       p = alloc_pgpath();
-       if (!p)
-               return ERR_PTR(-ENOMEM);
-
-       r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
-                         &p->path.dev);
-       if (r) {
-               ti->error = "error getting device";
-               goto bad;
-       }
-
-       if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags) || m->hw_handler_name)
-               q = bdev_get_queue(p->path.dev->bdev);
+       int r;
 
        if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags)) {
 retain:
@@ -811,26 +847,59 @@ retain:
                        char b[BDEVNAME_SIZE];
 
                        printk(KERN_INFO "dm-mpath: retaining handler on device %s\n",
-                               bdevname(p->path.dev->bdev, b));
+                              bdevname(bdev, b));
                        goto retain;
                }
                if (r < 0) {
-                       ti->error = "error attaching hardware handler";
-                       dm_put_device(ti, p->path.dev);
-                       goto bad;
+                       *error = "error attaching hardware handler";
+                       return r;
                }
 
                if (m->hw_handler_params) {
                        r = scsi_dh_set_params(q, m->hw_handler_params);
                        if (r < 0) {
-                               ti->error = "unable to set hardware "
-                                                       "handler parameters";
-                               dm_put_device(ti, p->path.dev);
-                               goto bad;
+                               *error = "unable to set hardware handler parameters";
+                               return r;
                        }
                }
        }
 
+       return 0;
+}
+
+static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps,
+                                struct dm_target *ti)
+{
+       int r;
+       struct pgpath *p;
+       struct multipath *m = ti->private;
+
+       /* we need at least a path arg */
+       if (as->argc < 1) {
+               ti->error = "no device given";
+               return ERR_PTR(-EINVAL);
+       }
+
+       p = alloc_pgpath();
+       if (!p)
+               return ERR_PTR(-ENOMEM);
+
+       r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
+                         &p->path.dev);
+       if (r) {
+               ti->error = "error getting device";
+               goto bad;
+       }
+
+       if (m->queue_mode != DM_TYPE_NVME_BIO_BASED) {
+               INIT_DELAYED_WORK(&p->activate_path, activate_path_work);
+               r = setup_scsi_dh(p->path.dev->bdev, m, &ti->error);
+               if (r) {
+                       dm_put_device(ti, p->path.dev);
+                       goto bad;
+               }
+       }
+
        r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error);
        if (r) {
                dm_put_device(ti, p->path.dev);
@@ -838,7 +907,6 @@ retain:
        }
 
        return p;
-
  bad:
        free_pgpath(p);
        return ERR_PTR(r);
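The factored-out setup_scsi_dh() reports failures through a char ** so parse_path() can place the message in ti->error while still returning a plain error code. A trivial sketch of that convention, with hypothetical names:

	#include <stdio.h>

	static int attach_handler_demo(int available, char **error)
	{
		if (!available) {
			*error = "error attaching hardware handler";
			return -19;	/* -ENODEV, value shown illustratively */
		}
		return 0;
	}

	int main(void)
	{
		char *err = NULL;

		if (attach_handler_demo(0, &err) < 0)
			fprintf(stderr, "ti->error would be: %s\n", err);
		return 0;
	}
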
@@ -933,7 +1001,8 @@ static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m)
        if (!hw_argc)
                return 0;
 
-       if (m->queue_mode == DM_TYPE_BIO_BASED) {
+       if (m->queue_mode == DM_TYPE_BIO_BASED ||
+           m->queue_mode == DM_TYPE_NVME_BIO_BASED) {
                dm_consume_args(as, hw_argc);
                DMERR("bio-based multipath doesn't allow hardware handler args");
                return 0;
@@ -1022,6 +1091,8 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m)
 
                        if (!strcasecmp(queue_mode_name, "bio"))
                                m->queue_mode = DM_TYPE_BIO_BASED;
+                       else if (!strcasecmp(queue_mode_name, "nvme"))
+                               m->queue_mode = DM_TYPE_NVME_BIO_BASED;
                        else if (!strcasecmp(queue_mode_name, "rq"))
                                m->queue_mode = DM_TYPE_REQUEST_BASED;
                        else if (!strcasecmp(queue_mode_name, "mq"))
@@ -1122,7 +1193,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
        ti->num_discard_bios = 1;
        ti->num_write_same_bios = 1;
        ti->num_write_zeroes_bios = 1;
-       if (m->queue_mode == DM_TYPE_BIO_BASED)
+       if (m->queue_mode == DM_TYPE_BIO_BASED || m->queue_mode == DM_TYPE_NVME_BIO_BASED)
                ti->per_io_data_size = multipath_per_bio_data_size();
        else
                ti->per_io_data_size = sizeof(struct dm_mpath_io);
@@ -1151,16 +1222,19 @@ static void multipath_wait_for_pg_init_completion(struct multipath *m)
 
 static void flush_multipath_work(struct multipath *m)
 {
-       set_bit(MPATHF_PG_INIT_DISABLED, &m->flags);
-       smp_mb__after_atomic();
+       if (m->hw_handler_name) {
+               set_bit(MPATHF_PG_INIT_DISABLED, &m->flags);
+               smp_mb__after_atomic();
+
+               flush_workqueue(kmpath_handlerd);
+               multipath_wait_for_pg_init_completion(m);
+
+               clear_bit(MPATHF_PG_INIT_DISABLED, &m->flags);
+               smp_mb__after_atomic();
+       }
 
-       flush_workqueue(kmpath_handlerd);
-       multipath_wait_for_pg_init_completion(m);
        flush_workqueue(kmultipathd);
        flush_work(&m->trigger_event);
-
-       clear_bit(MPATHF_PG_INIT_DISABLED, &m->flags);
-       smp_mb__after_atomic();
 }
 
 static void multipath_dtr(struct dm_target *ti)
@@ -1475,21 +1549,6 @@ static void activate_path_work(struct work_struct *work)
        activate_or_offline_path(pgpath);
 }
 
-static int noretry_error(blk_status_t error)
-{
-       switch (error) {
-       case BLK_STS_NOTSUPP:
-       case BLK_STS_NOSPC:
-       case BLK_STS_TARGET:
-       case BLK_STS_NEXUS:
-       case BLK_STS_MEDIUM:
-               return 1;
-       }
-
-       /* Anything else could be a path failure, so should be retried */
-       return 0;
-}
-
 static int multipath_end_io(struct dm_target *ti, struct request *clone,
                            blk_status_t error, union map_info *map_context)
 {
@@ -1508,10 +1567,13 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone,
         * request into dm core, which will remake a clone request and
         * clone bios for it and resubmit it later.
         */
-       if (error && !noretry_error(error)) {
+       if (error && blk_path_error(error)) {
                struct multipath *m = ti->private;
 
-               r = DM_ENDIO_REQUEUE;
+               if (error == BLK_STS_RESOURCE)
+                       r = DM_ENDIO_DELAY_REQUEUE;
+               else
+                       r = DM_ENDIO_REQUEUE;
 
                if (pgpath)
                        fail_path(pgpath);
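These hunks replace the local noretry_error() (removed above) with the block core helper blk_path_error(), which is essentially its inverse with BLK_STS_PROTECTION added. A userspace-compilable rendition of the helper as best it can be quoted from the include/linux/blk_types.h of this era; the enum values here are illustrative, not the kernel's:

	#include <stdbool.h>
	#include <stdio.h>

	typedef unsigned char blk_status_t;
	enum { BLK_STS_OK, BLK_STS_NOTSUPP, BLK_STS_NOSPC, BLK_STS_TARGET,
	       BLK_STS_NEXUS, BLK_STS_MEDIUM, BLK_STS_PROTECTION,
	       BLK_STS_RESOURCE, BLK_STS_IOERR };

	static bool blk_path_error(blk_status_t error)
	{
		switch (error) {
		case BLK_STS_NOTSUPP:
		case BLK_STS_NOSPC:
		case BLK_STS_TARGET:
		case BLK_STS_NEXUS:
		case BLK_STS_MEDIUM:
		case BLK_STS_PROTECTION:
			return false;
		}
		/* anything else could be a path failure, so retry it */
		return true;
	}

	int main(void)
	{
		printf("IOERR retried: %d, NOSPC retried: %d\n",
		       blk_path_error(BLK_STS_IOERR),
		       blk_path_error(BLK_STS_NOSPC));
		return 0;
	}
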
@@ -1536,7 +1598,7 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone,
 }
 
 static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone,
-               blk_status_t *error)
+                               blk_status_t *error)
 {
        struct multipath *m = ti->private;
        struct dm_mpath_io *mpio = get_mpio_from_bio(clone);
@@ -1544,7 +1606,7 @@ static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone,
        unsigned long flags;
        int r = DM_ENDIO_DONE;
 
-       if (!*error || noretry_error(*error))
+       if (!*error || !blk_path_error(*error))
                goto done;
 
        if (pgpath)
@@ -1561,9 +1623,6 @@ static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone,
                goto done;
        }
 
-       /* Queue for the daemon to resubmit */
-       dm_bio_restore(get_bio_details_from_bio(clone), clone);
-
        spin_lock_irqsave(&m->lock, flags);
        bio_list_add(&m->queued_bios, clone);
        spin_unlock_irqrestore(&m->lock, flags);
@@ -1671,6 +1730,9 @@ static void multipath_status(struct dm_target *ti, status_type_t type,
                        case DM_TYPE_BIO_BASED:
                                DMEMIT("queue_mode bio ");
                                break;
+                       case DM_TYPE_NVME_BIO_BASED:
+                               DMEMIT("queue_mode nvme ");
+                               break;
                        case DM_TYPE_MQ_REQUEST_BASED:
                                DMEMIT("queue_mode mq ");
                                break;