diff --git a/block/blk-mq.c b/block/blk-mq.c
index a7785df..9a36ac1 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -26,6 +26,7 @@
 #include <linux/delay.h>
 #include <linux/crash_dump.h>
 #include <linux/prefetch.h>
+#include <linux/blk-crypto.h>
 
 #include <trace/events/block.h>
 
@@ -270,14 +271,14 @@ static inline bool blk_mq_need_time_stamp(struct request *rq)
 }
 
 static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
-               unsigned int tag, unsigned int op, u64 alloc_time_ns)
+               unsigned int tag, u64 alloc_time_ns)
 {
        struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
        struct request *rq = tags->static_rqs[tag];
        req_flags_t rq_flags = 0;
 
        if (data->flags & BLK_MQ_REQ_INTERNAL) {
-               rq->tag = -1;
+               rq->tag = BLK_MQ_NO_TAG;
                rq->internal_tag = tag;
        } else {
                if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
@@ -285,7 +286,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
                        atomic_inc(&data->hctx->nr_active);
                }
                rq->tag = tag;
-               rq->internal_tag = -1;
+               rq->internal_tag = BLK_MQ_NO_TAG;
                data->hctx->tags->rqs[rq->tag] = rq;
        }
 
@@ -294,7 +295,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
        rq->mq_ctx = data->ctx;
        rq->mq_hctx = data->hctx;
        rq->rq_flags = rq_flags;
-       rq->cmd_flags = op;
+       rq->cmd_flags = data->cmd_flags;
        if (data->flags & BLK_MQ_REQ_PREEMPT)
                rq->rq_flags |= RQF_PREEMPT;
        if (blk_queue_io_stat(data->q))
@@ -317,8 +318,8 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
        rq->nr_integrity_segments = 0;
 #endif
+       blk_crypto_rq_set_defaults(rq);
        /* tag was already set */
-       rq->extra_len = 0;
        WRITE_ONCE(rq->deadline, 0);
 
        rq->timeout = 0;
@@ -326,35 +327,37 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
        rq->end_io = NULL;
        rq->end_io_data = NULL;
 
-       data->ctx->rq_dispatched[op_is_sync(op)]++;
+       data->ctx->rq_dispatched[op_is_sync(data->cmd_flags)]++;
        refcount_set(&rq->ref, 1);
+
+       if (!op_is_flush(data->cmd_flags)) {
+               struct elevator_queue *e = data->q->elevator;
+
+               rq->elv.icq = NULL;
+               if (e && e->type->ops.prepare_request) {
+                       if (e->type->icq_cache)
+                               blk_mq_sched_assign_ioc(rq);
+
+                       e->type->ops.prepare_request(rq);
+                       rq->rq_flags |= RQF_ELVPRIV;
+               }
+       }
+
+       data->hctx->queued++;
        return rq;
 }
 
-static struct request *blk_mq_get_request(struct request_queue *q,
-                                         struct bio *bio,
-                                         struct blk_mq_alloc_data *data)
+static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
 {
+       struct request_queue *q = data->q;
        struct elevator_queue *e = q->elevator;
-       struct request *rq;
-       unsigned int tag;
-       bool clear_ctx_on_error = false;
        u64 alloc_time_ns = 0;
-
-       blk_queue_enter_live(q);
+       unsigned int tag;
 
        /* alloc_time includes depth and tag waits */
        if (blk_queue_rq_alloc_time(q))
                alloc_time_ns = ktime_get_ns();
 
-       data->q = q;
-       if (likely(!data->ctx)) {
-               data->ctx = blk_mq_get_ctx(q);
-               clear_ctx_on_error = true;
-       }
-       if (likely(!data->hctx))
-               data->hctx = blk_mq_map_queue(q, data->cmd_flags,
-                                               data->ctx);
        if (data->cmd_flags & REQ_NOWAIT)
                data->flags |= BLK_MQ_REQ_NOWAIT;
 
@@ -370,37 +373,43 @@ static struct request *blk_mq_get_request(struct request_queue *q,
                    e->type->ops.limit_depth &&
                    !(data->flags & BLK_MQ_REQ_RESERVED))
                        e->type->ops.limit_depth(data->cmd_flags, data);
-       } else {
-               blk_mq_tag_busy(data->hctx);
        }
 
-       tag = blk_mq_get_tag(data);
-       if (tag == BLK_MQ_TAG_FAIL) {
-               if (clear_ctx_on_error)
-                       data->ctx = NULL;
-               blk_queue_exit(q);
-               return NULL;
-       }
+retry:
+       data->ctx = blk_mq_get_ctx(q);
+       data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
+       if (!(data->flags & BLK_MQ_REQ_INTERNAL))
+               blk_mq_tag_busy(data->hctx);
 
-       rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags, alloc_time_ns);
-       if (!op_is_flush(data->cmd_flags)) {
-               rq->elv.icq = NULL;
-               if (e && e->type->ops.prepare_request) {
-                       if (e->type->icq_cache)
-                               blk_mq_sched_assign_ioc(rq);
+       /*
+        * Waiting allocations only fail because of an inactive hctx.  In that
+        * case just retry the hctx assignment and tag allocation as CPU hotplug
+        * should have migrated us to an online CPU by now.
+        */
+       tag = blk_mq_get_tag(data);
+       if (tag == BLK_MQ_NO_TAG) {
+               if (data->flags & BLK_MQ_REQ_NOWAIT)
+                       return NULL;
 
-                       e->type->ops.prepare_request(rq, bio);
-                       rq->rq_flags |= RQF_ELVPRIV;
-               }
+               /*
+                * Give up the CPU and sleep for a short time to ensure that
+                * threads using a realtime scheduling class are migrated off
+                * the CPU, and thus off the hctx that is going away.
+                */
+               msleep(3);
+               goto retry;
        }
-       data->hctx->queued++;
-       return rq;
+       return blk_mq_rq_ctx_init(data, tag, alloc_time_ns);
 }
 
 struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
                blk_mq_req_flags_t flags)
 {
-       struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op };
+       struct blk_mq_alloc_data data = {
+               .q              = q,
+               .flags          = flags,
+               .cmd_flags      = op,
+       };
        struct request *rq;
        int ret;
 
@@ -408,34 +417,43 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
        if (ret)
                return ERR_PTR(ret);
 
-       rq = blk_mq_get_request(q, NULL, &alloc_data);
-       blk_queue_exit(q);
-
+       rq = __blk_mq_alloc_request(&data);
        if (!rq)
-               return ERR_PTR(-EWOULDBLOCK);
-
+               goto out_queue_exit;
        rq->__data_len = 0;
        rq->__sector = (sector_t) -1;
        rq->bio = rq->biotail = NULL;
        return rq;
+out_queue_exit:
+       blk_queue_exit(q);
+       return ERR_PTR(-EWOULDBLOCK);
 }
 EXPORT_SYMBOL(blk_mq_alloc_request);
 
 struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
        unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
 {
-       struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op };
-       struct request *rq;
+       struct blk_mq_alloc_data data = {
+               .q              = q,
+               .flags          = flags,
+               .cmd_flags      = op,
+       };
+       u64 alloc_time_ns = 0;
        unsigned int cpu;
+       unsigned int tag;
        int ret;
 
+       /* alloc_time includes depth and tag waits */
+       if (blk_queue_rq_alloc_time(q))
+               alloc_time_ns = ktime_get_ns();
+
        /*
         * If the tag allocator sleeps we could get an allocation for a
         * different hardware context.  No need to complicate the low level
         * allocator for this for the rare use case of a command tied to
         * a specific queue.
         */
-       if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)))
+       if (WARN_ON_ONCE(!(flags & (BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED))))
                return ERR_PTR(-EINVAL);
 
        if (hctx_idx >= q->nr_hw_queues)
@@ -449,21 +467,27 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
         * Check if the hardware context is actually mapped to anything.
         * If not tell the caller that it should skip this queue.
         */
-       alloc_data.hctx = q->queue_hw_ctx[hctx_idx];
-       if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) {
-               blk_queue_exit(q);
-               return ERR_PTR(-EXDEV);
-       }
-       cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
-       alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
-
-       rq = blk_mq_get_request(q, NULL, &alloc_data);
-       blk_queue_exit(q);
+       ret = -EXDEV;
+       data.hctx = q->queue_hw_ctx[hctx_idx];
+       if (!blk_mq_hw_queue_mapped(data.hctx))
+               goto out_queue_exit;
+       cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
+       data.ctx = __blk_mq_get_ctx(q, cpu);
+
+       if (q->elevator)
+               data.flags |= BLK_MQ_REQ_INTERNAL;
+       else
+               blk_mq_tag_busy(data.hctx);
 
-       if (!rq)
-               return ERR_PTR(-EWOULDBLOCK);
+       ret = -EWOULDBLOCK;
+       tag = blk_mq_get_tag(&data);
+       if (tag == BLK_MQ_NO_TAG)
+               goto out_queue_exit;
+       return blk_mq_rq_ctx_init(&data, tag, alloc_time_ns);
 
-       return rq;
+out_queue_exit:
+       blk_queue_exit(q);
+       return ERR_PTR(ret);
 }
 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
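
A hedged aside on the allocation interfaces touched above (not part of this patch): a minimal sketch of how a driver might use blk_mq_alloc_request() to issue a driver-private passthrough command. struct mydrv_dev, mydrv_send_query() and the omitted result decoding are assumptions; blk_mq_alloc_request(), blk_rq_map_kern(), blk_execute_rq() and blk_mq_free_request() are the block layer interfaces of this kernel generation.

#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>

/* Hypothetical driver state: only the request queue matters here. */
struct mydrv_dev {
	struct request_queue *queue;
};

static int mydrv_send_query(struct mydrv_dev *dev, void *buf, unsigned int len)
{
	struct request *rq;
	int ret;

	/* May sleep waiting for a tag; pass BLK_MQ_REQ_NOWAIT to fail fast. */
	rq = blk_mq_alloc_request(dev->queue, REQ_OP_DRV_IN, 0);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	ret = blk_rq_map_kern(dev->queue, rq, buf, len, GFP_KERNEL);
	if (ret)
		goto out_free;

	/* Issue synchronously; the driver's ->queue_rq() sees this request. */
	blk_execute_rq(dev->queue, NULL, rq, 0);
	/* Result decoding would be driver specific (e.g. via the request pdu). */

out_free:
	blk_mq_free_request(rq);
	return ret;
}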
 
@@ -474,11 +498,12 @@ static void __blk_mq_free_request(struct request *rq)
        struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
        const int sched_tag = rq->internal_tag;
 
+       blk_crypto_free_request(rq);
        blk_pm_mark_last_busy(rq);
        rq->mq_hctx = NULL;
-       if (rq->tag != -1)
+       if (rq->tag != BLK_MQ_NO_TAG)
                blk_mq_put_tag(hctx->tags, ctx, rq->tag);
-       if (sched_tag != -1)
+       if (sched_tag != BLK_MQ_NO_TAG)
                blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag);
        blk_mq_sched_restart(hctx);
        blk_queue_exit(q);
@@ -527,7 +552,7 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
                blk_stat_add(rq, now);
        }
 
-       if (rq->internal_tag != -1)
+       if (rq->internal_tag != BLK_MQ_NO_TAG)
                blk_mq_sched_completed_request(rq, now);
 
        blk_account_io_done(rq, now);
@@ -557,7 +582,17 @@ static void __blk_mq_complete_request_remote(void *data)
        q->mq_ops->complete(rq);
 }
 
-static void __blk_mq_complete_request(struct request *rq)
+/**
+ * blk_mq_force_complete_rq() - Force complete the request, bypassing any error
+ *                             injection that could drop the completion.
+ * @rq: Request to be force completed
+ *
+ * Drivers should use blk_mq_complete_request() to complete requests in their
+ * normal IO path. For timeout error recovery, drivers may call this forced
+ * completion routine after they've reclaimed timed out requests to bypass
+ * potentially subsequent fake timeouts.
+ */
+void blk_mq_force_complete_rq(struct request *rq)
 {
        struct blk_mq_ctx *ctx = rq->mq_ctx;
        struct request_queue *q = rq->q;
@@ -603,6 +638,7 @@ static void __blk_mq_complete_request(struct request *rq)
        }
        put_cpu();
 }
+EXPORT_SYMBOL_GPL(blk_mq_force_complete_rq);
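
As a hedged illustration of the caller described in the kernel-doc above (not from this patch): a driver ->timeout() handler that has already reclaimed the command from the hardware and forces the completion so that fake-timeout injection cannot drop it. The mydrv_* names and the abort helper are assumptions.

#include <linux/blkdev.h>
#include <linux/blk-mq.h>

/* Hypothetical per-request driver data stored in the request pdu. */
struct mydrv_cmd {
	int hw_slot;
};

/* Hypothetical: make the hardware drop the command in this slot. */
static void mydrv_abort_slot(int hw_slot)
{
	/* device specific, omitted */
}

static enum blk_eh_timer_return mydrv_timeout(struct request *rq, bool reserved)
{
	struct mydrv_cmd *cmd = blk_mq_rq_to_pdu(rq);

	mydrv_abort_slot(cmd->hw_slot);

	/*
	 * The command has been reclaimed from the hardware, so complete it
	 * here.  blk_mq_force_complete_rq() is used rather than
	 * blk_mq_complete_request() so fake-timeout error injection cannot
	 * swallow the completion; any error status would have been recorded
	 * in driver specific state beforehand.
	 */
	blk_mq_force_complete_rq(rq);
	return BLK_EH_DONE;
}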
 
 static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
        __releases(hctx->srcu)
@@ -636,7 +672,7 @@ bool blk_mq_complete_request(struct request *rq)
 {
        if (unlikely(blk_should_fake_timeout(rq->q)))
                return false;
-       __blk_mq_complete_request(rq);
+       blk_mq_force_complete_rq(rq);
        return true;
 }
 EXPORT_SYMBOL(blk_mq_complete_request);
@@ -667,15 +703,6 @@ void blk_mq_start_request(struct request *rq)
        blk_add_timer(rq);
        WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);
 
-       if (q->dma_drain_size && blk_rq_bytes(rq)) {
-               /*
-                * Make sure space for the drain appears.  We know we can do
-                * this because max_hw_segments has been adjusted to be one
-                * fewer than the device can handle.
-                */
-               rq->nr_phys_segments++;
-       }
-
 #ifdef CONFIG_BLK_DEV_INTEGRITY
        if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
                q->integrity.profile->prepare_fn(rq);
@@ -695,8 +722,6 @@ static void __blk_mq_requeue_request(struct request *rq)
        if (blk_mq_request_started(rq)) {
                WRITE_ONCE(rq->state, MQ_RQ_IDLE);
                rq->rq_flags &= ~RQF_TIMED_OUT;
-               if (q->dma_drain_size && blk_rq_bytes(rq))
-                       rq->nr_phys_segments--;
        }
 }
 
@@ -1037,7 +1062,7 @@ bool blk_mq_get_driver_tag(struct request *rq)
        };
        bool shared;
 
-       if (rq->tag != -1)
+       if (rq->tag != BLK_MQ_NO_TAG)
                return true;
 
        if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
@@ -1053,7 +1078,7 @@ bool blk_mq_get_driver_tag(struct request *rq)
                data.hctx->tags->rqs[rq->tag] = rq;
        }
 
-       return rq->tag != -1;
+       return rq->tag != BLK_MQ_NO_TAG;
 }
 
 static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
@@ -1195,6 +1220,19 @@ static void blk_mq_handle_dev_resource(struct request *rq,
        __blk_mq_requeue_request(rq);
 }
 
+static void blk_mq_handle_zone_resource(struct request *rq,
+                                       struct list_head *zone_list)
+{
+       /*
+        * If we end up here it is because we cannot dispatch a request to a
+        * specific zone due to LLD level zone-write locking or other zone
+        * related resource not being available. In this case, set the request
+        * aside in zone_list for retrying it later.
+        */
+       list_add(&rq->queuelist, zone_list);
+       __blk_mq_requeue_request(rq);
+}
+
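
A hedged sketch (not from this patch) of the driver side that blk_mq_handle_zone_resource() above serves: a hypothetical ->queue_rq() for a zoned device returns BLK_STS_ZONE_RESOURCE when it cannot currently write to the target zone, and the dispatch loop below parks the request on zone_list. mydrv_dev, mydrv_zone_is_writable() and mydrv_issue() are assumptions, and CONFIG_BLK_DEV_ZONED is assumed for blk_rq_zone_no().

#include <linux/blkdev.h>
#include <linux/blk-mq.h>

struct mydrv_dev {
	unsigned int nr_zones;		/* hypothetical device state */
};

/* Hypothetical: is the per-zone write lock / resource currently free? */
static bool mydrv_zone_is_writable(struct mydrv_dev *dev, unsigned int zno)
{
	return true;			/* placeholder */
}

/* Hypothetical: hand the request to the hardware. */
static blk_status_t mydrv_issue(struct mydrv_dev *dev, struct request *rq)
{
	return BLK_STS_OK;		/* placeholder */
}

static blk_status_t mydrv_queue_rq(struct blk_mq_hw_ctx *hctx,
				   const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;
	struct mydrv_dev *dev = hctx->queue->queuedata;

	/*
	 * A zone level resource (e.g. the zone write lock) is busy: returning
	 * BLK_STS_ZONE_RESOURCE lets blk_mq_dispatch_rq_list() set this
	 * request aside and keep dispatching the rest of the list.
	 */
	if (req_op(rq) == REQ_OP_WRITE &&
	    !mydrv_zone_is_writable(dev, blk_rq_zone_no(rq)))
		return BLK_STS_ZONE_RESOURCE;

	blk_mq_start_request(rq);
	return mydrv_issue(dev, rq);
}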
 /*
  * Returns true if we did some work AND can potentially do more.
  */
@@ -1206,6 +1244,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
        bool no_tag = false;
        int errors, queued;
        blk_status_t ret = BLK_STS_OK;
+       bool no_budget_avail = false;
+       LIST_HEAD(zone_list);
 
        if (list_empty(list))
                return false;
@@ -1224,6 +1264,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
                hctx = rq->mq_hctx;
                if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) {
                        blk_mq_put_driver_tag(rq);
+                       no_budget_avail = true;
                        break;
                }
 
@@ -1266,6 +1307,16 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
                if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
                        blk_mq_handle_dev_resource(rq, list);
                        break;
+               } else if (ret == BLK_STS_ZONE_RESOURCE) {
+                       /*
+                        * Move the request to zone_list and keep going through
+                        * the dispatch list to find more requests the drive can
+                        * accept.
+                        */
+                       blk_mq_handle_zone_resource(rq, &zone_list);
+                       if (list_empty(list))
+                               break;
+                       continue;
                }
 
                if (unlikely(ret != BLK_STS_OK)) {
@@ -1277,6 +1328,9 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
                queued++;
        } while (!list_empty(list));
 
+       if (!list_empty(&zone_list))
+               list_splice_tail_init(&zone_list, list);
+
        hctx->dispatched[queued_to_index(queued)]++;
 
        /*
@@ -1320,13 +1374,15 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
                 *
                 * If driver returns BLK_STS_RESOURCE and SCHED_RESTART
                 * bit is set, run queue after a delay to avoid IO stalls
-                * that could otherwise occur if the queue is idle.
+                * that could otherwise occur if the queue is idle.  We'll do
+                * the same if we couldn't get budget and SCHED_RESTART is set.
                 */
                needs_restart = blk_mq_sched_needs_restart(hctx);
                if (!needs_restart ||
                    (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
                        blk_mq_run_hw_queue(hctx, true);
-               else if (needs_restart && (ret == BLK_STS_RESOURCE))
+               else if (needs_restart && (ret == BLK_STS_RESOURCE ||
+                                          no_budget_avail))
                        blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
 
                blk_mq_update_dispatch_busy(hctx, true);
@@ -1541,6 +1597,25 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async)
 }
 EXPORT_SYMBOL(blk_mq_run_hw_queues);
 
+/**
+ * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously after a delay.
+ * @q: Pointer to the request queue to run.
+ * @msecs: Milliseconds of delay to wait before running the queues.
+ */
+void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
+{
+       struct blk_mq_hw_ctx *hctx;
+       int i;
+
+       queue_for_each_hw_ctx(q, hctx, i) {
+               if (blk_mq_hctx_stopped(hctx))
+                       continue;
+
+               blk_mq_delay_run_hw_queue(hctx, msecs);
+       }
+}
+EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);
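
A brief, hedged usage sketch for the helper added above (not part of this patch): a driver that tracks a device-wide resource can rerun all hardware queues after a short delay once the shortage clears, mirroring what the dispatch path now does when no budget was available. The mydrv_* context is an assumption.

#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>

/* Hypothetical driver state: a device-wide pool of hardware slots. */
struct mydrv_dev {
	struct request_queue	*queue;
	atomic_t		free_slots;
};

/* Called from completion handling once a hardware slot is released. */
static void mydrv_put_slot(struct mydrv_dev *dev)
{
	if (atomic_inc_return(&dev->free_slots) == 1) {
		/*
		 * We just went from no free slots to one: hardware queues may
		 * have bailed out with BLK_STS_RESOURCE, so rerun all of them
		 * after a small delay instead of immediately.
		 */
		blk_mq_delay_run_hw_queues(dev->queue, 3);
	}
}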
+
 /**
  * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped
  * @q: request queue.
@@ -1782,8 +1857,9 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
        rq->__sector = bio->bi_iter.bi_sector;
        rq->write_hint = bio->bi_write_hint;
        blk_rq_bio_prep(rq, bio, nr_segs);
+       blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO);
 
-       blk_account_io_start(rq, true);
+       blk_account_io_start(rq);
 }
 
 static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
@@ -1973,39 +2049,42 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
  *
  * Returns: Request queue cookie.
  */
-static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
+blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 {
        const int is_sync = op_is_sync(bio->bi_opf);
        const int is_flush_fua = op_is_flush(bio->bi_opf);
-       struct blk_mq_alloc_data data = { .flags = 0};
+       struct blk_mq_alloc_data data = {
+               .q              = q,
+       };
        struct request *rq;
        struct blk_plug *plug;
        struct request *same_queue_rq = NULL;
        unsigned int nr_segs;
        blk_qc_t cookie;
+       blk_status_t ret;
 
        blk_queue_bounce(q, &bio);
        __blk_queue_split(q, &bio, &nr_segs);
 
        if (!bio_integrity_prep(bio))
-               return BLK_QC_T_NONE;
+               goto queue_exit;
 
        if (!is_flush_fua && !blk_queue_nomerges(q) &&
            blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq))
-               return BLK_QC_T_NONE;
+               goto queue_exit;
 
        if (blk_mq_sched_bio_merge(q, bio, nr_segs))
-               return BLK_QC_T_NONE;
+               goto queue_exit;
 
        rq_qos_throttle(q, bio);
 
        data.cmd_flags = bio->bi_opf;
-       rq = blk_mq_get_request(q, bio, &data);
+       rq = __blk_mq_alloc_request(&data);
        if (unlikely(!rq)) {
                rq_qos_cleanup(q, bio);
                if (bio->bi_opf & REQ_NOWAIT)
                        bio_wouldblock_error(bio);
-               return BLK_QC_T_NONE;
+               goto queue_exit;
        }
 
        trace_block_getrq(q, bio, bio->bi_opf);
@@ -2016,6 +2095,14 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 
        blk_mq_bio_to_request(rq, bio, nr_segs);
 
+       ret = blk_crypto_init_request(rq);
+       if (ret != BLK_STS_OK) {
+               bio->bi_status = ret;
+               bio_endio(bio);
+               blk_mq_free_request(rq);
+               return BLK_QC_T_NONE;
+       }
+
        plug = blk_mq_plug(q, bio);
        if (unlikely(is_flush_fua)) {
                /* Bypass scheduler for flush requests */
@@ -2084,7 +2171,11 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
        }
 
        return cookie;
+queue_exit:
+       blk_queue_exit(q);
+       return BLK_QC_T_NONE;
 }
+EXPORT_SYMBOL_GPL(blk_mq_make_request); /* only for request based dm */
 
 void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
                     unsigned int hctx_idx)
@@ -2260,6 +2351,86 @@ fail:
        return -ENOMEM;
 }
 
+struct rq_iter_data {
+       struct blk_mq_hw_ctx *hctx;
+       bool has_rq;
+};
+
+static bool blk_mq_has_request(struct request *rq, void *data, bool reserved)
+{
+       struct rq_iter_data *iter_data = data;
+
+       if (rq->mq_hctx != iter_data->hctx)
+               return true;
+       iter_data->has_rq = true;
+       return false;
+}
+
+static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
+{
+       struct blk_mq_tags *tags = hctx->sched_tags ?
+                       hctx->sched_tags : hctx->tags;
+       struct rq_iter_data data = {
+               .hctx   = hctx,
+       };
+
+       blk_mq_all_tag_iter(tags, blk_mq_has_request, &data);
+       return data.has_rq;
+}
+
+static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu,
+               struct blk_mq_hw_ctx *hctx)
+{
+       if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu)
+               return false;
+       if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids)
+               return false;
+       return true;
+}
+
+static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
+{
+       struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
+                       struct blk_mq_hw_ctx, cpuhp_online);
+
+       if (!cpumask_test_cpu(cpu, hctx->cpumask) ||
+           !blk_mq_last_cpu_in_hctx(cpu, hctx))
+               return 0;
+
+       /*
+        * Prevent new requests from being allocated on the current hctx.
+        *
+        * The smp_mb__after_atomic() pairs with the implied barrier in
+        * test_and_set_bit_lock() in sbitmap_get(), ensuring the inactive flag
+        * is seen once we return from the tag allocator.
+        */
+       set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
+       smp_mb__after_atomic();
+
+       /*
+        * Try to grab a reference to the queue and wait for any outstanding
+        * requests.  If we could not grab a reference the queue has been
+        * frozen and there are no requests.
+        */
+       if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) {
+               while (blk_mq_hctx_has_requests(hctx))
+                       msleep(5);
+               percpu_ref_put(&hctx->queue->q_usage_counter);
+       }
+
+       return 0;
+}
+
+static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
+{
+       struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
+                       struct blk_mq_hw_ctx, cpuhp_online);
+
+       if (cpumask_test_cpu(cpu, hctx->cpumask))
+               clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
+       return 0;
+}
+
 /*
  * 'cpu' is going away. splice any existing rq_list entries from this
  * software queue to the hw queue dispatch list, and ensure that it
@@ -2273,6 +2444,9 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
        enum hctx_type type;
 
        hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
+       if (!cpumask_test_cpu(cpu, hctx->cpumask))
+               return 0;
+
        ctx = __blk_mq_get_ctx(hctx->queue, cpu);
        type = hctx->type;
 
@@ -2296,6 +2470,9 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
 
 static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
 {
+       if (!(hctx->flags & BLK_MQ_F_STACKING))
+               cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
+                                                   &hctx->cpuhp_online);
        cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
                                            &hctx->cpuhp_dead);
 }
@@ -2355,6 +2532,9 @@ static int blk_mq_init_hctx(struct request_queue *q,
 {
        hctx->queue_num = hctx_idx;
 
+       if (!(hctx->flags & BLK_MQ_F_STACKING))
+               cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
+                               &hctx->cpuhp_online);
        cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
 
        hctx->tags = set->tags[hctx_idx];
@@ -2473,7 +2653,8 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
        }
 }
 
-static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx)
+static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set,
+                                       int hctx_idx)
 {
        int ret = 0;
 
@@ -2521,18 +2702,6 @@ static void blk_mq_map_swqueue(struct request_queue *q)
         * If the cpu isn't present, the cpu is mapped to first hctx.
         */
        for_each_possible_cpu(i) {
-               hctx_idx = set->map[HCTX_TYPE_DEFAULT].mq_map[i];
-               /* unmapped hw queue can be remapped after CPU topo changed */
-               if (!set->tags[hctx_idx] &&
-                   !__blk_mq_alloc_rq_map(set, hctx_idx)) {
-                       /*
-                        * If tags initialization fail for some hctx,
-                        * that hctx won't be brought online.  In this
-                        * case, remap the current ctx to hctx[0] which
-                        * is guaranteed to always have tags allocated
-                        */
-                       set->map[HCTX_TYPE_DEFAULT].mq_map[i] = 0;
-               }
 
                ctx = per_cpu_ptr(q->queue_ctx, i);
                for (j = 0; j < set->nr_maps; j++) {
@@ -2541,6 +2710,18 @@ static void blk_mq_map_swqueue(struct request_queue *q)
                                                HCTX_TYPE_DEFAULT, i);
                                continue;
                        }
+                       hctx_idx = set->map[j].mq_map[i];
+                       /* unmapped hw queue can be remapped after CPU topo changed */
+                       if (!set->tags[hctx_idx] &&
+                           !__blk_mq_alloc_map_and_request(set, hctx_idx)) {
+                               /*
+                                * If tags initialization fails for some hctx,
+                                * that hctx won't be brought online.  In this
+                                * case, remap the current ctx to hctx[0] which
+                                * is guaranteed to always have tags allocated.
+                                */
+                               set->map[j].mq_map[i] = 0;
+                       }
 
                        hctx = blk_mq_map_queue_type(q, j, i);
                        ctx->hctxs[j] = hctx;
@@ -2944,7 +3125,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
        INIT_LIST_HEAD(&q->requeue_list);
        spin_lock_init(&q->requeue_lock);
 
-       q->make_request_fn = blk_mq_make_request;
        q->nr_requests = set->queue_depth;
 
        /*
@@ -2988,14 +3168,14 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
        int i;
 
        for (i = 0; i < set->nr_hw_queues; i++)
-               if (!__blk_mq_alloc_rq_map(set, i))
+               if (!__blk_mq_alloc_map_and_request(set, i))
                        goto out_unwind;
 
        return 0;
 
 out_unwind:
        while (--i >= 0)
-               blk_mq_free_rq_map(set->tags[i]);
+               blk_mq_free_map_and_requests(set, i);
 
        return -ENOMEM;
 }
@@ -3005,7 +3185,7 @@ out_unwind:
  * may reduce the depth asked for, if memory is tight. set->queue_depth
  * will be updated to reflect the allocated depth.
  */
-static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
+static int blk_mq_alloc_map_and_requests(struct blk_mq_tag_set *set)
 {
        unsigned int depth;
        int err;
@@ -3165,7 +3345,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
        if (ret)
                goto out_free_mq_map;
 
-       ret = blk_mq_alloc_rq_maps(set);
+       ret = blk_mq_alloc_map_and_requests(set);
        if (ret)
                goto out_free_mq_map;
 
@@ -3347,14 +3527,14 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
                blk_mq_sysfs_unregister(q);
        }
 
+       prev_nr_hw_queues = set->nr_hw_queues;
        if (blk_mq_realloc_tag_set_tags(set, set->nr_hw_queues, nr_hw_queues) <
            0)
                goto reregister;
 
-       prev_nr_hw_queues = set->nr_hw_queues;
        set->nr_hw_queues = nr_hw_queues;
-       blk_mq_update_queue_map(set);
 fallback:
+       blk_mq_update_queue_map(set);
        list_for_each_entry(q, &set->tag_list, tag_set_list) {
                blk_mq_realloc_hw_ctxs(set, q);
                if (q->nr_hw_queues != set->nr_hw_queues) {
@@ -3609,6 +3789,9 @@ static int __init blk_mq_init(void)
 {
        cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
                                blk_mq_hctx_notify_dead);
+       cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",
+                               blk_mq_hctx_notify_online,
+                               blk_mq_hctx_notify_offline);
        return 0;
 }
 subsys_initcall(blk_mq_init);