diff --git a/block/blk-mq.c b/block/blk-mq.c
index a7785df..9a36ac1 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -26,6 +26,7 @@
 #include <linux/delay.h>
 #include <linux/crash_dump.h>
 #include <linux/prefetch.h>
+#include <linux/blk-crypto.h>
 
 #include <trace/events/block.h>
 
@@ -270,14 +271,14 @@ static inline bool blk_mq_need_time_stamp(struct request *rq)
 }
 
 static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
-               unsigned int tag, unsigned int op, u64 alloc_time_ns)
+               unsigned int tag, u64 alloc_time_ns)
 {
        struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
        struct request *rq = tags->static_rqs[tag];
        req_flags_t rq_flags = 0;
 
        if (data->flags & BLK_MQ_REQ_INTERNAL) {
-               rq->tag = -1;
+               rq->tag = BLK_MQ_NO_TAG;
                rq->internal_tag = tag;
        } else {
                if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
@@ -285,7 +286,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
                        atomic_inc(&data->hctx->nr_active);
                }
                rq->tag = tag;
-               rq->internal_tag = -1;
+               rq->internal_tag = BLK_MQ_NO_TAG;
                data->hctx->tags->rqs[rq->tag] = rq;
        }
 
@@ -294,7 +295,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
        rq->mq_ctx = data->ctx;
        rq->mq_hctx = data->hctx;
        rq->rq_flags = rq_flags;
-       rq->cmd_flags = op;
+       rq->cmd_flags = data->cmd_flags;
        if (data->flags & BLK_MQ_REQ_PREEMPT)
                rq->rq_flags |= RQF_PREEMPT;
        if (blk_queue_io_stat(data->q))
@@ -317,8 +318,8 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
        rq->nr_integrity_segments = 0;
 #endif
+       blk_crypto_rq_set_defaults(rq);
        /* tag was already set */
-       rq->extra_len = 0;
        WRITE_ONCE(rq->deadline, 0);
 
        rq->timeout = 0;
@@ -326,35 +327,37 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
        rq->end_io = NULL;
        rq->end_io_data = NULL;
 
-       data->ctx->rq_dispatched[op_is_sync(op)]++;
+       data->ctx->rq_dispatched[op_is_sync(data->cmd_flags)]++;
        refcount_set(&rq->ref, 1);
+
+       if (!op_is_flush(data->cmd_flags)) {
+               struct elevator_queue *e = data->q->elevator;
+
+               rq->elv.icq = NULL;
+               if (e && e->type->ops.prepare_request) {
+                       if (e->type->icq_cache)
+                               blk_mq_sched_assign_ioc(rq);
+
+                       e->type->ops.prepare_request(rq);
+                       rq->rq_flags |= RQF_ELVPRIV;
+               }
+       }
+
+       data->hctx->queued++;
        return rq;
 }
 
-static struct request *blk_mq_get_request(struct request_queue *q,
-                                         struct bio *bio,
-                                         struct blk_mq_alloc_data *data)
+static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
 {
+       struct request_queue *q = data->q;
        struct elevator_queue *e = q->elevator;
-       struct request *rq;
-       unsigned int tag;
-       bool clear_ctx_on_error = false;
        u64 alloc_time_ns = 0;
-
-       blk_queue_enter_live(q);
+       unsigned int tag;
 
        /* alloc_time includes depth and tag waits */
        if (blk_queue_rq_alloc_time(q))
                alloc_time_ns = ktime_get_ns();
 
-       data->q = q;
-       if (likely(!data->ctx)) {
-               data->ctx = blk_mq_get_ctx(q);
-               clear_ctx_on_error = true;
-       }
-       if (likely(!data->hctx))
-               data->hctx = blk_mq_map_queue(q, data->cmd_flags,
-                                               data->ctx);
        if (data->cmd_flags & REQ_NOWAIT)
                data->flags |= BLK_MQ_REQ_NOWAIT;
 
@@ -370,37 +373,43 @@ static struct request *blk_mq_get_request(struct request_queue *q,
                    e->type->ops.limit_depth &&
                    !(data->flags & BLK_MQ_REQ_RESERVED))
                        e->type->ops.limit_depth(data->cmd_flags, data);
-       } else {
-               blk_mq_tag_busy(data->hctx);
        }
 
-       tag = blk_mq_get_tag(data);
-       if (tag == BLK_MQ_TAG_FAIL) {
-               if (clear_ctx_on_error)
-                       data->ctx = NULL;
-               blk_queue_exit(q);
-               return NULL;
-       }
+retry:
+       data->ctx = blk_mq_get_ctx(q);
+       data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
+       if (!(data->flags & BLK_MQ_REQ_INTERNAL))
+               blk_mq_tag_busy(data->hctx);
 
-       rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags, alloc_time_ns);
-       if (!op_is_flush(data->cmd_flags)) {
-               rq->elv.icq = NULL;
-               if (e && e->type->ops.prepare_request) {
-                       if (e->type->icq_cache)
-                               blk_mq_sched_assign_ioc(rq);
+       /*
+        * Waiting allocations only fail because of an inactive hctx.  In that
+        * case just retry the hctx assignment and tag allocation as CPU hotplug
+        * should have migrated us to an online CPU by now.
+        */
+       tag = blk_mq_get_tag(data);
+       if (tag == BLK_MQ_NO_TAG) {
+               if (data->flags & BLK_MQ_REQ_NOWAIT)
+                       return NULL;
 
-                       e->type->ops.prepare_request(rq, bio);
-                       rq->rq_flags |= RQF_ELVPRIV;
-               }
+               /*
+                * Give up the CPU and sleep for a short time to ensure that
+                * threads using a realtime scheduling class are migrated off
+                * the CPU, and thus off the hctx that is going away.
+                */
+               msleep(3);
+               goto retry;
        }
-       data->hctx->queued++;
-       return rq;
+       return blk_mq_rq_ctx_init(data, tag, alloc_time_ns);
 }
 
 struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
                blk_mq_req_flags_t flags)
 {
-       struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op };
+       struct blk_mq_alloc_data data = {
+               .q              = q,
+               .flags          = flags,
+               .cmd_flags      = op,
+       };
        struct request *rq;
        int ret;
 
@@ -408,34 +417,43 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
        if (ret)
                return ERR_PTR(ret);
 
-       rq = blk_mq_get_request(q, NULL, &alloc_data);
-       blk_queue_exit(q);
-
+       rq = __blk_mq_alloc_request(&data);
        if (!rq)
-               return ERR_PTR(-EWOULDBLOCK);
-
+               goto out_queue_exit;
        rq->__data_len = 0;
        rq->__sector = (sector_t) -1;
        rq->bio = rq->biotail = NULL;
        return rq;
+out_queue_exit:
+       blk_queue_exit(q);
+       return ERR_PTR(-EWOULDBLOCK);
 }
 EXPORT_SYMBOL(blk_mq_alloc_request);
 
 struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
        unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
 {
-       struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op };
-       struct request *rq;
+       struct blk_mq_alloc_data data = {
+               .q              = q,
+               .flags          = flags,
+               .cmd_flags      = op,
+       };
+       u64 alloc_time_ns = 0;
        unsigned int cpu;
+       unsigned int tag;
        int ret;
 
+       /* alloc_time includes depth and tag waits */
+       if (blk_queue_rq_alloc_time(q))
+               alloc_time_ns = ktime_get_ns();
+
        /*
         * If the tag allocator sleeps we could get an allocation for a
         * different hardware context.  No need to complicate the low level
         * allocator for this for the rare use case of a command tied to
         * a specific queue.
         */
-       if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)))
+       if (WARN_ON_ONCE(!(flags & (BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED))))
                return ERR_PTR(-EINVAL);
 
        if (hctx_idx >= q->nr_hw_queues)
@@ -449,21 +467,27 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
         * Check if the hardware context is actually mapped to anything.
         * If not tell the caller that it should skip this queue.
         */
-       alloc_data.hctx = q->queue_hw_ctx[hctx_idx];
-       if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) {
-               blk_queue_exit(q);
-               return ERR_PTR(-EXDEV);
-       }
-       cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
-       alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
-
-       rq = blk_mq_get_request(q, NULL, &alloc_data);
-       blk_queue_exit(q);
+       ret = -EXDEV;
+       data.hctx = q->queue_hw_ctx[hctx_idx];
+       if (!blk_mq_hw_queue_mapped(data.hctx))
+               goto out_queue_exit;
+       cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
+       data.ctx = __blk_mq_get_ctx(q, cpu);
+
+       if (q->elevator)
+               data.flags |= BLK_MQ_REQ_INTERNAL;
+       else
+               blk_mq_tag_busy(data.hctx);
 
-       if (!rq)
-               return ERR_PTR(-EWOULDBLOCK);
+       ret = -EWOULDBLOCK;
+       tag = blk_mq_get_tag(&data);
+       if (tag == BLK_MQ_NO_TAG)
+               goto out_queue_exit;
+       return blk_mq_rq_ctx_init(&data, tag, alloc_time_ns);
 
-       return rq;
+out_queue_exit:
+       blk_queue_exit(q);
+       return ERR_PTR(ret);
 }
 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
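
A hedged aside on the allocation interfaces touched above (not part of this patch): a minimal sketch of how a driver might use blk_mq_alloc_request() to issue a driver-private passthrough command. struct mydrv_dev, mydrv_send_query() and the omitted result decoding are assumptions; blk_mq_alloc_request(), blk_rq_map_kern(), blk_execute_rq() and blk_mq_free_request() are the block layer interfaces of this kernel generation.

#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>

/* Hypothetical driver state: only the request queue matters here. */
struct mydrv_dev {
	struct request_queue *queue;
};

static int mydrv_send_query(struct mydrv_dev *dev, void *buf, unsigned int len)
{
	struct request *rq;
	int ret;

	/* May sleep waiting for a tag; pass BLK_MQ_REQ_NOWAIT to fail fast. */
	rq = blk_mq_alloc_request(dev->queue, REQ_OP_DRV_IN, 0);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	ret = blk_rq_map_kern(dev->queue, rq, buf, len, GFP_KERNEL);
	if (ret)
		goto out_free;

	/* Issue synchronously; the driver's ->queue_rq() sees this request. */
	blk_execute_rq(dev->queue, NULL, rq, 0);
	/* Result decoding would be driver specific (e.g. via the request pdu). */

out_free:
	blk_mq_free_request(rq);
	return ret;
}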
 
@@ -474,11 +498,12 @@ static void __blk_mq_free_request(struct request *rq)
        struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
        const int sched_tag = rq->internal_tag;
 
+       blk_crypto_free_request(rq);
        blk_pm_mark_last_busy(rq);
        rq->mq_hctx = NULL;
-       if (rq->tag != -1)
+       if (rq->tag != BLK_MQ_NO_TAG)
                blk_mq_put_tag(hctx->tags, ctx, rq->tag);
-       if (sched_tag != -1)
+       if (sched_tag != BLK_MQ_NO_TAG)
                blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag);
        blk_mq_sched_restart(hctx);
        blk_queue_exit(q);
@@ -527,7 +552,7 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
                blk_stat_add(rq, now);
        }
 
-       if (rq->internal_tag != -1)
+       if (rq->internal_tag != BLK_MQ_NO_TAG)
                blk_mq_sched_completed_request(rq, now);
 
        blk_account_io_done(rq, now);
@@ -557,7 +582,17 @@ static void __blk_mq_complete_request_remote(void *data)
        q->mq_ops->complete(rq);
 }
 
-static void __blk_mq_complete_request(struct request *rq)
+/**
+ * blk_mq_force_complete_rq() - Force complete the request, bypassing any error
+ *                             injection that could drop the completion.
+ * @rq: Request to be force completed
+ *
+ * Drivers should use blk_mq_complete_request() to complete requests in their
+ * normal IO path. For timeout error recovery, drivers may call this forced
+ * completion routine after they've reclaimed timed out requests to bypass
+ * potentially subsequent fake timeouts.
+ */
+void blk_mq_force_complete_rq(struct request *rq)
 {
        struct blk_mq_ctx *ctx = rq->mq_ctx;
        struct request_queue *q = rq->q;
@@ -603,6 +638,7 @@ static void __blk_mq_complete_request(struct request *rq)
        }
        put_cpu();
 }
+EXPORT_SYMBOL_GPL(blk_mq_force_complete_rq);
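
As a hedged illustration of the caller described in the kernel-doc above (not from this patch): a driver ->timeout() handler that has already reclaimed the command from the hardware and forces the completion so that fake-timeout injection cannot drop it. The mydrv_* names and the abort helper are assumptions.

#include <linux/blkdev.h>
#include <linux/blk-mq.h>

/* Hypothetical per-request driver data stored in the request pdu. */
struct mydrv_cmd {
	int hw_slot;
};

/* Hypothetical: make the hardware drop the command in this slot. */
static void mydrv_abort_slot(int hw_slot)
{
	/* device specific, omitted */
}

static enum blk_eh_timer_return mydrv_timeout(struct request *rq, bool reserved)
{
	struct mydrv_cmd *cmd = blk_mq_rq_to_pdu(rq);

	mydrv_abort_slot(cmd->hw_slot);

	/*
	 * The command has been reclaimed from the hardware, so complete it
	 * here.  blk_mq_force_complete_rq() is used rather than
	 * blk_mq_complete_request() so fake-timeout error injection cannot
	 * swallow the completion; any error status would have been recorded
	 * in driver specific state beforehand.
	 */
	blk_mq_force_complete_rq(rq);
	return BLK_EH_DONE;
}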
 
 static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
        __releases(hctx->srcu)
@@ -636,7 +672,7 @@ bool blk_mq_complete_request(struct request *rq)
 {
        if (unlikely(blk_should_fake_timeout(rq->q)))
                return false;
-       __blk_mq_complete_request(rq);
+       blk_mq_force_complete_rq(rq);
        return true;
 }
 EXPORT_SYMBOL(blk_mq_complete_request);
@@ -667,15 +703,6 @@ void blk_mq_start_request(struct request *rq)
        blk_add_timer(rq);
        WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);
 
-       if (q->dma_drain_size && blk_rq_bytes(rq)) {
-               /*
-                * Make sure space for the drain appears.  We know we can do
-                * this because max_hw_segments has been adjusted to be one
-                * fewer than the device can handle.
-                */
-               rq->nr_phys_segments++;
-       }
-
 #ifdef CONFIG_BLK_DEV_INTEGRITY
        if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
                q->integrity.profile->prepare_fn(rq);
@@ -695,8 +722,6 @@ static void __blk_mq_requeue_request(struct request *rq)
        if (blk_mq_request_started(rq)) {
                WRITE_ONCE(rq->state, MQ_RQ_IDLE);
                rq->rq_flags &= ~RQF_TIMED_OUT;
-               if (q->dma_drain_size && blk_rq_bytes(rq))
-                       rq->nr_phys_segments--;
        }
 }
 
@@ -1037,7 +1062,7 @@ bool blk_mq_get_driver_tag(struct request *rq)
        };
        bool shared;
 
-       if (rq->tag != -1)
+       if (rq->tag != BLK_MQ_NO_TAG)
                return true;
 
        if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
@@ -1053,7 +1078,7 @@ bool blk_mq_get_driver_tag(struct request *rq)
                data.hctx->tags->rqs[rq->tag] = rq;
        }
 
-       return rq->tag != -1;
+       return rq->tag != BLK_MQ_NO_TAG;
 }
 
 static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
@@ -1195,6 +1220,19 @@ static void blk_mq_handle_dev_resource(struct request *rq,
        __blk_mq_requeue_request(rq);
 }
 
+static void blk_mq_handle_zone_resource(struct request *rq,
+                                       struct list_head *zone_list)
+{
+       /*
+        * If we end up here it is because we cannot dispatch a request to a
+        * specific zone due to LLD level zone-write locking or other zone
+        * related resource not being available. In this case, set the request
+        * aside in zone_list for retrying it later.
+        */
+       list_add(&rq->queuelist, zone_list);
+       __blk_mq_requeue_request(rq);
+}
+
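
A hedged sketch (not from this patch) of the driver side that blk_mq_handle_zone_resource() above serves: a hypothetical ->queue_rq() for a zoned device returns BLK_STS_ZONE_RESOURCE when it cannot currently write to the target zone, and the dispatch loop below parks the request on zone_list. mydrv_dev, mydrv_zone_is_writable() and mydrv_issue() are assumptions, and CONFIG_BLK_DEV_ZONED is assumed for blk_rq_zone_no().

#include <linux/blkdev.h>
#include <linux/blk-mq.h>

struct mydrv_dev {
	unsigned int nr_zones;		/* hypothetical device state */
};

/* Hypothetical: is the per-zone write lock / resource currently free? */
static bool mydrv_zone_is_writable(struct mydrv_dev *dev, unsigned int zno)
{
	return true;			/* placeholder */
}

/* Hypothetical: hand the request to the hardware. */
static blk_status_t mydrv_issue(struct mydrv_dev *dev, struct request *rq)
{
	return BLK_STS_OK;		/* placeholder */
}

static blk_status_t mydrv_queue_rq(struct blk_mq_hw_ctx *hctx,
				   const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;
	struct mydrv_dev *dev = hctx->queue->queuedata;

	/*
	 * A zone level resource (e.g. the zone write lock) is busy: returning
	 * BLK_STS_ZONE_RESOURCE lets blk_mq_dispatch_rq_list() set this
	 * request aside and keep dispatching the rest of the list.
	 */
	if (req_op(rq) == REQ_OP_WRITE &&
	    !mydrv_zone_is_writable(dev, blk_rq_zone_no(rq)))
		return BLK_STS_ZONE_RESOURCE;

	blk_mq_start_request(rq);
	return mydrv_issue(dev, rq);
}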
 /*
  * Returns true if we did some work AND can potentially do more.
  */
@@ -1206,6 +1244,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
        bool no_tag = false;
        int errors, queued;
        blk_status_t ret = BLK_STS_OK;
+       bool no_budget_avail = false;
+       LIST_HEAD(zone_list);
 
        if (list_empty(list))
                return false;
@@ -1224,6 +1264,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
                hctx = rq->mq_hctx;
                if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) {
                        blk_mq_put_driver_tag(rq);
+                       no_budget_avail = true;
                        break;
                }
 
@@ -1266,6 +1307,16 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
                if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
                        blk_mq_handle_dev_resource(rq, list);
                        break;
+               } else if (ret == BLK_STS_ZONE_RESOURCE) {
+                       /*
+                        * Move the request to zone_list and keep going through
+                        * the dispatch list to find more requests the drive can
+                        * accept.
+                        */
+                       blk_mq_handle_zone_resource(rq, &zone_list);
+                       if (list_empty(list))
+                               break;
+                       continue;
                }
 
                if (unlikely(ret != BLK_STS_OK)) {
@@ -1277,6 +1328,9 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
                queued++;
        } while (!list_empty(list));
 
+       if (!list_empty(&zone_list))
+               list_splice_tail_init(&zone_list, list);
+
        hctx->dispatched[queued_to_index(queued)]++;
 
        /*
@@ -1320,13 +1374,15 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
                 *
                 * If driver returns BLK_STS_RESOURCE and SCHED_RESTART
                 * bit is set, run queue after a delay to avoid IO stalls
-                * that could otherwise occur if the queue is idle.
+                * that could otherwise occur if the queue is idle.  We'll do
+                * the same if we couldn't get budget and SCHED_RESTART is set.
                 */
                needs_restart = blk_mq_sched_needs_restart(hctx);
                if (!needs_restart ||
                    (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
                        blk_mq_run_hw_queue(hctx, true);
-               else if (needs_restart && (ret == BLK_STS_RESOURCE))
+               else if (needs_restart && (ret == BLK_STS_RESOURCE ||
+                                          no_budget_avail))
                        blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
 
                blk_mq_update_dispatch_busy(hctx, true);
@@ -1541,6 +1597,25 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async)
 }
 EXPORT_SYMBOL(blk_mq_run_hw_queues);
 
+/**
+ * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously after a delay.
+ * @q: Pointer to the request queue to run.
+ * @msecs: Milliseconds of delay to wait before running the queues.
+ */
+void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
+{
+       struct blk_mq_hw_ctx *hctx;
+       int i;
+
+       queue_for_each_hw_ctx(q, hctx, i) {
+               if (blk_mq_hctx_stopped(hctx))
+                       continue;
+
+               blk_mq_delay_run_hw_queue(hctx, msecs);
+       }
+}
+EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);
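
A brief, hedged usage sketch for the helper added above (not part of this patch): a driver that tracks a device-wide resource can rerun all hardware queues after a short delay once the shortage clears, mirroring what the dispatch path now does when no budget was available. The mydrv_* context is an assumption.

#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>

/* Hypothetical driver state: a device-wide pool of hardware slots. */
struct mydrv_dev {
	struct request_queue	*queue;
	atomic_t		free_slots;
};

/* Called from completion handling once a hardware slot is released. */
static void mydrv_put_slot(struct mydrv_dev *dev)
{
	if (atomic_inc_return(&dev->free_slots) == 1) {
		/*
		 * We just went from no free slots to one: hardware queues may
		 * have bailed out with BLK_STS_RESOURCE, so rerun all of them
		 * after a small delay instead of immediately.
		 */
		blk_mq_delay_run_hw_queues(dev->queue, 3);
	}
}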
+
 /**
  * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped
  * @q: request queue.
@@ -1782,8 +1857,9 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
        rq->__sector = bio->bi_iter.bi_sector;
        rq->write_hint = bio->bi_write_hint;
        blk_rq_bio_prep(rq, bio, nr_segs);
+       blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO);
 
-       blk_account_io_start(rq, true);
+       blk_account_io_start(rq);
 }
 
 static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
@@ -1973,39 +2049,42 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
  *
  * Returns: Request queue cookie.
  */
-static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
+blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 {
        const int is_sync = op_is_sync(bio->bi_opf);
        const int is_flush_fua = op_is_flush(bio->bi_opf);
-       struct blk_mq_alloc_data data = { .flags = 0};
+       struct blk_mq_alloc_data data = {
+               .q              = q,
+       };
        struct request *rq;
        struct blk_plug *plug;
        struct request *same_queue_rq = NULL;
        unsigned int nr_segs;
        blk_qc_t cookie;
+       blk_status_t ret;
 
        blk_queue_bounce(q, &bio);
        __blk_queue_split(q, &bio, &nr_segs);
 
        if (!bio_integrity_prep(bio))
-               return BLK_QC_T_NONE;
+               goto queue_exit;
 
        if (!is_flush_fua && !blk_queue_nomerges(q) &&
            blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq))
-               return BLK_QC_T_NONE;
+               goto queue_exit;
 
        if (blk_mq_sched_bio_merge(q, bio, nr_segs))
-               return BLK_QC_T_NONE;
+               goto queue_exit;
 
        rq_qos_throttle(q, bio);
 
        data.cmd_flags = bio->bi_opf;
-       rq = blk_mq_get_request(q, bio, &data);
+       rq = __blk_mq_alloc_request(&data);
        if (unlikely(!rq)) {
                rq_qos_cleanup(q, bio);
                if (bio->bi_opf & REQ_NOWAIT)
                        bio_wouldblock_error(bio);
-               return BLK_QC_T_NONE;
+               goto queue_exit;
        }
 
        trace_block_getrq(q, bio, bio->bi_opf);
@@ -2016,6 +2095,14 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 
        blk_mq_bio_to_request(rq, bio, nr_segs);
 
+       ret = blk_crypto_init_request(rq);
+       if (ret != BLK_STS_OK) {
+               bio->bi_status = ret;
+               bio_endio(bio);
+               blk_mq_free_request(rq);
+               return BLK_QC_T_NONE;
+       }
+
        plug = blk_mq_plug(q, bio);
        if (unlikely(is_flush_fua)) {
                /* Bypass scheduler for flush requests */
@@ -2084,7 +2171,11 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
        }
 
        return cookie;
+queue_exit:
+       blk_queue_exit(q);
+       return BLK_QC_T_NONE;
 }
+EXPORT_SYMBOL_GPL(blk_mq_make_request); /* only for request based dm */
 
 void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
                     unsigned int hctx_idx)
@@ -2260,6 +2351,86 @@ fail:
        return -ENOMEM;
 }
 
+struct rq_iter_data {
+       struct blk_mq_hw_ctx *hctx;
+       bool has_rq;
+};
+
+static bool blk_mq_has_request(struct request *rq, void *data, bool reserved)
+{
+       struct rq_iter_data *iter_data = data;
+
+       if (rq->mq_hctx != iter_data->hctx)
+               return true;
+       iter_data->has_rq = true;
+       return false;
+}
+
+static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
+{
+       struct blk_mq_tags *tags = hctx->sched_tags ?
+                       hctx->sched_tags : hctx->tags;
+       struct rq_iter_data data = {
+               .hctx   = hctx,
+       };
+
+       blk_mq_all_tag_iter(tags, blk_mq_has_request, &data);
+       return data.has_rq;
+}
+
+static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu,
+               struct blk_mq_hw_ctx *hctx)
+{
+       if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu)
+               return false;
+       if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids)
+               return false;
+       return true;
+}
+
+static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
+{
+       struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
+                       struct blk_mq_hw_ctx, cpuhp_online);
+
+       if (!cpumask_test_cpu(cpu, hctx->cpumask) ||
+           !blk_mq_last_cpu_in_hctx(cpu, hctx))
+               return 0;
+
+       /*
+        * Prevent new requests from being allocated on the current hctx.
+        *
+        * The smp_mb__after_atomic() pairs with the implied barrier in
+        * test_and_set_bit_lock() in sbitmap_get(), ensuring the inactive flag
+        * is seen once we return from the tag allocator.
+        */
+       set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
+       smp_mb__after_atomic();
+
+       /*
+        * Try to grab a reference to the queue and wait for any outstanding
+        * requests.  If we could not grab a reference the queue has been
+        * frozen and there are no requests.
+        */
+       if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) {
+               while (blk_mq_hctx_has_requests(hctx))
+                       msleep(5);
+               percpu_ref_put(&hctx->queue->q_usage_counter);
+       }
+
+       return 0;
+}
+
+static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
+{
+       struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
+                       struct blk_mq_hw_ctx, cpuhp_online);
+
+       if (cpumask_test_cpu(cpu, hctx->cpumask))
+               clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
+       return 0;
+}
+
 /*
  * 'cpu' is going away. splice any existing rq_list entries from this
  * software queue to the hw queue dispatch list, and ensure that it
@@ -2273,6 +2444,9 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
        enum hctx_type type;
 
        hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
+       if (!cpumask_test_cpu(cpu, hctx->cpumask))
+               return 0;
+
        ctx = __blk_mq_get_ctx(hctx->queue, cpu);
        type = hctx->type;
 
@@ -2296,6 +2470,9 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
 
 static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
 {
+       if (!(hctx->flags & BLK_MQ_F_STACKING))
+               cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
+                                                   &hctx->cpuhp_online);
        cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
                                            &hctx->cpuhp_dead);
 }
@@ -2355,6 +2532,9 @@ static int blk_mq_init_hctx(struct request_queue *q,
 {
        hctx->queue_num = hctx_idx;
 
+       if (!(hctx->flags & BLK_MQ_F_STACKING))
+               cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
+                               &hctx->cpuhp_online);
        cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
 
        hctx->tags = set->tags[hctx_idx];
@@ -2473,7 +2653,8 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
        }
 }
 
-static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx)
+static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set,
+                                       int hctx_idx)
 {
        int ret = 0;
 
@@ -2521,18 +2702,6 @@ static void blk_mq_map_swqueue(struct request_queue *q)
         * If the cpu isn't present, the cpu is mapped to first hctx.
         */
        for_each_possible_cpu(i) {
-               hctx_idx = set->map[HCTX_TYPE_DEFAULT].mq_map[i];
-               /* unmapped hw queue can be remapped after CPU topo changed */
-               if (!set->tags[hctx_idx] &&
-                   !__blk_mq_alloc_rq_map(set, hctx_idx)) {
-                       /*
-                        * If tags initialization fail for some hctx,
-                        * that hctx won't be brought online.  In this
-                        * case, remap the current ctx to hctx[0] which
-                        * is guaranteed to always have tags allocated
-                        */
-                       set->map[HCTX_TYPE_DEFAULT].mq_map[i] = 0;
-               }
 
                ctx = per_cpu_ptr(q->queue_ctx, i);
                for (j = 0; j < set->nr_maps; j++) {
@@ -2541,6 +2710,18 @@ static void blk_mq_map_swqueue(struct request_queue *q)
                                                HCTX_TYPE_DEFAULT, i);
                                continue;
                        }
+                       hctx_idx = set->map[j].mq_map[i];
+                       /* unmapped hw queue can be remapped after CPU topo changed */
+                       if (!set->tags[hctx_idx] &&
+                           !__blk_mq_alloc_map_and_request(set, hctx_idx)) {
+                               /*
+                                * If tags initialization fails for some hctx,
+                                * that hctx won't be brought online.  In this
+                                * case, remap the current ctx to hctx[0] which
+                                * is guaranteed to always have tags allocated.
+                                */
+                               set->map[j].mq_map[i] = 0;
+                       }
 
                        hctx = blk_mq_map_queue_type(q, j, i);
                        ctx->hctxs[j] = hctx;
@@ -2944,7 +3125,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
        INIT_LIST_HEAD(&q->requeue_list);
        spin_lock_init(&q->requeue_lock);
 
-       q->make_request_fn = blk_mq_make_request;
        q->nr_requests = set->queue_depth;
 
        /*
@@ -2988,14 +3168,14 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
        int i;
 
        for (i = 0; i < set->nr_hw_queues; i++)
-               if (!__blk_mq_alloc_rq_map(set, i))
+               if (!__blk_mq_alloc_map_and_request(set, i))
                        goto out_unwind;
 
        return 0;
 
 out_unwind:
        while (--i >= 0)
-               blk_mq_free_rq_map(set->tags[i]);
+               blk_mq_free_map_and_requests(set, i);
 
        return -ENOMEM;
 }
@@ -3005,7 +3185,7 @@ out_unwind:
  * may reduce the depth asked for, if memory is tight. set->queue_depth
  * will be updated to reflect the allocated depth.
  */
-static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
+static int blk_mq_alloc_map_and_requests(struct blk_mq_tag_set *set)
 {
        unsigned int depth;
        int err;
@@ -3165,7 +3345,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
        if (ret)
                goto out_free_mq_map;
 
-       ret = blk_mq_alloc_rq_maps(set);
+       ret = blk_mq_alloc_map_and_requests(set);
        if (ret)
                goto out_free_mq_map;
 
@@ -3347,14 +3527,14 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
                blk_mq_sysfs_unregister(q);
        }
 
+       prev_nr_hw_queues = set->nr_hw_queues;
        if (blk_mq_realloc_tag_set_tags(set, set->nr_hw_queues, nr_hw_queues) <
            0)
                goto reregister;
 
-       prev_nr_hw_queues = set->nr_hw_queues;
        set->nr_hw_queues = nr_hw_queues;
-       blk_mq_update_queue_map(set);
 fallback:
+       blk_mq_update_queue_map(set);
        list_for_each_entry(q, &set->tag_list, tag_set_list) {
                blk_mq_realloc_hw_ctxs(set, q);
                if (q->nr_hw_queues != set->nr_hw_queues) {
@@ -3609,6 +3789,9 @@ static int __init blk_mq_init(void)
 {
        cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
                                blk_mq_hctx_notify_dead);
+       cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",
+                               blk_mq_hctx_notify_online,
+                               blk_mq_hctx_notify_offline);
        return 0;
 }
 subsys_initcall(blk_mq_init);