Merge branch 'for-5.9/block' into for-5.9/block-merge
[linux-2.6-microblaze.git] / block / blk-mq.c
index 4e0d173..667155f 100644 (file)
@@ -41,6 +41,8 @@
 #include "blk-mq-sched.h"
 #include "blk-rq-qos.h"
 
+static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
+
 static void blk_mq_poll_stats_start(struct request_queue *q);
 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
 
@@ -275,26 +277,20 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 {
        struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
        struct request *rq = tags->static_rqs[tag];
-       req_flags_t rq_flags = 0;
 
-       if (data->flags & BLK_MQ_REQ_INTERNAL) {
+       if (data->q->elevator) {
                rq->tag = BLK_MQ_NO_TAG;
                rq->internal_tag = tag;
        } else {
-               if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
-                       rq_flags = RQF_MQ_INFLIGHT;
-                       atomic_inc(&data->hctx->nr_active);
-               }
                rq->tag = tag;
                rq->internal_tag = BLK_MQ_NO_TAG;
-               data->hctx->tags->rqs[rq->tag] = rq;
        }
 
        /* csd/requeue_work/fifo_time is initialized before use */
        rq->q = data->q;
        rq->mq_ctx = data->ctx;
        rq->mq_hctx = data->hctx;
-       rq->rq_flags = rq_flags;
+       rq->rq_flags = 0;
        rq->cmd_flags = data->cmd_flags;
        if (data->flags & BLK_MQ_REQ_PREEMPT)
                rq->rq_flags |= RQF_PREEMPT;
@@ -362,8 +358,6 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
                data->flags |= BLK_MQ_REQ_NOWAIT;
 
        if (e) {
-               data->flags |= BLK_MQ_REQ_INTERNAL;
-
                /*
                 * Flush requests are special and go directly to the
                 * dispatch list. Don't include reserved tags in the
@@ -378,7 +372,7 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
 retry:
        data->ctx = blk_mq_get_ctx(q);
        data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
-       if (!(data->flags & BLK_MQ_REQ_INTERNAL))
+       if (!e)
                blk_mq_tag_busy(data->hctx);
 
        /*
@@ -474,9 +468,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
        cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
        data.ctx = __blk_mq_get_ctx(q, cpu);
 
-       if (q->elevator)
-               data.flags |= BLK_MQ_REQ_INTERNAL;
-       else
+       if (!q->elevator)
                blk_mq_tag_busy(data.hctx);
 
        ret = -EWOULDBLOCK;
@@ -552,8 +544,7 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
                blk_stat_add(rq, now);
        }
 
-       if (rq->internal_tag != BLK_MQ_NO_TAG)
-               blk_mq_sched_completed_request(rq, now);
+       blk_mq_sched_completed_request(rq, now);
 
        blk_account_io_done(rq, now);
 
@@ -574,71 +565,139 @@ void blk_mq_end_request(struct request *rq, blk_status_t error)
 }
 EXPORT_SYMBOL(blk_mq_end_request);
 
-static void __blk_mq_complete_request_remote(void *data)
+/*
+ * Softirq action handler - move entries to a local list, then loop over
+ * them, passing each one to the queue's registered completion handler.
+ */
+static __latent_entropy void blk_done_softirq(struct softirq_action *h)
 {
-       struct request *rq = data;
-       struct request_queue *q = rq->q;
+       struct list_head *cpu_list, local_list;
 
-       q->mq_ops->complete(rq);
+       local_irq_disable();
+       cpu_list = this_cpu_ptr(&blk_cpu_done);
+       list_replace_init(cpu_list, &local_list);
+       local_irq_enable();
+
+       while (!list_empty(&local_list)) {
+               struct request *rq;
+
+               rq = list_entry(local_list.next, struct request, ipi_list);
+               list_del_init(&rq->ipi_list);
+               rq->q->mq_ops->complete(rq);
+       }
 }
 
-/**
- * blk_mq_force_complete_rq() - Force complete the request, bypassing any error
- *                             injection that could drop the completion.
- * @rq: Request to be force completed
- *
- * Drivers should use blk_mq_complete_request() to complete requests in their
- * normal IO path. For timeout error recovery, drivers may call this forced
- * completion routine after they've reclaimed timed out requests to bypass
- * potentially subsequent fake timeouts.
- */
-void blk_mq_force_complete_rq(struct request *rq)
+static void blk_mq_trigger_softirq(struct request *rq)
 {
-       struct blk_mq_ctx *ctx = rq->mq_ctx;
-       struct request_queue *q = rq->q;
-       bool shared = false;
-       int cpu;
+       struct list_head *list;
+       unsigned long flags;
+
+       local_irq_save(flags);
+       list = this_cpu_ptr(&blk_cpu_done);
+       list_add_tail(&rq->ipi_list, list);
+
+       /*
+        * If the list only contains our just added request, raise the
+        * softirq.  If there are already entries, the softirq was raised
+        * earlier but has not run yet.
+        */
+       if (list->next == &rq->ipi_list)
+               raise_softirq_irqoff(BLOCK_SOFTIRQ);
+       local_irq_restore(flags);
+}
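For readers new to this path, a minimal single-threaded userspace sketch of the pattern implemented by blk_mq_trigger_softirq() and blk_done_softirq() above: completions are appended to a per-CPU list, the softirq is raised only by the entry that makes the list non-empty, and the handler detaches the whole list before walking it (blk_softirq_cpu_dead() below additionally splices a dead CPU's leftovers onto the current CPU). All names and types in the sketch are invented; the kernel uses per-CPU data, local_irq_save() and a real softirq rather than this toy code.

/*
 * Toy model of the completion-list pattern: push entries, raise the
 * "softirq" only on the empty->non-empty transition, drain by detaching
 * the whole list.  (Order is reversed here; the kernel's list_add_tail()
 * preserves it.)
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct node {
	struct node *next;
	int id;
};

static struct node *done_list;		/* stands in for this CPU's blk_cpu_done */
static bool softirq_raised;		/* stands in for raise_softirq_irqoff() */

/* producer side: roughly blk_mq_trigger_softirq() */
static void trigger_done(struct node *n)
{
	bool was_empty = (done_list == NULL);

	n->next = done_list;
	done_list = n;

	/* only the entry that made the list non-empty needs to raise */
	if (was_empty)
		softirq_raised = true;
}

/* consumer side: roughly blk_done_softirq() */
static void run_done_softirq(void)
{
	struct node *local = done_list;	/* detach the whole list ...    */

	done_list = NULL;		/* ... like list_replace_init() */
	softirq_raised = false;

	while (local) {
		struct node *n = local;

		local = local->next;
		printf("completing request %d\n", n->id);
	}
}

int main(void)
{
	struct node a = { .id = 1 }, b = { .id = 2 };

	trigger_done(&a);
	trigger_done(&b);		/* list already non-empty: no re-raise */
	if (softirq_raised)
		run_done_softirq();
	return 0;
}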
+
+static int blk_softirq_cpu_dead(unsigned int cpu)
+{
+       /*
+        * If a CPU goes away, splice its entries to the current CPU
+        * and trigger a run of the softirq
+        */
+       local_irq_disable();
+       list_splice_init(&per_cpu(blk_cpu_done, cpu),
+                        this_cpu_ptr(&blk_cpu_done));
+       raise_softirq_irqoff(BLOCK_SOFTIRQ);
+       local_irq_enable();
+
+       return 0;
+}
+
+static void __blk_mq_complete_request_remote(void *data)
+{
+       struct request *rq = data;
 
-       WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
        /*
-        * Most of single queue controllers, there is only one irq vector
-        * for handling IO completion, and the only irq's affinity is set
-        * as all possible CPUs. On most of ARCHs, this affinity means the
-        * irq is handled on one specific CPU.
+        * For most single queue controllers, there is only one irq vector
+        * for handling I/O completion, and its affinity is set to all
+        * possible CPUs.  On most architectures, this affinity means the
+        * irq is handled on one specific CPU.
         *
-        * So complete IO reqeust in softirq context in case of single queue
-        * for not degrading IO performance by irqsoff latency.
+        * So complete I/O requests in softirq context in case of single queue
+        * devices to avoid degrading I/O performance due to irqsoff latency.
         */
-       if (q->nr_hw_queues == 1) {
-               __blk_complete_request(rq);
-               return;
-       }
+       if (rq->q->nr_hw_queues == 1)
+               blk_mq_trigger_softirq(rq);
+       else
+               rq->q->mq_ops->complete(rq);
+}
+
+static inline bool blk_mq_complete_need_ipi(struct request *rq)
+{
+       int cpu = raw_smp_processor_id();
+
+       if (!IS_ENABLED(CONFIG_SMP) ||
+           !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags))
+               return false;
+
+       /* same CPU or cache domain?  Complete locally */
+       if (cpu == rq->mq_ctx->cpu ||
+           (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) &&
+            cpus_share_cache(cpu, rq->mq_ctx->cpu)))
+               return false;
+
+       /* don't try to IPI to an offline CPU */
+       return cpu_online(rq->mq_ctx->cpu);
+}
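The decision above is essentially a three-step filter. The sketch below restates it with the queue flags and topology helpers reduced to plain booleans; every name here is invented for illustration and none of it is kernel API.

/*
 * Standalone restatement of blk_mq_complete_need_ipi()'s checks with the
 * kernel context replaced by a plain struct of booleans.
 */
#include <stdbool.h>
#include <stdio.h>

struct fake_ctx {
	bool smp;		/* IS_ENABLED(CONFIG_SMP) */
	bool same_comp;		/* QUEUE_FLAG_SAME_COMP set on the queue */
	bool same_force;	/* QUEUE_FLAG_SAME_FORCE set on the queue */
	bool shares_cache;	/* cpus_share_cache(current, submit cpu) */
	bool target_online;	/* cpu_online(submit cpu) */
	int cur_cpu, target_cpu;
};

static bool need_ipi(const struct fake_ctx *c)
{
	/* no SMP, or completion on the submitting CPU was not requested */
	if (!c->smp || !c->same_comp)
		return false;

	/* same CPU, or same cache domain and SAME_FORCE not set: stay local */
	if (c->cur_cpu == c->target_cpu ||
	    (!c->same_force && c->shares_cache))
		return false;

	/* never IPI an offline CPU */
	return c->target_online;
}

int main(void)
{
	struct fake_ctx c = {
		.smp = true, .same_comp = true, .same_force = false,
		.shares_cache = false, .target_online = true,
		.cur_cpu = 0, .target_cpu = 4,
	};

	printf("need IPI: %d\n", need_ipi(&c));	/* 1: remote CPU, different cache */
	c.shares_cache = true;
	printf("need IPI: %d\n", need_ipi(&c));	/* 0: same cache domain is enough */
	return 0;
}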
+
+bool blk_mq_complete_request_remote(struct request *rq)
+{
+       WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
 
        /*
         * For a polled request, always complete locally; it's pointless
         * to redirect the completion.
         */
-       if ((rq->cmd_flags & REQ_HIPRI) ||
-           !test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) {
-               q->mq_ops->complete(rq);
-               return;
-       }
-
-       cpu = get_cpu();
-       if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
-               shared = cpus_share_cache(cpu, ctx->cpu);
+       if (rq->cmd_flags & REQ_HIPRI)
+               return false;
 
-       if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
+       if (blk_mq_complete_need_ipi(rq)) {
                rq->csd.func = __blk_mq_complete_request_remote;
                rq->csd.info = rq;
                rq->csd.flags = 0;
-               smp_call_function_single_async(ctx->cpu, &rq->csd);
+               smp_call_function_single_async(rq->mq_ctx->cpu, &rq->csd);
        } else {
-               q->mq_ops->complete(rq);
+               if (rq->q->nr_hw_queues > 1)
+                       return false;
+               blk_mq_trigger_softirq(rq);
        }
-       put_cpu();
+
+       return true;
+}
+EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote);
+
+/**
+ * blk_mq_complete_request - end I/O on a request
+ * @rq:                the request being processed
+ *
+ * Description:
+ *     Complete a request by scheduling the ->complete operation.
+ **/
+void blk_mq_complete_request(struct request *rq)
+{
+       if (!blk_mq_complete_request_remote(rq))
+               rq->q->mq_ops->complete(rq);
 }
-EXPORT_SYMBOL_GPL(blk_mq_force_complete_rq);
+EXPORT_SYMBOL(blk_mq_complete_request);
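From a driver's point of view the usual entry point is still blk_mq_complete_request(); blk_mq_complete_request_remote() is exported for drivers that want to handle the "complete locally" case themselves (for example to batch completions from a polling loop). The fragment below is a hypothetical, non-buildable illustration of the wiring, not a real driver: mydrv_pop_completed() and the other mydrv_* names are invented, while blk_mq_complete_request(), blk_mq_end_request() and the ->complete callback are the real API used above.

#include <linux/blk-mq.h>
#include <linux/interrupt.h>

/* hypothetical helper: fetch the next completed request from the device */
static struct request *mydrv_pop_completed(void *data);

/* wired up as .complete in the driver's blk_mq_ops */
static void mydrv_complete_rq(struct request *rq)
{
	/* runs on whichever CPU the completion code above selected */
	blk_mq_end_request(rq, BLK_STS_OK);
}

static irqreturn_t mydrv_irq(int irq, void *data)
{
	struct request *rq = mydrv_pop_completed(data);

	if (!rq)
		return IRQ_NONE;

	/* schedules ->complete(), possibly on the submitting CPU or via softirq */
	blk_mq_complete_request(rq);
	return IRQ_HANDLED;
}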
 
 static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
        __releases(hctx->srcu)
@@ -660,23 +719,6 @@ static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
                *srcu_idx = srcu_read_lock(hctx->srcu);
 }
 
-/**
- * blk_mq_complete_request - end I/O on a request
- * @rq:                the request being processed
- *
- * Description:
- *     Ends all I/O on a request. It does not handle partial completions.
- *     The actual completion happens out-of-order, through a IPI handler.
- **/
-bool blk_mq_complete_request(struct request *rq)
-{
-       if (unlikely(blk_should_fake_timeout(rq->q)))
-               return false;
-       blk_mq_force_complete_rq(rq);
-       return true;
-}
-EXPORT_SYMBOL(blk_mq_complete_request);
-
 /**
  * blk_mq_start_request - Start processing a request
  * @rq: Pointer to request to be started
@@ -1052,6 +1094,45 @@ static inline unsigned int queued_to_index(unsigned int queued)
        return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
 }
 
+static bool __blk_mq_get_driver_tag(struct request *rq)
+{
+       struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
+       unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
+       int tag;
+
+       blk_mq_tag_busy(rq->mq_hctx);
+
+       if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
+               bt = &rq->mq_hctx->tags->breserved_tags;
+               tag_offset = 0;
+       }
+
+       if (!hctx_may_queue(rq->mq_hctx, bt))
+               return false;
+       tag = __sbitmap_queue_get(bt);
+       if (tag == BLK_MQ_NO_TAG)
+               return false;
+
+       rq->tag = tag + tag_offset;
+       return true;
+}
+
+static bool blk_mq_get_driver_tag(struct request *rq)
+{
+       struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
+
+       if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq))
+               return false;
+
+       if ((hctx->flags & BLK_MQ_F_TAG_SHARED) &&
+                       !(rq->rq_flags & RQF_MQ_INFLIGHT)) {
+               rq->rq_flags |= RQF_MQ_INFLIGHT;
+               atomic_inc(&hctx->nr_active);
+       }
+       hctx->tags->rqs[rq->tag] = rq;
+       return true;
+}
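A toy model of the tag layout these two helpers work against: reserved tags come from their own bitmap and map onto driver tags 0..nr_reserved_tags-1, while normal tags come from the main bitmap and are shifted up by nr_reserved_tags (the tag_offset above). The sketch uses plain arrays instead of sbitmap, and all names are invented.

/*
 * Toy driver-tag allocator: two pools, with the normal pool's tags offset
 * past the reserved ones, mirroring __blk_mq_get_driver_tag() above.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_RESERVED	2
#define NR_NORMAL	6

static bool reserved_used[NR_RESERVED];
static bool normal_used[NR_NORMAL];

static int get_tag(bool reserved)
{
	bool *pool = reserved ? reserved_used : normal_used;
	int size = reserved ? NR_RESERVED : NR_NORMAL;
	int offset = reserved ? 0 : NR_RESERVED;	/* tag_offset above */

	for (int i = 0; i < size; i++) {
		if (!pool[i]) {
			pool[i] = true;
			return i + offset;		/* becomes rq->tag */
		}
	}
	return -1;					/* BLK_MQ_NO_TAG */
}

int main(void)
{
	printf("reserved tag: %d\n", get_tag(true));	/* 0 */
	printf("normal tag:   %d\n", get_tag(false));	/* 2 == 0 + NR_RESERVED */
	printf("normal tag:   %d\n", get_tag(false));	/* 3 */
	return 0;
}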
+
 static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
                                int flags, void *key)
 {
@@ -1204,25 +1285,70 @@ static void blk_mq_handle_zone_resource(struct request *rq,
        __blk_mq_requeue_request(rq);
 }
 
+enum prep_dispatch {
+       PREP_DISPATCH_OK,
+       PREP_DISPATCH_NO_TAG,
+       PREP_DISPATCH_NO_BUDGET,
+};
+
+static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq,
+                                                 bool need_budget)
+{
+       struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
+
+       if (need_budget && !blk_mq_get_dispatch_budget(rq->q)) {
+               blk_mq_put_driver_tag(rq);
+               return PREP_DISPATCH_NO_BUDGET;
+       }
+
+       if (!blk_mq_get_driver_tag(rq)) {
+               /*
+                * The initial allocation attempt failed, so we need to
+                * rerun the hardware queue when a tag is freed. The
+                * waitqueue takes care of that. If the queue is run
+                * before we add this entry back on the dispatch list,
+                * we'll re-run it below.
+                */
+               if (!blk_mq_mark_tag_wait(hctx, rq)) {
+                       /*
+                        * Budgets not acquired in this function are released
+                        * together when the partial dispatch is handled.
+                        */
+                       if (need_budget)
+                               blk_mq_put_dispatch_budget(rq->q);
+                       return PREP_DISPATCH_NO_TAG;
+               }
+       }
+
+       return PREP_DISPATCH_OK;
+}
+
+/* release all budgets allocated before calling blk_mq_dispatch_rq_list() */
+static void blk_mq_release_budgets(struct request_queue *q,
+               unsigned int nr_budgets)
+{
+       int i;
+
+       for (i = 0; i < nr_budgets; i++)
+               blk_mq_put_dispatch_budget(q);
+}
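The tri-state result and the pre-acquired budget count are consumed by blk_mq_dispatch_rq_list() below; the userspace skeleton here only illustrates that control flow (invented names, fixed fake data): dispatch stops on the first non-OK result, and the reason is kept so the caller can decide later whether a queue restart is needed.

/*
 * Skeleton of the dispatch loop's use of the tri-state prep result.
 */
#include <stdio.h>

enum prep_dispatch {
	PREP_DISPATCH_OK,
	PREP_DISPATCH_NO_TAG,
	PREP_DISPATCH_NO_BUDGET,
};

/* invented stand-in for blk_mq_prep_dispatch_rq(): fail on the third request */
static enum prep_dispatch prep(int nr)
{
	return nr < 2 ? PREP_DISPATCH_OK : PREP_DISPATCH_NO_BUDGET;
}

int main(void)
{
	enum prep_dispatch last = PREP_DISPATCH_OK;
	int queued = 0;

	for (int nr = 0; nr < 5; nr++) {
		last = prep(nr);
		if (last != PREP_DISPATCH_OK)
			break;		/* stop dispatching, leave the rest on the list */
		queued++;
	}

	printf("queued %d, stopped: %s\n", queued,
	       last == PREP_DISPATCH_NO_BUDGET ? "no budget" :
	       last == PREP_DISPATCH_NO_TAG ? "no driver tag" : "list drained");
	return 0;
}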
+
 /*
  * Returns true if we did some work AND can potentially do more.
  */
-bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
-                            bool got_budget)
+bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
+                            unsigned int nr_budgets)
 {
-       struct blk_mq_hw_ctx *hctx;
+       enum prep_dispatch prep;
+       struct request_queue *q = hctx->queue;
        struct request *rq, *nxt;
-       bool no_tag = false;
        int errors, queued;
        blk_status_t ret = BLK_STS_OK;
-       bool no_budget_avail = false;
        LIST_HEAD(zone_list);
 
        if (list_empty(list))
                return false;
 
-       WARN_ON(!list_is_singular(list) && got_budget);
-
        /*
         * Now process all the entries, sending them to the driver.
         */
@@ -1232,32 +1358,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 
                rq = list_first_entry(list, struct request, queuelist);
 
-               hctx = rq->mq_hctx;
-               if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) {
-                       blk_mq_put_driver_tag(rq);
-                       no_budget_avail = true;
+               WARN_ON_ONCE(hctx != rq->mq_hctx);
+               prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets);
+               if (prep != PREP_DISPATCH_OK)
                        break;
-               }
-
-               if (!blk_mq_get_driver_tag(rq)) {
-                       /*
-                        * The initial allocation attempt failed, so we need to
-                        * rerun the hardware queue when a tag is freed. The
-                        * waitqueue takes care of that. If the queue is run
-                        * before we add this entry back on the dispatch list,
-                        * we'll re-run it below.
-                        */
-                       if (!blk_mq_mark_tag_wait(hctx, rq)) {
-                               blk_mq_put_dispatch_budget(hctx);
-                               /*
-                                * For non-shared tags, the RESTART check
-                                * will suffice.
-                                */
-                               if (hctx->flags & BLK_MQ_F_TAG_SHARED)
-                                       no_tag = true;
-                               break;
-                       }
-               }
 
                list_del_init(&rq->queuelist);
 
@@ -1274,31 +1378,35 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
                        bd.last = !blk_mq_get_driver_tag(nxt);
                }
 
+               /*
+                * Once the request is queued to the LLD, there is no need
+                * to cover its budget any more.
+                */
+               if (nr_budgets)
+                       nr_budgets--;
                ret = q->mq_ops->queue_rq(hctx, &bd);
-               if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
-                       blk_mq_handle_dev_resource(rq, list);
+               switch (ret) {
+               case BLK_STS_OK:
+                       queued++;
                        break;
-               } else if (ret == BLK_STS_ZONE_RESOURCE) {
+               case BLK_STS_RESOURCE:
+               case BLK_STS_DEV_RESOURCE:
+                       blk_mq_handle_dev_resource(rq, list);
+                       goto out;
+               case BLK_STS_ZONE_RESOURCE:
                        /*
                         * Move the request to zone_list and keep going through
                         * the dispatch list to find more requests the drive can
                         * accept.
                         */
                        blk_mq_handle_zone_resource(rq, &zone_list);
-                       if (list_empty(list))
-                               break;
-                       continue;
-               }
-
-               if (unlikely(ret != BLK_STS_OK)) {
+                       break;
+               default:
                        errors++;
                        blk_mq_end_request(rq, BLK_STS_IOERR);
-                       continue;
                }
-
-               queued++;
        } while (!list_empty(list));
-
+out:
        if (!list_empty(&zone_list))
                list_splice_tail_init(&zone_list, list);
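For context on the switch above, this is roughly how a hypothetical driver's ->queue_rq would produce those status codes. The fragment is illustrative only: mydev and mydrv_submit() are invented, real drivers have more to do (command setup, DMA mapping, timeouts), and the choice between BLK_STS_RESOURCE and BLK_STS_DEV_RESOURCE depends on whether the driver can guarantee the queue will be rerun on its own.

#include <linux/blk-mq.h>

/* hypothetical device context and submit helper, for illustration only */
struct mydev {
	bool out_of_cmds;	/* device-side resource, freed when a command completes */
};

static bool mydrv_submit(struct mydev *dev, struct request *rq, bool last);

static blk_status_t mydrv_queue_rq(struct blk_mq_hw_ctx *hctx,
				   const struct blk_mq_queue_data *bd)
{
	struct mydev *dev = hctx->queue->queuedata;
	struct request *rq = bd->rq;

	if (dev->out_of_cmds)
		return BLK_STS_DEV_RESOURCE;	/* rerun once an in-flight command completes */

	blk_mq_start_request(rq);
	if (!mydrv_submit(dev, rq, bd->last))
		return BLK_STS_RESOURCE;	/* transient; blk-mq requeues and retries */

	return BLK_STS_OK;
}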
 
@@ -1310,6 +1418,12 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
         */
        if (!list_empty(list)) {
                bool needs_restart;
+               /* For non-shared tags, the RESTART check will suffice */
+               bool no_tag = prep == PREP_DISPATCH_NO_TAG &&
+                        (hctx->flags & BLK_MQ_F_TAG_SHARED);
+               bool no_budget_avail = prep == PREP_DISPATCH_NO_BUDGET;
+
+               blk_mq_release_budgets(q, nr_budgets);
 
                /*
                 * If we didn't flush the entire list, we could have told
@@ -1361,13 +1475,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
        } else
                blk_mq_update_dispatch_busy(hctx, false);
 
-       /*
-        * If the host/device is unable to accept more work, inform the
-        * caller of that.
-        */
-       if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
-               return false;
-
        return (queued + errors) != 0;
 }
 
@@ -1896,11 +2003,11 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
        if (q->elevator && !bypass_insert)
                goto insert;
 
-       if (!blk_mq_get_dispatch_budget(hctx))
+       if (!blk_mq_get_dispatch_budget(q))
                goto insert;
 
        if (!blk_mq_get_driver_tag(rq)) {
-               blk_mq_put_dispatch_budget(hctx);
+               blk_mq_put_dispatch_budget(q);
                goto insert;
        }
 
@@ -2005,8 +2112,7 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
 }
 
 /**
- * blk_mq_make_request - Create and send a request to block device.
- * @q: Request queue pointer.
+ * blk_mq_submit_bio - Create and send a request to block device.
  * @bio: Bio pointer.
  *
  * Builds up a request structure from @bio and sends it to the device. The
@@ -2020,8 +2126,9 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
  *
  * Returns: Request queue cookie.
  */
-blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
+blk_qc_t blk_mq_submit_bio(struct bio *bio)
 {
+       struct request_queue *q = bio->bi_disk->queue;
        const int is_sync = op_is_sync(bio->bi_opf);
        const int is_flush_fua = op_is_flush(bio->bi_opf);
        struct blk_mq_alloc_data data = {
@@ -2035,7 +2142,7 @@ blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
        blk_status_t ret;
 
        blk_queue_bounce(q, &bio);
-       __blk_queue_split(q, &bio, &nr_segs);
+       __blk_queue_split(&bio, &nr_segs);
 
        if (!bio_integrity_prep(bio))
                goto queue_exit;
@@ -2146,7 +2253,7 @@ queue_exit:
        blk_queue_exit(q);
        return BLK_QC_T_NONE;
 }
-EXPORT_SYMBOL_GPL(blk_mq_make_request); /* only for request based dm */
+EXPORT_SYMBOL_GPL(blk_mq_submit_bio); /* only for request based dm */
 
 void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
                     unsigned int hctx_idx)
@@ -2886,7 +2993,7 @@ struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
 {
        struct request_queue *uninit_q, *q;
 
-       uninit_q = __blk_alloc_queue(set->numa_node);
+       uninit_q = blk_alloc_queue(set->numa_node);
        if (!uninit_q)
                return ERR_PTR(-ENOMEM);
        uninit_q->queuedata = queuedata;
@@ -3760,6 +3867,15 @@ EXPORT_SYMBOL(blk_mq_rq_cpu);
 
 static int __init blk_mq_init(void)
 {
+       int i;
+
+       for_each_possible_cpu(i)
+               INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
+       open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
+
+       cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD,
+                                 "block/softirq:dead", NULL,
+                                 blk_softirq_cpu_dead);
        cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
                                blk_mq_hctx_notify_dead);
        cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",