blk-mq: drain I/O when all CPUs in a hctx are offline

author Ming Lei <ming.lei@redhat.com>

Fri, 29 May 2020 13:53:15 +0000 (15:53 +0200)

committer Jens Axboe <axboe@kernel.dk>

Fri, 29 May 2020 16:23:25 +0000 (10:23 -0600)
author Ming Lei <ming.lei@redhat.com>
Fri, 29 May 2020 13:53:15 +0000 (15:53 +0200)
committer Jens Axboe <axboe@kernel.dk>
Fri, 29 May 2020 16:23:25 +0000 (10:23 -0600)
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c

index 96b7a35..15df3a3 100644 (file)
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -213,6 +213,7 @@ static const char *const hctx_state_name[] = {
         HCTX_STATE_NAME(STOPPED),
         HCTX_STATE_NAME(TAG_ACTIVE),
         HCTX_STATE_NAME(SCHED_RESTART),
+       HCTX_STATE_NAME(INACTIVE),
  };
  #undef HCTX_STATE_NAME
  
@@ -239,6 +240,7 @@ static const char *const hctx_flag_name[] = {
         HCTX_FLAG_NAME(TAG_SHARED),
         HCTX_FLAG_NAME(BLOCKING),
         HCTX_FLAG_NAME(NO_SCHED),
+       HCTX_FLAG_NAME(STACKING),
  };
  #undef HCTX_FLAG_NAME
  
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c

index 762198b..96a39d0 100644 (file)
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -180,6 +180,14 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
         sbitmap_finish_wait(bt, ws, &wait);
  
  found_tag:
+       /*
+        * Give up this allocation if the hctx is inactive.  The caller will
+        * retry on an active hctx.
+        */
+       if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &data->hctx->state))) {
+               blk_mq_put_tag(tags, data->ctx, tag + tag_offset);
+               return BLK_MQ_NO_TAG;
+       }
         return tag + tag_offset;
  }
  
diff --git a/block/blk-mq.c b/block/blk-mq.c

index 560ef5d..9a36ac1 100644 (file)
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -375,14 +375,30 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
                         e->type->ops.limit_depth(data->cmd_flags, data);
         }
  
+retry:
         data->ctx = blk_mq_get_ctx(q);
         data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
         if (!(data->flags & BLK_MQ_REQ_INTERNAL))
                 blk_mq_tag_busy(data->hctx);
  
+       /*
+        * Waiting allocations only fail because of an inactive hctx.  In that
+        * case just retry the hctx assignment and tag allocation as CPU hotplug
+        * should have migrated us to an online CPU by now.
+        */
         tag = blk_mq_get_tag(data);
-       if (tag == BLK_MQ_NO_TAG)
-               return NULL;
+       if (tag == BLK_MQ_NO_TAG) {
+               if (data->flags & BLK_MQ_REQ_NOWAIT)
+                       return NULL;
+
+               /*
+                * Give up the CPU and sleep for a random short time to ensure
+                * that thread using a realtime scheduling class are migrated
+                * off the the CPU, and thus off the hctx that is going away.
+                */
+               msleep(3);
+               goto retry;
+       }
         return blk_mq_rq_ctx_init(data, tag, alloc_time_ns);
  }
  
@@ -2335,6 +2351,86 @@ fail:
         return -ENOMEM;
  }
  
+struct rq_iter_data {
+       struct blk_mq_hw_ctx *hctx;
+       bool has_rq;
+};
+
+static bool blk_mq_has_request(struct request *rq, void *data, bool reserved)
+{
+       struct rq_iter_data *iter_data = data;
+
+       if (rq->mq_hctx != iter_data->hctx)
+               return true;
+       iter_data->has_rq = true;
+       return false;
+}
+
+static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
+{
+       struct blk_mq_tags *tags = hctx->sched_tags ?
+                       hctx->sched_tags : hctx->tags;
+       struct rq_iter_data data = {
+               .hctx   = hctx,
+       };
+
+       blk_mq_all_tag_iter(tags, blk_mq_has_request, &data);
+       return data.has_rq;
+}
+
+static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu,
+               struct blk_mq_hw_ctx *hctx)
+{
+       if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu)
+               return false;
+       if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids)
+               return false;
+       return true;
+}
+
+static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
+{
+       struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
+                       struct blk_mq_hw_ctx, cpuhp_online);
+
+       if (!cpumask_test_cpu(cpu, hctx->cpumask) ||
+           !blk_mq_last_cpu_in_hctx(cpu, hctx))
+               return 0;
+
+       /*
+        * Prevent new request from being allocated on the current hctx.
+        *
+        * The smp_mb__after_atomic() Pairs with the implied barrier in
+        * test_and_set_bit_lock in sbitmap_get().  Ensures the inactive flag is
+        * seen once we return from the tag allocator.
+        */
+       set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
+       smp_mb__after_atomic();
+
+       /*
+        * Try to grab a reference to the queue and wait for any outstanding
+        * requests.  If we could not grab a reference the queue has been
+        * frozen and there are no requests.
+        */
+       if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) {
+               while (blk_mq_hctx_has_requests(hctx))
+                       msleep(5);
+               percpu_ref_put(&hctx->queue->q_usage_counter);
+       }
+
+       return 0;
+}
+
+static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
+{
+       struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
+                       struct blk_mq_hw_ctx, cpuhp_online);
+
+       if (cpumask_test_cpu(cpu, hctx->cpumask))
+               clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
+       return 0;
+}
+
  /*
   * 'cpu' is going away. splice any existing rq_list entries from this
   * software queue to the hw queue dispatch list, and ensure that it
@@ -2348,6 +2444,9 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
         enum hctx_type type;
  
         hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
+       if (!cpumask_test_cpu(cpu, hctx->cpumask))
+               return 0;
+
         ctx = __blk_mq_get_ctx(hctx->queue, cpu);
         type = hctx->type;
  
@@ -2371,6 +2470,9 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
  
  static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
  {
+       if (!(hctx->flags & BLK_MQ_F_STACKING))
+               cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
+                                                   &hctx->cpuhp_online);
         cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
                                             &hctx->cpuhp_dead);
  }
@@ -2430,6 +2532,9 @@ static int blk_mq_init_hctx(struct request_queue *q,
  {
         hctx->queue_num = hctx_idx;
  
+       if (!(hctx->flags & BLK_MQ_F_STACKING))
+               cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
+                               &hctx->cpuhp_online);
         cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
  
         hctx->tags = set->tags[hctx_idx];
@@ -3684,6 +3789,9 @@ static int __init blk_mq_init(void)
  {
         cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
                                 blk_mq_hctx_notify_dead);
+       cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",
+                               blk_mq_hctx_notify_online,
+                               blk_mq_hctx_notify_offline);
         return 0;
  }
  subsys_initcall(blk_mq_init);
diff --git a/drivers/block/loop.c b/drivers/block/loop.c

index da693e6..d7904b4 100644 (file)
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -2037,7 +2037,7 @@ static int loop_add(struct loop_device **l, int i)
         lo->tag_set.queue_depth = 128;
         lo->tag_set.numa_node = NUMA_NO_NODE;
         lo->tag_set.cmd_size = sizeof(struct loop_cmd);
-       lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+       lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING;
         lo->tag_set.driver_data = lo;
  
         err = blk_mq_alloc_tag_set(&lo->tag_set);
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c

index 3f8577e..f60c025 100644 (file)
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -547,7 +547,7 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
         md->tag_set->ops = &dm_mq_ops;
         md->tag_set->queue_depth = dm_get_blk_mq_queue_depth();
         md->tag_set->numa_node = md->numa_node_id;
-       md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE;
+       md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING;
         md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues();
         md->tag_set->driver_data = md;
  
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h

index 856bb10..d6fcae1 100644 (file)
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -140,6 +140,8 @@ struct blk_mq_hw_ctx {
          */
         atomic_t                nr_active;
  
+       /** @cpuhp_online: List to store request if CPU is going to die */
+       struct hlist_node       cpuhp_online;
         /** @cpuhp_dead: List to store request if some CPU die. */
         struct hlist_node       cpuhp_dead;
         /** @kobj: Kernel object for sysfs. */
@@ -391,6 +393,11 @@ struct blk_mq_ops {
  enum {
         BLK_MQ_F_SHOULD_MERGE   = 1 << 0,
         BLK_MQ_F_TAG_SHARED     = 1 << 1,
+       /*
+        * Set when this device requires underlying blk-mq device for
+        * completing IO:
+        */
+       BLK_MQ_F_STACKING       = 1 << 2,
         BLK_MQ_F_BLOCKING       = 1 << 5,
         BLK_MQ_F_NO_SCHED       = 1 << 6,
         BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
@@ -400,6 +407,9 @@ enum {
         BLK_MQ_S_TAG_ACTIVE     = 1,
         BLK_MQ_S_SCHED_RESTART  = 2,
  
+       /* hw queue is inactive after all its CPUs become offline */
+       BLK_MQ_S_INACTIVE       = 3,
+
         BLK_MQ_MAX_DEPTH        = 10240,
  
         BLK_MQ_CPU_WORK_BATCH   = 8,
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h

index 77d70b6..24b3a77 100644 (file)
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -152,6 +152,7 @@ enum cpuhp_state {
         CPUHP_AP_SMPBOOT_THREADS,
         CPUHP_AP_X86_VDSO_VMA_ONLINE,
         CPUHP_AP_IRQ_AFFINITY_ONLINE,
+       CPUHP_AP_BLK_MQ_ONLINE,
         CPUHP_AP_ARM_MVEBU_SYNC_CLOCKS,
         CPUHP_AP_X86_INTEL_EPB_ONLINE,
         CPUHP_AP_PERF_ONLINE,
author	Ming Lei <ming.lei@redhat.com>
	Fri, 29 May 2020 13:53:15 +0000 (15:53 +0200)
committer	Jens Axboe <axboe@kernel.dk>
	Fri, 29 May 2020 16:23:25 +0000 (10:23 -0600)
block/blk-mq-debugfs.c		patch \| blob \| history
block/blk-mq-tag.c		patch \| blob \| history
block/blk-mq.c		patch \| blob \| history
drivers/block/loop.c		patch \| blob \| history
drivers/md/dm-rq.c		patch \| blob \| history
include/linux/blk-mq.h		patch \| blob \| history
include/linux/cpuhotplug.h		patch \| blob \| history