Merge branch 'for-5.9/block' into for-5.9/block-merge
author     Jens Axboe <axboe@kernel.dk>
           Mon, 20 Jul 2020 21:38:23 +0000 (15:38 -0600)
committer  Jens Axboe <axboe@kernel.dk>
           Mon, 20 Jul 2020 21:38:23 +0000 (15:38 -0600)
* for-5.9/block: (124 commits)
  blk-cgroup: show global disk stats in root cgroup io.stat
  blk-cgroup: make iostat functions visible to stat printing
  block: improve discard bio alignment in __blkdev_issue_discard()
  block: change REQ_OP_ZONE_RESET and REQ_OP_ZONE_RESET_ALL to be odd numbers
  block: defer flush request no matter whether we have elevator
  block: make blk_timeout_init() static
  block: remove retry loop in ioc_release_fn()
  block: remove unnecessary ioc nested locking
  block: integrate bd_start_claiming into __blkdev_get
  block: use bd_prepare_to_claim directly in the loop driver
  block: refactor bd_start_claiming
  block: simplify the restart case in __blkdev_get
  Revert "blk-rq-qos: remove redundant finish_wait to rq_qos_wait."
  block: always remove partitions from blk_drop_partitions()
  block: relax jiffies rounding for timeouts
  blk-mq: remove redundant validation in __blk_mq_end_request()
  blk-mq: Remove unnecessary local variable
  writeback: remove bdi->congested_fn
  writeback: remove struct bdi_writeback_congested
  writeback: remove {set,clear}_wb_congested
  ...

20 files changed:
Documentation/admin-guide/cgroup-v2.rst
block/blk-mq-debugfs.c
block/blk-mq.c
drivers/block/nbd.c
drivers/block/virtio_blk.c
drivers/block/zram/zram_drv.c
drivers/md/dm-rq.c
drivers/md/dm-writecache.c
drivers/md/dm-zoned-target.c
drivers/md/dm.c
drivers/nvme/host/core.c
drivers/nvme/host/multipath.c
drivers/nvme/host/nvme.h
drivers/nvme/host/pci.c
drivers/nvme/host/rdma.c
drivers/nvme/host/tcp.c
drivers/nvme/target/loop.c
fs/btrfs/disk-io.c
include/linux/blkdev.h
include/linux/fs.h

@@@ -1356,8 -1356,8 +1356,8 @@@ PAGE_SIZE multiple when read back
  
          thp_fault_alloc
                Number of transparent hugepages which were allocated to satisfy
 -              a page fault, including COW faults. This counter is not present
 -              when CONFIG_TRANSPARENT_HUGEPAGE is not set.
 +              a page fault. This counter is not present when CONFIG_TRANSPARENT_HUGEPAGE
 +                is not set.
  
          thp_collapse_alloc
                Number of transparent hugepages which were allocated to allow
@@@ -1483,8 -1483,7 +1483,7 @@@ IO Interface File
  ~~~~~~~~~~~~~~~~~~
  
    io.stat
-       A read-only nested-keyed file which exists on non-root
-       cgroups.
+       A read-only nested-keyed file.
  
        Lines are keyed by $MAJ:$MIN device numbers and not ordered.
        The following nested keys are defined.
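
   For reference, each io.stat line pairs a $MAJ:$MIN key with the nested keys
   documented here (rbytes, wbytes, rios, wios, dbytes, dios). The two lines
   below are only an illustrative sample, not output taken from this series:

       8:16 rbytes=1459200 wbytes=314773504 rios=192 wios=353 dbytes=0 dios=0
       8:0 rbytes=90430464 wbytes=299008000 rios=8950 wios=1252 dbytes=50331648 dios=3021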
diff --combined block/blk-mq-debugfs.c
@@@ -125,9 -125,6 +125,9 @@@ static const char *const blk_queue_flag
        QUEUE_FLAG_NAME(REGISTERED),
        QUEUE_FLAG_NAME(SCSI_PASSTHROUGH),
        QUEUE_FLAG_NAME(QUIESCED),
 +      QUEUE_FLAG_NAME(PCI_P2PDMA),
 +      QUEUE_FLAG_NAME(ZONE_RESETALL),
 +      QUEUE_FLAG_NAME(RQ_ALLOC_TIME),
  };
  #undef QUEUE_FLAG_NAME
  
@@@ -404,8 -401,7 +404,7 @@@ static bool hctx_show_busy_rq(struct re
        const struct show_busy_params *params = data;
  
        if (rq->mq_hctx == params->hctx)
-               __blk_mq_debugfs_rq_show(params->m,
-                                        list_entry_rq(&rq->queuelist));
+               __blk_mq_debugfs_rq_show(params->m, rq);
  
        return true;
  }
@@@ -827,9 -823,6 +826,6 @@@ void blk_mq_debugfs_register(struct req
        struct blk_mq_hw_ctx *hctx;
        int i;
  
-       q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent),
-                                           blk_debugfs_root);
        debugfs_create_files(q->debugfs_dir, q, blk_mq_debugfs_queue_attrs);
  
        /*
  
  void blk_mq_debugfs_unregister(struct request_queue *q)
  {
-       debugfs_remove_recursive(q->debugfs_dir);
        q->sched_debugfs_dir = NULL;
-       q->debugfs_dir = NULL;
  }
  
  static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx,
diff --combined block/blk-mq.c
@@@ -41,6 -41,8 +41,8 @@@
  #include "blk-mq-sched.h"
  #include "blk-rq-qos.h"
  
+ static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
  static void blk_mq_poll_stats_start(struct request_queue *q);
  static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
  
@@@ -275,26 -277,20 +277,20 @@@ static struct request *blk_mq_rq_ctx_in
  {
        struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
        struct request *rq = tags->static_rqs[tag];
-       req_flags_t rq_flags = 0;
  
-       if (data->flags & BLK_MQ_REQ_INTERNAL) {
+       if (data->q->elevator) {
                rq->tag = BLK_MQ_NO_TAG;
                rq->internal_tag = tag;
        } else {
-               if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
-                       rq_flags = RQF_MQ_INFLIGHT;
-                       atomic_inc(&data->hctx->nr_active);
-               }
                rq->tag = tag;
                rq->internal_tag = BLK_MQ_NO_TAG;
-               data->hctx->tags->rqs[rq->tag] = rq;
        }
  
        /* csd/requeue_work/fifo_time is initialized before use */
        rq->q = data->q;
        rq->mq_ctx = data->ctx;
        rq->mq_hctx = data->hctx;
-       rq->rq_flags = rq_flags;
+       rq->rq_flags = 0;
        rq->cmd_flags = data->cmd_flags;
        if (data->flags & BLK_MQ_REQ_PREEMPT)
                rq->rq_flags |= RQF_PREEMPT;
@@@ -362,8 -358,6 +358,6 @@@ static struct request *__blk_mq_alloc_r
                data->flags |= BLK_MQ_REQ_NOWAIT;
  
        if (e) {
-               data->flags |= BLK_MQ_REQ_INTERNAL;
                /*
                 * Flush requests are special and go directly to the
                 * dispatch list. Don't include reserved tags in the
  retry:
        data->ctx = blk_mq_get_ctx(q);
        data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
-       if (!(data->flags & BLK_MQ_REQ_INTERNAL))
+       if (!e)
                blk_mq_tag_busy(data->hctx);
  
        /*
@@@ -474,9 -468,7 +468,7 @@@ struct request *blk_mq_alloc_request_hc
        cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
        data.ctx = __blk_mq_get_ctx(q, cpu);
  
-       if (q->elevator)
-               data.flags |= BLK_MQ_REQ_INTERNAL;
-       else
+       if (!q->elevator)
                blk_mq_tag_busy(data.hctx);
  
        ret = -EWOULDBLOCK;
@@@ -552,8 -544,7 +544,7 @@@ inline void __blk_mq_end_request(struc
                blk_stat_add(rq, now);
        }
  
-       if (rq->internal_tag != BLK_MQ_NO_TAG)
-               blk_mq_sched_completed_request(rq, now);
+       blk_mq_sched_completed_request(rq, now);
  
        blk_account_io_done(rq, now);
  
@@@ -574,71 -565,139 +565,139 @@@ void blk_mq_end_request(struct request 
  }
  EXPORT_SYMBOL(blk_mq_end_request);
  
- static void __blk_mq_complete_request_remote(void *data)
+ /*
+  * Softirq action handler - move entries to local list and loop over them
+  * while passing them to the queue registered handler.
+  */
+ static __latent_entropy void blk_done_softirq(struct softirq_action *h)
  {
-       struct request *rq = data;
-       struct request_queue *q = rq->q;
+       struct list_head *cpu_list, local_list;
  
-       q->mq_ops->complete(rq);
+       local_irq_disable();
+       cpu_list = this_cpu_ptr(&blk_cpu_done);
+       list_replace_init(cpu_list, &local_list);
+       local_irq_enable();
+       while (!list_empty(&local_list)) {
+               struct request *rq;
+               rq = list_entry(local_list.next, struct request, ipi_list);
+               list_del_init(&rq->ipi_list);
+               rq->q->mq_ops->complete(rq);
+       }
  }
  
- /**
-  * blk_mq_force_complete_rq() - Force complete the request, bypassing any error
-  *                            injection that could drop the completion.
-  * @rq: Request to be force completed
-  *
-  * Drivers should use blk_mq_complete_request() to complete requests in their
-  * normal IO path. For timeout error recovery, drivers may call this forced
-  * completion routine after they've reclaimed timed out requests to bypass
-  * potentially subsequent fake timeouts.
-  */
- void blk_mq_force_complete_rq(struct request *rq)
+ static void blk_mq_trigger_softirq(struct request *rq)
  {
-       struct blk_mq_ctx *ctx = rq->mq_ctx;
-       struct request_queue *q = rq->q;
-       bool shared = false;
-       int cpu;
+       struct list_head *list;
+       unsigned long flags;
+       local_irq_save(flags);
+       list = this_cpu_ptr(&blk_cpu_done);
+       list_add_tail(&rq->ipi_list, list);
+       /*
+        * If the list only contains our just added request, signal a raise of
+        * the softirq.  If there are already entries there, someone already
+        * raised the irq but it hasn't run yet.
+        */
+       if (list->next == &rq->ipi_list)
+               raise_softirq_irqoff(BLOCK_SOFTIRQ);
+       local_irq_restore(flags);
+ }
+ static int blk_softirq_cpu_dead(unsigned int cpu)
+ {
+       /*
+        * If a CPU goes away, splice its entries to the current CPU
+        * and trigger a run of the softirq
+        */
+       local_irq_disable();
+       list_splice_init(&per_cpu(blk_cpu_done, cpu),
+                        this_cpu_ptr(&blk_cpu_done));
+       raise_softirq_irqoff(BLOCK_SOFTIRQ);
+       local_irq_enable();
+       return 0;
+ }
+ static void __blk_mq_complete_request_remote(void *data)
+ {
+       struct request *rq = data;
  
-       WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
        /*
-        * Most of single queue controllers, there is only one irq vector
-        * for handling IO completion, and the only irq's affinity is set
-        * as all possible CPUs. On most of ARCHs, this affinity means the
-        * irq is handled on one specific CPU.
+        * For most of single queue controllers, there is only one irq vector
+        * for handling I/O completion, and the only irq's affinity is set
+        * to all possible CPUs.  On most of ARCHs, this affinity means the irq
+        * is handled on one specific CPU.
         *
-        * So complete IO reqeust in softirq context in case of single queue
-        * for not degrading IO performance by irqsoff latency.
+        * So complete I/O requests in softirq context in case of single queue
+        * devices to avoid degrading I/O performance due to irqsoff latency.
         */
-       if (q->nr_hw_queues == 1) {
-               __blk_complete_request(rq);
-               return;
-       }
+       if (rq->q->nr_hw_queues == 1)
+               blk_mq_trigger_softirq(rq);
+       else
+               rq->q->mq_ops->complete(rq);
+ }
+ static inline bool blk_mq_complete_need_ipi(struct request *rq)
+ {
+       int cpu = raw_smp_processor_id();
+       if (!IS_ENABLED(CONFIG_SMP) ||
+           !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags))
+               return false;
+       /* same CPU or cache domain?  Complete locally */
+       if (cpu == rq->mq_ctx->cpu ||
+           (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) &&
+            cpus_share_cache(cpu, rq->mq_ctx->cpu)))
+               return false;
+       /* don't try to IPI to an offline CPU */
+       return cpu_online(rq->mq_ctx->cpu);
+ }
+ bool blk_mq_complete_request_remote(struct request *rq)
+ {
+       WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
  
        /*
          * For a polled request, always complete locally, it's pointless
         * to redirect the completion.
         */
-       if ((rq->cmd_flags & REQ_HIPRI) ||
-           !test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) {
-               q->mq_ops->complete(rq);
-               return;
-       }
-       cpu = get_cpu();
-       if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
-               shared = cpus_share_cache(cpu, ctx->cpu);
+       if (rq->cmd_flags & REQ_HIPRI)
+               return false;
  
-       if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
+       if (blk_mq_complete_need_ipi(rq)) {
                rq->csd.func = __blk_mq_complete_request_remote;
                rq->csd.info = rq;
                rq->csd.flags = 0;
-               smp_call_function_single_async(ctx->cpu, &rq->csd);
+               smp_call_function_single_async(rq->mq_ctx->cpu, &rq->csd);
        } else {
-               q->mq_ops->complete(rq);
+               if (rq->q->nr_hw_queues > 1)
+                       return false;
+               blk_mq_trigger_softirq(rq);
        }
-       put_cpu();
+       return true;
+ }
+ EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote);
+ /**
+  * blk_mq_complete_request - end I/O on a request
+  * @rq:               the request being processed
+  *
+  * Description:
+  *    Complete a request by scheduling the ->complete_rq operation.
+  **/
+ void blk_mq_complete_request(struct request *rq)
+ {
+       if (!blk_mq_complete_request_remote(rq))
+               rq->q->mq_ops->complete(rq);
  }
- EXPORT_SYMBOL_GPL(blk_mq_force_complete_rq);
+ EXPORT_SYMBOL(blk_mq_complete_request);
  
  static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
        __releases(hctx->srcu)
@@@ -660,23 -719,6 +719,6 @@@ static void hctx_lock(struct blk_mq_hw_
                *srcu_idx = srcu_read_lock(hctx->srcu);
  }
  
- /**
-  * blk_mq_complete_request - end I/O on a request
-  * @rq:               the request being processed
-  *
-  * Description:
-  *    Ends all I/O on a request. It does not handle partial completions.
-  *    The actual completion happens out-of-order, through a IPI handler.
-  **/
- bool blk_mq_complete_request(struct request *rq)
- {
-       if (unlikely(blk_should_fake_timeout(rq->q)))
-               return false;
-       blk_mq_force_complete_rq(rq);
-       return true;
- }
- EXPORT_SYMBOL(blk_mq_complete_request);
  /**
   * blk_mq_start_request - Start processing a request
   * @rq: Pointer to request to be started
@@@ -828,10 -870,10 +870,10 @@@ static bool blk_mq_rq_inflight(struct b
                               void *priv, bool reserved)
  {
        /*
 -       * If we find a request that is inflight and the queue matches,
 +       * If we find a request that isn't idle and the queue matches,
         * we know the queue is busy. Return false to stop the iteration.
         */
 -      if (rq->state == MQ_RQ_IN_FLIGHT && rq->q == hctx->queue) {
 +      if (blk_mq_request_started(rq) && rq->q == hctx->queue) {
                bool *busy = priv;
  
                *busy = true;
@@@ -1052,6 -1094,45 +1094,45 @@@ static inline unsigned int queued_to_in
        return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
  }
  
+ static bool __blk_mq_get_driver_tag(struct request *rq)
+ {
+       struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
+       unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
+       int tag;
+       blk_mq_tag_busy(rq->mq_hctx);
+       if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
+               bt = &rq->mq_hctx->tags->breserved_tags;
+               tag_offset = 0;
+       }
+       if (!hctx_may_queue(rq->mq_hctx, bt))
+               return false;
+       tag = __sbitmap_queue_get(bt);
+       if (tag == BLK_MQ_NO_TAG)
+               return false;
+       rq->tag = tag + tag_offset;
+       return true;
+ }
+ static bool blk_mq_get_driver_tag(struct request *rq)
+ {
+       struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
+       if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq))
+               return false;
+       if ((hctx->flags & BLK_MQ_F_TAG_SHARED) &&
+                       !(rq->rq_flags & RQF_MQ_INFLIGHT)) {
+               rq->rq_flags |= RQF_MQ_INFLIGHT;
+               atomic_inc(&hctx->nr_active);
+       }
+       hctx->tags->rqs[rq->tag] = rq;
+       return true;
+ }
  static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
                                int flags, void *key)
  {
@@@ -1204,25 -1285,70 +1285,70 @@@ static void blk_mq_handle_zone_resource
        __blk_mq_requeue_request(rq);
  }
  
+ enum prep_dispatch {
+       PREP_DISPATCH_OK,
+       PREP_DISPATCH_NO_TAG,
+       PREP_DISPATCH_NO_BUDGET,
+ };
+ static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq,
+                                                 bool need_budget)
+ {
+       struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
+       if (need_budget && !blk_mq_get_dispatch_budget(rq->q)) {
+               blk_mq_put_driver_tag(rq);
+               return PREP_DISPATCH_NO_BUDGET;
+       }
+       if (!blk_mq_get_driver_tag(rq)) {
+               /*
+                * The initial allocation attempt failed, so we need to
+                * rerun the hardware queue when a tag is freed. The
+                * waitqueue takes care of that. If the queue is run
+                * before we add this entry back on the dispatch list,
+                * we'll re-run it below.
+                */
+               if (!blk_mq_mark_tag_wait(hctx, rq)) {
+                       /*
+                        * All budgets not got from this function will be put
+                        * together during handling partial dispatch
+                        */
+                       if (need_budget)
+                               blk_mq_put_dispatch_budget(rq->q);
+                       return PREP_DISPATCH_NO_TAG;
+               }
+       }
+       return PREP_DISPATCH_OK;
+ }
+ /* release all allocated budgets before calling to blk_mq_dispatch_rq_list */
+ static void blk_mq_release_budgets(struct request_queue *q,
+               unsigned int nr_budgets)
+ {
+       int i;
+       for (i = 0; i < nr_budgets; i++)
+               blk_mq_put_dispatch_budget(q);
+ }
  /*
   * Returns true if we did some work AND can potentially do more.
   */
- bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
-                            bool got_budget)
+ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
+                            unsigned int nr_budgets)
  {
-       struct blk_mq_hw_ctx *hctx;
+       enum prep_dispatch prep;
+       struct request_queue *q = hctx->queue;
        struct request *rq, *nxt;
-       bool no_tag = false;
        int errors, queued;
        blk_status_t ret = BLK_STS_OK;
-       bool no_budget_avail = false;
        LIST_HEAD(zone_list);
  
        if (list_empty(list))
                return false;
  
-       WARN_ON(!list_is_singular(list) && got_budget);
        /*
         * Now process all the entries, sending them to the driver.
         */
  
                rq = list_first_entry(list, struct request, queuelist);
  
-               hctx = rq->mq_hctx;
-               if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) {
-                       blk_mq_put_driver_tag(rq);
-                       no_budget_avail = true;
+               WARN_ON_ONCE(hctx != rq->mq_hctx);
+               prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets);
+               if (prep != PREP_DISPATCH_OK)
                        break;
-               }
-               if (!blk_mq_get_driver_tag(rq)) {
-                       /*
-                        * The initial allocation attempt failed, so we need to
-                        * rerun the hardware queue when a tag is freed. The
-                        * waitqueue takes care of that. If the queue is run
-                        * before we add this entry back on the dispatch list,
-                        * we'll re-run it below.
-                        */
-                       if (!blk_mq_mark_tag_wait(hctx, rq)) {
-                               blk_mq_put_dispatch_budget(hctx);
-                               /*
-                                * For non-shared tags, the RESTART check
-                                * will suffice.
-                                */
-                               if (hctx->flags & BLK_MQ_F_TAG_SHARED)
-                                       no_tag = true;
-                               break;
-                       }
-               }
  
                list_del_init(&rq->queuelist);
  
                        bd.last = !blk_mq_get_driver_tag(nxt);
                }
  
+               /*
+                * once the request is queued to lld, no need to cover the
+                * budget any more
+                */
+               if (nr_budgets)
+                       nr_budgets--;
                ret = q->mq_ops->queue_rq(hctx, &bd);
-               if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
-                       blk_mq_handle_dev_resource(rq, list);
+               switch (ret) {
+               case BLK_STS_OK:
+                       queued++;
                        break;
-               } else if (ret == BLK_STS_ZONE_RESOURCE) {
+               case BLK_STS_RESOURCE:
+               case BLK_STS_DEV_RESOURCE:
+                       blk_mq_handle_dev_resource(rq, list);
+                       goto out;
+               case BLK_STS_ZONE_RESOURCE:
                        /*
                         * Move the request to zone_list and keep going through
                         * the dispatch list to find more requests the drive can
                         * accept.
                         */
                        blk_mq_handle_zone_resource(rq, &zone_list);
-                       if (list_empty(list))
-                               break;
-                       continue;
-               }
-               if (unlikely(ret != BLK_STS_OK)) {
+                       break;
+               default:
                        errors++;
                        blk_mq_end_request(rq, BLK_STS_IOERR);
-                       continue;
                }
-               queued++;
        } while (!list_empty(list));
+ out:
        if (!list_empty(&zone_list))
                list_splice_tail_init(&zone_list, list);
  
         */
        if (!list_empty(list)) {
                bool needs_restart;
+               /* For non-shared tags, the RESTART check will suffice */
+               bool no_tag = prep == PREP_DISPATCH_NO_TAG &&
+                         (hctx->flags & BLK_MQ_F_TAG_SHARED);
+               bool no_budget_avail = prep == PREP_DISPATCH_NO_BUDGET;
+               blk_mq_release_budgets(q, nr_budgets);
  
                /*
                 * If we didn't flush the entire list, we could have told
        } else
                blk_mq_update_dispatch_busy(hctx, false);
  
-       /*
-        * If the host/device is unable to accept more work, inform the
-        * caller of that.
-        */
-       if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
-               return false;
        return (queued + errors) != 0;
  }
  
@@@ -1896,11 -2003,11 +2003,11 @@@ static blk_status_t __blk_mq_try_issue_
        if (q->elevator && !bypass_insert)
                goto insert;
  
-       if (!blk_mq_get_dispatch_budget(hctx))
+       if (!blk_mq_get_dispatch_budget(q))
                goto insert;
  
        if (!blk_mq_get_driver_tag(rq)) {
-               blk_mq_put_dispatch_budget(hctx);
+               blk_mq_put_dispatch_budget(q);
                goto insert;
        }
  
@@@ -2005,8 -2112,7 +2112,7 @@@ static void blk_add_rq_to_plug(struct b
  }
  
  /**
-  * blk_mq_make_request - Create and send a request to block device.
-  * @q: Request queue pointer.
+  * blk_mq_submit_bio - Create and send a request to block device.
   * @bio: Bio pointer.
   *
   * Builds up a request structure from @q and @bio and send to the device. The
   *
   * Returns: Request queue cookie.
   */
- blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
+ blk_qc_t blk_mq_submit_bio(struct bio *bio)
  {
+       struct request_queue *q = bio->bi_disk->queue;
        const int is_sync = op_is_sync(bio->bi_opf);
        const int is_flush_fua = op_is_flush(bio->bi_opf);
        struct blk_mq_alloc_data data = {
        blk_status_t ret;
  
        blk_queue_bounce(q, &bio);
-       __blk_queue_split(q, &bio, &nr_segs);
+       __blk_queue_split(&bio, &nr_segs);
  
        if (!bio_integrity_prep(bio))
                goto queue_exit;
@@@ -2146,7 -2253,7 +2253,7 @@@ queue_exit
        blk_queue_exit(q);
        return BLK_QC_T_NONE;
  }
- EXPORT_SYMBOL_GPL(blk_mq_make_request); /* only for request based dm */
+ EXPORT_SYMBOL_GPL(blk_mq_submit_bio); /* only for request based dm */
  
  void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
                     unsigned int hctx_idx)
@@@ -2886,7 -2993,7 +2993,7 @@@ struct request_queue *blk_mq_init_queue
  {
        struct request_queue *uninit_q, *q;
  
-       uninit_q = __blk_alloc_queue(set->numa_node);
+       uninit_q = blk_alloc_queue(set->numa_node);
        if (!uninit_q)
                return ERR_PTR(-ENOMEM);
        uninit_q->queuedata = queuedata;
@@@ -3760,6 -3867,15 +3867,15 @@@ EXPORT_SYMBOL(blk_mq_rq_cpu)
  
  static int __init blk_mq_init(void)
  {
+       int i;
+       for_each_possible_cpu(i)
+               INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
+       open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
+       cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD,
+                                 "block/softirq:dead", NULL,
+                                 blk_softirq_cpu_dead);
        cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
                                blk_mq_hctx_notify_dead);
        cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",
diff --combined drivers/block/nbd.c
@@@ -784,6 -784,7 +784,7 @@@ static void recv_work(struct work_struc
        struct nbd_device *nbd = args->nbd;
        struct nbd_config *config = nbd->config;
        struct nbd_cmd *cmd;
+       struct request *rq;
  
        while (1) {
                cmd = nbd_read_stat(nbd, args->index);
                        break;
                }
  
-               blk_mq_complete_request(blk_mq_rq_from_pdu(cmd));
+               rq = blk_mq_rq_from_pdu(cmd);
+               if (likely(!blk_should_fake_timeout(rq->q)))
+                       blk_mq_complete_request(rq);
        }
        atomic_dec(&config->recv_threads);
        wake_up(&config->recv_wq);
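
   Taken together with the blk-mq.c hunks above, the recv_work() change shows the
   new completion contract: the driver filters fake timeouts itself and calls
   blk_mq_complete_request(), which may bounce the work to the submitting CPU via
   IPI or softirq before invoking the driver's ->complete callback. A minimal
   sketch of that driver-side pattern follows; the mydrv_* names are hypothetical
   and only mark the calls touched by this series:

       /* Hypothetical driver fragment; mydrv_* names are illustrative only. */
       static void mydrv_complete_rq(struct request *rq)       /* blk_mq_ops->complete */
       {
               blk_mq_end_request(rq, BLK_STS_OK);     /* a real driver maps its HW status here */
       }

       static irqreturn_t mydrv_irq(int irq, void *data)
       {
               struct request *rq = mydrv_next_completed(data); /* hypothetical helper */

               /* fake-timeout filtering now lives in the driver, as in recv_work() above */
               if (likely(!blk_should_fake_timeout(rq->q)))
                       blk_mq_complete_request(rq);
               return IRQ_HANDLED;
       }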
@@@ -1033,26 -1036,25 +1036,26 @@@ static int nbd_add_socket(struct nbd_de
             test_bit(NBD_RT_BOUND, &config->runtime_flags))) {
                dev_err(disk_to_dev(nbd->disk),
                        "Device being setup by another task");
 -              sockfd_put(sock);
 -              return -EBUSY;
 +              err = -EBUSY;
 +              goto put_socket;
 +      }
 +
 +      nsock = kzalloc(sizeof(*nsock), GFP_KERNEL);
 +      if (!nsock) {
 +              err = -ENOMEM;
 +              goto put_socket;
        }
  
        socks = krealloc(config->socks, (config->num_connections + 1) *
                         sizeof(struct nbd_sock *), GFP_KERNEL);
        if (!socks) {
 -              sockfd_put(sock);
 -              return -ENOMEM;
 +              kfree(nsock);
 +              err = -ENOMEM;
 +              goto put_socket;
        }
  
        config->socks = socks;
  
 -      nsock = kzalloc(sizeof(struct nbd_sock), GFP_KERNEL);
 -      if (!nsock) {
 -              sockfd_put(sock);
 -              return -ENOMEM;
 -      }
 -
        nsock->fallback_index = -1;
        nsock->dead = false;
        mutex_init(&nsock->tx_lock);
        atomic_inc(&config->live_connections);
  
        return 0;
 +
 +put_socket:
 +      sockfd_put(sock);
 +      return err;
  }
  
  static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)
@@@ -171,7 -171,8 +171,8 @@@ static void virtblk_done(struct virtque
                while ((vbr = virtqueue_get_buf(vblk->vqs[qid].vq, &len)) != NULL) {
                        struct request *req = blk_mq_rq_from_pdu(vbr);
  
-                       blk_mq_complete_request(req);
+                       if (likely(!blk_should_fake_timeout(req->q)))
+                               blk_mq_complete_request(req);
                        req_done = true;
                }
                if (unlikely(virtqueue_is_broken(vq)))
@@@ -878,7 -879,6 +879,7 @@@ out_put_disk
        put_disk(vblk->disk);
  out_free_vq:
        vdev->config->del_vqs(vdev);
 +      kfree(vblk->vqs);
  out_free_vblk:
        kfree(vblk);
  out_free_index:
@@@ -793,9 -793,9 +793,9 @@@ static void zram_sync_read(struct work_
  }
  
  /*
-  * Block layer want one ->make_request_fn to be active at a time
-  * so if we use chained IO with parent IO in same context,
-  * it's a deadlock. To avoid, it, it uses worker thread context.
+  * Block layer want one ->submit_bio to be active at a time, so if we use
+  * chained IO with parent IO in same context, it's a deadlock. To avoid that,
+  * use a worker thread context.
   */
  static int read_from_bdev_sync(struct zram *zram, struct bio_vec *bvec,
                                unsigned long entry, struct bio *bio)
@@@ -1584,9 -1584,9 +1584,9 @@@ static void __zram_make_request(struct 
  /*
   * Handler function for all zram I/O requests.
   */
- static blk_qc_t zram_make_request(struct request_queue *queue, struct bio *bio)
+ static blk_qc_t zram_submit_bio(struct bio *bio)
  {
-       struct zram *zram = queue->queuedata;
+       struct zram *zram = bio->bi_disk->private_data;
  
        if (!valid_io_request(zram, bio->bi_iter.bi_sector,
                                        bio->bi_iter.bi_size)) {
@@@ -1813,6 -1813,7 +1813,7 @@@ static int zram_open(struct block_devic
  
  static const struct block_device_operations zram_devops = {
        .open = zram_open,
+       .submit_bio = zram_submit_bio,
        .swap_slot_free_notify = zram_slot_free_notify,
        .rw_page = zram_rw_page,
        .owner = THIS_MODULE
@@@ -1891,7 -1892,7 +1892,7 @@@ static int zram_add(void
  #ifdef CONFIG_ZRAM_WRITEBACK
        spin_lock_init(&zram->wb_limit_lock);
  #endif
-       queue = blk_alloc_queue(zram_make_request, NUMA_NO_NODE);
+       queue = blk_alloc_queue(NUMA_NO_NODE);
        if (!queue) {
                pr_err("Error allocating disk queue for device %d\n",
                        device_id);
        zram->disk->first_minor = device_id;
        zram->disk->fops = &zram_devops;
        zram->disk->queue = queue;
-       zram->disk->queue->queuedata = zram;
        zram->disk->private_data = zram;
        snprintf(zram->disk->disk_name, 16, "zram%d", device_id);
  
@@@ -2021,8 -2021,7 +2021,8 @@@ static ssize_t hot_add_show(struct clas
                return ret;
        return scnprintf(buf, PAGE_SIZE, "%d\n", ret);
  }
 -static CLASS_ATTR_RO(hot_add);
 +static struct class_attribute class_attr_hot_add =
 +      __ATTR(hot_add, 0400, hot_add_show, NULL);
  
  static ssize_t hot_remove_store(struct class *class,
                        struct class_attribute *attr,
diff --combined drivers/md/dm-rq.c
@@@ -146,6 -146,10 +146,6 @@@ static void rq_end_stats(struct mapped_
   */
  static void rq_completed(struct mapped_device *md)
  {
 -      /* nudge anyone waiting on suspend queue */
 -      if (unlikely(wq_has_sleeper(&md->wait)))
 -              wake_up(&md->wait);
 -
        /*
         * dm_put() must be at the end of this function. See the comment above
         */
@@@ -284,7 -288,8 +284,8 @@@ static void dm_complete_request(struct 
        struct dm_rq_target_io *tio = tio_from_request(rq);
  
        tio->error = error;
-       blk_mq_complete_request(rq);
+       if (likely(!blk_should_fake_timeout(rq->q)))
+               blk_mq_complete_request(rq);
  }
  
  /*
@@@ -282,8 -282,6 +282,8 @@@ static int persistent_memory_claim(stru
                        while (daa-- && i < p) {
                                pages[i++] = pfn_t_to_page(pfn);
                                pfn.val++;
 +                              if (!(i & 15))
 +                                      cond_resched();
                        }
                } while (i < p);
                wc->memory_map = vmap(pages, p, VM_MAP, PAGE_KERNEL);
@@@ -851,14 -849,10 +851,14 @@@ static void writecache_discard(struct d
  
                if (likely(!e->write_in_progress)) {
                        if (!discarded_something) {
 -                              writecache_wait_for_ios(wc, READ);
 -                              writecache_wait_for_ios(wc, WRITE);
 +                              if (!WC_MODE_PMEM(wc)) {
 +                                      writecache_wait_for_ios(wc, READ);
 +                                      writecache_wait_for_ios(wc, WRITE);
 +                              }
                                discarded_something = true;
                        }
 +                      if (!writecache_entry_is_committed(wc, e))
 +                              wc->uncommitted_blocks--;
                        writecache_free_entry(wc, e);
                }
  
@@@ -1244,7 -1238,7 +1244,7 @@@ static int writecache_flush_thread(voi
                                           bio_end_sector(bio));
                        wc_unlock(wc);
                        bio_set_dev(bio, wc->dev->bdev);
-                       generic_make_request(bio);
+                       submit_bio_noacct(bio);
                } else {
                        writecache_flush(wc);
                        wc_unlock(wc);
@@@ -2266,12 -2260,6 +2266,12 @@@ invalid_optional
        }
  
        if (WC_MODE_PMEM(wc)) {
 +              if (!dax_synchronous(wc->ssd_dev->dax_dev)) {
 +                      r = -EOPNOTSUPP;
 +                      ti->error = "Asynchronous persistent memory not supported as pmem cache";
 +                      goto bad;
 +              }
 +
                r = persistent_memory_claim(wc);
                if (r) {
                        ti->error = "Unable to map persistent memory for cache";
@@@ -140,7 -140,7 +140,7 @@@ static int dmz_submit_bio(struct dmz_ta
        bio_advance(bio, clone->bi_iter.bi_size);
  
        refcount_inc(&bioctx->ref);
-       generic_make_request(clone);
+       submit_bio_noacct(clone);
  
        if (bio_op(bio) == REQ_OP_WRITE && dmz_is_seq(zone))
                zone->wp_block += nr_blocks;
@@@ -400,7 -400,15 +400,7 @@@ static void dmz_handle_bio(struct dmz_t
                dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
        struct dmz_metadata *zmd = dmz->metadata;
        struct dm_zone *zone;
 -      int i, ret;
 -
 -      /*
 -       * Write may trigger a zone allocation. So make sure the
 -       * allocation can succeed.
 -       */
 -      if (bio_op(bio) == REQ_OP_WRITE)
 -              for (i = 0; i < dmz->nr_ddevs; i++)
 -                      dmz_schedule_reclaim(dmz->dev[i].reclaim);
 +      int ret;
  
        dmz_lock_metadata(zmd);
  
@@@ -882,7 -890,7 +882,7 @@@ static int dmz_ctr(struct dm_target *ti
        }
  
        /* Set target (no write same support) */
 -      ti->max_io_len = dmz_zone_nr_sectors(dmz->metadata) << 9;
 +      ti->max_io_len = dmz_zone_nr_sectors(dmz->metadata);
        ti->num_flush_bios = 1;
        ti->num_discard_bios = 1;
        ti->num_write_zeroes_bios = 1;
diff --combined drivers/md/dm.c
@@@ -12,7 -12,6 +12,7 @@@
  #include <linux/init.h>
  #include <linux/module.h>
  #include <linux/mutex.h>
 +#include <linux/sched/mm.h>
  #include <linux/sched/signal.h>
  #include <linux/blkpg.h>
  #include <linux/bio.h>
@@@ -655,6 -654,28 +655,6 @@@ static void free_tio(struct dm_target_i
        bio_put(&tio->clone);
  }
  
 -static bool md_in_flight_bios(struct mapped_device *md)
 -{
 -      int cpu;
 -      struct hd_struct *part = &dm_disk(md)->part0;
 -      long sum = 0;
 -
 -      for_each_possible_cpu(cpu) {
 -              sum += part_stat_local_read_cpu(part, in_flight[0], cpu);
 -              sum += part_stat_local_read_cpu(part, in_flight[1], cpu);
 -      }
 -
 -      return sum != 0;
 -}
 -
 -static bool md_in_flight(struct mapped_device *md)
 -{
 -      if (queue_is_mq(md->queue))
 -              return blk_mq_queue_inflight(md->queue);
 -      else
 -              return md_in_flight_bios(md);
 -}
 -
  u64 dm_start_time_ns_from_clone(struct bio *bio)
  {
        struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
@@@ -988,7 -1009,6 +988,7 @@@ static void clone_endio(struct bio *bio
        struct dm_io *io = tio->io;
        struct mapped_device *md = tio->io->md;
        dm_endio_fn endio = tio->ti->type->end_io;
 +      struct bio *orig_bio = io->orig_bio;
  
        if (unlikely(error == BLK_STS_TARGET) && md->type != DM_TYPE_NVME_BIO_BASED) {
                if (bio_op(bio) == REQ_OP_DISCARD &&
                        disable_write_zeroes(md);
        }
  
 +      /*
 +       * For zone-append bios get offset in zone of the written
 +       * sector and add that to the original bio sector pos.
 +       */
 +      if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) {
 +              sector_t written_sector = bio->bi_iter.bi_sector;
 +              struct request_queue *q = orig_bio->bi_disk->queue;
 +              u64 mask = (u64)blk_queue_zone_sectors(q) - 1;
 +
 +              orig_bio->bi_iter.bi_sector += written_sector & mask;
 +      }
 +
        if (endio) {
                int r = endio(tio->ti, bio, &error);
                switch (r) {
@@@ -1272,7 -1280,6 +1272,6 @@@ static blk_qc_t __map_bio(struct dm_tar
        sector_t sector;
        struct bio *clone = &tio->clone;
        struct dm_io *io = tio->io;
-       struct mapped_device *md = io->md;
        struct dm_target *ti = tio->ti;
        blk_qc_t ret = BLK_QC_T_NONE;
  
                /* the bio has been remapped so dispatch it */
                trace_block_bio_remap(clone->bi_disk->queue, clone,
                                      bio_dev(io->orig_bio), sector);
-               if (md->type == DM_TYPE_NVME_BIO_BASED)
-                       ret = direct_make_request(clone);
-               else
-                       ret = generic_make_request(clone);
+               ret = submit_bio_noacct(clone);
                break;
        case DM_MAPIO_KILL:
                free_tio(tio);
@@@ -1644,7 -1648,7 +1640,7 @@@ static blk_qc_t __split_and_process_bio
                        error = __split_and_process_non_flush(&ci);
                        if (current->bio_list && ci.sector_count && !error) {
                                /*
-                                * Remainder must be passed to generic_make_request()
+                                * Remainder must be passed to submit_bio_noacct()
                                 * so that it gets handled *after* bios already submitted
                                 * have been completely processed.
                                 * We take a clone of the original to store in
  
                                bio_chain(b, bio);
                                trace_block_split(md->queue, b, bio->bi_iter.bi_sector);
-                               ret = generic_make_request(bio);
+                               ret = submit_bio_noacct(bio);
                                break;
                        }
                }
@@@ -1737,7 -1741,7 +1733,7 @@@ static void dm_queue_split(struct mappe
  
                bio_chain(split, *bio);
                trace_block_split(md->queue, split, (*bio)->bi_iter.bi_sector);
-               generic_make_request(*bio);
+               submit_bio_noacct(*bio);
                *bio = split;
        }
  }
@@@ -1762,13 -1766,13 +1758,13 @@@ static blk_qc_t dm_process_bio(struct m
        }
  
        /*
-        * If in ->make_request_fn we need to use blk_queue_split(), otherwise
+        * If in ->queue_bio we need to use blk_queue_split(), otherwise
         * queue_limits for abnormal requests (e.g. discard, writesame, etc)
         * won't be imposed.
         */
        if (current->bio_list) {
                if (is_abnormal_io(bio))
-                       blk_queue_split(md->queue, &bio);
+                       blk_queue_split(&bio);
                else
                        dm_queue_split(md, ti, &bio);
        }
                return __split_and_process_bio(md, map, bio);
  }
  
- static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
+ static blk_qc_t dm_submit_bio(struct bio *bio)
  {
-       struct mapped_device *md = q->queuedata;
+       struct mapped_device *md = bio->bi_disk->private_data;
        blk_qc_t ret = BLK_QC_T_NONE;
        int srcu_idx;
        struct dm_table *map;
                /*
                 * We are called with a live reference on q_usage_counter, but
                 * that one will be released as soon as we return.  Grab an
-                * extra one as blk_mq_make_request expects to be able to
-                * consume a reference (which lives until the request is freed
-                * in case a request is allocated).
+                * extra one as blk_mq_submit_bio expects to be able to consume
+                * a reference (which lives until the request is freed in case a
+                * request is allocated).
                 */
-               percpu_ref_get(&q->q_usage_counter);
-               return blk_mq_make_request(q, bio);
+               percpu_ref_get(&bio->bi_disk->queue->q_usage_counter);
+               return blk_mq_submit_bio(bio);
        }
  
        map = dm_get_live_table(md, &srcu_idx);
        return ret;
  }
  
- static int dm_any_congested(void *congested_data, int bdi_bits)
- {
-       int r = bdi_bits;
-       struct mapped_device *md = congested_data;
-       struct dm_table *map;
-       if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
-               if (dm_request_based(md)) {
-                       /*
-                        * With request-based DM we only need to check the
-                        * top-level queue for congestion.
-                        */
-                       struct backing_dev_info *bdi = md->queue->backing_dev_info;
-                       r = bdi->wb.congested->state & bdi_bits;
-               } else {
-                       map = dm_get_live_table_fast(md);
-                       if (map)
-                               r = dm_table_any_congested(map, bdi_bits);
-                       dm_put_live_table_fast(md);
-               }
-       }
-       return r;
- }
  /*-----------------------------------------------------------------
   * An IDR is used to keep track of allocated minor numbers.
   *---------------------------------------------------------------*/
@@@ -1980,14 -1959,13 +1951,13 @@@ static struct mapped_device *alloc_dev(
        spin_lock_init(&md->uevent_lock);
  
        /*
-        * default to bio-based required ->make_request_fn until DM
-        * table is loaded and md->type established. If request-based
-        * table is loaded: blk-mq will override accordingly.
+        * default to bio-based until DM table is loaded and md->type
+        * established. If request-based table is loaded: blk-mq will
+        * override accordingly.
         */
-       md->queue = blk_alloc_queue(dm_make_request, numa_node_id);
+       md->queue = blk_alloc_queue(numa_node_id);
        if (!md->queue)
                goto bad;
-       md->queue->queuedata = md;
  
        md->disk = alloc_disk_node(1, md->numa_node_id);
        if (!md->disk)
@@@ -2281,12 -2259,6 +2251,6 @@@ struct queue_limits *dm_get_queue_limit
  }
  EXPORT_SYMBOL_GPL(dm_get_queue_limits);
  
- static void dm_init_congested_fn(struct mapped_device *md)
- {
-       md->queue->backing_dev_info->congested_data = md;
-       md->queue->backing_dev_info->congested_fn = dm_any_congested;
- }
  /*
   * Setup the DM device's queue based on md's type
   */
@@@ -2303,12 -2275,10 +2267,10 @@@ int dm_setup_md_queue(struct mapped_dev
                        DMERR("Cannot initialize queue for request-based dm-mq mapped device");
                        return r;
                }
-               dm_init_congested_fn(md);
                break;
        case DM_TYPE_BIO_BASED:
        case DM_TYPE_DAX_BIO_BASED:
        case DM_TYPE_NVME_BIO_BASED:
-               dm_init_congested_fn(md);
                break;
        case DM_TYPE_NONE:
                WARN_ON_ONCE(true);
@@@ -2448,29 -2418,15 +2410,29 @@@ void dm_put(struct mapped_device *md
  }
  EXPORT_SYMBOL_GPL(dm_put);
  
 -static int dm_wait_for_completion(struct mapped_device *md, long task_state)
 +static bool md_in_flight_bios(struct mapped_device *md)
 +{
 +      int cpu;
 +      struct hd_struct *part = &dm_disk(md)->part0;
 +      long sum = 0;
 +
 +      for_each_possible_cpu(cpu) {
 +              sum += part_stat_local_read_cpu(part, in_flight[0], cpu);
 +              sum += part_stat_local_read_cpu(part, in_flight[1], cpu);
 +      }
 +
 +      return sum != 0;
 +}
 +
 +static int dm_wait_for_bios_completion(struct mapped_device *md, long task_state)
  {
        int r = 0;
        DEFINE_WAIT(wait);
  
 -      while (1) {
 +      while (true) {
                prepare_to_wait(&md->wait, &wait, task_state);
  
 -              if (!md_in_flight(md))
 +              if (!md_in_flight_bios(md))
                        break;
  
                if (signal_pending_state(task_state, current)) {
        return r;
  }
  
 +static int dm_wait_for_completion(struct mapped_device *md, long task_state)
 +{
 +      int r = 0;
 +
 +      if (!queue_is_mq(md->queue))
 +              return dm_wait_for_bios_completion(md, task_state);
 +
 +      while (true) {
 +              if (!blk_mq_queue_inflight(md->queue))
 +                      break;
 +
 +              if (signal_pending_state(task_state, current)) {
 +                      r = -EINTR;
 +                      break;
 +              }
 +
 +              msleep(5);
 +      }
 +
 +      return r;
 +}
 +
  /*
   * Process the deferred bios
   */
@@@ -2529,7 -2463,7 +2491,7 @@@ static void dm_wq_work(struct work_stru
                        break;
  
                if (dm_request_based(md))
-                       (void) generic_make_request(c);
+                       (void) submit_bio_noacct(c);
                else
                        (void) dm_process_bio(md, map, c);
        }
@@@ -2940,25 -2874,17 +2902,25 @@@ EXPORT_SYMBOL_GPL(dm_internal_resume_fa
  int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
                       unsigned cookie)
  {
 +      int r;
 +      unsigned noio_flag;
        char udev_cookie[DM_COOKIE_LENGTH];
        char *envp[] = { udev_cookie, NULL };
  
 +      noio_flag = memalloc_noio_save();
 +
        if (!cookie)
 -              return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
 +              r = kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
        else {
                snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
                         DM_COOKIE_ENV_VAR_NAME, cookie);
 -              return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
 -                                        action, envp);
 +              r = kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
 +                                     action, envp);
        }
 +
 +      memalloc_noio_restore(noio_flag);
 +
 +      return r;
  }
  
  uint32_t dm_next_uevent_seq(struct mapped_device *md)
@@@ -3269,6 -3195,7 +3231,7 @@@ static const struct pr_ops dm_pr_ops = 
  };
  
  static const struct block_device_operations dm_blk_dops = {
+       .submit_bio = dm_submit_bio,
        .open = dm_blk_open,
        .release = dm_blk_close,
        .ioctl = dm_blk_ioctl,
diff --combined drivers/nvme/host/core.c
@@@ -304,7 -304,7 +304,7 @@@ bool nvme_cancel_request(struct reques
                return true;
  
        nvme_req(req)->status = NVME_SC_HOST_ABORTED_CMD;
-       blk_mq_force_complete_rq(req);
+       blk_mq_complete_request(req);
        return true;
  }
  EXPORT_SYMBOL_GPL(nvme_cancel_request);
@@@ -1116,16 -1116,10 +1116,16 @@@ static int nvme_identify_ns_descs(struc
                dev_warn(ctrl->device,
                        "Identify Descriptors failed (%d)\n", status);
                 /*
 -                * Don't treat an error as fatal, as we potentially already
 -                * have a NGUID or EUI-64.
 +                * Don't treat non-retryable errors as fatal, as we potentially
 +                * already have a NGUID or EUI-64.  If we failed with DNR set,
 +                * we want to silently ignore the error as we can still
 +                * identify the device, but if the status has DNR set, we want
 +                * to propagate the error back specifically for the disk
 +                * revalidation flow to make sure we don't abandon the
 +                * device just because of a temporal retry-able error (such
 +                * as path of transport errors).
                  */
 -              if (status > 0 && !(status & NVME_SC_DNR))
 +              if (status > 0 && (status & NVME_SC_DNR))
                        status = 0;
                goto free_data;
        }
@@@ -1980,7 -1974,7 +1980,7 @@@ static int __nvme_revalidate_disk(struc
        if (ns->head->disk) {
                nvme_update_disk_info(ns->head->disk, ns, id);
                blk_queue_stack_limits(ns->head->disk->queue, ns->queue);
 -              revalidate_disk(ns->head->disk);
 +              nvme_mpath_update_disk_size(ns->head->disk);
        }
  #endif
        return 0;
@@@ -2184,6 -2178,7 +2184,7 @@@ static void nvme_ns_head_release(struc
  
  const struct block_device_operations nvme_ns_head_ops = {
        .owner          = THIS_MODULE,
+       .submit_bio     = nvme_ns_head_submit_bio,
        .open           = nvme_ns_head_open,
        .release        = nvme_ns_head_release,
        .ioctl          = nvme_ioctl,
@@@ -4180,7 -4175,6 +4181,7 @@@ int nvme_init_ctrl(struct nvme_ctrl *ct
        ctrl->dev = dev;
        ctrl->ops = ops;
        ctrl->quirks = quirks;
 +      ctrl->numa_node = NUMA_NO_NODE;
        INIT_WORK(&ctrl->scan_work, nvme_scan_work);
        INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);
        INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
@@@ -291,8 -291,7 +291,7 @@@ static bool nvme_available_path(struct 
        return false;
  }
  
- static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
-               struct bio *bio)
+ blk_qc_t nvme_ns_head_submit_bio(struct bio *bio)
  {
        struct nvme_ns_head *head = bio->bi_disk->private_data;
        struct device *dev = disk_to_dev(head->disk);
        int srcu_idx;
  
        /*
-        * The namespace might be going away and the bio might
-        * be moved to a different queue via blk_steal_bios(),
-        * so we need to use the bio_split pool from the original
-        * queue to allocate the bvecs from.
+        * The namespace might be going away and the bio might be moved to a
+        * different queue via blk_steal_bios(), so we need to use the bio_split
+        * pool from the original queue to allocate the bvecs from.
         */
-       blk_queue_split(q, &bio);
+       blk_queue_split(&bio);
  
        srcu_idx = srcu_read_lock(&head->srcu);
        ns = nvme_find_path(head);
                trace_block_bio_remap(bio->bi_disk->queue, bio,
                                      disk_devt(ns->head->disk),
                                      bio->bi_iter.bi_sector);
-               ret = direct_make_request(bio);
+               ret = submit_bio_noacct(bio);
        } else if (nvme_available_path(head)) {
                dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");
  
@@@ -353,7 -351,7 +351,7 @@@ static void nvme_requeue_work(struct wo
                 * path.
                 */
                bio->bi_disk = head->disk;
-               generic_make_request(bio);
+               submit_bio_noacct(bio);
        }
  }
  
@@@ -375,7 -373,7 +373,7 @@@ int nvme_mpath_alloc_disk(struct nvme_c
        if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || !multipath)
                return 0;
  
-       q = blk_alloc_queue(nvme_ns_head_make_request, ctrl->numa_node);
+       q = blk_alloc_queue(ctrl->numa_node);
        if (!q)
                goto out;
        blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
@@@ -409,14 -407,15 +407,14 @@@ static void nvme_mpath_set_live(struct 
  {
        struct nvme_ns_head *head = ns->head;
  
 -      lockdep_assert_held(&ns->head->lock);
 -
        if (!head->disk)
                return;
  
 -      if (!(head->disk->flags & GENHD_FL_UP))
 +      if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags))
                device_add_disk(&head->subsys->dev, head->disk,
                                nvme_ns_id_attr_groups);
  
 +      mutex_lock(&head->lock);
        if (nvme_path_is_optimized(ns)) {
                int node, srcu_idx;
  
                        __nvme_find_path(head, node);
                srcu_read_unlock(&head->srcu, srcu_idx);
        }
 +      mutex_unlock(&head->lock);
  
 -      synchronize_srcu(&ns->head->srcu);
 -      kblockd_schedule_work(&ns->head->requeue_work);
 +      synchronize_srcu(&head->srcu);
 +      kblockd_schedule_work(&head->requeue_work);
  }
  
  static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
@@@ -483,12 -481,14 +481,12 @@@ static inline bool nvme_state_is_live(e
  static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
                struct nvme_ns *ns)
  {
 -      mutex_lock(&ns->head->lock);
        ns->ana_grpid = le32_to_cpu(desc->grpid);
        ns->ana_state = desc->state;
        clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
  
        if (nvme_state_is_live(ns->ana_state))
                nvme_mpath_set_live(ns);
 -      mutex_unlock(&ns->head->lock);
  }
  
  static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
@@@ -638,45 -638,38 +636,45 @@@ static ssize_t ana_state_show(struct de
  }
  DEVICE_ATTR_RO(ana_state);
  
 -static int nvme_set_ns_ana_state(struct nvme_ctrl *ctrl,
 +static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
                struct nvme_ana_group_desc *desc, void *data)
  {
 -      struct nvme_ns *ns = data;
 +      struct nvme_ana_group_desc *dst = data;
  
 -      if (ns->ana_grpid == le32_to_cpu(desc->grpid)) {
 -              nvme_update_ns_ana_state(desc, ns);
 -              return -ENXIO; /* just break out of the loop */
 -      }
 +      if (desc->grpid != dst->grpid)
 +              return 0;
  
 -      return 0;
 +      *dst = *desc;
 +      return -ENXIO; /* just break out of the loop */
  }
  
  void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
  {
        if (nvme_ctrl_use_ana(ns->ctrl)) {
 +              struct nvme_ana_group_desc desc = {
 +                      .grpid = id->anagrpid,
 +                      .state = 0,
 +              };
 +
                mutex_lock(&ns->ctrl->ana_lock);
                ns->ana_grpid = le32_to_cpu(id->anagrpid);
 -              nvme_parse_ana_log(ns->ctrl, ns, nvme_set_ns_ana_state);
 +              nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc);
                mutex_unlock(&ns->ctrl->ana_lock);
 +              if (desc.state) {
 +                      /* found the group desc: update */
 +                      nvme_update_ns_ana_state(&desc, ns);
 +              }
        } else {
 -              mutex_lock(&ns->head->lock);
                ns->ana_state = NVME_ANA_OPTIMIZED; 
                nvme_mpath_set_live(ns);
 -              mutex_unlock(&ns->head->lock);
        }
  
        if (bdi_cap_stable_pages_required(ns->queue->backing_dev_info)) {
 -              struct backing_dev_info *info =
 -                                      ns->head->disk->queue->backing_dev_info;
 +              struct gendisk *disk = ns->head->disk;
  
 -              info->capabilities |= BDI_CAP_STABLE_WRITES;
 +              if (disk)
 +                      disk->queue->backing_dev_info->capabilities |=
 +                                      BDI_CAP_STABLE_WRITES;
        }
  }
  
@@@ -691,14 -684,6 +689,14 @@@ void nvme_mpath_remove_disk(struct nvme
        kblockd_schedule_work(&head->requeue_work);
        flush_work(&head->requeue_work);
        blk_cleanup_queue(head->disk->queue);
 +      if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
 +              /*
 +               * if device_add_disk wasn't called, prevent
 +               * disk release to put a bogus reference on the
 +               * request queue
 +               */
 +              head->disk->queue = NULL;
 +      }
        put_disk(head->disk);
  }
  
diff --combined drivers/nvme/host/nvme.h
@@@ -364,8 -364,6 +364,8 @@@ struct nvme_ns_head 
        spinlock_t              requeue_lock;
        struct work_struct      requeue_work;
        struct mutex            lock;
 +      unsigned long           flags;
 +#define NVME_NSHEAD_DISK_LIVE 0
        struct nvme_ns __rcu    *current_path[];
  #endif
  };
@@@ -474,7 -472,7 +474,7 @@@ static inline u32 nvme_bytes_to_numd(si
        return (len >> 2) - 1;
  }
  
- static inline void nvme_end_request(struct request *req, __le16 status,
+ static inline bool nvme_end_request(struct request *req, __le16 status,
                union nvme_result result)
  {
        struct nvme_request *rq = nvme_req(req);
        rq->result = result;
        /* inject error when permitted by fault injection framework */
        nvme_should_fail(req);
-       blk_mq_complete_request(req);
+       if (unlikely(blk_should_fake_timeout(req->q)))
+               return true;
+       return blk_mq_complete_request_remote(req);
  }
  
  static inline void nvme_get_ctrl(struct nvme_ctrl *ctrl)
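
nvme_end_request() now returns a bool: false means the request was not completed in a remote context (the fake-timeout check is folded in here as well), so the transport must finish it itself. A minimal illustration of the new calling convention, mirroring the pci/rdma/tcp/loop hunks below (example_handle_cqe is a hypothetical name; nvme_complete_rq() is the core completion path declared in this header):

    static void example_handle_cqe(struct request *req, __le16 status,
                                   union nvme_result result)
    {
            /* false: not completed remotely, finish the request here */
            if (!nvme_end_request(req, status, result))
                    nvme_complete_rq(req);
    }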
@@@ -586,6 -586,7 +588,7 @@@ void nvme_mpath_stop(struct nvme_ctrl *
  bool nvme_mpath_clear_current_path(struct nvme_ns *ns);
  void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl);
  struct nvme_ns *nvme_find_path(struct nvme_ns_head *head);
+ blk_qc_t nvme_ns_head_submit_bio(struct bio *bio);
  
  static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
  {
@@@ -604,16 -605,6 +607,16 @@@ static inline void nvme_trace_bio_compl
                trace_block_bio_complete(ns->head->disk->queue, req->bio);
  }
  
 +static inline void nvme_mpath_update_disk_size(struct gendisk *disk)
 +{
 +      struct block_device *bdev = bdget_disk(disk, 0);
 +
 +      if (bdev) {
 +              bd_set_size(bdev, get_capacity(disk) << SECTOR_SHIFT);
 +              bdput(bdev);
 +      }
 +}
 +
  extern struct device_attribute dev_attr_ana_grpid;
  extern struct device_attribute dev_attr_ana_state;
  extern struct device_attribute subsys_attr_iopolicy;
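
nvme_mpath_update_disk_size() pushes a gendisk's current capacity down to its whole-device block_device via bdget_disk()/bd_set_size()/bdput(). Its call site is not part of this hunk; a hypothetical caller that resizes the shared multipath disk would look roughly like:

    static void example_resize(struct gendisk *mpath_disk, sector_t nr_sectors)
    {
            set_capacity(mpath_disk, nr_sectors);
            nvme_mpath_update_disk_size(mpath_disk);
    }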
@@@ -689,9 -680,6 +692,9 @@@ static inline void nvme_mpath_wait_free
  static inline void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
  {
  }
 +static inline void nvme_mpath_update_disk_size(struct gendisk *disk)
 +{
 +}
  #endif /* CONFIG_NVME_MULTIPATH */
  
  #ifdef CONFIG_NVM
diff --combined drivers/nvme/host/pci.c
@@@ -963,7 -963,8 +963,8 @@@ static inline void nvme_handle_cqe(stru
  
        req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), cqe->command_id);
        trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
-       nvme_end_request(req, cqe->status, cqe->result);
+       if (!nvme_end_request(req, cqe->status, cqe->result))
+               nvme_pci_complete_rq(req);
  }
  
  static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
@@@ -1593,7 -1594,7 +1594,7 @@@ static int nvme_alloc_admin_tags(struc
  
                dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
                dev->admin_tagset.timeout = ADMIN_TIMEOUT;
 -              dev->admin_tagset.numa_node = dev_to_node(dev->dev);
 +              dev->admin_tagset.numa_node = dev->ctrl.numa_node;
                dev->admin_tagset.cmd_size = sizeof(struct nvme_iod);
                dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED;
                dev->admin_tagset.driver_data = dev;
@@@ -1669,8 -1670,6 +1670,8 @@@ static int nvme_pci_configure_admin_que
        if (result)
                return result;
  
 +      dev->ctrl.numa_node = dev_to_node(dev->dev);
 +
        nvmeq = &dev->queues[0];
        aqa = nvmeq->q_depth - 1;
        aqa |= aqa << 16;
@@@ -2259,7 -2258,7 +2260,7 @@@ static void nvme_dev_add(struct nvme_de
                if (dev->io_queues[HCTX_TYPE_POLL])
                        dev->tagset.nr_maps++;
                dev->tagset.timeout = NVME_IO_TIMEOUT;
 -              dev->tagset.numa_node = dev_to_node(dev->dev);
 +              dev->tagset.numa_node = dev->ctrl.numa_node;
                dev->tagset.queue_depth =
                                min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
                dev->tagset.cmd_size = sizeof(struct nvme_iod);
diff --combined drivers/nvme/host/rdma.c
@@@ -149,6 -149,7 +149,7 @@@ MODULE_PARM_DESC(register_always
  static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
                struct rdma_cm_event *event);
  static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
+ static void nvme_rdma_complete_rq(struct request *rq);
  
  static const struct blk_mq_ops nvme_rdma_mq_ops;
  static const struct blk_mq_ops nvme_rdma_admin_mq_ops;
@@@ -470,7 -471,7 +471,7 @@@ static int nvme_rdma_create_queue_ib(st
         * Spread I/O queues completion vectors according their queue index.
         * Admin queues can always go on completion vector 0.
         */
 -      comp_vector = idx == 0 ? idx : idx - 1;
 +      comp_vector = (idx == 0 ? idx : idx - 1) % ibdev->num_comp_vectors;
  
        /* Polling queues need direct cq polling context */
        if (nvme_rdma_poll_queue(queue))
@@@ -1149,6 -1150,16 +1150,16 @@@ static void nvme_rdma_error_recovery(st
        queue_work(nvme_reset_wq, &ctrl->err_work);
  }
  
+ static void nvme_rdma_end_request(struct nvme_rdma_request *req)
+ {
+       struct request *rq = blk_mq_rq_from_pdu(req);
+
+       if (!refcount_dec_and_test(&req->ref))
+               return;
+       if (!nvme_end_request(rq, req->status, req->result))
+               nvme_rdma_complete_rq(rq);
+ }
+
  static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc,
                const char *op)
  {
@@@ -1173,16 -1184,11 +1184,11 @@@ static void nvme_rdma_inv_rkey_done(str
  {
        struct nvme_rdma_request *req =
                container_of(wc->wr_cqe, struct nvme_rdma_request, reg_cqe);
-       struct request *rq = blk_mq_rq_from_pdu(req);
  
-       if (unlikely(wc->status != IB_WC_SUCCESS)) {
+       if (unlikely(wc->status != IB_WC_SUCCESS))
                nvme_rdma_wr_error(cq, wc, "LOCAL_INV");
-               return;
-       }
-       if (refcount_dec_and_test(&req->ref))
-               nvme_end_request(rq, req->status, req->result);
+       else
+               nvme_rdma_end_request(req);
  }
  
  static int nvme_rdma_inv_rkey(struct nvme_rdma_queue *queue,
@@@ -1547,15 -1553,11 +1553,11 @@@ static void nvme_rdma_send_done(struct 
                container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe);
        struct nvme_rdma_request *req =
                container_of(qe, struct nvme_rdma_request, sqe);
-       struct request *rq = blk_mq_rq_from_pdu(req);
  
-       if (unlikely(wc->status != IB_WC_SUCCESS)) {
+       if (unlikely(wc->status != IB_WC_SUCCESS))
                nvme_rdma_wr_error(cq, wc, "SEND");
-               return;
-       }
-       if (refcount_dec_and_test(&req->ref))
-               nvme_end_request(rq, req->status, req->result);
+       else
+               nvme_rdma_end_request(req);
  }
  
  static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
@@@ -1697,8 -1699,7 +1699,7 @@@ static void nvme_rdma_process_nvme_rsp(
                return;
        }
  
-       if (refcount_dec_and_test(&req->ref))
-               nvme_end_request(rq, req->status, req->result);
+       nvme_rdma_end_request(req);
  }
  
  static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
diff --combined drivers/nvme/host/tcp.c
@@@ -464,7 -464,8 +464,8 @@@ static int nvme_tcp_process_nvme_cqe(st
                return -EINVAL;
        }
  
-       nvme_end_request(rq, cqe->status, cqe->result);
+       if (!nvme_end_request(rq, cqe->status, cqe->result))
+               nvme_complete_rq(rq);
        queue->nr_cqe++;
  
        return 0;
@@@ -654,7 -655,8 +655,8 @@@ static inline void nvme_tcp_end_request
  {
        union nvme_result res = {};
  
-       nvme_end_request(rq, cpu_to_le16(status << 1), res);
+       if (!nvme_end_request(rq, cpu_to_le16(status << 1), res))
+               nvme_complete_rq(rq);
  }
  
  static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
@@@ -1532,7 -1534,7 +1534,7 @@@ static struct blk_mq_tag_set *nvme_tcp_
                set->ops = &nvme_tcp_admin_mq_ops;
                set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
                set->reserved_tags = 2; /* connect + keep-alive */
 -              set->numa_node = NUMA_NO_NODE;
 +              set->numa_node = nctrl->numa_node;
                set->flags = BLK_MQ_F_BLOCKING;
                set->cmd_size = sizeof(struct nvme_tcp_request);
                set->driver_data = ctrl;
                set->ops = &nvme_tcp_mq_ops;
                set->queue_depth = nctrl->sqsize + 1;
                set->reserved_tags = 1; /* fabric connect */
 -              set->numa_node = NUMA_NO_NODE;
 +              set->numa_node = nctrl->numa_node;
                set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
                set->cmd_size = sizeof(struct nvme_tcp_request);
                set->driver_data = ctrl;
diff --combined drivers/nvme/target/loop.c
@@@ -116,7 -116,8 +116,8 @@@ static void nvme_loop_queue_response(st
                        return;
                }
  
-               nvme_end_request(rq, cqe->status, cqe->result);
+               if (!nvme_end_request(rq, cqe->status, cqe->result))
+                       nvme_loop_complete_rq(rq);
        }
  }
  
@@@ -340,7 -341,7 +341,7 @@@ static int nvme_loop_configure_admin_qu
        ctrl->admin_tag_set.ops = &nvme_loop_admin_mq_ops;
        ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
        ctrl->admin_tag_set.reserved_tags = 2; /* connect + keep-alive */
 -      ctrl->admin_tag_set.numa_node = NUMA_NO_NODE;
 +      ctrl->admin_tag_set.numa_node = ctrl->ctrl.numa_node;
        ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_loop_iod) +
                NVME_INLINE_SG_CNT * sizeof(struct scatterlist);
        ctrl->admin_tag_set.driver_data = ctrl;
@@@ -512,7 -513,7 +513,7 @@@ static int nvme_loop_create_io_queues(s
        ctrl->tag_set.ops = &nvme_loop_mq_ops;
        ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size;
        ctrl->tag_set.reserved_tags = 1; /* fabric connect */
 -      ctrl->tag_set.numa_node = NUMA_NO_NODE;
 +      ctrl->tag_set.numa_node = ctrl->ctrl.numa_node;
        ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
        ctrl->tag_set.cmd_size = sizeof(struct nvme_loop_iod) +
                NVME_INLINE_SG_CNT * sizeof(struct scatterlist);
diff --combined fs/btrfs/disk-io.c
@@@ -1616,27 -1616,6 +1616,6 @@@ fail
        return ERR_PTR(ret);
  }
  
- static int btrfs_congested_fn(void *congested_data, int bdi_bits)
- {
-       struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
-       int ret = 0;
-       struct btrfs_device *device;
-       struct backing_dev_info *bdi;
-       rcu_read_lock();
-       list_for_each_entry_rcu(device, &info->fs_devices->devices, dev_list) {
-               if (!device->bdev)
-                       continue;
-               bdi = device->bdev->bd_bdi;
-               if (bdi_congested(bdi, bdi_bits)) {
-                       ret = 1;
-                       break;
-               }
-       }
-       rcu_read_unlock();
-       return ret;
- }
  /*
   * called by the kthread helper functions to finally call the bio end_io
   * functions.  This is where read checksum verification actually happens
@@@ -2593,12 -2572,10 +2572,12 @@@ static int __cold init_tree_roots(struc
                    !extent_buffer_uptodate(tree_root->node)) {
                        handle_error = true;
  
 -                      if (IS_ERR(tree_root->node))
 +                      if (IS_ERR(tree_root->node)) {
                                ret = PTR_ERR(tree_root->node);
 -                      else if (!extent_buffer_uptodate(tree_root->node))
 +                              tree_root->node = NULL;
 +                      } else if (!extent_buffer_uptodate(tree_root->node)) {
                                ret = -EUCLEAN;
 +                      }
  
                        btrfs_warn(fs_info, "failed to read tree root");
                        continue;
@@@ -3053,8 -3030,6 +3032,6 @@@ int __cold open_ctree(struct super_bloc
                goto fail_sb_buffer;
        }
  
-       sb->s_bdi->congested_fn = btrfs_congested_fn;
-       sb->s_bdi->congested_data = fs_info;
        sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
        sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
        sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
diff --combined include/linux/blkdev.h
@@@ -4,9 -4,6 +4,6 @@@
  
  #include <linux/sched.h>
  #include <linux/sched/clock.h>
- #ifdef CONFIG_BLOCK
  #include <linux/major.h>
  #include <linux/genhd.h>
  #include <linux/list.h>
@@@ -289,8 -286,6 +286,6 @@@ static inline unsigned short req_get_io
  
  struct blk_queue_ctx;
  
- typedef blk_qc_t (make_request_fn) (struct request_queue *q, struct bio *bio);
  struct bio_vec;
  
  enum blk_eh_timer_return {
@@@ -401,8 -396,6 +396,6 @@@ struct request_queue 
        struct blk_queue_stats  *stats;
        struct rq_qos           *rq_qos;
  
-       make_request_fn         *make_request_fn;
        const struct blk_mq_ops *mq_ops;
  
        /* sw queues */
        unsigned int            sg_timeout;
        unsigned int            sg_reserved_size;
        int                     node;
+       struct mutex            debugfs_mutex;
  #ifdef CONFIG_BLK_DEV_IO_TRACE
        struct blk_trace __rcu  *blk_trace;
-       struct mutex            blk_trace_mutex;
  #endif
        /*
         * for flush operations
        struct list_head        tag_set_list;
        struct bio_set          bio_split;
  
- #ifdef CONFIG_BLK_DEBUG_FS
        struct dentry           *debugfs_dir;
+ #ifdef CONFIG_BLK_DEBUG_FS
        struct dentry           *sched_debugfs_dir;
        struct dentry           *rqos_debugfs_dir;
  #endif
  
        size_t                  cmd_size;
  
-       struct work_struct      release_work;
  #define BLK_MAX_WRITE_HINTS   5
        u64                     write_hints[BLK_MAX_WRITE_HINTS];
  };
  
 +/* Keep blk_queue_flag_name[] in sync with the definitions below */
  #define QUEUE_FLAG_STOPPED    0       /* queue is stopped */
  #define QUEUE_FLAG_DYING      1       /* queue being torn down */
  #define QUEUE_FLAG_NOMERGES     3     /* disable merge attempts */
@@@ -861,8 -852,7 +853,7 @@@ static inline void rq_flush_dcache_page
  
  extern int blk_register_queue(struct gendisk *disk);
  extern void blk_unregister_queue(struct gendisk *disk);
- extern blk_qc_t generic_make_request(struct bio *bio);
- extern blk_qc_t direct_make_request(struct bio *bio);
+ blk_qc_t submit_bio_noacct(struct bio *bio);
  extern void blk_rq_init(struct request_queue *q, struct request *rq);
  extern void blk_put_request(struct request *);
  extern struct request *blk_get_request(struct request_queue *, unsigned int op,
@@@ -876,7 -866,7 +867,7 @@@ extern void blk_rq_unprep_clone(struct 
  extern blk_status_t blk_insert_cloned_request(struct request_queue *q,
                                     struct request *rq);
  extern int blk_rq_append_bio(struct request *rq, struct bio **bio);
- extern void blk_queue_split(struct request_queue *, struct bio **);
+ extern void blk_queue_split(struct bio **);
  extern int scsi_verify_blk_ioctl(struct block_device *, unsigned int);
  extern int scsi_cmd_blk_ioctl(struct block_device *, fmode_t,
                              unsigned int, void __user *);
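
generic_make_request() and direct_make_request() are replaced by submit_bio_noacct(), and blk_queue_split() loses its request_queue argument. A stacking driver that remaps a bio to a lower device would now look roughly like this (sketch only; example_remap is a hypothetical helper):

    static blk_qc_t example_remap(struct bio *bio, struct block_device *lower)
    {
            bio_set_dev(bio, lower);
            return submit_bio_noacct(bio);
    }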
@@@ -1079,7 -1069,6 +1070,6 @@@ void blk_steal_bios(struct bio_list *li
  extern bool blk_update_request(struct request *rq, blk_status_t error,
                               unsigned int nr_bytes);
  
- extern void __blk_complete_request(struct request *);
  extern void blk_abort_request(struct request *);
  
  /*
@@@ -1166,13 -1155,13 +1156,13 @@@ static inline int blk_rq_map_sg(struct 
        return __blk_rq_map_sg(q, rq, sglist, &last_sg);
  }
  extern void blk_dump_rq_flags(struct request *, char *);
- extern long nr_blockdev_pages(void);
  
  bool __must_check blk_get_queue(struct request_queue *);
- struct request_queue *blk_alloc_queue(make_request_fn make_request, int node_id);
+ struct request_queue *blk_alloc_queue(int node_id);
  extern void blk_put_queue(struct request_queue *);
  extern void blk_set_queue_dying(struct request_queue *);
  
+ #ifdef CONFIG_BLOCK
  /*
   * blk_plug permits building a queue of related requests by holding the I/O
   * fragments for a short period. This allows merging of sequential requests
@@@ -1232,9 -1221,47 +1222,47 @@@ static inline bool blk_needs_flush_plug
                 !list_empty(&plug->cb_list));
  }
  
+ int blkdev_issue_flush(struct block_device *, gfp_t);
+ long nr_blockdev_pages(void);
+ #else /* CONFIG_BLOCK */
+ struct blk_plug {
+ };
+ static inline void blk_start_plug(struct blk_plug *plug)
+ {
+ }
+ static inline void blk_finish_plug(struct blk_plug *plug)
+ {
+ }
+ static inline void blk_flush_plug(struct task_struct *task)
+ {
+ }
+ static inline void blk_schedule_flush_plug(struct task_struct *task)
+ {
+ }
+ static inline bool blk_needs_flush_plug(struct task_struct *tsk)
+ {
+       return false;
+ }
+ static inline int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask)
+ {
+       return 0;
+ }
+ static inline long nr_blockdev_pages(void)
+ {
+       return 0;
+ }
+ #endif /* CONFIG_BLOCK */
  extern void blk_io_schedule(void);
  
- int blkdev_issue_flush(struct block_device *, gfp_t);
  extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
                sector_t nr_sects, gfp_t gfp_mask, struct page *page);
  
@@@ -1516,7 -1543,7 +1544,7 @@@ static inline unsigned int blksize_bits
  
  static inline unsigned int block_size(struct block_device *bdev)
  {
-       return bdev->bd_block_size;
+       return 1 << bdev->bd_inode->i_blkbits;
  }
  
  int kblockd_schedule_work(struct work_struct *work);
@@@ -1746,6 -1773,7 +1774,7 @@@ static inline void blk_ksm_unregister(s
  
  
  struct block_device_operations {
+       blk_qc_t (*submit_bio) (struct bio *bio);
        int (*open) (struct block_device *, fmode_t);
        void (*release) (struct gendisk *, fmode_t);
        int (*rw_page)(struct block_device *, sector_t, struct page *, unsigned int);
        int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
        unsigned int (*check_events) (struct gendisk *disk,
                                      unsigned int clearing);
-       /* ->media_changed() is DEPRECATED, use ->check_events() instead */
-       int (*media_changed) (struct gendisk *);
        void (*unlock_native_capacity) (struct gendisk *);
        int (*revalidate_disk) (struct gendisk *);
        int (*getgeo)(struct block_device *, struct hd_geometry *);
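
struct block_device_operations gains ->submit_bio: bio-based drivers implement it there instead of registering a make_request_fn, and allocate their queue with the reduced blk_alloc_queue(node). A minimal sketch of the new pattern (all example_* names are hypothetical; a real driver would actually service the bio):

    static blk_qc_t example_submit_bio(struct bio *bio)
    {
            /* a real driver would map/transfer the data here */
            bio_endio(bio);
            return BLK_QC_T_NONE;
    }

    static const struct block_device_operations example_fops = {
            .owner          = THIS_MODULE,
            .submit_bio     = example_submit_bio,
    };

    static int example_init_queue(struct gendisk *disk)
    {
            struct request_queue *q = blk_alloc_queue(NUMA_NO_NODE);

            if (!q)
                    return -ENOMEM;
            disk->queue = q;
            disk->fops = &example_fops;
            return 0;
    }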
@@@ -1834,52 -1860,6 +1861,6 @@@ static inline bool blk_req_can_dispatch
  }
  #endif /* CONFIG_BLK_DEV_ZONED */
  
- #else /* CONFIG_BLOCK */
- struct block_device;
- /*
-  * stubs for when the block layer is configured out
-  */
- #define buffer_heads_over_limit 0
- static inline long nr_blockdev_pages(void)
- {
-       return 0;
- }
- struct blk_plug {
- };
- static inline void blk_start_plug(struct blk_plug *plug)
- {
- }
- static inline void blk_finish_plug(struct blk_plug *plug)
- {
- }
- static inline void blk_flush_plug(struct task_struct *task)
- {
- }
- static inline void blk_schedule_flush_plug(struct task_struct *task)
- {
- }
- static inline bool blk_needs_flush_plug(struct task_struct *tsk)
- {
-       return false;
- }
- static inline int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask)
- {
-       return 0;
- }
- #endif /* CONFIG_BLOCK */
  static inline void blk_wake_io_task(struct task_struct *waiter)
  {
        /*
                wake_up_process(waiter);
  }
  
- #ifdef CONFIG_BLOCK
  unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors,
                unsigned int op);
  void disk_end_io_acct(struct gendisk *disk, unsigned int op,
@@@ -1919,6 -1898,53 +1899,53 @@@ static inline void bio_end_io_acct(stru
  {
        return disk_end_io_acct(bio->bi_disk, bio_op(bio), start_time);
  }
- #endif /* CONFIG_BLOCK */
  
+ int bdev_read_only(struct block_device *bdev);
+ int set_blocksize(struct block_device *bdev, int size);
+ const char *bdevname(struct block_device *bdev, char *buffer);
+ struct block_device *lookup_bdev(const char *);
+ void blkdev_show(struct seq_file *seqf, off_t offset);
+ #define BDEVNAME_SIZE 32      /* Largest string for a blockdev identifier */
+ #define BDEVT_SIZE    10      /* Largest string for MAJ:MIN for blkdev */
+ #ifdef CONFIG_BLOCK
+ #define BLKDEV_MAJOR_MAX      512
+ #else
+ #define BLKDEV_MAJOR_MAX      0
+ #endif
+ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder);
+ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
+               void *holder);
+ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder);
+ int bd_prepare_to_claim(struct block_device *bdev, struct block_device *whole,
+               void *holder);
+ void bd_abort_claiming(struct block_device *bdev, struct block_device *whole,
+               void *holder);
+ void blkdev_put(struct block_device *bdev, fmode_t mode);
+ struct block_device *I_BDEV(struct inode *inode);
+ struct block_device *bdget(dev_t);
+ struct block_device *bdgrab(struct block_device *bdev);
+ void bdput(struct block_device *);
+ #ifdef CONFIG_BLOCK
+ void invalidate_bdev(struct block_device *bdev);
+ int sync_blockdev(struct block_device *bdev);
+ #else
+ static inline void invalidate_bdev(struct block_device *bdev)
+ {
+ }
+ static inline int sync_blockdev(struct block_device *bdev)
+ {
+       return 0;
+ }
  #endif
+ int fsync_bdev(struct block_device *bdev);
+ struct super_block *freeze_bdev(struct block_device *bdev);
+ int thaw_bdev(struct block_device *bdev, struct super_block *sb);
+ #endif /* _LINUX_BLKDEV_H */
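
The block-device open/claim API (blkdev_get_by_path(), blkdev_get_by_dev(), bd_prepare_to_claim(), blkdev_put(), bdget()/bdput(), ...) now lives here instead of fs.h. A small sketch of an exclusive open and matching release using these declarations (example_* names are hypothetical; the holder is any unique cookie, typically the claiming driver's private structure):

    static struct block_device *example_open_exclusive(const char *path,
                                                       void *holder)
    {
            /* returns an ERR_PTR() on failure; FMODE_EXCL requires a holder */
            return blkdev_get_by_path(path,
                            FMODE_READ | FMODE_WRITE | FMODE_EXCL, holder);
    }

    static void example_close_exclusive(struct block_device *bdev)
    {
            blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
    }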
diff --combined include/linux/fs.h
@@@ -315,7 -315,6 +315,7 @@@ enum rw_hint 
  #define IOCB_SYNC             (1 << 5)
  #define IOCB_WRITE            (1 << 6)
  #define IOCB_NOWAIT           (1 << 7)
 +#define IOCB_NOIO             (1 << 9)
  
  struct kiocb {
        struct file             *ki_filp;
@@@ -471,45 -470,6 +471,6 @@@ struct address_space 
         * must be enforced here for CRIS, to let the least significant bit
         * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON.
         */
- struct request_queue;
- struct block_device {
-       dev_t                   bd_dev;  /* not a kdev_t - it's a search key */
-       int                     bd_openers;
-       struct inode *          bd_inode;       /* will die */
-       struct super_block *    bd_super;
-       struct mutex            bd_mutex;       /* open/close mutex */
-       void *                  bd_claiming;
-       void *                  bd_holder;
-       int                     bd_holders;
-       bool                    bd_write_holder;
- #ifdef CONFIG_SYSFS
-       struct list_head        bd_holder_disks;
- #endif
-       struct block_device *   bd_contains;
-       unsigned                bd_block_size;
-       u8                      bd_partno;
-       struct hd_struct *      bd_part;
-       /* number of times partitions within this device have been opened. */
-       unsigned                bd_part_count;
-       int                     bd_invalidated;
-       struct gendisk *        bd_disk;
-       struct request_queue *  bd_queue;
-       struct backing_dev_info *bd_bdi;
-       struct list_head        bd_list;
-       /*
-        * Private data.  You must have bd_claim'ed the block_device
-        * to use this.  NOTE:  bd_claim allows an owner to claim
-        * the same device multiple times, the owner must take special
-        * care to not mess up bd_private for that case.
-        */
-       unsigned long           bd_private;
-       /* The counter of freeze processes */
-       int                     bd_fsfreeze_count;
-       /* Mutex for freeze */
-       struct mutex            bd_fsfreeze_mutex;
- } __randomize_layout;
  
  /* XArray tags, for tagging dirty and writeback pages in the pagecache. */
  #define PAGECACHE_TAG_DIRTY   XA_MARK_0
@@@ -908,8 -868,6 +869,6 @@@ static inline unsigned imajor(const str
        return MAJOR(inode->i_rdev);
  }
  
- extern struct block_device *I_BDEV(struct inode *inode);
  struct fown_struct {
        rwlock_t lock;          /* protects pid, uid, euid fields */
        struct pid *pid;        /* pid or -pgrp where SIGIO should be sent */
@@@ -1775,14 -1733,6 +1734,6 @@@ struct dir_context 
        loff_t pos;
  };
  
- struct block_device_operations;
- /* These macros are for out of kernel modules to test that
-  * the kernel supports the unlocked_ioctl and compat_ioctl
-  * fields in struct file_operations. */
- #define HAVE_COMPAT_IOCTL 1
- #define HAVE_UNLOCKED_IOCTL 1
  /*
   * These flags let !MMU mmap() govern direct device mapping vs immediate
   * copying more easily for MAP_PRIVATE, especially for ROM filesystems.
@@@ -1918,6 -1868,7 +1869,6 @@@ ssize_t rw_copy_check_uvector(int type
                              struct iovec *fast_pointer,
                              struct iovec **ret_pointer);
  
 -extern ssize_t __vfs_read(struct file *, char __user *, size_t, loff_t *);
  extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
  extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
  extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
@@@ -2264,18 -2215,9 +2215,9 @@@ struct file_system_type 
  
  #define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME)
  
- #ifdef CONFIG_BLOCK
  extern struct dentry *mount_bdev(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *data,
        int (*fill_super)(struct super_block *, void *, int));
- #else
- static inline struct dentry *mount_bdev(struct file_system_type *fs_type,
-       int flags, const char *dev_name, void *data,
-       int (*fill_super)(struct super_block *, void *, int))
- {
-       return ERR_PTR(-ENODEV);
- }
- #endif
  extern struct dentry *mount_single(struct file_system_type *fs_type,
        int flags, void *data,
        int (*fill_super)(struct super_block *, void *, int));
@@@ -2284,14 -2226,7 +2226,7 @@@ extern struct dentry *mount_nodev(struc
        int (*fill_super)(struct super_block *, void *, int));
  extern struct dentry *mount_subtree(struct vfsmount *mnt, const char *path);
  void generic_shutdown_super(struct super_block *sb);
- #ifdef CONFIG_BLOCK
  void kill_block_super(struct super_block *sb);
- #else
- static inline void kill_block_super(struct super_block *sb)
- {
-       BUG();
- }
- #endif
  void kill_anon_super(struct super_block *sb);
  void kill_litter_super(struct super_block *sb);
  void deactivate_super(struct super_block *sb);
@@@ -2581,93 -2516,16 +2516,16 @@@ extern struct kmem_cache *names_cachep
  #define __getname()           kmem_cache_alloc(names_cachep, GFP_KERNEL)
  #define __putname(name)               kmem_cache_free(names_cachep, (void *)(name))
  
- #ifdef CONFIG_BLOCK
- extern int register_blkdev(unsigned int, const char *);
- extern void unregister_blkdev(unsigned int, const char *);
- extern struct block_device *bdget(dev_t);
- extern struct block_device *bdgrab(struct block_device *bdev);
- extern void bd_set_size(struct block_device *, loff_t size);
- extern void bd_forget(struct inode *inode);
- extern void bdput(struct block_device *);
- extern void invalidate_bdev(struct block_device *);
- extern void iterate_bdevs(void (*)(struct block_device *, void *), void *);
- extern int sync_blockdev(struct block_device *bdev);
- extern struct super_block *freeze_bdev(struct block_device *);
- extern void emergency_thaw_all(void);
- extern void emergency_thaw_bdev(struct super_block *sb);
- extern int thaw_bdev(struct block_device *bdev, struct super_block *sb);
- extern int fsync_bdev(struct block_device *);
  extern struct super_block *blockdev_superblock;
  static inline bool sb_is_blkdev_sb(struct super_block *sb)
  {
-       return sb == blockdev_superblock;
- }
- #else
- static inline void bd_forget(struct inode *inode) {}
- static inline int sync_blockdev(struct block_device *bdev) { return 0; }
- static inline void invalidate_bdev(struct block_device *bdev) {}
- static inline struct super_block *freeze_bdev(struct block_device *sb)
- {
-       return NULL;
- }
- static inline int thaw_bdev(struct block_device *bdev, struct super_block *sb)
- {
-       return 0;
+       return IS_ENABLED(CONFIG_BLOCK) && sb == blockdev_superblock;
  }
  
- static inline int emergency_thaw_bdev(struct super_block *sb)
- {
-       return 0;
- }
- static inline void iterate_bdevs(void (*f)(struct block_device *, void *), void *arg)
- {
- }
- static inline bool sb_is_blkdev_sb(struct super_block *sb)
- {
-       return false;
- }
- #endif
+ void emergency_thaw_all(void);
  extern int sync_filesystem(struct super_block *);
  extern const struct file_operations def_blk_fops;
  extern const struct file_operations def_chr_fops;
- #ifdef CONFIG_BLOCK
- extern int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long);
- extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long);
- extern int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder);
- extern struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
-                                              void *holder);
- extern struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode,
-                                             void *holder);
- extern struct block_device *bd_start_claiming(struct block_device *bdev,
-                                             void *holder);
- extern void bd_finish_claiming(struct block_device *bdev,
-                              struct block_device *whole, void *holder);
- extern void bd_abort_claiming(struct block_device *bdev,
-                             struct block_device *whole, void *holder);
- extern void blkdev_put(struct block_device *bdev, fmode_t mode);
- #ifdef CONFIG_SYSFS
- extern int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk);
- extern void bd_unlink_disk_holder(struct block_device *bdev,
-                                 struct gendisk *disk);
- #else
- static inline int bd_link_disk_holder(struct block_device *bdev,
-                                     struct gendisk *disk)
- {
-       return 0;
- }
- static inline void bd_unlink_disk_holder(struct block_device *bdev,
-                                        struct gendisk *disk)
- {
- }
- #endif
- #endif
  
  /* fs/char_dev.c */
  #define CHRDEV_MAJOR_MAX 512
@@@ -2698,31 -2556,12 +2556,12 @@@ static inline void unregister_chrdev(un
        __unregister_chrdev(major, 0, 256, name);
  }
  
- /* fs/block_dev.c */
- #define BDEVNAME_SIZE 32      /* Largest string for a blockdev identifier */
- #define BDEVT_SIZE    10      /* Largest string for MAJ:MIN for blkdev */
- #ifdef CONFIG_BLOCK
- #define BLKDEV_MAJOR_MAX      512
- extern const char *bdevname(struct block_device *bdev, char *buffer);
- extern struct block_device *lookup_bdev(const char *);
- extern void blkdev_show(struct seq_file *,off_t);
- #else
- #define BLKDEV_MAJOR_MAX      0
- #endif
  extern void init_special_inode(struct inode *, umode_t, dev_t);
  
  /* Invalid inode operations -- fs/bad_inode.c */
  extern void make_bad_inode(struct inode *);
  extern bool is_bad_inode(struct inode *);
  
- #ifdef CONFIG_BLOCK
- extern int revalidate_disk(struct gendisk *);
- extern int check_disk_change(struct block_device *);
- extern int __invalidate_device(struct block_device *, bool);
- #endif
  unsigned long invalidate_mapping_pages(struct address_space *mapping,
                                        pgoff_t start, pgoff_t end);
  
@@@ -3033,7 -2872,6 +2872,7 @@@ extern int kernel_read_file_from_path_i
  extern int kernel_read_file_from_fd(int, void **, loff_t *, loff_t,
                                    enum kernel_read_file_id);
  extern ssize_t kernel_read(struct file *, void *, size_t, loff_t *);
 +ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos);
  extern ssize_t kernel_write(struct file *, const void *, size_t, loff_t *);
  extern ssize_t __kernel_write(struct file *, const void *, size_t, loff_t *);
  extern struct file * open_exec(const char *);
@@@ -3123,10 -2961,6 +2962,6 @@@ static inline void remove_inode_hash(st
  
  extern void inode_sb_list_add(struct inode *inode);
  
- #ifdef CONFIG_BLOCK
- extern int bdev_read_only(struct block_device *);
- #endif
- extern int set_blocksize(struct block_device *, int);
  extern int sb_set_blocksize(struct super_block *, int);
  extern int sb_min_blocksize(struct super_block *, int);