Merge tag 'for-5.15/block-2021-08-30' of git://git.kernel.dk/linux-block
author  Linus Torvalds <torvalds@linux-foundation.org>
        Tue, 31 Aug 2021 01:52:11 +0000 (18:52 -0700)
committer  Linus Torvalds <torvalds@linux-foundation.org>
        Tue, 31 Aug 2021 01:52:11 +0000 (18:52 -0700)
Pull block updates from Jens Axboe:
 "Nothing major in here - lots of good cleanups and tech debt handling,
  which is also evident in the diffstats. In particular:

   - Add disk sequence numbers (Matteo)

   - Discard merge fix (Ming)

   - Relax disk zoned reporting restrictions (Niklas)

   - Bio error handling zoned leak fix (Pavel)

   - Start of proper add_disk() error handling (Luis, Christoph)

   - blk crypto fix (Eric)

   - Non-standard GPT location support (Dmitry)

   - IO priority improvements and cleanups (Damien)

   - blk-throtl improvements (Chunguang)

   - diskstats_show() stack reduction (Abd-Alrhman)

   - Loop scheduler selection (Bart)

   - Switch block layer to use kmap_local_page() (Christoph)

   - Remove obsolete disk_name helper (Christoph)

   - block_device refcounting improvements (Christoph)

   - Ensure gendisk always has a request queue reference (Christoph)

   - Misc fixes/cleanups (Shaokun, Oliver, Guoqing)"

* tag 'for-5.15/block-2021-08-30' of git://git.kernel.dk/linux-block: (129 commits)
  sg: pass the device name to blk_trace_setup
  block, bfq: cleanup the repeated declaration
  blk-crypto: fix check for too-large dun_bytes
  blk-zoned: allow BLKREPORTZONE without CAP_SYS_ADMIN
  blk-zoned: allow zone management send operations without CAP_SYS_ADMIN
  block: mark blkdev_fsync static
  block: refine the disk_live check in del_gendisk
  mmc: sdhci-tegra: Enable MMC_CAP2_ALT_GPT_TEGRA
  mmc: block: Support alternative_gpt_sector() operation
  partitions/efi: Support non-standard GPT location
  block: Add alternative_gpt_sector() operation
  bio: fix page leak bio_add_hw_page failure
  block: remove CONFIG_DEBUG_BLOCK_EXT_DEVT
  block: remove a pointless call to MINOR() in device_add_disk
  null_blk: add error handling support for add_disk()
  virtio_blk: add error handling support for add_disk()
  block: add error handling for device_add_disk / add_disk
  block: return errors from disk_alloc_events
  block: return errors from blk_integrity_add
  block: call blk_register_queue earlier in device_add_disk
  ...

14 files changed:
block/Makefile
block/blk-cgroup.c
block/blk-core.c
block/blk-iocost.c
block/blk-iolatency.c
block/blk-mq.c
block/blk.h
block/mq-deadline.c
block/partitions/ldm.c
drivers/block/virtio_blk.c
drivers/s390/block/dasd_eckd.c
drivers/scsi/sr.c
include/linux/fs.h
lib/Kconfig.debug

diff --combined block/Makefile
@@@ -22,11 -22,12 +22,10 @@@ obj-$(CONFIG_BLK_CGROUP_IOPRIO)    += blk-
  obj-$(CONFIG_BLK_CGROUP_IOLATENCY)    += blk-iolatency.o
  obj-$(CONFIG_BLK_CGROUP_IOCOST)       += blk-iocost.o
  obj-$(CONFIG_MQ_IOSCHED_DEADLINE)     += mq-deadline.o
 -mq-deadline-y += mq-deadline-main.o
 -mq-deadline-$(CONFIG_MQ_IOSCHED_DEADLINE_CGROUP)+= mq-deadline-cgroup.o
  obj-$(CONFIG_MQ_IOSCHED_KYBER)        += kyber-iosched.o
  bfq-y                         := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o
  obj-$(CONFIG_IOSCHED_BFQ)     += bfq.o
  
- obj-$(CONFIG_BLK_CMDLINE_PARSER)      += cmdline-parser.o
  obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o
  obj-$(CONFIG_BLK_DEV_INTEGRITY_T10)   += t10-pi.o
  obj-$(CONFIG_BLK_MQ_PCI)      += blk-mq-pci.o
@@@ -40,3 -41,4 +39,4 @@@ obj-$(CONFIG_BLK_SED_OPAL)    += sed-opal.
  obj-$(CONFIG_BLK_PM)          += blk-pm.o
  obj-$(CONFIG_BLK_INLINE_ENCRYPTION)   += keyslot-manager.o blk-crypto.o
  obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK)  += blk-crypto-fallback.o
+ obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED) += holder.o
diff --combined block/blk-cgroup.c
@@@ -489,10 -489,9 +489,9 @@@ static int blkcg_reset_stats(struct cgr
  
  const char *blkg_dev_name(struct blkcg_gq *blkg)
  {
-       /* some drivers (floppy) instantiate a queue w/o disk registered */
-       if (blkg->q->backing_dev_info->dev)
-               return bdi_dev_name(blkg->q->backing_dev_info);
-       return NULL;
+       if (!blkg->q->disk || !blkg->q->disk->bdi->dev)
+               return NULL;
+       return bdi_dev_name(blkg->q->disk->bdi);
  }
  
  /**
@@@ -790,7 -789,6 +789,7 @@@ static void blkcg_rstat_flush(struct cg
                struct blkcg_gq *parent = blkg->parent;
                struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
                struct blkg_iostat cur, delta;
 +              unsigned long flags;
                unsigned int seq;
  
                /* fetch the current per-cpu values */
                } while (u64_stats_fetch_retry(&bisc->sync, seq));
  
                /* propagate percpu delta to global */
 -              u64_stats_update_begin(&blkg->iostat.sync);
 +              flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
                blkg_iostat_set(&delta, &cur);
                blkg_iostat_sub(&delta, &bisc->last);
                blkg_iostat_add(&blkg->iostat.cur, &delta);
                blkg_iostat_add(&bisc->last, &delta);
 -              u64_stats_update_end(&blkg->iostat.sync);
 +              u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
  
                /* propagate global delta to parent (unless that's root) */
                if (parent && parent->parent) {
 -                      u64_stats_update_begin(&parent->iostat.sync);
 +                      flags = u64_stats_update_begin_irqsave(&parent->iostat.sync);
                        blkg_iostat_set(&delta, &blkg->iostat.cur);
                        blkg_iostat_sub(&delta, &blkg->iostat.last);
                        blkg_iostat_add(&parent->iostat.cur, &delta);
                        blkg_iostat_add(&blkg->iostat.last, &delta);
 -                      u64_stats_update_end(&parent->iostat.sync);
 +                      u64_stats_update_end_irqrestore(&parent->iostat.sync, flags);
                }
        }
  
@@@ -849,7 -847,6 +848,7 @@@ static void blkcg_fill_root_iostats(voi
                memset(&tmp, 0, sizeof(tmp));
                for_each_possible_cpu(cpu) {
                        struct disk_stats *cpu_dkstats;
 +                      unsigned long flags;
  
                        cpu_dkstats = per_cpu_ptr(bdev->bd_stats, cpu);
                        tmp.ios[BLKG_IOSTAT_READ] +=
                        tmp.bytes[BLKG_IOSTAT_DISCARD] +=
                                cpu_dkstats->sectors[STAT_DISCARD] << 9;
  
 -                      u64_stats_update_begin(&blkg->iostat.sync);
 +                      flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
                        blkg_iostat_set(&blkg->iostat.cur, &tmp);
 -                      u64_stats_update_end(&blkg->iostat.sync);
 +                      u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
                }
        }
  }
  
- static int blkcg_print_stat(struct seq_file *sf, void *v)
+ static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s)
  {
-       struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
-       struct blkcg_gq *blkg;
-       if (!seq_css(sf)->parent)
-               blkcg_fill_root_iostats();
-       else
-               cgroup_rstat_flush(blkcg->css.cgroup);
+       struct blkg_iostat_set *bis = &blkg->iostat;
+       u64 rbytes, wbytes, rios, wios, dbytes, dios;
+       bool has_stats = false;
+       const char *dname;
+       unsigned seq;
+       int i;
  
-       rcu_read_lock();
+       if (!blkg->online)
+               return;
  
-       hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
-               struct blkg_iostat_set *bis = &blkg->iostat;
-               const char *dname;
-               char *buf;
-               u64 rbytes, wbytes, rios, wios, dbytes, dios;
-               size_t size = seq_get_buf(sf, &buf), off = 0;
-               int i;
-               bool has_stats = false;
-               unsigned seq;
+       dname = blkg_dev_name(blkg);
+       if (!dname)
+               return;
  
-               spin_lock_irq(&blkg->q->queue_lock);
+       seq_printf(s, "%s ", dname);
  
-               if (!blkg->online)
-                       goto skip;
+       do {
+               seq = u64_stats_fetch_begin(&bis->sync);
  
-               dname = blkg_dev_name(blkg);
-               if (!dname)
-                       goto skip;
+               rbytes = bis->cur.bytes[BLKG_IOSTAT_READ];
+               wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE];
+               dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD];
+               rios = bis->cur.ios[BLKG_IOSTAT_READ];
+               wios = bis->cur.ios[BLKG_IOSTAT_WRITE];
+               dios = bis->cur.ios[BLKG_IOSTAT_DISCARD];
+       } while (u64_stats_fetch_retry(&bis->sync, seq));
  
-               /*
-                * Hooray string manipulation, count is the size written NOT
-                * INCLUDING THE \0, so size is now count+1 less than what we
-                * had before, but we want to start writing the next bit from
-                * the \0 so we only add count to buf.
-                */
-               off += scnprintf(buf+off, size-off, "%s ", dname);
+       if (rbytes || wbytes || rios || wios) {
+               has_stats = true;
+               seq_printf(s, "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu",
+                       rbytes, wbytes, rios, wios,
+                       dbytes, dios);
+       }
  
-               do {
-                       seq = u64_stats_fetch_begin(&bis->sync);
+       if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) {
+               has_stats = true;
+               seq_printf(s, " use_delay=%d delay_nsec=%llu",
+                       atomic_read(&blkg->use_delay),
+                       atomic64_read(&blkg->delay_nsec));
+       }
  
-                       rbytes = bis->cur.bytes[BLKG_IOSTAT_READ];
-                       wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE];
-                       dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD];
-                       rios = bis->cur.ios[BLKG_IOSTAT_READ];
-                       wios = bis->cur.ios[BLKG_IOSTAT_WRITE];
-                       dios = bis->cur.ios[BLKG_IOSTAT_DISCARD];
-               } while (u64_stats_fetch_retry(&bis->sync, seq));
+       for (i = 0; i < BLKCG_MAX_POLS; i++) {
+               struct blkcg_policy *pol = blkcg_policy[i];
  
-               if (rbytes || wbytes || rios || wios) {
-                       has_stats = true;
-                       off += scnprintf(buf+off, size-off,
-                                        "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu",
-                                        rbytes, wbytes, rios, wios,
-                                        dbytes, dios);
-               }
+               if (!blkg->pd[i] || !pol->pd_stat_fn)
+                       continue;
  
-               if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) {
+               if (pol->pd_stat_fn(blkg->pd[i], s))
                        has_stats = true;
-                       off += scnprintf(buf+off, size-off,
-                                        " use_delay=%d delay_nsec=%llu",
-                                        atomic_read(&blkg->use_delay),
-                                       (unsigned long long)atomic64_read(&blkg->delay_nsec));
-               }
+       }
  
-               for (i = 0; i < BLKCG_MAX_POLS; i++) {
-                       struct blkcg_policy *pol = blkcg_policy[i];
-                       size_t written;
+       if (has_stats)
+               seq_printf(s, "\n");
+ }
  
-                       if (!blkg->pd[i] || !pol->pd_stat_fn)
-                               continue;
+ static int blkcg_print_stat(struct seq_file *sf, void *v)
+ {
+       struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+       struct blkcg_gq *blkg;
  
-                       written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off);
-                       if (written)
-                               has_stats = true;
-                       off += written;
-               }
+       if (!seq_css(sf)->parent)
+               blkcg_fill_root_iostats();
+       else
+               cgroup_rstat_flush(blkcg->css.cgroup);
  
-               if (has_stats) {
-                       if (off < size - 1) {
-                               off += scnprintf(buf+off, size-off, "\n");
-                               seq_commit(sf, off);
-                       } else {
-                               seq_commit(sf, -1);
-                       }
-               }
-       skip:
+       rcu_read_lock();
+       hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
+               spin_lock_irq(&blkg->q->queue_lock);
+               blkcg_print_one_stat(blkg, sf);
                spin_unlock_irq(&blkg->q->queue_lock);
        }
        rcu_read_unlock();
        return 0;
  }
diff --combined block/blk-core.c
@@@ -14,7 -14,6 +14,6 @@@
   */
  #include <linux/kernel.h>
  #include <linux/module.h>
- #include <linux/backing-dev.h>
  #include <linux/bio.h>
  #include <linux/blkdev.h>
  #include <linux/blk-mq.h>
@@@ -122,6 -121,7 +121,6 @@@ void blk_rq_init(struct request_queue *
        rq->internal_tag = BLK_MQ_NO_TAG;
        rq->start_time_ns = ktime_get_ns();
        rq->part = NULL;
 -      refcount_set(&rq->ref, 1);
        blk_crypto_rq_set_defaults(rq);
  }
  EXPORT_SYMBOL(blk_rq_init);
@@@ -393,10 -393,7 +392,7 @@@ void blk_cleanup_queue(struct request_q
        /* for synchronous bio-based driver finish in-flight integrity i/o */
        blk_flush_integrity();
  
-       /* @q won't process any more request, flush async actions */
-       del_timer_sync(&q->backing_dev_info->laptop_mode_wb_timer);
        blk_sync_queue(q);
        if (queue_is_mq(q))
                blk_mq_exit_queue(q);
  
@@@ -533,20 -530,14 +529,14 @@@ struct request_queue *blk_alloc_queue(i
        if (ret)
                goto fail_id;
  
-       q->backing_dev_info = bdi_alloc(node_id);
-       if (!q->backing_dev_info)
-               goto fail_split;
        q->stats = blk_alloc_queue_stats();
        if (!q->stats)
-               goto fail_stats;
+               goto fail_split;
  
        q->node = node_id;
  
        atomic_set(&q->nr_active_requests_shared_sbitmap, 0);
  
-       timer_setup(&q->backing_dev_info->laptop_mode_wb_timer,
-                   laptop_mode_timer_fn, 0);
        timer_setup(&q->timeout, blk_rq_timed_out_timer, 0);
        INIT_WORK(&q->timeout_work, blk_timeout_work);
        INIT_LIST_HEAD(&q->icq_list);
        if (percpu_ref_init(&q->q_usage_counter,
                                blk_queue_usage_counter_release,
                                PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
-               goto fail_bdi;
+               goto fail_stats;
  
        if (blkcg_init_queue(q))
                goto fail_ref;
  
  fail_ref:
        percpu_ref_exit(&q->q_usage_counter);
- fail_bdi:
-       blk_free_queue_stats(q->stats);
  fail_stats:
-       bdi_put(q->backing_dev_info);
+       blk_free_queue_stats(q->stats);
  fail_split:
        bioset_exit(&q->bio_split);
  fail_id:
diff --combined block/blk-iocost.c
@@@ -2988,34 -2988,29 +2988,29 @@@ static void ioc_pd_free(struct blkg_pol
        kfree(iocg);
  }
  
- static size_t ioc_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size)
+ static bool ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s)
  {
        struct ioc_gq *iocg = pd_to_iocg(pd);
        struct ioc *ioc = iocg->ioc;
-       size_t pos = 0;
  
        if (!ioc->enabled)
-               return 0;
+               return false;
  
        if (iocg->level == 0) {
                unsigned vp10k = DIV64_U64_ROUND_CLOSEST(
                        ioc->vtime_base_rate * 10000,
                        VTIME_PER_USEC);
-               pos += scnprintf(buf + pos, size - pos, " cost.vrate=%u.%02u",
-                                 vp10k / 100, vp10k % 100);
+               seq_printf(s, " cost.vrate=%u.%02u", vp10k / 100, vp10k % 100);
        }
  
-       pos += scnprintf(buf + pos, size - pos, " cost.usage=%llu",
-                        iocg->last_stat.usage_us);
+       seq_printf(s, " cost.usage=%llu", iocg->last_stat.usage_us);
  
        if (blkcg_debug_stats)
-               pos += scnprintf(buf + pos, size - pos,
-                                " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu",
-                                iocg->last_stat.wait_us,
-                                iocg->last_stat.indebt_us,
-                                iocg->last_stat.indelay_us);
-       return pos;
+               seq_printf(s, " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu",
+                       iocg->last_stat.wait_us,
+                       iocg->last_stat.indebt_us,
+                       iocg->last_stat.indelay_us);
+       return true;
  }
  
  static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
@@@ -3061,19 -3056,19 +3056,19 @@@ static ssize_t ioc_weight_write(struct 
                if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
                        return -EINVAL;
  
 -              spin_lock(&blkcg->lock);
 +              spin_lock_irq(&blkcg->lock);
                iocc->dfl_weight = v * WEIGHT_ONE;
                hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
                        struct ioc_gq *iocg = blkg_to_iocg(blkg);
  
                        if (iocg) {
 -                              spin_lock_irq(&iocg->ioc->lock);
 +                              spin_lock(&iocg->ioc->lock);
                                ioc_now(iocg->ioc, &now);
                                weight_updated(iocg, &now);
 -                              spin_unlock_irq(&iocg->ioc->lock);
 +                              spin_unlock(&iocg->ioc->lock);
                        }
                }
 -              spin_unlock(&blkcg->lock);
 +              spin_unlock_irq(&blkcg->lock);
  
                return nbytes;
        }
diff --combined block/blk-iolatency.c
@@@ -833,11 -833,7 +833,11 @@@ static ssize_t iolatency_set_limit(stru
  
        enable = iolatency_set_min_lat_nsec(blkg, lat_val);
        if (enable) {
 -              WARN_ON_ONCE(!blk_get_queue(blkg->q));
 +              if (!blk_get_queue(blkg->q)) {
 +                      ret = -ENODEV;
 +                      goto out;
 +              }
 +
                blkg_get(blkg);
        }
  
@@@ -890,8 -886,7 +890,7 @@@ static int iolatency_print_limit(struc
        return 0;
  }
  
- static size_t iolatency_ssd_stat(struct iolatency_grp *iolat, char *buf,
-                                size_t size)
+ static bool iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s)
  {
        struct latency_stat stat;
        int cpu;
        preempt_enable();
  
        if (iolat->rq_depth.max_depth == UINT_MAX)
-               return scnprintf(buf, size, " missed=%llu total=%llu depth=max",
-                                (unsigned long long)stat.ps.missed,
-                                (unsigned long long)stat.ps.total);
-       return scnprintf(buf, size, " missed=%llu total=%llu depth=%u",
-                        (unsigned long long)stat.ps.missed,
-                        (unsigned long long)stat.ps.total,
-                        iolat->rq_depth.max_depth);
+               seq_printf(s, " missed=%llu total=%llu depth=max",
+                       (unsigned long long)stat.ps.missed,
+                       (unsigned long long)stat.ps.total);
+       else
+               seq_printf(s, " missed=%llu total=%llu depth=%u",
+                       (unsigned long long)stat.ps.missed,
+                       (unsigned long long)stat.ps.total,
+                       iolat->rq_depth.max_depth);
+       return true;
  }
  
- static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf,
-                               size_t size)
+ static bool iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s)
  {
        struct iolatency_grp *iolat = pd_to_lat(pd);
        unsigned long long avg_lat;
        unsigned long long cur_win;
  
        if (!blkcg_debug_stats)
-               return 0;
+               return false;
  
        if (iolat->ssd)
-               return iolatency_ssd_stat(iolat, buf, size);
+               return iolatency_ssd_stat(iolat, s);
  
        avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC);
        cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC);
        if (iolat->rq_depth.max_depth == UINT_MAX)
-               return scnprintf(buf, size, " depth=max avg_lat=%llu win=%llu",
-                                avg_lat, cur_win);
-       return scnprintf(buf, size, " depth=%u avg_lat=%llu win=%llu",
-                        iolat->rq_depth.max_depth, avg_lat, cur_win);
+               seq_printf(s, " depth=max avg_lat=%llu win=%llu",
+                       avg_lat, cur_win);
+       else
+               seq_printf(s, " depth=%u avg_lat=%llu win=%llu",
+                       iolat->rq_depth.max_depth, avg_lat, cur_win);
+       return true;
  }
  
  static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp,
                                                   struct request_queue *q,
                                                   struct blkcg *blkcg)
diff --combined block/blk-mq.c
@@@ -525,7 -525,7 +525,7 @@@ void blk_mq_free_request(struct reques
                __blk_mq_dec_active_requests(hctx);
  
        if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
-               laptop_io_completion(q->backing_dev_info);
+               laptop_io_completion(q->disk->bdi);
  
        rq_qos_done(q, rq);
  
@@@ -606,7 -606,7 +606,7 @@@ static inline bool blk_mq_complete_need
         * This is probably worse than completing the request on a different
         * cache domain.
         */
 -      if (force_irqthreads)
 +      if (force_irqthreads())
                return false;
  
        /* same CPU or cache domain?  Complete locally */
@@@ -911,7 -911,7 +911,7 @@@ static bool blk_mq_req_expired(struct r
  
  void blk_mq_put_rq_ref(struct request *rq)
  {
 -      if (is_flush_rq(rq, rq->mq_hctx))
 +      if (is_flush_rq(rq))
                rq->end_io(rq, 0);
        else if (refcount_dec_and_test(&rq->ref))
                __blk_mq_free_request(rq);
@@@ -923,14 -923,34 +923,14 @@@ static bool blk_mq_check_expired(struc
        unsigned long *next = priv;
  
        /*
 -       * Just do a quick check if it is expired before locking the request in
 -       * so we're not unnecessarilly synchronizing across CPUs.
 -       */
 -      if (!blk_mq_req_expired(rq, next))
 -              return true;
 -
 -      /*
 -       * We have reason to believe the request may be expired. Take a
 -       * reference on the request to lock this request lifetime into its
 -       * currently allocated context to prevent it from being reallocated in
 -       * the event the completion by-passes this timeout handler.
 -       *
 -       * If the reference was already released, then the driver beat the
 -       * timeout handler to posting a natural completion.
 -       */
 -      if (!refcount_inc_not_zero(&rq->ref))
 -              return true;
 -
 -      /*
 -       * The request is now locked and cannot be reallocated underneath the
 -       * timeout handler's processing. Re-verify this exact request is truly
 -       * expired; if it is not expired, then the request was completed and
 -       * reallocated as a new request.
 +       * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot
 +       * be reallocated underneath the timeout handler's processing, then
 +       * the expire check is reliable. If the request is not expired, then
 +       * it was completed and reallocated as a new request after returning
 +       * from blk_mq_check_expired().
         */
        if (blk_mq_req_expired(rq, next))
                blk_mq_rq_timed_out(rq, reserved);
 -
 -      blk_mq_put_rq_ref(rq);
        return true;
  }
  
@@@ -2974,12 -2994,10 +2974,12 @@@ static void queue_set_hctx_shared(struc
        int i;
  
        queue_for_each_hw_ctx(q, hctx, i) {
 -              if (shared)
 +              if (shared) {
                        hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
 -              else
 +              } else {
 +                      blk_mq_tag_idle(hctx);
                        hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
 +              }
        }
  }
  
@@@ -3115,7 -3133,8 +3115,8 @@@ struct request_queue *blk_mq_init_queue
  }
  EXPORT_SYMBOL(blk_mq_init_queue);
  
- struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata)
+ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata,
+               struct lock_class_key *lkclass)
  {
        struct request_queue *q;
        struct gendisk *disk;
        if (IS_ERR(q))
                return ERR_CAST(q);
  
-       disk = __alloc_disk_node(0, set->numa_node);
+       disk = __alloc_disk_node(q, set->numa_node, lkclass);
        if (!disk) {
                blk_cleanup_queue(q);
                return ERR_PTR(-ENOMEM);
        }
-       disk->queue = q;
        return disk;
  }
  EXPORT_SYMBOL(__blk_mq_alloc_disk);
diff --combined block/blk.h
@@@ -44,7 -44,11 +44,7 @@@ static inline void __blk_get_queue(stru
        kobject_get(&q->kobj);
  }
  
 -static inline bool
 -is_flush_rq(struct request *req, struct blk_mq_hw_ctx *hctx)
 -{
 -      return hctx->fq->flush_rq == req;
 -}
 +bool is_flush_rq(struct request *req);
  
  struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
                                              gfp_t flags);
@@@ -128,7 -132,7 +128,7 @@@ static inline bool integrity_req_gap_fr
                                bip_next->bip_vec[0].bv_offset);
  }
  
- void blk_integrity_add(struct gendisk *);
+ int blk_integrity_add(struct gendisk *disk);
  void blk_integrity_del(struct gendisk *);
  #else /* CONFIG_BLK_DEV_INTEGRITY */
  static inline bool blk_integrity_merge_rq(struct request_queue *rq,
@@@ -162,8 -166,9 +162,9 @@@ static inline bool bio_integrity_endio(
  static inline void bio_integrity_free(struct bio *bio)
  {
  }
- static inline void blk_integrity_add(struct gendisk *disk)
+ static inline int blk_integrity_add(struct gendisk *disk)
  {
+       return 0;
  }
  static inline void blk_integrity_del(struct gendisk *disk)
  {
@@@ -289,11 -294,13 +290,13 @@@ int create_task_io_context(struct task_
  extern int blk_throtl_init(struct request_queue *q);
  extern void blk_throtl_exit(struct request_queue *q);
  extern void blk_throtl_register_queue(struct request_queue *q);
+ extern void blk_throtl_charge_bio_split(struct bio *bio);
  bool blk_throtl_bio(struct bio *bio);
  #else /* CONFIG_BLK_DEV_THROTTLING */
  static inline int blk_throtl_init(struct request_queue *q) { return 0; }
  static inline void blk_throtl_exit(struct request_queue *q) { }
  static inline void blk_throtl_register_queue(struct request_queue *q) { }
+ static inline void blk_throtl_charge_bio_split(struct bio *bio) { }
  static inline bool blk_throtl_bio(struct bio *bio) { return false; }
  #endif /* CONFIG_BLK_DEV_THROTTLING */
  #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
@@@ -340,15 -347,14 +343,14 @@@ static inline void blk_queue_clear_zone
  
  int blk_alloc_ext_minor(void);
  void blk_free_ext_minor(unsigned int minor);
- char *disk_name(struct gendisk *hd, int partno, char *buf);
  #define ADDPART_FLAG_NONE     0
  #define ADDPART_FLAG_RAID     1
  #define ADDPART_FLAG_WHOLEDISK        2
- int bdev_add_partition(struct block_device *bdev, int partno,
-               sector_t start, sector_t length);
- int bdev_del_partition(struct block_device *bdev, int partno);
- int bdev_resize_partition(struct block_device *bdev, int partno,
-               sector_t start, sector_t length);
+ int bdev_add_partition(struct gendisk *disk, int partno, sector_t start,
+               sector_t length);
+ int bdev_del_partition(struct gendisk *disk, int partno);
+ int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start,
+               sector_t length);
  
  int bio_add_hw_page(struct request_queue *q, struct bio *bio,
                struct page *page, unsigned int len, unsigned int offset,
  
  struct request_queue *blk_alloc_queue(int node_id);
  
- void disk_alloc_events(struct gendisk *disk);
+ int disk_alloc_events(struct gendisk *disk);
  void disk_add_events(struct gendisk *disk);
  void disk_del_events(struct gendisk *disk);
  void disk_release_events(struct gendisk *disk);
diff --combined block/mq-deadline.c
index 3692067,0000000..3c3693c
mode 100644,000000..100644
--- /dev/null
@@@ -1,1104 -1,0 +1,1106 @@@
 +// SPDX-License-Identifier: GPL-2.0
 +/*
 + *  MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler,
 + *  for the blk-mq scheduling framework
 + *
 + *  Copyright (C) 2016 Jens Axboe <axboe@kernel.dk>
 + */
 +#include <linux/kernel.h>
 +#include <linux/fs.h>
 +#include <linux/blkdev.h>
 +#include <linux/blk-mq.h>
 +#include <linux/elevator.h>
 +#include <linux/bio.h>
 +#include <linux/module.h>
 +#include <linux/slab.h>
 +#include <linux/init.h>
 +#include <linux/compiler.h>
 +#include <linux/rbtree.h>
 +#include <linux/sbitmap.h>
 +
 +#include <trace/events/block.h>
 +
 +#include "blk.h"
 +#include "blk-mq.h"
 +#include "blk-mq-debugfs.h"
 +#include "blk-mq-tag.h"
 +#include "blk-mq-sched.h"
 +
 +/*
 + * See Documentation/block/deadline-iosched.rst
 + */
 +static const int read_expire = HZ / 2;  /* max time before a read is submitted. */
 +static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
 +static const int writes_starved = 2;    /* max times reads can starve a write */
 +static const int fifo_batch = 16;       /* # of sequential requests treated as one
 +                                   by the above parameters. For throughput. */
 +
 +enum dd_data_dir {
 +      DD_READ         = READ,
 +      DD_WRITE        = WRITE,
 +};
 +
 +enum { DD_DIR_COUNT = 2 };
 +
 +enum dd_prio {
 +      DD_RT_PRIO      = 0,
 +      DD_BE_PRIO      = 1,
 +      DD_IDLE_PRIO    = 2,
 +      DD_PRIO_MAX     = 2,
 +};
 +
 +enum { DD_PRIO_COUNT = 3 };
 +
 +/* I/O statistics per I/O priority. */
 +struct io_stats_per_prio {
 +      local_t inserted;
 +      local_t merged;
 +      local_t dispatched;
 +      local_t completed;
 +};
 +
 +/* I/O statistics for all I/O priorities (enum dd_prio). */
 +struct io_stats {
 +      struct io_stats_per_prio stats[DD_PRIO_COUNT];
 +};
 +
 +/*
 + * Deadline scheduler data per I/O priority (enum dd_prio). Requests are
 + * present on both sort_list[] and fifo_list[].
 + */
 +struct dd_per_prio {
 +      struct list_head dispatch;
 +      struct rb_root sort_list[DD_DIR_COUNT];
 +      struct list_head fifo_list[DD_DIR_COUNT];
 +      /* Next request in FIFO order. Read, write or both are NULL. */
 +      struct request *next_rq[DD_DIR_COUNT];
 +};
 +
 +struct deadline_data {
 +      /*
 +       * run time data
 +       */
 +
 +      struct dd_per_prio per_prio[DD_PRIO_COUNT];
 +
 +      /* Data direction of latest dispatched request. */
 +      enum dd_data_dir last_dir;
 +      unsigned int batching;          /* number of sequential requests made */
 +      unsigned int starved;           /* times reads have starved writes */
 +
 +      struct io_stats __percpu *stats;
 +
 +      /*
 +       * settings that change how the i/o scheduler behaves
 +       */
 +      int fifo_expire[DD_DIR_COUNT];
 +      int fifo_batch;
 +      int writes_starved;
 +      int front_merges;
 +      u32 async_depth;
 +
 +      spinlock_t lock;
 +      spinlock_t zone_lock;
 +};
 +
 +/* Count one event of type 'event_type' and with I/O priority 'prio' */
 +#define dd_count(dd, event_type, prio) do {                           \
 +      struct io_stats *io_stats = get_cpu_ptr((dd)->stats);           \
 +                                                                      \
 +      BUILD_BUG_ON(!__same_type((dd), struct deadline_data *));       \
 +      BUILD_BUG_ON(!__same_type((prio), enum dd_prio));               \
 +      local_inc(&io_stats->stats[(prio)].event_type);                 \
 +      put_cpu_ptr(io_stats);                                          \
 +} while (0)
 +
 +/*
 + * Returns the total number of dd_count(dd, event_type, prio) calls across all
 + * CPUs. No locking or barriers since it is fine if the returned sum is slightly
 + * outdated.
 + */
 +#define dd_sum(dd, event_type, prio) ({                                       \
 +      unsigned int cpu;                                               \
 +      u32 sum = 0;                                                    \
 +                                                                      \
 +      BUILD_BUG_ON(!__same_type((dd), struct deadline_data *));       \
 +      BUILD_BUG_ON(!__same_type((prio), enum dd_prio));               \
 +      for_each_present_cpu(cpu)                                       \
 +              sum += local_read(&per_cpu_ptr((dd)->stats, cpu)->      \
 +                                stats[(prio)].event_type);            \
 +      sum;                                                            \
 +})
 +
 +/* Maps an I/O priority class to a deadline scheduler priority. */
 +static const enum dd_prio ioprio_class_to_prio[] = {
 +      [IOPRIO_CLASS_NONE]     = DD_BE_PRIO,
 +      [IOPRIO_CLASS_RT]       = DD_RT_PRIO,
 +      [IOPRIO_CLASS_BE]       = DD_BE_PRIO,
 +      [IOPRIO_CLASS_IDLE]     = DD_IDLE_PRIO,
 +};
 +
 +static inline struct rb_root *
 +deadline_rb_root(struct dd_per_prio *per_prio, struct request *rq)
 +{
 +      return &per_prio->sort_list[rq_data_dir(rq)];
 +}
 +
 +/*
 + * Returns the I/O priority class (IOPRIO_CLASS_*) that has been assigned to a
 + * request.
 + */
 +static u8 dd_rq_ioclass(struct request *rq)
 +{
 +      return IOPRIO_PRIO_CLASS(req_get_ioprio(rq));
 +}
 +
 +/*
 + * get the request after `rq' in sector-sorted order
 + */
 +static inline struct request *
 +deadline_latter_request(struct request *rq)
 +{
 +      struct rb_node *node = rb_next(&rq->rb_node);
 +
 +      if (node)
 +              return rb_entry_rq(node);
 +
 +      return NULL;
 +}
 +
 +static void
 +deadline_add_rq_rb(struct dd_per_prio *per_prio, struct request *rq)
 +{
 +      struct rb_root *root = deadline_rb_root(per_prio, rq);
 +
 +      elv_rb_add(root, rq);
 +}
 +
 +static inline void
 +deadline_del_rq_rb(struct dd_per_prio *per_prio, struct request *rq)
 +{
 +      const enum dd_data_dir data_dir = rq_data_dir(rq);
 +
 +      if (per_prio->next_rq[data_dir] == rq)
 +              per_prio->next_rq[data_dir] = deadline_latter_request(rq);
 +
 +      elv_rb_del(deadline_rb_root(per_prio, rq), rq);
 +}
 +
 +/*
 + * remove rq from rbtree and fifo.
 + */
 +static void deadline_remove_request(struct request_queue *q,
 +                                  struct dd_per_prio *per_prio,
 +                                  struct request *rq)
 +{
 +      list_del_init(&rq->queuelist);
 +
 +      /*
 +       * We might not be on the rbtree, if we are doing an insert merge
 +       */
 +      if (!RB_EMPTY_NODE(&rq->rb_node))
 +              deadline_del_rq_rb(per_prio, rq);
 +
 +      elv_rqhash_del(q, rq);
 +      if (q->last_merge == rq)
 +              q->last_merge = NULL;
 +}
 +
 +static void dd_request_merged(struct request_queue *q, struct request *req,
 +                            enum elv_merge type)
 +{
 +      struct deadline_data *dd = q->elevator->elevator_data;
 +      const u8 ioprio_class = dd_rq_ioclass(req);
 +      const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
 +      struct dd_per_prio *per_prio = &dd->per_prio[prio];
 +
 +      /*
 +       * if the merge was a front merge, we need to reposition request
 +       */
 +      if (type == ELEVATOR_FRONT_MERGE) {
 +              elv_rb_del(deadline_rb_root(per_prio, req), req);
 +              deadline_add_rq_rb(per_prio, req);
 +      }
 +}
 +
 +/*
 + * Callback function that is invoked after @next has been merged into @req.
 + */
 +static void dd_merged_requests(struct request_queue *q, struct request *req,
 +                             struct request *next)
 +{
 +      struct deadline_data *dd = q->elevator->elevator_data;
 +      const u8 ioprio_class = dd_rq_ioclass(next);
 +      const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
 +
 +      dd_count(dd, merged, prio);
 +
 +      /*
 +       * if next expires before rq, assign its expire time to rq
 +       * and move into next position (next will be deleted) in fifo
 +       */
 +      if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
 +              if (time_before((unsigned long)next->fifo_time,
 +                              (unsigned long)req->fifo_time)) {
 +                      list_move(&req->queuelist, &next->queuelist);
 +                      req->fifo_time = next->fifo_time;
 +              }
 +      }
 +
 +      /*
 +       * kill knowledge of next, this one is a goner
 +       */
 +      deadline_remove_request(q, &dd->per_prio[prio], next);
 +}
 +
 +/*
 + * move an entry to dispatch queue
 + */
 +static void
 +deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
 +                    struct request *rq)
 +{
 +      const enum dd_data_dir data_dir = rq_data_dir(rq);
 +
 +      per_prio->next_rq[data_dir] = deadline_latter_request(rq);
 +
 +      /*
 +       * take it off the sort and fifo list
 +       */
 +      deadline_remove_request(rq->q, per_prio, rq);
 +}
 +
 +/* Number of requests queued for a given priority level. */
 +static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio)
 +{
 +      return dd_sum(dd, inserted, prio) - dd_sum(dd, completed, prio);
 +}
 +
 +/*
 + * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
 + * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
 + */
 +static inline int deadline_check_fifo(struct dd_per_prio *per_prio,
 +                                    enum dd_data_dir data_dir)
 +{
 +      struct request *rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next);
 +
 +      /*
 +       * rq is expired!
 +       */
 +      if (time_after_eq(jiffies, (unsigned long)rq->fifo_time))
 +              return 1;
 +
 +      return 0;
 +}
 +
 +/*
 + * For the specified data direction, return the next request to
 + * dispatch using arrival ordered lists.
 + */
 +static struct request *
 +deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
 +                    enum dd_data_dir data_dir)
 +{
 +      struct request *rq;
 +      unsigned long flags;
 +
 +      if (list_empty(&per_prio->fifo_list[data_dir]))
 +              return NULL;
 +
 +      rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next);
 +      if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q))
 +              return rq;
 +
 +      /*
 +       * Look for a write request that can be dispatched, that is one with
 +       * an unlocked target zone.
 +       */
 +      spin_lock_irqsave(&dd->zone_lock, flags);
 +      list_for_each_entry(rq, &per_prio->fifo_list[DD_WRITE], queuelist) {
 +              if (blk_req_can_dispatch_to_zone(rq))
 +                      goto out;
 +      }
 +      rq = NULL;
 +out:
 +      spin_unlock_irqrestore(&dd->zone_lock, flags);
 +
 +      return rq;
 +}
 +
 +/*
 + * For the specified data direction, return the next request to
 + * dispatch using sector position sorted lists.
 + */
 +static struct request *
 +deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
 +                    enum dd_data_dir data_dir)
 +{
 +      struct request *rq;
 +      unsigned long flags;
 +
 +      rq = per_prio->next_rq[data_dir];
 +      if (!rq)
 +              return NULL;
 +
 +      if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q))
 +              return rq;
 +
 +      /*
 +       * Look for a write request that can be dispatched, that is one with
 +       * an unlocked target zone.
 +       */
 +      spin_lock_irqsave(&dd->zone_lock, flags);
 +      while (rq) {
 +              if (blk_req_can_dispatch_to_zone(rq))
 +                      break;
 +              rq = deadline_latter_request(rq);
 +      }
 +      spin_unlock_irqrestore(&dd->zone_lock, flags);
 +
 +      return rq;
 +}
 +
 +/*
 + * deadline_dispatch_requests selects the best request according to
 + * read/write expire, fifo_batch, etc
 + */
 +static struct request *__dd_dispatch_request(struct deadline_data *dd,
 +                                           struct dd_per_prio *per_prio)
 +{
 +      struct request *rq, *next_rq;
 +      enum dd_data_dir data_dir;
 +      enum dd_prio prio;
 +      u8 ioprio_class;
 +
 +      lockdep_assert_held(&dd->lock);
 +
 +      if (!list_empty(&per_prio->dispatch)) {
 +              rq = list_first_entry(&per_prio->dispatch, struct request,
 +                                    queuelist);
 +              list_del_init(&rq->queuelist);
 +              goto done;
 +      }
 +
 +      /*
 +       * batches are currently reads XOR writes
 +       */
 +      rq = deadline_next_request(dd, per_prio, dd->last_dir);
 +      if (rq && dd->batching < dd->fifo_batch)
 +              /* we have a next request and are still entitled to batch */
 +              goto dispatch_request;
 +
 +      /*
 +       * at this point we are not running a batch. select the appropriate
 +       * data direction (read / write)
 +       */
 +
 +      if (!list_empty(&per_prio->fifo_list[DD_READ])) {
 +              BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_READ]));
 +
 +              if (deadline_fifo_request(dd, per_prio, DD_WRITE) &&
 +                  (dd->starved++ >= dd->writes_starved))
 +                      goto dispatch_writes;
 +
 +              data_dir = DD_READ;
 +
 +              goto dispatch_find_request;
 +      }
 +
 +      /*
 +       * there are either no reads or writes have been starved
 +       */
 +
 +      if (!list_empty(&per_prio->fifo_list[DD_WRITE])) {
 +dispatch_writes:
 +              BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_WRITE]));
 +
 +              dd->starved = 0;
 +
 +              data_dir = DD_WRITE;
 +
 +              goto dispatch_find_request;
 +      }
 +
 +      return NULL;
 +
 +dispatch_find_request:
 +      /*
 +       * we are not running a batch, find best request for selected data_dir
 +       */
 +      next_rq = deadline_next_request(dd, per_prio, data_dir);
 +      if (deadline_check_fifo(per_prio, data_dir) || !next_rq) {
 +              /*
 +               * A deadline has expired, the last request was in the other
 +               * direction, or we have run out of higher-sectored requests.
 +               * Start again from the request with the earliest expiry time.
 +               */
 +              rq = deadline_fifo_request(dd, per_prio, data_dir);
 +      } else {
 +              /*
 +               * The last req was the same dir and we have a next request in
 +               * sort order. No expired requests so continue on from here.
 +               */
 +              rq = next_rq;
 +      }
 +
 +      /*
 +       * For a zoned block device, if we only have writes queued and none of
 +       * them can be dispatched, rq will be NULL.
 +       */
 +      if (!rq)
 +              return NULL;
 +
 +      dd->last_dir = data_dir;
 +      dd->batching = 0;
 +
 +dispatch_request:
 +      /*
 +       * rq is the selected appropriate request.
 +       */
 +      dd->batching++;
 +      deadline_move_request(dd, per_prio, rq);
 +done:
 +      ioprio_class = dd_rq_ioclass(rq);
 +      prio = ioprio_class_to_prio[ioprio_class];
 +      dd_count(dd, dispatched, prio);
 +      /*
 +       * If the request needs its target zone locked, do it.
 +       */
 +      blk_req_zone_write_lock(rq);
 +      rq->rq_flags |= RQF_STARTED;
 +      return rq;
 +}
 +
 +/*
 + * Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests().
 + *
 + * One confusing aspect here is that we get called for a specific
 + * hardware queue, but we may return a request that is for a
 + * different hardware queue. This is because mq-deadline has shared
 + * state for all hardware queues, in terms of sorting, FIFOs, etc.
 + */
 +static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
 +{
 +      struct deadline_data *dd = hctx->queue->elevator->elevator_data;
 +      struct request *rq;
 +      enum dd_prio prio;
 +
 +      spin_lock(&dd->lock);
 +      for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
 +              rq = __dd_dispatch_request(dd, &dd->per_prio[prio]);
 +              if (rq)
 +                      break;
 +      }
 +      spin_unlock(&dd->lock);
 +
 +      return rq;
 +}
 +
 +/*
 + * Called by __blk_mq_alloc_request(). The shallow_depth value set by this
 + * function is used by __blk_mq_get_tag().
 + */
 +static void dd_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
 +{
 +      struct deadline_data *dd = data->q->elevator->elevator_data;
 +
 +      /* Do not throttle synchronous reads. */
 +      if (op_is_sync(op) && !op_is_write(op))
 +              return;
 +
 +      /*
 +       * Throttle asynchronous requests and writes such that these requests
 +       * do not block the allocation of synchronous requests.
 +       */
 +      data->shallow_depth = dd->async_depth;
 +}
 +
 +/* Called by blk_mq_update_nr_requests(). */
 +static void dd_depth_updated(struct blk_mq_hw_ctx *hctx)
 +{
 +      struct request_queue *q = hctx->queue;
 +      struct deadline_data *dd = q->elevator->elevator_data;
 +      struct blk_mq_tags *tags = hctx->sched_tags;
 +
 +      dd->async_depth = max(1UL, 3 * q->nr_requests / 4);
 +
 +      sbitmap_queue_min_shallow_depth(tags->bitmap_tags, dd->async_depth);
 +}
 +
 +/* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */
 +static int dd_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
 +{
 +      dd_depth_updated(hctx);
 +      return 0;
 +}
 +
 +static void dd_exit_sched(struct elevator_queue *e)
 +{
 +      struct deadline_data *dd = e->elevator_data;
 +      enum dd_prio prio;
 +
 +      for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
 +              struct dd_per_prio *per_prio = &dd->per_prio[prio];
 +
 +              WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_READ]));
 +              WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_WRITE]));
 +      }
 +
 +      free_percpu(dd->stats);
 +
 +      kfree(dd);
 +}
 +
 +/*
 + * initialize elevator private data (deadline_data).
 + */
 +static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
 +{
 +      struct deadline_data *dd;
 +      struct elevator_queue *eq;
 +      enum dd_prio prio;
 +      int ret = -ENOMEM;
 +
 +      eq = elevator_alloc(q, e);
 +      if (!eq)
 +              return ret;
 +
 +      dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
 +      if (!dd)
 +              goto put_eq;
 +
 +      eq->elevator_data = dd;
 +
 +      dd->stats = alloc_percpu_gfp(typeof(*dd->stats),
 +                                   GFP_KERNEL | __GFP_ZERO);
 +      if (!dd->stats)
 +              goto free_dd;
 +
 +      for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
 +              struct dd_per_prio *per_prio = &dd->per_prio[prio];
 +
 +              INIT_LIST_HEAD(&per_prio->dispatch);
 +              INIT_LIST_HEAD(&per_prio->fifo_list[DD_READ]);
 +              INIT_LIST_HEAD(&per_prio->fifo_list[DD_WRITE]);
 +              per_prio->sort_list[DD_READ] = RB_ROOT;
 +              per_prio->sort_list[DD_WRITE] = RB_ROOT;
 +      }
 +      dd->fifo_expire[DD_READ] = read_expire;
 +      dd->fifo_expire[DD_WRITE] = write_expire;
 +      dd->writes_starved = writes_starved;
 +      dd->front_merges = 1;
 +      dd->last_dir = DD_WRITE;
 +      dd->fifo_batch = fifo_batch;
 +      spin_lock_init(&dd->lock);
 +      spin_lock_init(&dd->zone_lock);
 +
 +      q->elevator = eq;
 +      return 0;
 +
 +free_dd:
 +      kfree(dd);
 +
 +put_eq:
 +      kobject_put(&eq->kobj);
 +      return ret;
 +}
 +
 +/*
 + * Try to merge @bio into an existing request. If @bio has been merged into
 + * an existing request, store the pointer to that request into *@rq.
 + */
 +static int dd_request_merge(struct request_queue *q, struct request **rq,
 +                          struct bio *bio)
 +{
 +      struct deadline_data *dd = q->elevator->elevator_data;
 +      const u8 ioprio_class = IOPRIO_PRIO_CLASS(bio->bi_ioprio);
 +      const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
 +      struct dd_per_prio *per_prio = &dd->per_prio[prio];
 +      sector_t sector = bio_end_sector(bio);
 +      struct request *__rq;
 +
 +      if (!dd->front_merges)
 +              return ELEVATOR_NO_MERGE;
 +
 +      __rq = elv_rb_find(&per_prio->sort_list[bio_data_dir(bio)], sector);
 +      if (__rq) {
 +              BUG_ON(sector != blk_rq_pos(__rq));
 +
 +              if (elv_bio_merge_ok(__rq, bio)) {
 +                      *rq = __rq;
++                      if (blk_discard_mergable(__rq))
++                              return ELEVATOR_DISCARD_MERGE;
 +                      return ELEVATOR_FRONT_MERGE;
 +              }
 +      }
 +
 +      return ELEVATOR_NO_MERGE;
 +}
 +
 +/*
 + * Attempt to merge a bio into an existing request. This function is called
 + * before @bio is associated with a request.
 + */
 +static bool dd_bio_merge(struct request_queue *q, struct bio *bio,
 +              unsigned int nr_segs)
 +{
 +      struct deadline_data *dd = q->elevator->elevator_data;
 +      struct request *free = NULL;
 +      bool ret;
 +
 +      spin_lock(&dd->lock);
 +      ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free);
 +      spin_unlock(&dd->lock);
 +
 +      if (free)
 +              blk_mq_free_request(free);
 +
 +      return ret;
 +}
 +
 +/*
 + * add rq to rbtree and fifo
 + */
 +static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 +                            bool at_head)
 +{
 +      struct request_queue *q = hctx->queue;
 +      struct deadline_data *dd = q->elevator->elevator_data;
 +      const enum dd_data_dir data_dir = rq_data_dir(rq);
 +      u16 ioprio = req_get_ioprio(rq);
 +      u8 ioprio_class = IOPRIO_PRIO_CLASS(ioprio);
 +      struct dd_per_prio *per_prio;
 +      enum dd_prio prio;
 +      LIST_HEAD(free);
 +
 +      lockdep_assert_held(&dd->lock);
 +
 +      /*
 +       * This may be a requeue of a write request that has locked its
 +       * target zone. If it is the case, this releases the zone lock.
 +       */
 +      blk_req_zone_write_unlock(rq);
 +
 +      prio = ioprio_class_to_prio[ioprio_class];
 +      dd_count(dd, inserted, prio);
 +      rq->elv.priv[0] = (void *)(uintptr_t)1;
 +
 +      if (blk_mq_sched_try_insert_merge(q, rq, &free)) {
 +              blk_mq_free_requests(&free);
 +              return;
 +      }
 +
 +      trace_block_rq_insert(rq);
 +
 +      per_prio = &dd->per_prio[prio];
 +      if (at_head) {
 +              list_add(&rq->queuelist, &per_prio->dispatch);
 +      } else {
 +              deadline_add_rq_rb(per_prio, rq);
 +
 +              if (rq_mergeable(rq)) {
 +                      elv_rqhash_add(q, rq);
 +                      if (!q->last_merge)
 +                              q->last_merge = rq;
 +              }
 +
 +              /*
 +               * set expire time and add to fifo list
 +               */
 +              rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
 +              list_add_tail(&rq->queuelist, &per_prio->fifo_list[data_dir]);
 +      }
 +}
 +
 +/*
 + * Called from blk_mq_sched_insert_request() or blk_mq_sched_insert_requests().
 + */
 +static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
 +                             struct list_head *list, bool at_head)
 +{
 +      struct request_queue *q = hctx->queue;
 +      struct deadline_data *dd = q->elevator->elevator_data;
 +
 +      spin_lock(&dd->lock);
 +      while (!list_empty(list)) {
 +              struct request *rq;
 +
 +              rq = list_first_entry(list, struct request, queuelist);
 +              list_del_init(&rq->queuelist);
 +              dd_insert_request(hctx, rq, at_head);
 +      }
 +      spin_unlock(&dd->lock);
 +}
 +
 +/* Callback from inside blk_mq_rq_ctx_init(). */
 +static void dd_prepare_request(struct request *rq)
 +{
 +      rq->elv.priv[0] = NULL;
 +}
 +
 +/*
 + * Callback from inside blk_mq_free_request().
 + *
 + * For zoned block devices, write unlock the target zone of
 + * completed write requests. Do this while holding the zone lock
 + * spinlock so that the zone is never unlocked while deadline_fifo_request()
 + * or deadline_next_request() are executing. This function is called for
 + * all requests, whether or not these requests complete successfully.
 + *
 + * For a zoned block device, __dd_dispatch_request() may have stopped
 + * dispatching requests if all the queued requests are write requests directed
 + * at zones that are already locked due to on-going write requests. To ensure
 + * write request dispatch progress in this case, mark the queue as needing a
 + * restart to ensure that the queue is run again after completion of the
 + * request and zones being unlocked.
 + */
 +static void dd_finish_request(struct request *rq)
 +{
 +      struct request_queue *q = rq->q;
 +      struct deadline_data *dd = q->elevator->elevator_data;
 +      const u8 ioprio_class = dd_rq_ioclass(rq);
 +      const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
 +      struct dd_per_prio *per_prio = &dd->per_prio[prio];
 +
 +      /*
 +       * The block layer core may call dd_finish_request() without having
 +       * called dd_insert_requests(). Hence only update statistics for
 +       * requests for which dd_insert_requests() has been called. See also
 +       * blk_mq_request_bypass_insert().
 +       */
 +      if (rq->elv.priv[0])
 +              dd_count(dd, completed, prio);
 +
 +      if (blk_queue_is_zoned(q)) {
 +              unsigned long flags;
 +
 +              spin_lock_irqsave(&dd->zone_lock, flags);
 +              blk_req_zone_write_unlock(rq);
 +              if (!list_empty(&per_prio->fifo_list[DD_WRITE]))
 +                      blk_mq_sched_mark_restart_hctx(rq->mq_hctx);
 +              spin_unlock_irqrestore(&dd->zone_lock, flags);
 +      }
 +}
 +
 +static bool dd_has_work_for_prio(struct dd_per_prio *per_prio)
 +{
 +      return !list_empty_careful(&per_prio->dispatch) ||
 +              !list_empty_careful(&per_prio->fifo_list[DD_READ]) ||
 +              !list_empty_careful(&per_prio->fifo_list[DD_WRITE]);
 +}
 +
 +static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
 +{
 +      struct deadline_data *dd = hctx->queue->elevator->elevator_data;
 +      enum dd_prio prio;
 +
 +      for (prio = 0; prio <= DD_PRIO_MAX; prio++)
 +              if (dd_has_work_for_prio(&dd->per_prio[prio]))
 +                      return true;
 +
 +      return false;
 +}
 +
 +/*
 + * sysfs parts below
 + */
 +#define SHOW_INT(__FUNC, __VAR)                                               \
 +static ssize_t __FUNC(struct elevator_queue *e, char *page)           \
 +{                                                                     \
 +      struct deadline_data *dd = e->elevator_data;                    \
 +                                                                      \
 +      return sysfs_emit(page, "%d\n", __VAR);                         \
 +}
 +#define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR))
 +SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]);
 +SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]);
 +SHOW_INT(deadline_writes_starved_show, dd->writes_starved);
 +SHOW_INT(deadline_front_merges_show, dd->front_merges);
 +SHOW_INT(deadline_async_depth_show, dd->async_depth);
 +SHOW_INT(deadline_fifo_batch_show, dd->fifo_batch);
 +#undef SHOW_INT
 +#undef SHOW_JIFFIES
 +
 +#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)                       \
 +static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)       \
 +{                                                                     \
 +      struct deadline_data *dd = e->elevator_data;                    \
 +      int __data, __ret;                                              \
 +                                                                      \
 +      __ret = kstrtoint(page, 0, &__data);                            \
 +      if (__ret < 0)                                                  \
 +              return __ret;                                           \
 +      if (__data < (MIN))                                             \
 +              __data = (MIN);                                         \
 +      else if (__data > (MAX))                                        \
 +              __data = (MAX);                                         \
 +      *(__PTR) = __CONV(__data);                                      \
 +      return count;                                                   \
 +}
 +#define STORE_INT(__FUNC, __PTR, MIN, MAX)                            \
 +      STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, )
 +#define STORE_JIFFIES(__FUNC, __PTR, MIN, MAX)                                \
 +      STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies)
 +STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX);
 +STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX);
 +STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX);
 +STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1);
 +STORE_INT(deadline_async_depth_store, &dd->async_depth, 1, INT_MAX);
 +STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX);
 +#undef STORE_FUNCTION
 +#undef STORE_INT
 +#undef STORE_JIFFIES
 +
 +#define DD_ATTR(name) \
 +      __ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store)
 +
 +static struct elv_fs_entry deadline_attrs[] = {
 +      DD_ATTR(read_expire),
 +      DD_ATTR(write_expire),
 +      DD_ATTR(writes_starved),
 +      DD_ATTR(front_merges),
 +      DD_ATTR(async_depth),
 +      DD_ATTR(fifo_batch),
 +      __ATTR_NULL
 +};
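For reference, the SHOW_*/STORE_* macros above expand to one small sysfs handler per attribute. An illustrative, slightly simplified expansion for fifo_batch (not additional code in this patch):

	static ssize_t deadline_fifo_batch_show(struct elevator_queue *e, char *page)
	{
		struct deadline_data *dd = e->elevator_data;

		return sysfs_emit(page, "%d\n", dd->fifo_batch);
	}

	static ssize_t deadline_fifo_batch_store(struct elevator_queue *e,
						 const char *page, size_t count)
	{
		struct deadline_data *dd = e->elevator_data;
		int val, ret;

		ret = kstrtoint(page, 0, &val);
		if (ret < 0)
			return ret;
		/* Clamp to the [0, INT_MAX] range given in STORE_INT() above. */
		val = clamp(val, 0, INT_MAX);
		dd->fifo_batch = val;
		return count;
	}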
 +
 +#ifdef CONFIG_BLK_DEBUG_FS
 +#define DEADLINE_DEBUGFS_DDIR_ATTRS(prio, data_dir, name)             \
 +static void *deadline_##name##_fifo_start(struct seq_file *m,         \
 +                                        loff_t *pos)                  \
 +      __acquires(&dd->lock)                                           \
 +{                                                                     \
 +      struct request_queue *q = m->private;                           \
 +      struct deadline_data *dd = q->elevator->elevator_data;          \
 +      struct dd_per_prio *per_prio = &dd->per_prio[prio];             \
 +                                                                      \
 +      spin_lock(&dd->lock);                                           \
 +      return seq_list_start(&per_prio->fifo_list[data_dir], *pos);    \
 +}                                                                     \
 +                                                                      \
 +static void *deadline_##name##_fifo_next(struct seq_file *m, void *v, \
 +                                       loff_t *pos)                   \
 +{                                                                     \
 +      struct request_queue *q = m->private;                           \
 +      struct deadline_data *dd = q->elevator->elevator_data;          \
 +      struct dd_per_prio *per_prio = &dd->per_prio[prio];             \
 +                                                                      \
 +      return seq_list_next(v, &per_prio->fifo_list[data_dir], pos);   \
 +}                                                                     \
 +                                                                      \
 +static void deadline_##name##_fifo_stop(struct seq_file *m, void *v)  \
 +      __releases(&dd->lock)                                           \
 +{                                                                     \
 +      struct request_queue *q = m->private;                           \
 +      struct deadline_data *dd = q->elevator->elevator_data;          \
 +                                                                      \
 +      spin_unlock(&dd->lock);                                         \
 +}                                                                     \
 +                                                                      \
 +static const struct seq_operations deadline_##name##_fifo_seq_ops = { \
 +      .start  = deadline_##name##_fifo_start,                         \
 +      .next   = deadline_##name##_fifo_next,                          \
 +      .stop   = deadline_##name##_fifo_stop,                          \
 +      .show   = blk_mq_debugfs_rq_show,                               \
 +};                                                                    \
 +                                                                      \
 +static int deadline_##name##_next_rq_show(void *data,                 \
 +                                        struct seq_file *m)           \
 +{                                                                     \
 +      struct request_queue *q = data;                                 \
 +      struct deadline_data *dd = q->elevator->elevator_data;          \
 +      struct dd_per_prio *per_prio = &dd->per_prio[prio];             \
 +      struct request *rq = per_prio->next_rq[data_dir];               \
 +                                                                      \
 +      if (rq)                                                         \
 +              __blk_mq_debugfs_rq_show(m, rq);                        \
 +      return 0;                                                       \
 +}
 +
 +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_READ, read0);
 +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_WRITE, write0);
 +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_READ, read1);
 +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_WRITE, write1);
 +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_READ, read2);
 +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_WRITE, write2);
 +#undef DEADLINE_DEBUGFS_DDIR_ATTRS
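The generated names encode the priority class and data direction: read0/write0 are the DD_RT_PRIO FIFOs, read1/write1 DD_BE_PRIO, and read2/write2 DD_IDLE_PRIO. As an illustration, the next_rq show helper generated for (DD_RT_PRIO, DD_READ) expands to roughly:

	static int deadline_read0_next_rq_show(void *data, struct seq_file *m)
	{
		struct request_queue *q = data;
		struct deadline_data *dd = q->elevator->elevator_data;
		struct dd_per_prio *per_prio = &dd->per_prio[DD_RT_PRIO];
		struct request *rq = per_prio->next_rq[DD_READ];

		if (rq)
			__blk_mq_debugfs_rq_show(m, rq);
		return 0;
	}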
 +
 +static int deadline_batching_show(void *data, struct seq_file *m)
 +{
 +      struct request_queue *q = data;
 +      struct deadline_data *dd = q->elevator->elevator_data;
 +
 +      seq_printf(m, "%u\n", dd->batching);
 +      return 0;
 +}
 +
 +static int deadline_starved_show(void *data, struct seq_file *m)
 +{
 +      struct request_queue *q = data;
 +      struct deadline_data *dd = q->elevator->elevator_data;
 +
 +      seq_printf(m, "%u\n", dd->starved);
 +      return 0;
 +}
 +
 +static int dd_async_depth_show(void *data, struct seq_file *m)
 +{
 +      struct request_queue *q = data;
 +      struct deadline_data *dd = q->elevator->elevator_data;
 +
 +      seq_printf(m, "%u\n", dd->async_depth);
 +      return 0;
 +}
 +
 +static int dd_queued_show(void *data, struct seq_file *m)
 +{
 +      struct request_queue *q = data;
 +      struct deadline_data *dd = q->elevator->elevator_data;
 +
 +      seq_printf(m, "%u %u %u\n", dd_queued(dd, DD_RT_PRIO),
 +                 dd_queued(dd, DD_BE_PRIO),
 +                 dd_queued(dd, DD_IDLE_PRIO));
 +      return 0;
 +}
 +
 +/* Number of requests owned by the block driver for a given priority. */
 +static u32 dd_owned_by_driver(struct deadline_data *dd, enum dd_prio prio)
 +{
 +      return dd_sum(dd, dispatched, prio) + dd_sum(dd, merged, prio)
 +              - dd_sum(dd, completed, prio);
 +}
 +
 +static int dd_owned_by_driver_show(void *data, struct seq_file *m)
 +{
 +      struct request_queue *q = data;
 +      struct deadline_data *dd = q->elevator->elevator_data;
 +
 +      seq_printf(m, "%u %u %u\n", dd_owned_by_driver(dd, DD_RT_PRIO),
 +                 dd_owned_by_driver(dd, DD_BE_PRIO),
 +                 dd_owned_by_driver(dd, DD_IDLE_PRIO));
 +      return 0;
 +}
 +
 +#define DEADLINE_DISPATCH_ATTR(prio)                                  \
 +static void *deadline_dispatch##prio##_start(struct seq_file *m,      \
 +                                           loff_t *pos)               \
 +      __acquires(&dd->lock)                                           \
 +{                                                                     \
 +      struct request_queue *q = m->private;                           \
 +      struct deadline_data *dd = q->elevator->elevator_data;          \
 +      struct dd_per_prio *per_prio = &dd->per_prio[prio];             \
 +                                                                      \
 +      spin_lock(&dd->lock);                                           \
 +      return seq_list_start(&per_prio->dispatch, *pos);               \
 +}                                                                     \
 +                                                                      \
 +static void *deadline_dispatch##prio##_next(struct seq_file *m,               \
 +                                          void *v, loff_t *pos)       \
 +{                                                                     \
 +      struct request_queue *q = m->private;                           \
 +      struct deadline_data *dd = q->elevator->elevator_data;          \
 +      struct dd_per_prio *per_prio = &dd->per_prio[prio];             \
 +                                                                      \
 +      return seq_list_next(v, &per_prio->dispatch, pos);              \
 +}                                                                     \
 +                                                                      \
 +static void deadline_dispatch##prio##_stop(struct seq_file *m, void *v)       \
 +      __releases(&dd->lock)                                           \
 +{                                                                     \
 +      struct request_queue *q = m->private;                           \
 +      struct deadline_data *dd = q->elevator->elevator_data;          \
 +                                                                      \
 +      spin_unlock(&dd->lock);                                         \
 +}                                                                     \
 +                                                                      \
 +static const struct seq_operations deadline_dispatch##prio##_seq_ops = { \
 +      .start  = deadline_dispatch##prio##_start,                      \
 +      .next   = deadline_dispatch##prio##_next,                       \
 +      .stop   = deadline_dispatch##prio##_stop,                       \
 +      .show   = blk_mq_debugfs_rq_show,                               \
 +}
 +
 +DEADLINE_DISPATCH_ATTR(0);
 +DEADLINE_DISPATCH_ATTR(1);
 +DEADLINE_DISPATCH_ATTR(2);
 +#undef DEADLINE_DISPATCH_ATTR
 +
 +#define DEADLINE_QUEUE_DDIR_ATTRS(name)                                       \
 +      {#name "_fifo_list", 0400,                                      \
 +                      .seq_ops = &deadline_##name##_fifo_seq_ops}
 +#define DEADLINE_NEXT_RQ_ATTR(name)                                   \
 +      {#name "_next_rq", 0400, deadline_##name##_next_rq_show}
 +static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = {
 +      DEADLINE_QUEUE_DDIR_ATTRS(read0),
 +      DEADLINE_QUEUE_DDIR_ATTRS(write0),
 +      DEADLINE_QUEUE_DDIR_ATTRS(read1),
 +      DEADLINE_QUEUE_DDIR_ATTRS(write1),
 +      DEADLINE_QUEUE_DDIR_ATTRS(read2),
 +      DEADLINE_QUEUE_DDIR_ATTRS(write2),
 +      DEADLINE_NEXT_RQ_ATTR(read0),
 +      DEADLINE_NEXT_RQ_ATTR(write0),
 +      DEADLINE_NEXT_RQ_ATTR(read1),
 +      DEADLINE_NEXT_RQ_ATTR(write1),
 +      DEADLINE_NEXT_RQ_ATTR(read2),
 +      DEADLINE_NEXT_RQ_ATTR(write2),
 +      {"batching", 0400, deadline_batching_show},
 +      {"starved", 0400, deadline_starved_show},
 +      {"async_depth", 0400, dd_async_depth_show},
 +      {"dispatch0", 0400, .seq_ops = &deadline_dispatch0_seq_ops},
 +      {"dispatch1", 0400, .seq_ops = &deadline_dispatch1_seq_ops},
 +      {"dispatch2", 0400, .seq_ops = &deadline_dispatch2_seq_ops},
 +      {"owned_by_driver", 0400, dd_owned_by_driver_show},
 +      {"queued", 0400, dd_queued_show},
 +      {},
 +};
 +#undef DEADLINE_QUEUE_DDIR_ATTRS
 +#endif
 +
 +static struct elevator_type mq_deadline = {
 +      .ops = {
 +              .depth_updated          = dd_depth_updated,
 +              .limit_depth            = dd_limit_depth,
 +              .insert_requests        = dd_insert_requests,
 +              .dispatch_request       = dd_dispatch_request,
 +              .prepare_request        = dd_prepare_request,
 +              .finish_request         = dd_finish_request,
 +              .next_request           = elv_rb_latter_request,
 +              .former_request         = elv_rb_former_request,
 +              .bio_merge              = dd_bio_merge,
 +              .request_merge          = dd_request_merge,
 +              .requests_merged        = dd_merged_requests,
 +              .request_merged         = dd_request_merged,
 +              .has_work               = dd_has_work,
 +              .init_sched             = dd_init_sched,
 +              .exit_sched             = dd_exit_sched,
 +              .init_hctx              = dd_init_hctx,
 +      },
 +
 +#ifdef CONFIG_BLK_DEBUG_FS
 +      .queue_debugfs_attrs = deadline_queue_debugfs_attrs,
 +#endif
 +      .elevator_attrs = deadline_attrs,
 +      .elevator_name = "mq-deadline",
 +      .elevator_alias = "deadline",
 +      .elevator_features = ELEVATOR_F_ZBD_SEQ_WRITE,
 +      .elevator_owner = THIS_MODULE,
 +};
 +MODULE_ALIAS("mq-deadline-iosched");
 +
 +static int __init deadline_init(void)
 +{
 +      return elv_register(&mq_deadline);
 +}
 +
 +static void __exit deadline_exit(void)
 +{
 +      elv_unregister(&mq_deadline);
 +}
 +
 +module_init(deadline_init);
 +module_exit(deadline_exit);
 +
 +MODULE_AUTHOR("Jens Axboe, Damien Le Moal and Bart Van Assche");
 +MODULE_LICENSE("GPL");
 +MODULE_DESCRIPTION("MQ deadline IO scheduler");
diff --combined block/partitions/ldm.c
@@@ -1,5 -1,5 +1,5 @@@
  // SPDX-License-Identifier: GPL-2.0-or-later
 -/**
 +/*
   * ldm - Support for Windows Logical Disk Manager (Dynamic Disks)
   *
   * Copyright (C) 2001,2002 Richard Russon <ldm@flatcap.org>
@@@ -304,7 -304,7 +304,7 @@@ static bool ldm_validate_privheads(stru
                }
        }
  
-       num_sects = state->bdev->bd_inode->i_size >> 9;
+       num_sects = get_capacity(state->disk);
  
        if ((ph[0]->config_start > num_sects) ||
           ((ph[0]->config_start + ph[0]->config_size) > num_sects)) {
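The substitution above is mechanical: both expressions give the whole-disk size in 512-byte sectors (SECTOR_SHIFT is 9), assuming the usual invariant that the block device inode's i_size is the capacity in bytes:

	/*
	 * num_sects before:  state->bdev->bd_inode->i_size >> 9
	 * num_sects after:   get_capacity(state->disk)
	 * Both count 512-byte sectors; the new form avoids touching bd_inode.
	 */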
@@@ -339,11 -339,11 +339,11 @@@ out
  /**
   * ldm_validate_tocblocks - Validate the table of contents and its backups
   * @state: Partition check state including device holding the LDM Database
-  * @base:  Offset, into @state->bdev, of the database
+  * @base:  Offset, into @state->disk, of the database
   * @ldb:   Cache of the database structures
   *
   * Find and compare the four tables of contents of the LDM Database stored on
-  * @state->bdev and return the parsed information into @toc1.
+  * @state->disk and return the parsed information into @toc1.
   *
   * The offsets and sizes of the configs are range-checked against a privhead.
   *
@@@ -486,8 -486,8 +486,8 @@@ out
   *       only likely to happen if the underlying device is strange.  If that IS
   *       the case we should return zero to let someone else try.
   *
-  * Return:  'true'   @state->bdev is a dynamic disk
-  *          'false'  @state->bdev is not a dynamic disk, or an error occurred
+  * Return:  'true'   @state->disk is a dynamic disk
+  *          'false'  @state->disk is not a dynamic disk, or an error occurred
   */
  static bool ldm_validate_partition_table(struct parsed_partitions *state)
  {
@@@ -1340,7 -1340,7 +1340,7 @@@ static bool ldm_frag_commit (struct lis
  /**
   * ldm_get_vblks - Read the on-disk database of VBLKs into memory
   * @state: Partition check state including device holding the LDM Database
-  * @base:  Offset, into @state->bdev, of the database
+  * @base:  Offset, into @state->disk, of the database
   * @ldb:   Cache of the database structures
   *
   * To use the information from the VBLKs, they need to be read from the disk,
@@@ -1432,10 -1432,10 +1432,10 @@@ static void ldm_free_vblks (struct list
   * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3,
   * and so on: the actual data containing partitions.
   *
-  * Return:  1 Success, @state->bdev is a dynamic disk and we handled it
-  *          0 Success, @state->bdev is not a dynamic disk
+  * Return:  1 Success, @state->disk is a dynamic disk and we handled it
+  *          0 Success, @state->disk is not a dynamic disk
   *         -1 An error occurred before enough information had been read
-  *            Or @state->bdev is a dynamic disk, but it may be corrupted
+  *            Or @state->disk is a dynamic disk, but it may be corrupted
   */
  int ldm_partition(struct parsed_partitions *state)
  {
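Given the 1/0/-1 return convention documented above, a caller such as the generic partition scanner would interpret the result roughly as follows (a hypothetical sketch, not code from this patch):

	int res = ldm_partition(state);

	if (res > 0) {
		/* Dynamic disk: partitions were added to *state. */
	} else if (res == 0) {
		/* Not an LDM dynamic disk: let the next parser try. */
	} else {
		/* res < 0: read error, or the LDM database looks corrupted. */
	}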
diff --combined drivers/block/virtio_blk.c
@@@ -166,11 -166,8 +166,8 @@@ static inline void virtblk_request_done
  {
        struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
  
-       if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
-               kfree(page_address(req->special_vec.bv_page) +
-                     req->special_vec.bv_offset);
-       }
+       if (req->rq_flags & RQF_SPECIAL_PAYLOAD)
+               kfree(bvec_virt(&req->special_vec));
        blk_mq_end_request(req, virtblk_result(vbr));
  }
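bvec_virt(), from include/linux/bvec.h, is essentially a named helper for the open-coded expression it replaces here and in the dasd_eckd.c hunks further below (valid for lowmem pages):

	static inline void *bvec_virt(struct bio_vec *bvec)
	{
		return page_address(bvec->bv_page) + bvec->bv_offset;
	}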
  
@@@ -692,28 -689,6 +689,28 @@@ static const struct blk_mq_ops virtio_m
  static unsigned int virtblk_queue_depth;
  module_param_named(queue_depth, virtblk_queue_depth, uint, 0444);
  
 +static int virtblk_validate(struct virtio_device *vdev)
 +{
 +      u32 blk_size;
 +
 +      if (!vdev->config->get) {
 +              dev_err(&vdev->dev, "%s failure: config access disabled\n",
 +                      __func__);
 +              return -EINVAL;
 +      }
 +
 +      if (!virtio_has_feature(vdev, VIRTIO_BLK_F_BLK_SIZE))
 +              return 0;
 +
 +      blk_size = virtio_cread32(vdev,
 +                      offsetof(struct virtio_blk_config, blk_size));
 +
 +      if (blk_size < SECTOR_SIZE || blk_size > PAGE_SIZE)
 +              __virtio_clear_bit(vdev, VIRTIO_BLK_F_BLK_SIZE);
 +
 +      return 0;
 +}
 +
  static int virtblk_probe(struct virtio_device *vdev)
  {
        struct virtio_blk *vblk;
        u8 physical_block_exp, alignment_offset;
        unsigned int queue_depth;
  
 -      if (!vdev->config->get) {
 -              dev_err(&vdev->dev, "%s failure: config access disabled\n",
 -                      __func__);
 -              return -EINVAL;
 -      }
 -
        err = ida_simple_get(&vd_index_ida, 0, minor_to_index(1 << MINORBITS),
                             GFP_KERNEL);
        if (err < 0)
        else
                blk_size = queue_logical_block_size(q);
  
 +      if (unlikely(blk_size < SECTOR_SIZE || blk_size > PAGE_SIZE)) {
 +              dev_err(&vdev->dev,
 +                      "block size is changed unexpectedly, now is %u\n",
 +                      blk_size);
 +              err = -EINVAL;
-               goto err_cleanup_disk;
++              goto out_cleanup_disk;
 +      }
 +
        /* Use topology information if available */
        err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
                                   struct virtio_blk_config, physical_block_exp,
        virtblk_update_capacity(vblk, false);
        virtio_device_ready(vdev);
  
-       device_add_disk(&vdev->dev, vblk->disk, virtblk_attr_groups);
+       err = device_add_disk(&vdev->dev, vblk->disk, virtblk_attr_groups);
+       if (err)
+               goto out_cleanup_disk;
        return 0;
  
- err_cleanup_disk:
+ out_cleanup_disk:
        blk_cleanup_disk(vblk->disk);
  out_free_tags:
        blk_mq_free_tag_set(&vblk->tag_set);
@@@ -1009,7 -985,6 +1009,7 @@@ static struct virtio_driver virtio_blk 
        .driver.name                    = KBUILD_MODNAME,
        .driver.owner                   = THIS_MODULE,
        .id_table                       = id_table,
 +      .validate                       = virtblk_validate,
        .probe                          = virtblk_probe,
        .remove                         = virtblk_remove,
        .config_changed                 = virtblk_config_changed,
diff --combined drivers/s390/block/dasd_eckd.c
@@@ -1004,23 -1004,15 +1004,23 @@@ static unsigned char dasd_eckd_path_acc
  static void dasd_eckd_store_conf_data(struct dasd_device *device,
                                      struct dasd_conf_data *conf_data, int chp)
  {
 +      struct dasd_eckd_private *private = device->private;
        struct channel_path_desc_fmt0 *chp_desc;
        struct subchannel_id sch_id;
 +      void *cdp;
  
 -      ccw_device_get_schid(device->cdev, &sch_id);
        /*
         * path handling and read_conf allocate data
         * free it before replacing the pointer
 +       * also replace the old private->conf_data pointer
 +       * with the new one if this points to the same data
         */
 -      kfree(device->path[chp].conf_data);
 +      cdp = device->path[chp].conf_data;
 +      if (private->conf_data == cdp) {
 +              private->conf_data = (void *)conf_data;
 +              dasd_eckd_identify_conf_parts(private);
 +      }
 +      ccw_device_get_schid(device->cdev, &sch_id);
        device->path[chp].conf_data = conf_data;
        device->path[chp].cssid = sch_id.cssid;
        device->path[chp].ssid = sch_id.ssid;
        if (chp_desc)
                device->path[chp].chpid = chp_desc->chpid;
        kfree(chp_desc);
 +      kfree(cdp);
  }
  
  static void dasd_eckd_clear_conf_data(struct dasd_device *device)
@@@ -3276,7 -3267,7 +3276,7 @@@ static int dasd_eckd_ese_read(struct da
        end_blk = (curr_trk + 1) * recs_per_trk;
  
        rq_for_each_segment(bv, req, iter) {
-               dst = page_address(bv.bv_page) + bv.bv_offset;
+               dst = bvec_virt(&bv);
                for (off = 0; off < bv.bv_len; off += blksize) {
                        if (first_blk + blk_count >= end_blk) {
                                cqr->proc_bytes = blk_count * blksize;
@@@ -4008,7 -3999,7 +4008,7 @@@ static struct dasd_ccw_req *dasd_eckd_b
                              last_rec - recid + 1, cmd, basedev, blksize);
        }
        rq_for_each_segment(bv, req, iter) {
-               dst = page_address(bv.bv_page) + bv.bv_offset;
+               dst = bvec_virt(&bv);
                if (dasd_page_cache) {
                        char *copy = kmem_cache_alloc(dasd_page_cache,
                                                      GFP_DMA | __GFP_NOWARN);
@@@ -4175,7 -4166,7 +4175,7 @@@ static struct dasd_ccw_req *dasd_eckd_b
        idaw_dst = NULL;
        idaw_len = 0;
        rq_for_each_segment(bv, req, iter) {
-               dst = page_address(bv.bv_page) + bv.bv_offset;
+               dst = bvec_virt(&bv);
                seg_len = bv.bv_len;
                while (seg_len) {
                        if (new_track) {
@@@ -4518,7 -4509,7 +4518,7 @@@ static struct dasd_ccw_req *dasd_eckd_b
                new_track = 1;
                recid = first_rec;
                rq_for_each_segment(bv, req, iter) {
-                       dst = page_address(bv.bv_page) + bv.bv_offset;
+                       dst = bvec_virt(&bv);
                        seg_len = bv.bv_len;
                        while (seg_len) {
                                if (new_track) {
                }
        } else {
                rq_for_each_segment(bv, req, iter) {
-                       dst = page_address(bv.bv_page) + bv.bv_offset;
+                       dst = bvec_virt(&bv);
                        last_tidaw = itcw_add_tidaw(itcw, 0x00,
                                                    dst, bv.bv_len);
                        if (IS_ERR(last_tidaw)) {
@@@ -4787,7 -4778,7 +4787,7 @@@ static struct dasd_ccw_req *dasd_eckd_b
                        idaws = idal_create_words(idaws, rawpadpage, PAGE_SIZE);
        }
        rq_for_each_segment(bv, req, iter) {
-               dst = page_address(bv.bv_page) + bv.bv_offset;
+               dst = bvec_virt(&bv);
                seg_len = bv.bv_len;
                if (cmd == DASD_ECKD_CCW_READ_TRACK)
                        memset(dst, 0, seg_len);
@@@ -4848,7 -4839,7 +4848,7 @@@ dasd_eckd_free_cp(struct dasd_ccw_req *
        if (private->uses_cdl == 0 || recid > 2*blk_per_trk)
                ccw++;
        rq_for_each_segment(bv, req, iter) {
-               dst = page_address(bv.bv_page) + bv.bv_offset;
+               dst = bvec_virt(&bv);
                for (off = 0; off < bv.bv_len; off += blksize) {
                        /* Skip locate record. */
                        if (private->uses_cdl && recid <= 2*blk_per_trk)
diff --combined drivers/scsi/sr.c
@@@ -106,6 -106,8 +106,8 @@@ static struct scsi_driver sr_template 
  static unsigned long sr_index_bits[SR_DISKS / BITS_PER_LONG];
  static DEFINE_SPINLOCK(sr_index_lock);
  
+ static struct lock_class_key sr_bio_compl_lkclass;
  /* This semaphore is used to mediate the 0->1 reference get in the
   * face of object destruction (i.e. we can't allow a get on an
   * object after last put) */
@@@ -221,7 -223,7 +223,7 @@@ static unsigned int sr_get_events(struc
        else if (med->media_event_code == 2)
                return DISK_EVENT_MEDIA_CHANGE;
        else if (med->media_event_code == 3)
 -              return DISK_EVENT_EJECT_REQUEST;
 +              return DISK_EVENT_MEDIA_CHANGE;
        return 0;
  }
  
@@@ -712,7 -714,8 +714,8 @@@ static int sr_probe(struct device *dev
  
        kref_init(&cd->kref);
  
-       disk = alloc_disk(1);
+       disk = __alloc_disk_node(sdev->request_queue, NUMA_NO_NODE,
+                                &sr_bio_compl_lkclass);
        if (!disk)
                goto fail_free;
        mutex_init(&cd->lock);
  
        disk->major = SCSI_CDROM_MAJOR;
        disk->first_minor = minor;
+       disk->minors = 1;
        sprintf(disk->disk_name, "sr%d", minor);
        disk->fops = &sr_bdops;
        disk->flags = GENHD_FL_CD | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE;
  
        set_capacity(disk, cd->capacity);
        disk->private_data = &cd->driver;
-       disk->queue = sdev->request_queue;
  
        if (register_cdrom(disk, &cd->cdi))
                goto fail_minor;
diff --combined include/linux/fs.h
@@@ -436,10 -436,6 +436,10 @@@ int pagecache_write_end(struct file *, 
   * struct address_space - Contents of a cacheable, mappable object.
   * @host: Owner, either the inode or the block_device.
   * @i_pages: Cached pages.
 + * @invalidate_lock: Guards coherency between page cache contents and
 + *   file offset->disk block mappings in the filesystem during invalidates.
 + *   It is also used to block modification of page cache contents through
 + *   memory mappings.
   * @gfp_mask: Memory allocation flags to use for allocating pages.
   * @i_mmap_writable: Number of VM_SHARED mappings.
   * @nr_thps: Number of THPs in the pagecache (non-shmem only).
  struct address_space {
        struct inode            *host;
        struct xarray           i_pages;
 +      struct rw_semaphore     invalidate_lock;
        gfp_t                   gfp_mask;
        atomic_t                i_mmap_writable;
  #ifdef CONFIG_READ_ONLY_THP_FOR_FS
@@@ -819,42 -814,9 +819,42 @@@ static inline void inode_lock_shared_ne
        down_read_nested(&inode->i_rwsem, subclass);
  }
  
 +static inline void filemap_invalidate_lock(struct address_space *mapping)
 +{
 +      down_write(&mapping->invalidate_lock);
 +}
 +
 +static inline void filemap_invalidate_unlock(struct address_space *mapping)
 +{
 +      up_write(&mapping->invalidate_lock);
 +}
 +
 +static inline void filemap_invalidate_lock_shared(struct address_space *mapping)
 +{
 +      down_read(&mapping->invalidate_lock);
 +}
 +
 +static inline int filemap_invalidate_trylock_shared(
 +                                      struct address_space *mapping)
 +{
 +      return down_read_trylock(&mapping->invalidate_lock);
 +}
 +
 +static inline void filemap_invalidate_unlock_shared(
 +                                      struct address_space *mapping)
 +{
 +      up_read(&mapping->invalidate_lock);
 +}
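These helpers pair with the new @invalidate_lock documented above: a filesystem holds the lock exclusively while it removes page cache pages whose backing blocks it is about to free, and page-cache-filling paths take it shared. A hypothetical usage sketch (illustrative only; example_fs_free_blocks() is a placeholder, not a real API):

	static int example_fs_punch_hole(struct inode *inode, loff_t start, loff_t len)
	{
		struct address_space *mapping = inode->i_mapping;
		int err;

		filemap_invalidate_lock(mapping);
		/* No new pages can be faulted in or read ahead past this point. */
		truncate_pagecache_range(inode, start, start + len - 1);
		err = example_fs_free_blocks(inode, start, len);	/* placeholder */
		filemap_invalidate_unlock(mapping);
		return err;
	}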
 +
  void lock_two_nondirectories(struct inode *, struct inode*);
  void unlock_two_nondirectories(struct inode *, struct inode*);
  
 +void filemap_invalidate_lock_two(struct address_space *mapping1,
 +                               struct address_space *mapping2);
 +void filemap_invalidate_unlock_two(struct address_space *mapping1,
 +                                 struct address_space *mapping2);
 +
 +
  /*
   * NOTE: in a 32bit arch with a preemptable kernel and
   * an UP compile the i_size_read/write must be atomic
@@@ -1545,11 -1507,8 +1545,11 @@@ struct super_block 
        /* Number of inodes with nlink == 0 but still referenced */
        atomic_long_t s_remove_count;
  
 -      /* Pending fsnotify inode refs */
 -      atomic_long_t s_fsnotify_inode_refs;
 +      /*
 +       * Number of inode/mount/sb objects that are being watched, note that
 +       * inodes objects are currently double-accounted.
 +       */
 +      atomic_long_t s_fsnotify_connectors;
  
        /* Being remounted read-only */
        int s_readonly_remount;
@@@ -2528,7 -2487,6 +2528,7 @@@ struct file_system_type 
  
        struct lock_class_key i_lock_key;
        struct lock_class_key i_mutex_key;
 +      struct lock_class_key invalidate_lock_key;
        struct lock_class_key i_mutex_dir_key;
  };
  
@@@ -2612,6 -2570,90 +2612,6 @@@ extern struct kobject *fs_kobj
  
  #define MAX_RW_COUNT (INT_MAX & PAGE_MASK)
  
 -#ifdef CONFIG_MANDATORY_FILE_LOCKING
 -extern int locks_mandatory_locked(struct file *);
 -extern int locks_mandatory_area(struct inode *, struct file *, loff_t, loff_t, unsigned char);
 -
 -/*
 - * Candidates for mandatory locking have the setgid bit set
 - * but no group execute bit -  an otherwise meaningless combination.
 - */
 -
 -static inline int __mandatory_lock(struct inode *ino)
 -{
 -      return (ino->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID;
 -}
 -
 -/*
 - * ... and these candidates should be on SB_MANDLOCK mounted fs,
 - * otherwise these will be advisory locks
 - */
 -
 -static inline int mandatory_lock(struct inode *ino)
 -{
 -      return IS_MANDLOCK(ino) && __mandatory_lock(ino);
 -}
 -
 -static inline int locks_verify_locked(struct file *file)
 -{
 -      if (mandatory_lock(locks_inode(file)))
 -              return locks_mandatory_locked(file);
 -      return 0;
 -}
 -
 -static inline int locks_verify_truncate(struct inode *inode,
 -                                  struct file *f,
 -                                  loff_t size)
 -{
 -      if (!inode->i_flctx || !mandatory_lock(inode))
 -              return 0;
 -
 -      if (size < inode->i_size) {
 -              return locks_mandatory_area(inode, f, size, inode->i_size - 1,
 -                              F_WRLCK);
 -      } else {
 -              return locks_mandatory_area(inode, f, inode->i_size, size - 1,
 -                              F_WRLCK);
 -      }
 -}
 -
 -#else /* !CONFIG_MANDATORY_FILE_LOCKING */
 -
 -static inline int locks_mandatory_locked(struct file *file)
 -{
 -      return 0;
 -}
 -
 -static inline int locks_mandatory_area(struct inode *inode, struct file *filp,
 -                                       loff_t start, loff_t end, unsigned char type)
 -{
 -      return 0;
 -}
 -
 -static inline int __mandatory_lock(struct inode *inode)
 -{
 -      return 0;
 -}
 -
 -static inline int mandatory_lock(struct inode *inode)
 -{
 -      return 0;
 -}
 -
 -static inline int locks_verify_locked(struct file *file)
 -{
 -      return 0;
 -}
 -
 -static inline int locks_verify_truncate(struct inode *inode, struct file *filp,
 -                                      size_t size)
 -{
 -      return 0;
 -}
 -
 -#endif /* CONFIG_MANDATORY_FILE_LOCKING */
 -
 -
  #ifdef CONFIG_FILE_LOCKING
  static inline int break_lease(struct inode *inode, unsigned int mode)
  {
@@@ -3204,10 -3246,6 +3204,6 @@@ ssize_t vfs_iocb_iter_read(struct file 
  ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
                            struct iov_iter *iter);
  
- /* fs/block_dev.c */
- extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
-                       int datasync);
  /* fs/splice.c */
  extern ssize_t generic_file_splice_read(struct file *, loff_t *,
                struct pipe_inode_info *, size_t, unsigned int);
diff --combined lib/Kconfig.debug
@@@ -1235,7 -1235,7 +1235,7 @@@ config PROVE_LOCKIN
        depends on DEBUG_KERNEL && LOCK_DEBUGGING_SUPPORT
        select LOCKDEP
        select DEBUG_SPINLOCK
 -      select DEBUG_MUTEXES
 +      select DEBUG_MUTEXES if !PREEMPT_RT
        select DEBUG_RT_MUTEXES if RT_MUTEXES
        select DEBUG_RWSEMS
        select DEBUG_WW_MUTEX_SLOWPATH
@@@ -1299,7 -1299,7 +1299,7 @@@ config LOCK_STA
        depends on DEBUG_KERNEL && LOCK_DEBUGGING_SUPPORT
        select LOCKDEP
        select DEBUG_SPINLOCK
 -      select DEBUG_MUTEXES
 +      select DEBUG_MUTEXES if !PREEMPT_RT
        select DEBUG_RT_MUTEXES if RT_MUTEXES
        select DEBUG_LOCK_ALLOC
        default n
@@@ -1335,7 -1335,7 +1335,7 @@@ config DEBUG_SPINLOC
  
  config DEBUG_MUTEXES
        bool "Mutex debugging: basic checks"
 -      depends on DEBUG_KERNEL
 +      depends on DEBUG_KERNEL && !PREEMPT_RT
        help
         This feature allows mutex semantics violations to be detected and
         reported.
@@@ -1345,8 -1345,7 +1345,8 @@@ config DEBUG_WW_MUTEX_SLOWPAT
        depends on DEBUG_KERNEL && LOCK_DEBUGGING_SUPPORT
        select DEBUG_LOCK_ALLOC
        select DEBUG_SPINLOCK
 -      select DEBUG_MUTEXES
 +      select DEBUG_MUTEXES if !PREEMPT_RT
 +      select DEBUG_RT_MUTEXES if PREEMPT_RT
        help
         This feature enables slowpath testing for w/w mutex users by
         injecting additional -EDEADLK wound/backoff cases. Together with
@@@ -1369,7 -1368,7 +1369,7 @@@ config DEBUG_LOCK_ALLO
        bool "Lock debugging: detect incorrect freeing of live locks"
        depends on DEBUG_KERNEL && LOCK_DEBUGGING_SUPPORT
        select DEBUG_SPINLOCK
 -      select DEBUG_MUTEXES
 +      select DEBUG_MUTEXES if !PREEMPT_RT
        select DEBUG_RT_MUTEXES if RT_MUTEXES
        select LOCKDEP
        help
@@@ -1680,33 -1679,6 +1680,6 @@@ config DEBUG_WQ_FORCE_RR_CP
          feature by default.  When enabled, memory and cache locality will
          be impacted.
  
- config DEBUG_BLOCK_EXT_DEVT
-       bool "Force extended block device numbers and spread them"
-       depends on DEBUG_KERNEL
-       depends on BLOCK
-       default n
-       help
-         BIG FAT WARNING: ENABLING THIS OPTION MIGHT BREAK BOOTING ON
-         SOME DISTRIBUTIONS.  DO NOT ENABLE THIS UNLESS YOU KNOW WHAT
-         YOU ARE DOING.  Distros, please enable this and fix whatever
-         is broken.
-         Conventionally, block device numbers are allocated from
-         predetermined contiguous area.  However, extended block area
-         may introduce non-contiguous block device numbers.  This
-         option forces most block device numbers to be allocated from
-         the extended space and spreads them to discover kernel or
-         userland code paths which assume predetermined contiguous
-         device number allocation.
-         Note that turning on this debug option shuffles all the
-         device numbers for all IDE and SCSI devices including libata
-         ones, so root partition specified using device number
-         directly (via rdev or root=MAJ:MIN) won't work anymore.
-         Textual device names (root=/dev/sdXn) will continue to work.
-         Say N if you are unsure.
  config CPU_HOTPLUG_STATE_CONTROL
        bool "Enable CPU hotplug state control"
        depends on DEBUG_KERNEL