Merge tag 'for-6.8/block-2024-01-18' of git://git.kernel.dk/linux
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 19 Jan 2024 02:22:40 +0000 (18:22 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 19 Jan 2024 02:22:40 +0000 (18:22 -0800)
Pull block fixes from Jens Axboe:

 - NVMe pull request via Keith:
      - tcp, fc, and rdma target fixes (Maurizio, Daniel, Hannes,
        Christoph)
      - discard fixes and improvements (Christoph)
      - timeout debug improvements (Keith, Max)
      - various cleanups (Daniel, Max, Guixin)
      - trace event string fixes (Arnd)
      - shadow doorbell setup on reset fix (William)
      - a write zeroes quirk for SK Hynix (Jim)

 - MD pull request via Song:
      - Sparse warning since v6.0 (Bart)
      - /proc/mdstat regression since v6.7 (Yu Kuai)

 - Use symbolic error value (Christian)

 - IO Priority documentation update (Christian)

 - Fix for accessing queue limits without having entered the queue
   (Christoph, me)

 - Fix for loop dio support (Christoph)

 - Move null_blk off deprecated ida interface (Christophe)

 - Ensure nbd initializes full msghdr (Eric)

 - Fix for a regression with the folio conversion, which is now easier
   to hit because of an unrelated change (Matthew)

 - Remove redundant check in virtio-blk (Li)

 - Fix for a potential hang in sbitmap (Ming)

 - Fix for partial zone appending (Damien)

 - Misc changes and fixes (Bart, me, Kemeng, Dmitry)

* tag 'for-6.8/block-2024-01-18' of git://git.kernel.dk/linux: (45 commits)
  Documentation: block: ioprio: Update schedulers
  loop: fix the direct I/O support check when used on top of block devices
  blk-mq: Remove the hctx 'run' debugfs attribute
  nbd: always initialize struct msghdr completely
  block: Fix iterating over an empty bio with bio_for_each_folio_all
  block: bio-integrity: fix kcalloc() arguments order
  virtio_blk: remove duplicate check if queue is broken in virtblk_done
  sbitmap: remove stale comment in sbq_calc_wake_batch
  block: Correct a documentation comment in blk-cgroup.c
  null_blk: Remove usage of the deprecated ida_simple_xx() API
  block: ensure we hold a queue reference when using queue limits
  blk-mq: rename blk_mq_can_use_cached_rq
  block: print symbolic error name instead of error code
  blk-mq: fix IO hang from sbitmap wakeup race
  nvmet-rdma: avoid circular locking dependency on install_queue()
  nvmet-tcp: avoid circular locking dependency on install_queue()
  nvme-pci: set doorbell config before unquiescing
  block: fix partial zone append completion handling in req_bio_endio()
  block/iocost: silence warning on 'last_period' potentially being unused
  md/raid1: Use blk_opf_t for read and write operations
  ...

block/blk-mq.c
drivers/block/loop.c
drivers/block/virtio_blk.c
drivers/md/md.c
drivers/nvme/host/core.c
drivers/nvme/host/nvme.h
drivers/nvme/host/pci.c
drivers/nvme/host/rdma.c
drivers/nvme/host/tcp.c

diff --combined block/blk-mq.c
@@@ -772,11 -772,16 +772,16 @@@ static void req_bio_endio(struct reques
                /*
                 * Partial zone append completions cannot be supported as the
                 * BIO fragments may end up not being written sequentially.
+                * For such case, force the completed nbytes to be equal to
+                * the BIO size so that bio_advance() sets the BIO remaining
+                * size to 0 and we end up calling bio_endio() before returning.
                 */
-               if (bio->bi_iter.bi_size != nbytes)
+               if (bio->bi_iter.bi_size != nbytes) {
                        bio->bi_status = BLK_STS_IOERR;
-               else
+                       nbytes = bio->bi_iter.bi_size;
+               } else {
                        bio->bi_iter.bi_sector = rq->__sector;
+               }
        }
  
        bio_advance(bio, nbytes);
@@@ -1513,26 -1518,14 +1518,26 @@@ void blk_mq_delay_kick_requeue_list(str
  }
  EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
  
 +static bool blk_is_flush_data_rq(struct request *rq)
 +{
 +      return (rq->rq_flags & RQF_FLUSH_SEQ) && !is_flush_rq(rq);
 +}
 +
  static bool blk_mq_rq_inflight(struct request *rq, void *priv)
  {
        /*
         * If we find a request that isn't idle we know the queue is busy
         * as it's checked in the iter.
         * Return false to stop the iteration.
 +       *
 +       * In case of queue quiesce, if one flush data request is completed,
 +       * don't count it as inflight given the flush sequence is suspended,
 +       * and the original flush data request is invisible to driver, just
 +       * like other pending requests because of quiesce
         */
 -      if (blk_mq_request_started(rq)) {
 +      if (blk_mq_request_started(rq) && !(blk_queue_quiesced(rq->q) &&
 +                              blk_is_flush_data_rq(rq) &&
 +                              blk_mq_request_completed(rq))) {
                bool *busy = priv;
  
                *busy = true;
@@@ -1859,6 -1852,22 +1864,22 @@@ static bool blk_mq_mark_tag_wait(struc
        wait->flags &= ~WQ_FLAG_EXCLUSIVE;
        __add_wait_queue(wq, wait);
  
+       /*
+        * Add one explicit barrier since blk_mq_get_driver_tag() may
+        * not imply barrier in case of failure.
+        *
+        * Order adding us to wait queue and allocating driver tag.
+        *
+        * The pair is the one implied in sbitmap_queue_wake_up() which
+        * orders clearing sbitmap tag bits and waitqueue_active() in
+        * __sbitmap_queue_wake_up(), since waitqueue_active() is lockless
+        *
+        * Otherwise, re-order of adding wait queue and getting driver tag
+        * may cause __sbitmap_queue_wake_up() to wake up nothing because
+        * the waitqueue_active() may not observe us in wait queue.
+        */
+       smp_mb();
        /*
         * It's possible that a tag was freed in the window between the
         * allocation failure and adding the hardware queue to the wait
@@@ -2891,8 -2900,11 +2912,11 @@@ static struct request *blk_mq_get_new_r
        return NULL;
  }
  
- /* return true if this @rq can be used for @bio */
- static bool blk_mq_can_use_cached_rq(struct request *rq, struct blk_plug *plug,
+ /*
+  * Check if we can use the passed on request for submitting the passed in bio,
+  * and remove it from the request list if it can be used.
+  */
+ static bool blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
                struct bio *bio)
  {
        enum hctx_type type = blk_mq_get_hctx_type(bio->bi_opf);
@@@ -2952,12 -2964,6 +2976,6 @@@ void blk_mq_submit_bio(struct bio *bio
        blk_status_t ret;
  
        bio = blk_queue_bounce(bio, q);
-       if (bio_may_exceed_limits(bio, &q->limits)) {
-               bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
-               if (!bio)
-                       return;
-       }
        bio_set_ioprio(bio);
  
        if (plug) {
                        rq = NULL;
        }
        if (rq) {
+               if (unlikely(bio_may_exceed_limits(bio, &q->limits))) {
+                       bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
+                       if (!bio)
+                               return;
+               }
                if (!bio_integrity_prep(bio))
                        return;
                if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
                        return;
-               if (blk_mq_can_use_cached_rq(rq, plug, bio))
+               if (blk_mq_use_cached_rq(rq, plug, bio))
                        goto done;
                percpu_ref_get(&q->q_usage_counter);
        } else {
                if (unlikely(bio_queue_enter(bio)))
                        return;
+               if (unlikely(bio_may_exceed_limits(bio, &q->limits))) {
+                       bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
+                       if (!bio)
+                               goto fail;
+               }
                if (!bio_integrity_prep(bio))
                        goto fail;
        }
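
The smp_mb() added to blk_mq_mark_tag_wait() above pairs with the barrier implied by sbitmap_queue_wake_up(): the waiter must publish itself on the wait queue before re-checking for a free tag, while the waker frees the tag before testing waitqueue_active(). As a rough illustration of why both barriers are needed, here is a small userspace analogue using C11 fences; the names (waiting, tag_free, progressed) and both thread bodies are invented for the sketch and are not kernel code. With both fences in place, the store-buffering outcome in which each thread misses the other's store is forbidden, so either the waiter retries successfully or the waker sees it queued.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int tag_free;    /* stands in for the freed sbitmap tag bit */
static atomic_int waiting;     /* stands in for the hctx dispatch wait entry */
static atomic_int progressed;  /* set when either side notices the other */

static void *waiter(void *arg)
{
        (void)arg;
        /* Tag allocation failed: add ourselves to the wait queue first. */
        atomic_store_explicit(&waiting, 1, memory_order_relaxed);
        /* Full fence, pairing with the one on the waker side; without it the
         * re-check below could run against a stale value and the wakeup
         * could be lost. */
        atomic_thread_fence(memory_order_seq_cst);
        if (atomic_load_explicit(&tag_free, memory_order_relaxed))
                atomic_store(&progressed, 1);   /* retry tag allocation now */
        return NULL;
}

static void *waker(void *arg)
{
        (void)arg;
        /* Free the tag, then look for waiters. */
        atomic_store_explicit(&tag_free, 1, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);
        if (atomic_load_explicit(&waiting, memory_order_relaxed))
                atomic_store(&progressed, 1);   /* wake up the waiter */
        return NULL;
}

int main(void)
{
        pthread_t a, b;

        pthread_create(&a, NULL, waiter, NULL);
        pthread_create(&b, NULL, waker, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        /* With both fences in place at least one side observes the other. */
        printf("progressed = %d\n", atomic_load(&progressed));
        return 0;
}
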
diff --combined drivers/block/loop.c
@@@ -165,39 -165,37 +165,37 @@@ static loff_t get_loop_size(struct loop
        return get_size(lo->lo_offset, lo->lo_sizelimit, file);
  }
  
+ /*
+  * We support direct I/O only if lo_offset is aligned with the logical I/O size
+  * of backing device, and the logical block size of loop is bigger than that of
+  * the backing device.
+  */
+ static bool lo_bdev_can_use_dio(struct loop_device *lo,
+               struct block_device *backing_bdev)
+ {
+       unsigned short sb_bsize = bdev_logical_block_size(backing_bdev);
+       if (queue_logical_block_size(lo->lo_queue) < sb_bsize)
+               return false;
+       if (lo->lo_offset & (sb_bsize - 1))
+               return false;
+       return true;
+ }
  static void __loop_update_dio(struct loop_device *lo, bool dio)
  {
        struct file *file = lo->lo_backing_file;
-       struct address_space *mapping = file->f_mapping;
-       struct inode *inode = mapping->host;
-       unsigned short sb_bsize = 0;
-       unsigned dio_align = 0;
+       struct inode *inode = file->f_mapping->host;
+       struct block_device *backing_bdev = NULL;
        bool use_dio;
  
-       if (inode->i_sb->s_bdev) {
-               sb_bsize = bdev_logical_block_size(inode->i_sb->s_bdev);
-               dio_align = sb_bsize - 1;
-       }
+       if (S_ISBLK(inode->i_mode))
+               backing_bdev = I_BDEV(inode);
+       else if (inode->i_sb->s_bdev)
+               backing_bdev = inode->i_sb->s_bdev;
  
-       /*
-        * We support direct I/O only if lo_offset is aligned with the
-        * logical I/O size of backing device, and the logical block
-        * size of loop is bigger than the backing device's.
-        *
-        * TODO: the above condition may be loosed in the future, and
-        * direct I/O may be switched runtime at that time because most
-        * of requests in sane applications should be PAGE_SIZE aligned
-        */
-       if (dio) {
-               if (queue_logical_block_size(lo->lo_queue) >= sb_bsize &&
-                   !(lo->lo_offset & dio_align) &&
-                   (file->f_mode & FMODE_CAN_ODIRECT))
-                       use_dio = true;
-               else
-                       use_dio = false;
-       } else {
-               use_dio = false;
-       }
+       use_dio = dio && (file->f_mode & FMODE_CAN_ODIRECT) &&
+               (!backing_bdev || lo_bdev_can_use_dio(lo, backing_bdev));
  
        if (lo->use_dio == use_dio)
                return;
@@@ -245,7 -243,9 +243,7 @@@ static int lo_write_bvec(struct file *f
  
        iov_iter_bvec(&i, ITER_SOURCE, bvec, 1, bvec->bv_len);
  
 -      file_start_write(file);
        bw = vfs_iter_write(file, &i, ppos, 0);
 -      file_end_write(file);
  
        if (likely(bw ==  bvec->bv_len))
                return 0;
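
The loop changes above factor the direct I/O rules into lo_bdev_can_use_dio() and also resolve the backing block device when the backing file is itself a block device (S_ISBLK). A minimal userspace sketch of the two rules, with made-up block sizes and offsets; can_use_dio() and its parameters are illustrative names, not the driver's API.

#include <stdbool.h>
#include <stdio.h>

static bool can_use_dio(unsigned int loop_lbs, unsigned int backing_lbs,
                        unsigned long long lo_offset)
{
        /* The loop device's logical block size must not be smaller than the
         * backing device's. */
        if (loop_lbs < backing_lbs)
                return false;
        /* The offset into the backing file must be aligned to the backing
         * device's logical block size (a power of two). */
        if (lo_offset & (backing_lbs - 1))
                return false;
        return true;
}

int main(void)
{
        printf("%d\n", can_use_dio(4096, 512, 0));     /* 1: aligned, bigger lbs */
        printf("%d\n", can_use_dio(512, 4096, 0));     /* 0: loop lbs too small */
        printf("%d\n", can_use_dio(4096, 4096, 512));  /* 0: offset not aligned */
        return 0;
}
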
diff --combined drivers/block/virtio_blk.c
@@@ -367,8 -367,6 +367,6 @@@ static void virtblk_done(struct virtque
                                blk_mq_complete_request(req);
                        req_done = true;
                }
-               if (unlikely(virtqueue_is_broken(vq)))
-                       break;
        } while (!virtqueue_enable_cb(vq));
  
        /* In case queue is stopped waiting for more buffers. */
@@@ -970,12 -968,12 +968,12 @@@ static void virtblk_config_changed(stru
  static int init_vq(struct virtio_blk *vblk)
  {
        int err;
 -      int i;
 +      unsigned short i;
        vq_callback_t **callbacks;
        const char **names;
        struct virtqueue **vqs;
        unsigned short num_vqs;
 -      unsigned int num_poll_vqs;
 +      unsigned short num_poll_vqs;
        struct virtio_device *vdev = vblk->vdev;
        struct irq_affinity desc = { 0, };
  
  
        for (i = 0; i < num_vqs - num_poll_vqs; i++) {
                callbacks[i] = virtblk_done;
 -              snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%d", i);
 +              snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%u", i);
                names[i] = vblk->vqs[i].name;
        }
  
        for (; i < num_vqs; i++) {
                callbacks[i] = NULL;
 -              snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req_poll.%d", i);
 +              snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req_poll.%u", i);
                names[i] = vblk->vqs[i].name;
        }
  
diff --combined drivers/md/md.c
@@@ -82,14 -82,6 +82,14 @@@ static struct module *md_cluster_mod
  
  static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
  static struct workqueue_struct *md_wq;
 +
 +/*
 + * This workqueue is used for sync_work to register new sync_thread, and for
 + * del_work to remove rdev, and for event_work that is only set by dm-raid.
 + *
 + * Note that sync_work will grab reconfig_mutex, hence never flush this
 + * workqueue with reconfig_mutex grabbed.
 + */
  static struct workqueue_struct *md_misc_wq;
  struct workqueue_struct *md_bitmap_wq;
  
@@@ -498,7 -490,7 +498,7 @@@ int mddev_suspend(struct mddev *mddev, 
  }
  EXPORT_SYMBOL_GPL(mddev_suspend);
  
 -void mddev_resume(struct mddev *mddev)
 +static void __mddev_resume(struct mddev *mddev, bool recovery_needed)
  {
        lockdep_assert_not_held(&mddev->reconfig_mutex);
  
        percpu_ref_resurrect(&mddev->active_io);
        wake_up(&mddev->sb_wait);
  
 -      set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 +      if (recovery_needed)
 +              set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
        md_wakeup_thread(mddev->thread);
        md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
  
        mutex_unlock(&mddev->suspend_mutex);
  }
 +
 +void mddev_resume(struct mddev *mddev)
 +{
 +      return __mddev_resume(mddev, true);
 +}
  EXPORT_SYMBOL_GPL(mddev_resume);
  
  /*
@@@ -4874,29 -4860,25 +4874,29 @@@ action_show(struct mddev *mddev, char *
        return sprintf(page, "%s\n", type);
  }
  
 -static void stop_sync_thread(struct mddev *mddev)
 +/**
 + * stop_sync_thread() - wait for sync_thread to stop if it's running.
 + * @mddev:    the array.
 + * @locked:   if set, reconfig_mutex will still be held after this function
 + *            returns; if not set, reconfig_mutex will be released after this
 + *            function returns.
 + * @check_seq:        if set, only wait for the currently running sync_thread to stop,
 + *            noting that a new sync_thread can still start.
 + */
 +static void stop_sync_thread(struct mddev *mddev, bool locked, bool check_seq)
  {
 -      if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
 -              return;
 +      int sync_seq;
  
 -      if (mddev_lock(mddev))
 -              return;
 +      if (check_seq)
 +              sync_seq = atomic_read(&mddev->sync_seq);
  
 -      /*
 -       * Check again in case MD_RECOVERY_RUNNING is cleared before lock is
 -       * held.
 -       */
        if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
 -              mddev_unlock(mddev);
 +              if (!locked)
 +                      mddev_unlock(mddev);
                return;
        }
  
 -      if (work_pending(&mddev->del_work))
 -              flush_workqueue(md_misc_wq);
 +      mddev_unlock(mddev);
  
        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
        /*
         * never happen
         */
        md_wakeup_thread_directly(mddev->sync_thread);
 +      if (work_pending(&mddev->sync_work))
 +              flush_work(&mddev->sync_work);
  
 -      mddev_unlock(mddev);
 +      wait_event(resync_wait,
 +                 !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
 +                 (check_seq && sync_seq != atomic_read(&mddev->sync_seq)));
 +
 +      if (locked)
 +              mddev_lock_nointr(mddev);
  }
  
  static void idle_sync_thread(struct mddev *mddev)
  {
 -      int sync_seq = atomic_read(&mddev->sync_seq);
 -
        mutex_lock(&mddev->sync_mutex);
        clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 -      stop_sync_thread(mddev);
  
 -      wait_event(resync_wait, sync_seq != atomic_read(&mddev->sync_seq) ||
 -                      !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery));
 +      if (mddev_lock(mddev)) {
 +              mutex_unlock(&mddev->sync_mutex);
 +              return;
 +      }
  
 +      stop_sync_thread(mddev, false, true);
        mutex_unlock(&mddev->sync_mutex);
  }
  
@@@ -4933,13 -4908,11 +4933,13 @@@ static void frozen_sync_thread(struct m
  {
        mutex_lock(&mddev->sync_mutex);
        set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 -      stop_sync_thread(mddev);
  
 -      wait_event(resync_wait, mddev->sync_thread == NULL &&
 -                      !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery));
 +      if (mddev_lock(mddev)) {
 +              mutex_unlock(&mddev->sync_mutex);
 +              return;
 +      }
  
 +      stop_sync_thread(mddev, false, false);
        mutex_unlock(&mddev->sync_mutex);
  }
  
@@@ -6311,7 -6284,14 +6311,7 @@@ static void md_clean(struct mddev *mdde
  
  static void __md_stop_writes(struct mddev *mddev)
  {
 -      set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 -      if (work_pending(&mddev->del_work))
 -              flush_workqueue(md_misc_wq);
 -      if (mddev->sync_thread) {
 -              set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 -              md_reap_sync_thread(mddev);
 -      }
 -
 +      stop_sync_thread(mddev, true, false);
        del_timer_sync(&mddev->safemode_timer);
  
        if (mddev->pers && mddev->pers->quiesce) {
@@@ -6358,6 -6338,9 +6358,6 @@@ static void __md_stop(struct mddev *mdd
        struct md_personality *pers = mddev->pers;
        md_bitmap_destroy(mddev);
        mddev_detach(mddev);
 -      /* Ensure ->event_work is done */
 -      if (mddev->event_work.func)
 -              flush_workqueue(md_misc_wq);
        spin_lock(&mddev->lock);
        mddev->pers = NULL;
        spin_unlock(&mddev->lock);
@@@ -6392,16 -6375,25 +6392,16 @@@ static int md_set_readonly(struct mdde
        int err = 0;
        int did_freeze = 0;
  
 +      if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
 +              return -EBUSY;
 +
        if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
                did_freeze = 1;
                set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
                md_wakeup_thread(mddev->thread);
        }
 -      if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
 -              set_bit(MD_RECOVERY_INTR, &mddev->recovery);
  
 -      /*
 -       * Thread might be blocked waiting for metadata update which will now
 -       * never happen
 -       */
 -      md_wakeup_thread_directly(mddev->sync_thread);
 -
 -      if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
 -              return -EBUSY;
 -      mddev_unlock(mddev);
 -      wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING,
 -                                        &mddev->recovery));
 +      stop_sync_thread(mddev, false, false);
        wait_event(mddev->sb_wait,
                   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
        mddev_lock_nointr(mddev);
            mddev->sync_thread ||
            test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
                pr_warn("md: %s still in use.\n",mdname(mddev));
 -              if (did_freeze) {
 -                      clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 -                      set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 -                      md_wakeup_thread(mddev->thread);
 -              }
                err = -EBUSY;
                goto out;
        }
 +
        if (mddev->pers) {
                __md_stop_writes(mddev);
  
 -              err  = -ENXIO;
 -              if (mddev->ro == MD_RDONLY)
 +              if (mddev->ro == MD_RDONLY) {
 +                      err  = -ENXIO;
                        goto out;
 +              }
 +
                mddev->ro = MD_RDONLY;
                set_disk_ro(mddev->gendisk, 1);
 +      }
 +
 +out:
 +      if ((mddev->pers && !err) || did_freeze) {
                clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
                md_wakeup_thread(mddev->thread);
                sysfs_notify_dirent_safe(mddev->sysfs_state);
 -              err = 0;
        }
 -out:
 +
        mutex_unlock(&mddev->open_mutex);
        return err;
  }
@@@ -6455,8 -6446,20 +6455,8 @@@ static int do_md_stop(struct mddev *mdd
                set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
                md_wakeup_thread(mddev->thread);
        }
 -      if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
 -              set_bit(MD_RECOVERY_INTR, &mddev->recovery);
  
 -      /*
 -       * Thread might be blocked waiting for metadata update which will now
 -       * never happen
 -       */
 -      md_wakeup_thread_directly(mddev->sync_thread);
 -
 -      mddev_unlock(mddev);
 -      wait_event(resync_wait, (mddev->sync_thread == NULL &&
 -                               !test_bit(MD_RECOVERY_RUNNING,
 -                                         &mddev->recovery)));
 -      mddev_lock_nointr(mddev);
 +      stop_sync_thread(mddev, true, false);
  
        mutex_lock(&mddev->open_mutex);
        if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
@@@ -8132,6 -8135,19 +8132,19 @@@ static void status_unused(struct seq_fi
        seq_printf(seq, "\n");
  }
  
+ static void status_personalities(struct seq_file *seq)
+ {
+       struct md_personality *pers;
+       seq_puts(seq, "Personalities : ");
+       spin_lock(&pers_lock);
+       list_for_each_entry(pers, &pers_list, list)
+               seq_printf(seq, "[%s] ", pers->name);
+       spin_unlock(&pers_lock);
+       seq_puts(seq, "\n");
+ }
  static int status_resync(struct seq_file *seq, struct mddev *mddev)
  {
        sector_t max_sectors, resync, res;
  static void *md_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(&all_mddevs_lock)
  {
-       struct md_personality *pers;
-       seq_puts(seq, "Personalities : ");
-       spin_lock(&pers_lock);
-       list_for_each_entry(pers, &pers_list, list)
-               seq_printf(seq, "[%s] ", pers->name);
-       spin_unlock(&pers_lock);
-       seq_puts(seq, "\n");
        seq->poll_event = atomic_read(&md_event_count);
        spin_lock(&all_mddevs_lock);
  
-       return seq_list_start(&all_mddevs, *pos);
+       return seq_list_start_head(&all_mddevs, *pos);
  }
  
  static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
  static void md_seq_stop(struct seq_file *seq, void *v)
        __releases(&all_mddevs_lock)
  {
-       status_unused(seq);
        spin_unlock(&all_mddevs_lock);
  }
  
  static int md_seq_show(struct seq_file *seq, void *v)
  {
-       struct mddev *mddev = list_entry(v, struct mddev, all_mddevs);
+       struct mddev *mddev;
        sector_t sectors;
        struct md_rdev *rdev;
  
+       if (v == &all_mddevs) {
+               status_personalities(seq);
+               if (list_empty(&all_mddevs))
+                       status_unused(seq);
+               return 0;
+       }
+       mddev = list_entry(v, struct mddev, all_mddevs);
        if (!mddev_get(mddev))
                return 0;
  
        }
        spin_unlock(&mddev->lock);
        spin_lock(&all_mddevs_lock);
+       if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs))
+               status_unused(seq);
        if (atomic_dec_and_test(&mddev->active))
                __mddev_put(mddev);
  
@@@ -9395,15 -9412,7 +9409,15 @@@ static void md_start_sync(struct work_s
                goto not_running;
        }
  
 -      suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev);
 +      mddev_unlock(mddev);
 +      /*
 +       * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should
 +       * not set it again. Otherwise, we may cause issue like this one:
 +       *     https://bugzilla.kernel.org/show_bug.cgi?id=218200
 +       * Therefore, use __mddev_resume(mddev, false).
 +       */
 +      if (suspend)
 +              __mddev_resume(mddev, false);
        md_wakeup_thread(mddev->sync_thread);
        sysfs_notify_dirent_safe(mddev->sysfs_action);
        md_new_event();
@@@ -9415,15 -9424,7 +9429,15 @@@ not_running
        clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
        clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
        clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
 -      suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev);
 +      mddev_unlock(mddev);
 +      /*
 +       * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should
 +       * not set it again. Otherwise, we may cause issue like this one:
 +       *     https://bugzilla.kernel.org/show_bug.cgi?id=218200
 +       * Therefore, use __mddev_resume(mddev, false).
 +       */
 +      if (suspend)
 +              __mddev_resume(mddev, false);
  
        wake_up(&resync_wait);
        if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
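
The /proc/mdstat changes above restore the expected output by starting the seq_file iteration at the list head itself (seq_list_start_head()), printing the personalities banner when that sentinel is shown, and emitting the unused-devices trailer after the last array rather than from ->stop(), which the seq_file core may invoke more than once per read. A rough userspace analogue of that sentinel pattern; the list, names, and output strings are invented for the sketch.

#include <stdio.h>

struct node {
        const char *name;
        struct node *next;
};

static void show_all(struct node *head)
{
        struct node *v;

        for (v = head; v; v = v->next) {
                if (v == head) {                        /* the sentinel */
                        printf("Personalities : [raid1]\n");
                        if (!head->next)                /* no arrays at all */
                                printf("unused devices: <none>\n");
                        continue;
                }
                printf("%s : active\n", v->name);
                if (!v->next)                           /* last real entry */
                        printf("unused devices: <none>\n");
        }
}

int main(void)
{
        struct node md1 = { "md1", NULL };
        struct node md0 = { "md0", &md1 };
        struct node head = { "head", &md0 };

        show_all(&head);
        return 0;
}
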
diff --combined drivers/nvme/host/core.c
@@@ -132,7 -132,7 +132,7 @@@ void nvme_queue_scan(struct nvme_ctrl *
        /*
         * Only new queue scan work when admin and IO queues are both alive
         */
 -      if (ctrl->state == NVME_CTRL_LIVE && ctrl->tagset)
 +      if (nvme_ctrl_state(ctrl) == NVME_CTRL_LIVE && ctrl->tagset)
                queue_work(nvme_wq, &ctrl->scan_work);
  }
  
   */
  int nvme_try_sched_reset(struct nvme_ctrl *ctrl)
  {
 -      if (ctrl->state != NVME_CTRL_RESETTING)
 +      if (nvme_ctrl_state(ctrl) != NVME_CTRL_RESETTING)
                return -EBUSY;
        if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
                return -EBUSY;
@@@ -157,7 -157,7 +157,7 @@@ static void nvme_failfast_work(struct w
        struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
                        struct nvme_ctrl, failfast_work);
  
 -      if (ctrl->state != NVME_CTRL_CONNECTING)
 +      if (nvme_ctrl_state(ctrl) != NVME_CTRL_CONNECTING)
                return;
  
        set_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
@@@ -201,7 -201,7 +201,7 @@@ int nvme_reset_ctrl_sync(struct nvme_ct
        ret = nvme_reset_ctrl(ctrl);
        if (!ret) {
                flush_work(&ctrl->reset_work);
 -              if (ctrl->state != NVME_CTRL_LIVE)
 +              if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE)
                        ret = -ENETRESET;
        }
  
@@@ -503,7 -503,7 +503,7 @@@ bool nvme_change_ctrl_state(struct nvme
  
        spin_lock_irqsave(&ctrl->lock, flags);
  
 -      old_state = ctrl->state;
 +      old_state = nvme_ctrl_state(ctrl);
        switch (new_state) {
        case NVME_CTRL_LIVE:
                switch (old_state) {
        }
  
        if (changed) {
 -              ctrl->state = new_state;
 +              WRITE_ONCE(ctrl->state, new_state);
                wake_up_all(&ctrl->state_wq);
        }
  
        if (!changed)
                return false;
  
 -      if (ctrl->state == NVME_CTRL_LIVE) {
 +      if (new_state == NVME_CTRL_LIVE) {
                if (old_state == NVME_CTRL_CONNECTING)
                        nvme_stop_failfast_work(ctrl);
                nvme_kick_requeue_lists(ctrl);
 -      } else if (ctrl->state == NVME_CTRL_CONNECTING &&
 +      } else if (new_state == NVME_CTRL_CONNECTING &&
                old_state == NVME_CTRL_RESETTING) {
                nvme_start_failfast_work(ctrl);
        }
@@@ -596,7 -596,7 +596,7 @@@ EXPORT_SYMBOL_GPL(nvme_change_ctrl_stat
   */
  static bool nvme_state_terminal(struct nvme_ctrl *ctrl)
  {
 -      switch (ctrl->state) {
 +      switch (nvme_ctrl_state(ctrl)) {
        case NVME_CTRL_NEW:
        case NVME_CTRL_LIVE:
        case NVME_CTRL_RESETTING:
@@@ -621,7 -621,7 +621,7 @@@ bool nvme_wait_reset(struct nvme_ctrl *
        wait_event(ctrl->state_wq,
                   nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) ||
                   nvme_state_terminal(ctrl));
 -      return ctrl->state == NVME_CTRL_RESETTING;
 +      return nvme_ctrl_state(ctrl) == NVME_CTRL_RESETTING;
  }
  EXPORT_SYMBOL_GPL(nvme_wait_reset);
  
@@@ -708,11 -708,9 +708,11 @@@ EXPORT_SYMBOL_GPL(nvme_init_request)
  blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl,
                struct request *rq)
  {
 -      if (ctrl->state != NVME_CTRL_DELETING_NOIO &&
 -          ctrl->state != NVME_CTRL_DELETING &&
 -          ctrl->state != NVME_CTRL_DEAD &&
 +      enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
 +
 +      if (state != NVME_CTRL_DELETING_NOIO &&
 +          state != NVME_CTRL_DELETING &&
 +          state != NVME_CTRL_DEAD &&
            !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) &&
            !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
                return BLK_STS_RESOURCE;
@@@ -742,7 -740,7 +742,7 @@@ bool __nvme_check_ready(struct nvme_ctr
                 * command, which is require to set the queue live in the
                 * appropinquate states.
                 */
 -              switch (ctrl->state) {
 +              switch (nvme_ctrl_state(ctrl)) {
                case NVME_CTRL_CONNECTING:
                        if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(req->cmd) &&
                            (req->cmd->fabrics.fctype == nvme_fabrics_type_connect ||
@@@ -1202,16 -1200,8 +1202,16 @@@ static unsigned long nvme_keep_alive_wo
  
  static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl)
  {
 -      queue_delayed_work(nvme_wq, &ctrl->ka_work,
 -                         nvme_keep_alive_work_period(ctrl));
 +      unsigned long now = jiffies;
 +      unsigned long delay = nvme_keep_alive_work_period(ctrl);
 +      unsigned long ka_next_check_tm = ctrl->ka_last_check_time + delay;
 +
 +      if (time_after(now, ka_next_check_tm))
 +              delay = 0;
 +      else
 +              delay = ka_next_check_tm - now;
 +
 +      queue_delayed_work(nvme_wq, &ctrl->ka_work, delay);
  }
  
  static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq,
@@@ -1497,8 -1487,7 +1497,8 @@@ static int nvme_ns_info_from_identify(s
        if (id->ncap == 0) {
                /* namespace not allocated or attached */
                info->is_removed = true;
 -              return -ENODEV;
 +              ret = -ENODEV;
 +              goto error;
        }
  
        info->anagrpid = id->anagrpid;
                    !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
                        memcpy(ids->nguid, id->nguid, sizeof(ids->nguid));
        }
 +
 +error:
        kfree(id);
 -      return 0;
 +      return ret;
  }
  
  static int nvme_ns_info_from_id_cs_indep(struct nvme_ctrl *ctrl,
@@@ -1740,13 -1727,13 +1740,13 @@@ static void nvme_config_discard(struct 
                struct nvme_ns_head *head)
  {
        struct request_queue *queue = disk->queue;
-       u32 size = queue_logical_block_size(queue);
+       u32 max_discard_sectors;
  
-       if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(head, UINT_MAX))
-               ctrl->max_discard_sectors =
-                       nvme_lba_to_sect(head, ctrl->dmrsl);
-       if (ctrl->max_discard_sectors == 0) {
+       if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(head, UINT_MAX)) {
+               max_discard_sectors = nvme_lba_to_sect(head, ctrl->dmrsl);
+       } else if (ctrl->oncs & NVME_CTRL_ONCS_DSM) {
+               max_discard_sectors = UINT_MAX;
+       } else {
                blk_queue_max_discard_sectors(queue, 0);
                return;
        }
        BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
                        NVME_DSM_MAX_RANGES);
  
-       queue->limits.discard_granularity = size;
-       /* If discard is already enabled, don't reset queue limits */
+       /*
+        * If discard is already enabled, don't reset queue limits.
+        *
+        * This works around the fact that the block layer can't cope well with
+        * updating the hardware limits when overridden through sysfs.  This is
+        * harmless because discard limits in NVMe are purely advisory.
+        */
        if (queue->limits.max_discard_sectors)
                return;
  
-       blk_queue_max_discard_sectors(queue, ctrl->max_discard_sectors);
-       blk_queue_max_discard_segments(queue, ctrl->max_discard_segments);
+       blk_queue_max_discard_sectors(queue, max_discard_sectors);
+       if (ctrl->dmrl)
+               blk_queue_max_discard_segments(queue, ctrl->dmrl);
+       else
+               blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES);
+       queue->limits.discard_granularity = queue_logical_block_size(queue);
  
        if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
                blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
@@@ -1912,10 -1907,9 +1920,10 @@@ static void nvme_update_disk_info(struc
  
        /*
         * The block layer can't support LBA sizes larger than the page size
 -       * yet, so catch this early and don't allow block I/O.
 +       * or smaller than a sector size yet, so catch this early and don't
 +       * allow block I/O.
         */
 -      if (head->lba_shift > PAGE_SHIFT) {
 +      if (head->lba_shift > PAGE_SHIFT || head->lba_shift < SECTOR_SHIFT) {
                capacity = 0;
                bs = (1 << 9);
        }
@@@ -2052,13 -2046,6 +2060,13 @@@ static int nvme_update_ns_info_block(st
        if (ret)
                return ret;
  
 +      if (id->ncap == 0) {
 +              /* namespace not allocated or attached */
 +              info->is_removed = true;
 +              ret = -ENODEV;
 +              goto error;
 +      }
 +
        blk_mq_freeze_queue(ns->disk->queue);
        lbaf = nvme_lbaf_index(id->flbas);
        ns->head->lba_shift = id->lbaf[lbaf].ds;
@@@ -2121,8 -2108,6 +2129,8 @@@ out
                set_bit(NVME_NS_READY, &ns->flags);
                ret = 0;
        }
 +
 +error:
        kfree(id);
        return ret;
  }
@@@ -2562,7 -2547,7 +2570,7 @@@ static void nvme_set_latency_tolerance(
  
        if (ctrl->ps_max_latency_us != latency) {
                ctrl->ps_max_latency_us = latency;
 -              if (ctrl->state == NVME_CTRL_LIVE)
 +              if (nvme_ctrl_state(ctrl) == NVME_CTRL_LIVE)
                        nvme_configure_apst(ctrl);
        }
  }
@@@ -2930,14 -2915,6 +2938,6 @@@ static int nvme_init_non_mdts_limits(st
        struct nvme_id_ctrl_nvm *id;
        int ret;
  
-       if (ctrl->oncs & NVME_CTRL_ONCS_DSM) {
-               ctrl->max_discard_sectors = UINT_MAX;
-               ctrl->max_discard_segments = NVME_DSM_MAX_RANGES;
-       } else {
-               ctrl->max_discard_sectors = 0;
-               ctrl->max_discard_segments = 0;
-       }
        /*
         * Even though NVMe spec explicitly states that MDTS is not applicable
         * to the write-zeroes, we are cautious and limit the size to the
        if (ret)
                goto free_data;
  
-       if (id->dmrl)
-               ctrl->max_discard_segments = id->dmrl;
+       ctrl->dmrl = id->dmrl;
        ctrl->dmrsl = le32_to_cpu(id->dmrsl);
        if (id->wzsl)
                ctrl->max_zeroes_sectors = nvme_mps_to_sectors(ctrl, id->wzsl);
@@@ -3270,7 -3246,7 +3269,7 @@@ static int nvme_dev_open(struct inode *
        struct nvme_ctrl *ctrl =
                container_of(inode->i_cdev, struct nvme_ctrl, cdev);
  
 -      switch (ctrl->state) {
 +      switch (nvme_ctrl_state(ctrl)) {
        case NVME_CTRL_LIVE:
                break;
        default:
@@@ -3694,14 -3670,6 +3693,14 @@@ static void nvme_alloc_ns(struct nvme_c
                goto out_unlink_ns;
  
        down_write(&ctrl->namespaces_rwsem);
 +      /*
 +       * Ensure that no namespaces are added to the ctrl list after the queues
 +       * are frozen, thereby avoiding a deadlock between scan and reset.
 +       */
 +      if (test_bit(NVME_CTRL_FROZEN, &ctrl->flags)) {
 +              up_write(&ctrl->namespaces_rwsem);
 +              goto out_unlink_ns;
 +      }
        nvme_ns_add_to_ctrl_list(ns);
        up_write(&ctrl->namespaces_rwsem);
        nvme_get_ctrl(ctrl);
@@@ -3966,7 -3934,7 +3965,7 @@@ static void nvme_scan_work(struct work_
        int ret;
  
        /* No tagset on a live ctrl means IO queues could not created */
 -      if (ctrl->state != NVME_CTRL_LIVE || !ctrl->tagset)
 +      if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE || !ctrl->tagset)
                return;
  
        /*
@@@ -4036,7 -4004,7 +4035,7 @@@ void nvme_remove_namespaces(struct nvme
         * removing the namespaces' disks; fail all the queues now to avoid
         * potentially having to clean up the failed sync later.
         */
 -      if (ctrl->state == NVME_CTRL_DEAD)
 +      if (nvme_ctrl_state(ctrl) == NVME_CTRL_DEAD)
                nvme_mark_namespaces_dead(ctrl);
  
        /* this is a no-op when called from the controller reset handler */
@@@ -4118,7 -4086,7 +4117,7 @@@ static void nvme_async_event_work(struc
         * flushing ctrl async_event_work after changing the controller state
         * from LIVE and before freeing the admin queue.
        */
 -      if (ctrl->state == NVME_CTRL_LIVE)
 +      if (nvme_ctrl_state(ctrl) == NVME_CTRL_LIVE)
                ctrl->ops->submit_async_event(ctrl);
  }
  
@@@ -4169,8 -4137,6 +4168,8 @@@ static void nvme_fw_act_work(struct wor
                                struct nvme_ctrl, fw_act_work);
        unsigned long fw_act_timeout;
  
 +      nvme_auth_stop(ctrl);
 +
        if (ctrl->mtfa)
                fw_act_timeout = jiffies +
                                msecs_to_jiffies(ctrl->mtfa * 100);
@@@ -4226,6 -4192,7 +4225,6 @@@ static bool nvme_handle_aen_notice(stru
                 * firmware activation.
                 */
                if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) {
 -                      nvme_auth_stop(ctrl);
                        requeue = false;
                        queue_work(nvme_wq, &ctrl->fw_act_work);
                }
@@@ -4514,7 -4481,7 +4513,7 @@@ int nvme_init_ctrl(struct nvme_ctrl *ct
  {
        int ret;
  
 -      ctrl->state = NVME_CTRL_NEW;
 +      WRITE_ONCE(ctrl->state, NVME_CTRL_NEW);
        clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
        spin_lock_init(&ctrl->lock);
        mutex_init(&ctrl->scan_lock);
        INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work);
        memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
        ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
 +      ctrl->ka_last_check_time = jiffies;
  
        BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) >
                        PAGE_SIZE);
@@@ -4624,7 -4590,6 +4623,7 @@@ void nvme_unfreeze(struct nvme_ctrl *ct
        list_for_each_entry(ns, &ctrl->namespaces, list)
                blk_mq_unfreeze_queue(ns->queue);
        up_read(&ctrl->namespaces_rwsem);
 +      clear_bit(NVME_CTRL_FROZEN, &ctrl->flags);
  }
  EXPORT_SYMBOL_GPL(nvme_unfreeze);
  
@@@ -4658,7 -4623,6 +4657,7 @@@ void nvme_start_freeze(struct nvme_ctr
  {
        struct nvme_ns *ns;
  
 +      set_bit(NVME_CTRL_FROZEN, &ctrl->flags);
        down_read(&ctrl->namespaces_rwsem);
        list_for_each_entry(ns, &ctrl->namespaces, list)
                blk_freeze_queue_start(ns->queue);
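
Among the core.c changes above, nvme_queue_keep_alive_work() now accounts for how long the previous keep-alive actually took: if the next check time has already passed, the work is queued with zero delay instead of waiting another full period. A small sketch of that arithmetic with plain integers standing in for jiffies; ka_delay() and its parameters are illustrative names.

#include <stdio.h>

static unsigned long ka_delay(unsigned long now, unsigned long last_check,
                              unsigned long period)
{
        unsigned long next_check = last_check + period;

        /* The kernel uses time_after() here, which also copes with jiffies
         * wrap-around; a plain comparison is enough for the sketch. */
        if (now > next_check)
                return 0;               /* overdue: run the work right away */
        return next_check - now;        /* otherwise wait out the remainder */
}

int main(void)
{
        printf("%lu\n", ka_delay(1000, 900, 500));  /* 400: within the period */
        printf("%lu\n", ka_delay(2000, 900, 500));  /* 0: already overdue */
        return 0;
}
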
diff --combined drivers/nvme/host/nvme.h
@@@ -157,11 -157,6 +157,11 @@@ enum nvme_quirks 
         * No temperature thresholds for channels other than 0 (Composite).
         */
        NVME_QUIRK_NO_SECONDARY_TEMP_THRESH     = (1 << 19),
 +
 +      /*
 +       * Disables simple suspend/resume path.
 +       */
 +      NVME_QUIRK_FORCE_NO_SIMPLE_SUSPEND      = (1 << 20),
  };
  
  /*
@@@ -257,7 -252,6 +257,7 @@@ enum nvme_ctrl_flags 
        NVME_CTRL_STOPPED               = 3,
        NVME_CTRL_SKIP_ID_CNS_CS        = 4,
        NVME_CTRL_DIRTY_CAPABILITY      = 5,
 +      NVME_CTRL_FROZEN                = 6,
  };
  
  struct nvme_ctrl {
        u32 max_hw_sectors;
        u32 max_segments;
        u32 max_integrity_segments;
-       u32 max_discard_sectors;
-       u32 max_discard_segments;
        u32 max_zeroes_sectors;
  #ifdef CONFIG_BLK_DEV_ZONED
        u32 max_zone_append;
  #endif
        u16 crdt[3];
        u16 oncs;
+       u8 dmrl;
        u32 dmrsl;
        u16 oacs;
        u16 sqsize;
        enum nvme_dctype dctype;
  };
  
 +static inline enum nvme_ctrl_state nvme_ctrl_state(struct nvme_ctrl *ctrl)
 +{
 +      return READ_ONCE(ctrl->state);
 +}
 +
  enum nvme_iopolicy {
        NVME_IOPOLICY_NUMA,
        NVME_IOPOLICY_RR,
@@@ -932,6 -920,10 +931,10 @@@ extern struct device_attribute dev_attr
  extern struct device_attribute dev_attr_ana_state;
  extern struct device_attribute subsys_attr_iopolicy;
  
+ static inline bool nvme_disk_is_ns_head(struct gendisk *disk)
+ {
+       return disk->fops == &nvme_ns_head_ops;
+ }
  #else
  #define multipath false
  static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
@@@ -1009,6 -1001,10 +1012,10 @@@ static inline void nvme_mpath_start_req
  static inline void nvme_mpath_end_request(struct request *rq)
  {
  }
+ static inline bool nvme_disk_is_ns_head(struct gendisk *disk)
+ {
+       return false;
+ }
  #endif /* CONFIG_NVME_MULTIPATH */
  
  int nvme_revalidate_zones(struct nvme_ns *ns);
@@@ -1037,7 -1033,10 +1044,10 @@@ static inline int nvme_update_zone_info
  
  static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev)
  {
-       return dev_to_disk(dev)->private_data;
+       struct gendisk *disk = dev_to_disk(dev);
+       WARN_ON(nvme_disk_is_ns_head(disk));
+       return disk->private_data;
  }
  
  #ifdef CONFIG_NVME_HWMON
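
The nvme.h hunk above adds the nvme_ctrl_state() helper, which snapshots ctrl->state with READ_ONCE() while writers update it under the controller lock with WRITE_ONCE(); callers throughout core.c, pci.c, rdma.c, and tcp.c then compare a single snapshot instead of re-reading the field. A rough userspace analogue of that pattern using C11 relaxed atomics; the struct, enum, and function names are simplified stand-ins.

#include <stdatomic.h>
#include <stdio.h>

enum ctrl_state { CTRL_NEW, CTRL_LIVE, CTRL_RESETTING, CTRL_DELETING };

struct ctrl {
        _Atomic int state;              /* stands in for ctrl->state */
};

static enum ctrl_state ctrl_state(struct ctrl *c)       /* READ_ONCE() */
{
        return atomic_load_explicit(&c->state, memory_order_relaxed);
}

static void ctrl_set_state(struct ctrl *c, enum ctrl_state s) /* WRITE_ONCE() */
{
        atomic_store_explicit(&c->state, s, memory_order_relaxed);
}

int main(void)
{
        struct ctrl c = { CTRL_NEW };
        enum ctrl_state st;

        ctrl_set_state(&c, CTRL_LIVE);
        /* Readers take one snapshot and compare that, instead of re-reading
         * the state for every comparison. */
        st = ctrl_state(&c);
        if (st != CTRL_DELETING && st != CTRL_RESETTING)
                printf("state %d: safe to submit\n", st);
        return 0;
}
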
diff --combined drivers/nvme/host/pci.c
@@@ -1233,7 -1233,7 +1233,7 @@@ static bool nvme_should_reset(struct nv
        bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);
  
        /* If there is a reset/reinit ongoing, we shouldn't reset again. */
 -      switch (dev->ctrl.state) {
 +      switch (nvme_ctrl_state(&dev->ctrl)) {
        case NVME_CTRL_RESETTING:
        case NVME_CTRL_CONNECTING:
                return false;
@@@ -1284,6 -1284,7 +1284,7 @@@ static enum blk_eh_timer_return nvme_ti
        struct request *abort_req;
        struct nvme_command cmd = { };
        u32 csts = readl(dev->bar + NVME_REG_CSTS);
+       u8 opcode;
  
        /* If PCI error recovery process is happening, we cannot reset or
         * the recovery mechanism will surely fail.
  
        if (blk_mq_rq_state(req) != MQ_RQ_IN_FLIGHT) {
                dev_warn(dev->ctrl.device,
-                        "I/O %d QID %d timeout, completion polled\n",
-                        req->tag, nvmeq->qid);
+                        "I/O tag %d (%04x) QID %d timeout, completion polled\n",
+                        req->tag, nvme_cid(req), nvmeq->qid);
                return BLK_EH_DONE;
        }
  
         * cancellation error. All outstanding requests are completed on
         * shutdown, so we return BLK_EH_DONE.
         */
 -      switch (dev->ctrl.state) {
 +      switch (nvme_ctrl_state(&dev->ctrl)) {
        case NVME_CTRL_CONNECTING:
                nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
                fallthrough;
        case NVME_CTRL_DELETING:
                dev_warn_ratelimited(dev->ctrl.device,
-                        "I/O %d QID %d timeout, disable controller\n",
-                        req->tag, nvmeq->qid);
+                        "I/O tag %d (%04x) QID %d timeout, disable controller\n",
+                        req->tag, nvme_cid(req), nvmeq->qid);
                nvme_req(req)->flags |= NVME_REQ_CANCELLED;
                nvme_dev_disable(dev, true);
                return BLK_EH_DONE;
         * command was already aborted once before and still hasn't been
         * returned to the driver, or if this is the admin queue.
         */
+       opcode = nvme_req(req)->cmd->common.opcode;
        if (!nvmeq->qid || iod->aborted) {
                dev_warn(dev->ctrl.device,
-                        "I/O %d QID %d timeout, reset controller\n",
-                        req->tag, nvmeq->qid);
+                        "I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, reset controller\n",
+                        req->tag, nvme_cid(req), opcode,
+                        nvme_opcode_str(nvmeq->qid, opcode, 0), nvmeq->qid);
                nvme_req(req)->flags |= NVME_REQ_CANCELLED;
                goto disable;
        }
        cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
  
        dev_warn(nvmeq->dev->ctrl.device,
-               "I/O %d (%s) QID %d timeout, aborting\n",
-                req->tag,
-                nvme_get_opcode_str(nvme_req(req)->cmd->common.opcode),
-                nvmeq->qid);
+                "I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, aborting req_op:%s(%u) size:%u\n",
+                req->tag, nvme_cid(req), opcode, nvme_get_opcode_str(opcode),
+                nvmeq->qid, blk_op_str(req_op(req)), req_op(req),
+                blk_rq_bytes(req));
  
        abort_req = blk_mq_alloc_request(dev->ctrl.admin_q, nvme_req_op(&cmd),
                                         BLK_MQ_REQ_NOWAIT);
@@@ -1593,7 -1596,7 +1596,7 @@@ static int nvme_setup_io_queues_trylock
        /*
         * Controller is in wrong state, fail early.
         */
 -      if (dev->ctrl.state != NVME_CTRL_CONNECTING) {
 +      if (nvme_ctrl_state(&dev->ctrl) != NVME_CTRL_CONNECTING) {
                mutex_unlock(&dev->shutdown_lock);
                return -ENODEV;
        }
@@@ -2573,13 -2576,13 +2576,13 @@@ static bool nvme_pci_ctrl_is_dead(struc
  
  static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
  {
 +      enum nvme_ctrl_state state = nvme_ctrl_state(&dev->ctrl);
        struct pci_dev *pdev = to_pci_dev(dev->dev);
        bool dead;
  
        mutex_lock(&dev->shutdown_lock);
        dead = nvme_pci_ctrl_is_dead(dev);
 -      if (dev->ctrl.state == NVME_CTRL_LIVE ||
 -          dev->ctrl.state == NVME_CTRL_RESETTING) {
 +      if (state == NVME_CTRL_LIVE || state == NVME_CTRL_RESETTING) {
                if (pci_is_enabled(pdev))
                        nvme_start_freeze(&dev->ctrl);
                /*
@@@ -2690,7 -2693,7 +2693,7 @@@ static void nvme_reset_work(struct work
        bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
        int result;
  
 -      if (dev->ctrl.state != NVME_CTRL_RESETTING) {
 +      if (nvme_ctrl_state(&dev->ctrl) != NVME_CTRL_RESETTING) {
                dev_warn(dev->ctrl.device, "ctrl state %d is not RESETTING\n",
                         dev->ctrl.state);
                result = -ENODEV;
         * controller around but remove all namespaces.
         */
        if (dev->online_queues > 1) {
+               nvme_dbbuf_set(dev);
                nvme_unquiesce_io_queues(&dev->ctrl);
                nvme_wait_freeze(&dev->ctrl);
                nvme_pci_update_nr_queues(dev);
-               nvme_dbbuf_set(dev);
                nvme_unfreeze(&dev->ctrl);
        } else {
                dev_warn(dev->ctrl.device, "IO queues lost\n");
@@@ -2902,18 -2905,6 +2905,18 @@@ static unsigned long check_vendor_combi
                if ((dmi_match(DMI_BOARD_VENDOR, "LENOVO")) &&
                     dmi_match(DMI_BOARD_NAME, "LNVNB161216"))
                        return NVME_QUIRK_SIMPLE_SUSPEND;
 +      } else if (pdev->vendor == 0x2646 && (pdev->device == 0x2263 ||
 +                 pdev->device == 0x500f)) {
 +              /*
 +               * Exclude some Kingston NV1 and A2000 devices from
 +               * NVME_QUIRK_SIMPLE_SUSPEND. Do a full suspend to save a
 +               * lot fo energy with s2idle sleep on some TUXEDO platforms.
 +               * lot of energy with s2idle sleep on some TUXEDO platforms.
 +              if (dmi_match(DMI_BOARD_NAME, "NS5X_NS7XAU") ||
 +                  dmi_match(DMI_BOARD_NAME, "NS5x_7xAU") ||
 +                  dmi_match(DMI_BOARD_NAME, "NS5x_7xPU") ||
 +                  dmi_match(DMI_BOARD_NAME, "PH4PRX1_PH6PRX1"))
 +                      return NVME_QUIRK_FORCE_NO_SIMPLE_SUSPEND;
        }
  
        return 0;
@@@ -2944,9 -2935,7 +2947,9 @@@ static struct nvme_dev *nvme_pci_alloc_
        dev->dev = get_device(&pdev->dev);
  
        quirks |= check_vendor_combination_bug(pdev);
 -      if (!noacpi && acpi_storage_d3(&pdev->dev)) {
 +      if (!noacpi &&
 +          !(quirks & NVME_QUIRK_FORCE_NO_SIMPLE_SUSPEND) &&
 +          acpi_storage_d3(&pdev->dev)) {
                /*
                 * Some systems use a bios work around to ask for D3 on
                 * platforms that support kernel managed suspend.
@@@ -3206,7 -3195,7 +3209,7 @@@ static int nvme_suspend(struct device *
        nvme_wait_freeze(ctrl);
        nvme_sync_queues(ctrl);
  
 -      if (ctrl->state != NVME_CTRL_LIVE)
 +      if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE)
                goto unfreeze;
  
        /*
@@@ -3408,6 -3397,8 +3411,8 @@@ static const struct pci_device_id nvme_
                .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
        { PCI_DEVICE(0x1c5c, 0x174a),   /* SK Hynix P31 SSD */
                .driver_data = NVME_QUIRK_BOGUS_NID, },
+       { PCI_DEVICE(0x1c5c, 0x1D59),   /* SK Hynix BC901 */
+               .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
        { PCI_DEVICE(0x15b7, 0x2001),   /*  Sandisk Skyhawk */
                .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
        { PCI_DEVICE(0x1d97, 0x2263),   /* SPCC */
diff --combined drivers/nvme/host/rdma.c
@@@ -984,11 -984,10 +984,11 @@@ free_ctrl
  
  static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl)
  {
 +      enum nvme_ctrl_state state = nvme_ctrl_state(&ctrl->ctrl);
 +
        /* If we are resetting/deleting then do nothing */
 -      if (ctrl->ctrl.state != NVME_CTRL_CONNECTING) {
 -              WARN_ON_ONCE(ctrl->ctrl.state == NVME_CTRL_NEW ||
 -                      ctrl->ctrl.state == NVME_CTRL_LIVE);
 +      if (state != NVME_CTRL_CONNECTING) {
 +              WARN_ON_ONCE(state == NVME_CTRL_NEW || state == NVME_CTRL_LIVE);
                return;
        }
  
@@@ -1060,10 -1059,8 +1060,10 @@@ static int nvme_rdma_setup_ctrl(struct 
                 * unless we're during creation of a new controller to
                 * avoid races with teardown flow.
                 */
 -              WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING &&
 -                           ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO);
 +              enum nvme_ctrl_state state = nvme_ctrl_state(&ctrl->ctrl);
 +
 +              WARN_ON_ONCE(state != NVME_CTRL_DELETING &&
 +                           state != NVME_CTRL_DELETING_NOIO);
                WARN_ON_ONCE(new);
                ret = -EINVAL;
                goto destroy_io;
@@@ -1132,10 -1129,8 +1132,10 @@@ static void nvme_rdma_error_recovery_wo
  
        if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
                /* state change failure is ok if we started ctrl delete */
 -              WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING &&
 -                           ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO);
 +              enum nvme_ctrl_state state = nvme_ctrl_state(&ctrl->ctrl);
 +
 +              WARN_ON_ONCE(state != NVME_CTRL_DELETING &&
 +                           state != NVME_CTRL_DELETING_NOIO);
                return;
        }
  
@@@ -1167,7 -1162,7 +1167,7 @@@ static void nvme_rdma_wr_error(struct i
        struct nvme_rdma_queue *queue = wc->qp->qp_context;
        struct nvme_rdma_ctrl *ctrl = queue->ctrl;
  
 -      if (ctrl->ctrl.state == NVME_CTRL_LIVE)
 +      if (nvme_ctrl_state(&ctrl->ctrl) == NVME_CTRL_LIVE)
                dev_info(ctrl->ctrl.device,
                             "%s for CQE 0x%p failed with status %s (%d)\n",
                             op, wc->wr_cqe,
@@@ -1946,11 -1941,16 +1946,16 @@@ static enum blk_eh_timer_return nvme_rd
        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
        struct nvme_rdma_queue *queue = req->queue;
        struct nvme_rdma_ctrl *ctrl = queue->ctrl;
-       dev_warn(ctrl->ctrl.device, "I/O %d QID %d timeout\n",
-                rq->tag, nvme_rdma_queue_idx(queue));
+       u8 opcode = req->req.cmd->common.opcode;
+       u8 fctype = req->req.cmd->fabrics.fctype;
+       int qid = nvme_rdma_queue_idx(queue);
+       dev_warn(ctrl->ctrl.device,
+                "I/O tag %d (%04x) opcode %#x (%s) QID %d timeout\n",
+                rq->tag, nvme_cid(rq), opcode,
+                nvme_opcode_str(qid, opcode, fctype), qid);
  
 -      if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
 +      if (nvme_ctrl_state(&ctrl->ctrl) != NVME_CTRL_LIVE) {
                /*
                 * If we are resetting, connecting or deleting we should
                 * complete immediately because we may block controller
diff --combined drivers/nvme/host/tcp.c
@@@ -1922,14 -1922,13 +1922,13 @@@ static int nvme_tcp_alloc_admin_queue(s
                                                      ctrl->opts->subsysnqn);
                if (!pskid) {
                        dev_err(ctrl->device, "no valid PSK found\n");
-                       ret = -ENOKEY;
-                       goto out_free_queue;
+                       return -ENOKEY;
                }
        }
  
        ret = nvme_tcp_alloc_queue(ctrl, 0, pskid);
        if (ret)
-               goto out_free_queue;
+               return ret;
  
        ret = nvme_tcp_alloc_async_req(to_tcp_ctrl(ctrl));
        if (ret)
@@@ -2152,11 -2151,10 +2151,11 @@@ static void nvme_tcp_teardown_io_queues
  
  static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
  {
 +      enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
 +
        /* If we are resetting/deleting then do nothing */
 -      if (ctrl->state != NVME_CTRL_CONNECTING) {
 -              WARN_ON_ONCE(ctrl->state == NVME_CTRL_NEW ||
 -                      ctrl->state == NVME_CTRL_LIVE);
 +      if (state != NVME_CTRL_CONNECTING) {
 +              WARN_ON_ONCE(state == NVME_CTRL_NEW || state == NVME_CTRL_LIVE);
                return;
        }
  
@@@ -2216,10 -2214,8 +2215,10 @@@ static int nvme_tcp_setup_ctrl(struct n
                 * unless we're during creation of a new controller to
                 * avoid races with teardown flow.
                 */
 -              WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
 -                           ctrl->state != NVME_CTRL_DELETING_NOIO);
 +              enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
 +
 +              WARN_ON_ONCE(state != NVME_CTRL_DELETING &&
 +                           state != NVME_CTRL_DELETING_NOIO);
                WARN_ON_ONCE(new);
                ret = -EINVAL;
                goto destroy_io;
@@@ -2283,10 -2279,8 +2282,10 @@@ static void nvme_tcp_error_recovery_wor
  
        if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
                /* state change failure is ok if we started ctrl delete */
 -              WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
 -                           ctrl->state != NVME_CTRL_DELETING_NOIO);
 +              enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
 +
 +              WARN_ON_ONCE(state != NVME_CTRL_DELETING &&
 +                           state != NVME_CTRL_DELETING_NOIO);
                return;
        }
  
@@@ -2316,10 -2310,8 +2315,10 @@@ static void nvme_reset_ctrl_work(struc
  
        if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
                /* state change failure is ok if we started ctrl delete */
 -              WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
 -                           ctrl->state != NVME_CTRL_DELETING_NOIO);
 +              enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
 +
 +              WARN_ON_ONCE(state != NVME_CTRL_DELETING &&
 +                           state != NVME_CTRL_DELETING_NOIO);
                return;
        }
  
@@@ -2433,11 -2425,11 +2432,11 @@@ static enum blk_eh_timer_return nvme_tc
        int qid = nvme_tcp_queue_id(req->queue);
  
        dev_warn(ctrl->device,
-               "queue %d: timeout cid %#x type %d opcode %#x (%s)\n",
-               nvme_tcp_queue_id(req->queue), nvme_cid(rq), pdu->hdr.type,
-               opc, nvme_opcode_str(qid, opc, fctype));
+                "I/O tag %d (%04x) type %d opcode %#x (%s) QID %d timeout\n",
+                rq->tag, nvme_cid(rq), pdu->hdr.type, opc,
+                nvme_opcode_str(qid, opc, fctype), qid);
  
 -      if (ctrl->state != NVME_CTRL_LIVE) {
 +      if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE) {
                /*
                 * If we are resetting, connecting or deleting we should
                 * complete immediately because we may block controller