Merge tag 'for-6.8/block-2024-01-18' of git://git.kernel.dk/linux
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 19 Jan 2024 02:22:40 +0000 (18:22 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 19 Jan 2024 02:22:40 +0000 (18:22 -0800)
Pull block fixes from Jens Axboe:

 - NVMe pull request via Keith:
      - tcp, fc, and rdma target fixes (Maurizio, Daniel, Hannes,
        Christoph)
      - discard fixes and improvements (Christoph)
      - timeout debug improvements (Keith, Max)
      - various cleanups (Daniel, Max, Guixin)
      - trace event string fixes (Arnd)
      - shadow doorbell setup on reset fix (William)
      - a write zeroes quirk for SK Hynix (Jim)

 - MD pull request via Song:
      - Sparse warning since v6.0 (Bart)
      - /proc/mdstat regression since v6.7 (Yu Kuai)

 - Use symbolic error value (Christian)

 - IO Priority documentation update (Christian)

 - Fix for accessing queue limits without having entered the queue
   (Christoph, me)

 - Fix for loop dio support (Christoph)

 - Move null_blk off deprecated ida interface (Christophe)

 - Ensure nbd initializes full msghdr (Eric)

 - Fix for a regression with the folio conversion, which is now easier
   to hit because of an unrelated change (Matthew)

 - Remove redundant check in virtio-blk (Li)

 - Fix for a potential hang in sbitmap (Ming)

 - Fix for partial zone appending (Damien)

 - Misc changes and fixes (Bart, me, Kemeng, Dmitry)

* tag 'for-6.8/block-2024-01-18' of git://git.kernel.dk/linux: (45 commits)
  Documentation: block: ioprio: Update schedulers
  loop: fix the direct I/O support check when used on top of block devices
  blk-mq: Remove the hctx 'run' debugfs attribute
  nbd: always initialize struct msghdr completely
  block: Fix iterating over an empty bio with bio_for_each_folio_all
  block: bio-integrity: fix kcalloc() arguments order
  virtio_blk: remove duplicate check if queue is broken in virtblk_done
  sbitmap: remove stale comment in sbq_calc_wake_batch
  block: Correct a documentation comment in blk-cgroup.c
  null_blk: Remove usage of the deprecated ida_simple_xx() API
  block: ensure we hold a queue reference when using queue limits
  blk-mq: rename blk_mq_can_use_cached_rq
  block: print symbolic error name instead of error code
  blk-mq: fix IO hang from sbitmap wakeup race
  nvmet-rdma: avoid circular locking dependency on install_queue()
  nvmet-tcp: avoid circular locking dependency on install_queue()
  nvme-pci: set doorbell config before unquiescing
  block: fix partial zone append completion handling in req_bio_endio()
  block/iocost: silence warning on 'last_period' potentially being unused
  md/raid1: Use blk_opf_t for read and write operations
  ...

block/blk-mq.c
drivers/block/loop.c
drivers/block/virtio_blk.c
drivers/md/md.c
drivers/nvme/host/core.c
drivers/nvme/host/nvme.h
drivers/nvme/host/pci.c
drivers/nvme/host/rdma.c
drivers/nvme/host/tcp.c

diff --combined block/blk-mq.c
@@@ -772,11 -772,16 +772,16 @@@ static void req_bio_endio(struct reques
                /*
                 * Partial zone append completions cannot be supported as the
                 * BIO fragments may end up not being written sequentially.
+                * For such case, force the completed nbytes to be equal to
+                * the BIO size so that bio_advance() sets the BIO remaining
+                * size to 0 and we end up calling bio_endio() before returning.
                 */
-               if (bio->bi_iter.bi_size != nbytes)
+               if (bio->bi_iter.bi_size != nbytes) {
                        bio->bi_status = BLK_STS_IOERR;
-               else
+                       nbytes = bio->bi_iter.bi_size;
+               } else {
                        bio->bi_iter.bi_sector = rq->__sector;
+               }
        }
  
        bio_advance(bio, nbytes);
@@@ -1513,26 -1518,14 +1518,26 @@@ void blk_mq_delay_kick_requeue_list(str
  }
  EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
  
 +static bool blk_is_flush_data_rq(struct request *rq)
 +{
 +      return (rq->rq_flags & RQF_FLUSH_SEQ) && !is_flush_rq(rq);
 +}
 +
  static bool blk_mq_rq_inflight(struct request *rq, void *priv)
  {
        /*
         * If we find a request that isn't idle we know the queue is busy
         * as it's checked in the iter.
         * Return false to stop the iteration.
 +       *
 +       * In case of queue quiesce, if one flush data request is completed,
 +       * don't count it as inflight given the flush sequence is suspended,
 +       * and the original flush data request is invisible to driver, just
 +       * like other pending requests because of quiesce
         */
 -      if (blk_mq_request_started(rq)) {
 +      if (blk_mq_request_started(rq) && !(blk_queue_quiesced(rq->q) &&
 +                              blk_is_flush_data_rq(rq) &&
 +                              blk_mq_request_completed(rq))) {
                bool *busy = priv;
  
                *busy = true;
@@@ -1859,6 -1852,22 +1864,22 @@@ static bool blk_mq_mark_tag_wait(struc
        wait->flags &= ~WQ_FLAG_EXCLUSIVE;
        __add_wait_queue(wq, wait);
  
+       /*
+        * Add one explicit barrier since blk_mq_get_driver_tag() may
+        * not imply barrier in case of failure.
+        *
+        * Order adding us to wait queue and allocating driver tag.
+        *
+        * The pair is the one implied in sbitmap_queue_wake_up() which
+        * orders clearing sbitmap tag bits and waitqueue_active() in
+        * __sbitmap_queue_wake_up(), since waitqueue_active() is lockless
+        *
+        * Otherwise, re-order of adding wait queue and getting driver tag
+        * may cause __sbitmap_queue_wake_up() to wake up nothing because
+        * the waitqueue_active() may not observe us in wait queue.
+        */
+       smp_mb();
        /*
         * It's possible that a tag was freed in the window between the
         * allocation failure and adding the hardware queue to the wait
@@@ -2891,8 -2900,11 +2912,11 @@@ static struct request *blk_mq_get_new_r
        return NULL;
  }
  
- /* return true if this @rq can be used for @bio */
- static bool blk_mq_can_use_cached_rq(struct request *rq, struct blk_plug *plug,
+ /*
+  * Check if we can use the passed on request for submitting the passed in bio,
+  * and remove it from the request list if it can be used.
+  */
+ static bool blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
                struct bio *bio)
  {
        enum hctx_type type = blk_mq_get_hctx_type(bio->bi_opf);
@@@ -2952,12 -2964,6 +2976,6 @@@ void blk_mq_submit_bio(struct bio *bio
        blk_status_t ret;
  
        bio = blk_queue_bounce(bio, q);
-       if (bio_may_exceed_limits(bio, &q->limits)) {
-               bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
-               if (!bio)
-                       return;
-       }
        bio_set_ioprio(bio);
  
        if (plug) {
                        rq = NULL;
        }
        if (rq) {
+               if (unlikely(bio_may_exceed_limits(bio, &q->limits))) {
+                       bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
+                       if (!bio)
+                               return;
+               }
                if (!bio_integrity_prep(bio))
                        return;
                if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
                        return;
-               if (blk_mq_can_use_cached_rq(rq, plug, bio))
+               if (blk_mq_use_cached_rq(rq, plug, bio))
                        goto done;
                percpu_ref_get(&q->q_usage_counter);
        } else {
                if (unlikely(bio_queue_enter(bio)))
                        return;
+               if (unlikely(bio_may_exceed_limits(bio, &q->limits))) {
+                       bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
+                       if (!bio)
+                               goto fail;
+               }
                if (!bio_integrity_prep(bio))
                        goto fail;
        }
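
The smp_mb() added to blk_mq_mark_tag_wait() above pairs with the barrier implied by sbitmap_queue_wake_up(): the waiter must publish itself on the wait queue before re-checking for a free tag, while the waker frees the tag before testing waitqueue_active(). As a rough illustration of why both barriers are needed, here is a small userspace analogue using C11 fences; the names (waiting, tag_free, progressed) and both thread bodies are invented for the sketch and are not kernel code. With both fences in place, the store-buffering outcome in which each thread misses the other's store is forbidden, so either the waiter retries successfully or the waker sees it queued.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int tag_free;    /* stands in for the freed sbitmap tag bit */
static atomic_int waiting;     /* stands in for the hctx dispatch wait entry */
static atomic_int progressed;  /* set when either side notices the other */

static void *waiter(void *arg)
{
        (void)arg;
        /* Tag allocation failed: add ourselves to the wait queue first. */
        atomic_store_explicit(&waiting, 1, memory_order_relaxed);
        /* Full fence, pairing with the one on the waker side; without it the
         * re-check below could run against a stale value and the wakeup
         * could be lost. */
        atomic_thread_fence(memory_order_seq_cst);
        if (atomic_load_explicit(&tag_free, memory_order_relaxed))
                atomic_store(&progressed, 1);   /* retry tag allocation now */
        return NULL;
}

static void *waker(void *arg)
{
        (void)arg;
        /* Free the tag, then look for waiters. */
        atomic_store_explicit(&tag_free, 1, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);
        if (atomic_load_explicit(&waiting, memory_order_relaxed))
                atomic_store(&progressed, 1);   /* wake up the waiter */
        return NULL;
}

int main(void)
{
        pthread_t a, b;

        pthread_create(&a, NULL, waiter, NULL);
        pthread_create(&b, NULL, waker, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        /* With both fences in place at least one side observes the other. */
        printf("progressed = %d\n", atomic_load(&progressed));
        return 0;
}
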
diff --combined drivers/block/loop.c
@@@ -165,39 -165,37 +165,37 @@@ static loff_t get_loop_size(struct loop
        return get_size(lo->lo_offset, lo->lo_sizelimit, file);
  }
  
+ /*
+  * We support direct I/O only if lo_offset is aligned with the logical I/O size
+  * of backing device, and the logical block size of loop is bigger than that of
+  * the backing device.
+  */
+ static bool lo_bdev_can_use_dio(struct loop_device *lo,
+               struct block_device *backing_bdev)
+ {
+       unsigned short sb_bsize = bdev_logical_block_size(backing_bdev);
+       if (queue_logical_block_size(lo->lo_queue) < sb_bsize)
+               return false;
+       if (lo->lo_offset & (sb_bsize - 1))
+               return false;
+       return true;
+ }
  static void __loop_update_dio(struct loop_device *lo, bool dio)
  {
        struct file *file = lo->lo_backing_file;
-       struct address_space *mapping = file->f_mapping;
-       struct inode *inode = mapping->host;
-       unsigned short sb_bsize = 0;
-       unsigned dio_align = 0;
+       struct inode *inode = file->f_mapping->host;
+       struct block_device *backing_bdev = NULL;
        bool use_dio;
  
-       if (inode->i_sb->s_bdev) {
-               sb_bsize = bdev_logical_block_size(inode->i_sb->s_bdev);
-               dio_align = sb_bsize - 1;
-       }
+       if (S_ISBLK(inode->i_mode))
+               backing_bdev = I_BDEV(inode);
+       else if (inode->i_sb->s_bdev)
+               backing_bdev = inode->i_sb->s_bdev;
  
-       /*
-        * We support direct I/O only if lo_offset is aligned with the
-        * logical I/O size of backing device, and the logical block
-        * size of loop is bigger than the backing device's.
-        *
-        * TODO: the above condition may be loosed in the future, and
-        * direct I/O may be switched runtime at that time because most
-        * of requests in sane applications should be PAGE_SIZE aligned
-        */
-       if (dio) {
-               if (queue_logical_block_size(lo->lo_queue) >= sb_bsize &&
-                   !(lo->lo_offset & dio_align) &&
-                   (file->f_mode & FMODE_CAN_ODIRECT))
-                       use_dio = true;
-               else
-                       use_dio = false;
-       } else {
-               use_dio = false;
-       }
+       use_dio = dio && (file->f_mode & FMODE_CAN_ODIRECT) &&
+               (!backing_bdev || lo_bdev_can_use_dio(lo, backing_bdev));
  
        if (lo->use_dio == use_dio)
                return;
@@@ -245,7 -243,9 +243,7 @@@ static int lo_write_bvec(struct file *f
  
        iov_iter_bvec(&i, ITER_SOURCE, bvec, 1, bvec->bv_len);
  
 -      file_start_write(file);
        bw = vfs_iter_write(file, &i, ppos, 0);
 -      file_end_write(file);
  
        if (likely(bw ==  bvec->bv_len))
                return 0;
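
The loop changes above factor the direct I/O rules into lo_bdev_can_use_dio() and also resolve the backing block device when the backing file is itself a block device (S_ISBLK). A minimal userspace sketch of the two rules, with made-up block sizes and offsets; can_use_dio() and its parameters are illustrative names, not the driver's API.

#include <stdbool.h>
#include <stdio.h>

static bool can_use_dio(unsigned int loop_lbs, unsigned int backing_lbs,
                        unsigned long long lo_offset)
{
        /* The loop device's logical block size must not be smaller than the
         * backing device's. */
        if (loop_lbs < backing_lbs)
                return false;
        /* The offset into the backing file must be aligned to the backing
         * device's logical block size (a power of two). */
        if (lo_offset & (backing_lbs - 1))
                return false;
        return true;
}

int main(void)
{
        printf("%d\n", can_use_dio(4096, 512, 0));     /* 1: aligned, bigger lbs */
        printf("%d\n", can_use_dio(512, 4096, 0));     /* 0: loop lbs too small */
        printf("%d\n", can_use_dio(4096, 4096, 512));  /* 0: offset not aligned */
        return 0;
}
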
diff --combined drivers/block/virtio_blk.c
@@@ -367,8 -367,6 +367,6 @@@ static void virtblk_done(struct virtque
                                blk_mq_complete_request(req);
                        req_done = true;
                }
-               if (unlikely(virtqueue_is_broken(vq)))
-                       break;
        } while (!virtqueue_enable_cb(vq));
  
        /* In case queue is stopped waiting for more buffers. */
@@@ -970,12 -968,12 +968,12 @@@ static void virtblk_config_changed(stru
  static int init_vq(struct virtio_blk *vblk)
  {
        int err;
 -      int i;
 +      unsigned short i;
        vq_callback_t **callbacks;
        const char **names;
        struct virtqueue **vqs;
        unsigned short num_vqs;
 -      unsigned int num_poll_vqs;
 +      unsigned short num_poll_vqs;
        struct virtio_device *vdev = vblk->vdev;
        struct irq_affinity desc = { 0, };
  
  
        for (i = 0; i < num_vqs - num_poll_vqs; i++) {
                callbacks[i] = virtblk_done;
 -              snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%d", i);
 +              snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%u", i);
                names[i] = vblk->vqs[i].name;
        }
  
        for (; i < num_vqs; i++) {
                callbacks[i] = NULL;
 -              snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req_poll.%d", i);
 +              snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req_poll.%u", i);
                names[i] = vblk->vqs[i].name;
        }
  
diff --combined drivers/md/md.c
@@@ -82,14 -82,6 +82,14 @@@ static struct module *md_cluster_mod
  
  static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
  static struct workqueue_struct *md_wq;
 +
 +/*
 + * This workqueue is used for sync_work to register new sync_thread, and for
 + * del_work to remove rdev, and for event_work that is only set by dm-raid.
 + *
 + * Note that sync_work will grab reconfig_mutex, hence never flush this
 + * workqueue with reconfig_mutex grabbed.
 + */
  static struct workqueue_struct *md_misc_wq;
  struct workqueue_struct *md_bitmap_wq;
  
@@@ -498,7 -490,7 +498,7 @@@ int mddev_suspend(struct mddev *mddev, 
  }
  EXPORT_SYMBOL_GPL(mddev_suspend);
  
 -void mddev_resume(struct mddev *mddev)
 +static void __mddev_resume(struct mddev *mddev, bool recovery_needed)
  {
        lockdep_assert_not_held(&mddev->reconfig_mutex);
  
        percpu_ref_resurrect(&mddev->active_io);
        wake_up(&mddev->sb_wait);
  
 -      set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 +      if (recovery_needed)
 +              set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
        md_wakeup_thread(mddev->thread);
        md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
  
        mutex_unlock(&mddev->suspend_mutex);
  }
 +
 +void mddev_resume(struct mddev *mddev)
 +{
 +      return __mddev_resume(mddev, true);
 +}
  EXPORT_SYMBOL_GPL(mddev_resume);
  
  /*
@@@ -4874,29 -4860,25 +4874,29 @@@ action_show(struct mddev *mddev, char *
        return sprintf(page, "%s\n", type);
  }
  
 -static void stop_sync_thread(struct mddev *mddev)
 +/**
 + * stop_sync_thread() - wait for sync_thread to stop if it's running.
 + * @mddev:    the array.
 + * @locked:   if set, reconfig_mutex will still be held after this function
 + *            returns; if not set, reconfig_mutex will be released after this
 + *            function returns.
 + * @check_seq:        if set, only wait for the currently running sync_thread to stop,
 + *            noting that a new sync_thread can still start.
 + */
 +static void stop_sync_thread(struct mddev *mddev, bool locked, bool check_seq)
  {
 -      if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
 -              return;
 +      int sync_seq;
  
 -      if (mddev_lock(mddev))
 -              return;
 +      if (check_seq)
 +              sync_seq = atomic_read(&mddev->sync_seq);
  
 -      /*
 -       * Check again in case MD_RECOVERY_RUNNING is cleared before lock is
 -       * held.
 -       */
        if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
 -              mddev_unlock(mddev);
 +              if (!locked)
 +                      mddev_unlock(mddev);
                return;
        }
  
 -      if (work_pending(&mddev->del_work))
 -              flush_workqueue(md_misc_wq);
 +      mddev_unlock(mddev);
  
        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
        /*
         * never happen
         */
        md_wakeup_thread_directly(mddev->sync_thread);
 +      if (work_pending(&mddev->sync_work))
 +              flush_work(&mddev->sync_work);
  
 -      mddev_unlock(mddev);
 +      wait_event(resync_wait,
 +                 !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
 +                 (check_seq && sync_seq != atomic_read(&mddev->sync_seq)));
 +
 +      if (locked)
 +              mddev_lock_nointr(mddev);
  }
  
  static void idle_sync_thread(struct mddev *mddev)
  {
 -      int sync_seq = atomic_read(&mddev->sync_seq);
 -
        mutex_lock(&mddev->sync_mutex);
        clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 -      stop_sync_thread(mddev);
  
 -      wait_event(resync_wait, sync_seq != atomic_read(&mddev->sync_seq) ||
 -                      !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery));
 +      if (mddev_lock(mddev)) {
 +              mutex_unlock(&mddev->sync_mutex);
 +              return;
 +      }
  
 +      stop_sync_thread(mddev, false, true);
        mutex_unlock(&mddev->sync_mutex);
  }
  
@@@ -4933,13 -4908,11 +4933,13 @@@ static void frozen_sync_thread(struct m
  {
        mutex_lock(&mddev->sync_mutex);
        set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 -      stop_sync_thread(mddev);
  
 -      wait_event(resync_wait, mddev->sync_thread == NULL &&
 -                      !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery));
 +      if (mddev_lock(mddev)) {
 +              mutex_unlock(&mddev->sync_mutex);
 +              return;
 +      }
  
 +      stop_sync_thread(mddev, false, false);
        mutex_unlock(&mddev->sync_mutex);
  }
  
@@@ -6311,7 -6284,14 +6311,7 @@@ static void md_clean(struct mddev *mdde
  
  static void __md_stop_writes(struct mddev *mddev)
  {
 -      set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 -      if (work_pending(&mddev->del_work))
 -              flush_workqueue(md_misc_wq);
 -      if (mddev->sync_thread) {
 -              set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 -              md_reap_sync_thread(mddev);
 -      }
 -
 +      stop_sync_thread(mddev, true, false);
        del_timer_sync(&mddev->safemode_timer);
  
        if (mddev->pers && mddev->pers->quiesce) {
@@@ -6358,6 -6338,9 +6358,6 @@@ static void __md_stop(struct mddev *mdd
        struct md_personality *pers = mddev->pers;
        md_bitmap_destroy(mddev);
        mddev_detach(mddev);
 -      /* Ensure ->event_work is done */
 -      if (mddev->event_work.func)
 -              flush_workqueue(md_misc_wq);
        spin_lock(&mddev->lock);
        mddev->pers = NULL;
        spin_unlock(&mddev->lock);
@@@ -6392,16 -6375,25 +6392,16 @@@ static int md_set_readonly(struct mdde
        int err = 0;
        int did_freeze = 0;
  
 +      if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
 +              return -EBUSY;
 +
        if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
                did_freeze = 1;
                set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
                md_wakeup_thread(mddev->thread);
        }
 -      if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
 -              set_bit(MD_RECOVERY_INTR, &mddev->recovery);
  
 -      /*
 -       * Thread might be blocked waiting for metadata update which will now
 -       * never happen
 -       */
 -      md_wakeup_thread_directly(mddev->sync_thread);
 -
 -      if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
 -              return -EBUSY;
 -      mddev_unlock(mddev);
 -      wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING,
 -                                        &mddev->recovery));
 +      stop_sync_thread(mddev, false, false);
        wait_event(mddev->sb_wait,
                   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
        mddev_lock_nointr(mddev);
            mddev->sync_thread ||
            test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
                pr_warn("md: %s still in use.\n",mdname(mddev));
 -              if (did_freeze) {
 -                      clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 -                      set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 -                      md_wakeup_thread(mddev->thread);
 -              }
                err = -EBUSY;
                goto out;
        }
 +
        if (mddev->pers) {
                __md_stop_writes(mddev);
  
 -              err  = -ENXIO;
 -              if (mddev->ro == MD_RDONLY)
 +              if (mddev->ro == MD_RDONLY) {
 +                      err  = -ENXIO;
                        goto out;
 +              }
 +
                mddev->ro = MD_RDONLY;
                set_disk_ro(mddev->gendisk, 1);
 +      }
 +
 +out:
 +      if ((mddev->pers && !err) || did_freeze) {
                clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
                md_wakeup_thread(mddev->thread);
                sysfs_notify_dirent_safe(mddev->sysfs_state);
 -              err = 0;
        }
 -out:
 +
        mutex_unlock(&mddev->open_mutex);
        return err;
  }
@@@ -6455,8 -6446,20 +6455,8 @@@ static int do_md_stop(struct mddev *mdd
                set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
                md_wakeup_thread(mddev->thread);
        }
 -      if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
 -              set_bit(MD_RECOVERY_INTR, &mddev->recovery);
  
 -      /*
 -       * Thread might be blocked waiting for metadata update which will now
 -       * never happen
 -       */
 -      md_wakeup_thread_directly(mddev->sync_thread);
 -
 -      mddev_unlock(mddev);
 -      wait_event(resync_wait, (mddev->sync_thread == NULL &&
 -                               !test_bit(MD_RECOVERY_RUNNING,
 -                                         &mddev->recovery)));
 -      mddev_lock_nointr(mddev);
 +      stop_sync_thread(mddev, true, false);
  
        mutex_lock(&mddev->open_mutex);
        if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
@@@ -8132,6 -8135,19 +8132,19 @@@ static void status_unused(struct seq_fi
        seq_printf(seq, "\n");
  }
  
+ static void status_personalities(struct seq_file *seq)
+ {
+       struct md_personality *pers;
+       seq_puts(seq, "Personalities : ");
+       spin_lock(&pers_lock);
+       list_for_each_entry(pers, &pers_list, list)
+               seq_printf(seq, "[%s] ", pers->name);
+       spin_unlock(&pers_lock);
+       seq_puts(seq, "\n");
+ }
  static int status_resync(struct seq_file *seq, struct mddev *mddev)
  {
        sector_t max_sectors, resync, res;
  static void *md_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(&all_mddevs_lock)
  {
-       struct md_personality *pers;
-       seq_puts(seq, "Personalities : ");
-       spin_lock(&pers_lock);
-       list_for_each_entry(pers, &pers_list, list)
-               seq_printf(seq, "[%s] ", pers->name);
-       spin_unlock(&pers_lock);
-       seq_puts(seq, "\n");
        seq->poll_event = atomic_read(&md_event_count);
        spin_lock(&all_mddevs_lock);
  
-       return seq_list_start(&all_mddevs, *pos);
+       return seq_list_start_head(&all_mddevs, *pos);
  }
  
  static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
  static void md_seq_stop(struct seq_file *seq, void *v)
        __releases(&all_mddevs_lock)
  {
-       status_unused(seq);
        spin_unlock(&all_mddevs_lock);
  }
  
  static int md_seq_show(struct seq_file *seq, void *v)
  {
-       struct mddev *mddev = list_entry(v, struct mddev, all_mddevs);
+       struct mddev *mddev;
        sector_t sectors;
        struct md_rdev *rdev;
  
+       if (v == &all_mddevs) {
+               status_personalities(seq);
+               if (list_empty(&all_mddevs))
+                       status_unused(seq);
+               return 0;
+       }
+       mddev = list_entry(v, struct mddev, all_mddevs);
        if (!mddev_get(mddev))
                return 0;
  
        }
        spin_unlock(&mddev->lock);
        spin_lock(&all_mddevs_lock);
+       if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs))
+               status_unused(seq);
        if (atomic_dec_and_test(&mddev->active))
                __mddev_put(mddev);
  
@@@ -9395,15 -9412,7 +9409,15 @@@ static void md_start_sync(struct work_s
                goto not_running;
        }
  
 -      suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev);
 +      mddev_unlock(mddev);
 +      /*
 +       * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should
 +       * not set it again. Otherwise, we may cause issue like this one:
 +       *     https://bugzilla.kernel.org/show_bug.cgi?id=218200
 +       * Therefore, use __mddev_resume(mddev, false).
 +       */
 +      if (suspend)
 +              __mddev_resume(mddev, false);
        md_wakeup_thread(mddev->sync_thread);
        sysfs_notify_dirent_safe(mddev->sysfs_action);
        md_new_event();
@@@ -9415,15 -9424,7 +9429,15 @@@ not_running
        clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
        clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
        clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
 -      suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev);
 +      mddev_unlock(mddev);
 +      /*
 +       * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should
 +       * not set it again. Otherwise, we may cause issue like this one:
 +       *     https://bugzilla.kernel.org/show_bug.cgi?id=218200
 +       * Therefore, use __mddev_resume(mddev, false).
 +       */
 +      if (suspend)
 +              __mddev_resume(mddev, false);
  
        wake_up(&resync_wait);
        if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
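
The /proc/mdstat changes above restore the expected output by starting the seq_file iteration at the list head itself (seq_list_start_head()), printing the personalities banner when that sentinel is shown, and emitting the unused-devices trailer after the last array rather than from ->stop(), which the seq_file core may invoke more than once per read. A rough userspace analogue of that sentinel pattern; the list, names, and output strings are invented for the sketch.

#include <stdio.h>

struct node {
        const char *name;
        struct node *next;
};

static void show_all(struct node *head)
{
        struct node *v;

        for (v = head; v; v = v->next) {
                if (v == head) {                        /* the sentinel */
                        printf("Personalities : [raid1]\n");
                        if (!head->next)                /* no arrays at all */
                                printf("unused devices: <none>\n");
                        continue;
                }
                printf("%s : active\n", v->name);
                if (!v->next)                           /* last real entry */
                        printf("unused devices: <none>\n");
        }
}

int main(void)
{
        struct node md1 = { "md1", NULL };
        struct node md0 = { "md0", &md1 };
        struct node head = { "head", &md0 };

        show_all(&head);
        return 0;
}
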
diff --combined drivers/nvme/host/core.c
@@@ -132,7 -132,7 +132,7 @@@ void nvme_queue_scan(struct nvme_ctrl *
        /*
         * Only new queue scan work when admin and IO queues are both alive
         */
 -      if (ctrl->state == NVME_CTRL_LIVE && ctrl->tagset)
 +      if (nvme_ctrl_state(ctrl) == NVME_CTRL_LIVE && ctrl->tagset)
                queue_work(nvme_wq, &ctrl->scan_work);
  }
  
   */
  int nvme_try_sched_reset(struct nvme_ctrl *ctrl)
  {
 -      if (ctrl->state != NVME_CTRL_RESETTING)
 +      if (nvme_ctrl_state(ctrl) != NVME_CTRL_RESETTING)
                return -EBUSY;
        if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
                return -EBUSY;
@@@ -157,7 -157,7 +157,7 @@@ static void nvme_failfast_work(struct w
        struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
                        struct nvme_ctrl, failfast_work);
  
 -      if (ctrl->state != NVME_CTRL_CONNECTING)
 +      if (nvme_ctrl_state(ctrl) != NVME_CTRL_CONNECTING)
                return;
  
        set_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
@@@ -201,7 -201,7 +201,7 @@@ int nvme_reset_ctrl_sync(struct nvme_ct
        ret = nvme_reset_ctrl(ctrl);
        if (!ret) {
                flush_work(&ctrl->reset_work);
 -              if (ctrl->state != NVME_CTRL_LIVE)
 +              if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE)
                        ret = -ENETRESET;
        }
  
@@@ -503,7 -503,7 +503,7 @@@ bool nvme_change_ctrl_state(struct nvme
  
        spin_lock_irqsave(&ctrl->lock, flags);
  
 -      old_state = ctrl->state;
 +      old_state = nvme_ctrl_state(ctrl);
        switch (new_state) {
        case NVME_CTRL_LIVE:
                switch (old_state) {
        }
  
        if (changed) {
 -              ctrl->state = new_state;
 +              WRITE_ONCE(ctrl->state, new_state);
                wake_up_all(&ctrl->state_wq);
        }
  
        if (!changed)
                return false;
  
 -      if (ctrl->state == NVME_CTRL_LIVE) {
 +      if (new_state == NVME_CTRL_LIVE) {
                if (old_state == NVME_CTRL_CONNECTING)
                        nvme_stop_failfast_work(ctrl);
                nvme_kick_requeue_lists(ctrl);
 -      } else if (ctrl->state == NVME_CTRL_CONNECTING &&
 +      } else if (new_state == NVME_CTRL_CONNECTING &&
                old_state == NVME_CTRL_RESETTING) {
                nvme_start_failfast_work(ctrl);
        }
@@@ -596,7 -596,7 +596,7 @@@ EXPORT_SYMBOL_GPL(nvme_change_ctrl_stat
   */
  static bool nvme_state_terminal(struct nvme_ctrl *ctrl)
  {
 -      switch (ctrl->state) {
 +      switch (nvme_ctrl_state(ctrl)) {
        case NVME_CTRL_NEW:
        case NVME_CTRL_LIVE:
        case NVME_CTRL_RESETTING:
@@@ -621,7 -621,7 +621,7 @@@ bool nvme_wait_reset(struct nvme_ctrl *
        wait_event(ctrl->state_wq,
                   nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) ||
                   nvme_state_terminal(ctrl));
 -      return ctrl->state == NVME_CTRL_RESETTING;
 +      return nvme_ctrl_state(ctrl) == NVME_CTRL_RESETTING;
  }
  EXPORT_SYMBOL_GPL(nvme_wait_reset);
  
@@@ -708,11 -708,9 +708,11 @@@ EXPORT_SYMBOL_GPL(nvme_init_request)
  blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl,
                struct request *rq)
  {
 -      if (ctrl->state != NVME_CTRL_DELETING_NOIO &&
 -          ctrl->state != NVME_CTRL_DELETING &&
 -          ctrl->state != NVME_CTRL_DEAD &&
 +      enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
 +
 +      if (state != NVME_CTRL_DELETING_NOIO &&
 +          state != NVME_CTRL_DELETING &&
 +          state != NVME_CTRL_DEAD &&
            !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) &&
            !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
                return BLK_STS_RESOURCE;
@@@ -742,7 -740,7 +742,7 @@@ bool __nvme_check_ready(struct nvme_ctr
                 * command, which is require to set the queue live in the
                 * appropinquate states.
                 */
 -              switch (ctrl->state) {
 +              switch (nvme_ctrl_state(ctrl)) {
                case NVME_CTRL_CONNECTING:
                        if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(req->cmd) &&
                            (req->cmd->fabrics.fctype == nvme_fabrics_type_connect ||
@@@ -1202,16 -1200,8 +1202,16 @@@ static unsigned long nvme_keep_alive_wo
  
  static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl)
  {
 -      queue_delayed_work(nvme_wq, &ctrl->ka_work,
 -                         nvme_keep_alive_work_period(ctrl));
 +      unsigned long now = jiffies;
 +      unsigned long delay = nvme_keep_alive_work_period(ctrl);
 +      unsigned long ka_next_check_tm = ctrl->ka_last_check_time + delay;
 +
 +      if (time_after(now, ka_next_check_tm))
 +              delay = 0;
 +      else
 +              delay = ka_next_check_tm - now;
 +
 +      queue_delayed_work(nvme_wq, &ctrl->ka_work, delay);
  }
  
  static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq,
@@@ -1497,8 -1487,7 +1497,8 @@@ static int nvme_ns_info_from_identify(s
        if (id->ncap == 0) {
                /* namespace not allocated or attached */
                info->is_removed = true;
 -              return -ENODEV;
 +              ret = -ENODEV;
 +              goto error;
        }
  
        info->anagrpid = id->anagrpid;
                    !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
                        memcpy(ids->nguid, id->nguid, sizeof(ids->nguid));
        }
 +
 +error:
        kfree(id);
 -      return 0;
 +      return ret;
  }
  
  static int nvme_ns_info_from_id_cs_indep(struct nvme_ctrl *ctrl,
@@@ -1740,13 -1727,13 +1740,13 @@@ static void nvme_config_discard(struct 
                struct nvme_ns_head *head)
  {
        struct request_queue *queue = disk->queue;
-       u32 size = queue_logical_block_size(queue);
+       u32 max_discard_sectors;
  
-       if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(head, UINT_MAX))
-               ctrl->max_discard_sectors =
-                       nvme_lba_to_sect(head, ctrl->dmrsl);
-       if (ctrl->max_discard_sectors == 0) {
+       if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(head, UINT_MAX)) {
+               max_discard_sectors = nvme_lba_to_sect(head, ctrl->dmrsl);
+       } else if (ctrl->oncs & NVME_CTRL_ONCS_DSM) {
+               max_discard_sectors = UINT_MAX;
+       } else {
                blk_queue_max_discard_sectors(queue, 0);
                return;
        }
        BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
                        NVME_DSM_MAX_RANGES);
  
-       queue->limits.discard_granularity = size;
-       /* If discard is already enabled, don't reset queue limits */
+       /*
+        * If discard is already enabled, don't reset queue limits.
+        *
+        * This works around the fact that the block layer can't cope well with
+        * updating the hardware limits when overridden through sysfs.  This is
+        * harmless because discard limits in NVMe are purely advisory.
+        */
        if (queue->limits.max_discard_sectors)
                return;
  
-       blk_queue_max_discard_sectors(queue, ctrl->max_discard_sectors);
-       blk_queue_max_discard_segments(queue, ctrl->max_discard_segments);
+       blk_queue_max_discard_sectors(queue, max_discard_sectors);
+       if (ctrl->dmrl)
+               blk_queue_max_discard_segments(queue, ctrl->dmrl);
+       else
+               blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES);
+       queue->limits.discard_granularity = queue_logical_block_size(queue);
  
        if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
                blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
@@@ -1912,10 -1907,9 +1920,10 @@@ static void nvme_update_disk_info(struc
  
        /*
         * The block layer can't support LBA sizes larger than the page size
 -       * yet, so catch this early and don't allow block I/O.
 +       * or smaller than a sector size yet, so catch this early and don't
 +       * allow block I/O.
         */
 -      if (head->lba_shift > PAGE_SHIFT) {
 +      if (head->lba_shift > PAGE_SHIFT || head->lba_shift < SECTOR_SHIFT) {
                capacity = 0;
                bs = (1 << 9);
        }
@@@ -2052,13 -2046,6 +2060,13 @@@ static int nvme_update_ns_info_block(st
        if (ret)
                return ret;
  
 +      if (id->ncap == 0) {
 +              /* namespace not allocated or attached */
 +              info->is_removed = true;
 +              ret = -ENODEV;
 +              goto error;
 +      }
 +
        blk_mq_freeze_queue(ns->disk->queue);
        lbaf = nvme_lbaf_index(id->flbas);
        ns->head->lba_shift = id->lbaf[lbaf].ds;
@@@ -2121,8 -2108,6 +2129,8 @@@ out
                set_bit(NVME_NS_READY, &ns->flags);
                ret = 0;
        }
 +
 +error:
        kfree(id);
        return ret;
  }
@@@ -2562,7 -2547,7 +2570,7 @@@ static void nvme_set_latency_tolerance(
  
        if (ctrl->ps_max_latency_us != latency) {
                ctrl->ps_max_latency_us = latency;
 -              if (ctrl->state == NVME_CTRL_LIVE)
 +              if (nvme_ctrl_state(ctrl) == NVME_CTRL_LIVE)
                        nvme_configure_apst(ctrl);
        }
  }
@@@ -2930,14 -2915,6 +2938,6 @@@ static int nvme_init_non_mdts_limits(st
        struct nvme_id_ctrl_nvm *id;
        int ret;
  
-       if (ctrl->oncs & NVME_CTRL_ONCS_DSM) {
-               ctrl->max_discard_sectors = UINT_MAX;
-               ctrl->max_discard_segments = NVME_DSM_MAX_RANGES;
-       } else {
-               ctrl->max_discard_sectors = 0;
-               ctrl->max_discard_segments = 0;
-       }
        /*
         * Even though NVMe spec explicitly states that MDTS is not applicable
         * to the write-zeroes, we are cautious and limit the size to the
        if (ret)
                goto free_data;
  
-       if (id->dmrl)
-               ctrl->max_discard_segments = id->dmrl;
+       ctrl->dmrl = id->dmrl;
        ctrl->dmrsl = le32_to_cpu(id->dmrsl);
        if (id->wzsl)
                ctrl->max_zeroes_sectors = nvme_mps_to_sectors(ctrl, id->wzsl);
@@@ -3270,7 -3246,7 +3269,7 @@@ static int nvme_dev_open(struct inode *
        struct nvme_ctrl *ctrl =
                container_of(inode->i_cdev, struct nvme_ctrl, cdev);
  
 -      switch (ctrl->state) {
 +      switch (nvme_ctrl_state(ctrl)) {
        case NVME_CTRL_LIVE:
                break;
        default:
@@@ -3694,14 -3670,6 +3693,14 @@@ static void nvme_alloc_ns(struct nvme_c
                goto out_unlink_ns;
  
        down_write(&ctrl->namespaces_rwsem);
 +      /*
 +       * Ensure that no namespaces are added to the ctrl list after the queues
 +       * are frozen, thereby avoiding a deadlock between scan and reset.
 +       */
 +      if (test_bit(NVME_CTRL_FROZEN, &ctrl->flags)) {
 +              up_write(&ctrl->namespaces_rwsem);
 +              goto out_unlink_ns;
 +      }
        nvme_ns_add_to_ctrl_list(ns);
        up_write(&ctrl->namespaces_rwsem);
        nvme_get_ctrl(ctrl);
@@@ -3966,7 -3934,7 +3965,7 @@@ static void nvme_scan_work(struct work_
        int ret;
  
        /* No tagset on a live ctrl means IO queues could not created */
 -      if (ctrl->state != NVME_CTRL_LIVE || !ctrl->tagset)
 +      if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE || !ctrl->tagset)
                return;
  
        /*
@@@ -4036,7 -4004,7 +4035,7 @@@ void nvme_remove_namespaces(struct nvme
         * removing the namespaces' disks; fail all the queues now to avoid
         * potentially having to clean up the failed sync later.
         */
 -      if (ctrl->state == NVME_CTRL_DEAD)
 +      if (nvme_ctrl_state(ctrl) == NVME_CTRL_DEAD)
                nvme_mark_namespaces_dead(ctrl);
  
        /* this is a no-op when called from the controller reset handler */
@@@ -4118,7 -4086,7 +4117,7 @@@ static void nvme_async_event_work(struc
         * flushing ctrl async_event_work after changing the controller state
         * from LIVE and before freeing the admin queue.
        */
 -      if (ctrl->state == NVME_CTRL_LIVE)
 +      if (nvme_ctrl_state(ctrl) == NVME_CTRL_LIVE)
                ctrl->ops->submit_async_event(ctrl);
  }
  
@@@ -4169,8 -4137,6 +4168,8 @@@ static void nvme_fw_act_work(struct wor
                                struct nvme_ctrl, fw_act_work);
        unsigned long fw_act_timeout;
  
 +      nvme_auth_stop(ctrl);
 +
        if (ctrl->mtfa)
                fw_act_timeout = jiffies +
                                msecs_to_jiffies(ctrl->mtfa * 100);
@@@ -4226,6 -4192,7 +4225,6 @@@ static bool nvme_handle_aen_notice(stru
                 * firmware activation.
                 */
                if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) {
 -                      nvme_auth_stop(ctrl);
                        requeue = false;
                        queue_work(nvme_wq, &ctrl->fw_act_work);
                }
@@@ -4514,7 -4481,7 +4513,7 @@@ int nvme_init_ctrl(struct nvme_ctrl *ct
  {
        int ret;
  
 -      ctrl->state = NVME_CTRL_NEW;
 +      WRITE_ONCE(ctrl->state, NVME_CTRL_NEW);
        clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
        spin_lock_init(&ctrl->lock);
        mutex_init(&ctrl->scan_lock);
        INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work);
        memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
        ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
 +      ctrl->ka_last_check_time = jiffies;
  
        BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) >
                        PAGE_SIZE);
@@@ -4624,7 -4590,6 +4623,7 @@@ void nvme_unfreeze(struct nvme_ctrl *ct
        list_for_each_entry(ns, &ctrl->namespaces, list)
                blk_mq_unfreeze_queue(ns->queue);
        up_read(&ctrl->namespaces_rwsem);
 +      clear_bit(NVME_CTRL_FROZEN, &ctrl->flags);
  }
  EXPORT_SYMBOL_GPL(nvme_unfreeze);
  
@@@ -4658,7 -4623,6 +4657,7 @@@ void nvme_start_freeze(struct nvme_ctr
  {
        struct nvme_ns *ns;
  
 +      set_bit(NVME_CTRL_FROZEN, &ctrl->flags);
        down_read(&ctrl->namespaces_rwsem);
        list_for_each_entry(ns, &ctrl->namespaces, list)
                blk_freeze_queue_start(ns->queue);
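
Among the core.c changes above, nvme_queue_keep_alive_work() now accounts for how long the previous keep-alive actually took: if the next check time has already passed, the work is queued with zero delay instead of waiting another full period. A small sketch of that arithmetic with plain integers standing in for jiffies; ka_delay() and its parameters are illustrative names.

#include <stdio.h>

static unsigned long ka_delay(unsigned long now, unsigned long last_check,
                              unsigned long period)
{
        unsigned long next_check = last_check + period;

        /* The kernel uses time_after() here, which also copes with jiffies
         * wrap-around; a plain comparison is enough for the sketch. */
        if (now > next_check)
                return 0;               /* overdue: run the work right away */
        return next_check - now;        /* otherwise wait out the remainder */
}

int main(void)
{
        printf("%lu\n", ka_delay(1000, 900, 500));  /* 400: within the period */
        printf("%lu\n", ka_delay(2000, 900, 500));  /* 0: already overdue */
        return 0;
}
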
diff --combined drivers/nvme/host/nvme.h
@@@ -157,11 -157,6 +157,11 @@@ enum nvme_quirks 
         * No temperature thresholds for channels other than 0 (Composite).
         */
        NVME_QUIRK_NO_SECONDARY_TEMP_THRESH     = (1 << 19),
 +
 +      /*
 +       * Disables simple suspend/resume path.
 +       */
 +      NVME_QUIRK_FORCE_NO_SIMPLE_SUSPEND      = (1 << 20),
  };
  
  /*
@@@ -257,7 -252,6 +257,7 @@@ enum nvme_ctrl_flags 
        NVME_CTRL_STOPPED               = 3,
        NVME_CTRL_SKIP_ID_CNS_CS        = 4,
        NVME_CTRL_DIRTY_CAPABILITY      = 5,
 +      NVME_CTRL_FROZEN                = 6,
  };
  
  struct nvme_ctrl {
        u32 max_hw_sectors;
        u32 max_segments;
        u32 max_integrity_segments;
-       u32 max_discard_sectors;
-       u32 max_discard_segments;
        u32 max_zeroes_sectors;
  #ifdef CONFIG_BLK_DEV_ZONED
        u32 max_zone_append;
  #endif
        u16 crdt[3];
        u16 oncs;
+       u8 dmrl;
        u32 dmrsl;
        u16 oacs;
        u16 sqsize;
        enum nvme_dctype dctype;
  };
  
 +static inline enum nvme_ctrl_state nvme_ctrl_state(struct nvme_ctrl *ctrl)
 +{
 +      return READ_ONCE(ctrl->state);
 +}
 +
  enum nvme_iopolicy {
        NVME_IOPOLICY_NUMA,
        NVME_IOPOLICY_RR,
@@@ -932,6 -920,10 +931,10 @@@ extern struct device_attribute dev_attr
  extern struct device_attribute dev_attr_ana_state;
  extern struct device_attribute subsys_attr_iopolicy;
  
+ static inline bool nvme_disk_is_ns_head(struct gendisk *disk)
+ {
+       return disk->fops == &nvme_ns_head_ops;
+ }
  #else
  #define multipath false
  static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
@@@ -1009,6 -1001,10 +1012,10 @@@ static inline void nvme_mpath_start_req
  static inline void nvme_mpath_end_request(struct request *rq)
  {
  }
+ static inline bool nvme_disk_is_ns_head(struct gendisk *disk)
+ {
+       return false;
+ }
  #endif /* CONFIG_NVME_MULTIPATH */
  
  int nvme_revalidate_zones(struct nvme_ns *ns);
@@@ -1037,7 -1033,10 +1044,10 @@@ static inline int nvme_update_zone_info
  
  static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev)
  {
-       return dev_to_disk(dev)->private_data;
+       struct gendisk *disk = dev_to_disk(dev);
+       WARN_ON(nvme_disk_is_ns_head(disk));
+       return disk->private_data;
  }
  
  #ifdef CONFIG_NVME_HWMON
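
The nvme.h hunk above adds the nvme_ctrl_state() helper, which snapshots ctrl->state with READ_ONCE() while writers update it under the controller lock with WRITE_ONCE(); callers throughout core.c, pci.c, rdma.c, and tcp.c then compare a single snapshot instead of re-reading the field. A rough userspace analogue of that pattern using C11 relaxed atomics; the struct, enum, and function names are simplified stand-ins.

#include <stdatomic.h>
#include <stdio.h>

enum ctrl_state { CTRL_NEW, CTRL_LIVE, CTRL_RESETTING, CTRL_DELETING };

struct ctrl {
        _Atomic int state;              /* stands in for ctrl->state */
};

static enum ctrl_state ctrl_state(struct ctrl *c)       /* READ_ONCE() */
{
        return atomic_load_explicit(&c->state, memory_order_relaxed);
}

static void ctrl_set_state(struct ctrl *c, enum ctrl_state s) /* WRITE_ONCE() */
{
        atomic_store_explicit(&c->state, s, memory_order_relaxed);
}

int main(void)
{
        struct ctrl c = { CTRL_NEW };
        enum ctrl_state st;

        ctrl_set_state(&c, CTRL_LIVE);
        /* Readers take one snapshot and compare that, instead of re-reading
         * the state for every comparison. */
        st = ctrl_state(&c);
        if (st != CTRL_DELETING && st != CTRL_RESETTING)
                printf("state %d: safe to submit\n", st);
        return 0;
}
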
diff --combined drivers/nvme/host/pci.c
@@@ -1233,7 -1233,7 +1233,7 @@@ static bool nvme_should_reset(struct nv
        bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);
  
        /* If there is a reset/reinit ongoing, we shouldn't reset again. */
 -      switch (dev->ctrl.state) {
 +      switch (nvme_ctrl_state(&dev->ctrl)) {
        case NVME_CTRL_RESETTING:
        case NVME_CTRL_CONNECTING:
                return false;
@@@ -1284,6 -1284,7 +1284,7 @@@ static enum blk_eh_timer_return nvme_ti
        struct request *abort_req;
        struct nvme_command cmd = { };
        u32 csts = readl(dev->bar + NVME_REG_CSTS);
+       u8 opcode;
  
        /* If PCI error recovery process is happening, we cannot reset or
         * the recovery mechanism will surely fail.
  
        if (blk_mq_rq_state(req) != MQ_RQ_IN_FLIGHT) {
                dev_warn(dev->ctrl.device,
-                        "I/O %d QID %d timeout, completion polled\n",
-                        req->tag, nvmeq->qid);
+                        "I/O tag %d (%04x) QID %d timeout, completion polled\n",
+                        req->tag, nvme_cid(req), nvmeq->qid);
                return BLK_EH_DONE;
        }
  
         * cancellation error. All outstanding requests are completed on
         * shutdown, so we return BLK_EH_DONE.
         */
 -      switch (dev->ctrl.state) {
 +      switch (nvme_ctrl_state(&dev->ctrl)) {
        case NVME_CTRL_CONNECTING:
                nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
                fallthrough;
        case NVME_CTRL_DELETING:
                dev_warn_ratelimited(dev->ctrl.device,
-                        "I/O %d QID %d timeout, disable controller\n",
-                        req->tag, nvmeq->qid);
+                        "I/O tag %d (%04x) QID %d timeout, disable controller\n",
+                        req->tag, nvme_cid(req), nvmeq->qid);
                nvme_req(req)->flags |= NVME_REQ_CANCELLED;
                nvme_dev_disable(dev, true);
                return BLK_EH_DONE;
         * command was already aborted once before and still hasn't been
         * returned to the driver, or if this is the admin queue.
         */
+       opcode = nvme_req(req)->cmd->common.opcode;
        if (!nvmeq->qid || iod->aborted) {
                dev_warn(dev->ctrl.device,
-                        "I/O %d QID %d timeout, reset controller\n",
-                        req->tag, nvmeq->qid);
+                        "I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, reset controller\n",
+                        req->tag, nvme_cid(req), opcode,
+                        nvme_opcode_str(nvmeq->qid, opcode, 0), nvmeq->qid);
                nvme_req(req)->flags |= NVME_REQ_CANCELLED;
                goto disable;
        }
        cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
  
        dev_warn(nvmeq->dev->ctrl.device,
-               "I/O %d (%s) QID %d timeout, aborting\n",
-                req->tag,
-                nvme_get_opcode_str(nvme_req(req)->cmd->common.opcode),
-                nvmeq->qid);
+                "I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, aborting req_op:%s(%u) size:%u\n",
+                req->tag, nvme_cid(req), opcode, nvme_get_opcode_str(opcode),
+                nvmeq->qid, blk_op_str(req_op(req)), req_op(req),
+                blk_rq_bytes(req));
  
        abort_req = blk_mq_alloc_request(dev->ctrl.admin_q, nvme_req_op(&cmd),
                                         BLK_MQ_REQ_NOWAIT);
@@@ -1593,7 -1596,7 +1596,7 @@@ static int nvme_setup_io_queues_trylock
        /*
         * Controller is in wrong state, fail early.
         */
 -      if (dev->ctrl.state != NVME_CTRL_CONNECTING) {
 +      if (nvme_ctrl_state(&dev->ctrl) != NVME_CTRL_CONNECTING) {
                mutex_unlock(&dev->shutdown_lock);
                return -ENODEV;
        }
@@@ -2573,13 -2576,13 +2576,13 @@@ static bool nvme_pci_ctrl_is_dead(struc
  
  static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
  {
 +      enum nvme_ctrl_state state = nvme_ctrl_state(&dev->ctrl);
        struct pci_dev *pdev = to_pci_dev(dev->dev);
        bool dead;
  
        mutex_lock(&dev->shutdown_lock);
        dead = nvme_pci_ctrl_is_dead(dev);
 -      if (dev->ctrl.state == NVME_CTRL_LIVE ||
 -          dev->ctrl.state == NVME_CTRL_RESETTING) {
 +      if (state == NVME_CTRL_LIVE || state == NVME_CTRL_RESETTING) {
                if (pci_is_enabled(pdev))
                        nvme_start_freeze(&dev->ctrl);
                /*
@@@ -2690,7 -2693,7 +2693,7 @@@ static void nvme_reset_work(struct work
        bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
        int result;
  
 -      if (dev->ctrl.state != NVME_CTRL_RESETTING) {
 +      if (nvme_ctrl_state(&dev->ctrl) != NVME_CTRL_RESETTING) {
                dev_warn(dev->ctrl.device, "ctrl state %d is not RESETTING\n",
                         dev->ctrl.state);
                result = -ENODEV;
         * controller around but remove all namespaces.
         */
        if (dev->online_queues > 1) {
+               nvme_dbbuf_set(dev);
                nvme_unquiesce_io_queues(&dev->ctrl);
                nvme_wait_freeze(&dev->ctrl);
                nvme_pci_update_nr_queues(dev);
-               nvme_dbbuf_set(dev);
                nvme_unfreeze(&dev->ctrl);
        } else {
                dev_warn(dev->ctrl.device, "IO queues lost\n");
@@@ -2902,18 -2905,6 +2905,18 @@@ static unsigned long check_vendor_combi
                if ((dmi_match(DMI_BOARD_VENDOR, "LENOVO")) &&
                     dmi_match(DMI_BOARD_NAME, "LNVNB161216"))
                        return NVME_QUIRK_SIMPLE_SUSPEND;
 +      } else if (pdev->vendor == 0x2646 && (pdev->device == 0x2263 ||
 +                 pdev->device == 0x500f)) {
 +              /*
 +               * Exclude some Kingston NV1 and A2000 devices from
 +               * NVME_QUIRK_SIMPLE_SUSPEND. Do a full suspend to save a
 +               * lot fo energy with s2idle sleep on some TUXEDO platforms.
 +               * lot of energy with s2idle sleep on some TUXEDO platforms.
 +              if (dmi_match(DMI_BOARD_NAME, "NS5X_NS7XAU") ||
 +                  dmi_match(DMI_BOARD_NAME, "NS5x_7xAU") ||
 +                  dmi_match(DMI_BOARD_NAME, "NS5x_7xPU") ||
 +                  dmi_match(DMI_BOARD_NAME, "PH4PRX1_PH6PRX1"))
 +                      return NVME_QUIRK_FORCE_NO_SIMPLE_SUSPEND;
        }
  
        return 0;
@@@ -2944,9 -2935,7 +2947,9 @@@ static struct nvme_dev *nvme_pci_alloc_
        dev->dev = get_device(&pdev->dev);
  
        quirks |= check_vendor_combination_bug(pdev);
 -      if (!noacpi && acpi_storage_d3(&pdev->dev)) {
 +      if (!noacpi &&
 +          !(quirks & NVME_QUIRK_FORCE_NO_SIMPLE_SUSPEND) &&
 +          acpi_storage_d3(&pdev->dev)) {
                /*
                 * Some systems use a bios work around to ask for D3 on
                 * platforms that support kernel managed suspend.
@@@ -3206,7 -3195,7 +3209,7 @@@ static int nvme_suspend(struct device *
        nvme_wait_freeze(ctrl);
        nvme_sync_queues(ctrl);
  
 -      if (ctrl->state != NVME_CTRL_LIVE)
 +      if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE)
                goto unfreeze;
  
        /*
@@@ -3408,6 -3397,8 +3411,8 @@@ static const struct pci_device_id nvme_
                .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
        { PCI_DEVICE(0x1c5c, 0x174a),   /* SK Hynix P31 SSD */
                .driver_data = NVME_QUIRK_BOGUS_NID, },
+       { PCI_DEVICE(0x1c5c, 0x1D59),   /* SK Hynix BC901 */
+               .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
        { PCI_DEVICE(0x15b7, 0x2001),   /*  Sandisk Skyhawk */
                .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
        { PCI_DEVICE(0x1d97, 0x2263),   /* SPCC */
diff --combined drivers/nvme/host/rdma.c
@@@ -984,11 -984,10 +984,11 @@@ free_ctrl
  
  static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl)
  {
 +      enum nvme_ctrl_state state = nvme_ctrl_state(&ctrl->ctrl);
 +
        /* If we are resetting/deleting then do nothing */
 -      if (ctrl->ctrl.state != NVME_CTRL_CONNECTING) {
 -              WARN_ON_ONCE(ctrl->ctrl.state == NVME_CTRL_NEW ||
 -                      ctrl->ctrl.state == NVME_CTRL_LIVE);
 +      if (state != NVME_CTRL_CONNECTING) {
 +              WARN_ON_ONCE(state == NVME_CTRL_NEW || state == NVME_CTRL_LIVE);
                return;
        }
  
@@@ -1060,10 -1059,8 +1060,10 @@@ static int nvme_rdma_setup_ctrl(struct 
                 * unless we're during creation of a new controller to
                 * avoid races with teardown flow.
                 */
 -              WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING &&
 -                           ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO);
 +              enum nvme_ctrl_state state = nvme_ctrl_state(&ctrl->ctrl);
 +
 +              WARN_ON_ONCE(state != NVME_CTRL_DELETING &&
 +                           state != NVME_CTRL_DELETING_NOIO);
                WARN_ON_ONCE(new);
                ret = -EINVAL;
                goto destroy_io;
@@@ -1132,10 -1129,8 +1132,10 @@@ static void nvme_rdma_error_recovery_wo
  
        if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
                /* state change failure is ok if we started ctrl delete */
 -              WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING &&
 -                           ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO);
 +              enum nvme_ctrl_state state = nvme_ctrl_state(&ctrl->ctrl);
 +
 +              WARN_ON_ONCE(state != NVME_CTRL_DELETING &&
 +                           state != NVME_CTRL_DELETING_NOIO);
                return;
        }
  
@@@ -1167,7 -1162,7 +1167,7 @@@ static void nvme_rdma_wr_error(struct i
        struct nvme_rdma_queue *queue = wc->qp->qp_context;
        struct nvme_rdma_ctrl *ctrl = queue->ctrl;
  
 -      if (ctrl->ctrl.state == NVME_CTRL_LIVE)
 +      if (nvme_ctrl_state(&ctrl->ctrl) == NVME_CTRL_LIVE)
                dev_info(ctrl->ctrl.device,
                             "%s for CQE 0x%p failed with status %s (%d)\n",
                             op, wc->wr_cqe,
@@@ -1946,11 -1941,16 +1946,16 @@@ static enum blk_eh_timer_return nvme_rd
        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
        struct nvme_rdma_queue *queue = req->queue;
        struct nvme_rdma_ctrl *ctrl = queue->ctrl;
-       dev_warn(ctrl->ctrl.device, "I/O %d QID %d timeout\n",
-                rq->tag, nvme_rdma_queue_idx(queue));
+       u8 opcode = req->req.cmd->common.opcode;
+       u8 fctype = req->req.cmd->fabrics.fctype;
+       int qid = nvme_rdma_queue_idx(queue);
+       dev_warn(ctrl->ctrl.device,
+                "I/O tag %d (%04x) opcode %#x (%s) QID %d timeout\n",
+                rq->tag, nvme_cid(rq), opcode,
+                nvme_opcode_str(qid, opcode, fctype), qid);
  
 -      if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
 +      if (nvme_ctrl_state(&ctrl->ctrl) != NVME_CTRL_LIVE) {
                /*
                 * If we are resetting, connecting or deleting we should
                 * complete immediately because we may block controller
diff --combined drivers/nvme/host/tcp.c
@@@ -1922,14 -1922,13 +1922,13 @@@ static int nvme_tcp_alloc_admin_queue(s
                                                      ctrl->opts->subsysnqn);
                if (!pskid) {
                        dev_err(ctrl->device, "no valid PSK found\n");
-                       ret = -ENOKEY;
-                       goto out_free_queue;
+                       return -ENOKEY;
                }
        }
  
        ret = nvme_tcp_alloc_queue(ctrl, 0, pskid);
        if (ret)
-               goto out_free_queue;
+               return ret;
  
        ret = nvme_tcp_alloc_async_req(to_tcp_ctrl(ctrl));
        if (ret)
@@@ -2152,11 -2151,10 +2151,11 @@@ static void nvme_tcp_teardown_io_queues
  
  static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
  {
 +      enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
 +
        /* If we are resetting/deleting then do nothing */
 -      if (ctrl->state != NVME_CTRL_CONNECTING) {
 -              WARN_ON_ONCE(ctrl->state == NVME_CTRL_NEW ||
 -                      ctrl->state == NVME_CTRL_LIVE);
 +      if (state != NVME_CTRL_CONNECTING) {
 +              WARN_ON_ONCE(state == NVME_CTRL_NEW || state == NVME_CTRL_LIVE);
                return;
        }
  
@@@ -2216,10 -2214,8 +2215,10 @@@ static int nvme_tcp_setup_ctrl(struct n
                 * unless we're during creation of a new controller to
                 * avoid races with teardown flow.
                 */
 -              WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
 -                           ctrl->state != NVME_CTRL_DELETING_NOIO);
 +              enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
 +
 +              WARN_ON_ONCE(state != NVME_CTRL_DELETING &&
 +                           state != NVME_CTRL_DELETING_NOIO);
                WARN_ON_ONCE(new);
                ret = -EINVAL;
                goto destroy_io;
@@@ -2283,10 -2279,8 +2282,10 @@@ static void nvme_tcp_error_recovery_wor
  
        if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
                /* state change failure is ok if we started ctrl delete */
 -              WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
 -                           ctrl->state != NVME_CTRL_DELETING_NOIO);
 +              enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
 +
 +              WARN_ON_ONCE(state != NVME_CTRL_DELETING &&
 +                           state != NVME_CTRL_DELETING_NOIO);
                return;
        }
  
@@@ -2316,10 -2310,8 +2315,10 @@@ static void nvme_reset_ctrl_work(struc
  
        if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
                /* state change failure is ok if we started ctrl delete */
 -              WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
 -                           ctrl->state != NVME_CTRL_DELETING_NOIO);
 +              enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
 +
 +              WARN_ON_ONCE(state != NVME_CTRL_DELETING &&
 +                           state != NVME_CTRL_DELETING_NOIO);
                return;
        }
  
@@@ -2433,11 -2425,11 +2432,11 @@@ static enum blk_eh_timer_return nvme_tc
        int qid = nvme_tcp_queue_id(req->queue);
  
        dev_warn(ctrl->device,
-               "queue %d: timeout cid %#x type %d opcode %#x (%s)\n",
-               nvme_tcp_queue_id(req->queue), nvme_cid(rq), pdu->hdr.type,
-               opc, nvme_opcode_str(qid, opc, fctype));
+                "I/O tag %d (%04x) type %d opcode %#x (%s) QID %d timeout\n",
+                rq->tag, nvme_cid(rq), pdu->hdr.type, opc,
+                nvme_opcode_str(qid, opc, fctype), qid);
  
 -      if (ctrl->state != NVME_CTRL_LIVE) {
 +      if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE) {
                /*
                 * If we are resetting, connecting or deleting we should
                 * complete immediately because we may block controller