Merge branch 'akpm' (patches from Andrew)

author Linus Torvalds <torvalds@linux-foundation.org>

Fri, 12 Jun 2020 01:18:50 +0000 (18:18 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Fri, 12 Jun 2020 01:18:50 +0000 (18:18 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 12 Jun 2020 01:18:50 +0000 (18:18 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 12 Jun 2020 01:18:50 +0000 (18:18 -0700)
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h

index 8f1e94f..a338a6d 100644 (file)
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -89,6 +89,8 @@
  #define INTEL_FAM6_COMETLAKE           0xA5
  #define INTEL_FAM6_COMETLAKE_L         0xA6
  
+#define INTEL_FAM6_SAPPHIRERAPIDS_X    0x8F
+
  /* "Small Core" Processors (Atom) */
  
  #define INTEL_FAM6_ATOM_BONNELL                0x1C /* Diamondville, Pineview */
diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h

index 9a6dc9b..fb81fea 100644 (file)
--- a/arch/x86/include/asm/vdso/gettimeofday.h
+++ b/arch/x86/include/asm/vdso/gettimeofday.h
@@ -271,6 +271,24 @@ static __always_inline const struct vdso_data *__arch_get_vdso_data(void)
         return __vdso_data;
  }
  
+static inline bool arch_vdso_clocksource_ok(const struct vdso_data *vd)
+{
+       return true;
+}
+#define vdso_clocksource_ok arch_vdso_clocksource_ok
+
+/*
+ * Clocksource read value validation to handle PV and HyperV clocksources
+ * which can be invalidated asynchronously and indicate invalidation by
+ * returning U64_MAX, which can be effectively tested by checking for a
+ * negative value after casting it to s64.
+ */
+static inline bool arch_vdso_cycles_ok(u64 cycles)
+{
+       return (s64)cycles >= 0;
+}
+#define vdso_cycles_ok arch_vdso_cycles_ok
+
  /*
   * x86 specific delta calculation.
   *
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c

index 4b1d31b..bf4acb0 100644 (file)
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -2060,7 +2060,7 @@ void __init init_apic_mappings(void)
         unsigned int new_apicid;
  
         if (apic_validate_deadline_timer())
-               pr_debug("TSC deadline timer available\n");
+               pr_info("TSC deadline timer available\n");
  
         if (x2apic_mode) {
                 boot_cpu_physical_apicid = read_apic_id();
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c

index b6f887b..0b71970 100644 (file)
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -588,7 +588,9 @@ early_param("nospectre_v1", nospectre_v1_cmdline);
  static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init =
         SPECTRE_V2_NONE;
  
-static enum spectre_v2_user_mitigation spectre_v2_user __ro_after_init =
+static enum spectre_v2_user_mitigation spectre_v2_user_stibp __ro_after_init =
+       SPECTRE_V2_USER_NONE;
+static enum spectre_v2_user_mitigation spectre_v2_user_ibpb __ro_after_init =
         SPECTRE_V2_USER_NONE;
  
  #ifdef CONFIG_RETPOLINE
@@ -734,15 +736,6 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
                 break;
         }
  
-       /*
-        * At this point, an STIBP mode other than "off" has been set.
-        * If STIBP support is not being forced, check if STIBP always-on
-        * is preferred.
-        */
-       if (mode != SPECTRE_V2_USER_STRICT &&
-           boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON))
-               mode = SPECTRE_V2_USER_STRICT_PREFERRED;
-
         /* Initialize Indirect Branch Prediction Barrier */
         if (boot_cpu_has(X86_FEATURE_IBPB)) {
                 setup_force_cpu_cap(X86_FEATURE_USE_IBPB);
@@ -765,23 +758,36 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
                 pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n",
                         static_key_enabled(&switch_mm_always_ibpb) ?
                         "always-on" : "conditional");
+
+               spectre_v2_user_ibpb = mode;
         }
  
-       /* If enhanced IBRS is enabled no STIBP required */
-       if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
+       /*
+        * If enhanced IBRS is enabled or SMT impossible, STIBP is not
+        * required.
+        */
+       if (!smt_possible || spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
                 return;
  
         /*
-        * If SMT is not possible or STIBP is not available clear the STIBP
-        * mode.
+        * At this point, an STIBP mode other than "off" has been set.
+        * If STIBP support is not being forced, check if STIBP always-on
+        * is preferred.
          */
-       if (!smt_possible || !boot_cpu_has(X86_FEATURE_STIBP))
+       if (mode != SPECTRE_V2_USER_STRICT &&
+           boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON))
+               mode = SPECTRE_V2_USER_STRICT_PREFERRED;
+
+       /*
+        * If STIBP is not available, clear the STIBP mode.
+        */
+       if (!boot_cpu_has(X86_FEATURE_STIBP))
                 mode = SPECTRE_V2_USER_NONE;
+
+       spectre_v2_user_stibp = mode;
+
  set_mode:
-       spectre_v2_user = mode;
-       /* Only print the STIBP mode when SMT possible */
-       if (smt_possible)
-               pr_info("%s\n", spectre_v2_user_strings[mode]);
+       pr_info("%s\n", spectre_v2_user_strings[mode]);
  }
  
  static const char * const spectre_v2_strings[] = {
@@ -1014,7 +1020,7 @@ void cpu_bugs_smt_update(void)
  {
         mutex_lock(&spec_ctrl_mutex);
  
-       switch (spectre_v2_user) {
+       switch (spectre_v2_user_stibp) {
         case SPECTRE_V2_USER_NONE:
                 break;
         case SPECTRE_V2_USER_STRICT:
@@ -1257,14 +1263,19 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)
  {
         switch (ctrl) {
         case PR_SPEC_ENABLE:
-               if (spectre_v2_user == SPECTRE_V2_USER_NONE)
+               if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE &&
+                   spectre_v2_user_stibp == SPECTRE_V2_USER_NONE)
                         return 0;
                 /*
                  * Indirect branch speculation is always disabled in strict
-                * mode.
+                * mode. Nor can it be re-enabled if it was force-disabled
+                * by a previous prctl call.
+
                  */
-               if (spectre_v2_user == SPECTRE_V2_USER_STRICT ||
-                   spectre_v2_user == SPECTRE_V2_USER_STRICT_PREFERRED)
+               if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT ||
+                   spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
+                   spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED ||
+                   task_spec_ib_force_disable(task))
                         return -EPERM;
                 task_clear_spec_ib_disable(task);
                 task_update_spec_tif(task);
@@ -1275,10 +1286,12 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)
                  * Indirect branch speculation is always allowed when
                  * mitigation is force disabled.
                  */
-               if (spectre_v2_user == SPECTRE_V2_USER_NONE)
+               if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE &&
+                   spectre_v2_user_stibp == SPECTRE_V2_USER_NONE)
                         return -EPERM;
-               if (spectre_v2_user == SPECTRE_V2_USER_STRICT ||
-                   spectre_v2_user == SPECTRE_V2_USER_STRICT_PREFERRED)
+               if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT ||
+                   spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
+                   spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED)
                         return 0;
                 task_set_spec_ib_disable(task);
                 if (ctrl == PR_SPEC_FORCE_DISABLE)
@@ -1309,7 +1322,8 @@ void arch_seccomp_spec_mitigate(struct task_struct *task)
  {
         if (ssb_mode == SPEC_STORE_BYPASS_SECCOMP)
                 ssb_prctl_set(task, PR_SPEC_FORCE_DISABLE);
-       if (spectre_v2_user == SPECTRE_V2_USER_SECCOMP)
+       if (spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP ||
+           spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP)
                 ib_prctl_set(task, PR_SPEC_FORCE_DISABLE);
  }
  #endif
@@ -1340,22 +1354,24 @@ static int ib_prctl_get(struct task_struct *task)
         if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
                 return PR_SPEC_NOT_AFFECTED;
  
-       switch (spectre_v2_user) {
-       case SPECTRE_V2_USER_NONE:
+       if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE &&
+           spectre_v2_user_stibp == SPECTRE_V2_USER_NONE)
                 return PR_SPEC_ENABLE;
-       case SPECTRE_V2_USER_PRCTL:
-       case SPECTRE_V2_USER_SECCOMP:
+       else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT ||
+           spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
+           spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED)
+               return PR_SPEC_DISABLE;
+       else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_PRCTL ||
+           spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP ||
+           spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL ||
+           spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP) {
                 if (task_spec_ib_force_disable(task))
                         return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
                 if (task_spec_ib_disable(task))
                         return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
                 return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
-       case SPECTRE_V2_USER_STRICT:
-       case SPECTRE_V2_USER_STRICT_PREFERRED:
-               return PR_SPEC_DISABLE;
-       default:
+       } else
                 return PR_SPEC_NOT_AFFECTED;
-       }
  }
  
  int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
@@ -1594,7 +1610,7 @@ static char *stibp_state(void)
         if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
                 return "";
  
-       switch (spectre_v2_user) {
+       switch (spectre_v2_user_stibp) {
         case SPECTRE_V2_USER_NONE:
                 return ", STIBP: disabled";
         case SPECTRE_V2_USER_STRICT:
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c

index 63926c9..c25a67a 100644 (file)
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -1142,9 +1142,12 @@ void switch_to_sld(unsigned long tifn)
  static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = {
         X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X,           0),
         X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L,           0),
+       X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D,           0),
         X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT,        1),
         X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D,      1),
         X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L,      1),
+       X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L,         1),
+       X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE,           1),
         {}
  };
  
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c

index 8e3d034..f362ce0 100644 (file)
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -545,28 +545,20 @@ static __always_inline void __speculation_ctrl_update(unsigned long tifp,
  
         lockdep_assert_irqs_disabled();
  
-       /*
-        * If TIF_SSBD is different, select the proper mitigation
-        * method. Note that if SSBD mitigation is disabled or permanentely
-        * enabled this branch can't be taken because nothing can set
-        * TIF_SSBD.
-        */
-       if (tif_diff & _TIF_SSBD) {
-               if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) {
+       /* Handle change of TIF_SSBD depending on the mitigation method. */
+       if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) {
+               if (tif_diff & _TIF_SSBD)
                         amd_set_ssb_virt_state(tifn);
-               } else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) {
+       } else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) {
+               if (tif_diff & _TIF_SSBD)
                         amd_set_core_ssb_state(tifn);
-               } else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
-                          static_cpu_has(X86_FEATURE_AMD_SSBD)) {
-                       msr |= ssbd_tif_to_spec_ctrl(tifn);
-                       updmsr  = true;
-               }
+       } else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
+                  static_cpu_has(X86_FEATURE_AMD_SSBD)) {
+               updmsr |= !!(tif_diff & _TIF_SSBD);
+               msr |= ssbd_tif_to_spec_ctrl(tifn);
         }
  
-       /*
-        * Only evaluate TIF_SPEC_IB if conditional STIBP is enabled,
-        * otherwise avoid the MSR write.
-        */
+       /* Only evaluate TIF_SPEC_IB if conditional STIBP is enabled. */
         if (IS_ENABLED(CONFIG_SMP) &&
             static_branch_unlikely(&switch_to_cond_stibp)) {
                 updmsr |= !!(tif_diff & _TIF_SPEC_IB);
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c

index e040ba6..0ec7ced 100644 (file)
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -197,6 +197,14 @@ static const struct dmi_system_id reboot_dmi_table[] __initconst = {
                         DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"),
                 },
         },
+       {       /* Handle problems with rebooting on Apple MacBook6,1 */
+               .callback = set_pci_reboot,
+               .ident = "Apple MacBook6,1",
+               .matches = {
+                       DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "MacBook6,1"),
+               },
+       },
         {       /* Handle problems with rebooting on Apple MacBookPro5 */
                 .callback = set_pci_reboot,
                 .ident = "Apple MacBookPro5",
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c

index 371a6b3..e42faa7 100644 (file)
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -25,10 +25,6 @@
  #include <asm/hpet.h>
  #include <asm/time.h>
  
-#ifdef CONFIG_X86_64
-__visible volatile unsigned long jiffies __cacheline_aligned_in_smp = INITIAL_JIFFIES;
-#endif
-
  unsigned long profile_pc(struct pt_regs *regs)
  {
         unsigned long pc = instruction_pointer(regs);
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S

index 1bf7e31..7c35556 100644 (file)
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -40,13 +40,13 @@ OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT)
  #ifdef CONFIG_X86_32
  OUTPUT_ARCH(i386)
  ENTRY(phys_startup_32)
-jiffies = jiffies_64;
  #else
  OUTPUT_ARCH(i386:x86-64)
  ENTRY(phys_startup_64)
-jiffies_64 = jiffies;
  #endif
  
+jiffies = jiffies_64;
+
  #if defined(CONFIG_X86_64)
  /*
   * On 64-bit, align RODATA to 2MB so we retain large page mappings for
diff --git a/block/bio-integrity.c b/block/bio-integrity.c

index 3579ac0..23632a3 100644 (file)
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -281,7 +281,6 @@ bool bio_integrity_prep(struct bio *bio)
  
                 if (ret == 0) {
                         printk(KERN_ERR "could not attach integrity payload\n");
-                       kfree(buf);
                         status = BLK_STS_RESOURCE;
                         goto err_end_io;
                 }
diff --git a/block/bio.c b/block/bio.c

index 5235da6..a7366c0 100644 (file)
--- a/block/bio.c
+++ b/block/bio.c
@@ -1434,8 +1434,7 @@ again:
         }
  
         if (bio->bi_disk && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
-               trace_block_bio_complete(bio->bi_disk->queue, bio,
-                                        blk_status_to_errno(bio->bi_status));
+               trace_block_bio_complete(bio->bi_disk->queue, bio);
                 bio_clear_flag(bio, BIO_TRACE_COMPLETION);
         }
  
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c

index 96a39d0..44f3d09 100644 (file)
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -191,6 +191,33 @@ found_tag:
         return tag + tag_offset;
  }
  
+bool __blk_mq_get_driver_tag(struct request *rq)
+{
+       struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
+       unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
+       bool shared = blk_mq_tag_busy(rq->mq_hctx);
+       int tag;
+
+       if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
+               bt = &rq->mq_hctx->tags->breserved_tags;
+               tag_offset = 0;
+       }
+
+       if (!hctx_may_queue(rq->mq_hctx, bt))
+               return false;
+       tag = __sbitmap_queue_get(bt);
+       if (tag == BLK_MQ_NO_TAG)
+               return false;
+
+       rq->tag = tag + tag_offset;
+       if (shared) {
+               rq->rq_flags |= RQF_MQ_INFLIGHT;
+               atomic_inc(&rq->mq_hctx->nr_active);
+       }
+       rq->mq_hctx->tags->rqs[rq->tag] = rq;
+       return true;
+}
+
  void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
                     unsigned int tag)
  {
@@ -269,6 +296,7 @@ struct bt_tags_iter_data {
  
  #define BT_TAG_ITER_RESERVED           (1 << 0)
  #define BT_TAG_ITER_STARTED            (1 << 1)
+#define BT_TAG_ITER_STATIC_RQS         (1 << 2)
  
  static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
  {
@@ -282,9 +310,12 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
  
         /*
          * We can hit rq == NULL here, because the tagging functions
-        * test and set the bit before assining ->rqs[].
+        * test and set the bit before assigning ->rqs[].
          */
-       rq = tags->rqs[bitnr];
+       if (iter_data->flags & BT_TAG_ITER_STATIC_RQS)
+               rq = tags->static_rqs[bitnr];
+       else
+               rq = tags->rqs[bitnr];
         if (!rq)
                 return true;
         if ((iter_data->flags & BT_TAG_ITER_STARTED) &&
@@ -339,11 +370,13 @@ static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags,
   *             indicates whether or not @rq is a reserved request. Return
   *             true to continue iterating tags, false to stop.
   * @priv:      Will be passed as second argument to @fn.
+ *
+ * Caller has to pass the tag map from which requests are allocated.
   */
  void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
                 void *priv)
  {
-       return __blk_mq_all_tag_iter(tags, fn, priv, 0);
+       return __blk_mq_all_tag_iter(tags, fn, priv, BT_TAG_ITER_STATIC_RQS);
  }
  
  /**
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h

index d38e48f..2e4ef51 100644 (file)
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -51,6 +51,14 @@ enum {
         BLK_MQ_TAG_MAX          = BLK_MQ_NO_TAG - 1,
  };
  
+bool __blk_mq_get_driver_tag(struct request *rq);
+static inline bool blk_mq_get_driver_tag(struct request *rq)
+{
+       if (rq->tag != BLK_MQ_NO_TAG)
+               return true;
+       return __blk_mq_get_driver_tag(rq);
+}
+
  extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
  extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);
  
diff --git a/block/blk-mq.c b/block/blk-mq.c

index 9a36ac1..4f57d27 100644 (file)
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1052,35 +1052,6 @@ static inline unsigned int queued_to_index(unsigned int queued)
         return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
  }
  
-bool blk_mq_get_driver_tag(struct request *rq)
-{
-       struct blk_mq_alloc_data data = {
-               .q = rq->q,
-               .hctx = rq->mq_hctx,
-               .flags = BLK_MQ_REQ_NOWAIT,
-               .cmd_flags = rq->cmd_flags,
-       };
-       bool shared;
-
-       if (rq->tag != BLK_MQ_NO_TAG)
-               return true;
-
-       if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
-               data.flags |= BLK_MQ_REQ_RESERVED;
-
-       shared = blk_mq_tag_busy(data.hctx);
-       rq->tag = blk_mq_get_tag(&data);
-       if (rq->tag >= 0) {
-               if (shared) {
-                       rq->rq_flags |= RQF_MQ_INFLIGHT;
-                       atomic_inc(&data.hctx->nr_active);
-               }
-               data.hctx->tags->rqs[rq->tag] = rq;
-       }
-
-       return rq->tag != BLK_MQ_NO_TAG;
-}
-
  static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
                                 int flags, void *key)
  {
diff --git a/block/blk-mq.h b/block/blk-mq.h

index a139b06..b3ce0f3 100644 (file)
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -44,7 +44,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool);
  void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
                                 bool kick_requeue_list);
  void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
-bool blk_mq_get_driver_tag(struct request *rq);
  struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
                                         struct blk_mq_ctx *start);
  
diff --git a/block/blk.h b/block/blk.h

index aa16e52..b5d1f0f 100644 (file)
--- a/block/blk.h
+++ b/block/blk.h
@@ -420,9 +420,11 @@ static inline sector_t part_nr_sects_read(struct hd_struct *part)
  static inline void part_nr_sects_write(struct hd_struct *part, sector_t size)
  {
  #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+       preempt_disable();
         write_seqcount_begin(&part->nr_sects_seq);
         part->nr_sects = size;
         write_seqcount_end(&part->nr_sects_seq);
+       preempt_enable();
  #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION)
         preempt_disable();
         part->nr_sects = size;
diff --git a/drivers/block/loop.c b/drivers/block/loop.c

index 2e96d8b..c33bbbf 100644 (file)
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1390,7 +1390,7 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
                 goto out_unfreeze;
  
         /* Mask out flags that can't be set using LOOP_SET_STATUS. */
-       lo->lo_flags &= ~LOOP_SET_STATUS_SETTABLE_FLAGS;
+       lo->lo_flags &= LOOP_SET_STATUS_SETTABLE_FLAGS;
         /* For those flags, use the previous values instead */
         lo->lo_flags |= prev_lo_flags & ~LOOP_SET_STATUS_SETTABLE_FLAGS;
         /* For flags that can't be cleared, use previous values too */
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c

index 0b944ac..27a33ad 100644 (file)
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -1613,7 +1613,7 @@ static noinline_for_stack int pkt_get_last_written(struct pktcdvd_device *pd,
         disc_information di;
         track_information ti;
         __u32 last_track;
-       int ret = -1;
+       int ret;
  
         ret = pkt_get_disc_info(pd, &di);
         if (ret)
diff --git a/drivers/block/umem.c b/drivers/block/umem.c

index d84e8a8..1e2aa5a 100644 (file)
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -784,7 +784,7 @@ static const struct block_device_operations mm_fops = {
  
  static int mm_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
  {
-       int ret = -ENODEV;
+       int ret;
         struct cardinfo *card = &cards[num_cards];
         unsigned char   mem_present;
         unsigned char   batt_status;
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c

index 0585efa..c2c5bc4 100644 (file)
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3669,7 +3669,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
         ns->disk = disk;
  
         if (__nvme_revalidate_disk(disk, id))
-               goto out_free_disk;
+               goto out_put_disk;
  
         if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
                 ret = nvme_nvm_register(ns, disk_name, node);
@@ -3696,8 +3696,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
         /* prevent double queue cleanup */
         ns->disk->queue = NULL;
         put_disk(ns->disk);
- out_free_disk:
-       del_gendisk(ns->disk);
   out_unlink_ns:
         mutex_lock(&ctrl->subsys->lock);
         list_del_rcu(&ns->siblings);
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c

index cb00075..e999a8c 100644 (file)
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2634,10 +2634,11 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
                 opstate = atomic_xchg(&op->state, FCPOP_STATE_COMPLETE);
                 __nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate);
  
-               if (!(op->flags & FCOP_FLAGS_AEN))
+               if (!(op->flags & FCOP_FLAGS_AEN)) {
                         nvme_fc_unmap_data(ctrl, op->rq, op);
+                       nvme_cleanup_cmd(op->rq);
+               }
  
-               nvme_cleanup_cmd(op->rq);
                 nvme_fc_ctrl_put(ctrl);
  
                 if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE &&
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h

index fa5c755..c0f4226 100644 (file)
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -599,8 +599,7 @@ static inline void nvme_trace_bio_complete(struct request *req,
         struct nvme_ns *ns = req->q->queuedata;
  
         if (req->cmd_flags & REQ_NVME_MPATH)
-               trace_block_bio_complete(ns->head->disk->queue,
-                                        req->bio, status);
+               trace_block_bio_complete(ns->head->disk->queue, req->bio);
  }
  
  extern struct device_attribute dev_attr_ana_grpid;
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c

index d690d55..e2bacd3 100644 (file)
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2950,9 +2950,15 @@ static int nvme_suspend(struct device *dev)
          * the PCI bus layer to put it into D3 in order to take the PCIe link
          * down, so as to allow the platform to achieve its minimum low-power
          * state (which may not be possible if the link is up).
+        *
+        * If a host memory buffer is enabled, shut down the device as the NVMe
+        * specification allows the device to access the host memory buffer in
+        * host DRAM from all power states, but hosts will fail access to DRAM
+        * during S3.
          */
         if (pm_suspend_via_firmware() || !ctrl->npss ||
             !pcie_aspm_enabled(pdev) ||
+           ndev->nr_host_mem_descs ||
             (ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND))
                 return nvme_disable_prepare_reset(ndev, true);
  
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c

index 1843110..3345ec7 100644 (file)
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -131,8 +131,8 @@ struct nvme_tcp_ctrl {
  static LIST_HEAD(nvme_tcp_ctrl_list);
  static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
  static struct workqueue_struct *nvme_tcp_wq;
-static struct blk_mq_ops nvme_tcp_mq_ops;
-static struct blk_mq_ops nvme_tcp_admin_mq_ops;
+static const struct blk_mq_ops nvme_tcp_mq_ops;
+static const struct blk_mq_ops nvme_tcp_admin_mq_ops;
  static int nvme_tcp_try_send(struct nvme_tcp_queue *queue);
  
  static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl)
@@ -2301,7 +2301,7 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx)
         return queue->nr_cqe;
  }
  
-static struct blk_mq_ops nvme_tcp_mq_ops = {
+static const struct blk_mq_ops nvme_tcp_mq_ops = {
         .queue_rq       = nvme_tcp_queue_rq,
         .complete       = nvme_complete_rq,
         .init_request   = nvme_tcp_init_request,
@@ -2312,7 +2312,7 @@ static struct blk_mq_ops nvme_tcp_mq_ops = {
         .poll           = nvme_tcp_poll,
  };
  
-static struct blk_mq_ops nvme_tcp_admin_mq_ops = {
+static const struct blk_mq_ops nvme_tcp_admin_mq_ops = {
         .queue_rq       = nvme_tcp_queue_rq,
         .complete       = nvme_complete_rq,
         .init_request   = nvme_tcp_init_request,
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c

index 6392bcd..6e2f623 100644 (file)
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -129,7 +129,22 @@ static u32 nvmet_async_event_result(struct nvmet_async_event *aen)
         return aen->event_type | (aen->event_info << 8) | (aen->log_page << 16);
  }
  
-static void nvmet_async_events_process(struct nvmet_ctrl *ctrl, u16 status)
+static void nvmet_async_events_failall(struct nvmet_ctrl *ctrl)
+{
+       u16 status = NVME_SC_INTERNAL | NVME_SC_DNR;
+       struct nvmet_req *req;
+
+       mutex_lock(&ctrl->lock);
+       while (ctrl->nr_async_event_cmds) {
+               req = ctrl->async_event_cmds[--ctrl->nr_async_event_cmds];
+               mutex_unlock(&ctrl->lock);
+               nvmet_req_complete(req, status);
+               mutex_lock(&ctrl->lock);
+       }
+       mutex_unlock(&ctrl->lock);
+}
+
+static void nvmet_async_events_process(struct nvmet_ctrl *ctrl)
  {
         struct nvmet_async_event *aen;
         struct nvmet_req *req;
@@ -139,15 +154,14 @@ static void nvmet_async_events_process(struct nvmet_ctrl *ctrl, u16 status)
                 aen = list_first_entry(&ctrl->async_events,
                                        struct nvmet_async_event, entry);
                 req = ctrl->async_event_cmds[--ctrl->nr_async_event_cmds];
-               if (status == 0)
-                       nvmet_set_result(req, nvmet_async_event_result(aen));
+               nvmet_set_result(req, nvmet_async_event_result(aen));
  
                 list_del(&aen->entry);
                 kfree(aen);
  
                 mutex_unlock(&ctrl->lock);
                 trace_nvmet_async_event(ctrl, req->cqe->result.u32);
-               nvmet_req_complete(req, status);
+               nvmet_req_complete(req, 0);
                 mutex_lock(&ctrl->lock);
         }
         mutex_unlock(&ctrl->lock);
@@ -170,7 +184,7 @@ static void nvmet_async_event_work(struct work_struct *work)
         struct nvmet_ctrl *ctrl =
                 container_of(work, struct nvmet_ctrl, async_event_work);
  
-       nvmet_async_events_process(ctrl, 0);
+       nvmet_async_events_process(ctrl);
  }
  
  void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
@@ -779,7 +793,6 @@ static void nvmet_confirm_sq(struct percpu_ref *ref)
  
  void nvmet_sq_destroy(struct nvmet_sq *sq)
  {
-       u16 status = NVME_SC_INTERNAL | NVME_SC_DNR;
         struct nvmet_ctrl *ctrl = sq->ctrl;
  
         /*
@@ -787,7 +800,7 @@ void nvmet_sq_destroy(struct nvmet_sq *sq)
          * queue doesn't have outstanding requests on it.
          */
         if (ctrl && ctrl->sqs && ctrl->sqs[0] == sq)
-               nvmet_async_events_process(ctrl, status);
+               nvmet_async_events_failall(ctrl);
         percpu_ref_kill_and_confirm(&sq->ref, nvmet_confirm_sq);
         wait_for_completion(&sq->confirm_done);
         wait_for_completion(&sq->free_done);
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c

index 1669177..de9217c 100644 (file)
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -153,7 +153,7 @@ static LIST_HEAD(nvmet_tcp_queue_list);
  static DEFINE_MUTEX(nvmet_tcp_queue_mutex);
  
  static struct workqueue_struct *nvmet_tcp_wq;
-static struct nvmet_fabrics_ops nvmet_tcp_ops;
+static const struct nvmet_fabrics_ops nvmet_tcp_ops;
  static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c);
  static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd);
  
@@ -1713,7 +1713,7 @@ static void nvmet_tcp_disc_port_addr(struct nvmet_req *req,
         }
  }
  
-static struct nvmet_fabrics_ops nvmet_tcp_ops = {
+static const struct nvmet_fabrics_ops nvmet_tcp_ops = {
         .owner                  = THIS_MODULE,
         .type                   = NVMF_TRTYPE_TCP,
         .msdbd                  = 1,
diff --git a/fs/afs/write.c b/fs/afs/write.c

index 97bccde..768497f 100644 (file)
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -447,6 +447,7 @@ static int afs_store_data(struct address_space *mapping,
         op->store.last = last;
         op->store.first_offset = offset;
         op->store.last_to = to;
+       op->mtime = vnode->vfs_inode.i_mtime;
         op->ops = &afs_store_data_operation;
  
  try_next_key:
diff --git a/fs/io-wq.c b/fs/io-wq.c

index a5e90ac..0b65a91 100644 (file)
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -111,6 +111,7 @@ struct io_wq {
         unsigned long state;
  
         free_work_fn *free_work;
+       io_wq_work_fn *do_work;
  
         struct task_struct *manager;
         struct user_struct *user;
@@ -523,7 +524,7 @@ get_next:
  
                         hash = io_get_work_hash(work);
                         linked = old_work = work;
-                       linked->func(&linked);
+                       wq->do_work(&linked);
                         linked = (old_work == linked) ? NULL : linked;
  
                         work = next_hashed;
@@ -780,7 +781,7 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
                 struct io_wq_work *old_work = work;
  
                 work->flags |= IO_WQ_WORK_CANCEL;
-               work->func(&work);
+               wq->do_work(&work);
                 work = (work == old_work) ? NULL : work;
                 wq->free_work(old_work);
         } while (work);
@@ -1018,7 +1019,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
         int ret = -ENOMEM, node;
         struct io_wq *wq;
  
-       if (WARN_ON_ONCE(!data->free_work))
+       if (WARN_ON_ONCE(!data->free_work || !data->do_work))
                 return ERR_PTR(-EINVAL);
  
         wq = kzalloc(sizeof(*wq), GFP_KERNEL);
@@ -1032,6 +1033,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
         }
  
         wq->free_work = data->free_work;
+       wq->do_work = data->do_work;
  
         /* caller must already hold a reference to this */
         wq->user = data->user;
@@ -1088,7 +1090,7 @@ err:
  
  bool io_wq_get(struct io_wq *wq, struct io_wq_data *data)
  {
-       if (data->free_work != wq->free_work)
+       if (data->free_work != wq->free_work || data->do_work != wq->do_work)
                 return false;
  
         return refcount_inc_not_zero(&wq->use_refs);
diff --git a/fs/io-wq.h b/fs/io-wq.h

index 5ba12de..8e138fa 100644 (file)
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -85,7 +85,6 @@ static inline void wq_list_del(struct io_wq_work_list *list,
  
  struct io_wq_work {
         struct io_wq_work_node list;
-       void (*func)(struct io_wq_work **);
         struct files_struct *files;
         struct mm_struct *mm;
         const struct cred *creds;
@@ -94,11 +93,6 @@ struct io_wq_work {
         pid_t task_pid;
  };
  
-#define INIT_IO_WORK(work, _func)                              \
-       do {                                                    \
-               *(work) = (struct io_wq_work){ .func = _func }; \
-       } while (0)                                             \
-
  static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
  {
         if (!work->list.next)
@@ -108,10 +102,12 @@ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
  }
  
  typedef void (free_work_fn)(struct io_wq_work *);
+typedef void (io_wq_work_fn)(struct io_wq_work **);
  
  struct io_wq_data {
         struct user_struct *user;
  
+       io_wq_work_fn *do_work;
         free_work_fn *free_work;
  };
  
diff --git a/fs/io_uring.c b/fs/io_uring.c

index 26f7bc9..155f3d8 100644 (file)
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -528,7 +528,6 @@ enum {
         REQ_F_INFLIGHT_BIT,
         REQ_F_CUR_POS_BIT,
         REQ_F_NOWAIT_BIT,
-       REQ_F_IOPOLL_COMPLETED_BIT,
         REQ_F_LINK_TIMEOUT_BIT,
         REQ_F_TIMEOUT_BIT,
         REQ_F_ISREG_BIT,
@@ -540,6 +539,8 @@ enum {
         REQ_F_POLLED_BIT,
         REQ_F_BUFFER_SELECTED_BIT,
         REQ_F_NO_FILE_TABLE_BIT,
+       REQ_F_QUEUE_TIMEOUT_BIT,
+       REQ_F_WORK_INITIALIZED_BIT,
  
         /* not a real bit, just to check we're not overflowing the space */
         __REQ_F_LAST_BIT,
@@ -571,8 +572,6 @@ enum {
         REQ_F_CUR_POS           = BIT(REQ_F_CUR_POS_BIT),
         /* must not punt to workers */
         REQ_F_NOWAIT            = BIT(REQ_F_NOWAIT_BIT),
-       /* polled IO has completed */
-       REQ_F_IOPOLL_COMPLETED  = BIT(REQ_F_IOPOLL_COMPLETED_BIT),
         /* has linked timeout */
         REQ_F_LINK_TIMEOUT      = BIT(REQ_F_LINK_TIMEOUT_BIT),
         /* timeout request */
@@ -595,6 +594,10 @@ enum {
         REQ_F_BUFFER_SELECTED   = BIT(REQ_F_BUFFER_SELECTED_BIT),
         /* doesn't need file table for this request */
         REQ_F_NO_FILE_TABLE     = BIT(REQ_F_NO_FILE_TABLE_BIT),
+       /* needs to queue linked timeout */
+       REQ_F_QUEUE_TIMEOUT     = BIT(REQ_F_QUEUE_TIMEOUT_BIT),
+       /* io_wq_work is initialized */
+       REQ_F_WORK_INITIALIZED  = BIT(REQ_F_WORK_INITIALIZED_BIT),
  };
  
  struct async_poll {
@@ -633,6 +636,8 @@ struct io_kiocb {
         struct io_async_ctx             *io;
         int                             cflags;
         u8                              opcode;
+       /* polled IO has completed */
+       u8                              iopoll_completed;
  
         u16                             buf_index;
  
@@ -697,6 +702,8 @@ struct io_op_def {
         unsigned                needs_mm : 1;
         /* needs req->file assigned */
         unsigned                needs_file : 1;
+       /* don't fail if file grab fails */
+       unsigned                needs_file_no_error : 1;
         /* hash wq insertion if file is a regular file */
         unsigned                hash_reg_file : 1;
         /* unbound wq insertion if file is a non-regular file */
@@ -803,6 +810,8 @@ static const struct io_op_def io_op_defs[] = {
                 .needs_fs               = 1,
         },
         [IORING_OP_CLOSE] = {
+               .needs_file             = 1,
+               .needs_file_no_error    = 1,
                 .file_table             = 1,
         },
         [IORING_OP_FILES_UPDATE] = {
@@ -903,6 +912,19 @@ EXPORT_SYMBOL(io_uring_get_socket);
  
  static void io_file_put_work(struct work_struct *work);
  
+/*
+ * Note: io_req_init_async() must be called before the first time you
+ * touch any member of io_wq_work.
+ */
+static inline void io_req_init_async(struct io_kiocb *req)
+{
+       if (req->flags & REQ_F_WORK_INITIALIZED)
+               return;
+
+       memset(&req->work, 0, sizeof(req->work));
+       req->flags |= REQ_F_WORK_INITIALIZED;
+}
+
  static inline bool io_async_submit(struct io_ring_ctx *ctx)
  {
         return ctx->flags & IORING_SETUP_SQPOLL;
@@ -1029,6 +1051,9 @@ static inline void io_req_work_grab_env(struct io_kiocb *req,
  
  static inline void io_req_work_drop_env(struct io_kiocb *req)
  {
+       if (!(req->flags & REQ_F_WORK_INITIALIZED))
+               return;
+
         if (req->work.mm) {
                 mmdrop(req->work.mm);
                 req->work.mm = NULL;
@@ -1575,16 +1600,6 @@ static void io_free_req(struct io_kiocb *req)
                 io_queue_async_work(nxt);
  }
  
-static void io_link_work_cb(struct io_wq_work **workptr)
-{
-       struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-       struct io_kiocb *link;
-
-       link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
-       io_queue_linked_timeout(link);
-       io_wq_submit_work(workptr);
-}
-
  static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
  {
         struct io_kiocb *link;
@@ -1596,7 +1611,7 @@ static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
         *workptr = &nxt->work;
         link = io_prep_linked_timeout(nxt);
         if (link)
-               nxt->work.func = io_link_work_cb;
+               nxt->flags |= REQ_F_QUEUE_TIMEOUT;
  }
  
  /*
@@ -1781,7 +1796,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
                  * If we find a request that requires polling, break out
                  * and complete those lists first, if we have entries there.
                  */
-               if (req->flags & REQ_F_IOPOLL_COMPLETED) {
+               if (READ_ONCE(req->iopoll_completed)) {
                         list_move_tail(&req->list, &done);
                         continue;
                 }
@@ -1962,7 +1977,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
                 req_set_fail_links(req);
         req->result = res;
         if (res != -EAGAIN)
-               req->flags |= REQ_F_IOPOLL_COMPLETED;
+               WRITE_ONCE(req->iopoll_completed, 1);
  }
  
  /*
@@ -1995,7 +2010,7 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
          * For fast devices, IO may have already completed. If it has, add
          * it to the front so we find it first.
          */
-       if (req->flags & REQ_F_IOPOLL_COMPLETED)
+       if (READ_ONCE(req->iopoll_completed))
                 list_add(&req->list, &ctx->poll_list);
         else
                 list_add_tail(&req->list, &ctx->poll_list);
@@ -2063,6 +2078,10 @@ static bool io_file_supports_async(struct file *file, int rw)
         if (S_ISREG(mode) && file->f_op != &io_uring_fops)
                 return true;
  
+       /* any ->read/write should understand O_NONBLOCK */
+       if (file->f_flags & O_NONBLOCK)
+               return true;
+
         if (!(file->f_mode & FMODE_NOWAIT))
                 return false;
  
@@ -2105,8 +2124,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                 kiocb->ki_ioprio = get_current_ioprio();
  
         /* don't allow async punt if RWF_NOWAIT was requested */
-       if ((kiocb->ki_flags & IOCB_NOWAIT) ||
-           (req->file->f_flags & O_NONBLOCK))
+       if (kiocb->ki_flags & IOCB_NOWAIT)
                 req->flags |= REQ_F_NOWAIT;
  
         if (force_nonblock)
@@ -2120,6 +2138,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                 kiocb->ki_flags |= IOCB_HIPRI;
                 kiocb->ki_complete = io_complete_rw_iopoll;
                 req->result = 0;
+               req->iopoll_completed = 0;
         } else {
                 if (kiocb->ki_flags & IOCB_HIPRI)
                         return -EINVAL;
@@ -2358,8 +2377,14 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
  static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
                                     bool needs_lock)
  {
-       if (req->flags & REQ_F_BUFFER_SELECTED)
+       if (req->flags & REQ_F_BUFFER_SELECTED) {
+               struct io_buffer *kbuf;
+
+               kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
+               iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
+               iov[0].iov_len = kbuf->len;
                 return 0;
+       }
         if (!req->rw.len)
                 return 0;
         else if (req->rw.len > 1)
@@ -2741,7 +2766,8 @@ copy_iov:
                         if (ret)
                                 goto out_free;
                         /* any defer here is final, must blocking retry */
-                       if (!file_can_poll(req->file))
+                       if (!(req->flags & REQ_F_NOWAIT) &&
+                           !file_can_poll(req->file))
                                 req->flags |= REQ_F_MUST_PUNT;
                         return -EAGAIN;
                 }
@@ -2761,6 +2787,8 @@ static int __io_splice_prep(struct io_kiocb *req,
  
         if (req->flags & REQ_F_NEED_CLEANUP)
                 return 0;
+       if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+               return -EINVAL;
  
         sp->file_in = NULL;
         sp->len = READ_ONCE(sqe->len);
@@ -2775,8 +2803,14 @@ static int __io_splice_prep(struct io_kiocb *req,
                 return ret;
         req->flags |= REQ_F_NEED_CLEANUP;
  
-       if (!S_ISREG(file_inode(sp->file_in)->i_mode))
+       if (!S_ISREG(file_inode(sp->file_in)->i_mode)) {
+               /*
+                * Splice operation will be punted async, and we need to
+                * modify io_wq_work.flags here, so initialize io_wq_work first.
+                */
+               io_req_init_async(req);
                 req->work.flags |= IO_WQ_WORK_UNBOUND;
+       }
  
         return 0;
  }
@@ -2885,23 +2919,15 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
         return 0;
  }
  
-static bool io_req_cancelled(struct io_kiocb *req)
-{
-       if (req->work.flags & IO_WQ_WORK_CANCEL) {
-               req_set_fail_links(req);
-               io_cqring_add_event(req, -ECANCELED);
-               io_put_req(req);
-               return true;
-       }
-
-       return false;
-}
-
-static void __io_fsync(struct io_kiocb *req)
+static int io_fsync(struct io_kiocb *req, bool force_nonblock)
  {
         loff_t end = req->sync.off + req->sync.len;
         int ret;
  
+       /* fsync always requires a blocking context */
+       if (force_nonblock)
+               return -EAGAIN;
+
         ret = vfs_fsync_range(req->file, req->sync.off,
                                 end > 0 ? end : LLONG_MAX,
                                 req->sync.flags & IORING_FSYNC_DATASYNC);
@@ -2909,58 +2935,16 @@ static void __io_fsync(struct io_kiocb *req)
                 req_set_fail_links(req);
         io_cqring_add_event(req, ret);
         io_put_req(req);
-}
-
-static void io_fsync_finish(struct io_wq_work **workptr)
-{
-       struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-
-       if (io_req_cancelled(req))
-               return;
-       __io_fsync(req);
-       io_steal_work(req, workptr);
-}
-
-static int io_fsync(struct io_kiocb *req, bool force_nonblock)
-{
-       /* fsync always requires a blocking context */
-       if (force_nonblock) {
-               req->work.func = io_fsync_finish;
-               return -EAGAIN;
-       }
-       __io_fsync(req);
         return 0;
  }
  
-static void __io_fallocate(struct io_kiocb *req)
-{
-       int ret;
-
-       current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
-       ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
-                               req->sync.len);
-       current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
-       if (ret < 0)
-               req_set_fail_links(req);
-       io_cqring_add_event(req, ret);
-       io_put_req(req);
-}
-
-static void io_fallocate_finish(struct io_wq_work **workptr)
-{
-       struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-
-       if (io_req_cancelled(req))
-               return;
-       __io_fallocate(req);
-       io_steal_work(req, workptr);
-}
-
  static int io_fallocate_prep(struct io_kiocb *req,
                              const struct io_uring_sqe *sqe)
  {
         if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
                 return -EINVAL;
+       if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+               return -EINVAL;
  
         req->sync.off = READ_ONCE(sqe->off);
         req->sync.len = READ_ONCE(sqe->addr);
@@ -2971,66 +2955,74 @@ static int io_fallocate_prep(struct io_kiocb *req,
  
  static int io_fallocate(struct io_kiocb *req, bool force_nonblock)
  {
+       int ret;
+
         /* fallocate always requiring blocking context */
-       if (force_nonblock) {
-               req->work.func = io_fallocate_finish;
+       if (force_nonblock)
                 return -EAGAIN;
-       }
  
-       __io_fallocate(req);
+       current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
+       ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
+                               req->sync.len);
+       current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
+       if (ret < 0)
+               req_set_fail_links(req);
+       io_cqring_add_event(req, ret);
+       io_put_req(req);
         return 0;
  }
  
-static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
  {
         const char __user *fname;
         int ret;
  
-       if (sqe->ioprio || sqe->buf_index)
+       if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
                 return -EINVAL;
-       if (req->flags & REQ_F_FIXED_FILE)
+       if (unlikely(sqe->ioprio || sqe->buf_index))
+               return -EINVAL;
+       if (unlikely(req->flags & REQ_F_FIXED_FILE))
                 return -EBADF;
-       if (req->flags & REQ_F_NEED_CLEANUP)
-               return 0;
  
-       req->open.dfd = READ_ONCE(sqe->fd);
-       req->open.how.mode = READ_ONCE(sqe->len);
-       fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
-       req->open.how.flags = READ_ONCE(sqe->open_flags);
-       if (force_o_largefile())
+       /* open.how should be already initialised */
+       if (!(req->open.how.flags & O_PATH) && force_o_largefile())
                 req->open.how.flags |= O_LARGEFILE;
  
+       req->open.dfd = READ_ONCE(sqe->fd);
+       fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
         req->open.filename = getname(fname);
         if (IS_ERR(req->open.filename)) {
                 ret = PTR_ERR(req->open.filename);
                 req->open.filename = NULL;
                 return ret;
         }
-
         req->open.nofile = rlimit(RLIMIT_NOFILE);
         req->flags |= REQ_F_NEED_CLEANUP;
         return 0;
  }
  
+static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+       u64 flags, mode;
+
+       if (req->flags & REQ_F_NEED_CLEANUP)
+               return 0;
+       mode = READ_ONCE(sqe->len);
+       flags = READ_ONCE(sqe->open_flags);
+       req->open.how = build_open_how(flags, mode);
+       return __io_openat_prep(req, sqe);
+}
+
  static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
  {
         struct open_how __user *how;
-       const char __user *fname;
         size_t len;
         int ret;
  
-       if (sqe->ioprio || sqe->buf_index)
-               return -EINVAL;
-       if (req->flags & REQ_F_FIXED_FILE)
-               return -EBADF;
         if (req->flags & REQ_F_NEED_CLEANUP)
                 return 0;
-
-       req->open.dfd = READ_ONCE(sqe->fd);
-       fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
         how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
         len = READ_ONCE(sqe->len);
-
         if (len < OPEN_HOW_SIZE_VER0)
                 return -EINVAL;
  
@@ -3039,19 +3031,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
         if (ret)
                 return ret;
  
-       if (!(req->open.how.flags & O_PATH) && force_o_largefile())
-               req->open.how.flags |= O_LARGEFILE;
-
-       req->open.filename = getname(fname);
-       if (IS_ERR(req->open.filename)) {
-               ret = PTR_ERR(req->open.filename);
-               req->open.filename = NULL;
-               return ret;
-       }
-
-       req->open.nofile = rlimit(RLIMIT_NOFILE);
-       req->flags |= REQ_F_NEED_CLEANUP;
-       return 0;
+       return __io_openat_prep(req, sqe);
  }
  
  static int io_openat2(struct io_kiocb *req, bool force_nonblock)
@@ -3091,7 +3071,6 @@ err:
  
  static int io_openat(struct io_kiocb *req, bool force_nonblock)
  {
-       req->open.how = build_open_how(req->open.how.flags, req->open.how.mode);
         return io_openat2(req, force_nonblock);
  }
  
@@ -3180,7 +3159,7 @@ static int io_provide_buffers_prep(struct io_kiocb *req,
         p->addr = READ_ONCE(sqe->addr);
         p->len = READ_ONCE(sqe->len);
  
-       if (!access_ok(u64_to_user_ptr(p->addr), p->len))
+       if (!access_ok(u64_to_user_ptr(p->addr), (p->len * p->nbufs)))
                 return -EFAULT;
  
         p->bgid = READ_ONCE(sqe->buf_group);
@@ -3258,6 +3237,8 @@ static int io_epoll_ctl_prep(struct io_kiocb *req,
  #if defined(CONFIG_EPOLL)
         if (sqe->ioprio || sqe->buf_index)
                 return -EINVAL;
+       if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+               return -EINVAL;
  
         req->epoll.epfd = READ_ONCE(sqe->fd);
         req->epoll.op = READ_ONCE(sqe->len);
@@ -3302,6 +3283,8 @@ static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
  #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
         if (sqe->ioprio || sqe->buf_index || sqe->off)
                 return -EINVAL;
+       if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+               return -EINVAL;
  
         req->madvise.addr = READ_ONCE(sqe->addr);
         req->madvise.len = READ_ONCE(sqe->len);
@@ -3336,6 +3319,8 @@ static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
  {
         if (sqe->ioprio || sqe->buf_index || sqe->addr)
                 return -EINVAL;
+       if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+               return -EINVAL;
  
         req->fadvise.offset = READ_ONCE(sqe->off);
         req->fadvise.len = READ_ONCE(sqe->len);
@@ -3369,6 +3354,8 @@ static int io_fadvise(struct io_kiocb *req, bool force_nonblock)
  
  static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
  {
+       if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+               return -EINVAL;
         if (sqe->ioprio || sqe->buf_index)
                 return -EINVAL;
         if (req->flags & REQ_F_FIXED_FILE)
@@ -3409,10 +3396,14 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
  {
         /*
          * If we queue this for async, it must not be cancellable. That would
-        * leave the 'file' in an undeterminate state.
+        * leave the 'file' in an indeterminate state. We also need to modify
+        * io_wq_work.flags here, so initialize io_wq_work first.
          */
+       io_req_init_async(req);
         req->work.flags |= IO_WQ_WORK_NO_CANCEL;
  
+       if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+               return -EINVAL;
         if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
             sqe->rw_flags || sqe->buf_index)
                 return -EINVAL;
@@ -3420,53 +3411,41 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
                 return -EBADF;
  
         req->close.fd = READ_ONCE(sqe->fd);
-       return 0;
-}
-
-/* only called when __close_fd_get_file() is done */
-static void __io_close_finish(struct io_kiocb *req)
-{
-       int ret;
-
-       ret = filp_close(req->close.put_file, req->work.files);
-       if (ret < 0)
-               req_set_fail_links(req);
-       io_cqring_add_event(req, ret);
-       fput(req->close.put_file);
-       io_put_req(req);
-}
-
-static void io_close_finish(struct io_wq_work **workptr)
-{
-       struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
+       if ((req->file && req->file->f_op == &io_uring_fops) ||
+           req->close.fd == req->ctx->ring_fd)
+               return -EBADF;
  
-       /* not cancellable, don't do io_req_cancelled() */
-       __io_close_finish(req);
-       io_steal_work(req, workptr);
+       req->close.put_file = NULL;
+       return 0;
  }
  
  static int io_close(struct io_kiocb *req, bool force_nonblock)
  {
+       struct io_close *close = &req->close;
         int ret;
  
-       req->close.put_file = NULL;
-       ret = __close_fd_get_file(req->close.fd, &req->close.put_file);
-       if (ret < 0)
-               return (ret == -ENOENT) ? -EBADF : ret;
+       /* might be already done during nonblock submission */
+       if (!close->put_file) {
+               ret = __close_fd_get_file(close->fd, &close->put_file);
+               if (ret < 0)
+                       return (ret == -ENOENT) ? -EBADF : ret;
+       }
  
         /* if the file has a flush method, be safe and punt to async */
-       if (req->close.put_file->f_op->flush && force_nonblock) {
+       if (close->put_file->f_op->flush && force_nonblock) {
                 /* avoid grabbing files - we don't need the files */
                 req->flags |= REQ_F_NO_FILE_TABLE | REQ_F_MUST_PUNT;
-               req->work.func = io_close_finish;
                 return -EAGAIN;
         }
  
-       /*
-        * No ->flush(), safely close from here and just punt the
-        * fput() to async context.
-        */
-       __io_close_finish(req);
+       /* No ->flush() or already async, safely close from here */
+       ret = filp_close(close->put_file, req->work.files);
+       if (ret < 0)
+               req_set_fail_links(req);
+       io_cqring_add_event(req, ret);
+       fput(close->put_file);
+       close->put_file = NULL;
+       io_put_req(req);
         return 0;
  }
  
@@ -3488,38 +3467,20 @@ static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
         return 0;
  }
  
-static void __io_sync_file_range(struct io_kiocb *req)
+static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
  {
         int ret;
  
+       /* sync_file_range always requires a blocking context */
+       if (force_nonblock)
+               return -EAGAIN;
+
         ret = sync_file_range(req->file, req->sync.off, req->sync.len,
                                 req->sync.flags);
         if (ret < 0)
                 req_set_fail_links(req);
         io_cqring_add_event(req, ret);
         io_put_req(req);
-}
-
-
-static void io_sync_file_range_finish(struct io_wq_work **workptr)
-{
-       struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-
-       if (io_req_cancelled(req))
-               return;
-       __io_sync_file_range(req);
-       io_steal_work(req, workptr);
-}
-
-static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
-{
-       /* sync_file_range always requires a blocking context */
-       if (force_nonblock) {
-               req->work.func = io_sync_file_range_finish;
-               return -EAGAIN;
-       }
-
-       __io_sync_file_range(req);
         return 0;
  }
  
@@ -3545,6 +3506,9 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
         struct io_async_ctx *io = req->io;
         int ret;
  
+       if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+               return -EINVAL;
+
         sr->msg_flags = READ_ONCE(sqe->msg_flags);
         sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
         sr->len = READ_ONCE(sqe->len);
@@ -3574,9 +3538,6 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock)
         struct socket *sock;
         int ret;
  
-       if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
-               return -EINVAL;
-
         sock = sock_from_file(req->file, &ret);
         if (sock) {
                 struct io_async_ctx io;
@@ -3630,9 +3591,6 @@ static int io_send(struct io_kiocb *req, bool force_nonblock)
         struct socket *sock;
         int ret;
  
-       if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
-               return -EINVAL;
-
         sock = sock_from_file(req->file, &ret);
         if (sock) {
                 struct io_sr_msg *sr = &req->sr_msg;
@@ -3785,6 +3743,9 @@ static int io_recvmsg_prep(struct io_kiocb *req,
         struct io_async_ctx *io = req->io;
         int ret;
  
+       if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+               return -EINVAL;
+
         sr->msg_flags = READ_ONCE(sqe->msg_flags);
         sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
         sr->len = READ_ONCE(sqe->len);
@@ -3813,9 +3774,6 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock)
         struct socket *sock;
         int ret, cflags = 0;
  
-       if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
-               return -EINVAL;
-
         sock = sock_from_file(req->file, &ret);
         if (sock) {
                 struct io_buffer *kbuf;
@@ -3877,9 +3835,6 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock)
         struct socket *sock;
         int ret, cflags = 0;
  
-       if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
-               return -EINVAL;
-
         sock = sock_from_file(req->file, &ret);
         if (sock) {
                 struct io_sr_msg *sr = &req->sr_msg;
@@ -3947,49 +3902,30 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
         return 0;
  }
  
-static int __io_accept(struct io_kiocb *req, bool force_nonblock)
+static int io_accept(struct io_kiocb *req, bool force_nonblock)
  {
         struct io_accept *accept = &req->accept;
-       unsigned file_flags;
+       unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
         int ret;
  
-       file_flags = force_nonblock ? O_NONBLOCK : 0;
+       if (req->file->f_flags & O_NONBLOCK)
+               req->flags |= REQ_F_NOWAIT;
+
         ret = __sys_accept4_file(req->file, file_flags, accept->addr,
                                         accept->addr_len, accept->flags,
                                         accept->nofile);
         if (ret == -EAGAIN && force_nonblock)
                 return -EAGAIN;
-       if (ret == -ERESTARTSYS)
-               ret = -EINTR;
-       if (ret < 0)
+       if (ret < 0) {
+               if (ret == -ERESTARTSYS)
+                       ret = -EINTR;
                 req_set_fail_links(req);
+       }
         io_cqring_add_event(req, ret);
         io_put_req(req);
         return 0;
  }
  
-static void io_accept_finish(struct io_wq_work **workptr)
-{
-       struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-
-       if (io_req_cancelled(req))
-               return;
-       __io_accept(req, false);
-       io_steal_work(req, workptr);
-}
-
-static int io_accept(struct io_kiocb *req, bool force_nonblock)
-{
-       int ret;
-
-       ret = __io_accept(req, force_nonblock);
-       if (ret == -EAGAIN && force_nonblock) {
-               req->work.func = io_accept_finish;
-               return -EAGAIN;
-       }
-       return 0;
-}
-
  static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
  {
         struct io_connect *conn = &req->connect;
@@ -4328,7 +4264,8 @@ static void io_async_task_func(struct callback_head *cb)
         spin_unlock_irq(&ctx->completion_lock);
  
         /* restore ->work in case we need to retry again */
-       memcpy(&req->work, &apoll->work, sizeof(req->work));
+       if (req->flags & REQ_F_WORK_INITIALIZED)
+               memcpy(&req->work, &apoll->work, sizeof(req->work));
         kfree(apoll);
  
         if (!canceled) {
@@ -4425,7 +4362,8 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
                 return false;
  
         req->flags |= REQ_F_POLLED;
-       memcpy(&apoll->work, &req->work, sizeof(req->work));
+       if (req->flags & REQ_F_WORK_INITIALIZED)
+               memcpy(&apoll->work, &req->work, sizeof(req->work));
         had_io = req->io != NULL;
  
         get_task_struct(current);
@@ -4450,7 +4388,8 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
                 if (!had_io)
                         io_poll_remove_double(req);
                 spin_unlock_irq(&ctx->completion_lock);
-               memcpy(&req->work, &apoll->work, sizeof(req->work));
+               if (req->flags & REQ_F_WORK_INITIALIZED)
+                       memcpy(&req->work, &apoll->work, sizeof(req->work));
                 kfree(apoll);
                 return false;
         }
@@ -4495,7 +4434,9 @@ static bool io_poll_remove_one(struct io_kiocb *req)
                          * io_req_work_drop_env below when dropping the
                          * final reference.
                          */
-                       memcpy(&req->work, &apoll->work, sizeof(req->work));
+                       if (req->flags & REQ_F_WORK_INITIALIZED)
+                               memcpy(&req->work, &apoll->work,
+                                      sizeof(req->work));
                         kfree(apoll);
                 }
         }
@@ -4944,6 +4885,8 @@ static int io_req_defer_prep(struct io_kiocb *req,
         if (!sqe)
                 return 0;
  
+       io_req_init_async(req);
+
         if (io_op_defs[req->opcode].file_table) {
                 ret = io_grab_files(req);
                 if (unlikely(ret))
@@ -5381,12 +5324,26 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
         return 0;
  }
  
+static void io_arm_async_linked_timeout(struct io_kiocb *req)
+{
+       struct io_kiocb *link;
+
+       /* link head's timeout is queued in io_queue_async_work() */
+       if (!(req->flags & REQ_F_QUEUE_TIMEOUT))
+               return;
+
+       link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
+       io_queue_linked_timeout(link);
+}
+
  static void io_wq_submit_work(struct io_wq_work **workptr)
  {
         struct io_wq_work *work = *workptr;
         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
         int ret = 0;
  
+       io_arm_async_linked_timeout(req);
+
         /* if NO_CANCEL is set, we must still run the work */
         if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) ==
                                 IO_WQ_WORK_CANCEL) {
@@ -5437,19 +5394,20 @@ static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
                         return -EBADF;
                 fd = array_index_nospec(fd, ctx->nr_user_files);
                 file = io_file_from_index(ctx, fd);
-               if (!file)
-                       return -EBADF;
-               req->fixed_file_refs = ctx->file_data->cur_refs;
-               percpu_ref_get(req->fixed_file_refs);
+               if (file) {
+                       req->fixed_file_refs = ctx->file_data->cur_refs;
+                       percpu_ref_get(req->fixed_file_refs);
+               }
         } else {
                 trace_io_uring_file_get(ctx, fd);
                 file = __io_file_get(state, fd);
-               if (unlikely(!file))
-                       return -EBADF;
         }
  
-       *out_file = file;
-       return 0;
+       if (file || io_op_defs[req->opcode].needs_file_no_error) {
+               *out_file = file;
+               return 0;
+       }
+       return -EBADF;
  }
  
  static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
@@ -5583,7 +5541,8 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
  again:
         linked_timeout = io_prep_linked_timeout(req);
  
-       if (req->work.creds && req->work.creds != current_cred()) {
+       if ((req->flags & REQ_F_WORK_INITIALIZED) && req->work.creds &&
+           req->work.creds != current_cred()) {
                 if (old_creds)
                         revert_creds(old_creds);
                 if (old_creds == req->work.creds)
@@ -5606,6 +5565,8 @@ again:
                         goto exit;
                 }
  punt:
+               io_req_init_async(req);
+
                 if (io_op_defs[req->opcode].file_table) {
                         ret = io_grab_files(req);
                         if (ret)
@@ -5858,7 +5819,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
         refcount_set(&req->refs, 2);
         req->task = NULL;
         req->result = 0;
-       INIT_IO_WORK(&req->work, io_wq_submit_work);
  
         if (unlikely(req->opcode >= IORING_OP_LAST))
                 return -EINVAL;
@@ -5880,6 +5840,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
  
         id = READ_ONCE(sqe->personality);
         if (id) {
+               io_req_init_async(req);
                 req->work.creds = idr_find(&ctx->personality_idr, id);
                 if (unlikely(!req->work.creds))
                         return -EINVAL;
@@ -6874,6 +6835,7 @@ static int io_init_wq_offload(struct io_ring_ctx *ctx,
  
         data.user = ctx->user;
         data.free_work = io_free_work;
+       data.do_work = io_wq_submit_work;
  
         if (!(p->flags & IORING_SETUP_ATTACH_WQ)) {
                 /* Do QD, or 4 * CPUS, whatever is smallest */
@@ -7155,8 +7117,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
  
                 ret = 0;
                 if (!pages || nr_pages > got_pages) {
-                       kfree(vmas);
-                       kfree(pages);
+                       kvfree(vmas);
+                       kvfree(pages);
                         pages = kvmalloc_array(nr_pages, sizeof(struct page *),
                                                 GFP_KERNEL);
                         vmas = kvmalloc_array(nr_pages,
diff --git a/include/trace/events/block.h b/include/trace/events/block.h

index 81b43f5..1257f26 100644 (file)
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -261,9 +261,9 @@ TRACE_EVENT(block_bio_bounce,
   */
  TRACE_EVENT(block_bio_complete,
  
-       TP_PROTO(struct request_queue *q, struct bio *bio, int error),
+       TP_PROTO(struct request_queue *q, struct bio *bio),
  
-       TP_ARGS(q, bio, error),
+       TP_ARGS(q, bio),
  
         TP_STRUCT__entry(
                 __field( dev_t,         dev             )
@@ -277,7 +277,7 @@ TRACE_EVENT(block_bio_complete,
                 __entry->dev            = bio_dev(bio);
                 __entry->sector         = bio->bi_iter.bi_sector;
                 __entry->nr_sector      = bio_sectors(bio);
-               __entry->error          = error;
+               __entry->error          = blk_status_to_errno(bio->bi_status);
                 blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
         ),
  
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c

index 7cb09c4..02441ea 100644 (file)
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -928,14 +928,12 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
  
         clocksource_arch_init(cs);
  
-#ifdef CONFIG_GENERIC_VDSO_CLOCK_MODE
         if (cs->vdso_clock_mode < 0 ||
             cs->vdso_clock_mode >= VDSO_CLOCKMODE_MAX) {
                 pr_warn("clocksource %s registered with invalid VDSO mode %d. Disabling VDSO support.\n",
                         cs->name, cs->vdso_clock_mode);
                 cs->vdso_clock_mode = VDSO_CLOCKMODE_NONE;
         }
-#endif
  
         /* Initialize mult/shift and max_idle_ns */
         __clocksource_update_freq_scale(cs, scale, freq);
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c

index ea47f20..5773f0b 100644 (file)
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -885,10 +885,10 @@ static void blk_add_trace_bio_bounce(void *ignore,
  }
  
  static void blk_add_trace_bio_complete(void *ignore,
-                                      struct request_queue *q, struct bio *bio,
-                                      int error)
+                                      struct request_queue *q, struct bio *bio)
  {
-       blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
+       blk_add_trace_bio(q, bio, BLK_TA_COMPLETE,
+                         blk_status_to_errno(bio->bi_status));
  }
  
  static void blk_add_trace_bio_backmerge(void *ignore,
@@ -995,8 +995,10 @@ static void blk_add_trace_split(void *ignore,
  
                 __blk_add_trace(bt, bio->bi_iter.bi_sector,
                                 bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf,
-                               BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu),
-                               &rpdu, blk_trace_bio_get_cgid(q, bio));
+                               BLK_TA_SPLIT,
+                               blk_status_to_errno(bio->bi_status),
+                               sizeof(rpdu), &rpdu,
+                               blk_trace_bio_get_cgid(q, bio));
         }
         rcu_read_unlock();
  }
@@ -1033,7 +1035,8 @@ static void blk_add_trace_bio_remap(void *ignore,
         r.sector_from = cpu_to_be64(from);
  
         __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
-                       bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status,
+                       bio_op(bio), bio->bi_opf, BLK_TA_REMAP,
+                       blk_status_to_errno(bio->bi_status),
                         sizeof(r), &r, blk_trace_bio_get_cgid(q, bio));
         rcu_read_unlock();
  }
@@ -1253,21 +1256,10 @@ static inline __u16 t_error(const struct trace_entry *ent)
  
  static __u64 get_pdu_int(const struct trace_entry *ent, bool has_cg)
  {
-       const __u64 *val = pdu_start(ent, has_cg);
+       const __be64 *val = pdu_start(ent, has_cg);
         return be64_to_cpu(*val);
  }
  
-static void get_pdu_remap(const struct trace_entry *ent,
-                         struct blk_io_trace_remap *r, bool has_cg)
-{
-       const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg);
-       __u64 sector_from = __r->sector_from;
-
-       r->device_from = be32_to_cpu(__r->device_from);
-       r->device_to   = be32_to_cpu(__r->device_to);
-       r->sector_from = be64_to_cpu(sector_from);
-}
-
  typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act,
         bool has_cg);
  
@@ -1407,13 +1399,13 @@ static void blk_log_with_error(struct trace_seq *s,
  
  static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
  {
-       struct blk_io_trace_remap r = { .device_from = 0, };
+       const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg);
  
-       get_pdu_remap(ent, &r, has_cg);
         trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
                          t_sector(ent), t_sec(ent),
-                        MAJOR(r.device_from), MINOR(r.device_from),
-                        (unsigned long long)r.sector_from);
+                        MAJOR(be32_to_cpu(__r->device_from)),
+                        MINOR(be32_to_cpu(__r->device_from)),
+                        be64_to_cpu(__r->sector_from));
  }
  
  static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c

index a2909af..bcc9a98 100644 (file)
--- a/lib/vdso/gettimeofday.c
+++ b/lib/vdso/gettimeofday.c
@@ -38,6 +38,13 @@ static inline bool vdso_clocksource_ok(const struct vdso_data *vd)
  }
  #endif
  
+#ifndef vdso_cycles_ok
+static inline bool vdso_cycles_ok(u64 cycles)
+{
+       return true;
+}
+#endif
+
  #ifdef CONFIG_TIME_NS
  static int do_hres_timens(const struct vdso_data *vdns, clockid_t clk,
                           struct __kernel_timespec *ts)
@@ -62,6 +69,8 @@ static int do_hres_timens(const struct vdso_data *vdns, clockid_t clk,
                         return -1;
  
                 cycles = __arch_get_hw_counter(vd->clock_mode);
+               if (unlikely(!vdso_cycles_ok(cycles)))
+                       return -1;
                 ns = vdso_ts->nsec;
                 last = vd->cycle_last;
                 ns += vdso_calc_delta(cycles, last, vd->mask, vd->mult);
@@ -130,6 +139,8 @@ static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk,
                         return -1;
  
                 cycles = __arch_get_hw_counter(vd->clock_mode);
+               if (unlikely(!vdso_cycles_ok(cycles)))
+                       return -1;
                 ns = vdso_ts->nsec;
                 last = vd->cycle_last;
                 ns += vdso_calc_delta(cycles, last, vd->mask, vd->mult);
@@ -210,7 +221,7 @@ static __always_inline int do_coarse(const struct vdso_data *vd, clockid_t clk,
         return 0;
  }
  
-static __maybe_unused int
+static __always_inline int
  __cvdso_clock_gettime_common(const struct vdso_data *vd, clockid_t clock,
                              struct __kernel_timespec *ts)
  {
author	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 12 Jun 2020 01:18:50 +0000 (18:18 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 12 Jun 2020 01:18:50 +0000 (18:18 -0700)
arch/x86/include/asm/intel-family.h		patch \| blob \| history
arch/x86/include/asm/vdso/gettimeofday.h		patch \| blob \| history
arch/x86/kernel/apic/apic.c		patch \| blob \| history
arch/x86/kernel/cpu/bugs.c		patch \| blob \| history
arch/x86/kernel/cpu/intel.c		patch \| blob \| history
arch/x86/kernel/process.c		patch \| blob \| history
arch/x86/kernel/reboot.c		patch \| blob \| history
arch/x86/kernel/time.c		patch \| blob \| history
arch/x86/kernel/vmlinux.lds.S		patch \| blob \| history
block/bio-integrity.c		patch \| blob \| history
block/bio.c		patch \| blob \| history
block/blk-mq-tag.c		patch \| blob \| history
block/blk-mq-tag.h		patch \| blob \| history
block/blk-mq.c		patch \| blob \| history
block/blk-mq.h		patch \| blob \| history
block/blk.h		patch \| blob \| history
drivers/block/loop.c		patch \| blob \| history
drivers/block/pktcdvd.c		patch \| blob \| history
drivers/block/umem.c		patch \| blob \| history
drivers/nvme/host/core.c		patch \| blob \| history
drivers/nvme/host/fc.c		patch \| blob \| history
drivers/nvme/host/nvme.h		patch \| blob \| history
drivers/nvme/host/pci.c		patch \| blob \| history
drivers/nvme/host/tcp.c		patch \| blob \| history
drivers/nvme/target/core.c		patch \| blob \| history
drivers/nvme/target/tcp.c		patch \| blob \| history
fs/afs/write.c		patch \| blob \| history
fs/io-wq.c		patch \| blob \| history
fs/io-wq.h		patch \| blob \| history
fs/io_uring.c		patch \| blob \| history
include/trace/events/block.h		patch \| blob \| history
kernel/time/clocksource.c		patch \| blob \| history
kernel/trace/blktrace.c		patch \| blob \| history
lib/vdso/gettimeofday.c		patch \| blob \| history