Merge tag 'x86_urgent_for_v5.13_rc1' of git://git.kernel.org/pub/scm/linux/kernel...

author Linus Torvalds <torvalds@linux-foundation.org>

Sun, 9 May 2021 19:52:25 +0000 (12:52 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Sun, 9 May 2021 19:52:25 +0000 (12:52 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Sun, 9 May 2021 19:52:25 +0000 (12:52 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sun, 9 May 2021 19:52:25 +0000 (12:52 -0700)
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h

index 297fa12..84b8753 100644 (file)
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -7,18 +7,9 @@
  
  /*
   * Despite that some emulators terminate on UD2, we use it for WARN().
- *
- * Since various instruction decoders/specs disagree on the encoding of
- * UD0/UD1.
   */
-
-#define ASM_UD0                ".byte 0x0f, 0xff" /* + ModRM (for Intel) */
-#define ASM_UD1                ".byte 0x0f, 0xb9" /* + ModRM */
  #define ASM_UD2                ".byte 0x0f, 0x0b"
-
-#define INSN_UD0       0xff0f
  #define INSN_UD2       0x0b0f
-
  #define LEN_UD2                2
  
  #ifdef CONFIG_GENERIC_BUG
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h

index e35e342..73d45b0 100644 (file)
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -588,6 +588,21 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_MC,  xenpv_exc_machine_check);
  #endif
  
  /* NMI */
+
+#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL)
+/*
+ * Special NOIST entry point for VMX which invokes this on the kernel
+ * stack. asm_exc_nmi() requires an IST to work correctly vs. the NMI
+ * 'executing' marker.
+ *
+ * On 32bit this just uses the regular NMI entry point because 32-bit does
+ * not have ISTs.
+ */
+DECLARE_IDTENTRY(X86_TRAP_NMI,         exc_nmi_noist);
+#else
+#define asm_exc_nmi_noist              asm_exc_nmi
+#endif
+
  DECLARE_IDTENTRY_NMI(X86_TRAP_NMI,     exc_nmi);
  #ifdef CONFIG_XEN_PV
  DECLARE_IDTENTRY_RAW(X86_TRAP_NMI,     xenpv_exc_nmi);
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h

index e16cccd..a3f87f1 100644 (file)
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -324,10 +324,6 @@ static inline int wrmsrl_safe(u32 msr, u64 val)
         return wrmsr_safe(msr, (u32)val,  (u32)(val >> 32));
  }
  
-#define write_tsc(low, high) wrmsr(MSR_IA32_TSC, (low), (high))
-
-#define write_rdtscp_aux(val) wrmsr(MSR_TSC_AUX, (val), 0)
-
  struct msr *msrs_alloc(void);
  void msrs_free(struct msr *msrs);
  int msr_set_bit(u32 msr, u8 bit);
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h

index 939b1cf..ca840fe 100644 (file)
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -56,6 +56,39 @@ static inline void clear_page(void *page)
  
  void copy_page(void *to, void *from);
  
+#ifdef CONFIG_X86_5LEVEL
+/*
+ * User space process size.  This is the first address outside the user range.
+ * There are a few constraints that determine this:
+ *
+ * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
+ * address, then that syscall will enter the kernel with a
+ * non-canonical return address, and SYSRET will explode dangerously.
+ * We avoid this particular problem by preventing anything
+ * from being mapped at the maximum canonical address.
+ *
+ * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
+ * CPUs malfunction if they execute code from the highest canonical page.
+ * They'll speculate right off the end of the canonical space, and
+ * bad things happen.  This is worked around in the same way as the
+ * Intel problem.
+ *
+ * With page table isolation enabled, we map the LDT in ... [stay tuned]
+ */
+static inline unsigned long task_size_max(void)
+{
+       unsigned long ret;
+
+       alternative_io("movq %[small],%0","movq %[large],%0",
+                       X86_FEATURE_LA57,
+                       "=r" (ret),
+                       [small] "i" ((1ul << 47)-PAGE_SIZE),
+                       [large] "i" ((1ul << 56)-PAGE_SIZE));
+
+       return ret;
+}
+#endif /* CONFIG_X86_5LEVEL */
+
  #endif /* !__ASSEMBLY__ */
  
  #ifdef CONFIG_X86_VSYSCALL_EMULATION
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h

index 64297ea..a8d4ad8 100644 (file)
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -55,30 +55,13 @@
  
  #ifdef CONFIG_X86_5LEVEL
  #define __VIRTUAL_MASK_SHIFT   (pgtable_l5_enabled() ? 56 : 47)
+/* See task_size_max() in <asm/page_64.h> */
  #else
  #define __VIRTUAL_MASK_SHIFT   47
+#define task_size_max()                ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
  #endif
  
-/*
- * User space process size.  This is the first address outside the user range.
- * There are a few constraints that determine this:
- *
- * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
- * address, then that syscall will enter the kernel with a
- * non-canonical return address, and SYSRET will explode dangerously.
- * We avoid this particular problem by preventing anything
- * from being mapped at the maximum canonical address.
- *
- * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
- * CPUs malfunction if they execute code from the highest canonical page.
- * They'll speculate right off the end of the canonical space, and
- * bad things happen.  This is worked around in the same way as the
- * Intel problem.
- *
- * With page table isolation enabled, we map the LDT in ... [stay tuned]
- */
-#define TASK_SIZE_MAX  ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
-
+#define TASK_SIZE_MAX          task_size_max()
  #define DEFAULT_MAP_WINDOW     ((1UL << 47) - PAGE_SIZE)
  
  /* This decides where the kernel will search for a free chunk of vm
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c

index 6bdb69a..a1b756c 100644 (file)
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1851,8 +1851,8 @@ static inline void setup_getcpu(int cpu)
         unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu));
         struct desc_struct d = { };
  
-       if (boot_cpu_has(X86_FEATURE_RDTSCP))
-               write_rdtscp_aux(cpudata);
+       if (boot_cpu_has(X86_FEATURE_RDTSCP) || boot_cpu_has(X86_FEATURE_RDPID))
+               wrmsr(MSR_TSC_AUX, cpudata, 0);
  
         /* Store CPU and node number in limit. */
         d.limit0 = cpudata;
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c

index dbeaa84..f07c10b 100644 (file)
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -84,7 +84,7 @@ unsigned int resctrl_cqm_threshold;
  static const struct mbm_correction_factor_table {
         u32 rmidthreshold;
         u64 cf;
-} mbm_cf_table[] __initdata = {
+} mbm_cf_table[] __initconst = {
         {7,     CF(1.000000)},
         {15,    CF(1.000000)},
         {15,    CF(0.969650)},
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c

index bf250a3..2ef961c 100644 (file)
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -524,6 +524,16 @@ nmi_restart:
                 mds_user_clear_cpu_buffers();
  }
  
+#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL)
+DEFINE_IDTENTRY_RAW(exc_nmi_noist)
+{
+       exc_nmi(regs);
+}
+#endif
+#if IS_MODULE(CONFIG_KVM_INTEL)
+EXPORT_SYMBOL_GPL(asm_exc_nmi_noist);
+#endif
+
  void stop_nmi(void)
  {
         ignore_nmis++;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c

index 7ffb0cf..0ad5214 100644 (file)
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1865,9 +1865,6 @@ static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
         return true;
  }
  
-#include <asm/cpu_device_id.h>
-#include <asm/intel-family.h>
-
  #define X86_MATCH(model)                                       \
         X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6,            \
                 INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c

index 9790c73..b649f92 100644 (file)
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3710,25 +3710,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
         struct vcpu_svm *svm = to_svm(vcpu);
         unsigned long vmcb_pa = svm->current_vmcb->pa;
  
-       /*
-        * VMENTER enables interrupts (host state), but the kernel state is
-        * interrupts disabled when this is invoked. Also tell RCU about
-        * it. This is the same logic as for exit_to_user_mode().
-        *
-        * This ensures that e.g. latency analysis on the host observes
-        * guest mode as interrupt enabled.
-        *
-        * guest_enter_irqoff() informs context tracking about the
-        * transition to guest mode and if enabled adjusts RCU state
-        * accordingly.
-        */
-       instrumentation_begin();
-       trace_hardirqs_on_prepare();
-       lockdep_hardirqs_on_prepare(CALLER_ADDR0);
-       instrumentation_end();
-
-       guest_enter_irqoff();
-       lockdep_hardirqs_on(CALLER_ADDR0);
+       kvm_guest_enter_irqoff();
  
         if (sev_es_guest(vcpu->kvm)) {
                 __svm_sev_es_vcpu_run(vmcb_pa);
@@ -3748,24 +3730,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
                 vmload(__sme_page_pa(sd->save_area));
         }
  
-       /*
-        * VMEXIT disables interrupts (host state), but tracing and lockdep
-        * have them in state 'on' as recorded before entering guest mode.
-        * Same as enter_from_user_mode().
-        *
-        * guest_exit_irqoff() restores host context and reinstates RCU if
-        * enabled and required.
-        *
-        * This needs to be done before the below as native_read_msr()
-        * contains a tracepoint and x86_spec_ctrl_restore_host() calls
-        * into world and some more.
-        */
-       lockdep_hardirqs_off(CALLER_ADDR0);
-       guest_exit_irqoff();
-
-       instrumentation_begin();
-       trace_hardirqs_off_finish();
-       instrumentation_end();
+       kvm_guest_exit_irqoff();
  }
  
  static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c

index cbe0cda..d000cdd 100644 (file)
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -36,6 +36,7 @@
  #include <asm/debugreg.h>
  #include <asm/desc.h>
  #include <asm/fpu/internal.h>
+#include <asm/idtentry.h>
  #include <asm/io.h>
  #include <asm/irq_remapping.h>
  #include <asm/kexec.h>
@@ -6415,18 +6416,17 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
  
  void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
  
-static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, u32 intr_info)
+static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu,
+                                       unsigned long entry)
  {
-       unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
-       gate_desc *desc = (gate_desc *)host_idt_base + vector;
-
         kvm_before_interrupt(vcpu);
-       vmx_do_interrupt_nmi_irqoff(gate_offset(desc));
+       vmx_do_interrupt_nmi_irqoff(entry);
         kvm_after_interrupt(vcpu);
  }
  
  static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
  {
+       const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist;
         u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
  
         /* if exit due to PF check for async PF */
@@ -6437,18 +6437,20 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
                 kvm_machine_check();
         /* We need to handle NMIs before interrupts are enabled */
         else if (is_nmi(intr_info))
-               handle_interrupt_nmi_irqoff(&vmx->vcpu, intr_info);
+               handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry);
  }
  
  static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
  {
         u32 intr_info = vmx_get_intr_info(vcpu);
+       unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
+       gate_desc *desc = (gate_desc *)host_idt_base + vector;
  
         if (WARN_ONCE(!is_external_intr(intr_info),
             "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
                 return;
  
-       handle_interrupt_nmi_irqoff(vcpu, intr_info);
+       handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
  }
  
  static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
@@ -6662,25 +6664,7 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
  static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
                                         struct vcpu_vmx *vmx)
  {
-       /*
-        * VMENTER enables interrupts (host state), but the kernel state is
-        * interrupts disabled when this is invoked. Also tell RCU about
-        * it. This is the same logic as for exit_to_user_mode().
-        *
-        * This ensures that e.g. latency analysis on the host observes
-        * guest mode as interrupt enabled.
-        *
-        * guest_enter_irqoff() informs context tracking about the
-        * transition to guest mode and if enabled adjusts RCU state
-        * accordingly.
-        */
-       instrumentation_begin();
-       trace_hardirqs_on_prepare();
-       lockdep_hardirqs_on_prepare(CALLER_ADDR0);
-       instrumentation_end();
-
-       guest_enter_irqoff();
-       lockdep_hardirqs_on(CALLER_ADDR0);
+       kvm_guest_enter_irqoff();
  
         /* L1D Flush includes CPU buffer clear to mitigate MDS */
         if (static_branch_unlikely(&vmx_l1d_should_flush))
@@ -6696,24 +6680,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
  
         vcpu->arch.cr2 = native_read_cr2();
  
-       /*
-        * VMEXIT disables interrupts (host state), but tracing and lockdep
-        * have them in state 'on' as recorded before entering guest mode.
-        * Same as enter_from_user_mode().
-        *
-        * guest_exit_irqoff() restores host context and reinstates RCU if
-        * enabled and required.
-        *
-        * This needs to be done before the below as native_read_msr()
-        * contains a tracepoint and x86_spec_ctrl_restore_host() calls
-        * into world and some more.
-        */
-       lockdep_hardirqs_off(CALLER_ADDR0);
-       guest_exit_irqoff();
-
-       instrumentation_begin();
-       trace_hardirqs_off_finish();
-       instrumentation_end();
+       kvm_guest_exit_irqoff();
  }
  
  static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c

index cebdaa1..6eda283 100644 (file)
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9315,6 +9315,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
         local_irq_disable();
         kvm_after_interrupt(vcpu);
  
+       /*
+        * Wait until after servicing IRQs to account guest time so that any
+        * ticks that occurred while running the guest are properly accounted
+        * to the guest.  Waiting until IRQs are enabled degrades the accuracy
+        * of accounting via context tracking, but the loss of accuracy is
+        * acceptable for all known use cases.
+        */
+       vtime_account_guest_exit();
+
         if (lapic_in_kernel(vcpu)) {
                 s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
                 if (delta != S64_MIN) {
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h

index 8ddd381..521f74e 100644 (file)
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -8,6 +8,51 @@
  #include "kvm_cache_regs.h"
  #include "kvm_emulate.h"
  
+static __always_inline void kvm_guest_enter_irqoff(void)
+{
+       /*
+        * VMENTER enables interrupts (host state), but the kernel state is
+        * interrupts disabled when this is invoked. Also tell RCU about
+        * it. This is the same logic as for exit_to_user_mode().
+        *
+        * This ensures that e.g. latency analysis on the host observes
+        * guest mode as interrupt enabled.
+        *
+        * guest_enter_irqoff() informs context tracking about the
+        * transition to guest mode and if enabled adjusts RCU state
+        * accordingly.
+        */
+       instrumentation_begin();
+       trace_hardirqs_on_prepare();
+       lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+       instrumentation_end();
+
+       guest_enter_irqoff();
+       lockdep_hardirqs_on(CALLER_ADDR0);
+}
+
+static __always_inline void kvm_guest_exit_irqoff(void)
+{
+       /*
+        * VMEXIT disables interrupts (host state), but tracing and lockdep
+        * have them in state 'on' as recorded before entering guest mode.
+        * Same as enter_from_user_mode().
+        *
+        * context_tracking_guest_exit() restores host context and reinstates
+        * RCU if enabled and required.
+        *
+        * This needs to be done immediately after VM-Exit, before any code
+        * that might contain tracepoints or call out to the greater world,
+        * e.g. before x86_spec_ctrl_restore_host().
+        */
+       lockdep_hardirqs_off(CALLER_ADDR0);
+       context_tracking_guest_exit();
+
+       instrumentation_begin();
+       trace_hardirqs_off_finish();
+       instrumentation_end();
+}
+
  #define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check)                \
  ({                                                                     \
         bool failed = (consistency_check);                              \
diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h

index bceb064..4d7fced 100644 (file)
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -71,6 +71,19 @@ static inline void exception_exit(enum ctx_state prev_ctx)
         }
  }
  
+static __always_inline bool context_tracking_guest_enter(void)
+{
+       if (context_tracking_enabled())
+               __context_tracking_enter(CONTEXT_GUEST);
+
+       return context_tracking_enabled_this_cpu();
+}
+
+static __always_inline void context_tracking_guest_exit(void)
+{
+       if (context_tracking_enabled())
+               __context_tracking_exit(CONTEXT_GUEST);
+}
  
  /**
   * ct_state() - return the current context tracking state if known
@@ -92,6 +105,9 @@ static inline void user_exit_irqoff(void) { }
  static inline enum ctx_state exception_enter(void) { return 0; }
  static inline void exception_exit(enum ctx_state prev_ctx) { }
  static inline enum ctx_state ct_state(void) { return CONTEXT_DISABLED; }
+static inline bool context_tracking_guest_enter(void) { return false; }
+static inline void context_tracking_guest_exit(void) { }
+
  #endif /* !CONFIG_CONTEXT_TRACKING */
  
  #define CT_WARN_ON(cond) WARN_ON(context_tracking_enabled() && (cond))
@@ -102,80 +118,4 @@ extern void context_tracking_init(void);
  static inline void context_tracking_init(void) { }
  #endif /* CONFIG_CONTEXT_TRACKING_FORCE */
  
-
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-/* must be called with irqs disabled */
-static __always_inline void guest_enter_irqoff(void)
-{
-       instrumentation_begin();
-       if (vtime_accounting_enabled_this_cpu())
-               vtime_guest_enter(current);
-       else
-               current->flags |= PF_VCPU;
-       instrumentation_end();
-
-       if (context_tracking_enabled())
-               __context_tracking_enter(CONTEXT_GUEST);
-
-       /* KVM does not hold any references to rcu protected data when it
-        * switches CPU into a guest mode. In fact switching to a guest mode
-        * is very similar to exiting to userspace from rcu point of view. In
-        * addition CPU may stay in a guest mode for quite a long time (up to
-        * one time slice). Lets treat guest mode as quiescent state, just like
-        * we do with user-mode execution.
-        */
-       if (!context_tracking_enabled_this_cpu()) {
-               instrumentation_begin();
-               rcu_virt_note_context_switch(smp_processor_id());
-               instrumentation_end();
-       }
-}
-
-static __always_inline void guest_exit_irqoff(void)
-{
-       if (context_tracking_enabled())
-               __context_tracking_exit(CONTEXT_GUEST);
-
-       instrumentation_begin();
-       if (vtime_accounting_enabled_this_cpu())
-               vtime_guest_exit(current);
-       else
-               current->flags &= ~PF_VCPU;
-       instrumentation_end();
-}
-
-#else
-static __always_inline void guest_enter_irqoff(void)
-{
-       /*
-        * This is running in ioctl context so its safe
-        * to assume that it's the stime pending cputime
-        * to flush.
-        */
-       instrumentation_begin();
-       vtime_account_kernel(current);
-       current->flags |= PF_VCPU;
-       rcu_virt_note_context_switch(smp_processor_id());
-       instrumentation_end();
-}
-
-static __always_inline void guest_exit_irqoff(void)
-{
-       instrumentation_begin();
-       /* Flush the guest cputime we spent on the guest */
-       vtime_account_kernel(current);
-       current->flags &= ~PF_VCPU;
-       instrumentation_end();
-}
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
-
-static inline void guest_exit(void)
-{
-       unsigned long flags;
-
-       local_irq_save(flags);
-       guest_exit_irqoff();
-       local_irq_restore(flags);
-}
-
  #endif
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h

index 8895b95..2f34487 100644 (file)
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -338,6 +338,51 @@ struct kvm_vcpu {
         struct kvm_dirty_ring dirty_ring;
  };
  
+/* must be called with irqs disabled */
+static __always_inline void guest_enter_irqoff(void)
+{
+       /*
+        * This is running in ioctl context so it's safe to assume that it's the
+        * stime pending cputime to flush.
+        */
+       instrumentation_begin();
+       vtime_account_guest_enter();
+       instrumentation_end();
+
+       /*
+        * KVM does not hold any references to rcu protected data when it
+        * switches CPU into a guest mode. In fact switching to a guest mode
+        * is very similar to exiting to userspace from rcu point of view. In
+        * addition CPU may stay in a guest mode for quite a long time (up to
+        * one time slice). Let's treat guest mode as quiescent state, just like
+        * we do with user-mode execution.
+        */
+       if (!context_tracking_guest_enter()) {
+               instrumentation_begin();
+               rcu_virt_note_context_switch(smp_processor_id());
+               instrumentation_end();
+       }
+}
+
+static __always_inline void guest_exit_irqoff(void)
+{
+       context_tracking_guest_exit();
+
+       instrumentation_begin();
+       /* Flush the guest cputime we spent on the guest */
+       vtime_account_guest_exit();
+       instrumentation_end();
+}
+
+static inline void guest_exit(void)
+{
+       unsigned long flags;
+
+       local_irq_save(flags);
+       guest_exit_irqoff();
+       local_irq_restore(flags);
+}
+
  static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
  {
         /*
diff --git a/include/linux/vtime.h b/include/linux/vtime.h

index 041d652..3684487 100644 (file)
--- a/include/linux/vtime.h
+++ b/include/linux/vtime.h
@@ -3,12 +3,46 @@
  #define _LINUX_KERNEL_VTIME_H
  
  #include <linux/context_tracking_state.h>
+#include <linux/sched.h>
+
  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
  #include <asm/vtime.h>
  #endif
  
+/*
+ * Common vtime APIs
+ */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+extern void vtime_account_kernel(struct task_struct *tsk);
+extern void vtime_account_idle(struct task_struct *tsk);
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
  
-struct task_struct;
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+extern void arch_vtime_task_switch(struct task_struct *tsk);
+extern void vtime_user_enter(struct task_struct *tsk);
+extern void vtime_user_exit(struct task_struct *tsk);
+extern void vtime_guest_enter(struct task_struct *tsk);
+extern void vtime_guest_exit(struct task_struct *tsk);
+extern void vtime_init_idle(struct task_struct *tsk, int cpu);
+#else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN  */
+static inline void vtime_user_enter(struct task_struct *tsk) { }
+static inline void vtime_user_exit(struct task_struct *tsk) { }
+static inline void vtime_guest_enter(struct task_struct *tsk) { }
+static inline void vtime_guest_exit(struct task_struct *tsk) { }
+static inline void vtime_init_idle(struct task_struct *tsk, int cpu) { }
+#endif
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+extern void vtime_account_irq(struct task_struct *tsk, unsigned int offset);
+extern void vtime_account_softirq(struct task_struct *tsk);
+extern void vtime_account_hardirq(struct task_struct *tsk);
+extern void vtime_flush(struct task_struct *tsk);
+#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
+static inline void vtime_account_irq(struct task_struct *tsk, unsigned int offset) { }
+static inline void vtime_account_softirq(struct task_struct *tsk) { }
+static inline void vtime_account_hardirq(struct task_struct *tsk) { }
+static inline void vtime_flush(struct task_struct *tsk) { }
+#endif
  
  /*
   * vtime_accounting_enabled_this_cpu() definitions/declarations
@@ -18,6 +52,18 @@ struct task_struct;
  static inline bool vtime_accounting_enabled_this_cpu(void) { return true; }
  extern void vtime_task_switch(struct task_struct *prev);
  
+static __always_inline void vtime_account_guest_enter(void)
+{
+       vtime_account_kernel(current);
+       current->flags |= PF_VCPU;
+}
+
+static __always_inline void vtime_account_guest_exit(void)
+{
+       vtime_account_kernel(current);
+       current->flags &= ~PF_VCPU;
+}
+
  #elif defined(CONFIG_VIRT_CPU_ACCOUNTING_GEN)
  
  /*
@@ -49,49 +95,37 @@ static inline void vtime_task_switch(struct task_struct *prev)
                 vtime_task_switch_generic(prev);
  }
  
+static __always_inline void vtime_account_guest_enter(void)
+{
+       if (vtime_accounting_enabled_this_cpu())
+               vtime_guest_enter(current);
+       else
+               current->flags |= PF_VCPU;
+}
+
+static __always_inline void vtime_account_guest_exit(void)
+{
+       if (vtime_accounting_enabled_this_cpu())
+               vtime_guest_exit(current);
+       else
+               current->flags &= ~PF_VCPU;
+}
+
  #else /* !CONFIG_VIRT_CPU_ACCOUNTING */
  
-static inline bool vtime_accounting_enabled_cpu(int cpu) {return false; }
  static inline bool vtime_accounting_enabled_this_cpu(void) { return false; }
  static inline void vtime_task_switch(struct task_struct *prev) { }
  
-#endif
-
-/*
- * Common vtime APIs
- */
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-extern void vtime_account_kernel(struct task_struct *tsk);
-extern void vtime_account_idle(struct task_struct *tsk);
-#else /* !CONFIG_VIRT_CPU_ACCOUNTING */
-static inline void vtime_account_kernel(struct task_struct *tsk) { }
-#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
+static __always_inline void vtime_account_guest_enter(void)
+{
+       current->flags |= PF_VCPU;
+}
  
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-extern void arch_vtime_task_switch(struct task_struct *tsk);
-extern void vtime_user_enter(struct task_struct *tsk);
-extern void vtime_user_exit(struct task_struct *tsk);
-extern void vtime_guest_enter(struct task_struct *tsk);
-extern void vtime_guest_exit(struct task_struct *tsk);
-extern void vtime_init_idle(struct task_struct *tsk, int cpu);
-#else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN  */
-static inline void vtime_user_enter(struct task_struct *tsk) { }
-static inline void vtime_user_exit(struct task_struct *tsk) { }
-static inline void vtime_guest_enter(struct task_struct *tsk) { }
-static inline void vtime_guest_exit(struct task_struct *tsk) { }
-static inline void vtime_init_idle(struct task_struct *tsk, int cpu) { }
-#endif
+static __always_inline void vtime_account_guest_exit(void)
+{
+       current->flags &= ~PF_VCPU;
+}
  
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-extern void vtime_account_irq(struct task_struct *tsk, unsigned int offset);
-extern void vtime_account_softirq(struct task_struct *tsk);
-extern void vtime_account_hardirq(struct task_struct *tsk);
-extern void vtime_flush(struct task_struct *tsk);
-#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
-static inline void vtime_account_irq(struct task_struct *tsk, unsigned int offset) { }
-static inline void vtime_account_softirq(struct task_struct *tsk) { }
-static inline void vtime_account_hardirq(struct task_struct *tsk) { }
-static inline void vtime_flush(struct task_struct *tsk) { }
  #endif
author	Linus Torvalds <torvalds@linux-foundation.org>
	Sun, 9 May 2021 19:52:25 +0000 (12:52 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Sun, 9 May 2021 19:52:25 +0000 (12:52 -0700)
arch/x86/include/asm/bug.h		patch \| blob \| history
arch/x86/include/asm/idtentry.h		patch \| blob \| history
arch/x86/include/asm/msr.h		patch \| blob \| history
arch/x86/include/asm/page_64.h		patch \| blob \| history
arch/x86/include/asm/page_64_types.h		patch \| blob \| history
arch/x86/kernel/cpu/common.c		patch \| blob \| history
arch/x86/kernel/cpu/resctrl/monitor.c		patch \| blob \| history
arch/x86/kernel/nmi.c		patch \| blob \| history
arch/x86/kernel/smpboot.c		patch \| blob \| history
arch/x86/kvm/svm/svm.c		patch \| blob \| history
arch/x86/kvm/vmx/vmx.c		patch \| blob \| history
arch/x86/kvm/x86.c		patch \| blob \| history
arch/x86/kvm/x86.h		patch \| blob \| history
include/linux/context_tracking.h		patch \| blob \| history
include/linux/kvm_host.h		patch \| blob \| history
include/linux/vtime.h		patch \| blob \| history