Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
author	Linus Torvalds <torvalds@linux-foundation.org>
	Sun, 19 Aug 2018 17:38:36 +0000 (10:38 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Sun, 19 Aug 2018 17:38:36 +0000 (10:38 -0700)
Pull first set of KVM updates from Paolo Bonzini:
 "PPC:
   - minor code cleanups

  x86:
   - PCID emulation and CR3 caching for shadow page tables
   - nested VMX live migration
   - nested VMCS shadowing
   - optimized IPI hypercall
   - some optimizations

  ARM will come next week"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (85 commits)
  kvm: x86: Set highest physical address bits in non-present/reserved SPTEs
  KVM/x86: Use CC_SET()/CC_OUT in arch/x86/kvm/vmx.c
  KVM: X86: Implement PV IPIs in linux guest
  KVM: X86: Add kvm hypervisor init time platform setup callback
  KVM: X86: Implement "send IPI" hypercall
  KVM/x86: Move X86_CR4_OSXSAVE check into kvm_valid_sregs()
  KVM: x86: Skip pae_root shadow allocation if tdp enabled
  KVM/MMU: Combine flushing remote tlb in mmu_set_spte()
  KVM: vmx: skip VMWRITE of HOST_{FS,GS}_BASE when possible
  KVM: vmx: skip VMWRITE of HOST_{FS,GS}_SEL when possible
  KVM: vmx: always initialize HOST_{FS,GS}_BASE to zero during setup
  KVM: vmx: move struct host_state usage to struct loaded_vmcs
  KVM: vmx: compute need to reload FS/GS/LDT on demand
  KVM: nVMX: remove a misleading comment regarding vmcs02 fields
  KVM: vmx: rename __vmx_load_host_state() and vmx_save_host_state()
  KVM: vmx: add dedicated utility to access guest's kernel_gs_base
  KVM: vmx: track host_state.loaded using a loaded_vmcs pointer
  KVM: vmx: refactor segmentation code in vmx_save_host_state()
  kvm: nVMX: Fix fault priority for VMX operations
  kvm: nVMX: Fix fault vector for VMX operation at CPL > 0
  ...

14 files changed:
arch/powerpc/include/asm/reg.h
arch/powerpc/kvm/book3s_64_vio.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/powerpc.c
arch/x86/include/asm/hyperv-tlfs.h
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/mshyperv.h
arch/x86/include/asm/trace/hyperv.h
arch/x86/kernel/kvm.c
arch/x86/kvm/lapic.c
arch/x86/kvm/mmu.c
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
virt/kvm/kvm_main.c

Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
@@@ -1418,7 -1457,10 +1462,11 @@@ int kvm_cpu_get_interrupt(struct kvm_vc
  void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
  void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu);
  
+ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
+                   unsigned long ipi_bitmap_high, int min,
+                   unsigned long icr, int op_64_bit);
 +u64 kvm_get_arch_capabilities(void);
  void kvm_define_shared_msr(unsigned index, u32 msr);
  int kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
  
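The kvm_pv_send_ipi() prototype added above is the host-side handler for the new "send IPI" hypercall. A minimal guest-side sketch of issuing that hypercall, assuming the KVM_HC_SEND_IPI number and the kvm_hypercall4() helper from the guest's kvm_para.h; the bitmap-building step of the real patch is omitted here, and the exact return-value semantics are the host handler's:

    #include <linux/kvm_para.h>	/* kvm_hypercall4(), KVM_HC_SEND_IPI */
    #include <asm/apicdef.h>	/* APIC_DM_FIXED */

    /*
     * Sketch only: ask the host to deliver 'vector' to every vCPU whose APIC
     * ID is set in the 128-bit bitmap {low, high}, where bit 0 stands for
     * APIC ID 'min'.  Non-negative on success, negative error otherwise.
     */
    static long pv_send_ipi_sketch(unsigned long bitmap_low,
    			       unsigned long bitmap_high,
    			       int min, int vector)
    {
    	unsigned long icr = APIC_DM_FIXED | vector;

    	return kvm_hypercall4(KVM_HC_SEND_IPI, bitmap_low, bitmap_high,
    			      min, icr);
    }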
Simple merge
@@@ -28,21 -28,20 +28,35 @@@ TRACE_EVENT(hyperv_mmu_flush_tlb_others
                      __entry->addr, __entry->end)
        );
  
+ TRACE_EVENT(hyperv_nested_flush_guest_mapping,
+           TP_PROTO(u64 as, int ret),
+           TP_ARGS(as, ret),
+           TP_STRUCT__entry(
+                   __field(u64, as)
+                   __field(int, ret)
+                   ),
+           TP_fast_assign(__entry->as = as;
+                          __entry->ret = ret;
+                   ),
+           TP_printk("address space %llx ret %d", __entry->as, __entry->ret)
+       );
 +TRACE_EVENT(hyperv_send_ipi_mask,
 +          TP_PROTO(const struct cpumask *cpus,
 +                   int vector),
 +          TP_ARGS(cpus, vector),
 +          TP_STRUCT__entry(
 +                  __field(unsigned int, ncpus)
 +                  __field(int, vector)
 +                  ),
 +          TP_fast_assign(__entry->ncpus = cpumask_weight(cpus);
 +                         __entry->vector = vector;
 +                  ),
 +          TP_printk("ncpus %d vector %x",
 +                    __entry->ncpus, __entry->vector)
 +      );
 +
  #endif /* CONFIG_HYPERV */
  
  #undef TRACE_INCLUDE_PATH
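As with every TRACE_EVENT() definition, the hyperv_send_ipi_mask event above expands into a trace_hyperv_send_ipi_mask() helper. A hedged sketch of a caller; the function shown is illustrative, not lifted from the series:

    /* Illustrative only: trace the IPI before handing it to the hypervisor. */
    static bool hv_send_ipi_mask_sketch(const struct cpumask *mask, int vector)
    {
    	trace_hyperv_send_ipi_mask(mask, vector);

    	/* ... issue the Hyper-V send-IPI hypercall for 'mask'/'vector' ... */
    	return true;
    }

Per the TP_fast_assign() above, only cpumask_weight(cpus) and the vector are recorded.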
@@@ -611,6 -716,19 +703,20 @@@ static uint32_t __init kvm_detect(void
        return kvm_cpuid_base();
  }
  
+ static void __init kvm_apic_init(void)
+ {
+ #if defined(CONFIG_SMP)
+       if (kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI))
+               kvm_setup_pv_ipi();
+ #endif
+ }
+
+ static void __init kvm_init_platform(void)
+ {
++      kvmclock_init();
+       x86_platform.apic_post_init = kvm_apic_init;
+ }
  const __initconst struct hypervisor_x86 x86_hyper_kvm = {
        .name                   = "KVM",
        .detect                 = kvm_detect,
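kvm_apic_init() above runs via x86_platform.apic_post_init, i.e. after the APIC driver has been selected. What kvm_setup_pv_ipi() does is not shown in this hunk; a hedged sketch, with the callback body assumed rather than quoted from the patch:

    /*
     * Sketch only: swap the native send_IPI_mask callback for one that
     * batches the destination APIC IDs into a bitmap and issues a single
     * KVM_HC_SEND_IPI hypercall instead of one APIC ICR write per CPU.
     */
    static void kvm_send_ipi_mask(const struct cpumask *mask, int vector)
    {
    	/* build {bitmap_low, bitmap_high, min} from 'mask', then call
    	 * kvm_hypercall4(KVM_HC_SEND_IPI, ...) as sketched earlier */
    }

    static void __init kvm_setup_pv_ipi(void)
    {
    	apic->send_IPI_mask = kvm_send_ipi_mask;
    	pr_info("KVM setup pv IPIs\n");
    }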
Simple merge
Simple merge
@@@ -188,150 -189,12 +189,156 @@@ module_param(ple_window_max, uint, 0444
  
  extern const ulong vmx_return;
  
 +static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
 +static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
 +static DEFINE_MUTEX(vmx_l1d_flush_mutex);
 +
 +/* Storage for pre module init parameter parsing */
 +static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
 +
 +static const struct {
 +      const char *option;
 +      enum vmx_l1d_flush_state cmd;
 +} vmentry_l1d_param[] = {
 +      {"auto",        VMENTER_L1D_FLUSH_AUTO},
 +      {"never",       VMENTER_L1D_FLUSH_NEVER},
 +      {"cond",        VMENTER_L1D_FLUSH_COND},
 +      {"always",      VMENTER_L1D_FLUSH_ALWAYS},
 +};
 +
 +#define L1D_CACHE_ORDER 4
 +static void *vmx_l1d_flush_pages;
 +
 +static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
 +{
 +      struct page *page;
 +      unsigned int i;
 +
 +      if (!enable_ept) {
 +              l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
 +              return 0;
 +      }
 +
 +      if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
 +              u64 msr;
 +
 +              rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
 +              if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
 +                      l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
 +                      return 0;
 +              }
 +      }
 +
 +      /* If set to auto use the default l1tf mitigation method */
 +      if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
 +              switch (l1tf_mitigation) {
 +              case L1TF_MITIGATION_OFF:
 +                      l1tf = VMENTER_L1D_FLUSH_NEVER;
 +                      break;
 +              case L1TF_MITIGATION_FLUSH_NOWARN:
 +              case L1TF_MITIGATION_FLUSH:
 +              case L1TF_MITIGATION_FLUSH_NOSMT:
 +                      l1tf = VMENTER_L1D_FLUSH_COND;
 +                      break;
 +              case L1TF_MITIGATION_FULL:
 +              case L1TF_MITIGATION_FULL_FORCE:
 +                      l1tf = VMENTER_L1D_FLUSH_ALWAYS;
 +                      break;
 +              }
 +      } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
 +              l1tf = VMENTER_L1D_FLUSH_ALWAYS;
 +      }
 +
 +      if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
 +          !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
 +              page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
 +              if (!page)
 +                      return -ENOMEM;
 +              vmx_l1d_flush_pages = page_address(page);
 +
 +              /*
 +               * Initialize each page with a different pattern in
 +               * order to protect against KSM in the nested
 +               * virtualization case.
 +               */
 +              for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
 +                      memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
 +                             PAGE_SIZE);
 +              }
 +      }
 +
 +      l1tf_vmx_mitigation = l1tf;
 +
 +      if (l1tf != VMENTER_L1D_FLUSH_NEVER)
 +              static_branch_enable(&vmx_l1d_should_flush);
 +      else
 +              static_branch_disable(&vmx_l1d_should_flush);
 +
 +      if (l1tf == VMENTER_L1D_FLUSH_COND)
 +              static_branch_enable(&vmx_l1d_flush_cond);
 +      else
 +              static_branch_disable(&vmx_l1d_flush_cond);
 +      return 0;
 +}
 +
 +static int vmentry_l1d_flush_parse(const char *s)
 +{
 +      unsigned int i;
 +
 +      if (s) {
 +              for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
 +                      if (sysfs_streq(s, vmentry_l1d_param[i].option))
 +                              return vmentry_l1d_param[i].cmd;
 +              }
 +      }
 +      return -EINVAL;
 +}
 +
 +static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
 +{
 +      int l1tf, ret;
 +
 +      if (!boot_cpu_has(X86_BUG_L1TF))
 +              return 0;
 +
 +      l1tf = vmentry_l1d_flush_parse(s);
 +      if (l1tf < 0)
 +              return l1tf;
 +
 +      /*
 +       * Has vmx_init() run already? If not then this is the pre init
 +       * parameter parsing. In that case just store the value and let
 +       * vmx_init() do the proper setup after enable_ept has been
 +       * established.
 +       */
 +      if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
 +              vmentry_l1d_flush_param = l1tf;
 +              return 0;
 +      }
 +
 +      mutex_lock(&vmx_l1d_flush_mutex);
 +      ret = vmx_setup_l1d_flush(l1tf);
 +      mutex_unlock(&vmx_l1d_flush_mutex);
 +      return ret;
 +}
 +
 +static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
 +{
 +      return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
 +}
 +
 +static const struct kernel_param_ops vmentry_l1d_flush_ops = {
 +      .set = vmentry_l1d_flush_set,
 +      .get = vmentry_l1d_flush_get,
 +};
 +module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
 +
+ enum ept_pointers_status {
+       EPT_POINTERS_CHECK = 0,
+       EPT_POINTERS_MATCH = 1,
+       EPT_POINTERS_MISMATCH = 2
+ };
  struct kvm_vmx {
        struct kvm kvm;
  
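The module-parameter plumbing above only selects when to flush; the flush itself happens right before VM entry elsewhere in this series, either through the IA32_FLUSH_CMD MSR when the CPU has X86_FEATURE_FLUSH_L1D or by walking the buffer allocated above (PAGE_SIZE << L1D_CACHE_ORDER = 64KB with 4KB pages). A simplified, hedged sketch of that step; the real helper is more careful (a two-pass fill written in inline asm):

    static void vmx_l1d_flush_sketch(struct kvm_vcpu *vcpu)
    {
    	int size = PAGE_SIZE << L1D_CACHE_ORDER;	/* 64KB buffer */
    	int i;

    	if (static_branch_likely(&vmx_l1d_flush_cond)) {
    		/* Conditional mode: flush only if this vCPU recently ran
    		 * code that could have pulled sensitive data into L1D. */
    		bool flush = vcpu->arch.l1tf_flush_l1d;

    		vcpu->arch.l1tf_flush_l1d = false;
    		if (!flush)
    			return;
    	}

    	if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
    		/* Hardware-assisted flush via IA32_FLUSH_CMD. */
    		wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
    		return;
    	}

    	/* Software fallback: read the pre-initialized pattern pages so
    	 * their cache lines displace whatever was in L1D. */
    	for (i = 0; i < size; i += 64)
    		READ_ONCE(*(u8 *)(vmx_l1d_flush_pages + i));
    }

At runtime the setter above is reachable through /sys/module/kvm_intel/parameters/vmentry_l1d_flush (mode 0644 in module_param_cb), or at boot as kvm-intel.vmentry_l1d_flush=.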
@@@ -937,21 -828,14 +977,13 @@@ struct vcpu_vmx 
         */
        struct loaded_vmcs    vmcs01;
        struct loaded_vmcs   *loaded_vmcs;
+       struct loaded_vmcs   *loaded_cpu_state;
        bool                  __launched; /* temporary, used in vmx_vcpu_run */
        struct msr_autoload {
 -              unsigned nr;
 -              struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
 -              struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
 +              struct vmx_msrs guest;
 +              struct vmx_msrs host;
        } msr_autoload;
-       struct {
-               int           loaded;
-               u16           fs_sel, gs_sel, ldt_sel;
- #ifdef CONFIG_X86_64
-               u16           ds_sel, es_sel;
- #endif
-               int           gs_ldt_reload_needed;
-               int           fs_reload_needed;
-               u64           msr_host_bndcfgs;
-       } host_state;
        struct {
                int vm86_active;
                ulong save_rflags;
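struct vmx_msrs, which replaces the bare guest[]/host[] arrays and their shared 'nr' counter above, is not visible in this hunk; it is presumably along these lines (inferred from the usage here, not quoted from the patch):

    #define NR_AUTOLOAD_MSRS 8		/* bound carried over from the old arrays */

    struct vmx_msrs {
    	unsigned int		nr;	/* entries currently in use */
    	struct vmx_msr_entry	val[NR_AUTOLOAD_MSRS];
    };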
@@@ -10647,37 -10779,12 +11021,39 @@@ free_vcpu
        return ERR_PTR(err);
  }
  
 +#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
 +#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
 +
  static int vmx_vm_init(struct kvm *kvm)
  {
+       spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock);
        if (!ple_gap)
                kvm->arch.pause_in_guest = true;
 +
 +      if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
 +              switch (l1tf_mitigation) {
 +              case L1TF_MITIGATION_OFF:
 +              case L1TF_MITIGATION_FLUSH_NOWARN:
 +                      /* 'I explicitly don't care' is set */
 +                      break;
 +              case L1TF_MITIGATION_FLUSH:
 +              case L1TF_MITIGATION_FLUSH_NOSMT:
 +              case L1TF_MITIGATION_FULL:
 +                      /*
 +                       * Warn upon starting the first VM in a potentially
 +                       * insecure environment.
 +                       */
 +                      if (cpu_smt_control == CPU_SMT_ENABLED)
 +                              pr_warn_once(L1TF_MSG_SMT);
 +                      if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
 +                              pr_warn_once(L1TF_MSG_L1D);
 +                      break;
 +              case L1TF_MITIGATION_FULL_FORCE:
 +                      /* Flush is enforced */
 +                      break;
 +              }
 +      }
        return 0;
  }
  
@@@ -12164,15 -12375,25 +12644,28 @@@ static int nested_vmx_run(struct kvm_vc
         */
  
        vmx->nested.nested_run_pending = 1;
-       ret = enter_vmx_non_root_mode(vcpu);
+       ret = enter_vmx_non_root_mode(vcpu, &exit_qual);
        if (ret) {
+               nested_vmx_entry_failure(vcpu, vmcs12, ret, exit_qual);
                vmx->nested.nested_run_pending = 0;
-               return ret;
+               return 1;
        }
  
 +      /* Hide L1D cache contents from the nested guest.  */
 +      vmx->vcpu.arch.l1tf_flush_l1d = true;
 +
+       /*
+        * Must happen outside of enter_vmx_non_root_mode() as it will
+        * also be used as part of restoring nVMX state for
+        * snapshot restore (migration).
+        *
+        * In this flow, it is assumed that vmcs12 cache was
+        * transferred as part of captured nVMX state and should
+        * therefore not be read from guest memory (which may not
+        * exist on destination host yet).
+        */
+       nested_cache_shadow_vmcs12(vcpu, vmcs12);
        /*
         * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
         * by event injection, halt vcpu.
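nested_cache_shadow_vmcs12() itself is not shown in this hunk; based on the comment above, a hedged sketch of what it does — copy the shadow vmcs12 that L1 linked via its VMCS link pointer from guest memory into a per-vCPU cache, so later migration/restore paths never have to re-read guest memory. Helper names (get_shadow_vmcs12, VMCS12_SIZE, nested_cpu_has_shadow_vmcs) are taken on faith from this series:

    static void nested_cache_shadow_vmcs12_sketch(struct kvm_vcpu *vcpu,
    					       struct vmcs12 *vmcs12)
    {
    	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
    	    vmcs12->vmcs_link_pointer == -1ull)
    		return;		/* L1 did not set up VMCS shadowing */

    	/* Pull L1's shadow vmcs12 into the cached copy kept in the vCPU. */
    	kvm_read_guest(vcpu->kvm, vmcs12->vmcs_link_pointer,
    		       get_shadow_vmcs12(vcpu), VMCS12_SIZE);
    }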
Simple merge
Simple merge