x86/KVM/VMX: Use MSR save list for IA32_FLUSH_CMD if required
[linux-2.6-microblaze.git] arch/x86/kvm/vmx.c
index 559a12b..eb7c207 100644
@@ -71,6 +71,9 @@ static const struct x86_cpu_id vmx_cpu_id[] = {
 };
 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
 
+static bool __read_mostly nosmt;
+module_param(nosmt, bool, S_IRUGO);
+
 static bool __read_mostly enable_vpid = 1;
 module_param_named(vpid, enable_vpid, bool, 0444);
 
@@ -188,6 +191,54 @@ module_param(ple_window_max, uint, 0444);
 
 extern const ulong vmx_return;
 
+static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
+
+/* These MUST be in sync with vmentry_l1d_param order. */
+enum vmx_l1d_flush_state {
+       VMENTER_L1D_FLUSH_NEVER,
+       VMENTER_L1D_FLUSH_COND,
+       VMENTER_L1D_FLUSH_ALWAYS,
+};
+
+static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush = VMENTER_L1D_FLUSH_COND;
+
+static const struct {
+       const char *option;
+       enum vmx_l1d_flush_state cmd;
+} vmentry_l1d_param[] = {
+       {"never",       VMENTER_L1D_FLUSH_NEVER},
+       {"cond",        VMENTER_L1D_FLUSH_COND},
+       {"always",      VMENTER_L1D_FLUSH_ALWAYS},
+};
+
+static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
+{
+       unsigned int i;
+
+       if (!s)
+               return -EINVAL;
+
+       for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
+               if (!strcmp(s, vmentry_l1d_param[i].option)) {
+                       vmentry_l1d_flush = vmentry_l1d_param[i].cmd;
+                       return 0;
+               }
+       }
+
+       return -EINVAL;
+}
+
+static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
+{
+       return sprintf(s, "%s\n", vmentry_l1d_param[vmentry_l1d_flush].option);
+}
+
+static const struct kernel_param_ops vmentry_l1d_flush_ops = {
+       .set = vmentry_l1d_flush_set,
+       .get = vmentry_l1d_flush_get,
+};
+module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, &vmentry_l1d_flush, S_IRUGO);
+
 struct kvm_vmx {
        struct kvm kvm;
 
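
The block above is a table-driven string to enum mapping: the set callback scans vmentry_l1d_param[], and the get callback indexes it by the current enum value, which is why the comment insists the two stay in order. The parameter is meant to be given at module load time (e.g. kvm-intel.vmentry_l1d_flush=cond on the kernel command line) and is read-only afterwards (S_IRUGO). For illustration only, a minimal userspace sketch of the same lookup; the names here (flush_state, flush_option, parse_flush_option) are made up, and designated initializers are used so the table is indexed by the enum and cannot fall out of order:

    #include <assert.h>
    #include <stdio.h>
    #include <string.h>

    enum flush_state { FLUSH_NEVER, FLUSH_COND, FLUSH_ALWAYS, FLUSH_NR_STATES };

    /* Indexing the table by the enum value makes the ordering rule structural. */
    static const char *const flush_option[FLUSH_NR_STATES] = {
        [FLUSH_NEVER]  = "never",
        [FLUSH_COND]   = "cond",
        [FLUSH_ALWAYS] = "always",
    };

    static int parse_flush_option(const char *s, enum flush_state *out)
    {
        unsigned int i;

        if (!s)
            return -1;
        for (i = 0; i < FLUSH_NR_STATES; i++) {
            if (flush_option[i] && !strcmp(s, flush_option[i])) {
                *out = (enum flush_state)i;
                return 0;
            }
        }
        return -1;
    }

    int main(void)
    {
        enum flush_state st = FLUSH_NEVER;

        assert(parse_flush_option("cond", &st) == 0 && st == FLUSH_COND);
        assert(parse_flush_option("bogus", &st) != 0);
        printf("\"cond\" parsed as %d\n", st);
        return 0;
    }
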
@@ -757,6 +808,11 @@ static inline int pi_test_sn(struct pi_desc *pi_desc)
                        (unsigned long *)&pi_desc->control);
 }
 
+struct vmx_msrs {
+       unsigned int            nr;
+       struct vmx_msr_entry    val[NR_AUTOLOAD_MSRS];
+};
+
 struct vcpu_vmx {
        struct kvm_vcpu       vcpu;
        unsigned long         host_rsp;
@@ -790,9 +846,8 @@ struct vcpu_vmx {
        struct loaded_vmcs   *loaded_vmcs;
        bool                  __launched; /* temporary, used in vmx_vcpu_run */
        struct msr_autoload {
-               unsigned nr;
-               struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
-               struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
+               struct vmx_msrs guest;
+               struct vmx_msrs host;
        } msr_autoload;
        struct {
                int           loaded;
@@ -2366,9 +2421,20 @@ static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
        vm_exit_controls_clearbit(vmx, exit);
 }
 
+static int find_msr(struct vmx_msrs *m, unsigned int msr)
+{
+       unsigned int i;
+
+       for (i = 0; i < m->nr; ++i) {
+               if (m->val[i].index == msr)
+                       return i;
+       }
+       return -ENOENT;
+}
+
 static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
 {
-       unsigned i;
+       int i;
        struct msr_autoload *m = &vmx->msr_autoload;
 
        switch (msr) {
@@ -2389,18 +2455,21 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
                }
                break;
        }
+       i = find_msr(&m->guest, msr);
+       if (i < 0)
+               goto skip_guest;
+       --m->guest.nr;
+       m->guest.val[i] = m->guest.val[m->guest.nr];
+       vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
 
-       for (i = 0; i < m->nr; ++i)
-               if (m->guest[i].index == msr)
-                       break;
-
-       if (i == m->nr)
+skip_guest:
+       i = find_msr(&m->host, msr);
+       if (i < 0)
                return;
-       --m->nr;
-       m->guest[i] = m->guest[m->nr];
-       m->host[i] = m->host[m->nr];
-       vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
-       vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
+
+       --m->host.nr;
+       m->host.val[i] = m->host.val[m->host.nr];
+       vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
 }
 
 static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
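
find_msr() and the reworked clear_atomic_switch_msr() implement the usual unordered-array removal: locate the entry, overwrite the hole with the last element, and shrink the count, now done independently for the guest and host lists so their lengths may differ. A standalone sketch of that pattern (made-up names and a miniature capacity, not kernel code):

    #include <stdio.h>

    #define NR_ENTRIES 8   /* stand-in for NR_AUTOLOAD_MSRS */

    struct msr_entry { unsigned int index; unsigned long long value; };
    struct msr_list  { unsigned int nr; struct msr_entry val[NR_ENTRIES]; };

    /* Mirrors find_msr(): linear scan, slot index on a hit, negative on a miss. */
    static int list_find(const struct msr_list *m, unsigned int msr)
    {
        unsigned int i;

        for (i = 0; i < m->nr; i++)
            if (m->val[i].index == msr)
                return (int)i;
        return -1;
    }

    /* Remove by moving the last element into the hole; order is not preserved. */
    static void list_remove(struct msr_list *m, unsigned int msr)
    {
        int i = list_find(m, msr);

        if (i < 0)
            return;
        m->nr--;
        m->val[i] = m->val[m->nr];
    }

    int main(void)
    {
        struct msr_list m = { .nr = 3, .val = {
            { .index = 0x10, .value = 1 },
            { .index = 0x20, .value = 2 },
            { .index = 0x30, .value = 3 },
        } };

        list_remove(&m, 0x20);
        printf("nr=%u, slot 1 now holds MSR 0x%x\n", m.nr, m.val[1].index);
        return 0;
    }
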
@@ -2415,9 +2484,9 @@ static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
 }
 
 static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
-                                 u64 guest_val, u64 host_val)
+                                 u64 guest_val, u64 host_val, bool entry_only)
 {
-       unsigned i;
+       int i, j = 0;
        struct msr_autoload *m = &vmx->msr_autoload;
 
        switch (msr) {
@@ -2452,24 +2521,31 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
                wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
        }
 
-       for (i = 0; i < m->nr; ++i)
-               if (m->guest[i].index == msr)
-                       break;
+       i = find_msr(&m->guest, msr);
+       if (!entry_only)
+               j = find_msr(&m->host, msr);
 
-       if (i == NR_AUTOLOAD_MSRS) {
+       if ((i < 0 && m->guest.nr == NR_AUTOLOAD_MSRS) ||
+           (j < 0 && m->host.nr == NR_AUTOLOAD_MSRS)) {
                printk_once(KERN_WARNING "Not enough msr switch entries. "
                                "Can't add msr %x\n", msr);
                return;
-       } else if (i == m->nr) {
-               ++m->nr;
-               vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
-               vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
        }
+       if (i < 0) {
+               i = m->guest.nr++;
+               vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
+       }
+       m->guest.val[i].index = msr;
+       m->guest.val[i].value = guest_val;
 
-       m->guest[i].index = msr;
-       m->guest[i].value = guest_val;
-       m->host[i].index = msr;
-       m->host[i].value = host_val;
+       if (entry_only)
+               return;
+
+       if (j < 0) {
+               j = m->host.nr++;
+               vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
+       }
+       m->host.val[j].index = msr;
+       m->host.val[j].value = host_val;
 }
 
 static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
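
add_atomic_switch_msr() is now a find-or-append per list, with the host side skipped entirely when entry_only is set. A compact sketch of the intended semantics, with the capacity checked before appending; NR_ENTRIES, list_find and list_add are invented stand-ins for NR_AUTOLOAD_MSRS and the kernel helpers, and the capacity is kept tiny so the "list full" path is easy to exercise:

    #include <stdio.h>

    #define NR_ENTRIES 2   /* deliberately tiny so the "list full" path is reachable */

    struct msr_entry { unsigned int index; unsigned long long value; };
    struct msr_list  { unsigned int nr; struct msr_entry val[NR_ENTRIES]; };

    static int list_find(const struct msr_list *m, unsigned int msr)
    {
        unsigned int i;

        for (i = 0; i < m->nr; i++)
            if (m->val[i].index == msr)
                return (int)i;
        return -1;
    }

    /* Update in place on a hit, append otherwise, refuse when the list is full. */
    static int list_add(struct msr_list *m, unsigned int msr, unsigned long long val)
    {
        int i = list_find(m, msr);

        if (i < 0) {
            if (m->nr == NR_ENTRIES)
                return -1;
            i = (int)m->nr++;
        }
        m->val[i].index = msr;
        m->val[i].value = val;
        return 0;
    }

    int main(void)
    {
        struct msr_list m = { .nr = 0 };

        printf("append 0x10: %d\n", list_add(&m, 0x10, 1));  /* 0 */
        printf("append 0x20: %d\n", list_add(&m, 0x20, 2));  /* 0 */
        printf("update 0x10: %d\n", list_add(&m, 0x10, 3));  /* 0, nr stays 2 */
        printf("append 0x30: %d\n", list_add(&m, 0x30, 4));  /* -1, no free slot */
        return 0;
    }
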
@@ -2513,7 +2589,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
                        guest_efer &= ~EFER_LME;
                if (guest_efer != host_efer)
                        add_atomic_switch_msr(vmx, MSR_EFER,
-                                             guest_efer, host_efer);
+                                             guest_efer, host_efer, false);
                return false;
        } else {
                guest_efer &= ~ignore_bits;
@@ -3967,7 +4043,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                vcpu->arch.ia32_xss = data;
                if (vcpu->arch.ia32_xss != host_xss)
                        add_atomic_switch_msr(vmx, MSR_IA32_XSS,
-                               vcpu->arch.ia32_xss, host_xss);
+                               vcpu->arch.ia32_xss, host_xss, false);
                else
                        clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
                break;
@@ -6161,6 +6237,16 @@ static void ept_set_mmio_spte_mask(void)
                                   VMX_EPT_MISCONFIG_WX_VALUE);
 }
 
+static bool vmx_l1d_use_msr_save_list(void)
+{
+       if (!enable_ept || !boot_cpu_has_bug(X86_BUG_L1TF) ||
+           static_cpu_has(X86_FEATURE_HYPERVISOR) ||
+           !static_cpu_has(X86_FEATURE_FLUSH_L1D))
+               return false;
+
+       return vmentry_l1d_flush == VMENTER_L1D_FLUSH_ALWAYS;
+}
+
 #define VMX_XSS_EXIT_BITMAP 0
 /*
  * Sets up the vmcs for emulated real mode.
@@ -6239,9 +6325,9 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
 
        vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
        vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
-       vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
+       vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
        vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
-       vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
+       vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
 
        if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
                vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
@@ -6282,6 +6368,12 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
                vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
                vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
        }
+       /*
+        * If flushing the L1D cache on every VMENTER is enforced and the
+        * MSR is available, use the MSR save list.
+        */
+       if (vmx_l1d_use_msr_save_list())
+               add_atomic_switch_msr(vmx, MSR_IA32_FLUSH_CMD, L1D_FLUSH, 0, true);
 }
 
 static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
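
With entry_only true, only the VM-entry MSR-load list gains an entry: the CPU then writes L1D_FLUSH into IA32_FLUSH_CMD on every VM entry, and no VM-exit counterpart is needed because the host has nothing to restore for that MSR. A sketch of what such an entry carries, assuming the architectural load-list entry layout (32-bit index, 32 reserved bits, 64-bit value) and the usual IA32_FLUSH_CMD MSR number 0x10b with L1D_FLUSH as bit 0:

    #include <stdint.h>
    #include <stdio.h>

    /*
     * Same shape as the 128-bit MSR-load area entry (struct vmx_msr_entry in
     * the kernel): 32-bit MSR index, 32 reserved bits and a 64-bit value that
     * the CPU writes into the MSR on every VM entry.
     */
    struct msr_load_entry {
        uint32_t index;
        uint32_t reserved;
        uint64_t value;
    };

    int main(void)
    {
        const struct msr_load_entry flush_cmd = {
            .index = 0x10b,      /* IA32_FLUSH_CMD */
            .value = 1ULL << 0,  /* L1D_FLUSH */
        };

        printf("on VM entry: wrmsr(0x%x, 0x%llx)\n",
               flush_cmd.index, (unsigned long long)flush_cmd.value);
        return 0;
    }
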
@@ -9512,6 +9604,77 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
        }
 }
 
+/*
+ * Software based L1D cache flush which is used when microcode providing
+ * the cache control MSR is not loaded.
+ *
+ * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but
+ * flushing it requires reading in 64 KiB because the replacement algorithm
+ * is not exactly LRU. This could be sized at runtime via topology
+ * information, but as all relevant affected CPUs have a 32 KiB L1D cache
+ * there is no point in doing so.
+ */
+#define L1D_CACHE_ORDER 4
+static void *vmx_l1d_flush_pages;
+
+static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
+{
+       int size = PAGE_SIZE << L1D_CACHE_ORDER;
+       bool always;
+
+       /*
+        * This code is only executed when:
+        * - the flush mode is 'cond'
+        * - the flush mode is 'always' and the flush MSR is not
+        *   available
+        *
+        * If the CPU has the flush MSR then clear the flush bit because
+        * 'always' mode is handled via the MSR save list.
+        *
+        * If the MSR is not available then act depending on the mitigation
+        * mode: if 'flush always', keep the flush bit set, otherwise clear
+        * it.
+        *
+        * The flush bit gets set again either from vcpu_run() or from one
+        * of the unsafe VMEXIT handlers.
+        */
+       if (static_cpu_has(X86_FEATURE_FLUSH_L1D))
+               always = false;
+       else
+               always = vmentry_l1d_flush == VMENTER_L1D_FLUSH_ALWAYS;
+
+       vcpu->arch.l1tf_flush_l1d = always;
+
+       vcpu->stat.l1d_flush++;
+
+       if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+               wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+               return;
+       }
+
+       asm volatile(
+               /* First ensure the pages are in the TLB */
+               "xorl   %%eax, %%eax\n"
+               ".Lpopulate_tlb:\n\t"
+               "movzbl (%[empty_zp], %%" _ASM_AX "), %%ecx\n\t"
+               "addl   $4096, %%eax\n\t"
+               "cmpl   %%eax, %[size]\n\t"
+               "jne    .Lpopulate_tlb\n\t"
+               "xorl   %%eax, %%eax\n\t"
+               "cpuid\n\t"
+               /* Now fill the cache */
+               "xorl   %%eax, %%eax\n"
+               ".Lfill_cache:\n"
+               "movzbl (%[empty_zp], %%" _ASM_AX "), %%ecx\n\t"
+               "addl   $64, %%eax\n\t"
+               "cmpl   %%eax, %[size]\n\t"
+               "jne    .Lfill_cache\n\t"
+               "lfence\n"
+               :: [empty_zp] "r" (vmx_l1d_flush_pages),
+                   [size] "r" (size)
+               : "eax", "ebx", "ecx", "edx");
+}
+
 static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
 {
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
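
For the software fallback, PAGE_SIZE << L1D_CACHE_ORDER is 4096 << 4 = 64 KiB, twice the 32 KiB L1D, to make up for the not-quite-LRU replacement. The asm first touches one byte per page so the later reads hit the TLB, then reads one byte per 64-byte cache line to displace whatever is in L1D. A rough userspace C sketch of those two passes, for illustration only; it omits the cpuid/lfence serialization and makes no claim to be an effective mitigation:

    #include <stdint.h>
    #include <stdlib.h>

    #define FLUSH_BUF_SIZE (64 * 1024)  /* PAGE_SIZE << L1D_CACHE_ORDER */
    #define PAGE_SZ        4096
    #define CACHE_LINE     64

    static void l1d_flush_sketch(const volatile uint8_t *buf)
    {
        volatile uint8_t sink;
        size_t off;

        /* First pass: one access per page, so the buffer is in the TLB. */
        for (off = 0; off < FLUSH_BUF_SIZE; off += PAGE_SZ)
            sink = buf[off];

        /* Second pass: one access per cache line, displacing the L1D contents. */
        for (off = 0; off < FLUSH_BUF_SIZE; off += CACHE_LINE)
            sink = buf[off];

        (void)sink;
    }

    int main(void)
    {
        uint8_t *buf = aligned_alloc(PAGE_SZ, FLUSH_BUF_SIZE);

        if (!buf)
            return 1;
        l1d_flush_sketch(buf);
        free(buf);
        return 0;
    }
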
@@ -9751,6 +9914,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
                        [ss]"i"(__KERNEL_DS),
                        [cs]"i"(__KERNEL_CS)
                        );
+               vcpu->arch.l1tf_flush_l1d = true;
        }
 }
 STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
@@ -9913,7 +10077,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
                        clear_atomic_switch_msr(vmx, msrs[i].msr);
                else
                        add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
-                                       msrs[i].host);
+                                       msrs[i].host, false);
 }
 
 static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
@@ -10008,6 +10172,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
        evmcs_rsp = static_branch_unlikely(&enable_evmcs) ?
                (unsigned long)&current_evmcs->host_rsp : 0;
 
+       if (static_branch_unlikely(&vmx_l1d_should_flush)) {
+               if (vcpu->arch.l1tf_flush_l1d)
+                       vmx_l1d_flush(vcpu);
+       }
+
        asm(
                /* Store host registers */
                "push %%" _ASM_DX "; push %%" _ASM_BP ";"
@@ -10370,10 +10539,20 @@ free_vcpu:
        return ERR_PTR(err);
 }
 
+#define L1TF_MSG "SMT enabled with L1TF CPU bug present. Refer to CVE-2018-3620 for details.\n"
+
 static int vmx_vm_init(struct kvm *kvm)
 {
        if (!ple_gap)
                kvm->arch.pause_in_guest = true;
+
+       if (boot_cpu_has(X86_BUG_L1TF) && cpu_smt_control == CPU_SMT_ENABLED) {
+               if (nosmt) {
+                       pr_err(L1TF_MSG);
+                       return -EOPNOTSUPP;
+               }
+               pr_warn(L1TF_MSG);
+       }
        return 0;
 }
 
@@ -11227,10 +11406,10 @@ static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
         * Set the MSR load/store lists to match L0's settings.
         */
        vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
-       vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
-       vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
-       vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
-       vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
+       vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
+       vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
+       vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
+       vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
 
        set_cr4_guest_host_mask(vmx);
 
@@ -11811,6 +11990,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
                return ret;
        }
 
+       /* Hide L1D cache contents from the nested guest.  */
+       vmx->vcpu.arch.l1tf_flush_l1d = true;
+
        /*
         * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
         * by event injection, halt vcpu.
@@ -12331,8 +12513,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
        vmx_segment_cache_clear(vmx);
 
        /* Update any VMCS fields that might have changed while L2 ran */
-       vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
-       vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
+       vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
+       vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
        vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
        if (vmx->hv_deadline_tsc == -1)
                vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
@@ -13049,6 +13231,34 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
        .enable_smi_window = enable_smi_window,
 };
 
+static int __init vmx_setup_l1d_flush(void)
+{
+       struct page *page;
+
+       if (vmentry_l1d_flush == VMENTER_L1D_FLUSH_NEVER ||
+           !boot_cpu_has_bug(X86_BUG_L1TF) ||
+           vmx_l1d_use_msr_save_list())
+               return 0;
+
+       if (!boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+               page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
+               if (!page)
+                       return -ENOMEM;
+               vmx_l1d_flush_pages = page_address(page);
+       }
+
+       static_branch_enable(&vmx_l1d_should_flush);
+       return 0;
+}
+
+static void vmx_free_l1d_flush_pages(void)
+{
+       if (vmx_l1d_flush_pages) {
+               free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
+               vmx_l1d_flush_pages = NULL;
+       }
+}
+
 static int __init vmx_init(void)
 {
        int r;
@@ -13082,11 +13292,17 @@ static int __init vmx_init(void)
        }
 #endif
 
-       r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
-                     __alignof__(struct vcpu_vmx), THIS_MODULE);
+       r = vmx_setup_l1d_flush();
        if (r)
                return r;
 
+       r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
+                    __alignof__(struct vcpu_vmx), THIS_MODULE);
+       if (r) {
+               vmx_free_l1d_flush_pages();
+               return r;
+       }
+
 #ifdef CONFIG_KEXEC_CORE
        rcu_assign_pointer(crash_vmclear_loaded_vmcss,
                           crash_vmclear_local_loaded_vmcss);
@@ -13127,6 +13343,7 @@ static void __exit vmx_exit(void)
                static_branch_disable(&enable_evmcs);
        }
 #endif
+       vmx_free_l1d_flush_pages();
 }
 
 module_init(vmx_init)