KVM: VMX: Improve error reporting during invalid guest state emulation
[linux-2.6-microblaze.git] / arch / x86 / kvm / vmx.c
index 4ff0ab9..c61eb34 100644 (file)
@@ -27,6 +27,7 @@
 #include <linux/highmem.h>
 #include <linux/sched.h>
 #include <linux/moduleparam.h>
+#include <linux/mod_devicetable.h>
 #include <linux/ftrace_event.h>
 #include <linux/slab.h>
 #include <linux/tboot.h>
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
+static const struct x86_cpu_id vmx_cpu_id[] = {
+       X86_FEATURE_MATCH(X86_FEATURE_VMX),
+       {}
+};
+MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
+
 static bool __read_mostly enable_vpid = 1;
 module_param_named(vpid, enable_vpid, bool, 0444);
 
@@ -64,6 +71,9 @@ static bool __read_mostly enable_unrestricted_guest = 1;
 module_param_named(unrestricted_guest,
                        enable_unrestricted_guest, bool, S_IRUGO);
 
+static bool __read_mostly enable_ept_ad_bits = 1;
+module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
+
 static bool __read_mostly emulate_invalid_guest_state = 0;
 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 
@@ -386,6 +396,9 @@ struct vcpu_vmx {
        struct {
                int           loaded;
                u16           fs_sel, gs_sel, ldt_sel;
+#ifdef CONFIG_X86_64
+               u16           ds_sel, es_sel;
+#endif
                int           gs_ldt_reload_needed;
                int           fs_reload_needed;
        } host_state;
@@ -605,6 +618,10 @@ static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
 static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
+static void vmx_set_segment(struct kvm_vcpu *vcpu,
+                           struct kvm_segment *var, int seg);
+static void vmx_get_segment(struct kvm_vcpu *vcpu,
+                           struct kvm_segment *var, int seg);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -779,6 +796,11 @@ static inline bool cpu_has_vmx_ept_4levels(void)
        return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
 }
 
+static inline bool cpu_has_vmx_ept_ad_bits(void)
+{
+       return vmx_capability.ept & VMX_EPT_AD_BIT;
+}
+
 static inline bool cpu_has_vmx_invept_individual_addr(void)
 {
        return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
@@ -1410,6 +1432,11 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
                vmx->host_state.gs_ldt_reload_needed = 1;
        }
 
+#ifdef CONFIG_X86_64
+       savesegment(ds, vmx->host_state.ds_sel);
+       savesegment(es, vmx->host_state.es_sel);
+#endif
+
 #ifdef CONFIG_X86_64
        vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
        vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
@@ -1450,6 +1477,19 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
        }
        if (vmx->host_state.fs_reload_needed)
                loadsegment(fs, vmx->host_state.fs_sel);
+#ifdef CONFIG_X86_64
+       if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) {
+               loadsegment(ds, vmx->host_state.ds_sel);
+               loadsegment(es, vmx->host_state.es_sel);
+       }
+#else
+       /*
+        * The sysexit path does not restore ds/es, so we must set them to
+        * a reasonable value ourselves.
+        */
+       loadsegment(ds, __USER_DS);
+       loadsegment(es, __USER_DS);
+#endif
        reload_tss();
 #ifdef CONFIG_X86_64
        wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
@@ -2617,8 +2657,12 @@ static __init int hardware_setup(void)
            !cpu_has_vmx_ept_4levels()) {
                enable_ept = 0;
                enable_unrestricted_guest = 0;
+               enable_ept_ad_bits = 0;
        }
 
+       if (!cpu_has_vmx_ept_ad_bits())
+               enable_ept_ad_bits = 0;
+
        if (!cpu_has_vmx_unrestricted_guest())
                enable_unrestricted_guest = 0;
 
@@ -2742,6 +2786,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
 {
        unsigned long flags;
        struct vcpu_vmx *vmx = to_vmx(vcpu);
+       struct kvm_segment var;
 
        if (enable_unrestricted_guest)
                return;
@@ -2785,20 +2830,23 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
        if (emulate_invalid_guest_state)
                goto continue_rmode;
 
-       vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
-       vmcs_write32(GUEST_SS_LIMIT, 0xffff);
-       vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
+       vmx_get_segment(vcpu, &var, VCPU_SREG_SS);
+       vmx_set_segment(vcpu, &var, VCPU_SREG_SS);
+
+       vmx_get_segment(vcpu, &var, VCPU_SREG_CS);
+       vmx_set_segment(vcpu, &var, VCPU_SREG_CS);
+
+       vmx_get_segment(vcpu, &var, VCPU_SREG_ES);
+       vmx_set_segment(vcpu, &var, VCPU_SREG_ES);
 
-       vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
-       vmcs_write32(GUEST_CS_LIMIT, 0xffff);
-       if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
-               vmcs_writel(GUEST_CS_BASE, 0xf0000);
-       vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
+       vmx_get_segment(vcpu, &var, VCPU_SREG_DS);
+       vmx_set_segment(vcpu, &var, VCPU_SREG_DS);
 
-       fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es);
-       fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds);
-       fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs);
-       fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs);
+       vmx_get_segment(vcpu, &var, VCPU_SREG_GS);
+       vmx_set_segment(vcpu, &var, VCPU_SREG_GS);
+
+       vmx_get_segment(vcpu, &var, VCPU_SREG_FS);
+       vmx_set_segment(vcpu, &var, VCPU_SREG_FS);
 
 continue_rmode:
        kvm_mmu_reset_context(vcpu);
@@ -2999,6 +3047,8 @@ static u64 construct_eptp(unsigned long root_hpa)
        /* TODO write the value reading from MSR */
        eptp = VMX_EPT_DEFAULT_MT |
                VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
+       if (enable_ept_ad_bits)
+               eptp |= VMX_EPT_AD_ENABLE_BIT;
        eptp |= (root_hpa & PAGE_MASK);
 
        return eptp;
@@ -3125,11 +3175,22 @@ static int __vmx_get_cpl(struct kvm_vcpu *vcpu)
 
 static int vmx_get_cpl(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       /*
+        * If we enter real mode with (cs.sel & 3) != 0, the normal CPL
+        * calculations fail; use the cache instead.
+        */
+       if (unlikely(vmx->emulation_required && emulate_invalid_guest_state)) {
+               return vmx->cpl;
+       }
+
        if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) {
                __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
-               to_vmx(vcpu)->cpl = __vmx_get_cpl(vcpu);
+               vmx->cpl = __vmx_get_cpl(vcpu);
        }
-       return to_vmx(vcpu)->cpl;
+
+       return vmx->cpl;
 }
 
 
@@ -3137,7 +3198,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var)
 {
        u32 ar;
 
-       if (var->unusable)
+       if (var->unusable || !var->present)
                ar = 1 << 16;
        else {
                ar = var->type & 15;
@@ -3149,8 +3210,6 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var)
                ar |= (var->db & 1) << 14;
                ar |= (var->g & 1) << 15;
        }
-       if (ar == 0) /* a 0 value means unusable */
-               ar = AR_UNUSABLE_MASK;
 
        return ar;
 }
@@ -3201,6 +3260,44 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
 
        vmcs_write32(sf->ar_bytes, ar);
        __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
+
+       /*
+        * Fix segments for real mode guests on hosts that don't have
+        * "unrestricted_guest" or have it disabled.
+        * This is done to allow migration of guests from hosts with
+        * unrestricted guest like Westmere to older hosts that don't have
+        * unrestricted guest like Nehalem.
+        */
+       if (!enable_unrestricted_guest && vmx->rmode.vm86_active) {
+               switch (seg) {
+               case VCPU_SREG_CS:
+                       vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
+                       vmcs_write32(GUEST_CS_LIMIT, 0xffff);
+                       if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
+                               vmcs_writel(GUEST_CS_BASE, 0xf0000);
+                       vmcs_write16(GUEST_CS_SELECTOR,
+                                    vmcs_readl(GUEST_CS_BASE) >> 4);
+                       break;
+               case VCPU_SREG_ES:
+                       fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es);
+                       break;
+               case VCPU_SREG_DS:
+                       fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds);
+                       break;
+               case VCPU_SREG_GS:
+                       fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs);
+                       break;
+               case VCPU_SREG_FS:
+                       fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs);
+                       break;
+               case VCPU_SREG_SS:
+                       vmcs_write16(GUEST_SS_SELECTOR,
+                                    vmcs_readl(GUEST_SS_BASE) >> 4);
+                       vmcs_write32(GUEST_SS_LIMIT, 0xffff);
+                       vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
+                       break;
+               }
+       }
 }
 
 static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
@@ -3633,8 +3730,18 @@ static void vmx_set_constant_host_state(void)
        vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
 
        vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
+#ifdef CONFIG_X86_64
+       /*
+        * Load null selectors, so we can avoid reloading them in
+        * __vmx_load_host_state(), in case userspace uses the null selectors
+        * too (the expected case).
+        */
+       vmcs_write16(HOST_DS_SELECTOR, 0);
+       vmcs_write16(HOST_ES_SELECTOR, 0);
+#else
        vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
        vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
+#endif
        vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
        vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
 
@@ -4451,7 +4558,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
                break;
        }
        vcpu->run->exit_reason = 0;
-       pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
+       vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
               (int)(exit_qualification >> 4) & 3, cr);
        return 0;
 }
@@ -4870,15 +4977,18 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
        int ret = 1;
        u32 cpu_exec_ctrl;
        bool intr_window_requested;
+       unsigned count = 130;
 
        cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
        intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
 
-       while (!guest_state_valid(vcpu)) {
-               if (intr_window_requested
-                   && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF))
+       while (!guest_state_valid(vcpu) && count-- != 0) {
+               if (intr_window_requested && vmx_interrupt_allowed(vcpu))
                        return handle_interrupt_window(&vmx->vcpu);
 
+               if (test_bit(KVM_REQ_EVENT, &vcpu->requests))
+                       return 1;
+
                err = emulate_instruction(vcpu, 0);
 
                if (err == EMULATE_DO_MMIO) {
@@ -4886,8 +4996,12 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
                        goto out;
                }
 
-               if (err != EMULATE_DONE)
+               if (err != EMULATE_DONE) {
+                       vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+                       vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+                       vcpu->run->internal.ndata = 0;
                        return 0;
+               }
 
                if (signal_pending(current))
                        goto out;
@@ -4895,7 +5009,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
                        schedule();
        }
 
-       vmx->emulation_required = 0;
+       vmx->emulation_required = !guest_state_valid(vcpu);
 out:
        return ret;
 }
@@ -6256,7 +6370,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
                }
        }
 
-       asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
        vmx->loaded_vmcs->launched = 1;
 
        vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
@@ -6343,7 +6456,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
        return &vmx->vcpu;
 
 free_vmcs:
-       free_vmcs(vmx->loaded_vmcs->vmcs);
+       free_loaded_vmcs(vmx->loaded_vmcs);
 free_msrs:
        kfree(vmx->guest_msrs);
 uninit_vcpu:
@@ -7193,23 +7306,21 @@ static int __init vmx_init(void)
        if (!vmx_io_bitmap_a)
                return -ENOMEM;
 
+       r = -ENOMEM;
+
        vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
-       if (!vmx_io_bitmap_b) {
-               r = -ENOMEM;
+       if (!vmx_io_bitmap_b)
                goto out;
-       }
 
        vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
-       if (!vmx_msr_bitmap_legacy) {
-               r = -ENOMEM;
+       if (!vmx_msr_bitmap_legacy)
                goto out1;
-       }
+
 
        vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
-       if (!vmx_msr_bitmap_longmode) {
-               r = -ENOMEM;
+       if (!vmx_msr_bitmap_longmode)
                goto out2;
-       }
+
 
        /*
         * Allow direct access to the PC debug port (it is often used for I/O
@@ -7238,8 +7349,10 @@ static int __init vmx_init(void)
        vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
 
        if (enable_ept) {
-               kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
-                               VMX_EPT_EXECUTABLE_MASK);
+               kvm_mmu_set_mask_ptes(0ull,
+                       (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
+                       (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull,
+                       0ull, VMX_EPT_EXECUTABLE_MASK);
                ept_set_mmio_spte_mask();
                kvm_enable_tdp();
        } else