Merge branch 'kvm-sev-cgroup' into HEAD
[linux-2.6-microblaze.git] arch/x86/kvm/svm/nested.c
index fb204ea..540d43b 100644
@@ -29,6 +29,8 @@
 #include "lapic.h"
 #include "svm.h"
 
+#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
+
 static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
                                       struct x86_exception *fault)
 {
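
For context on the new CC() shorthand: it wraps each nested-VMRUN consistency check added below so that, when a check fails, the failing expression is reported through the kvm_nested_vmenter_failed tracepoint before the VMRUN is rejected. A rough, non-authoritative sketch of what KVM_NESTED_VMENTER_CONSISTENCY_CHECK (declared in arch/x86/kvm/x86.h, outside this diff) expands to:

	#define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check)	\
	({									\
		bool failed = (consistency_check);				\
		if (failed)							\
			trace_kvm_nested_vmenter_failed(#consistency_check, 0);\
		failed;								\
	})
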
@@ -92,12 +94,12 @@ static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
 static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
-       struct vmcb *hsave = svm->nested.hsave;
 
        WARN_ON(mmu_is_nested(vcpu));
 
        vcpu->arch.mmu = &vcpu->arch.guest_mmu;
-       kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, hsave->save.cr4, hsave->save.efer,
+       kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, svm->vmcb01.ptr->save.cr4,
+                               svm->vmcb01.ptr->save.efer,
                                svm->nested.ctl.nested_cr3);
        vcpu->arch.mmu->get_guest_pgd     = nested_svm_get_tdp_cr3;
        vcpu->arch.mmu->get_pdptr         = nested_svm_get_tdp_pdptr;
@@ -123,7 +125,7 @@ void recalc_intercepts(struct vcpu_svm *svm)
                return;
 
        c = &svm->vmcb->control;
-       h = &svm->nested.hsave->control;
+       h = &svm->vmcb01.ptr->control;
        g = &svm->nested.ctl;
 
        for (i = 0; i < MAX_INTERCEPT; i++)
@@ -213,44 +215,64 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
        return true;
 }
 
-static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
+/*
+ * Bits 11:0 of bitmap address are ignored by hardware
+ */
+static bool nested_svm_check_bitmap_pa(struct kvm_vcpu *vcpu, u64 pa, u32 size)
 {
-       struct vcpu_svm *svm = to_svm(vcpu);
+       u64 addr = PAGE_ALIGN(pa);
 
-       if (WARN_ON(!is_guest_mode(vcpu)))
-               return true;
-
-       if (!nested_svm_vmrun_msrpm(svm)) {
-               vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-               vcpu->run->internal.suberror =
-                       KVM_INTERNAL_ERROR_EMULATION;
-               vcpu->run->internal.ndata = 0;
-               return false;
-       }
-
-       return true;
+       return kvm_vcpu_is_legal_gpa(vcpu, addr) &&
+           kvm_vcpu_is_legal_gpa(vcpu, addr + size - 1);
 }
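
To make the intent of the range check concrete, a hedged worked example (the 48-bit MAXPHYADDR below is hypothetical, and MSRPM_SIZE's exact value is not defined in this patch, only assumed to span more than one page):

	/*
	 * Hypothetical guest with MAXPHYADDR = 48:
	 *
	 *   msrpm_base_pa = (1ULL << 48) - 0x1000    -> base page is a legal GPA
	 *   base + MSRPM_SIZE - 1 >= (1ULL << 48)    -> end of bitmap is not
	 *
	 * The second kvm_vcpu_is_legal_gpa() check therefore fails,
	 * nested_vmcb_check_controls() returns false, and the VMRUN is failed
	 * with SVM_EXIT_ERR (see nested_svm_vmrun() below).
	 */
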
 
-static bool nested_vmcb_check_controls(struct vmcb_control_area *control)
+static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu,
+                                      struct vmcb_control_area *control)
 {
-       if ((vmcb_is_intercept(control, INTERCEPT_VMRUN)) == 0)
+       if (CC(!vmcb_is_intercept(control, INTERCEPT_VMRUN)))
                return false;
 
-       if (control->asid == 0)
+       if (CC(control->asid == 0))
                return false;
 
-       if ((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) &&
-           !npt_enabled)
+       if (CC((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) && !npt_enabled))
+               return false;
+
+       if (CC(!nested_svm_check_bitmap_pa(vcpu, control->msrpm_base_pa,
+                                          MSRPM_SIZE)))
+               return false;
+       if (CC(!nested_svm_check_bitmap_pa(vcpu, control->iopm_base_pa,
+                                          IOPM_SIZE)))
                return false;
 
        return true;
 }
 
-static bool nested_vmcb_check_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
+static bool nested_vmcb_check_cr3_cr4(struct kvm_vcpu *vcpu,
+                                     struct vmcb_save_area *save)
 {
-       struct kvm_vcpu *vcpu = &svm->vcpu;
-       bool vmcb12_lma;
+       /*
+        * These checks are also performed by KVM_SET_SREGS,
+        * except that EFER.LMA is not checked by SVM against
+        * CR0.PG && EFER.LME.
+        */
+       if ((save->efer & EFER_LME) && (save->cr0 & X86_CR0_PG)) {
+               if (CC(!(save->cr4 & X86_CR4_PAE)) ||
+                   CC(!(save->cr0 & X86_CR0_PE)) ||
+                   CC(kvm_vcpu_is_illegal_gpa(vcpu, save->cr3)))
+                       return false;
+       }
+
+       if (CC(!kvm_is_valid_cr4(vcpu, save->cr4)))
+               return false;
+
+       return true;
+}
 
+/* Common checks that apply to both L1 and L2 state.  */
+static bool nested_vmcb_valid_sregs(struct kvm_vcpu *vcpu,
+                                   struct vmcb_save_area *save)
+{
        /*
         * FIXME: these should be done after copying the fields,
         * to avoid TOC/TOU races.  For these save area checks
@@ -258,31 +280,27 @@ static bool nested_vmcb_check_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
         * kvm_set_cr4 handle failure; EFER_SVME is an exception
         * so it is force-set later in nested_prepare_vmcb_save.
         */
-       if ((vmcb12->save.efer & EFER_SVME) == 0)
+       if (CC(!(save->efer & EFER_SVME)))
                return false;
 
-       if (((vmcb12->save.cr0 & X86_CR0_CD) == 0) && (vmcb12->save.cr0 & X86_CR0_NW))
+       if (CC((save->cr0 & X86_CR0_CD) == 0 && (save->cr0 & X86_CR0_NW)) ||
+           CC(save->cr0 & ~0xffffffffULL))
                return false;
 
-       if (!kvm_dr6_valid(vmcb12->save.dr6) || !kvm_dr7_valid(vmcb12->save.dr7))
+       if (CC(!kvm_dr6_valid(save->dr6)) || CC(!kvm_dr7_valid(save->dr7)))
                return false;
 
-       vmcb12_lma = (vmcb12->save.efer & EFER_LME) && (vmcb12->save.cr0 & X86_CR0_PG);
+       if (!nested_vmcb_check_cr3_cr4(vcpu, save))
+               return false;
 
-       if (vmcb12_lma) {
-               if (!(vmcb12->save.cr4 & X86_CR4_PAE) ||
-                   !(vmcb12->save.cr0 & X86_CR0_PE) ||
-                   kvm_vcpu_is_illegal_gpa(vcpu, vmcb12->save.cr3))
-                       return false;
-       }
-       if (!kvm_is_valid_cr4(&svm->vcpu, vmcb12->save.cr4))
+       if (CC(!kvm_valid_efer(vcpu, save->efer)))
                return false;
 
        return true;
 }
 
-static void load_nested_vmcb_control(struct vcpu_svm *svm,
-                                    struct vmcb_control_area *control)
+static void nested_load_control_from_vmcb12(struct vcpu_svm *svm,
+                                           struct vmcb_control_area *control)
 {
        copy_vmcb_control_area(&svm->nested.ctl, control);
 
@@ -294,9 +312,9 @@ static void load_nested_vmcb_control(struct vcpu_svm *svm,
 
 /*
  * Synchronize fields that are written by the processor, so that
- * they can be copied back into the nested_vmcb.
+ * they can be copied back into the vmcb12.
  */
-void sync_nested_vmcb_control(struct vcpu_svm *svm)
+void nested_sync_control_from_vmcb02(struct vcpu_svm *svm)
 {
        u32 mask;
        svm->nested.ctl.event_inj      = svm->vmcb->control.event_inj;
@@ -324,8 +342,8 @@ void sync_nested_vmcb_control(struct vcpu_svm *svm)
  * Transfer any event that L0 or L1 wanted to inject into L2 to
  * EXIT_INT_INFO.
  */
-static void nested_vmcb_save_pending_event(struct vcpu_svm *svm,
-                                          struct vmcb *vmcb12)
+static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm,
+                                               struct vmcb *vmcb12)
 {
        struct kvm_vcpu *vcpu = &svm->vcpu;
        u32 exit_int_info = 0;
@@ -369,12 +387,12 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm)
 static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
                               bool nested_npt)
 {
-       if (kvm_vcpu_is_illegal_gpa(vcpu, cr3))
+       if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3)))
                return -EINVAL;
 
        if (!nested_npt && is_pae_paging(vcpu) &&
            (cr3 != kvm_read_cr3(vcpu) || pdptrs_changed(vcpu))) {
-               if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
+               if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)))
                        return -EINVAL;
        }
 
@@ -393,15 +411,42 @@ static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
        return 0;
 }
 
-static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
+void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm)
 {
+       if (!svm->nested.vmcb02.ptr)
+               return;
+
+       /* FIXME: merge g_pat from vmcb01 and vmcb12.  */
+       svm->nested.vmcb02.ptr->save.g_pat = svm->vmcb01.ptr->save.g_pat;
+}
+
+static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
+{
+       bool new_vmcb12 = false;
+
+       nested_vmcb02_compute_g_pat(svm);
+
        /* Load the nested guest state */
-       svm->vmcb->save.es = vmcb12->save.es;
-       svm->vmcb->save.cs = vmcb12->save.cs;
-       svm->vmcb->save.ss = vmcb12->save.ss;
-       svm->vmcb->save.ds = vmcb12->save.ds;
-       svm->vmcb->save.gdtr = vmcb12->save.gdtr;
-       svm->vmcb->save.idtr = vmcb12->save.idtr;
+       if (svm->nested.vmcb12_gpa != svm->nested.last_vmcb12_gpa) {
+               new_vmcb12 = true;
+               svm->nested.last_vmcb12_gpa = svm->nested.vmcb12_gpa;
+       }
+
+       if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) {
+               svm->vmcb->save.es = vmcb12->save.es;
+               svm->vmcb->save.cs = vmcb12->save.cs;
+               svm->vmcb->save.ss = vmcb12->save.ss;
+               svm->vmcb->save.ds = vmcb12->save.ds;
+               svm->vmcb->save.cpl = vmcb12->save.cpl;
+               vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
+       }
+
+       if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DT))) {
+               svm->vmcb->save.gdtr = vmcb12->save.gdtr;
+               svm->vmcb->save.idtr = vmcb12->save.idtr;
+               vmcb_mark_dirty(svm->vmcb, VMCB_DT);
+       }
+
        kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED);
 
        /*
@@ -413,7 +458,9 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
 
        svm_set_cr0(&svm->vcpu, vmcb12->save.cr0);
        svm_set_cr4(&svm->vcpu, vmcb12->save.cr4);
-       svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = vmcb12->save.cr2;
+
+       svm->vcpu.arch.cr2 = vmcb12->save.cr2;
+
        kvm_rax_write(&svm->vcpu, vmcb12->save.rax);
        kvm_rsp_write(&svm->vcpu, vmcb12->save.rsp);
        kvm_rip_write(&svm->vcpu, vmcb12->save.rip);
@@ -422,15 +469,41 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
        svm->vmcb->save.rax = vmcb12->save.rax;
        svm->vmcb->save.rsp = vmcb12->save.rsp;
        svm->vmcb->save.rip = vmcb12->save.rip;
-       svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1;
-       svm->vcpu.arch.dr6  = vmcb12->save.dr6 | DR6_ACTIVE_LOW;
-       svm->vmcb->save.cpl = vmcb12->save.cpl;
+
+       /* These bits will be set properly on the first execution when new_vmcb12 is true */
+       if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) {
+               svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1;
+               svm->vcpu.arch.dr6  = vmcb12->save.dr6 | DR6_ACTIVE_LOW;
+               vmcb_mark_dirty(svm->vmcb, VMCB_DR);
+       }
 }
 
-static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
+static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
 {
        const u32 mask = V_INTR_MASKING_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK;
 
+       /*
+        * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2,
+        * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes.
+        */
+
+       /*
+        * Also covers avic_vapic_bar, avic_backing_page, avic_logical_id,
+        * avic_physical_id.
+        */
+       WARN_ON(svm->vmcb01.ptr->control.int_ctl & AVIC_ENABLE_MASK);
+
+       /* Copied from vmcb01.  msrpm_base can be overwritten later.  */
+       svm->vmcb->control.nested_ctl = svm->vmcb01.ptr->control.nested_ctl;
+       svm->vmcb->control.iopm_base_pa = svm->vmcb01.ptr->control.iopm_base_pa;
+       svm->vmcb->control.msrpm_base_pa = svm->vmcb01.ptr->control.msrpm_base_pa;
+
+       /* Done at vmrun: asid.  */
+
+       /* Also overwritten later if necessary.  */
+       svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
+
+       /* nested_cr3.  */
        if (nested_npt_enabled(svm))
                nested_svm_init_mmu_context(&svm->vcpu);
 
@@ -439,7 +512,7 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
 
        svm->vmcb->control.int_ctl             =
                (svm->nested.ctl.int_ctl & ~mask) |
-               (svm->nested.hsave->control.int_ctl & mask);
+               (svm->vmcb01.ptr->control.int_ctl & mask);
 
        svm->vmcb->control.virt_ext            = svm->nested.ctl.virt_ext;
        svm->vmcb->control.int_vector          = svm->nested.ctl.int_vector;
@@ -454,17 +527,28 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
        enter_guest_mode(&svm->vcpu);
 
        /*
-        * Merge guest and host intercepts - must be called  with vcpu in
-        * guest-mode to take affect here
+        * Merge guest and host intercepts - must be called with vcpu in
+        * guest-mode to take effect.
         */
        recalc_intercepts(svm);
+}
 
-       vmcb_mark_all_dirty(svm->vmcb);
+static void nested_svm_copy_common_state(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
+{
+       /*
+        * Some VMCB state is shared between L1 and L2 and thus has to be
+        * moved at the time of nested vmrun and vmexit.
+        *
+        * VMLOAD/VMSAVE state would also belong in this category, but KVM
+        * always performs VMLOAD and VMSAVE from the VMCB01.
+        */
+       to_vmcb->save.spec_ctrl = from_vmcb->save.spec_ctrl;
 }
 
-int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa,
+int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
                         struct vmcb *vmcb12)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        int ret;
 
        trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa,
@@ -482,8 +566,14 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa,
 
 
        svm->nested.vmcb12_gpa = vmcb12_gpa;
-       nested_prepare_vmcb_control(svm);
-       nested_prepare_vmcb_save(svm, vmcb12);
+
+       WARN_ON(svm->vmcb == svm->nested.vmcb02.ptr);
+
+       nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr);
+
+       svm_switch_vmcb(svm, &svm->nested.vmcb02);
+       nested_vmcb02_prepare_control(svm);
+       nested_vmcb02_prepare_save(svm, vmcb12);
 
        ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3,
                                  nested_npt_enabled(svm));
@@ -491,47 +581,48 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa,
                return ret;
 
        if (!npt_enabled)
-               svm->vcpu.arch.mmu->inject_page_fault = svm_inject_page_fault_nested;
+               vcpu->arch.mmu->inject_page_fault = svm_inject_page_fault_nested;
 
        svm_set_gif(svm, true);
 
        return 0;
 }
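
A note for readers following the vmcb01/vmcb02 switch above: svm_switch_vmcb() is defined in svm.c and is not part of this file. As a non-authoritative sketch inferred from how it is used here (the real helper also takes care of per-VMCB housekeeping such as forcing the ASID to be refreshed), it essentially repoints the vCPU at the target VMCB:

	/* Sketch only; see svm.c for the actual definition. */
	void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
	{
		svm->current_vmcb = target_vmcb;
		svm->vmcb = target_vmcb->ptr;
	}
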
 
-int nested_svm_vmrun(struct vcpu_svm *svm)
+int nested_svm_vmrun(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_svm *svm = to_svm(vcpu);
        int ret;
        struct vmcb *vmcb12;
-       struct vmcb *hsave = svm->nested.hsave;
-       struct vmcb *vmcb = svm->vmcb;
        struct kvm_host_map map;
        u64 vmcb12_gpa;
 
-       if (is_smm(&svm->vcpu)) {
-               kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+       ++vcpu->stat.nested_run;
+
+       if (is_smm(vcpu)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
                return 1;
        }
 
        vmcb12_gpa = svm->vmcb->save.rax;
-       ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb12_gpa), &map);
+       ret = kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map);
        if (ret == -EINVAL) {
-               kvm_inject_gp(&svm->vcpu, 0);
+               kvm_inject_gp(vcpu, 0);
                return 1;
        } else if (ret) {
-               return kvm_skip_emulated_instruction(&svm->vcpu);
+               return kvm_skip_emulated_instruction(vcpu);
        }
 
-       ret = kvm_skip_emulated_instruction(&svm->vcpu);
+       ret = kvm_skip_emulated_instruction(vcpu);
 
        vmcb12 = map.hva;
 
        if (WARN_ON_ONCE(!svm->nested.initialized))
                return -EINVAL;
 
-       load_nested_vmcb_control(svm, &vmcb12->control);
+       nested_load_control_from_vmcb12(svm, &vmcb12->control);
 
-       if (!nested_vmcb_check_save(svm, vmcb12) ||
-           !nested_vmcb_check_controls(&svm->nested.ctl)) {
+       if (!nested_vmcb_valid_sregs(vcpu, &vmcb12->save) ||
+           !nested_vmcb_check_controls(vcpu, &svm->nested.ctl)) {
                vmcb12->control.exit_code    = SVM_EXIT_ERR;
                vmcb12->control.exit_code_hi = 0;
                vmcb12->control.exit_info_1  = 0;
@@ -541,36 +632,25 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
 
 
        /* Clear internal status */
-       kvm_clear_exception_queue(&svm->vcpu);
-       kvm_clear_interrupt_queue(&svm->vcpu);
+       kvm_clear_exception_queue(vcpu);
+       kvm_clear_interrupt_queue(vcpu);
 
        /*
-        * Save the old vmcb, so we don't need to pick what we save, but can
-        * restore everything when a VMEXIT occurs
+        * Since vmcb01 is not in use, we can use it to store some of the L1
+        * state.
         */
-       hsave->save.es     = vmcb->save.es;
-       hsave->save.cs     = vmcb->save.cs;
-       hsave->save.ss     = vmcb->save.ss;
-       hsave->save.ds     = vmcb->save.ds;
-       hsave->save.gdtr   = vmcb->save.gdtr;
-       hsave->save.idtr   = vmcb->save.idtr;
-       hsave->save.efer   = svm->vcpu.arch.efer;
-       hsave->save.cr0    = kvm_read_cr0(&svm->vcpu);
-       hsave->save.cr4    = svm->vcpu.arch.cr4;
-       hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
-       hsave->save.rip    = kvm_rip_read(&svm->vcpu);
-       hsave->save.rsp    = vmcb->save.rsp;
-       hsave->save.rax    = vmcb->save.rax;
-       if (npt_enabled)
-               hsave->save.cr3    = vmcb->save.cr3;
-       else
-               hsave->save.cr3    = kvm_read_cr3(&svm->vcpu);
-
-       copy_vmcb_control_area(&hsave->control, &vmcb->control);
+       svm->vmcb01.ptr->save.efer   = vcpu->arch.efer;
+       svm->vmcb01.ptr->save.cr0    = kvm_read_cr0(vcpu);
+       svm->vmcb01.ptr->save.cr4    = vcpu->arch.cr4;
+       svm->vmcb01.ptr->save.rflags = kvm_get_rflags(vcpu);
+       svm->vmcb01.ptr->save.rip    = kvm_rip_read(vcpu);
+
+       if (!npt_enabled)
+               svm->vmcb01.ptr->save.cr3 = kvm_read_cr3(vcpu);
 
        svm->nested.nested_run_pending = 1;
 
-       if (enter_svm_guest_mode(svm, vmcb12_gpa, vmcb12))
+       if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12))
                goto out_exit_err;
 
        if (nested_svm_vmrun_msrpm(svm))
@@ -587,7 +667,7 @@ out_exit_err:
        nested_svm_vmexit(svm);
 
 out:
-       kvm_vcpu_unmap(&svm->vcpu, &map, true);
+       kvm_vcpu_unmap(vcpu, &map, true);
 
        return ret;
 }
@@ -610,27 +690,30 @@ void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
 
 int nested_svm_vmexit(struct vcpu_svm *svm)
 {
-       int rc;
+       struct kvm_vcpu *vcpu = &svm->vcpu;
        struct vmcb *vmcb12;
-       struct vmcb *hsave = svm->nested.hsave;
        struct vmcb *vmcb = svm->vmcb;
        struct kvm_host_map map;
+       int rc;
 
-       rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
+       /* Triple faults in L2 should never escape. */
+       WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu));
+
+       rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
        if (rc) {
                if (rc == -EINVAL)
-                       kvm_inject_gp(&svm->vcpu, 0);
+                       kvm_inject_gp(vcpu, 0);
                return 1;
        }
 
        vmcb12 = map.hva;
 
        /* Exit Guest-Mode */
-       leave_guest_mode(&svm->vcpu);
+       leave_guest_mode(vcpu);
        svm->nested.vmcb12_gpa = 0;
        WARN_ON_ONCE(svm->nested.nested_run_pending);
 
-       kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu);
+       kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
 
        /* in case we halted in L2 */
        svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -644,14 +727,14 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
        vmcb12->save.gdtr   = vmcb->save.gdtr;
        vmcb12->save.idtr   = vmcb->save.idtr;
        vmcb12->save.efer   = svm->vcpu.arch.efer;
-       vmcb12->save.cr0    = kvm_read_cr0(&svm->vcpu);
-       vmcb12->save.cr3    = kvm_read_cr3(&svm->vcpu);
+       vmcb12->save.cr0    = kvm_read_cr0(vcpu);
+       vmcb12->save.cr3    = kvm_read_cr3(vcpu);
        vmcb12->save.cr2    = vmcb->save.cr2;
        vmcb12->save.cr4    = svm->vcpu.arch.cr4;
-       vmcb12->save.rflags = kvm_get_rflags(&svm->vcpu);
-       vmcb12->save.rip    = kvm_rip_read(&svm->vcpu);
-       vmcb12->save.rsp    = kvm_rsp_read(&svm->vcpu);
-       vmcb12->save.rax    = kvm_rax_read(&svm->vcpu);
+       vmcb12->save.rflags = kvm_get_rflags(vcpu);
+       vmcb12->save.rip    = kvm_rip_read(vcpu);
+       vmcb12->save.rsp    = kvm_rsp_read(vcpu);
+       vmcb12->save.rax    = kvm_rax_read(vcpu);
        vmcb12->save.dr7    = vmcb->save.dr7;
        vmcb12->save.dr6    = svm->vcpu.arch.dr6;
        vmcb12->save.cpl    = vmcb->save.cpl;
@@ -663,7 +746,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
        vmcb12->control.exit_info_2       = vmcb->control.exit_info_2;
 
        if (vmcb12->control.exit_code != SVM_EXIT_ERR)
-               nested_vmcb_save_pending_event(svm, vmcb12);
+               nested_save_pending_event_to_vmcb12(svm, vmcb12);
 
        if (svm->nrips_enabled)
                vmcb12->control.next_rip  = vmcb->control.next_rip;
@@ -678,37 +761,39 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
        vmcb12->control.pause_filter_thresh =
                svm->vmcb->control.pause_filter_thresh;
 
-       /* Restore the original control entries */
-       copy_vmcb_control_area(&vmcb->control, &hsave->control);
+       nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
+
+       svm_switch_vmcb(svm, &svm->vmcb01);
+       WARN_ON_ONCE(svm->vmcb->control.exit_code != SVM_EXIT_VMRUN);
 
-       /* On vmexit the  GIF is set to false */
+       /*
+        * On vmexit the GIF is set to false and
+        * no event can be injected in L1.
+        */
        svm_set_gif(svm, false);
+       svm->vmcb->control.exit_int_info = 0;
 
-       svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset =
-               svm->vcpu.arch.l1_tsc_offset;
+       svm->vcpu.arch.tsc_offset = svm->vcpu.arch.l1_tsc_offset;
+       if (svm->vmcb->control.tsc_offset != svm->vcpu.arch.tsc_offset) {
+               svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset;
+               vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+       }
 
        svm->nested.ctl.nested_cr3 = 0;
 
-       /* Restore selected save entries */
-       svm->vmcb->save.es = hsave->save.es;
-       svm->vmcb->save.cs = hsave->save.cs;
-       svm->vmcb->save.ss = hsave->save.ss;
-       svm->vmcb->save.ds = hsave->save.ds;
-       svm->vmcb->save.gdtr = hsave->save.gdtr;
-       svm->vmcb->save.idtr = hsave->save.idtr;
-       kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
-       kvm_set_rflags(&svm->vcpu, hsave->save.rflags | X86_EFLAGS_FIXED);
-       svm_set_efer(&svm->vcpu, hsave->save.efer);
-       svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
-       svm_set_cr4(&svm->vcpu, hsave->save.cr4);
-       kvm_rax_write(&svm->vcpu, hsave->save.rax);
-       kvm_rsp_write(&svm->vcpu, hsave->save.rsp);
-       kvm_rip_write(&svm->vcpu, hsave->save.rip);
-       svm->vmcb->save.dr7 = DR7_FIXED_1;
-       svm->vmcb->save.cpl = 0;
-       svm->vmcb->control.exit_int_info = 0;
+       /*
+        * Restore processor state that had been saved in vmcb01
+        */
+       kvm_set_rflags(vcpu, svm->vmcb->save.rflags);
+       svm_set_efer(vcpu, svm->vmcb->save.efer);
+       svm_set_cr0(vcpu, svm->vmcb->save.cr0 | X86_CR0_PE);
+       svm_set_cr4(vcpu, svm->vmcb->save.cr4);
+       kvm_rax_write(vcpu, svm->vmcb->save.rax);
+       kvm_rsp_write(vcpu, svm->vmcb->save.rsp);
+       kvm_rip_write(vcpu, svm->vmcb->save.rip);
 
-       vmcb_mark_all_dirty(svm->vmcb);
+       svm->vcpu.arch.dr7 = DR7_FIXED_1;
+       kvm_update_dr7(&svm->vcpu);
 
        trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code,
                                       vmcb12->control.exit_info_1,
@@ -717,50 +802,62 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
                                       vmcb12->control.exit_int_info_err,
                                       KVM_ISA_SVM);
 
-       kvm_vcpu_unmap(&svm->vcpu, &map, true);
+       kvm_vcpu_unmap(vcpu, &map, true);
 
-       nested_svm_uninit_mmu_context(&svm->vcpu);
+       nested_svm_uninit_mmu_context(vcpu);
 
-       rc = nested_svm_load_cr3(&svm->vcpu, hsave->save.cr3, false);
+       rc = nested_svm_load_cr3(vcpu, svm->vmcb->save.cr3, false);
        if (rc)
                return 1;
 
-       if (npt_enabled)
-               svm->vmcb->save.cr3 = hsave->save.cr3;
-
        /*
         * Drop what we picked up for L2 via svm_complete_interrupts() so it
         * doesn't end up in L1.
         */
        svm->vcpu.arch.nmi_injected = false;
-       kvm_clear_exception_queue(&svm->vcpu);
-       kvm_clear_interrupt_queue(&svm->vcpu);
+       kvm_clear_exception_queue(vcpu);
+       kvm_clear_interrupt_queue(vcpu);
+
+       /*
+        * If we are here following the completion of a VMRUN that
+        * is being single-stepped, queue the pending #DB intercept
+        * right now so that it an be accounted for before we execute
+        * L1's next instruction.
+        */
+       if (unlikely(svm->vmcb->save.rflags & X86_EFLAGS_TF))
+               kvm_queue_exception(vcpu, DB_VECTOR);
 
        return 0;
 }
 
+static void nested_svm_triple_fault(struct kvm_vcpu *vcpu)
+{
+       nested_svm_simple_vmexit(to_svm(vcpu), SVM_EXIT_SHUTDOWN);
+}
+
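
nested_svm_simple_vmexit(), used by nested_svm_triple_fault() above and by svm_check_nested_events() below, replaces the removed nested_svm_{init,smi,nmi,intr}() wrappers. Its definition falls outside this excerpt, but judging from the bodies being deleted further down it presumably amounts to:

	/* Presumed shape of the helper, inferred from the removed wrappers. */
	static void nested_svm_simple_vmexit(struct vcpu_svm *svm, u32 exit_code)
	{
		svm->vmcb->control.exit_code   = exit_code;
		svm->vmcb->control.exit_info_1 = 0;
		svm->vmcb->control.exit_info_2 = 0;

		nested_svm_vmexit(svm);
	}
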
 int svm_allocate_nested(struct vcpu_svm *svm)
 {
-       struct page *hsave_page;
+       struct page *vmcb02_page;
 
        if (svm->nested.initialized)
                return 0;
 
-       hsave_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
-       if (!hsave_page)
+       vmcb02_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+       if (!vmcb02_page)
                return -ENOMEM;
-       svm->nested.hsave = page_address(hsave_page);
+       svm->nested.vmcb02.ptr = page_address(vmcb02_page);
+       svm->nested.vmcb02.pa = __sme_set(page_to_pfn(vmcb02_page) << PAGE_SHIFT);
 
        svm->nested.msrpm = svm_vcpu_alloc_msrpm();
        if (!svm->nested.msrpm)
-               goto err_free_hsave;
+               goto err_free_vmcb02;
        svm_vcpu_init_msrpm(&svm->vcpu, svm->nested.msrpm);
 
        svm->nested.initialized = true;
        return 0;
 
-err_free_hsave:
-       __free_page(hsave_page);
+err_free_vmcb02:
+       __free_page(vmcb02_page);
        return -ENOMEM;
 }
 
@@ -772,8 +869,8 @@ void svm_free_nested(struct vcpu_svm *svm)
        svm_vcpu_free_msrpm(svm->nested.msrpm);
        svm->nested.msrpm = NULL;
 
-       __free_page(virt_to_page(svm->nested.hsave));
-       svm->nested.hsave = NULL;
+       __free_page(virt_to_page(svm->nested.vmcb02.ptr));
+       svm->nested.vmcb02.ptr = NULL;
 
        svm->nested.initialized = false;
 }
@@ -783,18 +880,19 @@ void svm_free_nested(struct vcpu_svm *svm)
  */
 void svm_leave_nested(struct vcpu_svm *svm)
 {
-       if (is_guest_mode(&svm->vcpu)) {
-               struct vmcb *hsave = svm->nested.hsave;
-               struct vmcb *vmcb = svm->vmcb;
+       struct kvm_vcpu *vcpu = &svm->vcpu;
 
+       if (is_guest_mode(vcpu)) {
                svm->nested.nested_run_pending = 0;
-               leave_guest_mode(&svm->vcpu);
-               copy_vmcb_control_area(&vmcb->control, &hsave->control);
-               nested_svm_uninit_mmu_context(&svm->vcpu);
+               leave_guest_mode(vcpu);
+
+               svm_switch_vmcb(svm, &svm->nested.vmcb02);
+
+               nested_svm_uninit_mmu_context(vcpu);
                vmcb_mark_all_dirty(svm->vmcb);
        }
 
-       kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu);
+       kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
 }
 
 static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
@@ -903,16 +1001,15 @@ int nested_svm_exit_handled(struct vcpu_svm *svm)
        return vmexit;
 }
 
-int nested_svm_check_permissions(struct vcpu_svm *svm)
+int nested_svm_check_permissions(struct kvm_vcpu *vcpu)
 {
-       if (!(svm->vcpu.arch.efer & EFER_SVME) ||
-           !is_paging(&svm->vcpu)) {
-               kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+       if (!(vcpu->arch.efer & EFER_SVME) || !is_paging(vcpu)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
                return 1;
        }
 
-       if (svm->vmcb->save.cpl) {
-               kvm_inject_gp(&svm->vcpu, 0);
+       if (to_svm(vcpu)->vmcb->save.cpl) {
+               kvm_inject_gp(vcpu, 0);
                return 1;
        }
 
@@ -960,50 +1057,11 @@ static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm)
        nested_svm_vmexit(svm);
 }
 
-static void nested_svm_smi(struct vcpu_svm *svm)
-{
-       svm->vmcb->control.exit_code = SVM_EXIT_SMI;
-       svm->vmcb->control.exit_info_1 = 0;
-       svm->vmcb->control.exit_info_2 = 0;
-
-       nested_svm_vmexit(svm);
-}
-
-static void nested_svm_nmi(struct vcpu_svm *svm)
-{
-       svm->vmcb->control.exit_code = SVM_EXIT_NMI;
-       svm->vmcb->control.exit_info_1 = 0;
-       svm->vmcb->control.exit_info_2 = 0;
-
-       nested_svm_vmexit(svm);
-}
-
-static void nested_svm_intr(struct vcpu_svm *svm)
-{
-       trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
-
-       svm->vmcb->control.exit_code   = SVM_EXIT_INTR;
-       svm->vmcb->control.exit_info_1 = 0;
-       svm->vmcb->control.exit_info_2 = 0;
-
-       nested_svm_vmexit(svm);
-}
-
 static inline bool nested_exit_on_init(struct vcpu_svm *svm)
 {
        return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INIT);
 }
 
-static void nested_svm_init(struct vcpu_svm *svm)
-{
-       svm->vmcb->control.exit_code   = SVM_EXIT_INIT;
-       svm->vmcb->control.exit_info_1 = 0;
-       svm->vmcb->control.exit_info_2 = 0;
-
-       nested_svm_vmexit(svm);
-}
-
-
 static int svm_check_nested_events(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -1017,12 +1075,18 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
                        return -EBUSY;
                if (!nested_exit_on_init(svm))
                        return 0;
-               nested_svm_init(svm);
+               nested_svm_simple_vmexit(svm, SVM_EXIT_INIT);
                return 0;
        }
 
        if (vcpu->arch.exception.pending) {
-               if (block_nested_events)
+               /*
+                * Only a pending nested run can block a pending exception.
+                * Otherwise an injected NMI/interrupt should either be
+                * lost or delivered to the nested hypervisor in the EXITINTINFO
+                * vmcb field, while delivering the pending exception.
+                */
+               if (svm->nested.nested_run_pending)
                         return -EBUSY;
                if (!nested_exit_on_exception(svm))
                        return 0;
@@ -1035,7 +1099,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
                        return -EBUSY;
                if (!nested_exit_on_smi(svm))
                        return 0;
-               nested_svm_smi(svm);
+               nested_svm_simple_vmexit(svm, SVM_EXIT_SMI);
                return 0;
        }
 
@@ -1044,7 +1108,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
                        return -EBUSY;
                if (!nested_exit_on_nmi(svm))
                        return 0;
-               nested_svm_nmi(svm);
+               nested_svm_simple_vmexit(svm, SVM_EXIT_NMI);
                return 0;
        }
 
@@ -1053,7 +1117,8 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
                        return -EBUSY;
                if (!nested_exit_on_intr(svm))
                        return 0;
-               nested_svm_intr(svm);
+               trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
+               nested_svm_simple_vmexit(svm, SVM_EXIT_INTR);
                return 0;
        }
 
@@ -1072,8 +1137,8 @@ int nested_svm_exit_special(struct vcpu_svm *svm)
        case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
                u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
 
-               if (get_host_vmcb(svm)->control.intercepts[INTERCEPT_EXCEPTION] &
-                               excp_bits)
+               if (svm->vmcb01.ptr->control.intercepts[INTERCEPT_EXCEPTION] &
+                   excp_bits)
                        return NESTED_EXIT_HOST;
                else if (exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR &&
                         svm->vcpu.arch.apf.host_apf_flags)
@@ -1137,10 +1202,9 @@ static int svm_get_nested_state(struct kvm_vcpu *vcpu,
        if (copy_to_user(&user_vmcb->control, &svm->nested.ctl,
                         sizeof(user_vmcb->control)))
                return -EFAULT;
-       if (copy_to_user(&user_vmcb->save, &svm->nested.hsave->save,
+       if (copy_to_user(&user_vmcb->save, &svm->vmcb01.ptr->save,
                         sizeof(user_vmcb->save)))
                return -EFAULT;
-
 out:
        return kvm_state.size;
 }
@@ -1150,7 +1214,6 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
                                struct kvm_nested_state *kvm_state)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
-       struct vmcb *hsave = svm->nested.hsave;
        struct vmcb __user *user_vmcb = (struct vmcb __user *)
                &user_kvm_nested_state->data.svm[0];
        struct vmcb_control_area *ctl;
@@ -1195,8 +1258,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
                return -EINVAL;
 
        ret  = -ENOMEM;
-       ctl  = kzalloc(sizeof(*ctl),  GFP_KERNEL);
-       save = kzalloc(sizeof(*save), GFP_KERNEL);
+       ctl  = kzalloc(sizeof(*ctl),  GFP_KERNEL_ACCOUNT);
+       save = kzalloc(sizeof(*save), GFP_KERNEL_ACCOUNT);
        if (!ctl || !save)
                goto out_free;
 
@@ -1207,12 +1270,12 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
                goto out_free;
 
        ret = -EINVAL;
-       if (!nested_vmcb_check_controls(ctl))
+       if (!nested_vmcb_check_controls(vcpu, ctl))
                goto out_free;
 
        /*
         * Processor state contains L2 state.  Check that it is
-        * valid for guest mode (see nested_vmcb_checks).
+        * valid for guest mode (see nested_vmcb_valid_sregs).
         */
        cr0 = kvm_read_cr0(vcpu);
         if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW))
@@ -1221,29 +1284,48 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
        /*
         * Validate host state saved from before VMRUN (see
         * nested_svm_check_permissions).
-        * TODO: validate reserved bits for all saved state.
         */
-       if (!(save->cr0 & X86_CR0_PG))
-               goto out_free;
-       if (!(save->efer & EFER_SVME))
+       if (!(save->cr0 & X86_CR0_PG) ||
+           !(save->cr0 & X86_CR0_PE) ||
+           (save->rflags & X86_EFLAGS_VM) ||
+           !nested_vmcb_valid_sregs(vcpu, save))
                goto out_free;
 
        /*
-        * All checks done, we can enter guest mode.  L1 control fields
-        * come from the nested save state.  Guest state is already
-        * in the registers, the save area of the nested state instead
-        * contains saved L1 state.
+        * All checks done, we can enter guest mode. Userspace provides
+        * vmcb12.control, which will be combined with L1 and stored into
+        * vmcb02, and the L1 save state which we store in vmcb01.
+        * L2 registers, if needed, are moved from the current VMCB to VMCB02.
         */
 
        svm->nested.nested_run_pending =
                !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
 
-       copy_vmcb_control_area(&hsave->control, &svm->vmcb->control);
-       hsave->save = *save;
-
        svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa;
-       load_nested_vmcb_control(svm, ctl);
-       nested_prepare_vmcb_control(svm);
+       if (svm->current_vmcb == &svm->vmcb01)
+               svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save;
+
+       svm->vmcb01.ptr->save.es = save->es;
+       svm->vmcb01.ptr->save.cs = save->cs;
+       svm->vmcb01.ptr->save.ss = save->ss;
+       svm->vmcb01.ptr->save.ds = save->ds;
+       svm->vmcb01.ptr->save.gdtr = save->gdtr;
+       svm->vmcb01.ptr->save.idtr = save->idtr;
+       svm->vmcb01.ptr->save.rflags = save->rflags | X86_EFLAGS_FIXED;
+       svm->vmcb01.ptr->save.efer = save->efer;
+       svm->vmcb01.ptr->save.cr0 = save->cr0;
+       svm->vmcb01.ptr->save.cr3 = save->cr3;
+       svm->vmcb01.ptr->save.cr4 = save->cr4;
+       svm->vmcb01.ptr->save.rax = save->rax;
+       svm->vmcb01.ptr->save.rsp = save->rsp;
+       svm->vmcb01.ptr->save.rip = save->rip;
+       svm->vmcb01.ptr->save.cpl = 0;
+
+       nested_load_control_from_vmcb12(svm, ctl);
+
+       svm_switch_vmcb(svm, &svm->nested.vmcb02);
+
+       nested_vmcb02_prepare_control(svm);
 
        kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
        ret = 0;
@@ -1254,8 +1336,31 @@ out_free:
        return ret;
 }
 
+static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       if (WARN_ON(!is_guest_mode(vcpu)))
+               return true;
+
+       if (nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3,
+                               nested_npt_enabled(svm)))
+               return false;
+
+       if (!nested_svm_vmrun_msrpm(svm)) {
+               vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+               vcpu->run->internal.suberror =
+                       KVM_INTERNAL_ERROR_EMULATION;
+               vcpu->run->internal.ndata = 0;
+               return false;
+       }
+
+       return true;
+}
+
 struct kvm_x86_nested_ops svm_nested_ops = {
        .check_events = svm_check_nested_events,
+       .triple_fault = nested_svm_triple_fault,
        .get_nested_state_pages = svm_get_nested_state_pages,
        .get_state = svm_get_nested_state,
        .set_state = svm_set_nested_state,