KVM: x86/mmu: Get CR4.SMEP from MMU, not vCPU, in shadow page fault

[linux-2.6-microblaze.git] / arch / x86 / kvm / mmu / paging_tmpl.h
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h

index 70b7e44..490a028 100644 (file)
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -90,8 +90,8 @@ struct guest_walker {
         gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
         pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
         bool pte_writable[PT_MAX_FULL_LEVELS];
-       unsigned pt_access;
-       unsigned pte_access;
+       unsigned int pt_access[PT_MAX_FULL_LEVELS];
+       unsigned int pte_access;
         gfn_t gfn;
         struct x86_exception fault;
  };
@@ -305,6 +305,35 @@ static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte)
         return pkeys;
  }
  
+static inline bool FNAME(is_last_gpte)(struct kvm_mmu *mmu,
+                                      unsigned int level, unsigned int gpte)
+{
+       /*
+        * For EPT and PAE paging (both variants), bit 7 is either reserved at
+        * all levels or indicates a huge page (ignoring CR3/EPTP).  In either
+        * case, bit 7 being set terminates the walk.
+        */
+#if PTTYPE == 32
+       /*
+        * 32-bit paging requires special handling because bit 7 is ignored if
+        * CR4.PSE=0, not reserved.  Clear bit 7 in the gpte if the level is
+        * greater than the last level for which bit 7 is the PAGE_SIZE bit.
+        *
+        * The RHS wraps as unsigned, so bit 7 is set iff level < (2 + PSE).
+        * If it is clear, bit 7 is not reserved and does not indicate a large
+        * page at this level, so PT_PAGE_SIZE_MASK is masked out of gpte.
+        */
+       gpte &= level - (PT32_ROOT_LEVEL + mmu->mmu_role.ext.cr4_pse);
+#endif
+       /*
+        * PG_LEVEL_4K always terminates.  The RHS wraps as unsigned, leaving
+        * bit 7 set, iff level <= PG_LEVEL_4K, which for our purpose means
+        * level == PG_LEVEL_4K; set PT_PAGE_SIZE_MASK in gpte then.
+        */
+       gpte |= level - PG_LEVEL_4K - 1;
+
+       return gpte & PT_PAGE_SIZE_MASK;
+}
  /*
   * Fetch a guest pte for a guest virtual address, or for an L2's GPA.
   */
@@ -418,13 +447,15 @@ retry_walk:
                 }
  
                 walker->ptes[walker->level - 1] = pte;
-       } while (!is_last_gpte(mmu, walker->level, pte));
+
+               /* Convert to ACC_*_MASK flags for struct guest_walker.  */
+               walker->pt_access[walker->level - 1] = FNAME(gpte_access)(pt_access ^ walk_nx_mask);
+       } while (!FNAME(is_last_gpte)(mmu, walker->level, pte));
  
         pte_pkey = FNAME(gpte_pkeys)(vcpu, pte);
         accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0;
  
         /* Convert to ACC_*_MASK flags for struct guest_walker.  */
-       walker->pt_access = FNAME(gpte_access)(pt_access ^ walk_nx_mask);
         walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask);
         errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access);
         if (unlikely(errcode))
@@ -463,13 +494,13 @@ retry_walk:
         }
  
         pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
-                __func__, (u64)pte, walker->pte_access, walker->pt_access);
+                __func__, (u64)pte, walker->pte_access,
+                walker->pt_access[walker->level - 1]);
         return 1;
  
  error:
         errcode |= write_fault | user_fault;
-       if (fetch_fault && (mmu->nx ||
-                           kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)))
+       if (fetch_fault && (is_efer_nx(mmu) || is_cr4_smep(mmu)))
                 errcode |= PFERR_FETCH_MASK;
  
         walker->fault.vector = PF_VECTOR;
@@ -643,7 +674,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
         bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
         struct kvm_mmu_page *sp = NULL;
         struct kvm_shadow_walk_iterator it;
-       unsigned direct_access, access = gw->pt_access;
+       unsigned int direct_access, access;
         int top_level, level, req_level, ret;
         gfn_t base_gfn = gw->gfn;
  
@@ -675,6 +706,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
                 sp = NULL;
                 if (!is_shadow_present_pte(*it.sptep)) {
                         table_gfn = gw->table_gfn[it.level - 2];
+                       access = gw->pt_access[it.level - 2];
                         sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1,
                                               false, access);
                 }
@@ -763,7 +795,7 @@ FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
         bool self_changed = false;
  
         if (!(walker->pte_access & ACC_WRITE_MASK ||
-             (!is_write_protection(vcpu) && !user_fault)))
+           (!is_cr0_wp(vcpu->arch.mmu) && !user_fault)))
                 return false;
  
         for (level = walker->level; level <= walker->max_level; level++) {
@@ -861,8 +893,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
          * we will cache the incorrect access into mmio spte.
          */
         if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) &&
-            !is_write_protection(vcpu) && !user_fault &&
-             !is_noslot_pfn(pfn)) {
+           !is_cr0_wp(vcpu->arch.mmu) && !user_fault && !is_noslot_pfn(pfn)) {
                 walker.pte_access |= ACC_WRITE_MASK;
                 walker.pte_access &= ~ACC_USER_MASK;
  
@@ -872,7 +903,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
                  * then we should prevent the kernel from executing it
                  * if SMEP is enabled.
                  */
-               if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
+               if (is_cr4_smep(vcpu->arch.mmu))
                         walker.pte_access &= ~ACC_EXEC_MASK;
         }
  
@@ -1027,13 +1058,36 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gpa_t vaddr,
   */
  static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
  {
+       union kvm_mmu_page_role mmu_role = vcpu->arch.mmu->mmu_role.base;
         int i, nr_present = 0;
         bool host_writable;
         gpa_t first_pte_gpa;
         int set_spte_ret = 0;
  
-       /* direct kvm_mmu_page can not be unsync. */
-       BUG_ON(sp->role.direct);
+       /*
+        * Ignore various flags when verifying that it's safe to sync a shadow
+        * page using the current MMU context.
+        *
+        *  - level: not part of the overall MMU role and will never match as the MMU's
+        *           level tracks the root level
+        *  - access: updated based on the new guest PTE
+        *  - quadrant: not part of the overall MMU role (similar to level)
+        */
+       const union kvm_mmu_page_role sync_role_ign = {
+               .level = 0xf,
+               .access = 0x7,
+               .quadrant = 0x3,
+       };
+
+       /*
+        * Direct pages can never be unsync, and KVM should never attempt to
+        * sync a shadow page for a different MMU context, e.g. if the role
+        * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the
+        * reserved bits checks will be wrong, etc...
+        */
+       if (WARN_ON_ONCE(sp->role.direct ||
+                        (sp->role.word ^ mmu_role.word) & ~sync_role_ign.word))
+               return 0;
  
         first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);