KVM: x86/mmu: Get CR4.SMEP from MMU, not vCPU, in shadow page fault

[linux-2.6-microblaze.git] / arch / x86 / kvm / mmu / paging_tmpl.h
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h

index 52fffd6..490a028 100644 (file)
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -305,6 +305,35 @@ static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte)
         return pkeys;
  }
  
+static inline bool FNAME(is_last_gpte)(struct kvm_mmu *mmu,
+                                      unsigned int level, unsigned int gpte)
+{
+       /*
+        * For EPT and PAE paging (both variants), bit 7 is either reserved at
+        * all level or indicates a huge page (ignoring CR3/EPTP).  In either
+        * case, bit 7 being set terminates the walk.
+        */
+#if PTTYPE == 32
+       /*
+        * 32-bit paging requires special handling because bit 7 is ignored if
+        * CR4.PSE=0, not reserved.  Clear bit 7 in the gpte if the level is
+        * greater than the last level for which bit 7 is the PAGE_SIZE bit.
+        *
+        * The RHS has bit 7 set iff level < (2 + PSE).  If it is clear, bit 7
+        * is not reserved and does not indicate a large page at this level,
+        * so clear PT_PAGE_SIZE_MASK in gpte if that is the case.
+        */
+       gpte &= level - (PT32_ROOT_LEVEL + mmu->mmu_role.ext.cr4_pse);
+#endif
+       /*
+        * PG_LEVEL_4K always terminates.  The RHS has bit 7 set
+        * iff level <= PG_LEVEL_4K, which for our purpose means
+        * level == PG_LEVEL_4K; set PT_PAGE_SIZE_MASK in gpte then.
+        */
+       gpte |= level - PG_LEVEL_4K - 1;
+
+       return gpte & PT_PAGE_SIZE_MASK;
+}
  /*
   * Fetch a guest pte for a guest virtual address, or for an L2's GPA.
   */
@@ -421,7 +450,7 @@ retry_walk:
  
                 /* Convert to ACC_*_MASK flags for struct guest_walker.  */
                 walker->pt_access[walker->level - 1] = FNAME(gpte_access)(pt_access ^ walk_nx_mask);
-       } while (!is_last_gpte(mmu, walker->level, pte));
+       } while (!FNAME(is_last_gpte)(mmu, walker->level, pte));
  
         pte_pkey = FNAME(gpte_pkeys)(vcpu, pte);
         accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0;
@@ -471,7 +500,7 @@ retry_walk:
  
  error:
         errcode |= write_fault | user_fault;
-       if (fetch_fault && (mmu->nx || mmu->mmu_role.ext.cr4_smep))
+       if (fetch_fault && (is_efer_nx(mmu) || is_cr4_smep(mmu)))
                 errcode |= PFERR_FETCH_MASK;
  
         walker->fault.vector = PF_VECTOR;
@@ -766,7 +795,7 @@ FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
         bool self_changed = false;
  
         if (!(walker->pte_access & ACC_WRITE_MASK ||
-             (!is_write_protection(vcpu) && !user_fault)))
+           (!is_cr0_wp(vcpu->arch.mmu) && !user_fault)))
                 return false;
  
         for (level = walker->level; level <= walker->max_level; level++) {
@@ -864,8 +893,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
          * we will cache the incorrect access into mmio spte.
          */
         if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) &&
-            !is_write_protection(vcpu) && !user_fault &&
-             !is_noslot_pfn(pfn)) {
+           !is_cr0_wp(vcpu->arch.mmu) && !user_fault && !is_noslot_pfn(pfn)) {
                 walker.pte_access |= ACC_WRITE_MASK;
                 walker.pte_access &= ~ACC_USER_MASK;
  
@@ -875,7 +903,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
                  * then we should prevent the kernel from executing it
                  * if SMEP is enabled.
                  */
-               if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
+               if (is_cr4_smep(vcpu->arch.mmu))
                         walker.pte_access &= ~ACC_EXEC_MASK;
         }
  
@@ -1030,13 +1058,36 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gpa_t vaddr,
   */
  static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
  {
+       union kvm_mmu_page_role mmu_role = vcpu->arch.mmu->mmu_role.base;
         int i, nr_present = 0;
         bool host_writable;
         gpa_t first_pte_gpa;
         int set_spte_ret = 0;
  
-       /* direct kvm_mmu_page can not be unsync. */
-       BUG_ON(sp->role.direct);
+       /*
+        * Ignore various flags when verifying that it's safe to sync a shadow
+        * page using the current MMU context.
+        *
+        *  - level: not part of the overall MMU role and will never match as the MMU's
+        *           level tracks the root level
+        *  - access: updated based on the new guest PTE
+        *  - quadrant: not part of the overall MMU role (similar to level)
+        */
+       const union kvm_mmu_page_role sync_role_ign = {
+               .level = 0xf,
+               .access = 0x7,
+               .quadrant = 0x3,
+       };
+
+       /*
+        * Direct pages can never be unsync, and KVM should never attempt to
+        * sync a shadow page for a different MMU context, e.g. if the role
+        * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the
+        * reserved bits checks will be wrong, etc...
+        */
+       if (WARN_ON_ONCE(sp->role.direct ||
+                        (sp->role.word ^ mmu_role.word) & ~sync_role_ign.word))
+               return 0;
  
         first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);