diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 60ee3d9..31d7fa4 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -128,6 +128,25 @@ static void kvm_s2_free_pages_exact(void *virt, size_t size)
        free_pages_exact(virt, size);
 }
 
+static struct kvm_pgtable_mm_ops kvm_s2_mm_ops;
+
+static void stage2_free_removed_table_rcu_cb(struct rcu_head *head)
+{
+       struct page *page = container_of(head, struct page, rcu_head);
+       void *pgtable = page_to_virt(page);
+       u32 level = page_private(page);
+
+       kvm_pgtable_stage2_free_removed(&kvm_s2_mm_ops, pgtable, level);
+}
+
+static void stage2_free_removed_table(void *addr, u32 level)
+{
+       struct page *page = virt_to_page(addr);
+
+       set_page_private(page, (unsigned long)level);
+       call_rcu(&page->rcu_head, stage2_free_removed_table_rcu_cb);
+}
+
 static void kvm_host_get_page(void *addr)
 {
        get_page(virt_to_page(addr));
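
The stage2_free_removed_table() path added above uses the stock struct-page RCU idiom: per-page metadata (here the table level) is stashed in page_private(), and the page's embedded rcu_head is handed to call_rcu() so the memory is only released once concurrent RCU-protected stage-2 walkers have finished with it. A minimal, generic sketch of that idiom, with hypothetical helper names (not KVM code):

#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/rcupdate.h>

/* Hypothetical helpers illustrating the deferral idiom above. */
static void deferred_page_free_cb(struct rcu_head *head)
{
	struct page *page = container_of(head, struct page, rcu_head);

	/* The tag stashed by the caller is still available here. */
	pr_debug("freeing page, tag %lu\n", page_private(page));
	__free_page(page);
}

static void free_page_after_rcu(struct page *page, unsigned long tag)
{
	set_page_private(page, tag);
	call_rcu(&page->rcu_head, deferred_page_free_cb);
}
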
@@ -640,8 +659,8 @@ static struct kvm_pgtable_mm_ops kvm_user_mm_ops = {
 static int get_user_mapping_size(struct kvm *kvm, u64 addr)
 {
        struct kvm_pgtable pgt = {
-               .pgd            = (kvm_pte_t *)kvm->mm->pgd,
-               .ia_bits        = VA_BITS,
+               .pgd            = (kvm_pteref_t)kvm->mm->pgd,
+               .ia_bits        = vabits_actual,
                .start_level    = (KVM_PGTABLE_MAX_LEVELS -
                                   CONFIG_PGTABLE_LEVELS),
                .mm_ops         = &kvm_user_mm_ops,
@@ -662,6 +681,7 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
        .zalloc_page            = stage2_memcache_zalloc_page,
        .zalloc_pages_exact     = kvm_s2_zalloc_pages_exact,
        .free_pages_exact       = kvm_s2_free_pages_exact,
+       .free_removed_table     = stage2_free_removed_table,
        .get_page               = kvm_host_get_page,
        .put_page               = kvm_s2_put_page,
        .page_count             = kvm_host_page_count,
@@ -675,15 +695,42 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
  * kvm_init_stage2_mmu - Initialise a S2 MMU structure
  * @kvm:       The pointer to the KVM structure
  * @mmu:       The pointer to the s2 MMU structure
+ * @type:      The machine type of the virtual machine
  *
  * Allocates only the stage-2 HW PGD level table(s).
  * Note we don't need locking here as this is only called when the VM is
  * created, which can only be done once.
  */
-int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
+int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type)
 {
+       u32 kvm_ipa_limit = get_kvm_ipa_limit();
        int cpu, err;
        struct kvm_pgtable *pgt;
+       u64 mmfr0, mmfr1;
+       u32 phys_shift;
+
+       if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK)
+               return -EINVAL;
+
+       phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type);
+       if (is_protected_kvm_enabled()) {
+               phys_shift = kvm_ipa_limit;
+       } else if (phys_shift) {
+               if (phys_shift > kvm_ipa_limit ||
+                   phys_shift < ARM64_MIN_PARANGE_BITS)
+                       return -EINVAL;
+       } else {
+               phys_shift = KVM_PHYS_SHIFT;
+               if (phys_shift > kvm_ipa_limit) {
+                       pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n",
+                                    current->comm);
+                       return -EINVAL;
+               }
+       }
+
+       mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
+       mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
+       kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
 
        if (mmu->pgt != NULL) {
                kvm_err("kvm_arch already initialized?\n");
@@ -807,6 +854,32 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
        }
 }
 
+static void hyp_mc_free_fn(void *addr, void *unused)
+{
+       free_page((unsigned long)addr);
+}
+
+static void *hyp_mc_alloc_fn(void *unused)
+{
+       return (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
+}
+
+void free_hyp_memcache(struct kvm_hyp_memcache *mc)
+{
+       if (is_protected_kvm_enabled())
+               __free_hyp_memcache(mc, hyp_mc_free_fn,
+                                   kvm_host_va, NULL);
+}
+
+int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages)
+{
+       if (!is_protected_kvm_enabled())
+               return 0;
+
+       return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn,
+                                   kvm_host_pa, NULL);
+}
+
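
The new free_hyp_memcache()/topup_hyp_memcache() helpers are deliberately no-ops outside protected mode; under pKVM a caller pre-allocates accounted host pages before a hypercall that needs to consume memory, then returns whatever is left at teardown. A hypothetical caller, just to show the intended shape of the API (the hypercall step and the page count are made up):

#include <asm/kvm_host.h>	/* struct kvm_hyp_memcache and the helpers */

static int hypothetical_hyp_op(struct kvm_hyp_memcache *mc)
{
	int ret = topup_hyp_memcache(mc, 4);

	if (ret)
		return ret;

	/* ... hypercall that pops pages from @mc as it needs them ... */

	return 0;
}

static void hypothetical_hyp_op_teardown(struct kvm_hyp_memcache *mc)
{
	free_hyp_memcache(mc);	/* hand unused pages back to the host */
}
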
 /**
  * kvm_phys_addr_ioremap - map a device range to guest IPA
  *
@@ -841,7 +914,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
 
                write_lock(&kvm->mmu_lock);
                ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
-                                            &cache);
+                                            &cache, 0);
                write_unlock(&kvm->mmu_lock);
                if (ret)
                        break;
@@ -1091,32 +1164,26 @@ static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva)
  * - mmap_lock protects between a VM faulting a page in and the VMM performing
  *   an mprotect() to add VM_MTE
  */
-static int sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
-                            unsigned long size)
+static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
+                             unsigned long size)
 {
        unsigned long i, nr_pages = size >> PAGE_SHIFT;
-       struct page *page;
+       struct page *page = pfn_to_page(pfn);
 
        if (!kvm_has_mte(kvm))
-               return 0;
-
-       /*
-        * pfn_to_online_page() is used to reject ZONE_DEVICE pages
-        * that may not support tags.
-        */
-       page = pfn_to_online_page(pfn);
-
-       if (!page)
-               return -EFAULT;
+               return;
 
        for (i = 0; i < nr_pages; i++, page++) {
-               if (!test_bit(PG_mte_tagged, &page->flags)) {
+               if (try_page_mte_tagging(page)) {
                        mte_clear_page_tags(page_address(page));
-                       set_bit(PG_mte_tagged, &page->flags);
+                       set_page_mte_tagged(page);
                }
        }
+}
 
-       return 0;
+static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
+{
+       return vma->vm_flags & VM_MTE_ALLOWED;
 }
 
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
@@ -1127,7 +1194,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
        bool write_fault, writable, force_pte = false;
        bool exec_fault;
        bool device = false;
-       bool shared;
        unsigned long mmu_seq;
        struct kvm *kvm = vcpu->kvm;
        struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
@@ -1136,7 +1202,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
        gfn_t gfn;
        kvm_pfn_t pfn;
        bool logging_active = memslot_is_logging(memslot);
-       bool use_read_lock = false;
        unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu);
        unsigned long vma_pagesize, fault_granule;
        enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
@@ -1171,14 +1236,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
        if (logging_active) {
                force_pte = true;
                vma_shift = PAGE_SHIFT;
-               use_read_lock = (fault_status == FSC_PERM && write_fault &&
-                                fault_granule == PAGE_SIZE);
        } else {
                vma_shift = get_vma_page_shift(vma, hva);
        }
 
-       shared = (vma->vm_flags & VM_SHARED);
-
        switch (vma_shift) {
 #ifndef __PAGETABLE_PMD_FOLDED
        case PUD_SHIFT:
@@ -1239,7 +1300,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
         */
        smp_rmb();
 
-       pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
+       pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
                                   write_fault, &writable, NULL);
        if (pfn == KVM_PFN_ERR_HWPOISON) {
                kvm_send_hwpoison_signal(hva, vma_shift);
@@ -1271,15 +1332,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
        if (exec_fault && device)
                return -ENOEXEC;
 
-       /*
-        * To reduce MMU contentions and enhance concurrency during dirty
-        * logging dirty logging, only acquire read lock for permission
-        * relaxation.
-        */
-       if (use_read_lock)
-               read_lock(&kvm->mmu_lock);
-       else
-               write_lock(&kvm->mmu_lock);
+       read_lock(&kvm->mmu_lock);
        pgt = vcpu->arch.hw_mmu->pgt;
        if (mmu_invalidate_retry(kvm, mmu_seq))
                goto out_unlock;
@@ -1298,13 +1351,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
        }
 
        if (fault_status != FSC_PERM && !device && kvm_has_mte(kvm)) {
-               /* Check the VMM hasn't introduced a new VM_SHARED VMA */
-               if (!shared)
-                       ret = sanitise_mte_tags(kvm, pfn, vma_pagesize);
-               else
+               /* Check the VMM hasn't introduced a new disallowed VMA */
+               if (kvm_vma_mte_allowed(vma)) {
+                       sanitise_mte_tags(kvm, pfn, vma_pagesize);
+               } else {
                        ret = -EFAULT;
-               if (ret)
                        goto out_unlock;
+               }
        }
 
        if (writable)
@@ -1323,15 +1376,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
         * permissions only if vma_pagesize equals fault_granule. Otherwise,
         * kvm_pgtable_stage2_map() should be called to change block size.
         */
-       if (fault_status == FSC_PERM && vma_pagesize == fault_granule) {
+       if (fault_status == FSC_PERM && vma_pagesize == fault_granule)
                ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
-       } else {
-               WARN_ONCE(use_read_lock, "Attempted stage-2 map outside of write lock\n");
-
+       else
                ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
                                             __pfn_to_phys(pfn), prot,
-                                            memcache);
-       }
+                                            memcache, KVM_PGTABLE_WALK_SHARED);
 
        /* Mark the page dirty only if the fault is handled successfully */
        if (writable && !ret) {
@@ -1340,10 +1390,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
        }
 
 out_unlock:
-       if (use_read_lock)
-               read_unlock(&kvm->mmu_lock);
-       else
-               write_unlock(&kvm->mmu_lock);
+       read_unlock(&kvm->mmu_lock);
        kvm_set_pfn_accessed(pfn);
        kvm_release_pfn_clean(pfn);
        return ret != -EAGAIN ? ret : 0;
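
With the whole fault path now under the read lock and the map call flagged KVM_PGTABLE_WALK_SHARED, correctness depends on the walker installing PTEs atomically and reporting a lost race as -EAGAIN, which the `return ret != -EAGAIN ? ret : 0` above turns into "let the vCPU take the fault again". A rough sketch of that install contract, purely illustrative (not the actual pgtable code):

#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/errno.h>

/* Publish @new only if the PTE still holds @old; lose the race gracefully. */
static int try_install_pte(u64 *ptep, u64 old, u64 new)
{
	if (cmpxchg64_relaxed(ptep, old, new) != old)
		return -EAGAIN;	/* a concurrent walker got there first */

	return 0;
}
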
@@ -1526,15 +1573,18 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
        kvm_pfn_t pfn = pte_pfn(range->pte);
-       int ret;
 
        if (!kvm->arch.mmu.pgt)
                return false;
 
        WARN_ON(range->end - range->start != 1);
 
-       ret = sanitise_mte_tags(kvm, pfn, PAGE_SIZE);
-       if (ret)
+       /*
+        * If the page isn't tagged, defer to user_mem_abort() for sanitising
+        * the MTE tags. The S2 pte should have been unmapped by
+        * mmu_notifier_invalidate_range_end().
+        */
+       if (kvm_has_mte(kvm) && !page_mte_tagged(pfn_to_page(pfn)))
                return false;
 
        /*
@@ -1549,7 +1599,7 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
         */
        kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT,
                               PAGE_SIZE, __pfn_to_phys(pfn),
-                              KVM_PGTABLE_PROT_R, NULL);
+                              KVM_PGTABLE_PROT_R, NULL, 0);
 
        return false;
 }
@@ -1618,6 +1668,8 @@ static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = {
 int kvm_mmu_init(u32 *hyp_va_bits)
 {
        int err;
+       u32 idmap_bits;
+       u32 kernel_bits;
 
        hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
        hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
@@ -1631,7 +1683,31 @@ int kvm_mmu_init(u32 *hyp_va_bits)
         */
        BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
 
-       *hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
+       /*
+        * The ID map may be configured to use an extended virtual address
+        * range. This is only the case if system RAM is out of range for the
+        * currently configured page size and VA_BITS_MIN, in which case we will
+        * also need the extended virtual range for the HYP ID map, or we won't
+        * be able to enable the EL2 MMU.
+        *
+        * However, in some cases the ID map may be configured for fewer than
+        * the number of VA bits used by the regular kernel stage 1. This
+        * happens when VA_BITS=52 and the kernel image is placed in PA space
+        * below 48 bits.
+        *
+        * At EL2, there is only one TTBR register, and we can't switch between
+        * translation tables *and* update TCR_EL2.T0SZ at the same time. Bottom
+        * line: we need to use the extended range with *both* our translation
+        * tables.
+        *
+        * So use the maximum of the idmap VA bits and the regular kernel stage
+        * 1 VA bits to assure that the hypervisor can both ID map its code page
+        * and map any kernel memory.
+        */
+       idmap_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
+       kernel_bits = vabits_actual;
+       *hyp_va_bits = max(idmap_bits, kernel_bits);
+
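
As a concrete example of the max() above: with VA_BITS=52 and the kernel image loaded below 48 bits of PA, idmap_t0sz can describe a 48-bit ID map while vabits_actual is 52, so the hypervisor must run with 52-bit virtual addresses to cover both its ID-mapped text and any kernel memory it may need to map.
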
        kvm_debug("Using %u-bit virtual addresses at EL2\n", *hyp_va_bits);
        kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
        kvm_debug("HYP VA range: %lx:%lx\n",
@@ -1740,12 +1816,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
                if (!vma)
                        break;
 
-               /*
-                * VM_SHARED mappings are not allowed with MTE to avoid races
-                * when updating the PG_mte_tagged page flag, see
-                * sanitise_mte_tags for more details.
-                */
-               if (kvm_has_mte(kvm) && vma->vm_flags & VM_SHARED) {
+               if (kvm_has_mte(kvm) && !kvm_vma_mte_allowed(vma)) {
                        ret = -EINVAL;
                        break;
                }
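
The memslot-time check above pairs with kvm_vma_mte_allowed() in the fault path: instead of refusing VM_SHARED mappings, MTE-enabled guests now simply require VMAs that carry VM_MTE_ALLOWED. A userspace sketch under the assumption of MTE-capable hardware: the VMM enables KVM_CAP_ARM_MTE on the VM and backs guest RAM with anonymous memory, which the arm64 mm code marks VM_MTE_ALLOWED; error paths are trimmed:

#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

int setup_mte_memslot(int vm_fd, __u64 gpa, __u64 size, __u32 slot)
{
	struct kvm_enable_cap cap = { .cap = KVM_CAP_ARM_MTE };
	struct kvm_userspace_memory_region region;
	void *ram;

	/* Must be enabled before any vCPU is created. */
	if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap))
		return -1;

	/* Anonymous mappings can hold tags, so the VMA is VM_MTE_ALLOWED. */
	ram = mmap(NULL, size, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (ram == MAP_FAILED)
		return -1;

	memset(&region, 0, sizeof(region));
	region.slot = slot;
	region.guest_phys_addr = gpa;
	region.memory_size = size;
	region.userspace_addr = (__u64)(unsigned long)ram;

	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}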