diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 60ee3d9..31d7fa4 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -128,6 +128,25 @@ static void kvm_s2_free_pages_exact(void *virt, size_t size)
        free_pages_exact(virt, size);
 }
 
+static struct kvm_pgtable_mm_ops kvm_s2_mm_ops;
+
+static void stage2_free_removed_table_rcu_cb(struct rcu_head *head)
+{
+       struct page *page = container_of(head, struct page, rcu_head);
+       void *pgtable = page_to_virt(page);
+       u32 level = page_private(page);
+
+       kvm_pgtable_stage2_free_removed(&kvm_s2_mm_ops, pgtable, level);
+}
+
+static void stage2_free_removed_table(void *addr, u32 level)
+{
+       struct page *page = virt_to_page(addr);
+
+       set_page_private(page, (unsigned long)level);
+       call_rcu(&page->rcu_head, stage2_free_removed_table_rcu_cb);
+}
+
 static void kvm_host_get_page(void *addr)
 {
        get_page(virt_to_page(addr));
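
The stage2_free_removed_table() path added above uses the stock struct-page RCU idiom: per-page metadata (here the table level) is stashed in page_private(), and the page's embedded rcu_head is handed to call_rcu() so the memory is only released once concurrent RCU-protected stage-2 walkers have finished with it. A minimal, generic sketch of that idiom, with hypothetical helper names (not KVM code):

#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/rcupdate.h>

/* Hypothetical helpers illustrating the deferral idiom above. */
static void deferred_page_free_cb(struct rcu_head *head)
{
	struct page *page = container_of(head, struct page, rcu_head);

	/* The tag stashed by the caller is still available here. */
	pr_debug("freeing page, tag %lu\n", page_private(page));
	__free_page(page);
}

static void free_page_after_rcu(struct page *page, unsigned long tag)
{
	set_page_private(page, tag);
	call_rcu(&page->rcu_head, deferred_page_free_cb);
}
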
@@ -640,8 +659,8 @@ static struct kvm_pgtable_mm_ops kvm_user_mm_ops = {
 static int get_user_mapping_size(struct kvm *kvm, u64 addr)
 {
        struct kvm_pgtable pgt = {
-               .pgd            = (kvm_pte_t *)kvm->mm->pgd,
-               .ia_bits        = VA_BITS,
+               .pgd            = (kvm_pteref_t)kvm->mm->pgd,
+               .ia_bits        = vabits_actual,
                .start_level    = (KVM_PGTABLE_MAX_LEVELS -
                                   CONFIG_PGTABLE_LEVELS),
                .mm_ops         = &kvm_user_mm_ops,
@@ -662,6 +681,7 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
        .zalloc_page            = stage2_memcache_zalloc_page,
        .zalloc_pages_exact     = kvm_s2_zalloc_pages_exact,
        .free_pages_exact       = kvm_s2_free_pages_exact,
+       .free_removed_table     = stage2_free_removed_table,
        .get_page               = kvm_host_get_page,
        .put_page               = kvm_s2_put_page,
        .page_count             = kvm_host_page_count,
@@ -675,15 +695,42 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
  * kvm_init_stage2_mmu - Initialise a S2 MMU structure
  * @kvm:       The pointer to the KVM structure
  * @mmu:       The pointer to the s2 MMU structure
+ * @type:      The machine type of the virtual machine
  *
  * Allocates only the stage-2 HW PGD level table(s).
  * Note we don't need locking here as this is only called when the VM is
  * created, which can only be done once.
  */
-int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
+int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type)
 {
+       u32 kvm_ipa_limit = get_kvm_ipa_limit();
        int cpu, err;
        struct kvm_pgtable *pgt;
+       u64 mmfr0, mmfr1;
+       u32 phys_shift;
+
+       if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK)
+               return -EINVAL;
+
+       phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type);
+       if (is_protected_kvm_enabled()) {
+               phys_shift = kvm_ipa_limit;
+       } else if (phys_shift) {
+               if (phys_shift > kvm_ipa_limit ||
+                   phys_shift < ARM64_MIN_PARANGE_BITS)
+                       return -EINVAL;
+       } else {
+               phys_shift = KVM_PHYS_SHIFT;
+               if (phys_shift > kvm_ipa_limit) {
+                       pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n",
+                                    current->comm);
+                       return -EINVAL;
+               }
+       }
+
+       mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
+       mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
+       kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
 
        if (mmu->pgt != NULL) {
                kvm_err("kvm_arch already initialized?\n");
@@ -807,6 +854,32 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
        }
 }
 
+static void hyp_mc_free_fn(void *addr, void *unused)
+{
+       free_page((unsigned long)addr);
+}
+
+static void *hyp_mc_alloc_fn(void *unused)
+{
+       return (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
+}
+
+void free_hyp_memcache(struct kvm_hyp_memcache *mc)
+{
+       if (is_protected_kvm_enabled())
+               __free_hyp_memcache(mc, hyp_mc_free_fn,
+                                   kvm_host_va, NULL);
+}
+
+int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages)
+{
+       if (!is_protected_kvm_enabled())
+               return 0;
+
+       return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn,
+                                   kvm_host_pa, NULL);
+}
+
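
The new free_hyp_memcache()/topup_hyp_memcache() helpers are deliberately no-ops outside protected mode; under pKVM a caller pre-allocates accounted host pages before a hypercall that needs to consume memory, then returns whatever is left at teardown. A hypothetical caller, just to show the intended shape of the API (the hypercall step and the page count are made up):

#include <asm/kvm_host.h>	/* struct kvm_hyp_memcache and the helpers */

static int hypothetical_hyp_op(struct kvm_hyp_memcache *mc)
{
	int ret = topup_hyp_memcache(mc, 4);

	if (ret)
		return ret;

	/* ... hypercall that pops pages from @mc as it needs them ... */

	return 0;
}

static void hypothetical_hyp_op_teardown(struct kvm_hyp_memcache *mc)
{
	free_hyp_memcache(mc);	/* hand unused pages back to the host */
}
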
 /**
  * kvm_phys_addr_ioremap - map a device range to guest IPA
  *
@@ -841,7 +914,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
 
                write_lock(&kvm->mmu_lock);
                ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
-                                            &cache);
+                                            &cache, 0);
                write_unlock(&kvm->mmu_lock);
                if (ret)
                        break;
@@ -1091,32 +1164,26 @@ static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva)
  * - mmap_lock protects between a VM faulting a page in and the VMM performing
  *   an mprotect() to add VM_MTE
  */
-static int sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
-                            unsigned long size)
+static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
+                             unsigned long size)
 {
        unsigned long i, nr_pages = size >> PAGE_SHIFT;
-       struct page *page;
+       struct page *page = pfn_to_page(pfn);
 
        if (!kvm_has_mte(kvm))
-               return 0;
-
-       /*
-        * pfn_to_online_page() is used to reject ZONE_DEVICE pages
-        * that may not support tags.
-        */
-       page = pfn_to_online_page(pfn);
-
-       if (!page)
-               return -EFAULT;
+               return;
 
        for (i = 0; i < nr_pages; i++, page++) {
-               if (!test_bit(PG_mte_tagged, &page->flags)) {
+               if (try_page_mte_tagging(page)) {
                        mte_clear_page_tags(page_address(page));
-                       set_bit(PG_mte_tagged, &page->flags);
+                       set_page_mte_tagged(page);
                }
        }
+}
 
-       return 0;
+static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
+{
+       return vma->vm_flags & VM_MTE_ALLOWED;
 }
 
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
@@ -1127,7 +1194,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
        bool write_fault, writable, force_pte = false;
        bool exec_fault;
        bool device = false;
-       bool shared;
        unsigned long mmu_seq;
        struct kvm *kvm = vcpu->kvm;
        struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
@@ -1136,7 +1202,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
        gfn_t gfn;
        kvm_pfn_t pfn;
        bool logging_active = memslot_is_logging(memslot);
-       bool use_read_lock = false;
        unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu);
        unsigned long vma_pagesize, fault_granule;
        enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
@@ -1171,14 +1236,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
        if (logging_active) {
                force_pte = true;
                vma_shift = PAGE_SHIFT;
-               use_read_lock = (fault_status == FSC_PERM && write_fault &&
-                                fault_granule == PAGE_SIZE);
        } else {
                vma_shift = get_vma_page_shift(vma, hva);
        }
 
-       shared = (vma->vm_flags & VM_SHARED);
-
        switch (vma_shift) {
 #ifndef __PAGETABLE_PMD_FOLDED
        case PUD_SHIFT:
@@ -1239,7 +1300,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
         */
        smp_rmb();
 
-       pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
+       pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
                                   write_fault, &writable, NULL);
        if (pfn == KVM_PFN_ERR_HWPOISON) {
                kvm_send_hwpoison_signal(hva, vma_shift);
@@ -1271,15 +1332,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
        if (exec_fault && device)
                return -ENOEXEC;
 
-       /*
-        * To reduce MMU contentions and enhance concurrency during dirty
-        * logging dirty logging, only acquire read lock for permission
-        * relaxation.
-        */
-       if (use_read_lock)
-               read_lock(&kvm->mmu_lock);
-       else
-               write_lock(&kvm->mmu_lock);
+       read_lock(&kvm->mmu_lock);
        pgt = vcpu->arch.hw_mmu->pgt;
        if (mmu_invalidate_retry(kvm, mmu_seq))
                goto out_unlock;
@@ -1298,13 +1351,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
        }
 
        if (fault_status != FSC_PERM && !device && kvm_has_mte(kvm)) {
-               /* Check the VMM hasn't introduced a new VM_SHARED VMA */
-               if (!shared)
-                       ret = sanitise_mte_tags(kvm, pfn, vma_pagesize);
-               else
+               /* Check the VMM hasn't introduced a new disallowed VMA */
+               if (kvm_vma_mte_allowed(vma)) {
+                       sanitise_mte_tags(kvm, pfn, vma_pagesize);
+               } else {
                        ret = -EFAULT;
-               if (ret)
                        goto out_unlock;
+               }
        }
 
        if (writable)
@@ -1323,15 +1376,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
         * permissions only if vma_pagesize equals fault_granule. Otherwise,
         * kvm_pgtable_stage2_map() should be called to change block size.
         */
-       if (fault_status == FSC_PERM && vma_pagesize == fault_granule) {
+       if (fault_status == FSC_PERM && vma_pagesize == fault_granule)
                ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
-       } else {
-               WARN_ONCE(use_read_lock, "Attempted stage-2 map outside of write lock\n");
-
+       else
                ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
                                             __pfn_to_phys(pfn), prot,
-                                            memcache);
-       }
+                                            memcache, KVM_PGTABLE_WALK_SHARED);
 
        /* Mark the page dirty only if the fault is handled successfully */
        if (writable && !ret) {
@@ -1340,10 +1390,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
        }
 
 out_unlock:
-       if (use_read_lock)
-               read_unlock(&kvm->mmu_lock);
-       else
-               write_unlock(&kvm->mmu_lock);
+       read_unlock(&kvm->mmu_lock);
        kvm_set_pfn_accessed(pfn);
        kvm_release_pfn_clean(pfn);
        return ret != -EAGAIN ? ret : 0;
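
With the whole fault path now under the read lock and the map call flagged KVM_PGTABLE_WALK_SHARED, correctness depends on the walker installing PTEs atomically and reporting a lost race as -EAGAIN, which the `return ret != -EAGAIN ? ret : 0` above turns into "let the vCPU take the fault again". A rough sketch of that install contract, purely illustrative (not the actual pgtable code):

#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/errno.h>

/* Publish @new only if the PTE still holds @old; lose the race gracefully. */
static int try_install_pte(u64 *ptep, u64 old, u64 new)
{
	if (cmpxchg64_relaxed(ptep, old, new) != old)
		return -EAGAIN;	/* a concurrent walker got there first */

	return 0;
}
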
@@ -1526,15 +1573,18 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
        kvm_pfn_t pfn = pte_pfn(range->pte);
-       int ret;
 
        if (!kvm->arch.mmu.pgt)
                return false;
 
        WARN_ON(range->end - range->start != 1);
 
-       ret = sanitise_mte_tags(kvm, pfn, PAGE_SIZE);
-       if (ret)
+       /*
+        * If the page isn't tagged, defer to user_mem_abort() for sanitising
+        * the MTE tags. The S2 pte should have been unmapped by
+        * mmu_notifier_invalidate_range_end().
+        */
+       if (kvm_has_mte(kvm) && !page_mte_tagged(pfn_to_page(pfn)))
                return false;
 
        /*
@@ -1549,7 +1599,7 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
         */
        kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT,
                               PAGE_SIZE, __pfn_to_phys(pfn),
-                              KVM_PGTABLE_PROT_R, NULL);
+                              KVM_PGTABLE_PROT_R, NULL, 0);
 
        return false;
 }
@@ -1618,6 +1668,8 @@ static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = {
 int kvm_mmu_init(u32 *hyp_va_bits)
 {
        int err;
+       u32 idmap_bits;
+       u32 kernel_bits;
 
        hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
        hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
@@ -1631,7 +1683,31 @@ int kvm_mmu_init(u32 *hyp_va_bits)
         */
        BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
 
-       *hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
+       /*
+        * The ID map may be configured to use an extended virtual address
+        * range. This is only the case if system RAM is out of range for the
+        * currently configured page size and VA_BITS_MIN, in which case we will
+        * also need the extended virtual range for the HYP ID map, or we won't
+        * be able to enable the EL2 MMU.
+        *
+        * However, in some cases the ID map may be configured for fewer than
+        * the number of VA bits used by the regular kernel stage 1. This
+        * happens when VA_BITS=52 and the kernel image is placed in PA space
+        * below 48 bits.
+        *
+        * At EL2, there is only one TTBR register, and we can't switch between
+        * translation tables *and* update TCR_EL2.T0SZ at the same time. Bottom
+        * line: we need to use the extended range with *both* our translation
+        * tables.
+        *
+        * So use the maximum of the idmap VA bits and the regular kernel stage
+        * 1 VA bits to assure that the hypervisor can both ID map its code page
+        * and map any kernel memory.
+        */
+       idmap_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
+       kernel_bits = vabits_actual;
+       *hyp_va_bits = max(idmap_bits, kernel_bits);
+
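
As a concrete example of the max() above: with VA_BITS=52 and the kernel image loaded below 48 bits of PA, idmap_t0sz can describe a 48-bit ID map while vabits_actual is 52, so the hypervisor must run with 52-bit virtual addresses to cover both its ID-mapped text and any kernel memory it may need to map.
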
        kvm_debug("Using %u-bit virtual addresses at EL2\n", *hyp_va_bits);
        kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
        kvm_debug("HYP VA range: %lx:%lx\n",
@@ -1740,12 +1816,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
                if (!vma)
                        break;
 
-               /*
-                * VM_SHARED mappings are not allowed with MTE to avoid races
-                * when updating the PG_mte_tagged page flag, see
-                * sanitise_mte_tags for more details.
-                */
-               if (kvm_has_mte(kvm) && vma->vm_flags & VM_SHARED) {
+               if (kvm_has_mte(kvm) && !kvm_vma_mte_allowed(vma)) {
                        ret = -EINVAL;
                        break;
                }
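
The memslot-time check above pairs with kvm_vma_mte_allowed() in the fault path: instead of refusing VM_SHARED mappings, MTE-enabled guests now simply require VMAs that carry VM_MTE_ALLOWED. A userspace sketch under the assumption of MTE-capable hardware: the VMM enables KVM_CAP_ARM_MTE on the VM and backs guest RAM with anonymous memory, which the arm64 mm code marks VM_MTE_ALLOWED; error paths are trimmed:

#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

int setup_mte_memslot(int vm_fd, __u64 gpa, __u64 size, __u32 slot)
{
	struct kvm_enable_cap cap = { .cap = KVM_CAP_ARM_MTE };
	struct kvm_userspace_memory_region region;
	void *ram;

	/* Must be enabled before any vCPU is created. */
	if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap))
		return -1;

	/* Anonymous mappings can hold tags, so the VMA is VM_MTE_ALLOWED. */
	ram = mmap(NULL, size, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (ram == MAP_FAILED)
		return -1;

	memset(&region, 0, sizeof(region));
	region.slot = slot;
	region.guest_phys_addr = gpa;
	region.memory_size = size;
	region.userspace_addr = (__u64)(unsigned long)ram;

	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}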