KVM: arm64: Move VTCR_EL2 into struct s2_mmu
author Marc Zyngier <maz@kernel.org>
Thu, 12 Oct 2023 20:51:08 +0000 (21:51 +0100)
committer Oliver Upton <oliver.upton@linux.dev>
Mon, 23 Oct 2023 18:48:46 +0000 (18:48 +0000)
We currently have a global VTCR_EL2 value for each guest, even
if the guest uses NV. This implies that the guest's own S2 must
fit in the host's. This is odd, for multiple reasons:

- the PARange values and the number of IPA bits don't necessarily
  match: you can have 33 bits of IPA space, and yet you can only
  describe 32 or 36 bits of PARange

- When userspace sets the IPA space, it creates a contract with the
  kernel saying "this is the IPA space I'm prepared to handle".
  At no point does it constrain the guest's own IPA space as
  long as the guest doesn't try to use a [I]PA outside of the
  IPA space set by userspace

- We don't even try to hide the value of ID_AA64MMFR0_EL1.PARange.

And then there is the consequence of the above: if a guest tries
to create an S2 whose input address size is larger than the IPA
space defined by the host, we inject a fatal exception.

This is no good. For all intents and purposes, a guest should be
able to have the S2 it really wants, as long as the *output* address
of that S2 isn't outside of the IPA space.

For that, we need to have a per-s2_mmu VTCR_EL2 setting, which
allows us to represent the full PARange. Move the vtcr field into
the s2_mmu structure, which has no impact whatsoever, except for NV.

Note that once we are able to override ID_AA64MMFR0_EL1.PARange
from userspace, we'll also be able to restrict the size of the
shadow S2 that NV uses.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20231012205108.3937270-1-maz@kernel.org
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
arch/arm64/include/asm/kvm_host.h
arch/arm64/include/asm/kvm_mmu.h
arch/arm64/include/asm/stage2_pgtable.h
arch/arm64/kvm/hyp/nvhe/mem_protect.c
arch/arm64/kvm/hyp/nvhe/pkvm.c
arch/arm64/kvm/hyp/pgtable.c
arch/arm64/kvm/mmu.c
arch/arm64/kvm/pkvm.c
arch/arm64/kvm/vgic/vgic-kvm-device.c

index 759adee..b6b10eb 100644 (file)
@@ -158,6 +158,16 @@ struct kvm_s2_mmu {
        phys_addr_t     pgd_phys;
        struct kvm_pgtable *pgt;
 
+       /*
+        * VTCR value used on the host. For a non-NV guest (or a NV
+        * guest that runs in a context where its own S2 doesn't
+        * apply), its T0SZ value reflects that of the IPA size.
+        *
+        * For a shadow S2 MMU, T0SZ reflects the PARange exposed to
+        * the guest.
+        */
+       u64     vtcr;
+
        /* The last vcpu id that ran on each physical CPU */
        int __percpu *last_vcpu_ran;
 
@@ -205,9 +215,6 @@ struct kvm_protected_vm {
 struct kvm_arch {
        struct kvm_s2_mmu mmu;
 
-       /* VTCR_EL2 value for this VM */
-       u64    vtcr;
-
        /* Interrupt controller */
        struct vgic_dist        vgic;
 
index 96a80e8..caa29c1 100644 (file)
@@ -150,9 +150,9 @@ static __always_inline unsigned long __kern_hyp_va(unsigned long v)
  */
 #define KVM_PHYS_SHIFT (40)
 
-#define kvm_phys_shift(kvm)            VTCR_EL2_IPA(kvm->arch.vtcr)
-#define kvm_phys_size(kvm)             (_AC(1, ULL) << kvm_phys_shift(kvm))
-#define kvm_phys_mask(kvm)             (kvm_phys_size(kvm) - _AC(1, ULL))
+#define kvm_phys_shift(mmu)            VTCR_EL2_IPA((mmu)->vtcr)
+#define kvm_phys_size(mmu)             (_AC(1, ULL) << kvm_phys_shift(mmu))
+#define kvm_phys_mask(mmu)             (kvm_phys_size(mmu) - _AC(1, ULL))
 
 #include <asm/kvm_pgtable.h>
 #include <asm/stage2_pgtable.h>
@@ -299,7 +299,7 @@ static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu)
 static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu,
                                          struct kvm_arch *arch)
 {
-       write_sysreg(arch->vtcr, vtcr_el2);
+       write_sysreg(mmu->vtcr, vtcr_el2);
        write_sysreg(kvm_get_vttbr(mmu), vttbr_el2);
 
        /*
index c8dca8a..23d2762 100644 (file)
  * (IPA_SHIFT - 4).
  */
 #define stage2_pgtable_levels(ipa)     ARM64_HW_PGTABLE_LEVELS((ipa) - 4)
-#define kvm_stage2_levels(kvm)         VTCR_EL2_LVLS(kvm->arch.vtcr)
+#define kvm_stage2_levels(mmu)         VTCR_EL2_LVLS((mmu)->vtcr)
 
 /*
  * kvm_mmmu_cache_min_pages() is the number of pages required to install
  * a stage-2 translation. We pre-allocate the entry level page table at
  * the VM creation.
  */
-#define kvm_mmu_cache_min_pages(kvm)   (kvm_stage2_levels(kvm) - 1)
+#define kvm_mmu_cache_min_pages(mmu)   (kvm_stage2_levels(mmu) - 1)
 
 #endif /* __ARM64_S2_PGTABLE_H_ */
index 9d70344..8d0a583 100644 (file)
@@ -129,8 +129,8 @@ static void prepare_host_vtcr(void)
        parange = kvm_get_parange(id_aa64mmfr0_el1_sys_val);
        phys_shift = id_aa64mmfr0_parange_to_phys_shift(parange);
 
-       host_mmu.arch.vtcr = kvm_get_vtcr(id_aa64mmfr0_el1_sys_val,
-                                         id_aa64mmfr1_el1_sys_val, phys_shift);
+       host_mmu.arch.mmu.vtcr = kvm_get_vtcr(id_aa64mmfr0_el1_sys_val,
+                                             id_aa64mmfr1_el1_sys_val, phys_shift);
 }
 
 static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot);
@@ -235,7 +235,7 @@ int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd)
        unsigned long nr_pages;
        int ret;
 
-       nr_pages = kvm_pgtable_stage2_pgd_size(vm->kvm.arch.vtcr) >> PAGE_SHIFT;
+       nr_pages = kvm_pgtable_stage2_pgd_size(mmu->vtcr) >> PAGE_SHIFT;
        ret = hyp_pool_init(&vm->pool, hyp_virt_to_pfn(pgd), nr_pages, 0);
        if (ret)
                return ret;
@@ -295,7 +295,7 @@ int __pkvm_prot_finalize(void)
                return -EPERM;
 
        params->vttbr = kvm_get_vttbr(mmu);
-       params->vtcr = host_mmu.arch.vtcr;
+       params->vtcr = mmu->vtcr;
        params->hcr_el2 |= HCR_VM;
 
        /*
index 8033ef3..9d23a51 100644 (file)
@@ -303,7 +303,7 @@ static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm,
 {
        hyp_vm->host_kvm = host_kvm;
        hyp_vm->kvm.created_vcpus = nr_vcpus;
-       hyp_vm->kvm.arch.vtcr = host_mmu.arch.vtcr;
+       hyp_vm->kvm.arch.mmu.vtcr = host_mmu.arch.mmu.vtcr;
 }
 
 static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu,
@@ -483,7 +483,7 @@ int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva,
        }
 
        vm_size = pkvm_get_hyp_vm_size(nr_vcpus);
-       pgd_size = kvm_pgtable_stage2_pgd_size(host_mmu.arch.vtcr);
+       pgd_size = kvm_pgtable_stage2_pgd_size(host_mmu.arch.mmu.vtcr);
 
        ret = -ENOMEM;
 
index f155b8c..0c84872 100644 (file)
@@ -1511,7 +1511,7 @@ int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
                              kvm_pgtable_force_pte_cb_t force_pte_cb)
 {
        size_t pgd_sz;
-       u64 vtcr = mmu->arch->vtcr;
+       u64 vtcr = mmu->vtcr;
        u32 ia_bits = VTCR_EL2_IPA(vtcr);
        u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
        u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;
index 482280f..551f219 100644 (file)
@@ -892,7 +892,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
 
        mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
        mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
-       kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
+       mmu->vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
 
        if (mmu->pgt != NULL) {
                kvm_err("kvm_arch already initialized?\n");
@@ -1067,7 +1067,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
        phys_addr_t addr;
        int ret = 0;
        struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO };
-       struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
+       struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
+       struct kvm_pgtable *pgt = mmu->pgt;
        enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
                                     KVM_PGTABLE_PROT_R |
                                     (writable ? KVM_PGTABLE_PROT_W : 0);
@@ -1080,7 +1081,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
 
        for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
                ret = kvm_mmu_topup_memory_cache(&cache,
-                                                kvm_mmu_cache_min_pages(kvm));
+                                                kvm_mmu_cache_min_pages(mmu));
                if (ret)
                        break;
 
@@ -1431,7 +1432,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
        if (fault_status != ESR_ELx_FSC_PERM ||
            (logging_active && write_fault)) {
                ret = kvm_mmu_topup_memory_cache(memcache,
-                                                kvm_mmu_cache_min_pages(kvm));
+                                                kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu));
                if (ret)
                        return ret;
        }
@@ -1747,7 +1748,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
        }
 
        /* Userspace should not be able to register out-of-bounds IPAs */
-       VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm));
+       VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->arch.hw_mmu));
 
        if (fault_status == ESR_ELx_FSC_ACCESS) {
                handle_access_fault(vcpu, fault_ipa);
@@ -2021,7 +2022,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
         * Prevent userspace from creating a memory region outside of the IPA
         * space addressable by the KVM guest IPA space.
         */
-       if ((new->base_gfn + new->npages) > (kvm_phys_size(kvm) >> PAGE_SHIFT))
+       if ((new->base_gfn + new->npages) > (kvm_phys_size(&kvm->arch.mmu) >> PAGE_SHIFT))
                return -EFAULT;
 
        hva = new->userspace_addr;
index 6ff3ec1..8350fb8 100644 (file)
@@ -123,7 +123,7 @@ static int __pkvm_create_hyp_vm(struct kvm *host_kvm)
        if (host_kvm->created_vcpus < 1)
                return -EINVAL;
 
-       pgd_sz = kvm_pgtable_stage2_pgd_size(host_kvm->arch.vtcr);
+       pgd_sz = kvm_pgtable_stage2_pgd_size(host_kvm->arch.mmu.vtcr);
 
        /*
         * The PGD pages will be reclaimed using a hyp_memcache which implies
index 212b73a..64f8e2e 100644 (file)
@@ -27,7 +27,8 @@ int vgic_check_iorange(struct kvm *kvm, phys_addr_t ioaddr,
        if (addr + size < addr)
                return -EINVAL;
 
-       if (addr & ~kvm_phys_mask(kvm) || addr + size > kvm_phys_size(kvm))
+       if (addr & ~kvm_phys_mask(&kvm->arch.mmu) ||
+           (addr + size) > kvm_phys_size(&kvm->arch.mmu))
                return -E2BIG;
 
        return 0;