Merge branch 'kvm-arm64/nvhe-hyp-context' into kvmarm-master/next
author Marc Zyngier <maz@kernel.org>
Wed, 16 Sep 2020 09:59:17 +0000 (10:59 +0100)
committer Marc Zyngier <maz@kernel.org>
Wed, 16 Sep 2020 09:59:17 +0000 (10:59 +0100)
Signed-off-by: Marc Zyngier <maz@kernel.org>
18 files changed:
Documentation/virt/kvm/api.rst
arch/arm64/include/asm/kvm_host.h
arch/arm64/include/asm/kvm_mmu.h
arch/arm64/include/asm/kvm_pgtable.h [new file with mode: 0644]
arch/arm64/include/asm/pgtable-hwdef.h
arch/arm64/include/asm/pgtable-prot.h
arch/arm64/include/asm/stage2_pgtable.h
arch/arm64/kvm/arm.c
arch/arm64/kvm/hyp/Makefile
arch/arm64/kvm/hyp/pgtable.c [new file with mode: 0644]
arch/arm64/kvm/mmu.c
arch/arm64/kvm/pvtime.c
arch/arm64/kvm/reset.c
arch/arm64/kvm/trace_arm.h
arch/arm64/kvm/trace_handle_exit.h
arch/x86/kvm/x86.c
include/linux/kvm_host.h
include/uapi/linux/kvm.h

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index eb3a131..d2b733d 100644
@@ -6130,7 +6130,7 @@ HvCallSendSyntheticClusterIpi, HvCallSendSyntheticClusterIpiEx.
 8.21 KVM_CAP_HYPERV_DIRECT_TLBFLUSH
 -----------------------------------
 
-:Architecture: x86
+:Architectures: x86
 
 This capability indicates that KVM running on top of Hyper-V hypervisor
 enables Direct TLB flush for its guests meaning that TLB flush
@@ -6143,19 +6143,33 @@ in CPUID and only exposes Hyper-V identification. In this case, guest
 thinks it's running on Hyper-V and only use Hyper-V hypercalls.
 
 8.22 KVM_CAP_S390_VCPU_RESETS
+-----------------------------
 
-Architectures: s390
+:Architectures: s390
 
 This capability indicates that the KVM_S390_NORMAL_RESET and
 KVM_S390_CLEAR_RESET ioctls are available.
 
 8.23 KVM_CAP_S390_PROTECTED
+---------------------------
 
-Architecture: s390
-
+:Architectures: s390
 
 This capability indicates that the Ultravisor has been initialized and
 KVM can therefore start protected VMs.
 This capability governs the KVM_S390_PV_COMMAND ioctl and the
 KVM_MP_STATE_LOAD MP_STATE. KVM_SET_MP_STATE can fail for protected
 guests when the state change is invalid.
+
+8.24 KVM_CAP_STEAL_TIME
+-----------------------
+
+:Architectures: arm64, x86
+
+This capability indicates that KVM supports steal time accounting.
+When steal time accounting is supported it may be enabled with
+architecture-specific interfaces.  This capability and the architecture-
+specific interfaces must be consistent, i.e. if one says the feature
+is supported, then the other should as well and vice versa.  For arm64
+see Documentation/virt/kvm/devices/vcpu.rst "KVM_ARM_VCPU_PVTIME_CTRL".
+For x86 see Documentation/virt/kvm/msr.rst "MSR_KVM_STEAL_TIME".
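
(Editorial illustration, not part of this patch.) Userspace can probe the new
capability with the standard KVM_CHECK_EXTENSION ioctl on the /dev/kvm file
descriptor; a minimal sketch:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
        int kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);

        if (kvm < 0)
                return 1;

        /* Returns 1 if steal time accounting is supported, 0 otherwise. */
        printf("KVM_CAP_STEAL_TIME: %d\n",
               ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_STEAL_TIME));
        return 0;
}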
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index ef0325c..b537ab0 100644
@@ -80,8 +80,8 @@ struct kvm_s2_mmu {
         * for vEL1/EL0 with vHCR_EL2.VM == 0.  In that case, we use the
         * canonical stage-2 page tables.
         */
-       pgd_t           *pgd;
        phys_addr_t     pgd_phys;
+       struct kvm_pgtable *pgt;
 
        /* The last vcpu id that ran on each physical CPU */
        int __percpu *last_vcpu_ran;
@@ -367,7 +367,6 @@ struct kvm_vcpu_arch {
 
        /* Guest PV state */
        struct {
-               u64 steal;
                u64 last_steal;
                gpa_t base;
        } steal;
@@ -540,6 +539,7 @@ long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu);
 gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu);
 void kvm_update_stolen_time(struct kvm_vcpu *vcpu);
 
+bool kvm_arm_pvtime_supported(void);
 int kvm_arm_pvtime_set_attr(struct kvm_vcpu *vcpu,
                            struct kvm_device_attr *attr);
 int kvm_arm_pvtime_get_attr(struct kvm_vcpu *vcpu,
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 189839c..c490fe8 100644
  *     HYP_VA_MIN = 1 << (VA_BITS - 1)
  * HYP_VA_MAX = HYP_VA_MIN + (1 << (VA_BITS - 1)) - 1
  *
- * This of course assumes that the trampoline page exists within the
- * VA_BITS range. If it doesn't, then it means we're in the odd case
- * where the kernel idmap (as well as HYP) uses more levels than the
- * kernel runtime page tables (as seen when the kernel is configured
- * for 4k pages, 39bits VA, and yet memory lives just above that
- * limit, forcing the idmap to use 4 levels of page tables while the
- * kernel itself only uses 3). In this particular case, it doesn't
- * matter which side of VA_BITS we use, as we're guaranteed not to
- * conflict with anything.
- *
  * When using VHE, there are no separate hyp mappings and all KVM
  * functionality is already mapped as part of the main kernel
  * mappings, and none of this applies in that case.
@@ -117,15 +107,10 @@ static __always_inline unsigned long __kern_hyp_va(unsigned long v)
 #define kvm_phys_size(kvm)             (_AC(1, ULL) << kvm_phys_shift(kvm))
 #define kvm_phys_mask(kvm)             (kvm_phys_size(kvm) - _AC(1, ULL))
 
-static inline bool kvm_page_empty(void *ptr)
-{
-       struct page *ptr_page = virt_to_page(ptr);
-       return page_count(ptr_page) == 1;
-}
-
+#include <asm/kvm_pgtable.h>
 #include <asm/stage2_pgtable.h>
 
-int create_hyp_mappings(void *from, void *to, pgprot_t prot);
+int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot);
 int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
                           void __iomem **kaddr,
                           void __iomem **haddr);
@@ -141,149 +126,9 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
 
 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu);
 
-void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
-
 phys_addr_t kvm_mmu_get_httbr(void);
 phys_addr_t kvm_get_idmap_vector(void);
 int kvm_mmu_init(void);
-void kvm_clear_hyp_idmap(void);
-
-#define kvm_mk_pmd(ptep)                                       \
-       __pmd(__phys_to_pmd_val(__pa(ptep)) | PMD_TYPE_TABLE)
-#define kvm_mk_pud(pmdp)                                       \
-       __pud(__phys_to_pud_val(__pa(pmdp)) | PMD_TYPE_TABLE)
-#define kvm_mk_p4d(pmdp)                                       \
-       __p4d(__phys_to_p4d_val(__pa(pmdp)) | PUD_TYPE_TABLE)
-
-#define kvm_set_pud(pudp, pud)         set_pud(pudp, pud)
-
-#define kvm_pfn_pte(pfn, prot)         pfn_pte(pfn, prot)
-#define kvm_pfn_pmd(pfn, prot)         pfn_pmd(pfn, prot)
-#define kvm_pfn_pud(pfn, prot)         pfn_pud(pfn, prot)
-
-#define kvm_pud_pfn(pud)               pud_pfn(pud)
-
-#define kvm_pmd_mkhuge(pmd)            pmd_mkhuge(pmd)
-#define kvm_pud_mkhuge(pud)            pud_mkhuge(pud)
-
-static inline pte_t kvm_s2pte_mkwrite(pte_t pte)
-{
-       pte_val(pte) |= PTE_S2_RDWR;
-       return pte;
-}
-
-static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd)
-{
-       pmd_val(pmd) |= PMD_S2_RDWR;
-       return pmd;
-}
-
-static inline pud_t kvm_s2pud_mkwrite(pud_t pud)
-{
-       pud_val(pud) |= PUD_S2_RDWR;
-       return pud;
-}
-
-static inline pte_t kvm_s2pte_mkexec(pte_t pte)
-{
-       pte_val(pte) &= ~PTE_S2_XN;
-       return pte;
-}
-
-static inline pmd_t kvm_s2pmd_mkexec(pmd_t pmd)
-{
-       pmd_val(pmd) &= ~PMD_S2_XN;
-       return pmd;
-}
-
-static inline pud_t kvm_s2pud_mkexec(pud_t pud)
-{
-       pud_val(pud) &= ~PUD_S2_XN;
-       return pud;
-}
-
-static inline void kvm_set_s2pte_readonly(pte_t *ptep)
-{
-       pteval_t old_pteval, pteval;
-
-       pteval = READ_ONCE(pte_val(*ptep));
-       do {
-               old_pteval = pteval;
-               pteval &= ~PTE_S2_RDWR;
-               pteval |= PTE_S2_RDONLY;
-               pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval);
-       } while (pteval != old_pteval);
-}
-
-static inline bool kvm_s2pte_readonly(pte_t *ptep)
-{
-       return (READ_ONCE(pte_val(*ptep)) & PTE_S2_RDWR) == PTE_S2_RDONLY;
-}
-
-static inline bool kvm_s2pte_exec(pte_t *ptep)
-{
-       return !(READ_ONCE(pte_val(*ptep)) & PTE_S2_XN);
-}
-
-static inline void kvm_set_s2pmd_readonly(pmd_t *pmdp)
-{
-       kvm_set_s2pte_readonly((pte_t *)pmdp);
-}
-
-static inline bool kvm_s2pmd_readonly(pmd_t *pmdp)
-{
-       return kvm_s2pte_readonly((pte_t *)pmdp);
-}
-
-static inline bool kvm_s2pmd_exec(pmd_t *pmdp)
-{
-       return !(READ_ONCE(pmd_val(*pmdp)) & PMD_S2_XN);
-}
-
-static inline void kvm_set_s2pud_readonly(pud_t *pudp)
-{
-       kvm_set_s2pte_readonly((pte_t *)pudp);
-}
-
-static inline bool kvm_s2pud_readonly(pud_t *pudp)
-{
-       return kvm_s2pte_readonly((pte_t *)pudp);
-}
-
-static inline bool kvm_s2pud_exec(pud_t *pudp)
-{
-       return !(READ_ONCE(pud_val(*pudp)) & PUD_S2_XN);
-}
-
-static inline pud_t kvm_s2pud_mkyoung(pud_t pud)
-{
-       return pud_mkyoung(pud);
-}
-
-static inline bool kvm_s2pud_young(pud_t pud)
-{
-       return pud_young(pud);
-}
-
-#define hyp_pte_table_empty(ptep) kvm_page_empty(ptep)
-
-#ifdef __PAGETABLE_PMD_FOLDED
-#define hyp_pmd_table_empty(pmdp) (0)
-#else
-#define hyp_pmd_table_empty(pmdp) kvm_page_empty(pmdp)
-#endif
-
-#ifdef __PAGETABLE_PUD_FOLDED
-#define hyp_pud_table_empty(pudp) (0)
-#else
-#define hyp_pud_table_empty(pudp) kvm_page_empty(pudp)
-#endif
-
-#ifdef __PAGETABLE_P4D_FOLDED
-#define hyp_p4d_table_empty(p4dp) (0)
-#else
-#define hyp_p4d_table_empty(p4dp) kvm_page_empty(p4dp)
-#endif
 
 struct kvm;
 
@@ -325,77 +170,9 @@ static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn,
        }
 }
 
-static inline void __kvm_flush_dcache_pte(pte_t pte)
-{
-       if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) {
-               struct page *page = pte_page(pte);
-               kvm_flush_dcache_to_poc(page_address(page), PAGE_SIZE);
-       }
-}
-
-static inline void __kvm_flush_dcache_pmd(pmd_t pmd)
-{
-       if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) {
-               struct page *page = pmd_page(pmd);
-               kvm_flush_dcache_to_poc(page_address(page), PMD_SIZE);
-       }
-}
-
-static inline void __kvm_flush_dcache_pud(pud_t pud)
-{
-       if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) {
-               struct page *page = pud_page(pud);
-               kvm_flush_dcache_to_poc(page_address(page), PUD_SIZE);
-       }
-}
-
 void kvm_set_way_flush(struct kvm_vcpu *vcpu);
 void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled);
 
-static inline bool __kvm_cpu_uses_extended_idmap(void)
-{
-       return __cpu_uses_extended_idmap_level();
-}
-
-static inline unsigned long __kvm_idmap_ptrs_per_pgd(void)
-{
-       return idmap_ptrs_per_pgd;
-}
-
-/*
- * Can't use pgd_populate here, because the extended idmap adds an extra level
- * above CONFIG_PGTABLE_LEVELS (which is 2 or 3 if we're using the extended
- * idmap), and pgd_populate is only available if CONFIG_PGTABLE_LEVELS = 4.
- */
-static inline void __kvm_extend_hypmap(pgd_t *boot_hyp_pgd,
-                                      pgd_t *hyp_pgd,
-                                      pgd_t *merged_hyp_pgd,
-                                      unsigned long hyp_idmap_start)
-{
-       int idmap_idx;
-       u64 pgd_addr;
-
-       /*
-        * Use the first entry to access the HYP mappings. It is
-        * guaranteed to be free, otherwise we wouldn't use an
-        * extended idmap.
-        */
-       VM_BUG_ON(pgd_val(merged_hyp_pgd[0]));
-       pgd_addr = __phys_to_pgd_val(__pa(hyp_pgd));
-       merged_hyp_pgd[0] = __pgd(pgd_addr | PMD_TYPE_TABLE);
-
-       /*
-        * Create another extended level entry that points to the boot HYP map,
-        * which contains an ID mapping of the HYP init code. We essentially
-        * merge the boot and runtime HYP maps by doing so, but they don't
-        * overlap anyway, so this is fine.
-        */
-       idmap_idx = hyp_idmap_start >> VA_BITS;
-       VM_BUG_ON(pgd_val(merged_hyp_pgd[idmap_idx]));
-       pgd_addr = __phys_to_pgd_val(__pa(boot_hyp_pgd));
-       merged_hyp_pgd[idmap_idx] = __pgd(pgd_addr | PMD_TYPE_TABLE);
-}
-
 static inline unsigned int kvm_get_vmid_bits(void)
 {
        int reg = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
@@ -553,30 +330,6 @@ static inline int hyp_map_aux_data(void)
 
 #define kvm_phys_to_vttbr(addr)                phys_to_ttbr(addr)
 
-/*
- * Get the magic number 'x' for VTTBR:BADDR of this KVM instance.
- * With v8.2 LVA extensions, 'x' should be a minimum of 6 with
- * 52bit IPS.
- */
-static inline int arm64_vttbr_x(u32 ipa_shift, u32 levels)
-{
-       int x = ARM64_VTTBR_X(ipa_shift, levels);
-
-       return (IS_ENABLED(CONFIG_ARM64_PA_BITS_52) && x < 6) ? 6 : x;
-}
-
-static inline u64 vttbr_baddr_mask(u32 ipa_shift, u32 levels)
-{
-       unsigned int x = arm64_vttbr_x(ipa_shift, levels);
-
-       return GENMASK_ULL(PHYS_MASK_SHIFT - 1, x);
-}
-
-static inline u64 kvm_vttbr_baddr_mask(struct kvm *kvm)
-{
-       return vttbr_baddr_mask(kvm_phys_shift(kvm), kvm_stage2_levels(kvm));
-}
-
 static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu)
 {
        struct kvm_vmid *vmid = &mmu->vmid;
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
new file mode 100644
index 0000000..52ab38d
--- /dev/null
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -0,0 +1,309 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Google LLC
+ * Author: Will Deacon <will@kernel.org>
+ */
+
+#ifndef __ARM64_KVM_PGTABLE_H__
+#define __ARM64_KVM_PGTABLE_H__
+
+#include <linux/bits.h>
+#include <linux/kvm_host.h>
+#include <linux/types.h>
+
+typedef u64 kvm_pte_t;
+
+/**
+ * struct kvm_pgtable - KVM page-table.
+ * @ia_bits:           Maximum input address size, in bits.
+ * @start_level:       Level at which the page-table walk starts.
+ * @pgd:               Pointer to the first top-level entry of the page-table.
+ * @mmu:               Stage-2 KVM MMU struct. Unused for stage-1 page-tables.
+ */
+struct kvm_pgtable {
+       u32                                     ia_bits;
+       u32                                     start_level;
+       kvm_pte_t                               *pgd;
+
+       /* Stage-2 only */
+       struct kvm_s2_mmu                       *mmu;
+};
+
+/**
+ * enum kvm_pgtable_prot - Page-table permissions and attributes.
+ * @KVM_PGTABLE_PROT_X:                Execute permission.
+ * @KVM_PGTABLE_PROT_W:                Write permission.
+ * @KVM_PGTABLE_PROT_R:                Read permission.
+ * @KVM_PGTABLE_PROT_DEVICE:   Device attributes.
+ */
+enum kvm_pgtable_prot {
+       KVM_PGTABLE_PROT_X                      = BIT(0),
+       KVM_PGTABLE_PROT_W                      = BIT(1),
+       KVM_PGTABLE_PROT_R                      = BIT(2),
+
+       KVM_PGTABLE_PROT_DEVICE                 = BIT(3),
+};
+
+#define PAGE_HYP               (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W)
+#define PAGE_HYP_EXEC          (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_X)
+#define PAGE_HYP_RO            (KVM_PGTABLE_PROT_R)
+#define PAGE_HYP_DEVICE                (PAGE_HYP | KVM_PGTABLE_PROT_DEVICE)
+
+/**
+ * enum kvm_pgtable_walk_flags - Flags to control a depth-first page-table walk.
+ * @KVM_PGTABLE_WALK_LEAF:             Visit leaf entries, including invalid
+ *                                     entries.
+ * @KVM_PGTABLE_WALK_TABLE_PRE:                Visit table entries before their
+ *                                     children.
+ * @KVM_PGTABLE_WALK_TABLE_POST:       Visit table entries after their
+ *                                     children.
+ */
+enum kvm_pgtable_walk_flags {
+       KVM_PGTABLE_WALK_LEAF                   = BIT(0),
+       KVM_PGTABLE_WALK_TABLE_PRE              = BIT(1),
+       KVM_PGTABLE_WALK_TABLE_POST             = BIT(2),
+};
+
+typedef int (*kvm_pgtable_visitor_fn_t)(u64 addr, u64 end, u32 level,
+                                       kvm_pte_t *ptep,
+                                       enum kvm_pgtable_walk_flags flag,
+                                       void * const arg);
+
+/**
+ * struct kvm_pgtable_walker - Hook into a page-table walk.
+ * @cb:                Callback function to invoke during the walk.
+ * @arg:       Argument passed to the callback function.
+ * @flags:     Bitwise-OR of flags to identify the entry types on which to
+ *             invoke the callback function.
+ */
+struct kvm_pgtable_walker {
+       const kvm_pgtable_visitor_fn_t          cb;
+       void * const                            arg;
+       const enum kvm_pgtable_walk_flags       flags;
+};
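
(Editorial sketch, not from this series.) Putting the walker pieces together,
a caller could count the valid leaf entries in a range as below, using the
kvm_pgtable_walk() declaration further down in this header. The valid-bit test
is open-coded because KVM_PTE_VALID lives in pgtable.c rather than here:

static int count_valid_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
                            enum kvm_pgtable_walk_flags flag, void * const arg)
{
        u64 *count = arg;

        /* Bit 0 is the descriptor valid bit (KVM_PTE_VALID in pgtable.c). */
        if (*ptep & BIT(0))
                (*count)++;

        return 0;
}

static u64 count_valid_range(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
        u64 count = 0;
        struct kvm_pgtable_walker walker = {
                .cb     = count_valid_leaf,
                .arg    = &count,
                .flags  = KVM_PGTABLE_WALK_LEAF,
        };

        return kvm_pgtable_walk(pgt, addr, size, &walker) ? 0 : count;
}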
+
+/**
+ * kvm_pgtable_hyp_init() - Initialise a hypervisor stage-1 page-table.
+ * @pgt:       Uninitialised page-table structure to initialise.
+ * @va_bits:   Maximum virtual address bits.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits);
+
+/**
+ * kvm_pgtable_hyp_destroy() - Destroy an unused hypervisor stage-1 page-table.
+ * @pgt:       Page-table structure initialised by kvm_pgtable_hyp_init().
+ *
+ * The page-table is assumed to be unreachable by any hardware walkers prior
+ * to freeing and therefore no TLB invalidation is performed.
+ */
+void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt);
+
+/**
+ * kvm_pgtable_hyp_map() - Install a mapping in a hypervisor stage-1 page-table.
+ * @pgt:       Page-table structure initialised by kvm_pgtable_hyp_init().
+ * @addr:      Virtual address at which to place the mapping.
+ * @size:      Size of the mapping.
+ * @phys:      Physical address of the memory to map.
+ * @prot:      Permissions and attributes for the mapping.
+ *
+ * The offset of @addr within a page is ignored, @size is rounded-up to
+ * the next page boundary and @phys is rounded-down to the previous page
+ * boundary.
+ *
+ * If device attributes are not explicitly requested in @prot, then the
+ * mapping will be normal, cacheable. Attempts to install a new mapping
+ * for a virtual address that is already mapped will be rejected with an
+ * error and a WARN().
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
+                       enum kvm_pgtable_prot prot);
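
(Editorial sketch, not from this series.) A typical call sequence for the hyp
stage-1 API, where hyp_va_bits, hyp_va and buf stand in for values the caller
already knows:

static int example_hyp_map(void *buf, u32 hyp_va_bits, u64 hyp_va)
{
        struct kvm_pgtable pgt;
        int ret;

        ret = kvm_pgtable_hyp_init(&pgt, hyp_va_bits);
        if (ret)
                return ret;

        /* Map one page of 'buf' read-only at EL2. */
        ret = kvm_pgtable_hyp_map(&pgt, hyp_va, PAGE_SIZE, __pa(buf),
                                  PAGE_HYP_RO);
        if (ret)
                kvm_pgtable_hyp_destroy(&pgt);

        return ret;
}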
+
+/**
+ * kvm_pgtable_stage2_init() - Initialise a guest stage-2 page-table.
+ * @pgt:       Uninitialised page-table structure to initialise.
+ * @kvm:       KVM structure representing the guest virtual machine.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm);
+
+/**
+ * kvm_pgtable_stage2_destroy() - Destroy an unused guest stage-2 page-table.
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init().
+ *
+ * The page-table is assumed to be unreachable by any hardware walkers prior
+ * to freeing and therefore no TLB invalidation is performed.
+ */
+void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
+
+/**
+ * kvm_pgtable_stage2_map() - Install a mapping in a guest stage-2 page-table.
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr:      Intermediate physical address at which to place the mapping.
+ * @size:      Size of the mapping.
+ * @phys:      Physical address of the memory to map.
+ * @prot:      Permissions and attributes for the mapping.
+ * @mc:                Cache of pre-allocated GFP_PGTABLE_USER memory from which to
+ *             allocate page-table pages.
+ *
+ * The offset of @addr within a page is ignored, @size is rounded-up to
+ * the next page boundary and @phys is rounded-down to the previous page
+ * boundary.
+ *
+ * If device attributes are not explicitly requested in @prot, then the
+ * mapping will be normal, cacheable.
+ *
+ * Note that this function will both coalesce existing table entries and split
+ * existing block mappings, relying on page-faults to fault back areas outside
+ * of the new mapping lazily.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
+                          u64 phys, enum kvm_pgtable_prot prot,
+                          struct kvm_mmu_memory_cache *mc);
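
(Editorial sketch, not from this series.) A caller that has topped up a
kvm_mmu_memory_cache, for example with kvm_mmu_topup_memory_cache() before
taking the mmu_lock, might install a writable, non-executable mapping like so;
gpa, size and pfn are placeholders:

static int example_stage2_map(struct kvm_pgtable *pgt, u64 gpa, u64 size,
                              kvm_pfn_t pfn, struct kvm_mmu_memory_cache *mc)
{
        enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W;

        return kvm_pgtable_stage2_map(pgt, gpa, size, __pfn_to_phys(pfn),
                                      prot, mc);
}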
+
+/**
+ * kvm_pgtable_stage2_unmap() - Remove a mapping from a guest stage-2 page-table.
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr:      Intermediate physical address from which to remove the mapping.
+ * @size:      Size of the mapping.
+ *
+ * The offset of @addr within a page is ignored and @size is rounded-up to
+ * the next page boundary.
+ *
+ * TLB invalidation is performed for each page-table entry cleared during the
+ * unmapping operation and the reference count for the page-table page
+ * containing the cleared entry is decremented, with unreferenced pages being
+ * freed. Unmapping a cacheable page will ensure that it is clean to the PoC if
+ * FWB is not supported by the CPU.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size);
+
+/**
+ * kvm_pgtable_stage2_wrprotect() - Write-protect guest stage-2 address range
+ *                                  without TLB invalidation.
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr:      Intermediate physical address from which to write-protect,
+ * @size:      Size of the range.
+ *
+ * The offset of @addr within a page is ignored and @size is rounded-up to
+ * the next page boundary.
+ *
+ * Note that it is the caller's responsibility to invalidate the TLB after
+ * calling this function to ensure that the updated permissions are visible
+ * to the CPUs.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size);
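
(Editorial sketch, not from this series.) The kerneldoc above leaves TLB
invalidation to the caller; one plausible pairing, with locking elided, is a
full VMID invalidation via the existing __kvm_tlb_flush_vmid hyp call:

static void example_wrprotect(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
                              u64 addr, u64 size)
{
        WARN_ON(kvm_pgtable_stage2_wrprotect(pgt, addr, size));
        kvm_call_hyp(__kvm_tlb_flush_vmid, mmu);
}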
+
+/**
+ * kvm_pgtable_stage2_mkyoung() - Set the access flag in a page-table entry.
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr:      Intermediate physical address to identify the page-table entry.
+ *
+ * The offset of @addr within a page is ignored.
+ *
+ * If there is a valid, leaf page-table entry used to translate @addr, then
+ * set the access flag in that entry.
+ *
+ * Return: The old page-table entry prior to setting the flag, 0 on failure.
+ */
+kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr);
+
+/**
+ * kvm_pgtable_stage2_mkold() - Clear the access flag in a page-table entry.
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr:      Intermediate physical address to identify the page-table entry.
+ *
+ * The offset of @addr within a page is ignored.
+ *
+ * If there is a valid, leaf page-table entry used to translate @addr, then
+ * clear the access flag in that entry.
+ *
+ * Note that it is the caller's responsibility to invalidate the TLB after
+ * calling this function to ensure that the updated permissions are visible
+ * to the CPUs.
+ *
+ * Return: The old page-table entry prior to clearing the flag, 0 on failure.
+ */
+kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr);
+
+/**
+ * kvm_pgtable_stage2_relax_perms() - Relax the permissions enforced by a
+ *                                   page-table entry.
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr:      Intermediate physical address to identify the page-table entry.
+ * @prot:      Additional permissions to grant for the mapping.
+ *
+ * The offset of @addr within a page is ignored.
+ *
+ * If there is a valid, leaf page-table entry used to translate @addr, then
+ * relax the permissions in that entry according to the read, write and
+ * execute permissions specified by @prot. No permissions are removed, and
+ * TLB invalidation is performed after updating the entry.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
+                                  enum kvm_pgtable_prot prot);
+
+/**
+ * kvm_pgtable_stage2_is_young() - Test whether a page-table entry has the
+ *                                access flag set.
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr:      Intermediate physical address to identify the page-table entry.
+ *
+ * The offset of @addr within a page is ignored.
+ *
+ * Return: True if the page-table entry has the access flag set, false otherwise.
+ */
+bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr);
+
+/**
+ * kvm_pgtable_stage2_flush() - Clean and invalidate data cache to Point of
+ *                              Coherency for guest stage-2 address range.
+ * @pgt:       Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr:      Intermediate physical address from which to flush.
+ * @size:      Size of the range.
+ *
+ * The offset of @addr within a page is ignored and @size is rounded-up to
+ * the next page boundary.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size);
+
+/**
+ * kvm_pgtable_walk() - Walk a page-table.
+ * @pgt:       Page-table structure initialised by kvm_pgtable_*_init().
+ * @addr:      Input address for the start of the walk.
+ * @size:      Size of the range to walk.
+ * @walker:    Walker callback description.
+ *
+ * The offset of @addr within a page is ignored and @size is rounded-up to
+ * the next page boundary.
+ *
+ * The walker will walk the page-table entries corresponding to the input
+ * address range specified, visiting entries according to the walker flags.
+ * Invalid entries are treated as leaf entries. Leaf entries are reloaded
+ * after invoking the walker callback, allowing the walker to descend into
+ * a newly installed table.
+ *
+ * Returning a negative error code from the walker callback function will
+ * terminate the walk immediately with the same error code.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
+                    struct kvm_pgtable_walker *walker);
+
+#endif /* __ARM64_KVM_PGTABLE_H__ */
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index d400a4d..2b5f822 100644
 #define PTE_CONT               (_AT(pteval_t, 1) << 52)        /* Contiguous range */
 #define PTE_PXN                        (_AT(pteval_t, 1) << 53)        /* Privileged XN */
 #define PTE_UXN                        (_AT(pteval_t, 1) << 54)        /* User XN */
-#define PTE_HYP_XN             (_AT(pteval_t, 1) << 54)        /* HYP XN */
 
 #define PTE_ADDR_LOW           (((_AT(pteval_t, 1) << (48 - PAGE_SHIFT)) - 1) << PAGE_SHIFT)
 #ifdef CONFIG_ARM64_PA_BITS_52
 #define PTE_ATTRINDX(t)                (_AT(pteval_t, (t)) << 2)
 #define PTE_ATTRINDX_MASK      (_AT(pteval_t, 7) << 2)
 
-/*
- * 2nd stage PTE definitions
- */
-#define PTE_S2_RDONLY          (_AT(pteval_t, 1) << 6)   /* HAP[2:1] */
-#define PTE_S2_RDWR            (_AT(pteval_t, 3) << 6)   /* HAP[2:1] */
-#define PTE_S2_XN              (_AT(pteval_t, 2) << 53)  /* XN[1:0] */
-#define PTE_S2_SW_RESVD                (_AT(pteval_t, 15) << 55) /* Reserved for SW */
-
-#define PMD_S2_RDONLY          (_AT(pmdval_t, 1) << 6)   /* HAP[2:1] */
-#define PMD_S2_RDWR            (_AT(pmdval_t, 3) << 6)   /* HAP[2:1] */
-#define PMD_S2_XN              (_AT(pmdval_t, 2) << 53)  /* XN[1:0] */
-#define PMD_S2_SW_RESVD                (_AT(pmdval_t, 15) << 55) /* Reserved for SW */
-
-#define PUD_S2_RDONLY          (_AT(pudval_t, 1) << 6)   /* HAP[2:1] */
-#define PUD_S2_RDWR            (_AT(pudval_t, 3) << 6)   /* HAP[2:1] */
-#define PUD_S2_XN              (_AT(pudval_t, 2) << 53)  /* XN[1:0] */
-
 /*
  * Memory Attribute override for Stage-2 (MemAttr[3:0])
  */
 #define PTE_S2_MEMATTR(t)      (_AT(pteval_t, (t)) << 2)
 
-/*
- * EL2/HYP PTE/PMD definitions
- */
-#define PMD_HYP                        PMD_SECT_USER
-#define PTE_HYP                        PTE_USER
-
 /*
  * Highest possible physical address supported.
  */
diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h
index 4d867c6..8f094c4 100644
@@ -56,7 +56,6 @@ extern bool arm64_use_ng_mappings;
 #define PROT_SECT_NORMAL_EXEC  (PROT_SECT_DEFAULT | PMD_SECT_UXN | PMD_ATTRINDX(MT_NORMAL))
 
 #define _PAGE_DEFAULT          (_PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL))
-#define _HYP_PAGE_DEFAULT      _PAGE_DEFAULT
 
 #define PAGE_KERNEL            __pgprot(PROT_NORMAL)
 #define PAGE_KERNEL_RO         __pgprot((PROT_NORMAL & ~PTE_WRITE) | PTE_RDONLY)
@@ -64,11 +63,6 @@ extern bool arm64_use_ng_mappings;
 #define PAGE_KERNEL_EXEC       __pgprot(PROT_NORMAL & ~PTE_PXN)
 #define PAGE_KERNEL_EXEC_CONT  __pgprot((PROT_NORMAL & ~PTE_PXN) | PTE_CONT)
 
-#define PAGE_HYP               __pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_HYP_XN)
-#define PAGE_HYP_EXEC          __pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY)
-#define PAGE_HYP_RO            __pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY | PTE_HYP_XN)
-#define PAGE_HYP_DEVICE                __pgprot(_PROT_DEFAULT | PTE_ATTRINDX(MT_DEVICE_nGnRE) | PTE_HYP | PTE_HYP_XN)
-
 #define PAGE_S2_MEMATTR(attr)                                          \
        ({                                                              \
                u64 __val;                                              \
@@ -79,19 +73,6 @@ extern bool arm64_use_ng_mappings;
                __val;                                                  \
         })
 
-#define PAGE_S2_XN                                                     \
-       ({                                                              \
-               u64 __val;                                              \
-               if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))           \
-                       __val = 0;                                      \
-               else                                                    \
-                       __val = PTE_S2_XN;                              \
-               __val;                                                  \
-       })
-
-#define PAGE_S2                        __pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(NORMAL) | PTE_S2_RDONLY | PAGE_S2_XN)
-#define PAGE_S2_DEVICE         __pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(DEVICE_nGnRE) | PTE_S2_RDONLY | PTE_S2_XN)
-
 #define PAGE_NONE              __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PROT_NONE | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN)
 /* shared+writable pages are clean by default, hence PTE_RDONLY|PTE_WRITE */
 #define PAGE_SHARED            __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE)
diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h
index 996bf98..fe341a6 100644
@@ -8,7 +8,6 @@
 #ifndef __ARM64_S2_PGTABLE_H_
 #define __ARM64_S2_PGTABLE_H_
 
-#include <linux/hugetlb.h>
 #include <linux/pgtable.h>
 
 /*
 #define stage2_pgdir_size(kvm)         (1ULL << stage2_pgdir_shift(kvm))
 #define stage2_pgdir_mask(kvm)         ~(stage2_pgdir_size(kvm) - 1)
 
-/*
- * The number of PTRS across all concatenated stage2 tables given by the
- * number of bits resolved at the initial level.
- * If we force more levels than necessary, we may have (stage2_pgdir_shift > IPA),
- * in which case, stage2_pgd_ptrs will have one entry.
- */
-#define pgd_ptrs_shift(ipa, pgdir_shift)       \
-       ((ipa) > (pgdir_shift) ? ((ipa) - (pgdir_shift)) : 0)
-#define __s2_pgd_ptrs(ipa, lvls)               \
-       (1 << (pgd_ptrs_shift((ipa), pt_levels_pgdir_shift(lvls))))
-#define __s2_pgd_size(ipa, lvls)       (__s2_pgd_ptrs((ipa), (lvls)) * sizeof(pgd_t))
-
-#define stage2_pgd_ptrs(kvm)           __s2_pgd_ptrs(kvm_phys_shift(kvm), kvm_stage2_levels(kvm))
-#define stage2_pgd_size(kvm)           __s2_pgd_size(kvm_phys_shift(kvm), kvm_stage2_levels(kvm))
-
 /*
 * kvm_mmu_cache_min_pages() is the number of pages required to install
 * a stage-2 translation. We pre-allocate the entry level page table at
 * the VM creation.
  */
 #define kvm_mmu_cache_min_pages(kvm)   (kvm_stage2_levels(kvm) - 1)
 
-/* Stage2 PUD definitions when the level is present */
-static inline bool kvm_stage2_has_pud(struct kvm *kvm)
-{
-       return (CONFIG_PGTABLE_LEVELS > 3) && (kvm_stage2_levels(kvm) > 3);
-}
-
-#define S2_PUD_SHIFT                   ARM64_HW_PGTABLE_LEVEL_SHIFT(1)
-#define S2_PUD_SIZE                    (1UL << S2_PUD_SHIFT)
-#define S2_PUD_MASK                    (~(S2_PUD_SIZE - 1))
-
-#define stage2_pgd_none(kvm, pgd)              pgd_none(pgd)
-#define stage2_pgd_clear(kvm, pgd)             pgd_clear(pgd)
-#define stage2_pgd_present(kvm, pgd)           pgd_present(pgd)
-#define stage2_pgd_populate(kvm, pgd, p4d)     pgd_populate(NULL, pgd, p4d)
-
-static inline p4d_t *stage2_p4d_offset(struct kvm *kvm,
-                                      pgd_t *pgd, unsigned long address)
-{
-       return p4d_offset(pgd, address);
-}
-
-static inline void stage2_p4d_free(struct kvm *kvm, p4d_t *p4d)
-{
-}
-
-static inline bool stage2_p4d_table_empty(struct kvm *kvm, p4d_t *p4dp)
-{
-       return false;
-}
-
-static inline phys_addr_t stage2_p4d_addr_end(struct kvm *kvm,
-                                             phys_addr_t addr, phys_addr_t end)
-{
-       return end;
-}
-
-static inline bool stage2_p4d_none(struct kvm *kvm, p4d_t p4d)
-{
-       if (kvm_stage2_has_pud(kvm))
-               return p4d_none(p4d);
-       else
-               return 0;
-}
-
-static inline void stage2_p4d_clear(struct kvm *kvm, p4d_t *p4dp)
-{
-       if (kvm_stage2_has_pud(kvm))
-               p4d_clear(p4dp);
-}
-
-static inline bool stage2_p4d_present(struct kvm *kvm, p4d_t p4d)
-{
-       if (kvm_stage2_has_pud(kvm))
-               return p4d_present(p4d);
-       else
-               return 1;
-}
-
-static inline void stage2_p4d_populate(struct kvm *kvm, p4d_t *p4d, pud_t *pud)
-{
-       if (kvm_stage2_has_pud(kvm))
-               p4d_populate(NULL, p4d, pud);
-}
-
-static inline pud_t *stage2_pud_offset(struct kvm *kvm,
-                                      p4d_t *p4d, unsigned long address)
-{
-       if (kvm_stage2_has_pud(kvm))
-               return pud_offset(p4d, address);
-       else
-               return (pud_t *)p4d;
-}
-
-static inline void stage2_pud_free(struct kvm *kvm, pud_t *pud)
-{
-       if (kvm_stage2_has_pud(kvm))
-               free_page((unsigned long)pud);
-}
-
-static inline bool stage2_pud_table_empty(struct kvm *kvm, pud_t *pudp)
-{
-       if (kvm_stage2_has_pud(kvm))
-               return kvm_page_empty(pudp);
-       else
-               return false;
-}
-
-static inline phys_addr_t
-stage2_pud_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
-{
-       if (kvm_stage2_has_pud(kvm)) {
-               phys_addr_t boundary = (addr + S2_PUD_SIZE) & S2_PUD_MASK;
-
-               return (boundary - 1 < end - 1) ? boundary : end;
-       } else {
-               return end;
-       }
-}
-
-/* Stage2 PMD definitions when the level is present */
-static inline bool kvm_stage2_has_pmd(struct kvm *kvm)
-{
-       return (CONFIG_PGTABLE_LEVELS > 2) && (kvm_stage2_levels(kvm) > 2);
-}
-
-#define S2_PMD_SHIFT                   ARM64_HW_PGTABLE_LEVEL_SHIFT(2)
-#define S2_PMD_SIZE                    (1UL << S2_PMD_SHIFT)
-#define S2_PMD_MASK                    (~(S2_PMD_SIZE - 1))
-
-static inline bool stage2_pud_none(struct kvm *kvm, pud_t pud)
-{
-       if (kvm_stage2_has_pmd(kvm))
-               return pud_none(pud);
-       else
-               return 0;
-}
-
-static inline void stage2_pud_clear(struct kvm *kvm, pud_t *pud)
-{
-       if (kvm_stage2_has_pmd(kvm))
-               pud_clear(pud);
-}
-
-static inline bool stage2_pud_present(struct kvm *kvm, pud_t pud)
-{
-       if (kvm_stage2_has_pmd(kvm))
-               return pud_present(pud);
-       else
-               return 1;
-}
-
-static inline void stage2_pud_populate(struct kvm *kvm, pud_t *pud, pmd_t *pmd)
-{
-       if (kvm_stage2_has_pmd(kvm))
-               pud_populate(NULL, pud, pmd);
-}
-
-static inline pmd_t *stage2_pmd_offset(struct kvm *kvm,
-                                      pud_t *pud, unsigned long address)
-{
-       if (kvm_stage2_has_pmd(kvm))
-               return pmd_offset(pud, address);
-       else
-               return (pmd_t *)pud;
-}
-
-static inline void stage2_pmd_free(struct kvm *kvm, pmd_t *pmd)
-{
-       if (kvm_stage2_has_pmd(kvm))
-               free_page((unsigned long)pmd);
-}
-
-static inline bool stage2_pud_huge(struct kvm *kvm, pud_t pud)
-{
-       if (kvm_stage2_has_pmd(kvm))
-               return pud_huge(pud);
-       else
-               return 0;
-}
-
-static inline bool stage2_pmd_table_empty(struct kvm *kvm, pmd_t *pmdp)
-{
-       if (kvm_stage2_has_pmd(kvm))
-               return kvm_page_empty(pmdp);
-       else
-               return 0;
-}
-
-static inline phys_addr_t
-stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
-{
-       if (kvm_stage2_has_pmd(kvm)) {
-               phys_addr_t boundary = (addr + S2_PMD_SIZE) & S2_PMD_MASK;
-
-               return (boundary - 1 < end - 1) ? boundary : end;
-       } else {
-               return end;
-       }
-}
-
-static inline bool stage2_pte_table_empty(struct kvm *kvm, pte_t *ptep)
-{
-       return kvm_page_empty(ptep);
-}
-
-static inline unsigned long stage2_pgd_index(struct kvm *kvm, phys_addr_t addr)
-{
-       return (((addr) >> stage2_pgdir_shift(kvm)) & (stage2_pgd_ptrs(kvm) - 1));
-}
-
 static inline phys_addr_t
 stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
 {
@@ -256,13 +50,4 @@ stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
        return (boundary - 1 < end - 1) ? boundary : end;
 }
 
-/*
- * Level values for the ARMv8.4-TTL extension, mapping PUD/PMD/PTE and
- * the architectural page-table level.
- */
-#define S2_NO_LEVEL_HINT       0
-#define S2_PUD_LEVEL           1
-#define S2_PMD_LEVEL           2
-#define S2_PTE_LEVEL           3
-
 #endif /* __ARM64_S2_PGTABLE_H_ */
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index c074d98..28d1e98 100644
@@ -208,6 +208,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
                 */
                r = 1;
                break;
+       case KVM_CAP_STEAL_TIME:
+               r = kvm_arm_pvtime_supported();
+               break;
        default:
                r = kvm_arch_vm_ioctl_check_extension(kvm, ext);
                break;
@@ -285,7 +288,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
        if (vcpu->arch.has_run_once && unlikely(!irqchip_in_kernel(vcpu->kvm)))
                static_branch_dec(&userspace_irqchip_in_use);
 
-       kvm_mmu_free_memory_caches(vcpu);
+       kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
        kvm_timer_vcpu_terminate(vcpu);
        kvm_pmu_vcpu_destroy(vcpu);
 
diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile
index f54f0e8..607b8a8 100644
@@ -10,5 +10,5 @@ subdir-ccflags-y := -I$(incdir)                               \
                    -DDISABLE_BRANCH_PROFILING          \
                    $(DISABLE_STACKLEAK_PLUGIN)
 
-obj-$(CONFIG_KVM) += vhe/ nvhe/
+obj-$(CONFIG_KVM) += vhe/ nvhe/ pgtable.o
 obj-$(CONFIG_KVM_INDIRECT_VECTORS) += smccc_wa.o
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
new file mode 100644
index 0000000..603d6b4
--- /dev/null
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -0,0 +1,883 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Stand-alone page-table allocator for hyp stage-1 and guest stage-2.
+ * No bombay mix was harmed in the writing of this file.
+ *
+ * Copyright (C) 2020 Google LLC
+ * Author: Will Deacon <will@kernel.org>
+ */
+
+#include <linux/bitfield.h>
+#include <asm/kvm_pgtable.h>
+
+#define KVM_PGTABLE_MAX_LEVELS         4U
+
+#define KVM_PTE_VALID                  BIT(0)
+
+#define KVM_PTE_TYPE                   BIT(1)
+#define KVM_PTE_TYPE_BLOCK             0
+#define KVM_PTE_TYPE_PAGE              1
+#define KVM_PTE_TYPE_TABLE             1
+
+#define KVM_PTE_ADDR_MASK              GENMASK(47, PAGE_SHIFT)
+#define KVM_PTE_ADDR_51_48             GENMASK(15, 12)
+
+#define KVM_PTE_LEAF_ATTR_LO           GENMASK(11, 2)
+
+#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX        GENMASK(4, 2)
+#define KVM_PTE_LEAF_ATTR_LO_S1_AP     GENMASK(7, 6)
+#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO  3
+#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW  1
+#define KVM_PTE_LEAF_ATTR_LO_S1_SH     GENMASK(9, 8)
+#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS  3
+#define KVM_PTE_LEAF_ATTR_LO_S1_AF     BIT(10)
+
+#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR        GENMASK(5, 2)
+#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6)
+#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7)
+#define KVM_PTE_LEAF_ATTR_LO_S2_SH     GENMASK(9, 8)
+#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS  3
+#define KVM_PTE_LEAF_ATTR_LO_S2_AF     BIT(10)
+
+#define KVM_PTE_LEAF_ATTR_HI           GENMASK(63, 51)
+
+#define KVM_PTE_LEAF_ATTR_HI_S1_XN     BIT(54)
+
+#define KVM_PTE_LEAF_ATTR_HI_S2_XN     BIT(54)
+
+struct kvm_pgtable_walk_data {
+       struct kvm_pgtable              *pgt;
+       struct kvm_pgtable_walker       *walker;
+
+       u64                             addr;
+       u64                             end;
+};
+
+static u64 kvm_granule_shift(u32 level)
+{
+       /* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */
+       return ARM64_HW_PGTABLE_LEVEL_SHIFT(level);
+}
+
+static u64 kvm_granule_size(u32 level)
+{
+       return BIT(kvm_granule_shift(level));
+}
+
+static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level)
+{
+       u64 granule = kvm_granule_size(level);
+
+       /*
+        * Reject invalid block mappings and don't bother with 4TB mappings for
+        * 52-bit PAs.
+        */
+       if (level == 0 || (PAGE_SIZE != SZ_4K && level == 1))
+               return false;
+
+       if (granule > (end - addr))
+               return false;
+
+       return IS_ALIGNED(addr, granule) && IS_ALIGNED(phys, granule);
+}
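
(Editorial worked example, not part of the patch.) With 4K pages,
kvm_granule_shift() yields 39, 30, 21 and 12 for levels 0-3, i.e. 512GiB,
1GiB, 2MiB and 4KiB granules:

/*
 * Example: addr = 0x40200000, phys = 0x88200000, end = addr + SZ_4M.
 * At level 2 the granule is 2MiB; both addresses are 2MiB-aligned and
 * end - addr >= 2MiB, so a block mapping is allowed. At level 1 (1GiB)
 * the alignment checks fail and the walker descends a level instead.
 */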
+
+static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level)
+{
+       u64 shift = kvm_granule_shift(level);
+       u64 mask = BIT(PAGE_SHIFT - 3) - 1;
+
+       return (data->addr >> shift) & mask;
+}
+
+static u32 __kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr)
+{
+       u64 shift = kvm_granule_shift(pgt->start_level - 1); /* May underflow */
+       u64 mask = BIT(pgt->ia_bits) - 1;
+
+       return (addr & mask) >> shift;
+}
+
+static u32 kvm_pgd_page_idx(struct kvm_pgtable_walk_data *data)
+{
+       return __kvm_pgd_page_idx(data->pgt, data->addr);
+}
+
+static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level)
+{
+       struct kvm_pgtable pgt = {
+               .ia_bits        = ia_bits,
+               .start_level    = start_level,
+       };
+
+       return __kvm_pgd_page_idx(&pgt, -1ULL) + 1;
+}
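
(Editorial worked example, not part of the patch.) For concatenated stage-2
tables the PGD can span several pages:

/*
 * Example (4K pages): ia_bits = 40, start_level = 1.
 * kvm_granule_shift(start_level - 1) = 39, so
 * __kvm_pgd_page_idx(&pgt, -1ULL) = ((1ULL << 40) - 1) >> 39 = 1 and
 * kvm_pgd_pages() returns 2: the walk starts on two concatenated
 * level-1 tables.
 */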
+
+static bool kvm_pte_valid(kvm_pte_t pte)
+{
+       return pte & KVM_PTE_VALID;
+}
+
+static bool kvm_pte_table(kvm_pte_t pte, u32 level)
+{
+       if (level == KVM_PGTABLE_MAX_LEVELS - 1)
+               return false;
+
+       if (!kvm_pte_valid(pte))
+               return false;
+
+       return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE;
+}
+
+static u64 kvm_pte_to_phys(kvm_pte_t pte)
+{
+       u64 pa = pte & KVM_PTE_ADDR_MASK;
+
+       if (PAGE_SHIFT == 16)
+               pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48;
+
+       return pa;
+}
+
+static kvm_pte_t kvm_phys_to_pte(u64 pa)
+{
+       kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK;
+
+       if (PAGE_SHIFT == 16)
+               pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48);
+
+       return pte;
+}
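
(Editorial worked example, not part of the patch.) The 52-bit physical-address
packing above only applies with 64K pages (PAGE_SHIFT == 16):

/*
 * Example: pa = 0x000f000080010000 has bits [51:48] = 0xf.
 * kvm_phys_to_pte() keeps bits [47:16] (0x80010000) and stores 0xf in
 * pte bits [15:12], giving 0x8001f000; kvm_pte_to_phys() undoes this.
 */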
+
+static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte)
+{
+       return __va(kvm_pte_to_phys(pte));
+}
+
+static void kvm_set_invalid_pte(kvm_pte_t *ptep)
+{
+       kvm_pte_t pte = *ptep;
+       WRITE_ONCE(*ptep, pte & ~KVM_PTE_VALID);
+}
+
+static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp)
+{
+       kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(__pa(childp));
+
+       pte |= FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE);
+       pte |= KVM_PTE_VALID;
+
+       WARN_ON(kvm_pte_valid(old));
+       smp_store_release(ptep, pte);
+}
+
+static bool kvm_set_valid_leaf_pte(kvm_pte_t *ptep, u64 pa, kvm_pte_t attr,
+                                  u32 level)
+{
+       kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(pa);
+       u64 type = (level == KVM_PGTABLE_MAX_LEVELS - 1) ? KVM_PTE_TYPE_PAGE :
+                                                          KVM_PTE_TYPE_BLOCK;
+
+       pte |= attr & (KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI);
+       pte |= FIELD_PREP(KVM_PTE_TYPE, type);
+       pte |= KVM_PTE_VALID;
+
+       /* Tolerate KVM recreating the exact same mapping. */
+       if (kvm_pte_valid(old))
+               return old == pte;
+
+       smp_store_release(ptep, pte);
+       return true;
+}
+
+static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 addr,
+                                 u32 level, kvm_pte_t *ptep,
+                                 enum kvm_pgtable_walk_flags flag)
+{
+       struct kvm_pgtable_walker *walker = data->walker;
+       return walker->cb(addr, data->end, level, ptep, flag, walker->arg);
+}
+
+static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
+                             kvm_pte_t *pgtable, u32 level);
+
+static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
+                                     kvm_pte_t *ptep, u32 level)
+{
+       int ret = 0;
+       u64 addr = data->addr;
+       kvm_pte_t *childp, pte = *ptep;
+       bool table = kvm_pte_table(pte, level);
+       enum kvm_pgtable_walk_flags flags = data->walker->flags;
+
+       if (table && (flags & KVM_PGTABLE_WALK_TABLE_PRE)) {
+               ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
+                                            KVM_PGTABLE_WALK_TABLE_PRE);
+       }
+
+       if (!table && (flags & KVM_PGTABLE_WALK_LEAF)) {
+               ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
+                                            KVM_PGTABLE_WALK_LEAF);
+               pte = *ptep;
+               table = kvm_pte_table(pte, level);
+       }
+
+       if (ret)
+               goto out;
+
+       if (!table) {
+               data->addr += kvm_granule_size(level);
+               goto out;
+       }
+
+       childp = kvm_pte_follow(pte);
+       ret = __kvm_pgtable_walk(data, childp, level + 1);
+       if (ret)
+               goto out;
+
+       if (flags & KVM_PGTABLE_WALK_TABLE_POST) {
+               ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
+                                            KVM_PGTABLE_WALK_TABLE_POST);
+       }
+
+out:
+       return ret;
+}
+
+static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
+                             kvm_pte_t *pgtable, u32 level)
+{
+       u32 idx;
+       int ret = 0;
+
+       if (WARN_ON_ONCE(level >= KVM_PGTABLE_MAX_LEVELS))
+               return -EINVAL;
+
+       for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) {
+               kvm_pte_t *ptep = &pgtable[idx];
+
+               if (data->addr >= data->end)
+                       break;
+
+               ret = __kvm_pgtable_visit(data, ptep, level);
+               if (ret)
+                       break;
+       }
+
+       return ret;
+}
+
+static int _kvm_pgtable_walk(struct kvm_pgtable_walk_data *data)
+{
+       u32 idx;
+       int ret = 0;
+       struct kvm_pgtable *pgt = data->pgt;
+       u64 limit = BIT(pgt->ia_bits);
+
+       if (data->addr > limit || data->end > limit)
+               return -ERANGE;
+
+       if (!pgt->pgd)
+               return -EINVAL;
+
+       for (idx = kvm_pgd_page_idx(data); data->addr < data->end; ++idx) {
+               kvm_pte_t *ptep = &pgt->pgd[idx * PTRS_PER_PTE];
+
+               ret = __kvm_pgtable_walk(data, ptep, pgt->start_level);
+               if (ret)
+                       break;
+       }
+
+       return ret;
+}
+
+int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
+                    struct kvm_pgtable_walker *walker)
+{
+       struct kvm_pgtable_walk_data walk_data = {
+               .pgt    = pgt,
+               .addr   = ALIGN_DOWN(addr, PAGE_SIZE),
+               .end    = PAGE_ALIGN(walk_data.addr + size),
+               .walker = walker,
+       };
+
+       return _kvm_pgtable_walk(&walk_data);
+}
+
+struct hyp_map_data {
+       u64             phys;
+       kvm_pte_t       attr;
+};
+
+static int hyp_map_set_prot_attr(enum kvm_pgtable_prot prot,
+                                struct hyp_map_data *data)
+{
+       bool device = prot & KVM_PGTABLE_PROT_DEVICE;
+       u32 mtype = device ? MT_DEVICE_nGnRE : MT_NORMAL;
+       kvm_pte_t attr = FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX, mtype);
+       u32 sh = KVM_PTE_LEAF_ATTR_LO_S1_SH_IS;
+       u32 ap = (prot & KVM_PGTABLE_PROT_W) ? KVM_PTE_LEAF_ATTR_LO_S1_AP_RW :
+                                              KVM_PTE_LEAF_ATTR_LO_S1_AP_RO;
+
+       if (!(prot & KVM_PGTABLE_PROT_R))
+               return -EINVAL;
+
+       if (prot & KVM_PGTABLE_PROT_X) {
+               if (prot & KVM_PGTABLE_PROT_W)
+                       return -EINVAL;
+
+               if (device)
+                       return -EINVAL;
+       } else {
+               attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN;
+       }
+
+       attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap);
+       attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh);
+       attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF;
+       data->attr = attr;
+       return 0;
+}
+
+static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level,
+                                   kvm_pte_t *ptep, struct hyp_map_data *data)
+{
+       u64 granule = kvm_granule_size(level), phys = data->phys;
+
+       if (!kvm_block_mapping_supported(addr, end, phys, level))
+               return false;
+
+       WARN_ON(!kvm_set_valid_leaf_pte(ptep, phys, data->attr, level));
+       data->phys += granule;
+       return true;
+}
+
+static int hyp_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+                         enum kvm_pgtable_walk_flags flag, void * const arg)
+{
+       kvm_pte_t *childp;
+
+       if (hyp_map_walker_try_leaf(addr, end, level, ptep, arg))
+               return 0;
+
+       if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1))
+               return -EINVAL;
+
+       childp = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL);
+       if (!childp)
+               return -ENOMEM;
+
+       kvm_set_table_pte(ptep, childp);
+       return 0;
+}
+
+int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
+                       enum kvm_pgtable_prot prot)
+{
+       int ret;
+       struct hyp_map_data map_data = {
+               .phys   = ALIGN_DOWN(phys, PAGE_SIZE),
+       };
+       struct kvm_pgtable_walker walker = {
+               .cb     = hyp_map_walker,
+               .flags  = KVM_PGTABLE_WALK_LEAF,
+               .arg    = &map_data,
+       };
+
+       ret = hyp_map_set_prot_attr(prot, &map_data);
+       if (ret)
+               return ret;
+
+       ret = kvm_pgtable_walk(pgt, addr, size, &walker);
+       dsb(ishst);
+       isb();
+       return ret;
+}
+
+int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits)
+{
+       u64 levels = ARM64_HW_PGTABLE_LEVELS(va_bits);
+
+       pgt->pgd = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL);
+       if (!pgt->pgd)
+               return -ENOMEM;
+
+       pgt->ia_bits            = va_bits;
+       pgt->start_level        = KVM_PGTABLE_MAX_LEVELS - levels;
+       pgt->mmu                = NULL;
+       return 0;
+}
+
+static int hyp_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+                          enum kvm_pgtable_walk_flags flag, void * const arg)
+{
+       free_page((unsigned long)kvm_pte_follow(*ptep));
+       return 0;
+}
+
+void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt)
+{
+       struct kvm_pgtable_walker walker = {
+               .cb     = hyp_free_walker,
+               .flags  = KVM_PGTABLE_WALK_TABLE_POST,
+       };
+
+       WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
+       free_page((unsigned long)pgt->pgd);
+       pgt->pgd = NULL;
+}
+
+struct stage2_map_data {
+       u64                             phys;
+       kvm_pte_t                       attr;
+
+       kvm_pte_t                       *anchor;
+
+       struct kvm_s2_mmu               *mmu;
+       struct kvm_mmu_memory_cache     *memcache;
+};
+
+static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot,
+                                   struct stage2_map_data *data)
+{
+       bool device = prot & KVM_PGTABLE_PROT_DEVICE;
+       kvm_pte_t attr = device ? PAGE_S2_MEMATTR(DEVICE_nGnRE) :
+                           PAGE_S2_MEMATTR(NORMAL);
+       u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS;
+
+       if (!(prot & KVM_PGTABLE_PROT_X))
+               attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
+       else if (device)
+               return -EINVAL;
+
+       if (prot & KVM_PGTABLE_PROT_R)
+               attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
+
+       if (prot & KVM_PGTABLE_PROT_W)
+               attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
+
+       attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh);
+       attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
+       data->attr = attr;
+       return 0;
+}
+
+static bool stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
+                                      kvm_pte_t *ptep,
+                                      struct stage2_map_data *data)
+{
+       u64 granule = kvm_granule_size(level), phys = data->phys;
+
+       if (!kvm_block_mapping_supported(addr, end, phys, level))
+               return false;
+
+       if (kvm_set_valid_leaf_pte(ptep, phys, data->attr, level))
+               goto out;
+
+       /* There's an existing valid leaf entry, so perform break-before-make */
+       kvm_set_invalid_pte(ptep);
+       kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
+       kvm_set_valid_leaf_pte(ptep, phys, data->attr, level);
+out:
+       data->phys += granule;
+       return true;
+}
+
+static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level,
+                                    kvm_pte_t *ptep,
+                                    struct stage2_map_data *data)
+{
+       if (data->anchor)
+               return 0;
+
+       if (!kvm_block_mapping_supported(addr, end, data->phys, level))
+               return 0;
+
+       kvm_set_invalid_pte(ptep);
+       kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, 0);
+       data->anchor = ptep;
+       return 0;
+}
+
+static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+                               struct stage2_map_data *data)
+{
+       kvm_pte_t *childp, pte = *ptep;
+       struct page *page = virt_to_page(ptep);
+
+       if (data->anchor) {
+               if (kvm_pte_valid(pte))
+                       put_page(page);
+
+               return 0;
+       }
+
+       if (stage2_map_walker_try_leaf(addr, end, level, ptep, data))
+               goto out_get_page;
+
+       if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1))
+               return -EINVAL;
+
+       if (!data->memcache)
+               return -ENOMEM;
+
+       childp = kvm_mmu_memory_cache_alloc(data->memcache);
+       if (!childp)
+               return -ENOMEM;
+
+       /*
+        * If we've run into an existing block mapping then replace it with
+        * a table. Accesses beyond 'end' that fall within the new table
+        * will be mapped lazily.
+        */
+       if (kvm_pte_valid(pte)) {
+               kvm_set_invalid_pte(ptep);
+               kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
+               put_page(page);
+       }
+
+       kvm_set_table_pte(ptep, childp);
+
+out_get_page:
+       get_page(page);
+       return 0;
+}
+
+static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level,
+                                     kvm_pte_t *ptep,
+                                     struct stage2_map_data *data)
+{
+       int ret = 0;
+
+       if (!data->anchor)
+               return 0;
+
+       free_page((unsigned long)kvm_pte_follow(*ptep));
+       put_page(virt_to_page(ptep));
+
+       if (data->anchor == ptep) {
+               data->anchor = NULL;
+               ret = stage2_map_walk_leaf(addr, end, level, ptep, data);
+       }
+
+       return ret;
+}
+
+/*
+ * This is a little fiddly, as we use all three of the walk flags. The idea
+ * is that the TABLE_PRE callback runs for table entries on the way down,
+ * looking for table entries which we could conceivably replace with a
+ * block entry for this mapping. If it finds one, then it sets the 'anchor'
+ * field in 'struct stage2_map_data' to point at the table entry, before
+ * clearing the entry to zero and descending into the now detached table.
+ *
+ * The behaviour of the LEAF callback then depends on whether or not the
+ * anchor has been set. If not, then we're not using a block mapping higher
+ * up the table and we perform the mapping at the existing leaves instead.
+ * If, on the other hand, the anchor _is_ set, then we drop references to
+ * all valid leaves so that the pages beneath the anchor can be freed.
+ *
+ * Finally, the TABLE_POST callback does nothing if the anchor has not
+ * been set, but otherwise frees the page-table pages while walking back up
+ * the page-table, installing the block entry when it revisits the anchor
+ * pointer and clearing the anchor to NULL.
+ */
+static int stage2_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+                            enum kvm_pgtable_walk_flags flag, void * const arg)
+{
+       struct stage2_map_data *data = arg;
+
+       switch (flag) {
+       case KVM_PGTABLE_WALK_TABLE_PRE:
+               return stage2_map_walk_table_pre(addr, end, level, ptep, data);
+       case KVM_PGTABLE_WALK_LEAF:
+               return stage2_map_walk_leaf(addr, end, level, ptep, data);
+       case KVM_PGTABLE_WALK_TABLE_POST:
+               return stage2_map_walk_table_post(addr, end, level, ptep, data);
+       }
+
+       return -EINVAL;
+}
+
+int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
+                          u64 phys, enum kvm_pgtable_prot prot,
+                          struct kvm_mmu_memory_cache *mc)
+{
+       int ret;
+       struct stage2_map_data map_data = {
+               .phys           = ALIGN_DOWN(phys, PAGE_SIZE),
+               .mmu            = pgt->mmu,
+               .memcache       = mc,
+       };
+       struct kvm_pgtable_walker walker = {
+               .cb             = stage2_map_walker,
+               .flags          = KVM_PGTABLE_WALK_TABLE_PRE |
+                                 KVM_PGTABLE_WALK_LEAF |
+                                 KVM_PGTABLE_WALK_TABLE_POST,
+               .arg            = &map_data,
+       };
+
+       ret = stage2_map_set_prot_attr(prot, &map_data);
+       if (ret)
+               return ret;
+
+       ret = kvm_pgtable_walk(pgt, addr, size, &walker);
+       dsb(ishst);
+       return ret;
+}
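+
+/*
+ * A minimal usage sketch of the map path above (illustrative only, with the
+ * caller-side names assumed): map one writable, non-executable page at IPA
+ * 'ipa', backed by host physical address 'pa', assuming the caller serialises
+ * stage-2 updates and has already topped up the memory cache 'mc':
+ *
+ *     ret = kvm_pgtable_stage2_map(pgt, ipa, PAGE_SIZE, pa,
+ *                                  KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W,
+ *                                  mc);
+ *
+ * The walker callbacks above install the mapping at the deepest level
+ * possible, performing break-before-make if a valid entry is already present.
+ */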
+
+static void stage2_flush_dcache(void *addr, u64 size)
+{
+       if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
+               return;
+
+       __flush_dcache_area(addr, size);
+}
+
+static bool stage2_pte_cacheable(kvm_pte_t pte)
+{
+       u64 memattr = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR, pte);
+       return memattr == PAGE_S2_MEMATTR(NORMAL);
+}
+
+static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+                              enum kvm_pgtable_walk_flags flag,
+                              void * const arg)
+{
+       struct kvm_s2_mmu *mmu = arg;
+       kvm_pte_t pte = *ptep, *childp = NULL;
+       bool need_flush = false;
+
+       if (!kvm_pte_valid(pte))
+               return 0;
+
+       if (kvm_pte_table(pte, level)) {
+               childp = kvm_pte_follow(pte);
+
+               if (page_count(virt_to_page(childp)) != 1)
+                       return 0;
+       } else if (stage2_pte_cacheable(pte)) {
+               need_flush = true;
+       }
+
+       /*
+        * This is similar to the map() path in that we unmap the entire
+        * block entry and rely on the remaining portions being faulted
+        * back lazily.
+        */
+       kvm_set_invalid_pte(ptep);
+       kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, addr, level);
+       put_page(virt_to_page(ptep));
+
+       if (need_flush) {
+               stage2_flush_dcache(kvm_pte_follow(pte),
+                                   kvm_granule_size(level));
+       }
+
+       if (childp)
+               free_page((unsigned long)childp);
+
+       return 0;
+}
+
+int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
+{
+       struct kvm_pgtable_walker walker = {
+               .cb     = stage2_unmap_walker,
+               .arg    = pgt->mmu,
+               .flags  = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
+       };
+
+       return kvm_pgtable_walk(pgt, addr, size, &walker);
+}
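+
+/*
+ * Illustrative sketch only: tearing down the stage-2 mappings backing a
+ * memslot reduces to a single call to the helper above (assuming the usual
+ * locking around stage-2 updates):
+ *
+ *     u64 addr = memslot->base_gfn << PAGE_SHIFT;
+ *     u64 size = memslot->npages * PAGE_SIZE;
+ *
+ *     kvm_pgtable_stage2_unmap(pgt, addr, size);
+ *
+ * As described in stage2_unmap_walker(), block entries straddling the range
+ * are removed wholesale and the remaining portions are faulted back lazily.
+ */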
+
+struct stage2_attr_data {
+       kvm_pte_t       attr_set;
+       kvm_pte_t       attr_clr;
+       kvm_pte_t       pte;
+};
+
+static int stage2_attr_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+                             enum kvm_pgtable_walk_flags flag,
+                             void * const arg)
+{
+       kvm_pte_t pte = *ptep;
+       struct stage2_attr_data *data = arg;
+
+       if (!kvm_pte_valid(pte))
+               return 0;
+
+       data->pte = pte;
+       pte &= ~data->attr_clr;
+       pte |= data->attr_set;
+
+       /*
+        * We may race with the CPU trying to set the access flag here,
+        * but in the worst case the access flag update is lost and will
+        * be set again on the next access.
+        */
+       if (data->pte != pte)
+               WRITE_ONCE(*ptep, pte);
+
+       return 0;
+}
+
+static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr,
+                                   u64 size, kvm_pte_t attr_set,
+                                   kvm_pte_t attr_clr, kvm_pte_t *orig_pte)
+{
+       int ret;
+       kvm_pte_t attr_mask = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI;
+       struct stage2_attr_data data = {
+               .attr_set       = attr_set & attr_mask,
+               .attr_clr       = attr_clr & attr_mask,
+       };
+       struct kvm_pgtable_walker walker = {
+               .cb             = stage2_attr_walker,
+               .arg            = &data,
+               .flags          = KVM_PGTABLE_WALK_LEAF,
+       };
+
+       ret = kvm_pgtable_walk(pgt, addr, size, &walker);
+       if (ret)
+               return ret;
+
+       if (orig_pte)
+               *orig_pte = data.pte;
+       return 0;
+}
+
+int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
+{
+       return stage2_update_leaf_attrs(pgt, addr, size, 0,
+                                       KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W, NULL);
+}
+
+kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr)
+{
+       kvm_pte_t pte = 0;
+       stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0,
+                                &pte);
+       dsb(ishst);
+       return pte;
+}
+
+kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr)
+{
+       kvm_pte_t pte = 0;
+       stage2_update_leaf_attrs(pgt, addr, 1, 0, KVM_PTE_LEAF_ATTR_LO_S2_AF,
+                                &pte);
+       /*
+        * "But where's the TLBI?!", you scream.
+        * "Over in the core code", I sigh.
+        *
+        * See the '->clear_flush_young()' callback on the KVM mmu notifier.
+        */
+       return pte;
+}
+
+bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr)
+{
+       kvm_pte_t pte = 0;
+       stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte);
+       return pte & KVM_PTE_LEAF_ATTR_LO_S2_AF;
+}
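+
+/*
+ * Illustrative sketch of how the access-flag helpers above combine, e.g. for
+ * a "test and clear young" operation on a single IPA (caller names assumed):
+ *
+ *     kvm_pte_t pte = kvm_pgtable_stage2_mkold(pgt, ipa);
+ *     bool was_young = !!(pte & KVM_PTE_LEAF_ATTR_LO_S2_AF);
+ *
+ * whereas a non-destructive query is simply:
+ *
+ *     bool young = kvm_pgtable_stage2_is_young(pgt, ipa);
+ *
+ * As noted above, any TLB invalidation after mkold() is left to the caller.
+ */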
+
+int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
+                                  enum kvm_pgtable_prot prot)
+{
+       int ret;
+       kvm_pte_t set = 0, clr = 0;
+
+       if (prot & KVM_PGTABLE_PROT_R)
+               set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
+
+       if (prot & KVM_PGTABLE_PROT_W)
+               set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
+
+       if (prot & KVM_PGTABLE_PROT_X)
+               clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
+
+       ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL);
+       kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, pgt->mmu, addr, 0);
+       return ret;
+}
+
+static int stage2_flush_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+                              enum kvm_pgtable_walk_flags flag,
+                              void * const arg)
+{
+       kvm_pte_t pte = *ptep;
+
+       if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pte))
+               return 0;
+
+       stage2_flush_dcache(kvm_pte_follow(pte), kvm_granule_size(level));
+       return 0;
+}
+
+int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
+{
+       struct kvm_pgtable_walker walker = {
+               .cb     = stage2_flush_walker,
+               .flags  = KVM_PGTABLE_WALK_LEAF,
+       };
+
+       if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
+               return 0;
+
+       return kvm_pgtable_walk(pgt, addr, size, &walker);
+}
+
+int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm)
+{
+       size_t pgd_sz;
+       u64 vtcr = kvm->arch.vtcr;
+       u32 ia_bits = VTCR_EL2_IPA(vtcr);
+       u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
+       u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;
+
+       pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
+       pgt->pgd = alloc_pages_exact(pgd_sz, GFP_KERNEL | __GFP_ZERO);
+       if (!pgt->pgd)
+               return -ENOMEM;
+
+       pgt->ia_bits            = ia_bits;
+       pgt->start_level        = start_level;
+       pgt->mmu                = &kvm->arch.mmu;
+
+       /* Ensure zeroed PGD pages are visible to the hardware walker */
+       dsb(ishst);
+       return 0;
+}
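+
+/*
+ * Illustrative lifecycle sketch, mirroring what kvm_init_stage2_mmu() and
+ * kvm_free_stage2_pgd() do in mmu.c (error handling elided):
+ *
+ *     struct kvm_pgtable *pgt = kzalloc(sizeof(*pgt), GFP_KERNEL);
+ *
+ *     kvm_pgtable_stage2_init(pgt, kvm);
+ *     ... map, unmap and update attributes via the helpers above ...
+ *     kvm_pgtable_stage2_destroy(pgt);
+ *     kfree(pgt);
+ */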
+
+static int stage2_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+                             enum kvm_pgtable_walk_flags flag,
+                             void * const arg)
+{
+       kvm_pte_t pte = *ptep;
+
+       if (!kvm_pte_valid(pte))
+               return 0;
+
+       put_page(virt_to_page(ptep));
+
+       if (kvm_pte_table(pte, level))
+               free_page((unsigned long)kvm_pte_follow(pte));
+
+       return 0;
+}
+
+void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
+{
+       size_t pgd_sz;
+       struct kvm_pgtable_walker walker = {
+               .cb     = stage2_free_walker,
+               .flags  = KVM_PGTABLE_WALK_LEAF |
+                         KVM_PGTABLE_WALK_TABLE_POST,
+       };
+
+       WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
+       pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE;
+       free_pages_exact(pgt->pgd, pgd_sz);
+       pgt->pgd = NULL;
+}
index ba00bcc..e8a5179 100644 (file)
@@ -14,6 +14,7 @@
 #include <asm/cacheflush.h>
 #include <asm/kvm_arm.h>
 #include <asm/kvm_mmu.h>
+#include <asm/kvm_pgtable.h>
 #include <asm/kvm_ras.h>
 #include <asm/kvm_asm.h>
 #include <asm/kvm_emulate.h>
@@ -21,9 +22,7 @@
 
 #include "trace.h"
 
-static pgd_t *boot_hyp_pgd;
-static pgd_t *hyp_pgd;
-static pgd_t *merged_hyp_pgd;
+static struct kvm_pgtable *hyp_pgtable;
 static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
 
 static unsigned long hyp_idmap_start;
@@ -32,16 +31,42 @@ static phys_addr_t hyp_idmap_vector;
 
 static unsigned long io_map_base;
 
-#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))
 
-#define KVM_S2PTE_FLAG_IS_IOMAP                (1UL << 0)
-#define KVM_S2_FLAG_LOGGING_ACTIVE     (1UL << 1)
-
-static bool is_iomap(unsigned long flags)
+/*
+ * Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
+ * we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
+ * CONFIG_LOCKUP_DETECTOR or CONFIG_LOCKDEP. Holding the lock for too long
+ * will also starve other vCPUs. We also have to make sure that the page
+ * tables are not freed while the lock is released.
+ */
+static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr,
+                             phys_addr_t end,
+                             int (*fn)(struct kvm_pgtable *, u64, u64),
+                             bool resched)
 {
-       return flags & KVM_S2PTE_FLAG_IS_IOMAP;
+       int ret;
+       u64 next;
+
+       do {
+               struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
+               if (!pgt)
+                       return -EINVAL;
+
+               next = stage2_pgd_addr_end(kvm, addr, end);
+               ret = fn(pgt, addr, next - addr);
+               if (ret)
+                       break;
+
+               if (resched && next != end)
+                       cond_resched_lock(&kvm->mmu_lock);
+       } while (addr = next, addr != end);
+
+       return ret;
 }
 
+#define stage2_apply_range_resched(kvm, addr, end, fn)                 \
+       stage2_apply_range(kvm, addr, end, fn, true)
+
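+/*
+ * For example, flushing an address range then becomes a single call with the
+ * matching page-table helper as the callback, as done further down in this
+ * file:
+ *
+ *     stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_flush);
+ */
+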
 static bool memslot_is_logging(struct kvm_memory_slot *memslot)
 {
        return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
@@ -58,154 +83,11 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
        kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
 }
 
-static void kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa,
-                                  int level)
-{
-       kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ipa, level);
-}
-
-/*
- * D-Cache management functions. They take the page table entries by
- * value, as they are flushing the cache using the kernel mapping (or
- * kmap on 32bit).
- */
-static void kvm_flush_dcache_pte(pte_t pte)
-{
-       __kvm_flush_dcache_pte(pte);
-}
-
-static void kvm_flush_dcache_pmd(pmd_t pmd)
-{
-       __kvm_flush_dcache_pmd(pmd);
-}
-
-static void kvm_flush_dcache_pud(pud_t pud)
-{
-       __kvm_flush_dcache_pud(pud);
-}
-
 static bool kvm_is_device_pfn(unsigned long pfn)
 {
        return !pfn_valid(pfn);
 }
 
-/**
- * stage2_dissolve_pmd() - clear and flush huge PMD entry
- * @mmu:       pointer to mmu structure to operate on
- * @addr:      IPA
- * @pmd:       pmd pointer for IPA
- *
- * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs.
- */
-static void stage2_dissolve_pmd(struct kvm_s2_mmu *mmu, phys_addr_t addr, pmd_t *pmd)
-{
-       if (!pmd_thp_or_huge(*pmd))
-               return;
-
-       pmd_clear(pmd);
-       kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
-       put_page(virt_to_page(pmd));
-}
-
-/**
- * stage2_dissolve_pud() - clear and flush huge PUD entry
- * @mmu:       pointer to mmu structure to operate on
- * @addr:      IPA
- * @pud:       pud pointer for IPA
- *
- * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs.
- */
-static void stage2_dissolve_pud(struct kvm_s2_mmu *mmu, phys_addr_t addr, pud_t *pudp)
-{
-       struct kvm *kvm = mmu->kvm;
-
-       if (!stage2_pud_huge(kvm, *pudp))
-               return;
-
-       stage2_pud_clear(kvm, pudp);
-       kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
-       put_page(virt_to_page(pudp));
-}
-
-static void clear_stage2_pgd_entry(struct kvm_s2_mmu *mmu, pgd_t *pgd, phys_addr_t addr)
-{
-       struct kvm *kvm = mmu->kvm;
-       p4d_t *p4d_table __maybe_unused = stage2_p4d_offset(kvm, pgd, 0UL);
-       stage2_pgd_clear(kvm, pgd);
-       kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
-       stage2_p4d_free(kvm, p4d_table);
-       put_page(virt_to_page(pgd));
-}
-
-static void clear_stage2_p4d_entry(struct kvm_s2_mmu *mmu, p4d_t *p4d, phys_addr_t addr)
-{
-       struct kvm *kvm = mmu->kvm;
-       pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, p4d, 0);
-       stage2_p4d_clear(kvm, p4d);
-       kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
-       stage2_pud_free(kvm, pud_table);
-       put_page(virt_to_page(p4d));
-}
-
-static void clear_stage2_pud_entry(struct kvm_s2_mmu *mmu, pud_t *pud, phys_addr_t addr)
-{
-       struct kvm *kvm = mmu->kvm;
-       pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0);
-
-       VM_BUG_ON(stage2_pud_huge(kvm, *pud));
-       stage2_pud_clear(kvm, pud);
-       kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
-       stage2_pmd_free(kvm, pmd_table);
-       put_page(virt_to_page(pud));
-}
-
-static void clear_stage2_pmd_entry(struct kvm_s2_mmu *mmu, pmd_t *pmd, phys_addr_t addr)
-{
-       pte_t *pte_table = pte_offset_kernel(pmd, 0);
-       VM_BUG_ON(pmd_thp_or_huge(*pmd));
-       pmd_clear(pmd);
-       kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
-       free_page((unsigned long)pte_table);
-       put_page(virt_to_page(pmd));
-}
-
-static inline void kvm_set_pte(pte_t *ptep, pte_t new_pte)
-{
-       WRITE_ONCE(*ptep, new_pte);
-       dsb(ishst);
-}
-
-static inline void kvm_set_pmd(pmd_t *pmdp, pmd_t new_pmd)
-{
-       WRITE_ONCE(*pmdp, new_pmd);
-       dsb(ishst);
-}
-
-static inline void kvm_pmd_populate(pmd_t *pmdp, pte_t *ptep)
-{
-       kvm_set_pmd(pmdp, kvm_mk_pmd(ptep));
-}
-
-static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp)
-{
-       WRITE_ONCE(*pudp, kvm_mk_pud(pmdp));
-       dsb(ishst);
-}
-
-static inline void kvm_p4d_populate(p4d_t *p4dp, pud_t *pudp)
-{
-       WRITE_ONCE(*p4dp, kvm_mk_p4d(pudp));
-       dsb(ishst);
-}
-
-static inline void kvm_pgd_populate(pgd_t *pgdp, p4d_t *p4dp)
-{
-#ifndef __PAGETABLE_P4D_FOLDED
-       WRITE_ONCE(*pgdp, kvm_mk_pgd(p4dp));
-       dsb(ishst);
-#endif
-}
-
 /*
  * Unmapping vs dcache management:
  *
@@ -223,115 +105,13 @@ static inline void kvm_pgd_populate(pgd_t *pgdp, p4d_t *p4dp)
  * end up writing old data to disk.
  *
  * This is why right after unmapping a page/section and invalidating
- * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure
- * the IO subsystem will never hit in the cache.
+ * the corresponding TLBs, we flush to make sure the IO subsystem will
+ * never hit in the cache.
  *
  * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
  * we then fully enforce cacheability of RAM, no matter what the guest
  * does.
  */
-static void unmap_stage2_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd,
-                      phys_addr_t addr, phys_addr_t end)
-{
-       phys_addr_t start_addr = addr;
-       pte_t *pte, *start_pte;
-
-       start_pte = pte = pte_offset_kernel(pmd, addr);
-       do {
-               if (!pte_none(*pte)) {
-                       pte_t old_pte = *pte;
-
-                       kvm_set_pte(pte, __pte(0));
-                       kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PTE_LEVEL);
-
-                       /* No need to invalidate the cache for device mappings */
-                       if (!kvm_is_device_pfn(pte_pfn(old_pte)))
-                               kvm_flush_dcache_pte(old_pte);
-
-                       put_page(virt_to_page(pte));
-               }
-       } while (pte++, addr += PAGE_SIZE, addr != end);
-
-       if (stage2_pte_table_empty(mmu->kvm, start_pte))
-               clear_stage2_pmd_entry(mmu, pmd, start_addr);
-}
-
-static void unmap_stage2_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
-                      phys_addr_t addr, phys_addr_t end)
-{
-       struct kvm *kvm = mmu->kvm;
-       phys_addr_t next, start_addr = addr;
-       pmd_t *pmd, *start_pmd;
-
-       start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr);
-       do {
-               next = stage2_pmd_addr_end(kvm, addr, end);
-               if (!pmd_none(*pmd)) {
-                       if (pmd_thp_or_huge(*pmd)) {
-                               pmd_t old_pmd = *pmd;
-
-                               pmd_clear(pmd);
-                               kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
-
-                               kvm_flush_dcache_pmd(old_pmd);
-
-                               put_page(virt_to_page(pmd));
-                       } else {
-                               unmap_stage2_ptes(mmu, pmd, addr, next);
-                       }
-               }
-       } while (pmd++, addr = next, addr != end);
-
-       if (stage2_pmd_table_empty(kvm, start_pmd))
-               clear_stage2_pud_entry(mmu, pud, start_addr);
-}
-
-static void unmap_stage2_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
-                      phys_addr_t addr, phys_addr_t end)
-{
-       struct kvm *kvm = mmu->kvm;
-       phys_addr_t next, start_addr = addr;
-       pud_t *pud, *start_pud;
-
-       start_pud = pud = stage2_pud_offset(kvm, p4d, addr);
-       do {
-               next = stage2_pud_addr_end(kvm, addr, end);
-               if (!stage2_pud_none(kvm, *pud)) {
-                       if (stage2_pud_huge(kvm, *pud)) {
-                               pud_t old_pud = *pud;
-
-                               stage2_pud_clear(kvm, pud);
-                               kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
-                               kvm_flush_dcache_pud(old_pud);
-                               put_page(virt_to_page(pud));
-                       } else {
-                               unmap_stage2_pmds(mmu, pud, addr, next);
-                       }
-               }
-       } while (pud++, addr = next, addr != end);
-
-       if (stage2_pud_table_empty(kvm, start_pud))
-               clear_stage2_p4d_entry(mmu, p4d, start_addr);
-}
-
-static void unmap_stage2_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
-                      phys_addr_t addr, phys_addr_t end)
-{
-       struct kvm *kvm = mmu->kvm;
-       phys_addr_t next, start_addr = addr;
-       p4d_t *p4d, *start_p4d;
-
-       start_p4d = p4d = stage2_p4d_offset(kvm, pgd, addr);
-       do {
-               next = stage2_p4d_addr_end(kvm, addr, end);
-               if (!stage2_p4d_none(kvm, *p4d))
-                       unmap_stage2_puds(mmu, p4d, addr, next);
-       } while (p4d++, addr = next, addr != end);
-
-       if (stage2_p4d_table_empty(kvm, start_p4d))
-               clear_stage2_pgd_entry(mmu, pgd, start_addr);
-}
-
 /**
  * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
  * @kvm:   The VM pointer
@@ -347,32 +127,12 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64
                                 bool may_block)
 {
        struct kvm *kvm = mmu->kvm;
-       pgd_t *pgd;
-       phys_addr_t addr = start, end = start + size;
-       phys_addr_t next;
+       phys_addr_t end = start + size;
 
        assert_spin_locked(&kvm->mmu_lock);
        WARN_ON(size & ~PAGE_MASK);
-
-       pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
-       do {
-               /*
-                * Make sure the page table is still active, as another thread
-                * could have possibly freed the page table, while we released
-                * the lock.
-                */
-               if (!READ_ONCE(mmu->pgd))
-                       break;
-               next = stage2_pgd_addr_end(kvm, addr, end);
-               if (!stage2_pgd_none(kvm, *pgd))
-                       unmap_stage2_p4ds(mmu, pgd, addr, next);
-               /*
-                * If the range is too large, release the kvm->mmu_lock
-                * to prevent starvation and lockup detector warnings.
-                */
-               if (may_block && next != end)
-                       cond_resched_lock(&kvm->mmu_lock);
-       } while (pgd++, addr = next, addr != end);
+       WARN_ON(stage2_apply_range(kvm, start, end, kvm_pgtable_stage2_unmap,
+                                  may_block));
 }
 
 static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
@@ -380,89 +140,13 @@ static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 si
        __unmap_stage2_range(mmu, start, size, true);
 }
 
-static void stage2_flush_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd,
-                             phys_addr_t addr, phys_addr_t end)
-{
-       pte_t *pte;
-
-       pte = pte_offset_kernel(pmd, addr);
-       do {
-               if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte)))
-                       kvm_flush_dcache_pte(*pte);
-       } while (pte++, addr += PAGE_SIZE, addr != end);
-}
-
-static void stage2_flush_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
-                             phys_addr_t addr, phys_addr_t end)
-{
-       struct kvm *kvm = mmu->kvm;
-       pmd_t *pmd;
-       phys_addr_t next;
-
-       pmd = stage2_pmd_offset(kvm, pud, addr);
-       do {
-               next = stage2_pmd_addr_end(kvm, addr, end);
-               if (!pmd_none(*pmd)) {
-                       if (pmd_thp_or_huge(*pmd))
-                               kvm_flush_dcache_pmd(*pmd);
-                       else
-                               stage2_flush_ptes(mmu, pmd, addr, next);
-               }
-       } while (pmd++, addr = next, addr != end);
-}
-
-static void stage2_flush_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
-                             phys_addr_t addr, phys_addr_t end)
-{
-       struct kvm *kvm = mmu->kvm;
-       pud_t *pud;
-       phys_addr_t next;
-
-       pud = stage2_pud_offset(kvm, p4d, addr);
-       do {
-               next = stage2_pud_addr_end(kvm, addr, end);
-               if (!stage2_pud_none(kvm, *pud)) {
-                       if (stage2_pud_huge(kvm, *pud))
-                               kvm_flush_dcache_pud(*pud);
-                       else
-                               stage2_flush_pmds(mmu, pud, addr, next);
-               }
-       } while (pud++, addr = next, addr != end);
-}
-
-static void stage2_flush_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
-                             phys_addr_t addr, phys_addr_t end)
-{
-       struct kvm *kvm = mmu->kvm;
-       p4d_t *p4d;
-       phys_addr_t next;
-
-       p4d = stage2_p4d_offset(kvm, pgd, addr);
-       do {
-               next = stage2_p4d_addr_end(kvm, addr, end);
-               if (!stage2_p4d_none(kvm, *p4d))
-                       stage2_flush_puds(mmu, p4d, addr, next);
-       } while (p4d++, addr = next, addr != end);
-}
-
 static void stage2_flush_memslot(struct kvm *kvm,
                                 struct kvm_memory_slot *memslot)
 {
-       struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
        phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
        phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
-       phys_addr_t next;
-       pgd_t *pgd;
-
-       pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
-       do {
-               next = stage2_pgd_addr_end(kvm, addr, end);
-               if (!stage2_pgd_none(kvm, *pgd))
-                       stage2_flush_p4ds(mmu, pgd, addr, next);
 
-               if (next != end)
-                       cond_resched_lock(&kvm->mmu_lock);
-       } while (pgd++, addr = next, addr != end);
+       stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_flush);
 }
 
 /**
@@ -489,338 +173,28 @@ static void stage2_flush_vm(struct kvm *kvm)
        srcu_read_unlock(&kvm->srcu, idx);
 }
 
-static void clear_hyp_pgd_entry(pgd_t *pgd)
-{
-       p4d_t *p4d_table __maybe_unused = p4d_offset(pgd, 0UL);
-       pgd_clear(pgd);
-       p4d_free(NULL, p4d_table);
-       put_page(virt_to_page(pgd));
-}
-
-static void clear_hyp_p4d_entry(p4d_t *p4d)
-{
-       pud_t *pud_table __maybe_unused = pud_offset(p4d, 0UL);
-       VM_BUG_ON(p4d_huge(*p4d));
-       p4d_clear(p4d);
-       pud_free(NULL, pud_table);
-       put_page(virt_to_page(p4d));
-}
-
-static void clear_hyp_pud_entry(pud_t *pud)
-{
-       pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0);
-       VM_BUG_ON(pud_huge(*pud));
-       pud_clear(pud);
-       pmd_free(NULL, pmd_table);
-       put_page(virt_to_page(pud));
-}
-
-static void clear_hyp_pmd_entry(pmd_t *pmd)
-{
-       pte_t *pte_table = pte_offset_kernel(pmd, 0);
-       VM_BUG_ON(pmd_thp_or_huge(*pmd));
-       pmd_clear(pmd);
-       pte_free_kernel(NULL, pte_table);
-       put_page(virt_to_page(pmd));
-}
-
-static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
-{
-       pte_t *pte, *start_pte;
-
-       start_pte = pte = pte_offset_kernel(pmd, addr);
-       do {
-               if (!pte_none(*pte)) {
-                       kvm_set_pte(pte, __pte(0));
-                       put_page(virt_to_page(pte));
-               }
-       } while (pte++, addr += PAGE_SIZE, addr != end);
-
-       if (hyp_pte_table_empty(start_pte))
-               clear_hyp_pmd_entry(pmd);
-}
-
-static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
-{
-       phys_addr_t next;
-       pmd_t *pmd, *start_pmd;
-
-       start_pmd = pmd = pmd_offset(pud, addr);
-       do {
-               next = pmd_addr_end(addr, end);
-               /* Hyp doesn't use huge pmds */
-               if (!pmd_none(*pmd))
-                       unmap_hyp_ptes(pmd, addr, next);
-       } while (pmd++, addr = next, addr != end);
-
-       if (hyp_pmd_table_empty(start_pmd))
-               clear_hyp_pud_entry(pud);
-}
-
-static void unmap_hyp_puds(p4d_t *p4d, phys_addr_t addr, phys_addr_t end)
-{
-       phys_addr_t next;
-       pud_t *pud, *start_pud;
-
-       start_pud = pud = pud_offset(p4d, addr);
-       do {
-               next = pud_addr_end(addr, end);
-               /* Hyp doesn't use huge puds */
-               if (!pud_none(*pud))
-                       unmap_hyp_pmds(pud, addr, next);
-       } while (pud++, addr = next, addr != end);
-
-       if (hyp_pud_table_empty(start_pud))
-               clear_hyp_p4d_entry(p4d);
-}
-
-static void unmap_hyp_p4ds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
-{
-       phys_addr_t next;
-       p4d_t *p4d, *start_p4d;
-
-       start_p4d = p4d = p4d_offset(pgd, addr);
-       do {
-               next = p4d_addr_end(addr, end);
-               /* Hyp doesn't use huge p4ds */
-               if (!p4d_none(*p4d))
-                       unmap_hyp_puds(p4d, addr, next);
-       } while (p4d++, addr = next, addr != end);
-
-       if (hyp_p4d_table_empty(start_p4d))
-               clear_hyp_pgd_entry(pgd);
-}
-
-static unsigned int kvm_pgd_index(unsigned long addr, unsigned int ptrs_per_pgd)
-{
-       return (addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1);
-}
-
-static void __unmap_hyp_range(pgd_t *pgdp, unsigned long ptrs_per_pgd,
-                             phys_addr_t start, u64 size)
-{
-       pgd_t *pgd;
-       phys_addr_t addr = start, end = start + size;
-       phys_addr_t next;
-
-       /*
-        * We don't unmap anything from HYP, except at the hyp tear down.
-        * Hence, we don't have to invalidate the TLBs here.
-        */
-       pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
-       do {
-               next = pgd_addr_end(addr, end);
-               if (!pgd_none(*pgd))
-                       unmap_hyp_p4ds(pgd, addr, next);
-       } while (pgd++, addr = next, addr != end);
-}
-
-static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
-{
-       __unmap_hyp_range(pgdp, PTRS_PER_PGD, start, size);
-}
-
-static void unmap_hyp_idmap_range(pgd_t *pgdp, phys_addr_t start, u64 size)
-{
-       __unmap_hyp_range(pgdp, __kvm_idmap_ptrs_per_pgd(), start, size);
-}
-
 /**
  * free_hyp_pgds - free Hyp-mode page tables
- *
- * Assumes hyp_pgd is a page table used strictly in Hyp-mode and
- * therefore contains either mappings in the kernel memory area (above
- * PAGE_OFFSET), or device mappings in the idmap range.
- *
- * boot_hyp_pgd should only map the idmap range, and is only used in
- * the extended idmap case.
  */
 void free_hyp_pgds(void)
 {
-       pgd_t *id_pgd;
-
        mutex_lock(&kvm_hyp_pgd_mutex);
-
-       id_pgd = boot_hyp_pgd ? boot_hyp_pgd : hyp_pgd;
-
-       if (id_pgd) {
-               /* In case we never called hyp_mmu_init() */
-               if (!io_map_base)
-                       io_map_base = hyp_idmap_start;
-               unmap_hyp_idmap_range(id_pgd, io_map_base,
-                                     hyp_idmap_start + PAGE_SIZE - io_map_base);
-       }
-
-       if (boot_hyp_pgd) {
-               free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
-               boot_hyp_pgd = NULL;
-       }
-
-       if (hyp_pgd) {
-               unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET),
-                               (uintptr_t)high_memory - PAGE_OFFSET);
-
-               free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
-               hyp_pgd = NULL;
-       }
-       if (merged_hyp_pgd) {
-               clear_page(merged_hyp_pgd);
-               free_page((unsigned long)merged_hyp_pgd);
-               merged_hyp_pgd = NULL;
+       if (hyp_pgtable) {
+               kvm_pgtable_hyp_destroy(hyp_pgtable);
+               kfree(hyp_pgtable);
        }
-
        mutex_unlock(&kvm_hyp_pgd_mutex);
 }
 
-static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
-                                   unsigned long end, unsigned long pfn,
-                                   pgprot_t prot)
-{
-       pte_t *pte;
-       unsigned long addr;
-
-       addr = start;
-       do {
-               pte = pte_offset_kernel(pmd, addr);
-               kvm_set_pte(pte, kvm_pfn_pte(pfn, prot));
-               get_page(virt_to_page(pte));
-               pfn++;
-       } while (addr += PAGE_SIZE, addr != end);
-}
-
-static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start,
-                                  unsigned long end, unsigned long pfn,
-                                  pgprot_t prot)
+static int __create_hyp_mappings(unsigned long start, unsigned long size,
+                                unsigned long phys, enum kvm_pgtable_prot prot)
 {
-       pmd_t *pmd;
-       pte_t *pte;
-       unsigned long addr, next;
-
-       addr = start;
-       do {
-               pmd = pmd_offset(pud, addr);
-
-               BUG_ON(pmd_sect(*pmd));
-
-               if (pmd_none(*pmd)) {
-                       pte = pte_alloc_one_kernel(NULL);
-                       if (!pte) {
-                               kvm_err("Cannot allocate Hyp pte\n");
-                               return -ENOMEM;
-                       }
-                       kvm_pmd_populate(pmd, pte);
-                       get_page(virt_to_page(pmd));
-               }
-
-               next = pmd_addr_end(addr, end);
-
-               create_hyp_pte_mappings(pmd, addr, next, pfn, prot);
-               pfn += (next - addr) >> PAGE_SHIFT;
-       } while (addr = next, addr != end);
-
-       return 0;
-}
-
-static int create_hyp_pud_mappings(p4d_t *p4d, unsigned long start,
-                                  unsigned long end, unsigned long pfn,
-                                  pgprot_t prot)
-{
-       pud_t *pud;
-       pmd_t *pmd;
-       unsigned long addr, next;
-       int ret;
-
-       addr = start;
-       do {
-               pud = pud_offset(p4d, addr);
-
-               if (pud_none_or_clear_bad(pud)) {
-                       pmd = pmd_alloc_one(NULL, addr);
-                       if (!pmd) {
-                               kvm_err("Cannot allocate Hyp pmd\n");
-                               return -ENOMEM;
-                       }
-                       kvm_pud_populate(pud, pmd);
-                       get_page(virt_to_page(pud));
-               }
-
-               next = pud_addr_end(addr, end);
-               ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot);
-               if (ret)
-                       return ret;
-               pfn += (next - addr) >> PAGE_SHIFT;
-       } while (addr = next, addr != end);
-
-       return 0;
-}
-
-static int create_hyp_p4d_mappings(pgd_t *pgd, unsigned long start,
-                                  unsigned long end, unsigned long pfn,
-                                  pgprot_t prot)
-{
-       p4d_t *p4d;
-       pud_t *pud;
-       unsigned long addr, next;
-       int ret;
-
-       addr = start;
-       do {
-               p4d = p4d_offset(pgd, addr);
-
-               if (p4d_none(*p4d)) {
-                       pud = pud_alloc_one(NULL, addr);
-                       if (!pud) {
-                               kvm_err("Cannot allocate Hyp pud\n");
-                               return -ENOMEM;
-                       }
-                       kvm_p4d_populate(p4d, pud);
-                       get_page(virt_to_page(p4d));
-               }
-
-               next = p4d_addr_end(addr, end);
-               ret = create_hyp_pud_mappings(p4d, addr, next, pfn, prot);
-               if (ret)
-                       return ret;
-               pfn += (next - addr) >> PAGE_SHIFT;
-       } while (addr = next, addr != end);
-
-       return 0;
-}
-
-static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd,
-                                unsigned long start, unsigned long end,
-                                unsigned long pfn, pgprot_t prot)
-{
-       pgd_t *pgd;
-       p4d_t *p4d;
-       unsigned long addr, next;
-       int err = 0;
+       int err;
 
        mutex_lock(&kvm_hyp_pgd_mutex);
-       addr = start & PAGE_MASK;
-       end = PAGE_ALIGN(end);
-       do {
-               pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
-
-               if (pgd_none(*pgd)) {
-                       p4d = p4d_alloc_one(NULL, addr);
-                       if (!p4d) {
-                               kvm_err("Cannot allocate Hyp p4d\n");
-                               err = -ENOMEM;
-                               goto out;
-                       }
-                       kvm_pgd_populate(pgd, p4d);
-                       get_page(virt_to_page(pgd));
-               }
-
-               next = pgd_addr_end(addr, end);
-               err = create_hyp_p4d_mappings(pgd, addr, next, pfn, prot);
-               if (err)
-                       goto out;
-               pfn += (next - addr) >> PAGE_SHIFT;
-       } while (addr = next, addr != end);
-out:
+       err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot);
        mutex_unlock(&kvm_hyp_pgd_mutex);
+
        return err;
 }
 
@@ -845,7 +219,7 @@ static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
  * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
  * physical pages.
  */
-int create_hyp_mappings(void *from, void *to, pgprot_t prot)
+int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
 {
        phys_addr_t phys_addr;
        unsigned long virt_addr;
@@ -862,9 +236,7 @@ int create_hyp_mappings(void *from, void *to, pgprot_t prot)
                int err;
 
                phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
-               err = __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD,
-                                           virt_addr, virt_addr + PAGE_SIZE,
-                                           __phys_to_pfn(phys_addr),
+               err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr,
                                            prot);
                if (err)
                        return err;
@@ -874,9 +246,9 @@ int create_hyp_mappings(void *from, void *to, pgprot_t prot)
 }
 
 static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
-                                       unsigned long *haddr, pgprot_t prot)
+                                       unsigned long *haddr,
+                                       enum kvm_pgtable_prot prot)
 {
-       pgd_t *pgd = hyp_pgd;
        unsigned long base;
        int ret = 0;
 
@@ -908,17 +280,11 @@ static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
        if (ret)
                goto out;
 
-       if (__kvm_cpu_uses_extended_idmap())
-               pgd = boot_hyp_pgd;
-
-       ret = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
-                                   base, base + size,
-                                   __phys_to_pfn(phys_addr), prot);
+       ret = __create_hyp_mappings(base, size, phys_addr, prot);
        if (ret)
                goto out;
 
        *haddr = base + offset_in_page(phys_addr);
-
 out:
        return ret;
 }
@@ -973,492 +339,151 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
 
        BUG_ON(is_kernel_in_hyp_mode());
 
-       ret = __create_hyp_private_mapping(phys_addr, size,
-                                          &addr, PAGE_HYP_EXEC);
-       if (ret) {
-               *haddr = NULL;
-               return ret;
-       }
-
-       *haddr = (void *)addr;
-       return 0;
-}
-
-/**
- * kvm_init_stage2_mmu - Initialise a S2 MMU strucrure
- * @kvm:       The pointer to the KVM structure
- * @mmu:       The pointer to the s2 MMU structure
- *
- * Allocates only the stage-2 HW PGD level table(s) of size defined by
- * stage2_pgd_size(mmu->kvm).
- *
- * Note we don't need locking here as this is only called when the VM is
- * created, which can only be done once.
- */
-int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
-{
-       phys_addr_t pgd_phys;
-       pgd_t *pgd;
-       int cpu;
-
-       if (mmu->pgd != NULL) {
-               kvm_err("kvm_arch already initialized?\n");
-               return -EINVAL;
-       }
-
-       /* Allocate the HW PGD, making sure that each page gets its own refcount */
-       pgd = alloc_pages_exact(stage2_pgd_size(kvm), GFP_KERNEL | __GFP_ZERO);
-       if (!pgd)
-               return -ENOMEM;
-
-       pgd_phys = virt_to_phys(pgd);
-       if (WARN_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm)))
-               return -EINVAL;
-
-       mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
-       if (!mmu->last_vcpu_ran) {
-               free_pages_exact(pgd, stage2_pgd_size(kvm));
-               return -ENOMEM;
-       }
-
-       for_each_possible_cpu(cpu)
-               *per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;
-
-       mmu->kvm = kvm;
-       mmu->pgd = pgd;
-       mmu->pgd_phys = pgd_phys;
-       mmu->vmid.vmid_gen = 0;
-
-       return 0;
-}
-
-static void stage2_unmap_memslot(struct kvm *kvm,
-                                struct kvm_memory_slot *memslot)
-{
-       hva_t hva = memslot->userspace_addr;
-       phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
-       phys_addr_t size = PAGE_SIZE * memslot->npages;
-       hva_t reg_end = hva + size;
-
-       /*
-        * A memory region could potentially cover multiple VMAs, and any holes
-        * between them, so iterate over all of them to find out if we should
-        * unmap any of them.
-        *
-        *     +--------------------------------------------+
-        * +---------------+----------------+   +----------------+
-        * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
-        * +---------------+----------------+   +----------------+
-        *     |               memory region                |
-        *     +--------------------------------------------+
-        */
-       do {
-               struct vm_area_struct *vma = find_vma(current->mm, hva);
-               hva_t vm_start, vm_end;
-
-               if (!vma || vma->vm_start >= reg_end)
-                       break;
-
-               /*
-                * Take the intersection of this VMA with the memory region
-                */
-               vm_start = max(hva, vma->vm_start);
-               vm_end = min(reg_end, vma->vm_end);
-
-               if (!(vma->vm_flags & VM_PFNMAP)) {
-                       gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
-                       unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start);
-               }
-               hva = vm_end;
-       } while (hva < reg_end);
-}
-
-/**
- * stage2_unmap_vm - Unmap Stage-2 RAM mappings
- * @kvm: The struct kvm pointer
- *
- * Go through the memregions and unmap any regular RAM
- * backing memory already mapped to the VM.
- */
-void stage2_unmap_vm(struct kvm *kvm)
-{
-       struct kvm_memslots *slots;
-       struct kvm_memory_slot *memslot;
-       int idx;
-
-       idx = srcu_read_lock(&kvm->srcu);
-       mmap_read_lock(current->mm);
-       spin_lock(&kvm->mmu_lock);
-
-       slots = kvm_memslots(kvm);
-       kvm_for_each_memslot(memslot, slots)
-               stage2_unmap_memslot(kvm, memslot);
-
-       spin_unlock(&kvm->mmu_lock);
-       mmap_read_unlock(current->mm);
-       srcu_read_unlock(&kvm->srcu, idx);
-}
-
-void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
-{
-       struct kvm *kvm = mmu->kvm;
-       void *pgd = NULL;
-
-       spin_lock(&kvm->mmu_lock);
-       if (mmu->pgd) {
-               unmap_stage2_range(mmu, 0, kvm_phys_size(kvm));
-               pgd = READ_ONCE(mmu->pgd);
-               mmu->pgd = NULL;
-       }
-       spin_unlock(&kvm->mmu_lock);
-
-       /* Free the HW pgd, one page at a time */
-       if (pgd) {
-               free_pages_exact(pgd, stage2_pgd_size(kvm));
-               free_percpu(mmu->last_vcpu_ran);
-       }
-}
-
-static p4d_t *stage2_get_p4d(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
-                            phys_addr_t addr)
-{
-       struct kvm *kvm = mmu->kvm;
-       pgd_t *pgd;
-       p4d_t *p4d;
-
-       pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
-       if (stage2_pgd_none(kvm, *pgd)) {
-               if (!cache)
-                       return NULL;
-               p4d = kvm_mmu_memory_cache_alloc(cache);
-               stage2_pgd_populate(kvm, pgd, p4d);
-               get_page(virt_to_page(pgd));
-       }
-
-       return stage2_p4d_offset(kvm, pgd, addr);
-}
-
-static pud_t *stage2_get_pud(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
-                            phys_addr_t addr)
-{
-       struct kvm *kvm = mmu->kvm;
-       p4d_t *p4d;
-       pud_t *pud;
-
-       p4d = stage2_get_p4d(mmu, cache, addr);
-       if (stage2_p4d_none(kvm, *p4d)) {
-               if (!cache)
-                       return NULL;
-               pud = kvm_mmu_memory_cache_alloc(cache);
-               stage2_p4d_populate(kvm, p4d, pud);
-               get_page(virt_to_page(p4d));
-       }
-
-       return stage2_pud_offset(kvm, p4d, addr);
-}
-
-static pmd_t *stage2_get_pmd(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
-                            phys_addr_t addr)
-{
-       struct kvm *kvm = mmu->kvm;
-       pud_t *pud;
-       pmd_t *pmd;
-
-       pud = stage2_get_pud(mmu, cache, addr);
-       if (!pud || stage2_pud_huge(kvm, *pud))
-               return NULL;
-
-       if (stage2_pud_none(kvm, *pud)) {
-               if (!cache)
-                       return NULL;
-               pmd = kvm_mmu_memory_cache_alloc(cache);
-               stage2_pud_populate(kvm, pud, pmd);
-               get_page(virt_to_page(pud));
-       }
-
-       return stage2_pmd_offset(kvm, pud, addr);
-}
-
-static int stage2_set_pmd_huge(struct kvm_s2_mmu *mmu,
-                              struct kvm_mmu_memory_cache *cache,
-                              phys_addr_t addr, const pmd_t *new_pmd)
-{
-       pmd_t *pmd, old_pmd;
-
-retry:
-       pmd = stage2_get_pmd(mmu, cache, addr);
-       VM_BUG_ON(!pmd);
-
-       old_pmd = *pmd;
-       /*
-        * Multiple vcpus faulting on the same PMD entry, can
-        * lead to them sequentially updating the PMD with the
-        * same value. Following the break-before-make
-        * (pmd_clear() followed by tlb_flush()) process can
-        * hinder forward progress due to refaults generated
-        * on missing translations.
-        *
-        * Skip updating the page table if the entry is
-        * unchanged.
-        */
-       if (pmd_val(old_pmd) == pmd_val(*new_pmd))
-               return 0;
-
-       if (pmd_present(old_pmd)) {
-               /*
-                * If we already have PTE level mapping for this block,
-                * we must unmap it to avoid inconsistent TLB state and
-                * leaking the table page. We could end up in this situation
-                * if the memory slot was marked for dirty logging and was
-                * reverted, leaving PTE level mappings for the pages accessed
-                * during the period. So, unmap the PTE level mapping for this
-                * block and retry, as we could have released the upper level
-                * table in the process.
-                *
-                * Normal THP split/merge follows mmu_notifier callbacks and do
-                * get handled accordingly.
-                */
-               if (!pmd_thp_or_huge(old_pmd)) {
-                       unmap_stage2_range(mmu, addr & S2_PMD_MASK, S2_PMD_SIZE);
-                       goto retry;
-               }
-               /*
-                * Mapping in huge pages should only happen through a
-                * fault.  If a page is merged into a transparent huge
-                * page, the individual subpages of that huge page
-                * should be unmapped through MMU notifiers before we
-                * get here.
-                *
-                * Merging of CompoundPages is not supported; they
-                * should become splitting first, unmapped, merged,
-                * and mapped back in on-demand.
-                */
-               WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
-               pmd_clear(pmd);
-               kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
-       } else {
-               get_page(virt_to_page(pmd));
-       }
-
-       kvm_set_pmd(pmd, *new_pmd);
-       return 0;
-}
-
-static int stage2_set_pud_huge(struct kvm_s2_mmu *mmu,
-                              struct kvm_mmu_memory_cache *cache,
-                              phys_addr_t addr, const pud_t *new_pudp)
-{
-       struct kvm *kvm = mmu->kvm;
-       pud_t *pudp, old_pud;
-
-retry:
-       pudp = stage2_get_pud(mmu, cache, addr);
-       VM_BUG_ON(!pudp);
-
-       old_pud = *pudp;
-
-       /*
-        * A large number of vcpus faulting on the same stage 2 entry,
-        * can lead to a refault due to the stage2_pud_clear()/tlb_flush().
-        * Skip updating the page tables if there is no change.
-        */
-       if (pud_val(old_pud) == pud_val(*new_pudp))
-               return 0;
-
-       if (stage2_pud_present(kvm, old_pud)) {
-               /*
-                * If we already have table level mapping for this block, unmap
-                * the range for this block and retry.
-                */
-               if (!stage2_pud_huge(kvm, old_pud)) {
-                       unmap_stage2_range(mmu, addr & S2_PUD_MASK, S2_PUD_SIZE);
-                       goto retry;
-               }
-
-               WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp));
-               stage2_pud_clear(kvm, pudp);
-               kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
-       } else {
-               get_page(virt_to_page(pudp));
+       ret = __create_hyp_private_mapping(phys_addr, size,
+                                          &addr, PAGE_HYP_EXEC);
+       if (ret) {
+               *haddr = NULL;
+               return ret;
        }
 
-       kvm_set_pud(pudp, *new_pudp);
+       *haddr = (void *)addr;
        return 0;
 }
 
-/*
- * stage2_get_leaf_entry - walk the stage2 VM page tables and return
- * true if a valid and present leaf-entry is found. A pointer to the
- * leaf-entry is returned in the appropriate level variable - pudpp,
- * pmdpp, ptepp.
+/**
+ * kvm_init_stage2_mmu - Initialise an S2 MMU structure
+ * @kvm:       The pointer to the KVM structure
+ * @mmu:       The pointer to the s2 MMU structure
+ *
+ * Allocates only the stage-2 HW PGD level table(s).
+ * Note we don't need locking here as this is only called when the VM is
+ * created, which can only be done once.
  */
-static bool stage2_get_leaf_entry(struct kvm_s2_mmu *mmu, phys_addr_t addr,
-                                 pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp)
+int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
 {
-       struct kvm *kvm = mmu->kvm;
-       pud_t *pudp;
-       pmd_t *pmdp;
-       pte_t *ptep;
-
-       *pudpp = NULL;
-       *pmdpp = NULL;
-       *ptepp = NULL;
-
-       pudp = stage2_get_pud(mmu, NULL, addr);
-       if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp))
-               return false;
+       int cpu, err;
+       struct kvm_pgtable *pgt;
 
-       if (stage2_pud_huge(kvm, *pudp)) {
-               *pudpp = pudp;
-               return true;
+       if (mmu->pgt != NULL) {
+               kvm_err("kvm_arch already initialized?\n");
+               return -EINVAL;
        }
 
-       pmdp = stage2_pmd_offset(kvm, pudp, addr);
-       if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp))
-               return false;
-
-       if (pmd_thp_or_huge(*pmdp)) {
-               *pmdpp = pmdp;
-               return true;
-       }
+       pgt = kzalloc(sizeof(*pgt), GFP_KERNEL);
+       if (!pgt)
+               return -ENOMEM;
 
-       ptep = pte_offset_kernel(pmdp, addr);
-       if (!ptep || pte_none(*ptep) || !pte_present(*ptep))
-               return false;
+       err = kvm_pgtable_stage2_init(pgt, kvm);
+       if (err)
+               goto out_free_pgtable;
 
-       *ptepp = ptep;
-       return true;
-}
+       mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
+       if (!mmu->last_vcpu_ran) {
+               err = -ENOMEM;
+               goto out_destroy_pgtable;
+       }
 
-static bool stage2_is_exec(struct kvm_s2_mmu *mmu, phys_addr_t addr, unsigned long sz)
-{
-       pud_t *pudp;
-       pmd_t *pmdp;
-       pte_t *ptep;
-       bool found;
+       for_each_possible_cpu(cpu)
+               *per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;
 
-       found = stage2_get_leaf_entry(mmu, addr, &pudp, &pmdp, &ptep);
-       if (!found)
-               return false;
+       mmu->kvm = kvm;
+       mmu->pgt = pgt;
+       mmu->pgd_phys = __pa(pgt->pgd);
+       mmu->vmid.vmid_gen = 0;
+       return 0;
 
-       if (pudp)
-               return sz <= PUD_SIZE && kvm_s2pud_exec(pudp);
-       else if (pmdp)
-               return sz <= PMD_SIZE && kvm_s2pmd_exec(pmdp);
-       else
-               return sz == PAGE_SIZE && kvm_s2pte_exec(ptep);
+out_destroy_pgtable:
+       kvm_pgtable_stage2_destroy(pgt);
+out_free_pgtable:
+       kfree(pgt);
+       return err;
 }
 
-static int stage2_set_pte(struct kvm_s2_mmu *mmu,
-                         struct kvm_mmu_memory_cache *cache,
-                         phys_addr_t addr, const pte_t *new_pte,
-                         unsigned long flags)
+static void stage2_unmap_memslot(struct kvm *kvm,
+                                struct kvm_memory_slot *memslot)
 {
-       struct kvm *kvm = mmu->kvm;
-       pud_t *pud;
-       pmd_t *pmd;
-       pte_t *pte, old_pte;
-       bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
-       bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;
-
-       VM_BUG_ON(logging_active && !cache);
-
-       /* Create stage-2 page table mapping - Levels 0 and 1 */
-       pud = stage2_get_pud(mmu, cache, addr);
-       if (!pud) {
-               /*
-                * Ignore calls from kvm_set_spte_hva for unallocated
-                * address ranges.
-                */
-               return 0;
-       }
+       hva_t hva = memslot->userspace_addr;
+       phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
+       phys_addr_t size = PAGE_SIZE * memslot->npages;
+       hva_t reg_end = hva + size;
 
        /*
-        * While dirty page logging - dissolve huge PUD, then continue
-        * on to allocate page.
+        * A memory region could potentially cover multiple VMAs, and any holes
+        * between them, so iterate over all of them to find out if we should
+        * unmap any of them.
+        *
+        *     +--------------------------------------------+
+        * +---------------+----------------+   +----------------+
+        * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
+        * +---------------+----------------+   +----------------+
+        *     |               memory region                |
+        *     +--------------------------------------------+
         */
-       if (logging_active)
-               stage2_dissolve_pud(mmu, addr, pud);
-
-       if (stage2_pud_none(kvm, *pud)) {
-               if (!cache)
-                       return 0; /* ignore calls from kvm_set_spte_hva */
-               pmd = kvm_mmu_memory_cache_alloc(cache);
-               stage2_pud_populate(kvm, pud, pmd);
-               get_page(virt_to_page(pud));
-       }
+       do {
+               struct vm_area_struct *vma = find_vma(current->mm, hva);
+               hva_t vm_start, vm_end;
+
+               if (!vma || vma->vm_start >= reg_end)
+                       break;
 
-       pmd = stage2_pmd_offset(kvm, pud, addr);
-       if (!pmd) {
                /*
-                * Ignore calls from kvm_set_spte_hva for unallocated
-                * address ranges.
+                * Take the intersection of this VMA with the memory region
                 */
-               return 0;
-       }
-
-       /*
-        * While dirty page logging - dissolve huge PMD, then continue on to
-        * allocate page.
-        */
-       if (logging_active)
-               stage2_dissolve_pmd(mmu, addr, pmd);
-
-       /* Create stage-2 page mappings - Level 2 */
-       if (pmd_none(*pmd)) {
-               if (!cache)
-                       return 0; /* ignore calls from kvm_set_spte_hva */
-               pte = kvm_mmu_memory_cache_alloc(cache);
-               kvm_pmd_populate(pmd, pte);
-               get_page(virt_to_page(pmd));
-       }
+               vm_start = max(hva, vma->vm_start);
+               vm_end = min(reg_end, vma->vm_end);
 
-       pte = pte_offset_kernel(pmd, addr);
+               if (!(vma->vm_flags & VM_PFNMAP)) {
+                       gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
+                       unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start);
+               }
+               hva = vm_end;
+       } while (hva < reg_end);
+}
 
-       if (iomap && pte_present(*pte))
-               return -EFAULT;
+/**
+ * stage2_unmap_vm - Unmap Stage-2 RAM mappings
+ * @kvm: The struct kvm pointer
+ *
+ * Go through the memregions and unmap any regular RAM
+ * backing memory already mapped to the VM.
+ */
+void stage2_unmap_vm(struct kvm *kvm)
+{
+       struct kvm_memslots *slots;
+       struct kvm_memory_slot *memslot;
+       int idx;
 
-       /* Create 2nd stage page table mapping - Level 3 */
-       old_pte = *pte;
-       if (pte_present(old_pte)) {
-               /* Skip page table update if there is no change */
-               if (pte_val(old_pte) == pte_val(*new_pte))
-                       return 0;
+       idx = srcu_read_lock(&kvm->srcu);
+       mmap_read_lock(current->mm);
+       spin_lock(&kvm->mmu_lock);
 
-               kvm_set_pte(pte, __pte(0));
-               kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PTE_LEVEL);
-       } else {
-               get_page(virt_to_page(pte));
-       }
+       slots = kvm_memslots(kvm);
+       kvm_for_each_memslot(memslot, slots)
+               stage2_unmap_memslot(kvm, memslot);
 
-       kvm_set_pte(pte, *new_pte);
-       return 0;
+       spin_unlock(&kvm->mmu_lock);
+       mmap_read_unlock(current->mm);
+       srcu_read_unlock(&kvm->srcu, idx);
 }
 
-#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-static int stage2_ptep_test_and_clear_young(pte_t *pte)
-{
-       if (pte_young(*pte)) {
-               *pte = pte_mkold(*pte);
-               return 1;
-       }
-       return 0;
-}
-#else
-static int stage2_ptep_test_and_clear_young(pte_t *pte)
+void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
 {
-       return __ptep_test_and_clear_young(pte);
-}
-#endif
+       struct kvm *kvm = mmu->kvm;
+       struct kvm_pgtable *pgt = NULL;
 
-static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
-{
-       return stage2_ptep_test_and_clear_young((pte_t *)pmd);
-}
+       spin_lock(&kvm->mmu_lock);
+       pgt = mmu->pgt;
+       if (pgt) {
+               mmu->pgd_phys = 0;
+               mmu->pgt = NULL;
+               free_percpu(mmu->last_vcpu_ran);
+       }
+       spin_unlock(&kvm->mmu_lock);
 
-static int stage2_pudp_test_and_clear_young(pud_t *pud)
-{
-       return stage2_ptep_test_and_clear_young((pte_t *)pud);
+       if (pgt) {
+               kvm_pgtable_stage2_destroy(pgt);
+               kfree(pgt);
+       }
 }
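
kvm_free_stage2_pgd() now detaches the page-table pointer under the mmu_lock and runs the expensive kvm_pgtable_stage2_destroy() only after dropping it, so concurrent walkers that test mmu->pgt under the lock never race with the teardown. A minimal userspace model of that detach-then-free pattern, with invented names and a plain pthread mutex standing in for the spinlock:

    /* Minimal model of "detach under the lock, destroy outside it". */
    #include <pthread.h>
    #include <stdlib.h>

    struct table { int *entries; };

    struct mmu {
        pthread_mutex_t lock;
        struct table *pgt;              /* NULL once torn down */
    };

    static void mmu_free(struct mmu *mmu)
    {
        struct table *pgt;

        pthread_mutex_lock(&mmu->lock);
        pgt = mmu->pgt;
        mmu->pgt = NULL;                /* walkers re-check this under the lock */
        pthread_mutex_unlock(&mmu->lock);

        if (pgt) {                      /* the expensive teardown runs unlocked */
            free(pgt->entries);
            free(pgt);
        }
    }

    int main(void)
    {
        struct mmu mmu = { PTHREAD_MUTEX_INITIALIZER, NULL };

        mmu_free(&mmu);                 /* tearing down twice is a harmless no-op */
        return 0;
    }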
 
 /**
@@ -1472,135 +497,37 @@ static int stage2_pudp_test_and_clear_young(pud_t *pud)
 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
                          phys_addr_t pa, unsigned long size, bool writable)
 {
-       phys_addr_t addr, end;
+       phys_addr_t addr;
        int ret = 0;
-       unsigned long pfn;
        struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, };
+       struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
+       enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
+                                    KVM_PGTABLE_PROT_R |
+                                    (writable ? KVM_PGTABLE_PROT_W : 0);
 
-       end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK;
-       pfn = __phys_to_pfn(pa);
-
-       for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
-               pte_t pte = kvm_pfn_pte(pfn, PAGE_S2_DEVICE);
-
-               if (writable)
-                       pte = kvm_s2pte_mkwrite(pte);
+       size += offset_in_page(guest_ipa);
+       guest_ipa &= PAGE_MASK;
 
+       for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
                ret = kvm_mmu_topup_memory_cache(&cache,
                                                 kvm_mmu_cache_min_pages(kvm));
                if (ret)
-                       goto out;
+                       break;
+
                spin_lock(&kvm->mmu_lock);
-               ret = stage2_set_pte(&kvm->arch.mmu, &cache, addr, &pte,
-                                    KVM_S2PTE_FLAG_IS_IOMAP);
+               ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
+                                            &cache);
                spin_unlock(&kvm->mmu_lock);
                if (ret)
-                       goto out;
+                       break;
 
-               pfn++;
+               pa += PAGE_SIZE;
        }
 
-out:
        kvm_mmu_free_memory_cache(&cache);
        return ret;
 }
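
The reworked ioremap path folds any sub-page offset of guest_ipa into size, aligns guest_ipa down, and then maps one page per iteration, so the final partial page of the original range is still covered. A quick standalone check of that alignment arithmetic (the addresses are arbitrary and a 4K page size is assumed here):

    /* Check the guest_ipa/size page-alignment step used above. */
    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define EX_PAGE_SIZE 4096ULL
    #define EX_PAGE_MASK (~(EX_PAGE_SIZE - 1))
    #define ex_offset_in_page(x) ((x) & (EX_PAGE_SIZE - 1))

    int main(void)
    {
        uint64_t guest_ipa = 0x8000a40;         /* arbitrary, not page aligned */
        uint64_t size = 0x1800;                 /* arbitrary length */
        uint64_t end_orig = guest_ipa + size;

        size += ex_offset_in_page(guest_ipa);
        guest_ipa &= EX_PAGE_MASK;

        /* every page touched by the original range is now covered */
        assert(guest_ipa + size >= end_orig);
        for (uint64_t addr = guest_ipa; addr < guest_ipa + size; addr += EX_PAGE_SIZE)
            printf("map page at %#llx\n", (unsigned long long)addr);
        return 0;
    }

Running it maps three pages for a range that straddles three pages, which is the behaviour the old rounded-up "end" computation provided.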
 
-/**
- * stage2_wp_ptes - write protect PMD range
- * @pmd:       pointer to pmd entry
- * @addr:      range start address
- * @end:       range end address
- */
-static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
-{
-       pte_t *pte;
-
-       pte = pte_offset_kernel(pmd, addr);
-       do {
-               if (!pte_none(*pte)) {
-                       if (!kvm_s2pte_readonly(pte))
-                               kvm_set_s2pte_readonly(pte);
-               }
-       } while (pte++, addr += PAGE_SIZE, addr != end);
-}
-
-/**
- * stage2_wp_pmds - write protect PUD range
- * kvm:                kvm instance for the VM
- * @pud:       pointer to pud entry
- * @addr:      range start address
- * @end:       range end address
- */
-static void stage2_wp_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
-                          phys_addr_t addr, phys_addr_t end)
-{
-       struct kvm *kvm = mmu->kvm;
-       pmd_t *pmd;
-       phys_addr_t next;
-
-       pmd = stage2_pmd_offset(kvm, pud, addr);
-
-       do {
-               next = stage2_pmd_addr_end(kvm, addr, end);
-               if (!pmd_none(*pmd)) {
-                       if (pmd_thp_or_huge(*pmd)) {
-                               if (!kvm_s2pmd_readonly(pmd))
-                                       kvm_set_s2pmd_readonly(pmd);
-                       } else {
-                               stage2_wp_ptes(pmd, addr, next);
-                       }
-               }
-       } while (pmd++, addr = next, addr != end);
-}
-
-/**
- * stage2_wp_puds - write protect P4D range
- * @p4d:       pointer to p4d entry
- * @addr:      range start address
- * @end:       range end address
- */
-static void  stage2_wp_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
-                           phys_addr_t addr, phys_addr_t end)
-{
-       struct kvm *kvm = mmu->kvm;
-       pud_t *pud;
-       phys_addr_t next;
-
-       pud = stage2_pud_offset(kvm, p4d, addr);
-       do {
-               next = stage2_pud_addr_end(kvm, addr, end);
-               if (!stage2_pud_none(kvm, *pud)) {
-                       if (stage2_pud_huge(kvm, *pud)) {
-                               if (!kvm_s2pud_readonly(pud))
-                                       kvm_set_s2pud_readonly(pud);
-                       } else {
-                               stage2_wp_pmds(mmu, pud, addr, next);
-                       }
-               }
-       } while (pud++, addr = next, addr != end);
-}
-
-/**
- * stage2_wp_p4ds - write protect PGD range
- * @pgd:       pointer to pgd entry
- * @addr:      range start address
- * @end:       range end address
- */
-static void  stage2_wp_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
-                           phys_addr_t addr, phys_addr_t end)
-{
-       struct kvm *kvm = mmu->kvm;
-       p4d_t *p4d;
-       phys_addr_t next;
-
-       p4d = stage2_p4d_offset(kvm, pgd, addr);
-       do {
-               next = stage2_p4d_addr_end(kvm, addr, end);
-               if (!stage2_p4d_none(kvm, *p4d))
-                       stage2_wp_puds(mmu, p4d, addr, next);
-       } while (p4d++, addr = next, addr != end);
-}
-
 /**
  * stage2_wp_range() - write protect stage2 memory region range
  * @kvm:       The KVM pointer
@@ -1610,27 +537,7 @@ static void  stage2_wp_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
 static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
 {
        struct kvm *kvm = mmu->kvm;
-       pgd_t *pgd;
-       phys_addr_t next;
-
-       pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
-       do {
-               /*
-                * Release kvm_mmu_lock periodically if the memory region is
-                * large. Otherwise, we may see kernel panics with
-                * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR,
-                * CONFIG_LOCKDEP. Additionally, holding the lock too long
-                * will also starve other vCPUs. We have to also make sure
-                * that the page tables are not freed while we released
-                * the lock.
-                */
-               cond_resched_lock(&kvm->mmu_lock);
-               if (!READ_ONCE(mmu->pgd))
-                       break;
-               next = stage2_pgd_addr_end(kvm, addr, end);
-               if (stage2_pgd_present(kvm, *pgd))
-                       stage2_wp_p4ds(mmu, pgd, addr, next);
-       } while (pgd++, addr = next, addr != end);
+       stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_wrprotect);
 }
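
stage2_wp_range() now just hands the whole range plus kvm_pgtable_stage2_wrprotect to stage2_apply_range_resched(), which is introduced elsewhere in this series. The helper's exact chunking is not shown here, so the sketch below only models the general idea of applying a callback over a range in bounded chunks with a yield point between them; every name and the 2MiB chunk size are invented.

    /* Invented model: apply a callback over [addr, end) in bounded chunks. */
    #include <stdint.h>
    #include <stdio.h>

    typedef int (*range_fn)(uint64_t addr, uint64_t size);

    static int wrprotect(uint64_t addr, uint64_t size)
    {
        printf("write-protect %#llx + %#llx\n",
               (unsigned long long)addr, (unsigned long long)size);
        return 0;
    }

    static int apply_range(uint64_t addr, uint64_t end, uint64_t chunk, range_fn fn)
    {
        while (addr < end) {
            uint64_t next = addr + chunk < end ? addr + chunk : end;
            int ret = fn(addr, next - addr);

            if (ret)
                return ret;
            /* the real helper drops the mmu_lock / cond_resched()s here */
            addr = next;
        }
        return 0;
    }

    int main(void)
    {
        return apply_range(0x0, 0x900000, 0x200000, wrprotect);  /* 2MiB chunks */
    }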
 
 /**
@@ -1835,18 +742,19 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 {
        int ret;
        bool write_fault, writable, force_pte = false;
-       bool exec_fault, needs_exec;
+       bool exec_fault;
+       bool device = false;
        unsigned long mmu_seq;
-       gfn_t gfn = fault_ipa >> PAGE_SHIFT;
        struct kvm *kvm = vcpu->kvm;
        struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
        struct vm_area_struct *vma;
        short vma_shift;
+       gfn_t gfn;
        kvm_pfn_t pfn;
-       pgprot_t mem_type = PAGE_S2;
        bool logging_active = memslot_is_logging(memslot);
-       unsigned long vma_pagesize, flags = 0;
-       struct kvm_s2_mmu *mmu = vcpu->arch.hw_mmu;
+       unsigned long vma_pagesize;
+       enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
+       struct kvm_pgtable *pgt;
 
        write_fault = kvm_is_write_fault(vcpu);
        exec_fault = kvm_vcpu_trap_is_iabt(vcpu);
@@ -1877,24 +785,27 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
            !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
                force_pte = true;
                vma_pagesize = PAGE_SIZE;
+               vma_shift = PAGE_SHIFT;
        }
 
-       /*
-        * The stage2 has a minimum of 2 level table (For arm64 see
-        * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can
-        * use PMD_SIZE huge mappings (even when the PMD is folded into PGD).
-        * As for PUD huge maps, we must make sure that we have at least
-        * 3 levels, i.e, PMD is not folded.
-        */
-       if (vma_pagesize == PMD_SIZE ||
-           (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm)))
-               gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
+       if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
+               fault_ipa &= huge_page_mask(hstate_vma(vma));
+
+       gfn = fault_ipa >> PAGE_SHIFT;
        mmap_read_unlock(current->mm);
 
-       /* We need minimum second+third level pages */
-       ret = kvm_mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm));
-       if (ret)
-               return ret;
+       /*
+        * Permission faults just need to update the existing leaf entry,
+        * and so normally don't require allocations from the memcache. The
+        * only exception to this is when dirty logging is enabled at runtime
+        * and a write fault needs to collapse a block entry into a table.
+        */
+       if (fault_status != FSC_PERM || (logging_active && write_fault)) {
+               ret = kvm_mmu_topup_memory_cache(memcache,
+                                                kvm_mmu_cache_min_pages(kvm));
+               if (ret)
+                       return ret;
+       }
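
The new topup condition says: translation faults always need page-table pages, while permission faults only need them when dirty logging may force a block mapping to be split. A tiny harness over that predicate follows; the fault-status values track the ESR encoding the code relies on (0x04 translation, 0x0c permission), the harness itself is invented.

    /* Harness over the memcache-topup predicate used above. */
    #include <stdbool.h>
    #include <stdio.h>

    #define EX_FSC_FAULT 0x04       /* translation fault */
    #define EX_FSC_PERM  0x0c       /* permission fault */

    static bool needs_memcache(unsigned int fault_status, bool logging, bool write)
    {
        /* translation faults always build tables; permission faults only when
         * dirty logging may have to split a block mapping into a table */
        return fault_status != EX_FSC_PERM || (logging && write);
    }

    int main(void)
    {
        printf("translation fault:         %d\n", needs_memcache(EX_FSC_FAULT, false, false));
        printf("perm fault, no logging:    %d\n", needs_memcache(EX_FSC_PERM, false, true));
        printf("perm fault, logging+write: %d\n", needs_memcache(EX_FSC_PERM, true, true));
        return 0;
    }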
 
        mmu_seq = vcpu->kvm->mmu_notifier_seq;
        /*
@@ -1917,28 +828,20 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
                return -EFAULT;
 
        if (kvm_is_device_pfn(pfn)) {
-               mem_type = PAGE_S2_DEVICE;
-               flags |= KVM_S2PTE_FLAG_IS_IOMAP;
-       } else if (logging_active) {
-               /*
-                * Faults on pages in a memslot with logging enabled
-                * should not be mapped with huge pages (it introduces churn
-                * and performance degradation), so force a pte mapping.
-                */
-               flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
-
+               device = true;
+       } else if (logging_active && !write_fault) {
                /*
                 * Only actually map the page as writable if this was a write
                 * fault.
                 */
-               if (!write_fault)
-                       writable = false;
+               writable = false;
        }
 
-       if (exec_fault && is_iomap(flags))
+       if (exec_fault && device)
                return -ENOEXEC;
 
        spin_lock(&kvm->mmu_lock);
+       pgt = vcpu->arch.hw_mmu->pgt;
        if (mmu_notifier_retry(kvm, mmu_seq))
                goto out_unlock;
 
@@ -1949,62 +852,31 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
        if (vma_pagesize == PAGE_SIZE && !force_pte)
                vma_pagesize = transparent_hugepage_adjust(memslot, hva,
                                                           &pfn, &fault_ipa);
-       if (writable)
+       if (writable) {
+               prot |= KVM_PGTABLE_PROT_W;
                kvm_set_pfn_dirty(pfn);
+               mark_page_dirty(kvm, gfn);
+       }
 
-       if (fault_status != FSC_PERM && !is_iomap(flags))
+       if (fault_status != FSC_PERM && !device)
                clean_dcache_guest_page(pfn, vma_pagesize);
 
-       if (exec_fault)
+       if (exec_fault) {
+               prot |= KVM_PGTABLE_PROT_X;
                invalidate_icache_guest_page(pfn, vma_pagesize);
+       }
 
-       /*
-        * If we took an execution fault we have made the
-        * icache/dcache coherent above and should now let the s2
-        * mapping be executable.
-        *
-        * Write faults (!exec_fault && FSC_PERM) are orthogonal to
-        * execute permissions, and we preserve whatever we have.
-        */
-       needs_exec = exec_fault ||
-               (fault_status == FSC_PERM &&
-                stage2_is_exec(mmu, fault_ipa, vma_pagesize));
-
-       if (vma_pagesize == PUD_SIZE) {
-               pud_t new_pud = kvm_pfn_pud(pfn, mem_type);
-
-               new_pud = kvm_pud_mkhuge(new_pud);
-               if (writable)
-                       new_pud = kvm_s2pud_mkwrite(new_pud);
-
-               if (needs_exec)
-                       new_pud = kvm_s2pud_mkexec(new_pud);
-
-               ret = stage2_set_pud_huge(mmu, memcache, fault_ipa, &new_pud);
-       } else if (vma_pagesize == PMD_SIZE) {
-               pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type);
-
-               new_pmd = kvm_pmd_mkhuge(new_pmd);
-
-               if (writable)
-                       new_pmd = kvm_s2pmd_mkwrite(new_pmd);
-
-               if (needs_exec)
-                       new_pmd = kvm_s2pmd_mkexec(new_pmd);
+       if (device)
+               prot |= KVM_PGTABLE_PROT_DEVICE;
+       else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))
+               prot |= KVM_PGTABLE_PROT_X;
 
-               ret = stage2_set_pmd_huge(mmu, memcache, fault_ipa, &new_pmd);
+       if (fault_status == FSC_PERM && !(logging_active && writable)) {
+               ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
        } else {
-               pte_t new_pte = kvm_pfn_pte(pfn, mem_type);
-
-               if (writable) {
-                       new_pte = kvm_s2pte_mkwrite(new_pte);
-                       mark_page_dirty(kvm, gfn);
-               }
-
-               if (needs_exec)
-                       new_pte = kvm_s2pte_mkexec(new_pte);
-
-               ret = stage2_set_pte(mmu, memcache, fault_ipa, &new_pte, flags);
+               ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
+                                            __pfn_to_phys(pfn), prot,
+                                            memcache);
        }
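
With the generic page-table code the fault handler reduces to composing a kvm_pgtable_prot mask (R always, plus W, X and DEVICE as determined above) and then either relaxing permissions on an existing leaf for a pure permission fault or installing a new mapping. The sketch below reruns only the composition logic; the flag values are made up, the real enum lives in kvm_pgtable.h.

    /* Invented flag values; only the composition mirrors the fault path above. */
    #include <stdio.h>

    enum ex_prot {
        EX_PROT_R      = 1 << 0,
        EX_PROT_W      = 1 << 1,
        EX_PROT_X      = 1 << 2,
        EX_PROT_DEVICE = 1 << 3,
    };

    static unsigned int compose_prot(int writable, int exec_fault, int device, int cache_dic)
    {
        unsigned int prot = EX_PROT_R;

        if (writable)
            prot |= EX_PROT_W;
        if (exec_fault)
            prot |= EX_PROT_X;
        if (device)
            prot |= EX_PROT_DEVICE;
        else if (cache_dic)     /* CTR_EL0.DIC: I/D coherence maintained by hardware */
            prot |= EX_PROT_X;
        return prot;
    }

    int main(void)
    {
        printf("write fault, normal memory: %#x\n", compose_prot(1, 0, 0, 0));
        printf("exec fault, normal memory:  %#x\n", compose_prot(0, 1, 0, 0));
        printf("write fault, device memory: %#x\n", compose_prot(1, 0, 1, 0));
        return 0;
    }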
 
 out_unlock:
@@ -2014,46 +886,23 @@ out_unlock:
        return ret;
 }
 
-/*
- * Resolve the access fault by making the page young again.
- * Note that because the faulting entry is guaranteed not to be
- * cached in the TLB, we don't need to invalidate anything.
- * Only the HW Access Flag updates are supported for Stage 2 (no DBM),
- * so there is no need for atomic (pte|pmd)_mkyoung operations.
- */
+/* Resolve the access fault by making the page young again. */
 static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
 {
-       pud_t *pud;
-       pmd_t *pmd;
-       pte_t *pte;
-       kvm_pfn_t pfn;
-       bool pfn_valid = false;
+       pte_t pte;
+       kvm_pte_t kpte;
+       struct kvm_s2_mmu *mmu;
 
        trace_kvm_access_fault(fault_ipa);
 
        spin_lock(&vcpu->kvm->mmu_lock);
-
-       if (!stage2_get_leaf_entry(vcpu->arch.hw_mmu, fault_ipa, &pud, &pmd, &pte))
-               goto out;
-
-       if (pud) {              /* HugeTLB */
-               *pud = kvm_s2pud_mkyoung(*pud);
-               pfn = kvm_pud_pfn(*pud);
-               pfn_valid = true;
-       } else  if (pmd) {      /* THP, HugeTLB */
-               *pmd = pmd_mkyoung(*pmd);
-               pfn = pmd_pfn(*pmd);
-               pfn_valid = true;
-       } else {
-               *pte = pte_mkyoung(*pte);       /* Just a page... */
-               pfn = pte_pfn(*pte);
-               pfn_valid = true;
-       }
-
-out:
+       mmu = vcpu->arch.hw_mmu;
+       kpte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa);
        spin_unlock(&vcpu->kvm->mmu_lock);
-       if (pfn_valid)
-               kvm_set_pfn_accessed(pfn);
+
+       pte = __pte(kpte);
+       if (pte_valid(pte))
+               kvm_set_pfn_accessed(pte_pfn(pte));
 }
 
 /**
@@ -2224,7 +1073,7 @@ static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *dat
 int kvm_unmap_hva_range(struct kvm *kvm,
                        unsigned long start, unsigned long end, unsigned flags)
 {
-       if (!kvm->arch.mmu.pgd)
+       if (!kvm->arch.mmu.pgt)
                return 0;
 
        trace_kvm_unmap_hva_range(start, end);
@@ -2234,28 +1083,27 @@ int kvm_unmap_hva_range(struct kvm *kvm,
 
 static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
 {
-       pte_t *pte = (pte_t *)data;
+       kvm_pfn_t *pfn = (kvm_pfn_t *)data;
 
        WARN_ON(size != PAGE_SIZE);
+
        /*
-        * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
-        * flag clear because MMU notifiers will have unmapped a huge PMD before
-        * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
-        * therefore stage2_set_pte() never needs to clear out a huge PMD
-        * through this calling path.
+        * The MMU notifiers will have unmapped a huge PMD before calling
+        * ->change_pte() (which in turn calls kvm_set_spte_hva()) and
+        * therefore we never need to clear out a huge PMD through this
+        * calling path and a memcache is not required.
         */
-       stage2_set_pte(&kvm->arch.mmu, NULL, gpa, pte, 0);
+       kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, gpa, PAGE_SIZE,
+                              __pfn_to_phys(*pfn), KVM_PGTABLE_PROT_R, NULL);
        return 0;
 }
 
-
 int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
 {
        unsigned long end = hva + PAGE_SIZE;
        kvm_pfn_t pfn = pte_pfn(pte);
-       pte_t stage2_pte;
 
-       if (!kvm->arch.mmu.pgd)
+       if (!kvm->arch.mmu.pgt)
                return 0;
 
        trace_kvm_set_spte_hva(hva);
@@ -2265,51 +1113,30 @@ int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
         * just like a translation fault and clean the cache to the PoC.
         */
        clean_dcache_guest_page(pfn, PAGE_SIZE);
-       stage2_pte = kvm_pfn_pte(pfn, PAGE_S2);
-       handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
-
+       handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pfn);
        return 0;
 }
 
 static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
 {
-       pud_t *pud;
-       pmd_t *pmd;
-       pte_t *pte;
+       pte_t pte;
+       kvm_pte_t kpte;
 
        WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
-       if (!stage2_get_leaf_entry(&kvm->arch.mmu, gpa, &pud, &pmd, &pte))
-               return 0;
-
-       if (pud)
-               return stage2_pudp_test_and_clear_young(pud);
-       else if (pmd)
-               return stage2_pmdp_test_and_clear_young(pmd);
-       else
-               return stage2_ptep_test_and_clear_young(pte);
+       kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt, gpa);
+       pte = __pte(kpte);
+       return pte_valid(pte) && pte_young(pte);
 }
 
 static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
 {
-       pud_t *pud;
-       pmd_t *pmd;
-       pte_t *pte;
-
        WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
-       if (!stage2_get_leaf_entry(&kvm->arch.mmu, gpa, &pud, &pmd, &pte))
-               return 0;
-
-       if (pud)
-               return kvm_s2pud_young(*pud);
-       else if (pmd)
-               return pmd_young(*pmd);
-       else
-               return pte_young(*pte);
+       return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt, gpa);
 }
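
The access/age handlers above now funnel through kvm_pgtable_stage2_mkyoung()/mkold()/is_young(), which operate on the hardware access flag and hand back the old descriptor so the caller can test it. A toy model of that access-flag handling; the bit positions follow the usual arm64 descriptor layout (valid bit 0, AF bit 10) but this is an illustration, not the pgtable code itself.

    /* Toy access-flag model behind the mkyoung/mkold/is_young helpers. */
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define EX_PTE_VALID (1ULL << 0)
    #define EX_PTE_AF    (1ULL << 10)

    static bool ex_valid(uint64_t pte) { return pte & EX_PTE_VALID; }
    static bool ex_young(uint64_t pte) { return pte & EX_PTE_AF; }

    int main(void)
    {
        uint64_t pte = EX_PTE_VALID;            /* mapped, not yet accessed */

        pte |= EX_PTE_AF;                       /* mkyoung: access fault handled */
        printf("young after access fault: %d\n", ex_valid(pte) && ex_young(pte));

        uint64_t old = pte;                     /* mkold hands back the old entry */
        pte &= ~EX_PTE_AF;
        printf("was young when aged:      %d\n", ex_valid(old) && ex_young(old));
        printf("young after aging:        %d\n", ex_young(pte));
        return 0;
    }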
 
 int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
 {
-       if (!kvm->arch.mmu.pgd)
+       if (!kvm->arch.mmu.pgt)
                return 0;
        trace_kvm_age_hva(start, end);
        return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
@@ -2317,24 +1144,16 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
 
 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
 {
-       if (!kvm->arch.mmu.pgd)
+       if (!kvm->arch.mmu.pgt)
                return 0;
        trace_kvm_test_age_hva(hva);
        return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE,
                                 kvm_test_age_hva_handler, NULL);
 }
 
-void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
-{
-       kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
-}
-
 phys_addr_t kvm_mmu_get_httbr(void)
 {
-       if (__kvm_cpu_uses_extended_idmap())
-               return virt_to_phys(merged_hyp_pgd);
-       else
-               return virt_to_phys(hyp_pgd);
+       return __pa(hyp_pgtable->pgd);
 }
 
 phys_addr_t kvm_get_idmap_vector(void)
@@ -2342,15 +1161,11 @@ phys_addr_t kvm_get_idmap_vector(void)
        return hyp_idmap_vector;
 }
 
-static int kvm_map_idmap_text(pgd_t *pgd)
+static int kvm_map_idmap_text(void)
 {
-       int err;
-
-       /* Create the idmap in the boot page tables */
-       err =   __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
-                                     hyp_idmap_start, hyp_idmap_end,
-                                     __phys_to_pfn(hyp_idmap_start),
-                                     PAGE_HYP_EXEC);
+       unsigned long size = hyp_idmap_end - hyp_idmap_start;
+       int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start,
+                                       PAGE_HYP_EXEC);
        if (err)
                kvm_err("Failed to idmap %lx-%lx\n",
                        hyp_idmap_start, hyp_idmap_end);
@@ -2361,6 +1176,7 @@ static int kvm_map_idmap_text(pgd_t *pgd)
 int kvm_mmu_init(void)
 {
        int err;
+       u32 hyp_va_bits;
 
        hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
        hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
@@ -2374,6 +1190,8 @@ int kvm_mmu_init(void)
         */
        BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
 
+       hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
+       kvm_debug("Using %u-bit virtual addresses at EL2\n", hyp_va_bits);
        kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
        kvm_debug("HYP VA range: %lx:%lx\n",
                  kern_hyp_va(PAGE_OFFSET),
@@ -2391,43 +1209,30 @@ int kvm_mmu_init(void)
                goto out;
        }
 
-       hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
-       if (!hyp_pgd) {
-               kvm_err("Hyp mode PGD not allocated\n");
+       hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL);
+       if (!hyp_pgtable) {
+               kvm_err("Hyp mode page-table not allocated\n");
                err = -ENOMEM;
                goto out;
        }
 
-       if (__kvm_cpu_uses_extended_idmap()) {
-               boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
-                                                        hyp_pgd_order);
-               if (!boot_hyp_pgd) {
-                       kvm_err("Hyp boot PGD not allocated\n");
-                       err = -ENOMEM;
-                       goto out;
-               }
-
-               err = kvm_map_idmap_text(boot_hyp_pgd);
-               if (err)
-                       goto out;
+       err = kvm_pgtable_hyp_init(hyp_pgtable, hyp_va_bits);
+       if (err)
+               goto out_free_pgtable;
 
-               merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
-               if (!merged_hyp_pgd) {
-                       kvm_err("Failed to allocate extra HYP pgd\n");
-                       goto out;
-               }
-               __kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd,
-                                   hyp_idmap_start);
-       } else {
-               err = kvm_map_idmap_text(hyp_pgd);
-               if (err)
-                       goto out;
-       }
+       err = kvm_map_idmap_text();
+       if (err)
+               goto out_destroy_pgtable;
 
        io_map_base = hyp_idmap_start;
        return 0;
+
+out_destroy_pgtable:
+       kvm_pgtable_hyp_destroy(hyp_pgtable);
+out_free_pgtable:
+       kfree(hyp_pgtable);
+       hyp_pgtable = NULL;
 out:
-       free_hyp_pgds();
        return err;
 }
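
kvm_mmu_init() now derives the EL2 VA width from the idmap's T0SZ field (VA bits = 64 - T0SZ) and feeds it to kvm_pgtable_hyp_init(). A quick numeric check of that expression, assuming the usual TCR_EL1-style field layout for idmap_t0sz (T0SZ in bits [5:0]):

    /* VA width from T0SZ; field layout assumed to match TCR_EL1. */
    #include <stdint.h>
    #include <stdio.h>

    #define EX_TCR_T0SZ_OFFSET 0
    #define EX_TCR_T0SZ_MASK   (0x3fULL << EX_TCR_T0SZ_OFFSET)

    int main(void)
    {
        uint64_t idmap_t0sz = 16;       /* e.g. T0SZ = 16 for 48-bit VAs */
        uint32_t hyp_va_bits;

        hyp_va_bits = 64 - ((idmap_t0sz & EX_TCR_T0SZ_MASK) >> EX_TCR_T0SZ_OFFSET);
        printf("Using %u-bit virtual addresses at EL2\n", hyp_va_bits);
        return 0;
    }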
 
index f7b52ce..920ac43 100644 (file)
 void kvm_update_stolen_time(struct kvm_vcpu *vcpu)
 {
        struct kvm *kvm = vcpu->kvm;
-       u64 steal;
-       __le64 steal_le;
-       u64 offset;
-       int idx;
        u64 base = vcpu->arch.steal.base;
+       u64 last_steal = vcpu->arch.steal.last_steal;
+       u64 offset = offsetof(struct pvclock_vcpu_stolen_time, stolen_time);
+       u64 steal = 0;
+       int idx;
 
        if (base == GPA_INVALID)
                return;
 
-       /* Let's do the local bookkeeping */
-       steal = vcpu->arch.steal.steal;
-       steal += current->sched_info.run_delay - vcpu->arch.steal.last_steal;
-       vcpu->arch.steal.last_steal = current->sched_info.run_delay;
-       vcpu->arch.steal.steal = steal;
-
-       steal_le = cpu_to_le64(steal);
        idx = srcu_read_lock(&kvm->srcu);
-       offset = offsetof(struct pvclock_vcpu_stolen_time, stolen_time);
-       kvm_put_guest(kvm, base + offset, steal_le, u64);
+       if (!kvm_get_guest(kvm, base + offset, steal)) {
+               steal = le64_to_cpu(steal);
+               vcpu->arch.steal.last_steal = READ_ONCE(current->sched_info.run_delay);
+               steal += vcpu->arch.steal.last_steal - last_steal;
+               kvm_put_guest(kvm, base + offset, cpu_to_le64(steal));
+       }
        srcu_read_unlock(&kvm->srcu, idx);
 }
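
The rewritten update does a read-modify-write of the 64-bit stolen_time field in guest memory: read the current little-endian value with kvm_get_guest(), add the delta of run_delay since the last update, and write it back with kvm_put_guest(). The userspace-style model below mirrors that accumulation; the structures and nanosecond values are invented and the endian conversions are only noted in comments.

    /* Model of the read-modify-write steal-time update; all values invented. */
    #include <stdint.h>
    #include <stdio.h>

    struct pv_time { uint64_t stolen_time; };   /* guest-visible, little-endian */
    struct vcpu    { uint64_t last_steal; };

    static void update_stolen_time(struct pv_time *shared, struct vcpu *vcpu,
                                   uint64_t run_delay /* total ns stolen so far */)
    {
        uint64_t steal = shared->stolen_time;   /* le64_to_cpu() on big-endian hosts */
        uint64_t last = vcpu->last_steal;

        vcpu->last_steal = run_delay;
        steal += run_delay - last;              /* only the new delta is added */
        shared->stolen_time = steal;            /* cpu_to_le64() on big-endian hosts */
    }

    int main(void)
    {
        struct pv_time shared = { 0 };
        struct vcpu vcpu = { 1000 };

        update_stolen_time(&shared, &vcpu, 1500);       /* +500 ns stolen */
        update_stolen_time(&shared, &vcpu, 4000);       /* +2500 ns stolen */
        printf("stolen_time = %llu\n", (unsigned long long)shared.stolen_time); /* 3000 */
        return 0;
    }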
 
@@ -43,7 +40,8 @@ long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu)
        switch (feature) {
        case ARM_SMCCC_HV_PV_TIME_FEATURES:
        case ARM_SMCCC_HV_PV_TIME_ST:
-               val = SMCCC_RET_SUCCESS;
+               if (vcpu->arch.steal.base != GPA_INVALID)
+                       val = SMCCC_RET_SUCCESS;
                break;
        }
 
@@ -64,7 +62,6 @@ gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu)
         * Start counting stolen time from the time the guest requests
         * the feature enabled.
         */
-       vcpu->arch.steal.steal = 0;
        vcpu->arch.steal.last_steal = current->sched_info.run_delay;
 
        idx = srcu_read_lock(&kvm->srcu);
@@ -74,7 +71,7 @@ gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu)
        return base;
 }
 
-static bool kvm_arm_pvtime_supported(void)
+bool kvm_arm_pvtime_supported(void)
 {
        return !!sched_info_on();
 }
index ee33875..2202b71 100644 (file)
@@ -339,7 +339,7 @@ u32 get_kvm_ipa_limit(void)
 
 int kvm_set_ipa_limit(void)
 {
-       unsigned int ipa_max, pa_max, va_max, parange, tgran_2;
+       unsigned int parange, tgran_2;
        u64 mmfr0;
 
        mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
@@ -376,39 +376,11 @@ int kvm_set_ipa_limit(void)
                break;
        }
 
-       pa_max = id_aa64mmfr0_parange_to_phys_shift(parange);
-
-       /* Clamp the IPA limit to the PA size supported by the kernel */
-       ipa_max = (pa_max > PHYS_MASK_SHIFT) ? PHYS_MASK_SHIFT : pa_max;
-       /*
-        * Since our stage2 table is dependent on the stage1 page table code,
-        * we must always honor the following condition:
-        *
-        *  Number of levels in Stage1 >= Number of levels in Stage2.
-        *
-        * So clamp the ipa limit further down to limit the number of levels.
-        * Since we can concatenate upto 16 tables at entry level, we could
-        * go upto 4bits above the maximum VA addressable with the current
-        * number of levels.
-        */
-       va_max = PGDIR_SHIFT + PAGE_SHIFT - 3;
-       va_max += 4;
-
-       if (va_max < ipa_max)
-               ipa_max = va_max;
-
-       /*
-        * If the final limit is lower than the real physical address
-        * limit of the CPUs, report the reason.
-        */
-       if (ipa_max < pa_max)
-               pr_info("kvm: Limiting the IPA size due to kernel %s Address limit\n",
-                       (va_max < pa_max) ? "Virtual" : "Physical");
-
-       WARN(ipa_max < KVM_PHYS_SHIFT,
-            "KVM IPA limit (%d bit) is smaller than default size\n", ipa_max);
-       kvm_ipa_limit = ipa_max;
-       kvm_info("IPA Size Limit: %dbits\n", kvm_ipa_limit);
+       kvm_ipa_limit = id_aa64mmfr0_parange_to_phys_shift(parange);
+       WARN(kvm_ipa_limit < KVM_PHYS_SHIFT,
+            "KVM IPA Size Limit (%d bits) is smaller than default size\n",
+            kvm_ipa_limit);
+       kvm_info("IPA Size Limit: %d bits\n", kvm_ipa_limit);
 
        return 0;
 }
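
kvm_set_ipa_limit() now takes the IPA limit straight from ID_AA64MMFR0_EL1.PARange via id_aa64mmfr0_parange_to_phys_shift(). For reference, the architectural encoding that helper implements maps as below; the fallback for reserved encodings is an assumption here and the kernel helper may handle it differently.

    /* PARange encoding -> physical address bits, per the ARMv8 ARM. */
    #include <stdio.h>

    static unsigned int parange_to_phys_shift(unsigned int parange)
    {
        switch (parange) {
        case 0: return 32;
        case 1: return 36;
        case 2: return 40;
        case 3: return 42;
        case 4: return 44;
        case 5: return 48;
        case 6: return 52;
        default: return 52;     /* assumption: clamp reserved encodings to the max */
        }
    }

    int main(void)
    {
        for (unsigned int p = 0; p <= 6; p++)
            printf("PARange %u -> %u-bit PA\n", p, parange_to_phys_shift(p));
        return 0;
    }
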
index 4691053..ff04443 100644 (file)
@@ -23,7 +23,7 @@ TRACE_EVENT(kvm_entry,
                __entry->vcpu_pc                = vcpu_pc;
        ),
 
-       TP_printk("PC: 0x%08lx", __entry->vcpu_pc)
+       TP_printk("PC: 0x%016lx", __entry->vcpu_pc)
 );
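
This and the following tracepoint hunks only widen the printed addresses from %08lx to %016lx so 64-bit PCs and HVAs are zero-padded to their full width; neither format truncates, the old one merely padded short values to eight digits. A trivial illustration (assumes a 64-bit long, as on arm64):

    #include <stdio.h>

    int main(void)
    {
        unsigned long pc = 0xffff800010001234UL;        /* needs a 64-bit long */

        printf("PC: 0x%08lx\n", pc);            /* full 16 digits, no padding added */
        printf("PC: 0x%016lx\n", pc);           /* identical output for wide values */
        printf("PC: 0x%08lx\n", 0x1234UL);      /* PC: 0x00001234 */
        printf("PC: 0x%016lx\n", 0x1234UL);     /* PC: 0x0000000000001234 */
        return 0;
    }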
 
 TRACE_EVENT(kvm_exit,
@@ -42,7 +42,7 @@ TRACE_EVENT(kvm_exit,
                __entry->vcpu_pc                = vcpu_pc;
        ),
 
-       TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%08lx",
+       TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%016lx",
                  __print_symbolic(__entry->ret, kvm_arm_exception_type),
                  __entry->esr_ec,
                  __print_symbolic(__entry->esr_ec, kvm_arm_exception_class),
@@ -69,7 +69,7 @@ TRACE_EVENT(kvm_guest_fault,
                __entry->ipa                    = ipa;
        ),
 
-       TP_printk("ipa %#llx, hsr %#08lx, hxfar %#08lx, pc %#08lx",
+       TP_printk("ipa %#llx, hsr %#08lx, hxfar %#08lx, pc %#016lx",
                  __entry->ipa, __entry->hsr,
                  __entry->hxfar, __entry->vcpu_pc)
 );
@@ -131,7 +131,7 @@ TRACE_EVENT(kvm_mmio_emulate,
                __entry->cpsr                   = cpsr;
        ),
 
-       TP_printk("Emulate MMIO at: 0x%08lx (instr: %08lx, cpsr: %08lx)",
+       TP_printk("Emulate MMIO at: 0x%016lx (instr: %08lx, cpsr: %08lx)",
                  __entry->vcpu_pc, __entry->instr, __entry->cpsr)
 );
 
@@ -149,7 +149,7 @@ TRACE_EVENT(kvm_unmap_hva_range,
                __entry->end            = end;
        ),
 
-       TP_printk("mmu notifier unmap range: %#08lx -- %#08lx",
+       TP_printk("mmu notifier unmap range: %#016lx -- %#016lx",
                  __entry->start, __entry->end)
 );
 
@@ -165,7 +165,7 @@ TRACE_EVENT(kvm_set_spte_hva,
                __entry->hva            = hva;
        ),
 
-       TP_printk("mmu notifier set pte hva: %#08lx", __entry->hva)
+       TP_printk("mmu notifier set pte hva: %#016lx", __entry->hva)
 );
 
 TRACE_EVENT(kvm_age_hva,
@@ -182,7 +182,7 @@ TRACE_EVENT(kvm_age_hva,
                __entry->end            = end;
        ),
 
-       TP_printk("mmu notifier age hva: %#08lx -- %#08lx",
+       TP_printk("mmu notifier age hva: %#016lx -- %#016lx",
                  __entry->start, __entry->end)
 );
 
@@ -198,7 +198,7 @@ TRACE_EVENT(kvm_test_age_hva,
                __entry->hva            = hva;
        ),
 
-       TP_printk("mmu notifier test age hva: %#08lx", __entry->hva)
+       TP_printk("mmu notifier test age hva: %#016lx", __entry->hva)
 );
 
 TRACE_EVENT(kvm_set_way_flush,
index 2c56d1e..8d78acc 100644 (file)
@@ -22,7 +22,7 @@ TRACE_EVENT(kvm_wfx_arm64,
                __entry->is_wfe  = is_wfe;
        ),
 
-       TP_printk("guest executed wf%c at: 0x%08lx",
+       TP_printk("guest executed wf%c at: 0x%016lx",
                  __entry->is_wfe ? 'e' : 'i', __entry->vcpu_pc)
 );
 
@@ -42,7 +42,7 @@ TRACE_EVENT(kvm_hvc_arm64,
                __entry->imm = imm;
        ),
 
-       TP_printk("HVC at 0x%08lx (r0: 0x%08lx, imm: 0x%lx)",
+       TP_printk("HVC at 0x%016lx (r0: 0x%016lx, imm: 0x%lx)",
                  __entry->vcpu_pc, __entry->r0, __entry->imm)
 );
 
@@ -135,7 +135,7 @@ TRACE_EVENT(trap_reg,
                __entry->write_value = write_value;
        ),
 
-       TP_printk("%s %s reg %d (0x%08llx)", __entry->fn,  __entry->is_write?"write to":"read from", __entry->reg, __entry->write_value)
+       TP_printk("%s %s reg %d (0x%016llx)", __entry->fn,  __entry->is_write?"write to":"read from", __entry->reg, __entry->write_value)
 );
 
 TRACE_EVENT(kvm_handle_sys_reg,
index d39d6cf..7527022 100644 (file)
@@ -3578,6 +3578,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_SMALLER_MAXPHYADDR:
                r = (int) allow_smaller_maxphyaddr;
                break;
+       case KVM_CAP_STEAL_TIME:
+               r = sched_info_on();
+               break;
        default:
                break;
        }
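
With KVM_CAP_STEAL_TIME reported by kvm_vm_ioctl_check_extension() (the arm64 side returns kvm_arm_pvtime_supported()), userspace can probe the capability through the generic KVM_CHECK_EXTENSION ioctl. A minimal probe, assuming installed headers that already carry the new capability number:

    /* Probe KVM_CAP_STEAL_TIME through KVM_CHECK_EXTENSION. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/kvm.h>

    #ifndef KVM_CAP_STEAL_TIME
    #define KVM_CAP_STEAL_TIME 187          /* value added by this series */
    #endif

    int main(void)
    {
        int kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);
        int r;

        if (kvm < 0) {
            perror("open /dev/kvm");
            return 1;
        }
        r = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_STEAL_TIME);
        printf("KVM_CAP_STEAL_TIME: %s\n", r > 0 ? "supported" : "not supported");
        close(kvm);
        return 0;
    }
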
index a230767..05e3c2f 100644 (file)
@@ -749,25 +749,46 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
 int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
                              gpa_t gpa, unsigned long len);
 
-#define __kvm_put_guest(kvm, gfn, offset, value, type)                 \
+#define __kvm_get_guest(kvm, gfn, offset, v)                           \
 ({                                                                     \
        unsigned long __addr = gfn_to_hva(kvm, gfn);                    \
-       type __user *__uaddr = (type __user *)(__addr + offset);        \
+       typeof(v) __user *__uaddr = (typeof(__uaddr))(__addr + offset); \
        int __ret = -EFAULT;                                            \
                                                                        \
        if (!kvm_is_error_hva(__addr))                                  \
-               __ret = put_user(value, __uaddr);                       \
+               __ret = get_user(v, __uaddr);                           \
+       __ret;                                                          \
+})
+
+#define kvm_get_guest(kvm, gpa, v)                                     \
+({                                                                     \
+       gpa_t __gpa = gpa;                                              \
+       struct kvm *__kvm = kvm;                                        \
+                                                                       \
+       __kvm_get_guest(__kvm, __gpa >> PAGE_SHIFT,                     \
+                       offset_in_page(__gpa), v);                      \
+})
+
+#define __kvm_put_guest(kvm, gfn, offset, v)                           \
+({                                                                     \
+       unsigned long __addr = gfn_to_hva(kvm, gfn);                    \
+       typeof(v) __user *__uaddr = (typeof(__uaddr))(__addr + offset); \
+       int __ret = -EFAULT;                                            \
+                                                                       \
+       if (!kvm_is_error_hva(__addr))                                  \
+               __ret = put_user(v, __uaddr);                           \
        if (!__ret)                                                     \
                mark_page_dirty(kvm, gfn);                              \
        __ret;                                                          \
 })
 
-#define kvm_put_guest(kvm, gpa, value, type)                           \
+#define kvm_put_guest(kvm, gpa, v)                                     \
 ({                                                                     \
        gpa_t __gpa = gpa;                                              \
        struct kvm *__kvm = kvm;                                        \
+                                                                       \
        __kvm_put_guest(__kvm, __gpa >> PAGE_SHIFT,                     \
-                       offset_in_page(__gpa), (value), type);          \
+                       offset_in_page(__gpa), v);                      \
 })
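
The reworked __kvm_get_guest()/__kvm_put_guest() drop the explicit type argument and derive the user pointer type from typeof(v), which is what lets kvm_put_guest() callers lose the trailing type parameter. A small userspace analogue of that typeof() trick (GNU C statement expressions and typeof, with memcpy standing in for get_user()/put_user()):

    /* Userspace analogue of the typeof(v)-based accessors above. */
    #include <stdio.h>
    #include <string.h>

    #define get_val(v, srcp)                                    \
    ({                                                          \
        typeof(v) *__src = (typeof(v) *)(srcp);                 \
        memcpy(&(v), __src, sizeof(v));                         \
        0;                                                      \
    })

    int main(void)
    {
        unsigned long long backing = 0x1122334455667788ULL;
        unsigned long long v = 0;

        if (!get_val(v, &backing))      /* size and type inferred from v */
            printf("read back %#llx\n", v);
        return 0;
    }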
 
 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len);
index f6d8603..3d80234 100644 (file)
@@ -1035,6 +1035,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_LAST_CPU 184
 #define KVM_CAP_SMALLER_MAXPHYADDR 185
 #define KVM_CAP_S390_DIAG318 186
+#define KVM_CAP_STEAL_TIME 187
 
 #ifdef KVM_CAP_IRQ_ROUTING