// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 */
#include <linux/mman.h>
#include <linux/kvm_host.h>
#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/sched/signal.h>
#include <trace/events/kvm.h>
#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_pgtable.h>
#include <asm/kvm_ras.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/virt.h>

#include "trace.h"
static struct kvm_pgtable *hyp_pgtable;
static DEFINE_MUTEX(kvm_hyp_pgd_mutex);

static unsigned long hyp_idmap_start;
static unsigned long hyp_idmap_end;
static phys_addr_t hyp_idmap_vector;

static unsigned long io_map_base;
static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end)
{
	phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL);
	phys_addr_t boundary = ALIGN_DOWN(addr + size, size);

	return (boundary - 1 < end - 1) ? boundary : end;
}
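/*
 * Illustration (a 2MiB granule is assumed purely for the example): for
 * addr = 0x40100000, addr + size = 0x40300000, which aligns down to
 * 0x40200000, so the first batch stops at the next 2MiB boundary and the
 * caller resumes from there. The "- 1" in the comparison makes a boundary or
 * end that wrapped to 0 compare as the highest address rather than the lowest.
 */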
/*
 * Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
 * we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
 * CONFIG_LOCKUP_DETECTOR or CONFIG_LOCKDEP. Additionally, holding the lock
 * too long will starve other vCPUs. We also have to make sure that the page
 * tables are not freed while the lock is released.
 */
static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr,
			      phys_addr_t end,
			      int (*fn)(struct kvm_pgtable *, u64, u64),
			      bool resched)
{
	int ret;
	u64 next;

	do {
		struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
		if (!pgt)
			return -EINVAL;

		next = stage2_range_addr_end(addr, end);
		ret = fn(pgt, addr, next - addr);
		if (ret)
			break;

		if (resched && next != end)
			cond_resched_rwlock_write(&kvm->mmu_lock);
	} while (addr = next, addr != end);

	return ret;
}

#define stage2_apply_range_resched(kvm, addr, end, fn)		\
	stage2_apply_range(kvm, addr, end, fn, true)
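/*
 * The _resched variant is what the long-running walks below use (e.g.
 * stage2_flush_memslot() and stage2_wp_range()), so the write lock can be
 * dropped and re-taken between batches when operating on large memslots.
 */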
static bool memslot_is_logging(struct kvm_memory_slot *memslot)
{
	return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
}

/**
 * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
 * @kvm:	pointer to kvm structure.
 *
 * Interface to HYP function to flush all VM TLB entries
 */
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
	++kvm->stat.generic.remote_tlb_flush_requests;
	kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
}
static bool kvm_is_device_pfn(unsigned long pfn)
{
	return !pfn_is_map_memory(pfn);
}
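/*
 * Any pfn the kernel does not cover with its linear map is treated as device
 * memory here, and will be mapped into the guest with device attributes
 * rather than as normal cacheable memory.
 */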
static void *stage2_memcache_zalloc_page(void *arg)
{
	struct kvm_mmu_memory_cache *mc = arg;
	void *virt;

	/* Allocated with __GFP_ZERO, so no need to zero */
	virt = kvm_mmu_memory_cache_alloc(mc);
	if (virt)
		kvm_account_pgtable_pages(virt, 1);
	return virt;
}

static void *kvm_host_zalloc_pages_exact(size_t size)
{
	return alloc_pages_exact(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
}

static void *kvm_s2_zalloc_pages_exact(size_t size)
{
	void *virt = kvm_host_zalloc_pages_exact(size);

	if (virt)
		kvm_account_pgtable_pages(virt, (size >> PAGE_SHIFT));
	return virt;
}

static void kvm_s2_free_pages_exact(void *virt, size_t size)
{
	kvm_account_pgtable_pages(virt, -(size >> PAGE_SHIFT));
	free_pages_exact(virt, size);
}

static void kvm_host_get_page(void *addr)
{
	get_page(virt_to_page(addr));
}

static void kvm_host_put_page(void *addr)
{
	put_page(virt_to_page(addr));
}

static void kvm_s2_put_page(void *addr)
{
	struct page *p = virt_to_page(addr);
	/* Dropping last refcount, the page will be freed */
	if (page_count(p) == 1)
		kvm_account_pgtable_pages(addr, -1);
	put_page(p);
}

static int kvm_host_page_count(void *addr)
{
	return page_count(virt_to_page(addr));
}

static phys_addr_t kvm_host_pa(void *addr)
{
	return __pa(addr);
}

static void *kvm_host_va(phys_addr_t phys)
{
	return __va(phys);
}

static void clean_dcache_guest_page(void *va, size_t size)
{
	__clean_dcache_guest_page(va, size);
}

static void invalidate_icache_guest_page(void *va, size_t size)
{
	__invalidate_icache_guest_page(va, size);
}
/*
 * Unmapping vs dcache management:
 *
 * If a guest maps certain memory pages as uncached, all writes will
 * bypass the data cache and go directly to RAM.  However, the CPUs
 * can still speculate reads (not writes) and fill cache lines with
 * data.
 *
 * Those cache lines will be *clean* cache lines though, so a
 * clean+invalidate operation is equivalent to an invalidate
 * operation, because no cache lines are marked dirty.
 *
 * Those clean cache lines could be filled prior to an uncached write
 * by the guest, and the cache coherent IO subsystem would therefore
 * end up writing old data to disk.
 *
 * This is why right after unmapping a page/section and invalidating
 * the corresponding TLBs, we flush to make sure the IO subsystem will
 * never hit in the cache.
 *
 * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
 * we then fully enforce cacheability of RAM, no matter what the guest
 * does.
 */
/**
 * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
 * @mmu:   The KVM stage-2 MMU pointer
 * @start: The intermediate physical base address of the range to unmap
 * @size:  The size of the area to unmap
 * @may_block: Whether or not we are permitted to block
 *
 * Clear a range of stage-2 mappings, lowering the various ref-counts.  Must
 * be called while holding mmu_lock (unless for freeing the stage2 pgd before
 * destroying the VM), otherwise another faulting VCPU may come in and mess
 * with things behind our backs.
 */
static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size,
				 bool may_block)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
	phys_addr_t end = start + size;

	lockdep_assert_held_write(&kvm->mmu_lock);
	WARN_ON(size & ~PAGE_MASK);
	WARN_ON(stage2_apply_range(kvm, start, end, kvm_pgtable_stage2_unmap,
				   may_block));
}

static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
{
	__unmap_stage2_range(mmu, start, size, true);
}
static void stage2_flush_memslot(struct kvm *kvm,
				 struct kvm_memory_slot *memslot)
{
	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t end = addr + PAGE_SIZE * memslot->npages;

	stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_flush);
}

/**
 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
 * @kvm: The struct kvm pointer
 *
 * Go through the stage 2 page tables and invalidate any cache lines
 * backing memory already mapped to the VM.
 */
static void stage2_flush_vm(struct kvm *kvm)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int idx, bkt;

	idx = srcu_read_lock(&kvm->srcu);
	write_lock(&kvm->mmu_lock);

	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, bkt, slots)
		stage2_flush_memslot(kvm, memslot);

	write_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
}
/**
 * free_hyp_pgds - free Hyp-mode page tables
 */
void free_hyp_pgds(void)
{
	mutex_lock(&kvm_hyp_pgd_mutex);
	if (hyp_pgtable) {
		kvm_pgtable_hyp_destroy(hyp_pgtable);
		kfree(hyp_pgtable);
		hyp_pgtable = NULL;
	}
	mutex_unlock(&kvm_hyp_pgd_mutex);
}

static bool kvm_host_owns_hyp_mappings(void)
{
	if (is_kernel_in_hyp_mode())
		return false;

	if (static_branch_likely(&kvm_protected_mode_initialized))
		return false;

	/*
	 * This can happen at boot time when __create_hyp_mappings() is called
	 * after the hyp protection has been enabled, but the static key has
	 * not been flipped yet.
	 */
	if (!hyp_pgtable && is_protected_kvm_enabled())
		return false;

	WARN_ON(!hyp_pgtable);

	return true;
}
int __create_hyp_mappings(unsigned long start, unsigned long size,
			  unsigned long phys, enum kvm_pgtable_prot prot)
{
	int err;

	if (WARN_ON(!kvm_host_owns_hyp_mappings()))
		return -EINVAL;

	mutex_lock(&kvm_hyp_pgd_mutex);
	err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot);
	mutex_unlock(&kvm_hyp_pgd_mutex);

	return err;
}

static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
{
	if (!is_vmalloc_addr(kaddr)) {
		BUG_ON(!virt_addr_valid(kaddr));
		return __pa(kaddr);
	} else {
		return page_to_phys(vmalloc_to_page(kaddr)) +
		       offset_in_page(kaddr);
	}
}
322 struct hyp_shared_pfn {
328 static DEFINE_MUTEX(hyp_shared_pfns_lock);
329 static struct rb_root hyp_shared_pfns = RB_ROOT;
331 static struct hyp_shared_pfn *find_shared_pfn(u64 pfn, struct rb_node ***node,
332 struct rb_node **parent)
334 struct hyp_shared_pfn *this;
336 *node = &hyp_shared_pfns.rb_node;
339 this = container_of(**node, struct hyp_shared_pfn, node);
342 *node = &((**node)->rb_left);
343 else if (this->pfn > pfn)
344 *node = &((**node)->rb_right);
352 static int share_pfn_hyp(u64 pfn)
354 struct rb_node **node, *parent;
355 struct hyp_shared_pfn *this;
358 mutex_lock(&hyp_shared_pfns_lock);
359 this = find_shared_pfn(pfn, &node, &parent);
365 this = kzalloc(sizeof(*this), GFP_KERNEL);
373 rb_link_node(&this->node, parent, node);
374 rb_insert_color(&this->node, &hyp_shared_pfns);
375 ret = kvm_call_hyp_nvhe(__pkvm_host_share_hyp, pfn, 1);
377 mutex_unlock(&hyp_shared_pfns_lock);
382 static int unshare_pfn_hyp(u64 pfn)
384 struct rb_node **node, *parent;
385 struct hyp_shared_pfn *this;
388 mutex_lock(&hyp_shared_pfns_lock);
389 this = find_shared_pfn(pfn, &node, &parent);
390 if (WARN_ON(!this)) {
399 rb_erase(&this->node, &hyp_shared_pfns);
401 ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_hyp, pfn, 1);
403 mutex_unlock(&hyp_shared_pfns_lock);
408 int kvm_share_hyp(void *from, void *to)
410 phys_addr_t start, end, cur;
414 if (is_kernel_in_hyp_mode())
418 * The share hcall maps things in the 'fixed-offset' region of the hyp
419 * VA space, so we can only share physically contiguous data-structures
422 if (is_vmalloc_or_module_addr(from) || is_vmalloc_or_module_addr(to))
425 if (kvm_host_owns_hyp_mappings())
426 return create_hyp_mappings(from, to, PAGE_HYP);
428 start = ALIGN_DOWN(__pa(from), PAGE_SIZE);
429 end = PAGE_ALIGN(__pa(to));
430 for (cur = start; cur < end; cur += PAGE_SIZE) {
431 pfn = __phys_to_pfn(cur);
432 ret = share_pfn_hyp(pfn);
440 void kvm_unshare_hyp(void *from, void *to)
442 phys_addr_t start, end, cur;
445 if (is_kernel_in_hyp_mode() || kvm_host_owns_hyp_mappings() || !from)
448 start = ALIGN_DOWN(__pa(from), PAGE_SIZE);
449 end = PAGE_ALIGN(__pa(to));
450 for (cur = start; cur < end; cur += PAGE_SIZE) {
451 pfn = __phys_to_pfn(cur);
452 WARN_ON(unshare_pfn_hyp(pfn));
/**
 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
 * @from:	The virtual kernel start address of the range
 * @to:		The virtual kernel end address of the range (exclusive)
 * @prot:	The protection to be applied to this range
 *
 * The same virtual address as the kernel virtual address is also used
 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
 * physical pages.
 */
int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
{
	phys_addr_t phys_addr;
	unsigned long virt_addr;
	unsigned long start = kern_hyp_va((unsigned long)from);
	unsigned long end = kern_hyp_va((unsigned long)to);

	if (is_kernel_in_hyp_mode())
		return 0;

	if (!kvm_host_owns_hyp_mappings())
		return -EPERM;

	start = start & PAGE_MASK;
	end = PAGE_ALIGN(end);

	for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
		int err;

		phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
		err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr,
					    prot);
		if (err)
			return err;
	}

	return 0;
}
/**
 * hyp_alloc_private_va_range - Allocates a private VA range.
 * @size:	The size of the VA range to reserve.
 * @haddr:	The hypervisor virtual start address of the allocation.
 *
 * The private virtual address (VA) range is allocated below io_map_base
 * and aligned based on the order of @size.
 *
 * Return: 0 on success or negative error code on failure.
 */
int hyp_alloc_private_va_range(size_t size, unsigned long *haddr)
{
	unsigned long base;
	int ret = 0;

	mutex_lock(&kvm_hyp_pgd_mutex);

	/*
	 * This assumes that we have enough space below the idmap
	 * page to allocate our VAs. If not, the check below will
	 * kick. A potential alternative would be to detect that
	 * overflow and switch to an allocation above the idmap.
	 *
	 * The allocated size is always a multiple of PAGE_SIZE.
	 */
	base = io_map_base - PAGE_ALIGN(size);

	/* Align the allocation based on the order of its size */
	base = ALIGN_DOWN(base, PAGE_SIZE << get_order(size));

	/*
	 * Verify that BIT(VA_BITS - 1) hasn't been flipped by
	 * allocating the new area, as it would indicate we've
	 * overflowed the idmap/IO address range.
	 */
	if ((base ^ io_map_base) & BIT(VA_BITS - 1))
		ret = -ENOMEM;
	else
		*haddr = io_map_base = base;

	mutex_unlock(&kvm_hyp_pgd_mutex);

	return ret;
}
541 static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
542 unsigned long *haddr,
543 enum kvm_pgtable_prot prot)
548 if (!kvm_host_owns_hyp_mappings()) {
549 addr = kvm_call_hyp_nvhe(__pkvm_create_private_mapping,
550 phys_addr, size, prot);
551 if (IS_ERR_VALUE(addr))
558 size = PAGE_ALIGN(size + offset_in_page(phys_addr));
559 ret = hyp_alloc_private_va_range(size, &addr);
563 ret = __create_hyp_mappings(addr, size, phys_addr, prot);
567 *haddr = addr + offset_in_page(phys_addr);
572 * create_hyp_io_mappings - Map IO into both kernel and HYP
573 * @phys_addr: The physical start address which gets mapped
574 * @size: Size of the region being mapped
575 * @kaddr: Kernel VA for this mapping
576 * @haddr: HYP VA for this mapping
578 int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
579 void __iomem **kaddr,
580 void __iomem **haddr)
585 if (is_protected_kvm_enabled())
588 *kaddr = ioremap(phys_addr, size);
592 if (is_kernel_in_hyp_mode()) {
597 ret = __create_hyp_private_mapping(phys_addr, size,
598 &addr, PAGE_HYP_DEVICE);
606 *haddr = (void __iomem *)addr;
611 * create_hyp_exec_mappings - Map an executable range into HYP
612 * @phys_addr: The physical start address which gets mapped
613 * @size: Size of the region being mapped
614 * @haddr: HYP VA for this mapping
616 int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
622 BUG_ON(is_kernel_in_hyp_mode());
624 ret = __create_hyp_private_mapping(phys_addr, size,
625 &addr, PAGE_HYP_EXEC);
631 *haddr = (void *)addr;
static struct kvm_pgtable_mm_ops kvm_user_mm_ops = {
	/* We shouldn't need any other callback to walk the PT */
	.phys_to_virt		= kvm_host_va,
};
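/*
 * Walk the host userspace stage-1 page tables with the generic page-table
 * walker in order to find the size of the leaf mapping backing @addr; the
 * fault path uses this to decide whether an hva can be block mapped.
 */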
static int get_user_mapping_size(struct kvm *kvm, u64 addr)
{
	struct kvm_pgtable pgt = {
		.pgd		= (kvm_pte_t *)kvm->mm->pgd,
		.ia_bits	= VA_BITS,
		.start_level	= (KVM_PGTABLE_MAX_LEVELS -
				   CONFIG_PGTABLE_LEVELS),
		.mm_ops		= &kvm_user_mm_ops,
	};
	kvm_pte_t pte = 0;	/* Keep GCC quiet... */
	u32 level = ~0;
	int ret;

	ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level);
	VM_BUG_ON(ret);
	VM_BUG_ON(level >= KVM_PGTABLE_MAX_LEVELS);
	VM_BUG_ON(!(pte & PTE_VALID));

	return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level));
}
static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
	.zalloc_page		= stage2_memcache_zalloc_page,
	.zalloc_pages_exact	= kvm_s2_zalloc_pages_exact,
	.free_pages_exact	= kvm_s2_free_pages_exact,
	.get_page		= kvm_host_get_page,
	.put_page		= kvm_s2_put_page,
	.page_count		= kvm_host_page_count,
	.phys_to_virt		= kvm_host_va,
	.virt_to_phys		= kvm_host_pa,
	.dcache_clean_inval_poc	= clean_dcache_guest_page,
	.icache_inval_pou	= invalidate_icache_guest_page,
};
/**
 * kvm_init_stage2_mmu - Initialise a S2 MMU structure
 * @kvm:	The pointer to the KVM structure
 * @mmu:	The pointer to the s2 MMU structure
 *
 * Allocates only the stage-2 HW PGD level table(s).
 * Note we don't need locking here as this is only called when the VM is
 * created, which can only be done once.
 */
int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
{
	int cpu, err;
	struct kvm_pgtable *pgt;

	if (mmu->pgt != NULL) {
		kvm_err("kvm_arch already initialized?\n");
		return -EINVAL;
	}

	pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT);
	if (!pgt)
		return -ENOMEM;

	mmu->arch = &kvm->arch;
	err = kvm_pgtable_stage2_init(pgt, mmu, &kvm_s2_mm_ops);
	if (err)
		goto out_free_pgtable;

	mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
	if (!mmu->last_vcpu_ran) {
		err = -ENOMEM;
		goto out_destroy_pgtable;
	}

	for_each_possible_cpu(cpu)
		*per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;

	mmu->pgt = pgt;
	mmu->pgd_phys = __pa(pgt->pgd);
	return 0;

out_destroy_pgtable:
	kvm_pgtable_stage2_destroy(pgt);
out_free_pgtable:
	kfree(pgt);
	return err;
}
722 static void stage2_unmap_memslot(struct kvm *kvm,
723 struct kvm_memory_slot *memslot)
725 hva_t hva = memslot->userspace_addr;
726 phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
727 phys_addr_t size = PAGE_SIZE * memslot->npages;
728 hva_t reg_end = hva + size;
731 * A memory region could potentially cover multiple VMAs, and any holes
732 * between them, so iterate over all of them to find out if we should
735 * +--------------------------------------------+
736 * +---------------+----------------+ +----------------+
737 * | : VMA 1 | VMA 2 | | VMA 3 : |
738 * +---------------+----------------+ +----------------+
740 * +--------------------------------------------+
743 struct vm_area_struct *vma;
744 hva_t vm_start, vm_end;
746 vma = find_vma_intersection(current->mm, hva, reg_end);
751 * Take the intersection of this VMA with the memory region
753 vm_start = max(hva, vma->vm_start);
754 vm_end = min(reg_end, vma->vm_end);
756 if (!(vma->vm_flags & VM_PFNMAP)) {
757 gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
758 unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start);
761 } while (hva < reg_end);
/**
 * stage2_unmap_vm - Unmap Stage-2 RAM mappings
 * @kvm: The struct kvm pointer
 *
 * Go through the memregions and unmap any regular RAM
 * backing memory already mapped to the VM.
 */
771 void stage2_unmap_vm(struct kvm *kvm)
773 struct kvm_memslots *slots;
774 struct kvm_memory_slot *memslot;
777 idx = srcu_read_lock(&kvm->srcu);
778 mmap_read_lock(current->mm);
779 write_lock(&kvm->mmu_lock);
781 slots = kvm_memslots(kvm);
782 kvm_for_each_memslot(memslot, bkt, slots)
783 stage2_unmap_memslot(kvm, memslot);
785 write_unlock(&kvm->mmu_lock);
786 mmap_read_unlock(current->mm);
787 srcu_read_unlock(&kvm->srcu, idx);
790 void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
792 struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
793 struct kvm_pgtable *pgt = NULL;
795 write_lock(&kvm->mmu_lock);
800 free_percpu(mmu->last_vcpu_ran);
802 write_unlock(&kvm->mmu_lock);
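	/*
	 * The page-table pointer was detached from the MMU under mmu_lock
	 * above; the actual kvm_pgtable_stage2_destroy() below runs after the
	 * lock has been dropped, so the potentially large teardown does not
	 * block other MMU operations.
	 */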
805 kvm_pgtable_stage2_destroy(pgt);
/**
 * kvm_phys_addr_ioremap - map a device range to guest IPA
 *
 * @kvm:	The KVM pointer
 * @guest_ipa:	The IPA at which to insert the mapping
 * @pa:		The physical address of the device
 * @size:	The size of the mapping
 * @writable:   Whether or not to create a writable mapping
 */
819 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
820 phys_addr_t pa, unsigned long size, bool writable)
824 struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO };
825 struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
826 enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
828 (writable ? KVM_PGTABLE_PROT_W : 0);
830 if (is_protected_kvm_enabled())
833 size += offset_in_page(guest_ipa);
834 guest_ipa &= PAGE_MASK;
836 for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
837 ret = kvm_mmu_topup_memory_cache(&cache,
838 kvm_mmu_cache_min_pages(kvm));
842 write_lock(&kvm->mmu_lock);
843 ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
845 write_unlock(&kvm->mmu_lock);
852 kvm_mmu_free_memory_cache(&cache);
/**
 * stage2_wp_range() - write protect stage2 memory region range
 * @mmu:        The KVM stage-2 MMU pointer
 * @addr:	Start address of range
 * @end:	End address of range
 */
static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);

	stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_wrprotect);
}
/**
 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
 * @kvm:	The KVM pointer
 * @slot:	The memory slot to write protect
 *
 * Called to start logging dirty pages after the KVM_MEM_LOG_DIRTY_PAGES flag
 * is set on a memory region. After this function returns, all present PUD,
 * PMD and PTEs are write protected in the memory region, and the dirty page
 * log can subsequently be read.
 *
 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
 * serializing operations for VM memory regions.
 */
static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
{
	struct kvm_memslots *slots = kvm_memslots(kvm);
	struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
	phys_addr_t start, end;

	if (WARN_ON_ONCE(!memslot))
		return;

	start = memslot->base_gfn << PAGE_SHIFT;
	end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;

	write_lock(&kvm->mmu_lock);
	stage2_wp_range(&kvm->arch.mmu, start, end);
	write_unlock(&kvm->mmu_lock);
	kvm_flush_remote_tlbs(kvm);
}
/**
 * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
 * @kvm:	The KVM pointer
 * @slot:	The memory slot associated with mask
 * @gfn_offset:	The gfn offset in memory slot
 * @mask:	The mask of dirty pages at offset 'gfn_offset' in this memory
 *		slot to be write protected
 *
 * Walks the bits set in @mask and write protects the associated PTEs. The
 * caller must hold kvm_mmu_lock.
 */
static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
		struct kvm_memory_slot *slot,
		gfn_t gfn_offset, unsigned long mask)
{
	phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
	phys_addr_t start = (base_gfn +  __ffs(mask)) << PAGE_SHIFT;
	phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;

	stage2_wp_range(&kvm->arch.mmu, start, end);
}
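/*
 * Worked example (illustrative values): with gfn_offset = 64 and
 * mask = 0b0110, __ffs(mask) = 1 and __fls(mask) = 2, so the range passed to
 * stage2_wp_range() covers the two gfns at offsets 65 and 66 from
 * slot->base_gfn - i.e. it spans from the lowest to the highest set bit,
 * even if bits in between are clear.
 */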
/**
 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
 *					     pages.
 *
 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
 * enable dirty logging for them.
 */
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
		struct kvm_memory_slot *slot,
		gfn_t gfn_offset, unsigned long mask)
{
	kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}
static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
{
	send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
}
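/*
 * BUS_MCEERR_AR reports an "action required" memory error to userspace; @lsb
 * encodes the least significant valid bit of the faulting address (e.g.
 * PAGE_SHIFT, or a huge page shift), telling the VMM how much of the range
 * is poisoned.
 */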
940 static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
942 unsigned long map_size)
945 hva_t uaddr_start, uaddr_end;
948 /* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */
949 if (map_size == PAGE_SIZE)
952 size = memslot->npages * PAGE_SIZE;
954 gpa_start = memslot->base_gfn << PAGE_SHIFT;
956 uaddr_start = memslot->userspace_addr;
957 uaddr_end = uaddr_start + size;
960 * Pages belonging to memslots that don't have the same alignment
961 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
962 * PMD/PUD entries, because we'll end up mapping the wrong pages.
964 * Consider a layout like the following:
966 * memslot->userspace_addr:
967 * +-----+--------------------+--------------------+---+
968 * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz|
969 * +-----+--------------------+--------------------+---+
971 * memslot->base_gfn << PAGE_SHIFT:
972 * +---+--------------------+--------------------+-----+
973 * |abc|def Stage-2 block | Stage-2 block |tvxyz|
974 * +---+--------------------+--------------------+-----+
976 * If we create those stage-2 blocks, we'll end up with this incorrect
982 if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
986 * Next, let's make sure we're not trying to map anything not covered
987 * by the memslot. This means we have to prohibit block size mappings
988 * for the beginning and end of a non-block aligned and non-block sized
989 * memory slot (illustrated by the head and tail parts of the
990 * userspace view above containing pages 'abcde' and 'xyz',
993 * Note that it doesn't matter if we do the check using the
994 * userspace_addr or the base_gfn, as both are equally aligned (per
995 * the check above) and equally sized.
997 return (hva & ~(map_size - 1)) >= uaddr_start &&
998 (hva & ~(map_size - 1)) + map_size <= uaddr_end;
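/*
 * Illustration (hypothetical addresses): if a memslot's userspace_addr is
 * 0x7f6a200000 but its base IPA is 0x88300000, the two differ within a 2MiB
 * block (offsets 0x000000 vs 0x100000), so PMD-sized stage-2 blocks are
 * refused for the whole slot. When the offsets do match, blocks are still
 * refused for any head or tail of the slot that a full block mapping would
 * spill past.
 */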
/*
 * Check if the given hva is backed by a transparent huge page (THP) and
 * whether it can be mapped using block mapping in stage2. If so, adjust
 * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently
 * supported. This will need to be updated to support other THP sizes.
 *
 * Returns the size of the mapping.
 */
1009 static unsigned long
1010 transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
1011 unsigned long hva, kvm_pfn_t *pfnp,
1014 kvm_pfn_t pfn = *pfnp;
1017 * Make sure the adjustment is done only for THP pages. Also make
1018 * sure that the HVA and IPA are sufficiently aligned and that the
1019 * block map is contained within the memslot.
1021 if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) &&
1022 get_user_mapping_size(kvm, hva) >= PMD_SIZE) {
1024 * The address we faulted on is backed by a transparent huge
1025 * page. However, because we map the compound huge page and
1026 * not the individual tail page, we need to transfer the
1027 * refcount to the head page. We have to be careful that the
1028 * THP doesn't start to split while we are adjusting the
1031 * We are sure this doesn't happen, because mmu_invalidate_retry
1032 * was successful and we are holding the mmu_lock, so if this
1033 * THP is trying to split, it will be blocked in the mmu
1034 * notifier before touching any of the pages, specifically
1035 * before being able to call __split_huge_page_refcount().
1037 * We can therefore safely transfer the refcount from PG_tail
1038 * to PG_head and switch the pfn from a tail page to the head
1042 kvm_release_pfn_clean(pfn);
1043 pfn &= ~(PTRS_PER_PMD - 1);
1044 get_page(pfn_to_page(pfn));
1050 /* Use page mapping if we cannot use block mapping. */
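	/*
	 * In the THP case above, pfn &= ~(PTRS_PER_PMD - 1) rounds the pfn
	 * down to the head page of the 2MiB compound page (PTRS_PER_PMD is
	 * 512 with 4KiB pages), and the release/get pair moves our reference
	 * from the tail page to that head page so the refcount follows the
	 * page that actually gets mapped.
	 */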
1054 static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva)
1058 if (is_vm_hugetlb_page(vma) && !(vma->vm_flags & VM_PFNMAP))
1059 return huge_page_shift(hstate_vma(vma));
1061 if (!(vma->vm_flags & VM_PFNMAP))
1064 VM_BUG_ON(is_vm_hugetlb_page(vma));
1066 pa = (vma->vm_pgoff << PAGE_SHIFT) + (hva - vma->vm_start);
1068 #ifndef __PAGETABLE_PMD_FOLDED
1069 if ((hva & (PUD_SIZE - 1)) == (pa & (PUD_SIZE - 1)) &&
1070 ALIGN_DOWN(hva, PUD_SIZE) >= vma->vm_start &&
1071 ALIGN(hva, PUD_SIZE) <= vma->vm_end)
1075 if ((hva & (PMD_SIZE - 1)) == (pa & (PMD_SIZE - 1)) &&
1076 ALIGN_DOWN(hva, PMD_SIZE) >= vma->vm_start &&
1077 ALIGN(hva, PMD_SIZE) <= vma->vm_end)
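	/*
	 * The PUD/PMD checks above require the hva and the pa to share the
	 * same offset within the block and the whole block to sit inside the
	 * VMA; only then can the PFNMAP region be block mapped at stage 2.
	 */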
/*
 * The page will be mapped in stage 2 as Normal Cacheable, so the VM will be
 * able to see the page's tags and therefore they must be initialised first. If
 * PG_mte_tagged is set, tags have already been initialised.
 *
 * The race in the test/set of the PG_mte_tagged flag is handled by:
 * - preventing VM_SHARED mappings in a memslot with MTE preventing two VMs
 *   racing to sanitise the same page
 * - mmap_lock protects between a VM faulting a page in and the VMM performing
 *   an mprotect() to add VM_MTE
 */
1094 static int sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
1097 unsigned long i, nr_pages = size >> PAGE_SHIFT;
1100 if (!kvm_has_mte(kvm))
1104 * pfn_to_online_page() is used to reject ZONE_DEVICE pages
1105 * that may not support tags.
1107 page = pfn_to_online_page(pfn);
1112 for (i = 0; i < nr_pages; i++, page++) {
1113 if (!test_bit(PG_mte_tagged, &page->flags)) {
1114 mte_clear_page_tags(page_address(page));
1115 set_bit(PG_mte_tagged, &page->flags);
1122 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
1123 struct kvm_memory_slot *memslot, unsigned long hva,
1124 unsigned long fault_status)
1127 bool write_fault, writable, force_pte = false;
1129 bool device = false;
1131 unsigned long mmu_seq;
1132 struct kvm *kvm = vcpu->kvm;
1133 struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
1134 struct vm_area_struct *vma;
1138 bool logging_active = memslot_is_logging(memslot);
1139 bool use_read_lock = false;
1140 unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu);
1141 unsigned long vma_pagesize, fault_granule;
1142 enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
1143 struct kvm_pgtable *pgt;
1145 fault_granule = 1UL << ARM64_HW_PGTABLE_LEVEL_SHIFT(fault_level);
1146 write_fault = kvm_is_write_fault(vcpu);
1147 exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
1148 VM_BUG_ON(write_fault && exec_fault);
1150 if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
1151 kvm_err("Unexpected L2 read permission error\n");
1156 * Let's check if we will get back a huge page backed by hugetlbfs, or
1157 * get block mapping for device MMIO region.
1159 mmap_read_lock(current->mm);
1160 vma = vma_lookup(current->mm, hva);
1161 if (unlikely(!vma)) {
1162 kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
1163 mmap_read_unlock(current->mm);
1168 * logging_active is guaranteed to never be true for VM_PFNMAP
1171 if (logging_active) {
1173 vma_shift = PAGE_SHIFT;
1174 use_read_lock = (fault_status == FSC_PERM && write_fault &&
1175 fault_granule == PAGE_SIZE);
1177 vma_shift = get_vma_page_shift(vma, hva);
1180 shared = (vma->vm_flags & VM_SHARED);
1182 switch (vma_shift) {
1183 #ifndef __PAGETABLE_PMD_FOLDED
1185 if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
1189 case CONT_PMD_SHIFT:
1190 vma_shift = PMD_SHIFT;
1193 if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE))
1196 case CONT_PTE_SHIFT:
1197 vma_shift = PAGE_SHIFT;
1203 WARN_ONCE(1, "Unknown vma_shift %d", vma_shift);
1206 vma_pagesize = 1UL << vma_shift;
1207 if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
1208 fault_ipa &= ~(vma_pagesize - 1);
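	/*
	 * For a block-sized mapping the IPA has just been rounded down to the
	 * block boundary, so the gfn derived below (and the pfn looked up
	 * from it) refer to the first page of the block rather than the
	 * exact faulting page.
	 */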
1210 gfn = fault_ipa >> PAGE_SHIFT;
1211 mmap_read_unlock(current->mm);
1214 * Permission faults just need to update the existing leaf entry,
1215 * and so normally don't require allocations from the memcache. The
1216 * only exception to this is when dirty logging is enabled at runtime
1217 * and a write fault needs to collapse a block entry into a table.
1219 if (fault_status != FSC_PERM || (logging_active && write_fault)) {
1220 ret = kvm_mmu_topup_memory_cache(memcache,
1221 kvm_mmu_cache_min_pages(kvm));
1226 mmu_seq = vcpu->kvm->mmu_invalidate_seq;
1228 * Ensure the read of mmu_invalidate_seq happens before we call
1229 * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
1230 * the page we just got a reference to gets unmapped before we have a
1231 * chance to grab the mmu_lock, which ensure that if the page gets
1232 * unmapped afterwards, the call to kvm_unmap_gfn will take it away
1233 * from us again properly. This smp_rmb() interacts with the smp_wmb()
1234 * in kvm_mmu_notifier_invalidate_<page|range_end>.
1236 * Besides, __gfn_to_pfn_memslot() instead of gfn_to_pfn_prot() is
1237 * used to avoid unnecessary overhead introduced to locate the memory
1238 * slot because it's always fixed even @gfn is adjusted for huge pages.
1242 pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
1243 write_fault, &writable, NULL);
1244 if (pfn == KVM_PFN_ERR_HWPOISON) {
1245 kvm_send_hwpoison_signal(hva, vma_shift);
1248 if (is_error_noslot_pfn(pfn))
1251 if (kvm_is_device_pfn(pfn)) {
1253 * If the page was identified as device early by looking at
1254 * the VMA flags, vma_pagesize is already representing the
1255 * largest quantity we can map. If instead it was mapped
1256 * via gfn_to_pfn_prot(), vma_pagesize is set to PAGE_SIZE
1257 * and must not be upgraded.
1259 * In both cases, we don't let transparent_hugepage_adjust()
1260 * change things at the last minute.
1263 } else if (logging_active && !write_fault) {
1265 * Only actually map the page as writable if this was a write
1271 if (exec_fault && device)
	/*
	 * To reduce MMU contentions and enhance concurrency during dirty
	 * logging, only acquire the read lock for permission relaxation.
	 */
	if (use_read_lock)
		read_lock(&kvm->mmu_lock);
	else
		write_lock(&kvm->mmu_lock);
1283 pgt = vcpu->arch.hw_mmu->pgt;
1284 if (mmu_invalidate_retry(kvm, mmu_seq))
1288 * If we are not forced to use page mapping, check if we are
1289 * backed by a THP and thus use block mapping if possible.
1291 if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) {
1292 if (fault_status == FSC_PERM && fault_granule > PAGE_SIZE)
1293 vma_pagesize = fault_granule;
1295 vma_pagesize = transparent_hugepage_adjust(kvm, memslot,
1300 if (fault_status != FSC_PERM && !device && kvm_has_mte(kvm)) {
1301 /* Check the VMM hasn't introduced a new VM_SHARED VMA */
1303 ret = sanitise_mte_tags(kvm, pfn, vma_pagesize);
1311 prot |= KVM_PGTABLE_PROT_W;
1314 prot |= KVM_PGTABLE_PROT_X;
1317 prot |= KVM_PGTABLE_PROT_DEVICE;
1318 else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))
1319 prot |= KVM_PGTABLE_PROT_X;
	/*
	 * For a FSC_PERM fault, we only need to relax permissions if
	 * vma_pagesize equals fault_granule. Otherwise,
	 * kvm_pgtable_stage2_map() should be called to change the block size.
	 */
1326 if (fault_status == FSC_PERM && vma_pagesize == fault_granule) {
1327 ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
1329 WARN_ONCE(use_read_lock, "Attempted stage-2 map outside of write lock\n");
1331 ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
1332 __pfn_to_phys(pfn), prot,
1336 /* Mark the page dirty only if the fault is handled successfully */
1337 if (writable && !ret) {
1338 kvm_set_pfn_dirty(pfn);
1339 mark_page_dirty_in_slot(kvm, memslot, gfn);
1344 read_unlock(&kvm->mmu_lock);
1346 write_unlock(&kvm->mmu_lock);
1347 kvm_set_pfn_accessed(pfn);
1348 kvm_release_pfn_clean(pfn);
1349 return ret != -EAGAIN ? ret : 0;
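/*
 * Note: -EAGAIN (a racing stage-2 update detected while installing the
 * mapping) is deliberately converted to 0 above, so the vCPU resumes and
 * simply takes the fault again instead of reporting an error.
 */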
/* Resolve the access fault by making the page young again. */
static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
{
	pte_t pte;
	kvm_pte_t kpte;
	struct kvm_s2_mmu *mmu;

	trace_kvm_access_fault(fault_ipa);

	write_lock(&vcpu->kvm->mmu_lock);
	mmu = vcpu->arch.hw_mmu;
	kpte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa);
	write_unlock(&vcpu->kvm->mmu_lock);

	pte = __pte(kpte);
	if (pte_valid(pte))
		kvm_set_pfn_accessed(pte_pfn(pte));
}
/**
 * kvm_handle_guest_abort - handles all 2nd stage aborts
 * @vcpu:	the VCPU pointer
 *
 * Any abort that gets to the host is almost guaranteed to be caused by a
 * missing second stage translation table entry, which can mean that either the
 * guest simply needs more memory and we must allocate an appropriate page or it
 * can mean that the guest tried to access I/O memory, which is emulated by user
 * space. The distinction is based on the IPA causing the fault and whether this
 * memory region has been registered as standard RAM by user space.
 */
1382 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
1384 unsigned long fault_status;
1385 phys_addr_t fault_ipa;
1386 struct kvm_memory_slot *memslot;
1388 bool is_iabt, write_fault, writable;
1392 fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
1394 fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
1395 is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
1397 if (fault_status == FSC_FAULT) {
1398 /* Beyond sanitised PARange (which is the IPA limit) */
1399 if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) {
1400 kvm_inject_size_fault(vcpu);
1404 /* Falls between the IPA range and the PARange? */
1405 if (fault_ipa >= BIT_ULL(vcpu->arch.hw_mmu->pgt->ia_bits)) {
1406 fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
1409 kvm_inject_pabt(vcpu, fault_ipa);
1411 kvm_inject_dabt(vcpu, fault_ipa);
1416 /* Synchronous External Abort? */
1417 if (kvm_vcpu_abt_issea(vcpu)) {
1419 * For RAS the host kernel may handle this abort.
1420 * There is no need to pass the error into the guest.
1422 if (kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_esr(vcpu)))
1423 kvm_inject_vabt(vcpu);
1428 trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
1429 kvm_vcpu_get_hfar(vcpu), fault_ipa);
1431 /* Check the stage-2 fault is trans. fault or write fault */
1432 if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
1433 fault_status != FSC_ACCESS) {
1434 kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
1435 kvm_vcpu_trap_get_class(vcpu),
1436 (unsigned long)kvm_vcpu_trap_get_fault(vcpu),
1437 (unsigned long)kvm_vcpu_get_esr(vcpu));
1441 idx = srcu_read_lock(&vcpu->kvm->srcu);
1443 gfn = fault_ipa >> PAGE_SHIFT;
1444 memslot = gfn_to_memslot(vcpu->kvm, gfn);
1445 hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
1446 write_fault = kvm_is_write_fault(vcpu);
1447 if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
1449 * The guest has put either its instructions or its page-tables
1450 * somewhere it shouldn't have. Userspace won't be able to do
1451 * anything about this (there's no syndrome for a start), so
1452 * re-inject the abort back into the guest.
1459 if (kvm_vcpu_abt_iss1tw(vcpu)) {
1460 kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
		/*
		 * Check for a cache maintenance operation. Since we
		 * ended-up here, we know it is outside of any memory
		 * slot. But we can't find out if that is for a device,
		 * or if the guest is just being stupid. The only thing
		 * we know for sure is that this range cannot be cached.
		 *
		 * So let's assume that the guest is just being
		 * cautious, and skip the instruction.
		 */
1475 if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) {
		/*
		 * The IPA is reported as [MAX:12], so we need to
		 * complement it with the bottom 12 bits from the
		 * faulting VA. This is always 12 bits, irrespective
		 * of the page size.
		 */
		fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
1488 ret = io_mem_abort(vcpu, fault_ipa);
1492 /* Userspace should not be able to register out-of-bounds IPAs */
1493 VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm));
1495 if (fault_status == FSC_ACCESS) {
1496 handle_access_fault(vcpu, fault_ipa);
1501 ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
1505 if (ret == -ENOEXEC) {
1506 kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
1510 srcu_read_unlock(&vcpu->kvm->srcu, idx);
1514 bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1516 if (!kvm->arch.mmu.pgt)
1519 __unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT,
1520 (range->end - range->start) << PAGE_SHIFT,
1526 bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1528 kvm_pfn_t pfn = pte_pfn(range->pte);
1531 if (!kvm->arch.mmu.pgt)
1534 WARN_ON(range->end - range->start != 1);
1536 ret = sanitise_mte_tags(kvm, pfn, PAGE_SIZE);
	/*
	 * We've moved a page around, probably through CoW, so let's treat
	 * it just like a translation fault and the map handler will clean
	 * the cache to the PoC.
	 *
	 * The MMU notifiers will have unmapped a huge PMD before calling
	 * ->change_pte() (which in turn calls kvm_set_spte_gfn()) and
	 * therefore we never need to clear out a huge PMD through this
	 * calling path and a memcache is not required.
	 */
1550 kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT,
1551 PAGE_SIZE, __pfn_to_phys(pfn),
1552 KVM_PGTABLE_PROT_R, NULL);
1557 bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1559 u64 size = (range->end - range->start) << PAGE_SHIFT;
1563 if (!kvm->arch.mmu.pgt)
1566 WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
1568 kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt,
1569 range->start << PAGE_SHIFT);
1571 return pte_valid(pte) && pte_young(pte);
1574 bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1576 if (!kvm->arch.mmu.pgt)
1579 return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt,
1580 range->start << PAGE_SHIFT);
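/*
 * The kvm_*_gfn() handlers above are the arm64 backends for the core MMU
 * notifier callbacks; they all tolerate a NULL stage-2 page table (VM not yet
 * initialised or already torn down) by bailing out early.
 */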
1583 phys_addr_t kvm_mmu_get_httbr(void)
1585 return __pa(hyp_pgtable->pgd);
1588 phys_addr_t kvm_get_idmap_vector(void)
1590 return hyp_idmap_vector;
1593 static int kvm_map_idmap_text(void)
1595 unsigned long size = hyp_idmap_end - hyp_idmap_start;
1596 int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start,
1599 kvm_err("Failed to idmap %lx-%lx\n",
1600 hyp_idmap_start, hyp_idmap_end);
1605 static void *kvm_hyp_zalloc_page(void *arg)
1607 return (void *)get_zeroed_page(GFP_KERNEL);
1610 static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = {
1611 .zalloc_page = kvm_hyp_zalloc_page,
1612 .get_page = kvm_host_get_page,
1613 .put_page = kvm_host_put_page,
1614 .phys_to_virt = kvm_host_va,
1615 .virt_to_phys = kvm_host_pa,
1618 int kvm_mmu_init(u32 *hyp_va_bits)
1622 hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
1623 hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
1624 hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end);
1625 hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
1626 hyp_idmap_vector = __pa_symbol(__kvm_hyp_init);
	/*
	 * We rely on the linker script to ensure at build time that the HYP
	 * init code does not cross a page boundary.
	 */
	BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
1634 *hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
1635 kvm_debug("Using %u-bit virtual addresses at EL2\n", *hyp_va_bits);
1636 kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
1637 kvm_debug("HYP VA range: %lx:%lx\n",
1638 kern_hyp_va(PAGE_OFFSET),
1639 kern_hyp_va((unsigned long)high_memory - 1));
1641 if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
1642 hyp_idmap_start < kern_hyp_va((unsigned long)high_memory - 1) &&
1643 hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
		/*
		 * The idmap page is intersecting with the VA space,
		 * it is not safe to continue further.
		 */
		kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
1653 hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL);
1655 kvm_err("Hyp mode page-table not allocated\n");
1660 err = kvm_pgtable_hyp_init(hyp_pgtable, *hyp_va_bits, &kvm_hyp_mm_ops);
1662 goto out_free_pgtable;
1664 err = kvm_map_idmap_text();
1666 goto out_destroy_pgtable;
1668 io_map_base = hyp_idmap_start;
1671 out_destroy_pgtable:
1672 kvm_pgtable_hyp_destroy(hyp_pgtable);
void kvm_arch_commit_memory_region(struct kvm *kvm,
				   struct kvm_memory_slot *old,
				   const struct kvm_memory_slot *new,
				   enum kvm_mr_change change)
{
	/*
	 * At this point memslot has been committed and there is an
	 * allocated dirty_bitmap[], dirty pages will be tracked while the
	 * memory slot is write protected.
	 */
	if (change != KVM_MR_DELETE && new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
		/*
		 * If we're with initial-all-set, we don't need to write
		 * protect any pages because they're all reported as dirty.
		 * Huge pages and normal pages will be write protected
		 * gradually.
		 */
		if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) {
			kvm_mmu_wp_memory_region(kvm, new->id);
		}
	}
}
1702 int kvm_arch_prepare_memory_region(struct kvm *kvm,
1703 const struct kvm_memory_slot *old,
1704 struct kvm_memory_slot *new,
1705 enum kvm_mr_change change)
1710 if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
1711 change != KVM_MR_FLAGS_ONLY)
1715 * Prevent userspace from creating a memory region outside of the IPA
1716 * space addressable by the KVM guest IPA space.
1718 if ((new->base_gfn + new->npages) > (kvm_phys_size(kvm) >> PAGE_SHIFT))
1721 hva = new->userspace_addr;
1722 reg_end = hva + (new->npages << PAGE_SHIFT);
1724 mmap_read_lock(current->mm);
1726 * A memory region could potentially cover multiple VMAs, and any holes
1727 * between them, so iterate over all of them.
1729 * +--------------------------------------------+
1730 * +---------------+----------------+ +----------------+
1731 * | : VMA 1 | VMA 2 | | VMA 3 : |
1732 * +---------------+----------------+ +----------------+
1734 * +--------------------------------------------+
1737 struct vm_area_struct *vma;
1739 vma = find_vma_intersection(current->mm, hva, reg_end);
		/*
		 * VM_SHARED mappings are not allowed with MTE to avoid races
		 * when updating the PG_mte_tagged page flag, see
		 * sanitise_mte_tags for more details.
		 */
		if (kvm_has_mte(kvm) && vma->vm_flags & VM_SHARED) {
			ret = -EINVAL;
			break;
		}
1753 if (vma->vm_flags & VM_PFNMAP) {
1754 /* IO region dirty page logging not allowed */
1755 if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
1760 hva = min(reg_end, vma->vm_end);
1761 } while (hva < reg_end);
1763 mmap_read_unlock(current->mm);
1767 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
1771 void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
1775 void kvm_arch_flush_shadow_all(struct kvm *kvm)
1777 kvm_free_stage2_pgd(&kvm->arch.mmu);
1780 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
1781 struct kvm_memory_slot *slot)
1783 gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
1784 phys_addr_t size = slot->npages << PAGE_SHIFT;
1786 write_lock(&kvm->mmu_lock);
1787 unmap_stage2_range(&kvm->arch.mmu, gpa, size);
1788 write_unlock(&kvm->mmu_lock);
/*
 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
 *
 * Main problems:
 * - S/W ops are local to a CPU (not broadcast)
 * - We have line migration behind our back (speculation)
 * - System caches don't support S/W at all (damn!)
 *
 * In the face of the above, the best we can do is to try and convert
 * S/W ops to VA ops. Because the guest is not allowed to infer the
 * S/W to PA mapping, it can only use S/W to nuke the whole cache,
 * which is a rather good thing for us.
 *
 * Also, it is only used when turning caches on/off ("The expected
 * usage of the cache maintenance instructions that operate by set/way
 * is associated with the cache maintenance instructions associated
 * with the powerdown and powerup of caches, if this is required by
 * the implementation.").
 *
 * We use the following policy:
 *
 * - If we trap a S/W operation, we enable VM trapping to detect
 *   caches being turned on/off, and do a full clean.
 *
 * - We flush the caches on both caches being turned on and off.
 *
 * - Once the caches are enabled, we stop trapping VM ops.
 */
void kvm_set_way_flush(struct kvm_vcpu *vcpu)
{
	unsigned long hcr = *vcpu_hcr(vcpu);

	/*
	 * If this is the first time we do a S/W operation
	 * (i.e. HCR_TVM not set) flush the whole memory, and set the
	 * VM trapping.
	 *
	 * Otherwise, rely on the VM trapping to wait for the MMU +
	 * Caches to be turned off. At that point, we'll be able to
	 * clean the caches again.
	 */
	if (!(hcr & HCR_TVM)) {
		trace_kvm_set_way_flush(*vcpu_pc(vcpu),
					vcpu_has_cache_enabled(vcpu));
		stage2_flush_vm(vcpu->kvm);
		*vcpu_hcr(vcpu) = hcr | HCR_TVM;
	}
}

void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
{
	bool now_enabled = vcpu_has_cache_enabled(vcpu);

	/*
	 * If switching the MMU+caches on, need to invalidate the caches.
	 * If switching it off, need to clean the caches.
	 * Clean + invalidate does the trick always.
	 */
	if (now_enabled != was_enabled)
		stage2_flush_vm(vcpu->kvm);

	/* Caches are now on, stop trapping VM ops (until a S/W op) */
	if (now_enabled)
		*vcpu_hcr(vcpu) &= ~HCR_TVM;

	trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
}