1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
4 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
7 #include <linux/mman.h>
8 #include <linux/kvm_host.h>
10 #include <linux/hugetlb.h>
11 #include <linux/sched/signal.h>
12 #include <trace/events/kvm.h>
13 #include <asm/pgalloc.h>
14 #include <asm/cacheflush.h>
15 #include <asm/kvm_arm.h>
16 #include <asm/kvm_mmu.h>
17 #include <asm/kvm_pgtable.h>
18 #include <asm/kvm_ras.h>
19 #include <asm/kvm_asm.h>
20 #include <asm/kvm_emulate.h>
25 static struct kvm_pgtable *hyp_pgtable;
26 static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
28 static unsigned long hyp_idmap_start;
29 static unsigned long hyp_idmap_end;
30 static phys_addr_t hyp_idmap_vector;
32 static unsigned long io_map_base;
34 #define KVM_S2PTE_FLAG_IS_IOMAP (1UL << 0)
35 #define KVM_S2_FLAG_LOGGING_ACTIVE (1UL << 1)
37 static bool is_iomap(unsigned long flags)
39 return flags & KVM_S2PTE_FLAG_IS_IOMAP;
43 * Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
44 * we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
45 * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too
46 * long will also starve other vCPUs. We also have to make sure that the page
47 * tables are not freed while we release the lock.
49 static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr,
50 phys_addr_t end,
51 int (*fn)(struct kvm_pgtable *, u64, u64),
52 bool resched)
58 struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
59 if (!pgt)
60 return -EINVAL;
62 next = stage2_pgd_addr_end(kvm, addr, end);
63 ret = fn(pgt, addr, next - addr);
64 if (ret)
65 break;
67 if (resched && next != end)
68 cond_resched_lock(&kvm->mmu_lock);
69 } while (addr = next, addr != end);
71 return ret;
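/*
 * For example, on a typical 4K-page host unmapping a large memslot,
 * stage2_pgd_addr_end() caps each fn() call at one stage-2 PGD entry,
 * so a multi-GB range is unmapped in chunks, with a cond_resched_lock()
 * between chunks rather than one long mmu_lock critical section.
 */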
74 static bool memslot_is_logging(struct kvm_memory_slot *memslot)
76 return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
80 * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
81 * @kvm: pointer to kvm structure.
83 * Interface to HYP function to flush all VM TLB entries
85 void kvm_flush_remote_tlbs(struct kvm *kvm)
87 kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
90 static void kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa,
93 kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ipa, level);
97 * D-Cache management functions. They take the page table entries by
98 * value, as they are flushing the cache using the kernel mapping (or
99 * kmap on 32bit).
101 static void kvm_flush_dcache_pte(pte_t pte)
103 __kvm_flush_dcache_pte(pte);
106 static void kvm_flush_dcache_pmd(pmd_t pmd)
108 __kvm_flush_dcache_pmd(pmd);
111 static void kvm_flush_dcache_pud(pud_t pud)
113 __kvm_flush_dcache_pud(pud);
116 static bool kvm_is_device_pfn(unsigned long pfn)
118 return !pfn_valid(pfn);
122 * stage2_dissolve_pmd() - clear and flush huge PMD entry
123 * @mmu: pointer to mmu structure to operate on
124 * @addr: IPA
125 * @pmd: pmd pointer for IPA
127 * Function clears a PMD entry and flushes @addr's 1st and 2nd stage TLB entries.
129 static void stage2_dissolve_pmd(struct kvm_s2_mmu *mmu, phys_addr_t addr, pmd_t *pmd)
131 if (!pmd_thp_or_huge(*pmd))
132 return;
134 pmd_clear(pmd);
135 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
136 put_page(virt_to_page(pmd));
140 * stage2_dissolve_pud() - clear and flush huge PUD entry
141 * @mmu: pointer to mmu structure to operate on
142 * @addr: IPA
143 * @pud: pud pointer for IPA
145 * Function clears a PUD entry and flushes @addr's 1st and 2nd stage TLB entries.
147 static void stage2_dissolve_pud(struct kvm_s2_mmu *mmu, phys_addr_t addr, pud_t *pudp)
149 struct kvm *kvm = mmu->kvm;
151 if (!stage2_pud_huge(kvm, *pudp))
152 return;
154 stage2_pud_clear(kvm, pudp);
155 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
156 put_page(virt_to_page(pudp));
159 static void clear_stage2_pgd_entry(struct kvm_s2_mmu *mmu, pgd_t *pgd, phys_addr_t addr)
161 struct kvm *kvm = mmu->kvm;
162 p4d_t *p4d_table __maybe_unused = stage2_p4d_offset(kvm, pgd, 0UL);
163 stage2_pgd_clear(kvm, pgd);
164 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
165 stage2_p4d_free(kvm, p4d_table);
166 put_page(virt_to_page(pgd));
169 static void clear_stage2_p4d_entry(struct kvm_s2_mmu *mmu, p4d_t *p4d, phys_addr_t addr)
171 struct kvm *kvm = mmu->kvm;
172 pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, p4d, 0);
173 stage2_p4d_clear(kvm, p4d);
174 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
175 stage2_pud_free(kvm, pud_table);
176 put_page(virt_to_page(p4d));
179 static void clear_stage2_pud_entry(struct kvm_s2_mmu *mmu, pud_t *pud, phys_addr_t addr)
181 struct kvm *kvm = mmu->kvm;
182 pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0);
184 VM_BUG_ON(stage2_pud_huge(kvm, *pud));
185 stage2_pud_clear(kvm, pud);
186 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
187 stage2_pmd_free(kvm, pmd_table);
188 put_page(virt_to_page(pud));
191 static void clear_stage2_pmd_entry(struct kvm_s2_mmu *mmu, pmd_t *pmd, phys_addr_t addr)
193 pte_t *pte_table = pte_offset_kernel(pmd, 0);
194 VM_BUG_ON(pmd_thp_or_huge(*pmd));
195 pmd_clear(pmd);
196 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
197 free_page((unsigned long)pte_table);
198 put_page(virt_to_page(pmd));
201 static inline void kvm_set_pte(pte_t *ptep, pte_t new_pte)
203 WRITE_ONCE(*ptep, new_pte);
207 static inline void kvm_set_pmd(pmd_t *pmdp, pmd_t new_pmd)
209 WRITE_ONCE(*pmdp, new_pmd);
213 static inline void kvm_pmd_populate(pmd_t *pmdp, pte_t *ptep)
215 kvm_set_pmd(pmdp, kvm_mk_pmd(ptep));
218 static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp)
220 WRITE_ONCE(*pudp, kvm_mk_pud(pmdp));
224 static inline void kvm_p4d_populate(p4d_t *p4dp, pud_t *pudp)
226 WRITE_ONCE(*p4dp, kvm_mk_p4d(pudp));
230 static inline void kvm_pgd_populate(pgd_t *pgdp, p4d_t *p4dp)
232 #ifndef __PAGETABLE_P4D_FOLDED
233 WRITE_ONCE(*pgdp, kvm_mk_pgd(p4dp));
234 #endif
239 * Unmapping vs dcache management:
241 * If a guest maps certain memory pages as uncached, all writes will
242 * bypass the data cache and go directly to RAM. However, the CPUs
243 * can still speculate reads (not writes) and fill cache lines with
244 * data.
246 * Those cache lines will be *clean* cache lines though, so a
247 * clean+invalidate operation is equivalent to an invalidate
248 * operation, because no cache lines are marked dirty.
250 * Those clean cache lines could be filled prior to an uncached write
251 * by the guest, and the cache coherent IO subsystem would therefore
252 * end up writing old data to disk.
254 * This is why right after unmapping a page/section and invalidating
255 * the corresponding TLBs, we flush to make sure the IO subsystem will
256 * never hit in the cache.
258 * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
259 * we then fully enforce cacheability of RAM, no matter what the guest
260 * does.
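/*
 * Concrete scenario this guards against: the guest writes a buffer
 * through an uncached mapping, but a speculative read has already
 * allocated a clean line for that page. A cache-coherent I/O master
 * could then be served the stale clean line instead of the RAM
 * contents. Flushing (clean+invalidate, effectively just invalidate)
 * right after the unmap guarantees the I/O subsystem misses in the cache.
 */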
262 static void unmap_stage2_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd,
263 phys_addr_t addr, phys_addr_t end)
265 phys_addr_t start_addr = addr;
266 pte_t *pte, *start_pte;
268 start_pte = pte = pte_offset_kernel(pmd, addr);
270 if (!pte_none(*pte)) {
271 pte_t old_pte = *pte;
273 kvm_set_pte(pte, __pte(0));
274 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PTE_LEVEL);
276 /* No need to invalidate the cache for device mappings */
277 if (!kvm_is_device_pfn(pte_pfn(old_pte)))
278 kvm_flush_dcache_pte(old_pte);
280 put_page(virt_to_page(pte));
282 } while (pte++, addr += PAGE_SIZE, addr != end);
284 if (stage2_pte_table_empty(mmu->kvm, start_pte))
285 clear_stage2_pmd_entry(mmu, pmd, start_addr);
288 static void unmap_stage2_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
289 phys_addr_t addr, phys_addr_t end)
291 struct kvm *kvm = mmu->kvm;
292 phys_addr_t next, start_addr = addr;
293 pmd_t *pmd, *start_pmd;
295 start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr);
297 next = stage2_pmd_addr_end(kvm, addr, end);
298 if (!pmd_none(*pmd)) {
299 if (pmd_thp_or_huge(*pmd)) {
300 pmd_t old_pmd = *pmd;
302 pmd_clear(pmd);
303 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
305 kvm_flush_dcache_pmd(old_pmd);
307 put_page(virt_to_page(pmd));
309 unmap_stage2_ptes(mmu, pmd, addr, next);
312 } while (pmd++, addr = next, addr != end);
314 if (stage2_pmd_table_empty(kvm, start_pmd))
315 clear_stage2_pud_entry(mmu, pud, start_addr);
318 static void unmap_stage2_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
319 phys_addr_t addr, phys_addr_t end)
321 struct kvm *kvm = mmu->kvm;
322 phys_addr_t next, start_addr = addr;
323 pud_t *pud, *start_pud;
325 start_pud = pud = stage2_pud_offset(kvm, p4d, addr);
327 next = stage2_pud_addr_end(kvm, addr, end);
328 if (!stage2_pud_none(kvm, *pud)) {
329 if (stage2_pud_huge(kvm, *pud)) {
330 pud_t old_pud = *pud;
332 stage2_pud_clear(kvm, pud);
333 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
334 kvm_flush_dcache_pud(old_pud);
335 put_page(virt_to_page(pud));
337 unmap_stage2_pmds(mmu, pud, addr, next);
340 } while (pud++, addr = next, addr != end);
342 if (stage2_pud_table_empty(kvm, start_pud))
343 clear_stage2_p4d_entry(mmu, p4d, start_addr);
346 static void unmap_stage2_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
347 phys_addr_t addr, phys_addr_t end)
349 struct kvm *kvm = mmu->kvm;
350 phys_addr_t next, start_addr = addr;
351 p4d_t *p4d, *start_p4d;
353 start_p4d = p4d = stage2_p4d_offset(kvm, pgd, addr);
355 next = stage2_p4d_addr_end(kvm, addr, end);
356 if (!stage2_p4d_none(kvm, *p4d))
357 unmap_stage2_puds(mmu, p4d, addr, next);
358 } while (p4d++, addr = next, addr != end);
360 if (stage2_p4d_table_empty(kvm, start_p4d))
361 clear_stage2_pgd_entry(mmu, pgd, start_addr);
365 * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
366 * @mmu: The KVM stage-2 MMU pointer
367 * @start: The intermediate physical base address of the range to unmap
368 * @size: The size of the area to unmap
370 * Clear a range of stage-2 mappings, lowering the various ref-counts. Must
371 * be called while holding mmu_lock (unless for freeing the stage2 pgd before
372 * destroying the VM), otherwise another faulting VCPU may come in and mess
373 * with things behind our backs.
375 static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size,
376 bool may_block)
378 struct kvm *kvm = mmu->kvm;
379 phys_addr_t end = start + size;
381 assert_spin_locked(&kvm->mmu_lock);
382 WARN_ON(size & ~PAGE_MASK);
383 WARN_ON(stage2_apply_range(kvm, start, end, kvm_pgtable_stage2_unmap,
384 may_block));
387 static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
389 __unmap_stage2_range(mmu, start, size, true);
392 static void stage2_flush_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd,
393 phys_addr_t addr, phys_addr_t end)
397 pte = pte_offset_kernel(pmd, addr);
399 if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte)))
400 kvm_flush_dcache_pte(*pte);
401 } while (pte++, addr += PAGE_SIZE, addr != end);
404 static void stage2_flush_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
405 phys_addr_t addr, phys_addr_t end)
407 struct kvm *kvm = mmu->kvm;
411 pmd = stage2_pmd_offset(kvm, pud, addr);
413 next = stage2_pmd_addr_end(kvm, addr, end);
414 if (!pmd_none(*pmd)) {
415 if (pmd_thp_or_huge(*pmd))
416 kvm_flush_dcache_pmd(*pmd);
418 stage2_flush_ptes(mmu, pmd, addr, next);
420 } while (pmd++, addr = next, addr != end);
423 static void stage2_flush_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
424 phys_addr_t addr, phys_addr_t end)
426 struct kvm *kvm = mmu->kvm;
430 pud = stage2_pud_offset(kvm, p4d, addr);
432 next = stage2_pud_addr_end(kvm, addr, end);
433 if (!stage2_pud_none(kvm, *pud)) {
434 if (stage2_pud_huge(kvm, *pud))
435 kvm_flush_dcache_pud(*pud);
437 stage2_flush_pmds(mmu, pud, addr, next);
439 } while (pud++, addr = next, addr != end);
442 static void stage2_flush_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
443 phys_addr_t addr, phys_addr_t end)
445 struct kvm *kvm = mmu->kvm;
449 p4d = stage2_p4d_offset(kvm, pgd, addr);
451 next = stage2_p4d_addr_end(kvm, addr, end);
452 if (!stage2_p4d_none(kvm, *p4d))
453 stage2_flush_puds(mmu, p4d, addr, next);
454 } while (p4d++, addr = next, addr != end);
457 static void stage2_flush_memslot(struct kvm *kvm,
458 struct kvm_memory_slot *memslot)
460 struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
461 phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
462 phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
466 pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
468 next = stage2_pgd_addr_end(kvm, addr, end);
469 if (!stage2_pgd_none(kvm, *pgd))
470 stage2_flush_p4ds(mmu, pgd, addr, next);
472 if (next != end)
473 cond_resched_lock(&kvm->mmu_lock);
474 } while (pgd++, addr = next, addr != end);
478 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
479 * @kvm: The struct kvm pointer
481 * Go through the stage 2 page tables and invalidate any cache lines
482 * backing memory already mapped to the VM.
484 static void stage2_flush_vm(struct kvm *kvm)
486 struct kvm_memslots *slots;
487 struct kvm_memory_slot *memslot;
490 idx = srcu_read_lock(&kvm->srcu);
491 spin_lock(&kvm->mmu_lock);
493 slots = kvm_memslots(kvm);
494 kvm_for_each_memslot(memslot, slots)
495 stage2_flush_memslot(kvm, memslot);
497 spin_unlock(&kvm->mmu_lock);
498 srcu_read_unlock(&kvm->srcu, idx);
502 * free_hyp_pgds - free Hyp-mode page tables
504 void free_hyp_pgds(void)
506 mutex_lock(&kvm_hyp_pgd_mutex);
507 if (hyp_pgtable) {
508 kvm_pgtable_hyp_destroy(hyp_pgtable);
509 kfree(hyp_pgtable);
510 }
511 mutex_unlock(&kvm_hyp_pgd_mutex);
514 static int __create_hyp_mappings(unsigned long start, unsigned long size,
515 unsigned long phys, enum kvm_pgtable_prot prot)
519 mutex_lock(&kvm_hyp_pgd_mutex);
520 err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot);
521 mutex_unlock(&kvm_hyp_pgd_mutex);
526 static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
528 if (!is_vmalloc_addr(kaddr)) {
529 BUG_ON(!virt_addr_valid(kaddr));
530 return __pa(kaddr);
531 } else {
532 return page_to_phys(vmalloc_to_page(kaddr)) +
533 offset_in_page(kaddr);
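/*
 * For example, a kmalloc()'d buffer lives in the kernel linear map, so
 * __pa() is a simple constant-offset translation. A vmalloc()'d or
 * module address has no linear mapping, so the backing page must be
 * looked up through the vmalloc page tables with vmalloc_to_page(),
 * with the sub-page offset re-applied by hand.
 */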
538 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
539 * @from: The virtual kernel start address of the range
540 * @to: The virtual kernel end address of the range (exclusive)
541 * @prot: The protection to be applied to this range
543 * The same virtual address as the kernel virtual address is also used
544 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
545 * physical pages.
547 int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
549 phys_addr_t phys_addr;
550 unsigned long virt_addr;
551 unsigned long start = kern_hyp_va((unsigned long)from);
552 unsigned long end = kern_hyp_va((unsigned long)to);
554 if (is_kernel_in_hyp_mode())
555 return 0;
557 start = start & PAGE_MASK;
558 end = PAGE_ALIGN(end);
560 for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
563 phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
564 err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr,
565 prot);
566 if (err)
567 return err;
570 return 0;
573 static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
574 unsigned long *haddr,
575 enum kvm_pgtable_prot prot)
580 mutex_lock(&kvm_hyp_pgd_mutex);
583 * This assumes that we have enough space below the idmap
584 * page to allocate our VAs. If not, the check below will
585 * kick in. A potential alternative would be to detect that
586 * overflow and switch to an allocation above the idmap.
588 * The allocated size is always a multiple of PAGE_SIZE.
590 size = PAGE_ALIGN(size + offset_in_page(phys_addr));
591 base = io_map_base - size;
594 * Verify that BIT(VA_BITS - 1) hasn't been flipped by
595 * allocating the new area, as it would indicate we've
596 * overflowed the idmap/IO address range.
598 if ((base ^ io_map_base) & BIT(VA_BITS - 1))
599 ret = -ENOMEM;
600 else
601 io_map_base = base;
603 mutex_unlock(&kvm_hyp_pgd_mutex);
605 if (ret)
606 goto out;
608 ret = __create_hyp_mappings(base, size, phys_addr, prot);
612 *haddr = base + offset_in_page(phys_addr);
613 out:
614 return ret;
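/*
 * Example of the allocator above, with made-up addresses: if
 * io_map_base is 0x1800000000 and a request covers 0x1800 bytes
 * starting at a PA with offset 0x800, PAGE_ALIGN(0x1800 + 0x800)
 * reserves two pages, base becomes 0x17ffffe000, and the returned HYP
 * VA is base + 0x800 so the mapping preserves the sub-page offset.
 * The BIT(VA_BITS - 1) test catches the downward-growing region
 * wrapping out of the idmap/IO half of the HYP VA space.
 */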
618 * create_hyp_io_mappings - Map IO into both kernel and HYP
619 * @phys_addr: The physical start address which gets mapped
620 * @size: Size of the region being mapped
621 * @kaddr: Kernel VA for this mapping
622 * @haddr: HYP VA for this mapping
624 int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
625 void __iomem **kaddr,
626 void __iomem **haddr)
631 *kaddr = ioremap(phys_addr, size);
632 if (!*kaddr)
633 return -ENOMEM;
635 if (is_kernel_in_hyp_mode()) {
636 *haddr = (void __iomem *)*kaddr;
637 return 0;
638 }
640 ret = __create_hyp_private_mapping(phys_addr, size,
641 &addr, PAGE_HYP_DEVICE);
642 if (ret) {
643 iounmap(*kaddr);
644 *kaddr = NULL;
645 *haddr = NULL;
646 return ret;
647 }
649 *haddr = (void __iomem *)addr;
650 return 0;
654 * create_hyp_exec_mappings - Map an executable range into HYP
655 * @phys_addr: The physical start address which gets mapped
656 * @size: Size of the region being mapped
657 * @haddr: HYP VA for this mapping
659 int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
665 BUG_ON(is_kernel_in_hyp_mode());
667 ret = __create_hyp_private_mapping(phys_addr, size,
668 &addr, PAGE_HYP_EXEC);
669 if (ret) {
670 *haddr = NULL;
671 return ret;
672 }
674 *haddr = (void *)addr;
675 return 0;
679 * kvm_init_stage2_mmu - Initialise a S2 MMU structure
680 * @kvm: The pointer to the KVM structure
681 * @mmu: The pointer to the s2 MMU structure
683 * Allocates only the stage-2 HW PGD level table(s).
684 * Note we don't need locking here as this is only called when the VM is
685 * created, which can only be done once.
687 int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
690 struct kvm_pgtable *pgt;
692 if (mmu->pgt != NULL) {
693 kvm_err("kvm_arch already initialized?\n");
694 return -EINVAL;
697 pgt = kzalloc(sizeof(*pgt), GFP_KERNEL);
698 if (!pgt)
699 return -ENOMEM;
701 err = kvm_pgtable_stage2_init(pgt, kvm);
702 if (err)
703 goto out_free_pgtable;
705 mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
706 if (!mmu->last_vcpu_ran) {
707 err = -ENOMEM;
708 goto out_destroy_pgtable;
711 for_each_possible_cpu(cpu)
712 *per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;
714 mmu->kvm = kvm;
715 mmu->pgt = pgt;
716 mmu->pgd_phys = __pa(pgt->pgd);
717 mmu->pgd = (void *)pgt->pgd;
718 mmu->vmid.vmid_gen = 0;
719 return 0;
721 out_destroy_pgtable:
722 kvm_pgtable_stage2_destroy(pgt);
723 out_free_pgtable:
724 kfree(pgt);
725 return err;
728 static void stage2_unmap_memslot(struct kvm *kvm,
729 struct kvm_memory_slot *memslot)
731 hva_t hva = memslot->userspace_addr;
732 phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
733 phys_addr_t size = PAGE_SIZE * memslot->npages;
734 hva_t reg_end = hva + size;
737 * A memory region could potentially cover multiple VMAs, and any holes
738 * between them, so iterate over all of them to find out if we should
739 * unmap any of them.
741 * +--------------------------------------------+
742 * +---------------+----------------+ +----------------+
743 * | : VMA 1 | VMA 2 | | VMA 3 : |
744 * +---------------+----------------+ +----------------+
745 *     |               memory region                |
746 *     +--------------------------------------------+
749 struct vm_area_struct *vma = find_vma(current->mm, hva);
750 hva_t vm_start, vm_end;
752 if (!vma || vma->vm_start >= reg_end)
753 break;
756 * Take the intersection of this VMA with the memory region
758 vm_start = max(hva, vma->vm_start);
759 vm_end = min(reg_end, vma->vm_end);
761 if (!(vma->vm_flags & VM_PFNMAP)) {
762 gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
763 unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start);
765 hva = vm_end;
766 } while (hva < reg_end);
770 * stage2_unmap_vm - Unmap Stage-2 RAM mappings
771 * @kvm: The struct kvm pointer
773 * Go through the memregions and unmap any regular RAM
774 * backing memory already mapped to the VM.
776 void stage2_unmap_vm(struct kvm *kvm)
778 struct kvm_memslots *slots;
779 struct kvm_memory_slot *memslot;
782 idx = srcu_read_lock(&kvm->srcu);
783 mmap_read_lock(current->mm);
784 spin_lock(&kvm->mmu_lock);
786 slots = kvm_memslots(kvm);
787 kvm_for_each_memslot(memslot, slots)
788 stage2_unmap_memslot(kvm, memslot);
790 spin_unlock(&kvm->mmu_lock);
791 mmap_read_unlock(current->mm);
792 srcu_read_unlock(&kvm->srcu, idx);
795 void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
797 struct kvm *kvm = mmu->kvm;
798 struct kvm_pgtable *pgt = NULL;
800 spin_lock(&kvm->mmu_lock);
801 pgt = mmu->pgt;
802 if (pgt) {
803 mmu->pgd = NULL;
804 mmu->pgd_phys = 0;
805 mmu->pgt = NULL;
806 free_percpu(mmu->last_vcpu_ran);
808 spin_unlock(&kvm->mmu_lock);
810 if (pgt) {
811 kvm_pgtable_stage2_destroy(pgt);
812 kfree(pgt);
816 static p4d_t *stage2_get_p4d(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
819 struct kvm *kvm = mmu->kvm;
823 pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
824 if (stage2_pgd_none(kvm, *pgd)) {
825 if (!cache)
826 return NULL;
827 p4d = kvm_mmu_memory_cache_alloc(cache);
828 stage2_pgd_populate(kvm, pgd, p4d);
829 get_page(virt_to_page(pgd));
832 return stage2_p4d_offset(kvm, pgd, addr);
835 static pud_t *stage2_get_pud(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
838 struct kvm *kvm = mmu->kvm;
842 p4d = stage2_get_p4d(mmu, cache, addr);
843 if (stage2_p4d_none(kvm, *p4d)) {
844 if (!cache)
845 return NULL;
846 pud = kvm_mmu_memory_cache_alloc(cache);
847 stage2_p4d_populate(kvm, p4d, pud);
848 get_page(virt_to_page(p4d));
851 return stage2_pud_offset(kvm, p4d, addr);
854 static pmd_t *stage2_get_pmd(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
857 struct kvm *kvm = mmu->kvm;
861 pud = stage2_get_pud(mmu, cache, addr);
862 if (!pud || stage2_pud_huge(kvm, *pud))
863 return NULL;
865 if (stage2_pud_none(kvm, *pud)) {
866 if (!cache)
867 return NULL;
868 pmd = kvm_mmu_memory_cache_alloc(cache);
869 stage2_pud_populate(kvm, pud, pmd);
870 get_page(virt_to_page(pud));
873 return stage2_pmd_offset(kvm, pud, addr);
876 static int stage2_set_pmd_huge(struct kvm_s2_mmu *mmu,
877 struct kvm_mmu_memory_cache *cache,
878 phys_addr_t addr, const pmd_t *new_pmd)
882 retry:
883 pmd = stage2_get_pmd(mmu, cache, addr);
884 VM_BUG_ON(!pmd);
885 old_pmd = *pmd;
888 * Multiple vcpus faulting on the same PMD entry can
889 * lead to them sequentially updating the PMD with the
890 * same value. Following the break-before-make
891 * (pmd_clear() followed by tlb_flush()) process can
892 * hinder forward progress due to refaults generated
893 * on missing translations.
895 * Skip updating the page table if the entry is
896 * unchanged.
898 if (pmd_val(old_pmd) == pmd_val(*new_pmd))
899 return 0;
901 if (pmd_present(old_pmd)) {
903 * If we already have PTE level mapping for this block,
904 * we must unmap it to avoid inconsistent TLB state and
905 * leaking the table page. We could end up in this situation
906 * if the memory slot was marked for dirty logging and was
907 * reverted, leaving PTE level mappings for the pages accessed
908 * during the period. So, unmap the PTE level mapping for this
909 * block and retry, as we could have released the upper level
910 * table in the process.
912 * Normal THP split/merge follows mmu_notifier callbacks and
913 * gets handled accordingly.
915 if (!pmd_thp_or_huge(old_pmd)) {
916 unmap_stage2_range(mmu, addr & S2_PMD_MASK, S2_PMD_SIZE);
917 goto retry;
920 * Mapping in huge pages should only happen through a
921 * fault. If a page is merged into a transparent huge
922 * page, the individual subpages of that huge page
923 * should be unmapped through MMU notifiers before we
924 * get here.
926 * Merging of CompoundPages is not supported; they
927 * should be split first, unmapped, merged,
928 * and mapped back in on-demand.
930 WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
931 pmd_clear(pmd);
932 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
933 } else {
934 get_page(virt_to_page(pmd));
937 kvm_set_pmd(pmd, *new_pmd);
938 return 0;
941 static int stage2_set_pud_huge(struct kvm_s2_mmu *mmu,
942 struct kvm_mmu_memory_cache *cache,
943 phys_addr_t addr, const pud_t *new_pudp)
945 struct kvm *kvm = mmu->kvm;
946 pud_t *pudp, old_pud;
948 retry:
949 pudp = stage2_get_pud(mmu, cache, addr);
950 VM_BUG_ON(!pudp);
952 old_pud = *pudp;
955 * A large number of vcpus faulting on the same stage 2 entry,
956 * can lead to a refault due to the stage2_pud_clear()/tlb_flush().
957 * Skip updating the page tables if there is no change.
959 if (pud_val(old_pud) == pud_val(*new_pudp))
960 return 0;
962 if (stage2_pud_present(kvm, old_pud)) {
964 * If we already have table level mapping for this block, unmap
965 * the range for this block and retry.
967 if (!stage2_pud_huge(kvm, old_pud)) {
968 unmap_stage2_range(mmu, addr & S2_PUD_MASK, S2_PUD_SIZE);
969 goto retry;
972 WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp));
973 stage2_pud_clear(kvm, pudp);
974 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
975 } else {
976 get_page(virt_to_page(pudp));
979 kvm_set_pud(pudp, *new_pudp);
980 return 0;
984 * stage2_get_leaf_entry - walk the stage2 VM page tables and return
985 * true if a valid and present leaf-entry is found. A pointer to the
986 * leaf-entry is returned in the appropriate level variable - pudpp,
989 static bool stage2_get_leaf_entry(struct kvm_s2_mmu *mmu, phys_addr_t addr,
990 pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp)
992 struct kvm *kvm = mmu->kvm;
997 *pudpp = NULL;
998 *pmdpp = NULL;
999 *ptepp = NULL;
1001 pudp = stage2_get_pud(mmu, NULL, addr);
1002 if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp))
1003 return false;
1005 if (stage2_pud_huge(kvm, *pudp)) {
1006 *pudpp = pudp;
1007 return true;
1008 }
1010 pmdp = stage2_pmd_offset(kvm, pudp, addr);
1011 if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp))
1012 return false;
1014 if (pmd_thp_or_huge(*pmdp)) {
1015 *pmdpp = pmdp;
1016 return true;
1017 }
1019 ptep = pte_offset_kernel(pmdp, addr);
1020 if (!ptep || pte_none(*ptep) || !pte_present(*ptep))
1021 return false;
1023 *ptepp = ptep;
1024 return true;
1027 static bool stage2_is_exec(struct kvm_s2_mmu *mmu, phys_addr_t addr, unsigned long sz)
1034 found = stage2_get_leaf_entry(mmu, addr, &pudp, &pmdp, &ptep);
1035 if (!found)
1036 return false;
1038 if (pudp)
1039 return sz <= PUD_SIZE && kvm_s2pud_exec(pudp);
1040 else if (pmdp)
1041 return sz <= PMD_SIZE && kvm_s2pmd_exec(pmdp);
1042 else
1043 return sz == PAGE_SIZE && kvm_s2pte_exec(ptep);
1046 static int stage2_set_pte(struct kvm_s2_mmu *mmu,
1047 struct kvm_mmu_memory_cache *cache,
1048 phys_addr_t addr, const pte_t *new_pte,
1049 unsigned long flags)
1051 struct kvm *kvm = mmu->kvm;
1054 pte_t *pte, old_pte;
1055 bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
1056 bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;
1058 VM_BUG_ON(logging_active && !cache);
1060 /* Create stage-2 page table mapping - Levels 0 and 1 */
1061 pud = stage2_get_pud(mmu, cache, addr);
1062 if (!pud) {
1064 * Ignore calls from kvm_set_spte_hva for unallocated
1065 * address ranges.
1067 return 0;
1068 }
1071 * While dirty page logging - dissolve huge PUD, then continue
1072 * on to allocate page.
1074 if (logging_active)
1075 stage2_dissolve_pud(mmu, addr, pud);
1077 if (stage2_pud_none(kvm, *pud)) {
1078 if (!cache)
1079 return 0; /* ignore calls from kvm_set_spte_hva */
1080 pmd = kvm_mmu_memory_cache_alloc(cache);
1081 stage2_pud_populate(kvm, pud, pmd);
1082 get_page(virt_to_page(pud));
1085 pmd = stage2_pmd_offset(kvm, pud, addr);
1086 if (!pmd) {
1088 * Ignore calls from kvm_set_spte_hva for unallocated
1089 * address ranges.
1091 return 0;
1092 }
1095 * While dirty page logging - dissolve huge PMD, then continue on to
1096 * allocate page.
1098 if (logging_active)
1099 stage2_dissolve_pmd(mmu, addr, pmd);
1101 /* Create stage-2 page mappings - Level 2 */
1102 if (pmd_none(*pmd)) {
1103 if (!cache)
1104 return 0; /* ignore calls from kvm_set_spte_hva */
1105 pte = kvm_mmu_memory_cache_alloc(cache);
1106 kvm_pmd_populate(pmd, pte);
1107 get_page(virt_to_page(pmd));
1110 pte = pte_offset_kernel(pmd, addr);
1112 if (iomap && pte_present(*pte))
1113 return -EFAULT;
1115 /* Create 2nd stage page table mapping - Level 3 */
1116 old_pte = *pte;
1117 if (pte_present(old_pte)) {
1118 /* Skip page table update if there is no change */
1119 if (pte_val(old_pte) == pte_val(*new_pte))
1120 return 0;
1122 kvm_set_pte(pte, __pte(0));
1123 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PTE_LEVEL);
1124 } else {
1125 get_page(virt_to_page(pte));
1128 kvm_set_pte(pte, *new_pte);
1129 return 0;
1132 #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
1133 static int stage2_ptep_test_and_clear_young(pte_t *pte)
1135 if (pte_young(*pte)) {
1136 *pte = pte_mkold(*pte);
1137 return 1;
1139 return 0;
1141 #else
1142 static int stage2_ptep_test_and_clear_young(pte_t *pte)
1144 return __ptep_test_and_clear_young(pte);
1146 #endif
1148 static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
1150 return stage2_ptep_test_and_clear_young((pte_t *)pmd);
1153 static int stage2_pudp_test_and_clear_young(pud_t *pud)
1155 return stage2_ptep_test_and_clear_young((pte_t *)pud);
1159 * kvm_phys_addr_ioremap - map a device range to guest IPA
1161 * @kvm: The KVM pointer
1162 * @guest_ipa: The IPA at which to insert the mapping
1163 * @pa: The physical address of the device
1164 * @size: The size of the mapping
1165 * @writable: Whether or not to create a writable mapping
1166 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
1167 phys_addr_t pa, unsigned long size, bool writable)
1171 struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, };
1172 struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
1173 enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
1174 KVM_PGTABLE_PROT_R |
1175 (writable ? KVM_PGTABLE_PROT_W : 0);
1177 size += offset_in_page(guest_ipa);
1178 guest_ipa &= PAGE_MASK;
1180 for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
1181 ret = kvm_mmu_topup_memory_cache(&cache,
1182 kvm_mmu_cache_min_pages(kvm));
1183 if (ret)
1184 break;
1186 spin_lock(&kvm->mmu_lock);
1187 ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
1188 &cache);
1189 spin_unlock(&kvm->mmu_lock);
1190 if (ret)
1191 break;
1193 pa += PAGE_SIZE;
1196 kvm_mmu_free_memory_cache(&cache);
1197 return ret;
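/*
 * Hypothetical usage sketch of kvm_phys_addr_ioremap() above: mapping
 * a two-page MMIO region whose PA starts mid-page, e.g.
 *
 *	kvm_phys_addr_ioremap(kvm, 0x10000800, 0x9000800, 0x1000, true);
 *
 * folds the 0x800 offset into the size, installs two PAGE_SIZE device
 * mappings, and tops up the local memory cache before each iteration
 * so the page-table walker never has to allocate under mmu_lock.
 */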
1201 * stage2_wp_ptes - write protect PMD range
1202 * @pmd: pointer to pmd entry
1203 * @addr: range start address
1204 * @end: range end address
1206 static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
1210 pte = pte_offset_kernel(pmd, addr);
1212 if (!pte_none(*pte)) {
1213 if (!kvm_s2pte_readonly(pte))
1214 kvm_set_s2pte_readonly(pte);
1216 } while (pte++, addr += PAGE_SIZE, addr != end);
1220 * stage2_wp_pmds - write protect PUD range
1221 * @mmu: the stage-2 MMU for the VM
1222 * @pud: pointer to pud entry
1223 * @addr: range start address
1224 * @end: range end address
1226 static void stage2_wp_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
1227 phys_addr_t addr, phys_addr_t end)
1229 struct kvm *kvm = mmu->kvm;
1233 pmd = stage2_pmd_offset(kvm, pud, addr);
1236 next = stage2_pmd_addr_end(kvm, addr, end);
1237 if (!pmd_none(*pmd)) {
1238 if (pmd_thp_or_huge(*pmd)) {
1239 if (!kvm_s2pmd_readonly(pmd))
1240 kvm_set_s2pmd_readonly(pmd);
1242 stage2_wp_ptes(pmd, addr, next);
1245 } while (pmd++, addr = next, addr != end);
1249 * stage2_wp_puds - write protect P4D range
1250 * @p4d: pointer to p4d entry
1251 * @addr: range start address
1252 * @end: range end address
1254 static void stage2_wp_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
1255 phys_addr_t addr, phys_addr_t end)
1257 struct kvm *kvm = mmu->kvm;
1261 pud = stage2_pud_offset(kvm, p4d, addr);
1263 next = stage2_pud_addr_end(kvm, addr, end);
1264 if (!stage2_pud_none(kvm, *pud)) {
1265 if (stage2_pud_huge(kvm, *pud)) {
1266 if (!kvm_s2pud_readonly(pud))
1267 kvm_set_s2pud_readonly(pud);
1269 stage2_wp_pmds(mmu, pud, addr, next);
1272 } while (pud++, addr = next, addr != end);
1276 * stage2_wp_p4ds - write protect PGD range
1277 * @pgd: pointer to pgd entry
1278 * @addr: range start address
1279 * @end: range end address
1281 static void stage2_wp_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
1282 phys_addr_t addr, phys_addr_t end)
1284 struct kvm *kvm = mmu->kvm;
1288 p4d = stage2_p4d_offset(kvm, pgd, addr);
1290 next = stage2_p4d_addr_end(kvm, addr, end);
1291 if (!stage2_p4d_none(kvm, *p4d))
1292 stage2_wp_puds(mmu, p4d, addr, next);
1293 } while (p4d++, addr = next, addr != end);
1297 * stage2_wp_range() - write protect stage2 memory region range
1298 * @mmu: The stage-2 MMU pointer
1299 * @addr: Start address of range
1300 * @end: End address of range
1302 static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
1304 struct kvm *kvm = mmu->kvm;
1308 pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
1311 * Release kvm_mmu_lock periodically if the memory region is
1312 * large. Otherwise, we may see kernel panics with
1313 * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR,
1314 * CONFIG_LOCKDEP. Additionally, holding the lock too long
1315 * will also starve other vCPUs. We also have to make sure
1316 * that the page tables are not freed while we release
1317 * the lock.
1319 cond_resched_lock(&kvm->mmu_lock);
1320 if (!READ_ONCE(mmu->pgd))
1321 break;
1322 next = stage2_pgd_addr_end(kvm, addr, end);
1323 if (stage2_pgd_present(kvm, *pgd))
1324 stage2_wp_p4ds(mmu, pgd, addr, next);
1325 } while (pgd++, addr = next, addr != end);
1329 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
1330 * @kvm: The KVM pointer
1331 * @slot: The memory slot to write protect
1333 * Called to start logging dirty pages after memory region
1334 * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns,
1335 * all present PUD, PMD and PTEs are write protected in the memory region.
1336 * Afterwards, reads of the dirty page log can be performed.
1338 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
1339 * serializing operations for VM memory regions.
1341 void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
1343 struct kvm_memslots *slots = kvm_memslots(kvm);
1344 struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
1345 phys_addr_t start, end;
1347 if (WARN_ON_ONCE(!memslot))
1348 return;
1350 start = memslot->base_gfn << PAGE_SHIFT;
1351 end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
1353 spin_lock(&kvm->mmu_lock);
1354 stage2_wp_range(&kvm->arch.mmu, start, end);
1355 spin_unlock(&kvm->mmu_lock);
1356 kvm_flush_remote_tlbs(kvm);
1360 * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
1361 * @kvm: The KVM pointer
1362 * @slot: The memory slot associated with mask
1363 * @gfn_offset: The gfn offset in memory slot
1364 * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory
1365 * slot to be write protected
1367 * Walks the bits set in @mask and write protects the associated PTEs.
1368 * Caller must acquire kvm_mmu_lock.
1370 static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1371 struct kvm_memory_slot *slot,
1372 gfn_t gfn_offset, unsigned long mask)
1374 phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
1375 phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
1376 phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
1378 stage2_wp_range(&kvm->arch.mmu, start, end);
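/*
 * Worked example: with gfn_offset == 0x400 and mask == 0b0110, only
 * bits 1 and 2 are set, so __ffs(mask) == 1 and __fls(mask) == 2, and
 * the call
 *
 *	kvm_mmu_write_protect_pt_masked(kvm, slot, 0x400, 0x6);
 *
 * write protects exactly the two pages backing gfns
 * slot->base_gfn + 0x401 and slot->base_gfn + 0x402.
 */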
1382 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
1383 * dirty pages.
1385 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
1386 * enable dirty logging for them.
1388 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1389 struct kvm_memory_slot *slot,
1390 gfn_t gfn_offset, unsigned long mask)
1392 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1395 static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
1397 __clean_dcache_guest_page(pfn, size);
1400 static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
1402 __invalidate_icache_guest_page(pfn, size);
1405 static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
1407 send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
1410 static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
1412 unsigned long map_size)
1415 hva_t uaddr_start, uaddr_end;
1418 /* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */
1419 if (map_size == PAGE_SIZE)
1420 return true;
1422 size = memslot->npages * PAGE_SIZE;
1424 gpa_start = memslot->base_gfn << PAGE_SHIFT;
1426 uaddr_start = memslot->userspace_addr;
1427 uaddr_end = uaddr_start + size;
1430 * Pages belonging to memslots that don't have the same alignment
1431 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
1432 * PMD/PUD entries, because we'll end up mapping the wrong pages.
1434 * Consider a layout like the following:
1436 * memslot->userspace_addr:
1437 * +-----+--------------------+--------------------+---+
1438 * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz|
1439 * +-----+--------------------+--------------------+---+
1441 * memslot->base_gfn << PAGE_SHIFT:
1442 * +---+--------------------+--------------------+-----+
1443 * |abc|def Stage-2 block | Stage-2 block |tvxyz|
1444 * +---+--------------------+--------------------+-----+
1446 * If we create those stage-2 blocks, we'll end up with this incorrect
1447 * mapping:
1448 *   d -> f
1449 *   e -> g
1450 *   f -> h
1452 if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
1453 return false;
1456 * Next, let's make sure we're not trying to map anything not covered
1457 * by the memslot. This means we have to prohibit block size mappings
1458 * for the beginning and end of a non-block aligned and non-block sized
1459 * memory slot (illustrated by the head and tail parts of the
1460 * userspace view above containing pages 'abcde' and 'xyz',
1461 * respectively).
1463 * Note that it doesn't matter if we do the check using the
1464 * userspace_addr or the base_gfn, as both are equally aligned (per
1465 * the check above) and equally sized.
1467 return (hva & ~(map_size - 1)) >= uaddr_start &&
1468 (hva & ~(map_size - 1)) + map_size <= uaddr_end;
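/*
 * Worked example, assuming 4K pages and map_size == PMD_SIZE (2M): a
 * memslot with userspace_addr == 0x40200000 (2M aligned) but
 * base_gfn << PAGE_SHIFT == 0x80100000 (only 1M aligned) fails the
 * offset check above, since any 2M stage-2 block would cover a
 * different 2M of userspace than of IPA space. When both sides share
 * the same offset within 2M, only the unaligned head and tail of the
 * slot are refused block mappings.
 */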
1472 * Check if the given hva is backed by a transparent huge page (THP) and
1473 * whether it can be mapped using block mapping in stage2. If so, adjust
1474 * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently
1475 * supported. This will need to be updated to support other THP sizes.
1477 * Returns the size of the mapping.
1479 static unsigned long
1480 transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
1481 unsigned long hva, kvm_pfn_t *pfnp,
1482 phys_addr_t *ipap)
1484 kvm_pfn_t pfn = *pfnp;
1487 * Make sure the adjustment is done only for THP pages. Also make
1488 * sure that the HVA and IPA are sufficiently aligned and that the
1489 * block map is contained within the memslot.
1491 if (kvm_is_transparent_hugepage(pfn) &&
1492 fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
1494 * The address we faulted on is backed by a transparent huge
1495 * page. However, because we map the compound huge page and
1496 * not the individual tail page, we need to transfer the
1497 * refcount to the head page. We have to be careful that the
1498 * THP doesn't start to split while we are adjusting the
1501 * We are sure this doesn't happen, because mmu_notifier_retry
1502 * was successful and we are holding the mmu_lock, so if this
1503 * THP is trying to split, it will be blocked in the mmu
1504 * notifier before touching any of the pages, specifically
1505 * before being able to call __split_huge_page_refcount().
1507 * We can therefore safely transfer the refcount from PG_tail
1508 * to PG_head and switch the pfn from a tail page to the head
1509 * page accordingly.
1511 *ipap &= PMD_MASK;
1512 kvm_release_pfn_clean(pfn);
1513 pfn &= ~(PTRS_PER_PMD - 1);
1514 kvm_get_pfn(pfn);
1515 *pfnp = pfn;
1517 return PMD_SIZE;
1520 /* Use page mapping if we cannot use block mapping. */
1521 return PAGE_SIZE;
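/*
 * Worked example, assuming 4K pages (PTRS_PER_PMD == 512): a fault on
 * pfn 0x1234 inside a THP yields 0x1234 & ~511 == 0x1200, the pfn of
 * the compound head page, and *ipap is rounded down with PMD_MASK to
 * match, so a single stage-2 PMD block can map the whole 2M region.
 */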
1524 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
1525 struct kvm_memory_slot *memslot, unsigned long hva,
1526 unsigned long fault_status)
1529 bool write_fault, writable, force_pte = false;
1530 bool exec_fault, needs_exec;
1531 unsigned long mmu_seq;
1532 gfn_t gfn = fault_ipa >> PAGE_SHIFT;
1533 struct kvm *kvm = vcpu->kvm;
1534 struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
1535 struct vm_area_struct *vma;
1538 pgprot_t mem_type = PAGE_S2;
1539 bool logging_active = memslot_is_logging(memslot);
1540 unsigned long vma_pagesize, flags = 0;
1541 struct kvm_s2_mmu *mmu = vcpu->arch.hw_mmu;
1543 write_fault = kvm_is_write_fault(vcpu);
1544 exec_fault = kvm_vcpu_trap_is_iabt(vcpu);
1545 VM_BUG_ON(write_fault && exec_fault);
1547 if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
1548 kvm_err("Unexpected L2 read permission error\n");
1549 return -EFAULT;
1552 /* Let's check if we will get back a huge page backed by hugetlbfs */
1553 mmap_read_lock(current->mm);
1554 vma = find_vma_intersection(current->mm, hva, hva + 1);
1555 if (unlikely(!vma)) {
1556 kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
1557 mmap_read_unlock(current->mm);
1558 return -EFAULT;
1561 if (is_vm_hugetlb_page(vma))
1562 vma_shift = huge_page_shift(hstate_vma(vma));
1563 else
1564 vma_shift = PAGE_SHIFT;
1566 vma_pagesize = 1ULL << vma_shift;
1567 if (logging_active ||
1568 (vma->vm_flags & VM_PFNMAP) ||
1569 !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
1570 force_pte = true;
1571 vma_pagesize = PAGE_SIZE;
1575 * The stage2 has a minimum of 2 level table (For arm64 see
1576 * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can
1577 * use PMD_SIZE huge mappings (even when the PMD is folded into PGD).
1578 * As for PUD huge maps, we must make sure that we have at least
1579 * 3 levels, i.e, PMD is not folded.
1581 if (vma_pagesize == PMD_SIZE ||
1582 (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm)))
1583 gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
1584 mmap_read_unlock(current->mm);
1586 /* We need minimum second+third level pages */
1587 ret = kvm_mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm));
1588 if (ret)
1589 return ret;
1591 mmu_seq = vcpu->kvm->mmu_notifier_seq;
1593 * Ensure the read of mmu_notifier_seq happens before we call
1594 * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
1595 * the page we just got a reference to gets unmapped before we have a
1596 * chance to grab the mmu_lock, which ensure that if the page gets
1597 * unmapped afterwards, the call to kvm_unmap_hva will take it away
1598 * from us again properly. This smp_rmb() interacts with the smp_wmb()
1599 * in kvm_mmu_notifier_invalidate_<page|range_end>.
1601 smp_rmb();
1603 pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
1604 if (pfn == KVM_PFN_ERR_HWPOISON) {
1605 kvm_send_hwpoison_signal(hva, vma_shift);
1606 return 0;
1608 if (is_error_noslot_pfn(pfn))
1609 return -EFAULT;
1611 if (kvm_is_device_pfn(pfn)) {
1612 mem_type = PAGE_S2_DEVICE;
1613 flags |= KVM_S2PTE_FLAG_IS_IOMAP;
1614 } else if (logging_active) {
1616 * Faults on pages in a memslot with logging enabled
1617 * should not be mapped with huge pages (it introduces churn
1618 * and performance degradation), so force a pte mapping.
1620 flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
1623 * Only actually map the page as writable if this was a write
1624 * fault.
1626 writable = write_fault;
1627 }
1630 if (exec_fault && is_iomap(flags))
1631 return -ENOEXEC;
1633 spin_lock(&kvm->mmu_lock);
1634 if (mmu_notifier_retry(kvm, mmu_seq))
1635 goto out_unlock;
1638 * If we are not forced to use page mapping, check if we are
1639 * backed by a THP and thus use block mapping if possible.
1641 if (vma_pagesize == PAGE_SIZE && !force_pte)
1642 vma_pagesize = transparent_hugepage_adjust(memslot, hva,
1643 &pfn, &fault_ipa);
1644 if (writable)
1645 kvm_set_pfn_dirty(pfn);
1647 if (fault_status != FSC_PERM && !is_iomap(flags))
1648 clean_dcache_guest_page(pfn, vma_pagesize);
1650 if (exec_fault)
1651 invalidate_icache_guest_page(pfn, vma_pagesize);
1654 * If we took an execution fault we have made the
1655 * icache/dcache coherent above and should now let the s2
1656 * mapping be executable.
1658 * Write faults (!exec_fault && FSC_PERM) are orthogonal to
1659 * execute permissions, and we preserve whatever we have.
1661 needs_exec = exec_fault ||
1662 (fault_status == FSC_PERM &&
1663 stage2_is_exec(mmu, fault_ipa, vma_pagesize));
1665 if (vma_pagesize == PUD_SIZE) {
1666 pud_t new_pud = kvm_pfn_pud(pfn, mem_type);
1668 new_pud = kvm_pud_mkhuge(new_pud);
1669 if (writable)
1670 new_pud = kvm_s2pud_mkwrite(new_pud);
1672 if (needs_exec)
1673 new_pud = kvm_s2pud_mkexec(new_pud);
1675 ret = stage2_set_pud_huge(mmu, memcache, fault_ipa, &new_pud);
1676 } else if (vma_pagesize == PMD_SIZE) {
1677 pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type);
1679 new_pmd = kvm_pmd_mkhuge(new_pmd);
1681 if (writable)
1682 new_pmd = kvm_s2pmd_mkwrite(new_pmd);
1684 if (needs_exec)
1685 new_pmd = kvm_s2pmd_mkexec(new_pmd);
1687 ret = stage2_set_pmd_huge(mmu, memcache, fault_ipa, &new_pmd);
1689 pte_t new_pte = kvm_pfn_pte(pfn, mem_type);
1691 if (writable) {
1692 new_pte = kvm_s2pte_mkwrite(new_pte);
1693 mark_page_dirty(kvm, gfn);
1694 }
1696 if (needs_exec)
1697 new_pte = kvm_s2pte_mkexec(new_pte);
1699 ret = stage2_set_pte(mmu, memcache, fault_ipa, &new_pte, flags);
1702 out_unlock:
1703 spin_unlock(&kvm->mmu_lock);
1704 kvm_set_pfn_accessed(pfn);
1705 kvm_release_pfn_clean(pfn);
1707 return ret;
1710 * Resolve the access fault by making the page young again.
1711 * Note that because the faulting entry is guaranteed not to be
1712 * cached in the TLB, we don't need to invalidate anything.
1713 * Only the HW Access Flag updates are supported for Stage 2 (no DBM),
1714 * so there is no need for atomic (pte|pmd)_mkyoung operations.
1716 static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
1722 bool pfn_valid = false;
1724 trace_kvm_access_fault(fault_ipa);
1726 spin_lock(&vcpu->kvm->mmu_lock);
1728 if (!stage2_get_leaf_entry(vcpu->arch.hw_mmu, fault_ipa, &pud, &pmd, &pte))
1729 goto out;
1731 if (pud) { /* HugeTLB */
1732 *pud = kvm_s2pud_mkyoung(*pud);
1733 pfn = kvm_pud_pfn(*pud);
1734 pfn_valid = true;
1735 } else if (pmd) { /* THP, HugeTLB */
1736 *pmd = pmd_mkyoung(*pmd);
1737 pfn = pmd_pfn(*pmd);
1738 pfn_valid = true;
1739 } else {
1740 *pte = pte_mkyoung(*pte); /* Just a page... */
1741 pfn = pte_pfn(*pte);
1742 pfn_valid = true;
1745 out:
1746 spin_unlock(&vcpu->kvm->mmu_lock);
1747 if (pfn_valid)
1748 kvm_set_pfn_accessed(pfn);
1752 * kvm_handle_guest_abort - handles all 2nd stage aborts
1753 * @vcpu: the VCPU pointer
1755 * Any abort that gets to the host is almost guaranteed to be caused by a
1756 * missing second stage translation table entry, which can mean that either the
1757 * guest simply needs more memory and we must allocate an appropriate page, or
1758 * that the guest tried to access I/O memory, which is emulated by user
1759 * space. The distinction is based on the IPA causing the fault and whether this
1760 * memory region has been registered as standard RAM by user space.
1762 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
1764 unsigned long fault_status;
1765 phys_addr_t fault_ipa;
1766 struct kvm_memory_slot *memslot;
1768 bool is_iabt, write_fault, writable;
1772 fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
1774 fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
1775 is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
1777 /* Synchronous External Abort? */
1778 if (kvm_vcpu_abt_issea(vcpu)) {
1780 * For RAS the host kernel may handle this abort.
1781 * There is no need to pass the error into the guest.
1783 if (kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_esr(vcpu)))
1784 kvm_inject_vabt(vcpu);
1786 return 1;
1789 trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
1790 kvm_vcpu_get_hfar(vcpu), fault_ipa);
1792 /* Check the stage-2 fault is trans. fault or write fault */
1793 if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
1794 fault_status != FSC_ACCESS) {
1795 kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
1796 kvm_vcpu_trap_get_class(vcpu),
1797 (unsigned long)kvm_vcpu_trap_get_fault(vcpu),
1798 (unsigned long)kvm_vcpu_get_esr(vcpu));
1799 return -EFAULT;
1802 idx = srcu_read_lock(&vcpu->kvm->srcu);
1804 gfn = fault_ipa >> PAGE_SHIFT;
1805 memslot = gfn_to_memslot(vcpu->kvm, gfn);
1806 hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
1807 write_fault = kvm_is_write_fault(vcpu);
1808 if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
1810 * The guest has put either its instructions or its page-tables
1811 * somewhere it shouldn't have. Userspace won't be able to do
1812 * anything about this (there's no syndrome for a start), so
1813 * re-inject the abort back into the guest.
1815 if (is_iabt) {
1816 ret = -ENOEXEC;
1817 goto out;
1818 }
1820 if (kvm_vcpu_dabt_iss1tw(vcpu)) {
1821 kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
1822 ret = 1;
1823 goto out_unlock;
1827 * Check for a cache maintenance operation. Since we
1828 * ended-up here, we know it is outside of any memory
1829 * slot. But we can't find out if that is for a device,
1830 * or if the guest is just being stupid. The only thing
1831 * we know for sure is that this range cannot be cached.
1833 * So let's assume that the guest is just being
1834 * cautious, and skip the instruction.
1836 if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) {
1837 kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
1838 ret = 1;
1839 goto out_unlock;
1843 * The IPA is reported as [MAX:12], so we need to
1844 * complement it with the bottom 12 bits from the
1845 * faulting VA. This is always 12 bits, irrespective
1846 * of the page size.
1848 fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
1849 ret = io_mem_abort(vcpu, fault_ipa);
1850 goto out_unlock;
1853 /* Userspace should not be able to register out-of-bounds IPAs */
1854 VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm));
1856 if (fault_status == FSC_ACCESS) {
1857 handle_access_fault(vcpu, fault_ipa);
1858 ret = 1;
1859 goto out_unlock;
1862 ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
1863 if (ret == 0)
1864 ret = 1;
1865 out:
1866 if (ret == -ENOEXEC) {
1867 kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
1868 ret = 1;
1869 }
1870 out_unlock:
1871 srcu_read_unlock(&vcpu->kvm->srcu, idx);
1872 return ret;
1875 static int handle_hva_to_gpa(struct kvm *kvm,
1876 unsigned long start,
1877 unsigned long end,
1878 int (*handler)(struct kvm *kvm,
1879 gpa_t gpa, u64 size,
1880 void *data),
1881 void *data)
1883 struct kvm_memslots *slots;
1884 struct kvm_memory_slot *memslot;
1887 slots = kvm_memslots(kvm);
1889 /* we only care about the pages that the guest sees */
1890 kvm_for_each_memslot(memslot, slots) {
1891 unsigned long hva_start, hva_end;
1894 hva_start = max(start, memslot->userspace_addr);
1895 hva_end = min(end, memslot->userspace_addr +
1896 (memslot->npages << PAGE_SHIFT));
1897 if (hva_start >= hva_end)
1898 continue;
1900 gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT;
1901 ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data);
1904 return ret;
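/*
 * Worked example: for a memslot with userspace_addr 0x7f0000000000,
 * base_gfn 0x80000 and npages 16, a notifier range starting at hva
 * 0x7f0000002000 maps to gfn 0x80002, so the handler above is invoked
 * on gpa 0x80002000 for the length of the overlap with the slot.
 */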
1907 static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
1909 unsigned flags = *(unsigned *)data;
1910 bool may_block = flags & MMU_NOTIFIER_RANGE_BLOCKABLE;
1912 __unmap_stage2_range(&kvm->arch.mmu, gpa, size, may_block);
1913 return 0;
1916 int kvm_unmap_hva_range(struct kvm *kvm,
1917 unsigned long start, unsigned long end, unsigned flags)
1919 if (!kvm->arch.mmu.pgd)
1920 return 0;
1922 trace_kvm_unmap_hva_range(start, end);
1923 handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, &flags);
1924 return 0;
1927 static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
1929 kvm_pfn_t *pfn = (kvm_pfn_t *)data;
1931 WARN_ON(size != PAGE_SIZE);
1934 * The MMU notifiers will have unmapped a huge PMD before calling
1935 * ->change_pte() (which in turn calls kvm_set_spte_hva()) and
1936 * therefore we never need to clear out a huge PMD through this
1937 * calling path and a memcache is not required.
1939 kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, gpa, PAGE_SIZE,
1940 __pfn_to_phys(*pfn), KVM_PGTABLE_PROT_R, NULL);
1942 return 0;
1944 int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
1946 unsigned long end = hva + PAGE_SIZE;
1947 kvm_pfn_t pfn = pte_pfn(pte);
1949 if (!kvm->arch.mmu.pgt)
1950 return 0;
1952 trace_kvm_set_spte_hva(hva);
1955 * We've moved a page around, probably through CoW, so let's treat it
1956 * just like a translation fault and clean the cache to the PoC.
1958 clean_dcache_guest_page(pfn, PAGE_SIZE);
1959 handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pfn);
1961 return 0;
1963 static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
1969 WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
1970 if (!stage2_get_leaf_entry(&kvm->arch.mmu, gpa, &pud, &pmd, &pte))
1971 return 0;
1973 if (pud)
1974 return stage2_pudp_test_and_clear_young(pud);
1975 else if (pmd)
1976 return stage2_pmdp_test_and_clear_young(pmd);
1977 else
1978 return stage2_ptep_test_and_clear_young(pte);
1981 static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
1987 WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
1988 if (!stage2_get_leaf_entry(&kvm->arch.mmu, gpa, &pud, &pmd, &pte))
1989 return 0;
1991 if (pud)
1992 return kvm_s2pud_young(*pud);
1993 else if (pmd)
1994 return pmd_young(*pmd);
1995 else
1996 return pte_young(*pte);
1999 int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
2001 if (!kvm->arch.mmu.pgd)
2002 return 0;
2003 trace_kvm_age_hva(start, end);
2004 return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
2007 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
2009 if (!kvm->arch.mmu.pgd)
2010 return 0;
2011 trace_kvm_test_age_hva(hva);
2012 return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE,
2013 kvm_test_age_hva_handler, NULL);
2016 phys_addr_t kvm_mmu_get_httbr(void)
2018 return __pa(hyp_pgtable->pgd);
2021 phys_addr_t kvm_get_idmap_vector(void)
2023 return hyp_idmap_vector;
2026 static int kvm_map_idmap_text(void)
2028 unsigned long size = hyp_idmap_end - hyp_idmap_start;
2029 int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start,
2030 PAGE_HYP_EXEC);
2031 if (err)
2032 kvm_err("Failed to idmap %lx-%lx\n",
2033 hyp_idmap_start, hyp_idmap_end);
2035 return err;
2038 int kvm_mmu_init(void)
2043 hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
2044 hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
2045 hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end);
2046 hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
2047 hyp_idmap_vector = __pa_symbol(__kvm_hyp_init);
2050 * We rely on the linker script to ensure at build time that the HYP
2051 * init code does not cross a page boundary.
2053 BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
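/*
 * A worked example of the check above, assuming 4K pages: with
 * hyp_idmap_start = 0x40001800 and hyp_idmap_end = 0x40002100,
 * 0x40001800 ^ 0x400020ff = 0x38ff, which has bits set inside
 * PAGE_MASK, so the first and last byte of the idmap text sit in
 * different pages and the BUG_ON() fires. Addresses within a single
 * page only ever differ in bits below PAGE_SHIFT.
 */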
2055 hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
2056 kvm_debug("Using %u-bit virtual addresses at EL2\n", hyp_va_bits);
2057 kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
2058 kvm_debug("HYP VA range: %lx:%lx\n",
2059 kern_hyp_va(PAGE_OFFSET),
2060 kern_hyp_va((unsigned long)high_memory - 1));
2062 if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
2063 hyp_idmap_start < kern_hyp_va((unsigned long)high_memory - 1) &&
2064 hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
2066 * The idmap page is intersecting with the VA space,
2067 * it is not safe to continue further.
2069 kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
2070 err = -EINVAL;
2071 goto out;
2074 hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL);
2075 if (!hyp_pgtable) {
2076 kvm_err("Hyp mode page-table not allocated\n");
2077 err = -ENOMEM;
2078 goto out;
2081 err = kvm_pgtable_hyp_init(hyp_pgtable, hyp_va_bits);
2083 goto out_free_pgtable;
2085 err = kvm_map_idmap_text();
2087 goto out_destroy_pgtable;
2089 io_map_base = hyp_idmap_start;
2090 return 0;
2092 out_destroy_pgtable:
2093 kvm_pgtable_hyp_destroy(hyp_pgtable);
2094 out_free_pgtable:
2095 kfree(hyp_pgtable);
2096 hyp_pgtable = NULL;
2097 out:
2098 return err;
2101 void kvm_arch_commit_memory_region(struct kvm *kvm,
2102 const struct kvm_userspace_memory_region *mem,
2103 struct kvm_memory_slot *old,
2104 const struct kvm_memory_slot *new,
2105 enum kvm_mr_change change)
2108 * At this point memslot has been committed and there is an
2109 * allocated dirty_bitmap[], dirty pages will be tracked while the
2110 * memory slot is write protected.
2112 if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
2114 * If we're with initial-all-set, we don't need to write
2115 * protect any pages because they're all reported as dirty.
2116 * Huge pages and normal pages will be write protect gradually.
2118 if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) {
2119 kvm_mmu_wp_memory_region(kvm, mem->slot);
2124 int kvm_arch_prepare_memory_region(struct kvm *kvm,
2125 struct kvm_memory_slot *memslot,
2126 const struct kvm_userspace_memory_region *mem,
2127 enum kvm_mr_change change)
2129 hva_t hva = mem->userspace_addr;
2130 hva_t reg_end = hva + mem->memory_size;
2131 bool writable = !(mem->flags & KVM_MEM_READONLY);
2134 if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
2135 change != KVM_MR_FLAGS_ONLY)
2136 return 0;
2139 * Prevent userspace from creating a memory region outside of the IPA
2140 * space addressable by the KVM guest.
2142 if (memslot->base_gfn + memslot->npages >=
2143 (kvm_phys_size(kvm) >> PAGE_SHIFT))
2144 return -EFAULT;
2146 mmap_read_lock(current->mm);
2148 * A memory region could potentially cover multiple VMAs, and any holes
2149 * between them, so iterate over all of them to find out if we can map
2150 * any of them right now.
2152 * +--------------------------------------------+
2153 * +---------------+----------------+ +----------------+
2154 * | : VMA 1 | VMA 2 | | VMA 3 : |
2155 * +---------------+----------------+ +----------------+
2156 *     |               memory region                |
2157 *     +--------------------------------------------+
2160 struct vm_area_struct *vma = find_vma(current->mm, hva);
2161 hva_t vm_start, vm_end;
2163 if (!vma || vma->vm_start >= reg_end)
2164 break;
2167 * Take the intersection of this VMA with the memory region
2169 vm_start = max(hva, vma->vm_start);
2170 vm_end = min(reg_end, vma->vm_end);
2172 if (vma->vm_flags & VM_PFNMAP) {
2173 gpa_t gpa = mem->guest_phys_addr +
2174 (vm_start - mem->userspace_addr);
2177 pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
2178 pa += vm_start - vma->vm_start;
2180 /* IO region dirty page logging not allowed */
2181 if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) {
2182 ret = -EINVAL;
2183 goto out;
2184 }
2186 ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
2187 vm_end - vm_start,
2188 writable);
2189 if (ret)
2190 break;
2192 hva = vm_end;
2193 } while (hva < reg_end);
2195 if (change == KVM_MR_FLAGS_ONLY)
2196 goto out;
2198 spin_lock(&kvm->mmu_lock);
2199 if (ret)
2200 unmap_stage2_range(&kvm->arch.mmu, mem->guest_phys_addr, mem->memory_size);
2201 else
2202 stage2_flush_memslot(kvm, memslot);
2203 spin_unlock(&kvm->mmu_lock);
2204 out:
2205 mmap_read_unlock(current->mm);
2206 return ret;
2209 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
2213 void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
2217 void kvm_arch_flush_shadow_all(struct kvm *kvm)
2219 kvm_free_stage2_pgd(&kvm->arch.mmu);
2222 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
2223 struct kvm_memory_slot *slot)
2225 gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
2226 phys_addr_t size = slot->npages << PAGE_SHIFT;
2228 spin_lock(&kvm->mmu_lock);
2229 unmap_stage2_range(&kvm->arch.mmu, gpa, size);
2230 spin_unlock(&kvm->mmu_lock);
2234 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
2236 * Main problems:
2237 * - S/W ops are local to a CPU (not broadcast)
2238 * - We have line migration behind our back (speculation)
2239 * - System caches don't support S/W at all (damn!)
2241 * In the face of the above, the best we can do is to try and convert
2242 * S/W ops to VA ops. Because the guest is not allowed to infer the
2243 * S/W to PA mapping, it can only use S/W to nuke the whole cache,
2244 * which is a rather good thing for us.
2246 * Also, it is only used when turning caches on/off ("The expected
2247 * usage of the cache maintenance instructions that operate by set/way
2248 * is associated with the cache maintenance instructions associated
2249 * with the powerdown and powerup of caches, if this is required by
2250 * the implementation.").
2252 * We use the following policy:
2254 * - If we trap a S/W operation, we enable VM trapping to detect
2255 * caches being turned on/off, and do a full clean.
2257 * - We flush the caches on both caches being turned on and off.
2259 * - Once the caches are enabled, we stop trapping VM ops.
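/*
 * Putting the policy together, a typical guest sequence looks like:
 * a first DC CISW traps into kvm_set_way_flush(), which flushes the
 * whole VM and sets HCR_TVM; the guest then turns its MMU/caches off,
 * and the trapped SCTLR write reaches kvm_toggle_cache(), which
 * flushes again; once the guest re-enables its caches, HCR_TVM is
 * cleared and VM register traps stop until the next S/W operation.
 */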
2261 void kvm_set_way_flush(struct kvm_vcpu *vcpu)
2263 unsigned long hcr = *vcpu_hcr(vcpu);
2266 * If this is the first time we do a S/W operation
2267 * (i.e. HCR_TVM not set) flush the whole memory, and set the
2268 * VM trapping.
2270 * Otherwise, rely on the VM trapping to wait for the MMU +
2271 * Caches to be turned off. At that point, we'll be able to
2272 * clean the caches again.
2274 if (!(hcr & HCR_TVM)) {
2275 trace_kvm_set_way_flush(*vcpu_pc(vcpu),
2276 vcpu_has_cache_enabled(vcpu));
2277 stage2_flush_vm(vcpu->kvm);
2278 *vcpu_hcr(vcpu) = hcr | HCR_TVM;
2282 void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
2284 bool now_enabled = vcpu_has_cache_enabled(vcpu);
2287 * If switching the MMU+caches on, need to invalidate the caches.
2288 * If switching it off, need to clean the caches.
2289 * Clean + invalidate does the trick always.
2291 if (now_enabled != was_enabled)
2292 stage2_flush_vm(vcpu->kvm);
2294 /* Caches are now on, stop trapping VM ops (until a S/W op) */
2295 if (now_enabled)
2296 *vcpu_hcr(vcpu) &= ~HCR_TVM;
2298 trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);