KVM: arm64: Convert unmap_stage2_range() to generic page-table API
arch/arm64/kvm/mmu.c [linux-2.6-microblaze.git]
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) 2012 - Virtual Open Systems and Columbia University
4  * Author: Christoffer Dall <c.dall@virtualopensystems.com>
5  */
6
7 #include <linux/mman.h>
8 #include <linux/kvm_host.h>
9 #include <linux/io.h>
10 #include <linux/hugetlb.h>
11 #include <linux/sched/signal.h>
12 #include <trace/events/kvm.h>
13 #include <asm/pgalloc.h>
14 #include <asm/cacheflush.h>
15 #include <asm/kvm_arm.h>
16 #include <asm/kvm_mmu.h>
17 #include <asm/kvm_pgtable.h>
18 #include <asm/kvm_ras.h>
19 #include <asm/kvm_asm.h>
20 #include <asm/kvm_emulate.h>
21 #include <asm/virt.h>
22
23 #include "trace.h"
24
25 static struct kvm_pgtable *hyp_pgtable;
26 static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
27
28 static unsigned long hyp_idmap_start;
29 static unsigned long hyp_idmap_end;
30 static phys_addr_t hyp_idmap_vector;
31
32 static unsigned long io_map_base;
33
34 #define KVM_S2PTE_FLAG_IS_IOMAP         (1UL << 0)
35 #define KVM_S2_FLAG_LOGGING_ACTIVE      (1UL << 1)
36
37 static bool is_iomap(unsigned long flags)
38 {
39         return flags & KVM_S2PTE_FLAG_IS_IOMAP;
40 }
41
42 /*
43  * Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
44  * we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
45  * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too
46  * long will also starve other vCPUs. We also have to make sure that the page
47  * tables are not freed while the lock is released.
48  */
49 static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr,
50                               phys_addr_t end,
51                               int (*fn)(struct kvm_pgtable *, u64, u64),
52                               bool resched)
53 {
54         int ret;
55         u64 next;
56
57         do {
58                 struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
59                 if (!pgt)
60                         return -EINVAL;
61
62                 next = stage2_pgd_addr_end(kvm, addr, end);
63                 ret = fn(pgt, addr, next - addr);
64                 if (ret)
65                         break;
66
67                 if (resched && next != end)
68                         cond_resched_lock(&kvm->mmu_lock);
69         } while (addr = next, addr != end);
70
71         return ret;
72 }
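/*
 * Illustrative sketch, not part of mmu.c: the chunked range-walk idiom used
 * by stage2_apply_range() above, reduced to plain C. EX_CHUNK_SIZE is an
 * arbitrary stand-in for the boundary returned by stage2_pgd_addr_end(); the
 * real granule depends on the stage-2 configuration.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define EX_CHUNK_SIZE	(1ULL << 30)	/* assumed PGD-level granule */

static uint64_t ex_chunk_addr_end(uint64_t addr, uint64_t end)
{
	uint64_t boundary = (addr + EX_CHUNK_SIZE) & ~(EX_CHUNK_SIZE - 1);

	return boundary < end ? boundary : end;
}

static void ex_walk_range(uint64_t addr, uint64_t end)
{
	uint64_t next;

	do {
		next = ex_chunk_addr_end(addr, end);
		printf("apply fn to [%#" PRIx64 ", %#" PRIx64 ")\n", addr, next);
		/* the real walker would cond_resched_lock() here when next != end */
	} while (addr = next, addr != end);
}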
73
74 static bool memslot_is_logging(struct kvm_memory_slot *memslot)
75 {
76         return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
77 }
78
79 /**
80  * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
81  * @kvm:        pointer to kvm structure.
82  *
83  * Interface to HYP function to flush all VM TLB entries
84  */
85 void kvm_flush_remote_tlbs(struct kvm *kvm)
86 {
87         kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
88 }
89
90 static void kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa,
91                                    int level)
92 {
93         kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ipa, level);
94 }
95
96 /*
97  * D-Cache management functions. They take the page table entries by
98  * value, as they are flushing the cache using the kernel mapping (or
99  * kmap on 32bit).
100  */
101 static void kvm_flush_dcache_pte(pte_t pte)
102 {
103         __kvm_flush_dcache_pte(pte);
104 }
105
106 static void kvm_flush_dcache_pmd(pmd_t pmd)
107 {
108         __kvm_flush_dcache_pmd(pmd);
109 }
110
111 static void kvm_flush_dcache_pud(pud_t pud)
112 {
113         __kvm_flush_dcache_pud(pud);
114 }
115
116 static bool kvm_is_device_pfn(unsigned long pfn)
117 {
118         return !pfn_valid(pfn);
119 }
120
121 /**
122  * stage2_dissolve_pmd() - clear and flush huge PMD entry
123  * @mmu:        pointer to mmu structure to operate on
124  * @addr:       IPA
125  * @pmd:        pmd pointer for IPA
126  *
127  * Function clears a PMD entry and flushes the 1st and 2nd stage TLBs for @addr.
128  */
129 static void stage2_dissolve_pmd(struct kvm_s2_mmu *mmu, phys_addr_t addr, pmd_t *pmd)
130 {
131         if (!pmd_thp_or_huge(*pmd))
132                 return;
133
134         pmd_clear(pmd);
135         kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
136         put_page(virt_to_page(pmd));
137 }
138
139 /**
140  * stage2_dissolve_pud() - clear and flush huge PUD entry
141  * @mmu:        pointer to mmu structure to operate on
142  * @addr:       IPA
143  * @pudp:       pud pointer for IPA
144  *
145  * Function clears a PUD entry and flushes the 1st and 2nd stage TLBs for @addr.
146  */
147 static void stage2_dissolve_pud(struct kvm_s2_mmu *mmu, phys_addr_t addr, pud_t *pudp)
148 {
149         struct kvm *kvm = mmu->kvm;
150
151         if (!stage2_pud_huge(kvm, *pudp))
152                 return;
153
154         stage2_pud_clear(kvm, pudp);
155         kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
156         put_page(virt_to_page(pudp));
157 }
158
159 static void clear_stage2_pgd_entry(struct kvm_s2_mmu *mmu, pgd_t *pgd, phys_addr_t addr)
160 {
161         struct kvm *kvm = mmu->kvm;
162         p4d_t *p4d_table __maybe_unused = stage2_p4d_offset(kvm, pgd, 0UL);
163         stage2_pgd_clear(kvm, pgd);
164         kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
165         stage2_p4d_free(kvm, p4d_table);
166         put_page(virt_to_page(pgd));
167 }
168
169 static void clear_stage2_p4d_entry(struct kvm_s2_mmu *mmu, p4d_t *p4d, phys_addr_t addr)
170 {
171         struct kvm *kvm = mmu->kvm;
172         pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, p4d, 0);
173         stage2_p4d_clear(kvm, p4d);
174         kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
175         stage2_pud_free(kvm, pud_table);
176         put_page(virt_to_page(p4d));
177 }
178
179 static void clear_stage2_pud_entry(struct kvm_s2_mmu *mmu, pud_t *pud, phys_addr_t addr)
180 {
181         struct kvm *kvm = mmu->kvm;
182         pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0);
183
184         VM_BUG_ON(stage2_pud_huge(kvm, *pud));
185         stage2_pud_clear(kvm, pud);
186         kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
187         stage2_pmd_free(kvm, pmd_table);
188         put_page(virt_to_page(pud));
189 }
190
191 static void clear_stage2_pmd_entry(struct kvm_s2_mmu *mmu, pmd_t *pmd, phys_addr_t addr)
192 {
193         pte_t *pte_table = pte_offset_kernel(pmd, 0);
194         VM_BUG_ON(pmd_thp_or_huge(*pmd));
195         pmd_clear(pmd);
196         kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
197         free_page((unsigned long)pte_table);
198         put_page(virt_to_page(pmd));
199 }
200
201 static inline void kvm_set_pte(pte_t *ptep, pte_t new_pte)
202 {
203         WRITE_ONCE(*ptep, new_pte);
204         dsb(ishst);
205 }
206
207 static inline void kvm_set_pmd(pmd_t *pmdp, pmd_t new_pmd)
208 {
209         WRITE_ONCE(*pmdp, new_pmd);
210         dsb(ishst);
211 }
212
213 static inline void kvm_pmd_populate(pmd_t *pmdp, pte_t *ptep)
214 {
215         kvm_set_pmd(pmdp, kvm_mk_pmd(ptep));
216 }
217
218 static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp)
219 {
220         WRITE_ONCE(*pudp, kvm_mk_pud(pmdp));
221         dsb(ishst);
222 }
223
224 static inline void kvm_p4d_populate(p4d_t *p4dp, pud_t *pudp)
225 {
226         WRITE_ONCE(*p4dp, kvm_mk_p4d(pudp));
227         dsb(ishst);
228 }
229
230 static inline void kvm_pgd_populate(pgd_t *pgdp, p4d_t *p4dp)
231 {
232 #ifndef __PAGETABLE_P4D_FOLDED
233         WRITE_ONCE(*pgdp, kvm_mk_pgd(p4dp));
234         dsb(ishst);
235 #endif
236 }
237
238 /*
239  * Unmapping vs dcache management:
240  *
241  * If a guest maps certain memory pages as uncached, all writes will
242  * bypass the data cache and go directly to RAM.  However, the CPUs
243  * can still speculate reads (not writes) and fill cache lines with
244  * data.
245  *
246  * Those cache lines will be *clean* cache lines though, so a
247  * clean+invalidate operation is equivalent to an invalidate
248  * operation, because no cache lines are marked dirty.
249  *
250  * Those clean cache lines could be filled prior to an uncached write
251  * by the guest, and the cache coherent IO subsystem would therefore
252  * end up writing old data to disk.
253  *
254  * This is why right after unmapping a page/section and invalidating
255  * the corresponding TLBs, we flush to make sure the IO subsystem will
256  * never hit in the cache.
257  *
258  * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
259  * we then fully enforce cacheability of RAM, no matter what the guest
260  * does.
261  */
262 static void unmap_stage2_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd,
263                        phys_addr_t addr, phys_addr_t end)
264 {
265         phys_addr_t start_addr = addr;
266         pte_t *pte, *start_pte;
267
268         start_pte = pte = pte_offset_kernel(pmd, addr);
269         do {
270                 if (!pte_none(*pte)) {
271                         pte_t old_pte = *pte;
272
273                         kvm_set_pte(pte, __pte(0));
274                         kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PTE_LEVEL);
275
276                         /* No need to invalidate the cache for device mappings */
277                         if (!kvm_is_device_pfn(pte_pfn(old_pte)))
278                                 kvm_flush_dcache_pte(old_pte);
279
280                         put_page(virt_to_page(pte));
281                 }
282         } while (pte++, addr += PAGE_SIZE, addr != end);
283
284         if (stage2_pte_table_empty(mmu->kvm, start_pte))
285                 clear_stage2_pmd_entry(mmu, pmd, start_addr);
286 }
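/*
 * Illustrative sketch, not part of mmu.c: the per-entry decision described in
 * the "Unmapping vs dcache management" comment above. "has_fwb" models
 * ARM64_HAS_STAGE2_FWB; both helper names are made up for the example.
 */
#include <stdbool.h>

/* Returns true if the D-cache must be cleaned+invalidated for this entry. */
static bool ex_needs_dcache_cmo(bool pfn_is_device, bool has_fwb)
{
	if (has_fwb)			/* FWB enforces cacheability, nothing to do */
		return false;

	return !pfn_is_device;		/* device mappings are never cached */
}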
287
288 static void unmap_stage2_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
289                        phys_addr_t addr, phys_addr_t end)
290 {
291         struct kvm *kvm = mmu->kvm;
292         phys_addr_t next, start_addr = addr;
293         pmd_t *pmd, *start_pmd;
294
295         start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr);
296         do {
297                 next = stage2_pmd_addr_end(kvm, addr, end);
298                 if (!pmd_none(*pmd)) {
299                         if (pmd_thp_or_huge(*pmd)) {
300                                 pmd_t old_pmd = *pmd;
301
302                                 pmd_clear(pmd);
303                                 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
304
305                                 kvm_flush_dcache_pmd(old_pmd);
306
307                                 put_page(virt_to_page(pmd));
308                         } else {
309                                 unmap_stage2_ptes(mmu, pmd, addr, next);
310                         }
311                 }
312         } while (pmd++, addr = next, addr != end);
313
314         if (stage2_pmd_table_empty(kvm, start_pmd))
315                 clear_stage2_pud_entry(mmu, pud, start_addr);
316 }
317
318 static void unmap_stage2_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
319                        phys_addr_t addr, phys_addr_t end)
320 {
321         struct kvm *kvm = mmu->kvm;
322         phys_addr_t next, start_addr = addr;
323         pud_t *pud, *start_pud;
324
325         start_pud = pud = stage2_pud_offset(kvm, p4d, addr);
326         do {
327                 next = stage2_pud_addr_end(kvm, addr, end);
328                 if (!stage2_pud_none(kvm, *pud)) {
329                         if (stage2_pud_huge(kvm, *pud)) {
330                                 pud_t old_pud = *pud;
331
332                                 stage2_pud_clear(kvm, pud);
333                                 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
334                                 kvm_flush_dcache_pud(old_pud);
335                                 put_page(virt_to_page(pud));
336                         } else {
337                                 unmap_stage2_pmds(mmu, pud, addr, next);
338                         }
339                 }
340         } while (pud++, addr = next, addr != end);
341
342         if (stage2_pud_table_empty(kvm, start_pud))
343                 clear_stage2_p4d_entry(mmu, p4d, start_addr);
344 }
345
346 static void unmap_stage2_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
347                        phys_addr_t addr, phys_addr_t end)
348 {
349         struct kvm *kvm = mmu->kvm;
350         phys_addr_t next, start_addr = addr;
351         p4d_t *p4d, *start_p4d;
352
353         start_p4d = p4d = stage2_p4d_offset(kvm, pgd, addr);
354         do {
355                 next = stage2_p4d_addr_end(kvm, addr, end);
356                 if (!stage2_p4d_none(kvm, *p4d))
357                         unmap_stage2_puds(mmu, p4d, addr, next);
358         } while (p4d++, addr = next, addr != end);
359
360         if (stage2_p4d_table_empty(kvm, start_p4d))
361                 clear_stage2_pgd_entry(mmu, pgd, start_addr);
362 }
363
364 /**
365  * __unmap_stage2_range -- Clear stage2 page table entries to unmap a range
366  * @mmu:   The KVM stage-2 MMU pointer
367  * @start: The intermediate physical base address of the range to unmap
368  * @size:  The size of the area to unmap
369  *
370  * Clear a range of stage-2 mappings, lowering the various ref-counts.  Must
371  * be called while holding mmu_lock (unless for freeing the stage2 pgd before
372  * destroying the VM), otherwise another faulting VCPU may come in and mess
373  * with things behind our backs.
374  */
375 static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size,
376                                  bool may_block)
377 {
378         struct kvm *kvm = mmu->kvm;
379         phys_addr_t end = start + size;
380
381         assert_spin_locked(&kvm->mmu_lock);
382         WARN_ON(size & ~PAGE_MASK);
383         WARN_ON(stage2_apply_range(kvm, start, end, kvm_pgtable_stage2_unmap,
384                                    may_block));
385 }
386
387 static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
388 {
389         __unmap_stage2_range(mmu, start, size, true);
390 }
391
392 static void stage2_flush_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd,
393                               phys_addr_t addr, phys_addr_t end)
394 {
395         pte_t *pte;
396
397         pte = pte_offset_kernel(pmd, addr);
398         do {
399                 if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte)))
400                         kvm_flush_dcache_pte(*pte);
401         } while (pte++, addr += PAGE_SIZE, addr != end);
402 }
403
404 static void stage2_flush_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
405                               phys_addr_t addr, phys_addr_t end)
406 {
407         struct kvm *kvm = mmu->kvm;
408         pmd_t *pmd;
409         phys_addr_t next;
410
411         pmd = stage2_pmd_offset(kvm, pud, addr);
412         do {
413                 next = stage2_pmd_addr_end(kvm, addr, end);
414                 if (!pmd_none(*pmd)) {
415                         if (pmd_thp_or_huge(*pmd))
416                                 kvm_flush_dcache_pmd(*pmd);
417                         else
418                                 stage2_flush_ptes(mmu, pmd, addr, next);
419                 }
420         } while (pmd++, addr = next, addr != end);
421 }
422
423 static void stage2_flush_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
424                               phys_addr_t addr, phys_addr_t end)
425 {
426         struct kvm *kvm = mmu->kvm;
427         pud_t *pud;
428         phys_addr_t next;
429
430         pud = stage2_pud_offset(kvm, p4d, addr);
431         do {
432                 next = stage2_pud_addr_end(kvm, addr, end);
433                 if (!stage2_pud_none(kvm, *pud)) {
434                         if (stage2_pud_huge(kvm, *pud))
435                                 kvm_flush_dcache_pud(*pud);
436                         else
437                                 stage2_flush_pmds(mmu, pud, addr, next);
438                 }
439         } while (pud++, addr = next, addr != end);
440 }
441
442 static void stage2_flush_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
443                               phys_addr_t addr, phys_addr_t end)
444 {
445         struct kvm *kvm = mmu->kvm;
446         p4d_t *p4d;
447         phys_addr_t next;
448
449         p4d = stage2_p4d_offset(kvm, pgd, addr);
450         do {
451                 next = stage2_p4d_addr_end(kvm, addr, end);
452                 if (!stage2_p4d_none(kvm, *p4d))
453                         stage2_flush_puds(mmu, p4d, addr, next);
454         } while (p4d++, addr = next, addr != end);
455 }
456
457 static void stage2_flush_memslot(struct kvm *kvm,
458                                  struct kvm_memory_slot *memslot)
459 {
460         struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
461         phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
462         phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
463         phys_addr_t next;
464         pgd_t *pgd;
465
466         pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
467         do {
468                 next = stage2_pgd_addr_end(kvm, addr, end);
469                 if (!stage2_pgd_none(kvm, *pgd))
470                         stage2_flush_p4ds(mmu, pgd, addr, next);
471
472                 if (next != end)
473                         cond_resched_lock(&kvm->mmu_lock);
474         } while (pgd++, addr = next, addr != end);
475 }
476
477 /**
478  * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
479  * @kvm: The struct kvm pointer
480  *
481  * Go through the stage 2 page tables and invalidate any cache lines
482  * backing memory already mapped to the VM.
483  */
484 static void stage2_flush_vm(struct kvm *kvm)
485 {
486         struct kvm_memslots *slots;
487         struct kvm_memory_slot *memslot;
488         int idx;
489
490         idx = srcu_read_lock(&kvm->srcu);
491         spin_lock(&kvm->mmu_lock);
492
493         slots = kvm_memslots(kvm);
494         kvm_for_each_memslot(memslot, slots)
495                 stage2_flush_memslot(kvm, memslot);
496
497         spin_unlock(&kvm->mmu_lock);
498         srcu_read_unlock(&kvm->srcu, idx);
499 }
500
501 /**
502  * free_hyp_pgds - free Hyp-mode page tables
503  */
504 void free_hyp_pgds(void)
505 {
506         mutex_lock(&kvm_hyp_pgd_mutex);
507         if (hyp_pgtable) {
508                 kvm_pgtable_hyp_destroy(hyp_pgtable);
509                 kfree(hyp_pgtable);
510         }
511         mutex_unlock(&kvm_hyp_pgd_mutex);
512 }
513
514 static int __create_hyp_mappings(unsigned long start, unsigned long size,
515                                  unsigned long phys, enum kvm_pgtable_prot prot)
516 {
517         int err;
518
519         mutex_lock(&kvm_hyp_pgd_mutex);
520         err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot);
521         mutex_unlock(&kvm_hyp_pgd_mutex);
522
523         return err;
524 }
525
526 static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
527 {
528         if (!is_vmalloc_addr(kaddr)) {
529                 BUG_ON(!virt_addr_valid(kaddr));
530                 return __pa(kaddr);
531         } else {
532                 return page_to_phys(vmalloc_to_page(kaddr)) +
533                        offset_in_page(kaddr);
534         }
535 }
536
537 /**
538  * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
539  * @from:       The virtual kernel start address of the range
540  * @to:         The virtual kernel end address of the range (exclusive)
541  * @prot:       The protection to be applied to this range
542  *
543  * The same virtual address as the kernel virtual address is also used
544  * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
545  * physical pages.
546  */
547 int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
548 {
549         phys_addr_t phys_addr;
550         unsigned long virt_addr;
551         unsigned long start = kern_hyp_va((unsigned long)from);
552         unsigned long end = kern_hyp_va((unsigned long)to);
553
554         if (is_kernel_in_hyp_mode())
555                 return 0;
556
557         start = start & PAGE_MASK;
558         end = PAGE_ALIGN(end);
559
560         for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
561                 int err;
562
563                 phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
564                 err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr,
565                                             prot);
566                 if (err)
567                         return err;
568         }
569
570         return 0;
571 }
572
573 static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
574                                         unsigned long *haddr,
575                                         enum kvm_pgtable_prot prot)
576 {
577         unsigned long base;
578         int ret = 0;
579
580         mutex_lock(&kvm_hyp_pgd_mutex);
581
582         /*
583          * This assumes that we have enough space below the idmap
584          * page to allocate our VAs. If not, the check below will
585          * kick in. A potential alternative would be to detect that
586          * overflow and switch to an allocation above the idmap.
587          *
588          * The allocated size is always a multiple of PAGE_SIZE.
589          */
590         size = PAGE_ALIGN(size + offset_in_page(phys_addr));
591         base = io_map_base - size;
592
593         /*
594          * Verify that BIT(VA_BITS - 1) hasn't been flipped by
595          * allocating the new area, as it would indicate we've
596          * overflowed the idmap/IO address range.
597          */
598         if ((base ^ io_map_base) & BIT(VA_BITS - 1))
599                 ret = -ENOMEM;
600         else
601                 io_map_base = base;
602
603         mutex_unlock(&kvm_hyp_pgd_mutex);
604
605         if (ret)
606                 goto out;
607
608         ret = __create_hyp_mappings(base, size, phys_addr, prot);
609         if (ret)
610                 goto out;
611
612         *haddr = base + offset_in_page(phys_addr);
613 out:
614         return ret;
615 }
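/*
 * Illustrative sketch, not part of mmu.c: the downward "private VA" allocation
 * performed by __create_hyp_private_mapping() above, with the overflow check
 * in plain C. EX_VA_BITS and the initial ex_io_map_base value are assumptions
 * made purely for the example.
 */
#include <stdint.h>

#define EX_VA_BITS	48
#define EX_PAGE_SIZE	4096ULL

static uint64_t ex_io_map_base = 0x0000400000000000ULL;

/* Returns the private HYP VA for @phys, or 0 if the idmap/IO range overflowed. */
static uint64_t ex_alloc_private_va(uint64_t phys, uint64_t size)
{
	uint64_t offset = phys & (EX_PAGE_SIZE - 1);
	uint64_t base;

	/* round the allocation up to whole pages, including the page offset */
	size = (size + offset + EX_PAGE_SIZE - 1) & ~(EX_PAGE_SIZE - 1);
	base = ex_io_map_base - size;

	/* flipping bit (VA_BITS - 1) means the allocation overflowed the range */
	if ((base ^ ex_io_map_base) & (1ULL << (EX_VA_BITS - 1)))
		return 0;

	ex_io_map_base = base;
	return base + offset;
}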
616
617 /**
618  * create_hyp_io_mappings - Map IO into both kernel and HYP
619  * @phys_addr:  The physical start address which gets mapped
620  * @size:       Size of the region being mapped
621  * @kaddr:      Kernel VA for this mapping
622  * @haddr:      HYP VA for this mapping
623  */
624 int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
625                            void __iomem **kaddr,
626                            void __iomem **haddr)
627 {
628         unsigned long addr;
629         int ret;
630
631         *kaddr = ioremap(phys_addr, size);
632         if (!*kaddr)
633                 return -ENOMEM;
634
635         if (is_kernel_in_hyp_mode()) {
636                 *haddr = *kaddr;
637                 return 0;
638         }
639
640         ret = __create_hyp_private_mapping(phys_addr, size,
641                                            &addr, PAGE_HYP_DEVICE);
642         if (ret) {
643                 iounmap(*kaddr);
644                 *kaddr = NULL;
645                 *haddr = NULL;
646                 return ret;
647         }
648
649         *haddr = (void __iomem *)addr;
650         return 0;
651 }
652
653 /**
654  * create_hyp_exec_mappings - Map an executable range into HYP
655  * @phys_addr:  The physical start address which gets mapped
656  * @size:       Size of the region being mapped
657  * @haddr:      HYP VA for this mapping
658  */
659 int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
660                              void **haddr)
661 {
662         unsigned long addr;
663         int ret;
664
665         BUG_ON(is_kernel_in_hyp_mode());
666
667         ret = __create_hyp_private_mapping(phys_addr, size,
668                                            &addr, PAGE_HYP_EXEC);
669         if (ret) {
670                 *haddr = NULL;
671                 return ret;
672         }
673
674         *haddr = (void *)addr;
675         return 0;
676 }
677
678 /**
679  * kvm_init_stage2_mmu - Initialise a S2 MMU structure
680  * @kvm:        The pointer to the KVM structure
681  * @mmu:        The pointer to the s2 MMU structure
682  *
683  * Allocates only the stage-2 HW PGD level table(s).
684  * Note we don't need locking here as this is only called when the VM is
685  * created, which can only be done once.
686  */
687 int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
688 {
689         int cpu, err;
690         struct kvm_pgtable *pgt;
691
692         if (mmu->pgt != NULL) {
693                 kvm_err("kvm_arch already initialized?\n");
694                 return -EINVAL;
695         }
696
697         pgt = kzalloc(sizeof(*pgt), GFP_KERNEL);
698         if (!pgt)
699                 return -ENOMEM;
700
701         err = kvm_pgtable_stage2_init(pgt, kvm);
702         if (err)
703                 goto out_free_pgtable;
704
705         mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
706         if (!mmu->last_vcpu_ran) {
707                 err = -ENOMEM;
708                 goto out_destroy_pgtable;
709         }
710
711         for_each_possible_cpu(cpu)
712                 *per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;
713
714         mmu->kvm = kvm;
715         mmu->pgt = pgt;
716         mmu->pgd_phys = __pa(pgt->pgd);
717         mmu->pgd = (void *)pgt->pgd;
718         mmu->vmid.vmid_gen = 0;
719         return 0;
720
721 out_destroy_pgtable:
722         kvm_pgtable_stage2_destroy(pgt);
723 out_free_pgtable:
724         kfree(pgt);
725         return err;
726 }
727
728 static void stage2_unmap_memslot(struct kvm *kvm,
729                                  struct kvm_memory_slot *memslot)
730 {
731         hva_t hva = memslot->userspace_addr;
732         phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
733         phys_addr_t size = PAGE_SIZE * memslot->npages;
734         hva_t reg_end = hva + size;
735
736         /*
737          * A memory region could potentially cover multiple VMAs, and any holes
738          * between them, so iterate over all of them to find out if we should
739          * unmap any of them.
740          *
741          *     +--------------------------------------------+
742          * +---------------+----------------+   +----------------+
743          * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
744          * +---------------+----------------+   +----------------+
745          *     |               memory region                |
746          *     +--------------------------------------------+
747          */
748         do {
749                 struct vm_area_struct *vma = find_vma(current->mm, hva);
750                 hva_t vm_start, vm_end;
751
752                 if (!vma || vma->vm_start >= reg_end)
753                         break;
754
755                 /*
756                  * Take the intersection of this VMA with the memory region
757                  */
758                 vm_start = max(hva, vma->vm_start);
759                 vm_end = min(reg_end, vma->vm_end);
760
761                 if (!(vma->vm_flags & VM_PFNMAP)) {
762                         gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
763                         unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start);
764                 }
765                 hva = vm_end;
766         } while (hva < reg_end);
767 }
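/*
 * Illustrative sketch, not part of mmu.c: the memslot/VMA intersection taken
 * inside the loop of stage2_unmap_memslot() above, on plain numbers.
 */
#include <stdint.h>

struct ex_range { uint64_t start, end; };

/* Returns the overlap of @a and @b; start == end means "no overlap". */
static struct ex_range ex_intersect(struct ex_range a, struct ex_range b)
{
	struct ex_range r = {
		.start = a.start > b.start ? a.start : b.start,
		.end   = a.end   < b.end   ? a.end   : b.end,
	};

	if (r.end < r.start)
		r.end = r.start;
	return r;
}
/*
 * e.g. a memory region [0x1000, 0x9000) and a VMA [0x4000, 0xc000) overlap in
 * [0x4000, 0x9000), which, translated to guest physical addresses, is the
 * range handed to unmap_stage2_range().
 */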
768
769 /**
770  * stage2_unmap_vm - Unmap Stage-2 RAM mappings
771  * @kvm: The struct kvm pointer
772  *
773  * Go through the memregions and unmap any regular RAM
774  * backing memory already mapped to the VM.
775  */
776 void stage2_unmap_vm(struct kvm *kvm)
777 {
778         struct kvm_memslots *slots;
779         struct kvm_memory_slot *memslot;
780         int idx;
781
782         idx = srcu_read_lock(&kvm->srcu);
783         mmap_read_lock(current->mm);
784         spin_lock(&kvm->mmu_lock);
785
786         slots = kvm_memslots(kvm);
787         kvm_for_each_memslot(memslot, slots)
788                 stage2_unmap_memslot(kvm, memslot);
789
790         spin_unlock(&kvm->mmu_lock);
791         mmap_read_unlock(current->mm);
792         srcu_read_unlock(&kvm->srcu, idx);
793 }
794
795 void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
796 {
797         struct kvm *kvm = mmu->kvm;
798         struct kvm_pgtable *pgt = NULL;
799
800         spin_lock(&kvm->mmu_lock);
801         pgt = mmu->pgt;
802         if (pgt) {
803                 mmu->pgd = NULL;
804                 mmu->pgd_phys = 0;
805                 mmu->pgt = NULL;
806                 free_percpu(mmu->last_vcpu_ran);
807         }
808         spin_unlock(&kvm->mmu_lock);
809
810         if (pgt) {
811                 kvm_pgtable_stage2_destroy(pgt);
812                 kfree(pgt);
813         }
814 }
815
816 static p4d_t *stage2_get_p4d(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
817                              phys_addr_t addr)
818 {
819         struct kvm *kvm = mmu->kvm;
820         pgd_t *pgd;
821         p4d_t *p4d;
822
823         pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
824         if (stage2_pgd_none(kvm, *pgd)) {
825                 if (!cache)
826                         return NULL;
827                 p4d = kvm_mmu_memory_cache_alloc(cache);
828                 stage2_pgd_populate(kvm, pgd, p4d);
829                 get_page(virt_to_page(pgd));
830         }
831
832         return stage2_p4d_offset(kvm, pgd, addr);
833 }
834
835 static pud_t *stage2_get_pud(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
836                              phys_addr_t addr)
837 {
838         struct kvm *kvm = mmu->kvm;
839         p4d_t *p4d;
840         pud_t *pud;
841
842         p4d = stage2_get_p4d(mmu, cache, addr);
843         if (stage2_p4d_none(kvm, *p4d)) {
844                 if (!cache)
845                         return NULL;
846                 pud = kvm_mmu_memory_cache_alloc(cache);
847                 stage2_p4d_populate(kvm, p4d, pud);
848                 get_page(virt_to_page(p4d));
849         }
850
851         return stage2_pud_offset(kvm, p4d, addr);
852 }
853
854 static pmd_t *stage2_get_pmd(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
855                              phys_addr_t addr)
856 {
857         struct kvm *kvm = mmu->kvm;
858         pud_t *pud;
859         pmd_t *pmd;
860
861         pud = stage2_get_pud(mmu, cache, addr);
862         if (!pud || stage2_pud_huge(kvm, *pud))
863                 return NULL;
864
865         if (stage2_pud_none(kvm, *pud)) {
866                 if (!cache)
867                         return NULL;
868                 pmd = kvm_mmu_memory_cache_alloc(cache);
869                 stage2_pud_populate(kvm, pud, pmd);
870                 get_page(virt_to_page(pud));
871         }
872
873         return stage2_pmd_offset(kvm, pud, addr);
874 }
875
876 static int stage2_set_pmd_huge(struct kvm_s2_mmu *mmu,
877                                struct kvm_mmu_memory_cache *cache,
878                                phys_addr_t addr, const pmd_t *new_pmd)
879 {
880         pmd_t *pmd, old_pmd;
881
882 retry:
883         pmd = stage2_get_pmd(mmu, cache, addr);
884         VM_BUG_ON(!pmd);
885
886         old_pmd = *pmd;
887         /*
888          * Multiple vcpus faulting on the same PMD entry can
889          * lead to them sequentially updating the PMD with the
890          * same value. Following the break-before-make
891          * (pmd_clear() followed by tlb_flush()) process can
892          * hinder forward progress due to refaults generated
893          * on missing translations.
894          *
895          * Skip updating the page table if the entry is
896          * unchanged.
897          */
898         if (pmd_val(old_pmd) == pmd_val(*new_pmd))
899                 return 0;
900
901         if (pmd_present(old_pmd)) {
902                 /*
903                  * If we already have PTE level mapping for this block,
904                  * we must unmap it to avoid inconsistent TLB state and
905                  * leaking the table page. We could end up in this situation
906                  * if the memory slot was marked for dirty logging and was
907                  * reverted, leaving PTE level mappings for the pages accessed
908                  * during the period. So, unmap the PTE level mapping for this
909                  * block and retry, as we could have released the upper level
910                  * table in the process.
911                  *
912          * Normal THP split/merge follows mmu_notifier callbacks and gets
913          * handled accordingly.
914                  */
915                 if (!pmd_thp_or_huge(old_pmd)) {
916                         unmap_stage2_range(mmu, addr & S2_PMD_MASK, S2_PMD_SIZE);
917                         goto retry;
918                 }
919                 /*
920                  * Mapping in huge pages should only happen through a
921                  * fault.  If a page is merged into a transparent huge
922                  * page, the individual subpages of that huge page
923                  * should be unmapped through MMU notifiers before we
924                  * get here.
925                  *
926                  * Merging of CompoundPages is not supported; they
927          * should be split first, unmapped, merged,
928                  * and mapped back in on-demand.
929                  */
930                 WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
931                 pmd_clear(pmd);
932                 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
933         } else {
934                 get_page(virt_to_page(pmd));
935         }
936
937         kvm_set_pmd(pmd, *new_pmd);
938         return 0;
939 }
940
941 static int stage2_set_pud_huge(struct kvm_s2_mmu *mmu,
942                                struct kvm_mmu_memory_cache *cache,
943                                phys_addr_t addr, const pud_t *new_pudp)
944 {
945         struct kvm *kvm = mmu->kvm;
946         pud_t *pudp, old_pud;
947
948 retry:
949         pudp = stage2_get_pud(mmu, cache, addr);
950         VM_BUG_ON(!pudp);
951
952         old_pud = *pudp;
953
954         /*
955          * A large number of vcpus faulting on the same stage 2 entry
956          * can lead to a refault due to the stage2_pud_clear()/tlb_flush().
957          * Skip updating the page tables if there is no change.
958          */
959         if (pud_val(old_pud) == pud_val(*new_pudp))
960                 return 0;
961
962         if (stage2_pud_present(kvm, old_pud)) {
963                 /*
964                  * If we already have table level mapping for this block, unmap
965                  * the range for this block and retry.
966                  */
967                 if (!stage2_pud_huge(kvm, old_pud)) {
968                         unmap_stage2_range(mmu, addr & S2_PUD_MASK, S2_PUD_SIZE);
969                         goto retry;
970                 }
971
972                 WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp));
973                 stage2_pud_clear(kvm, pudp);
974                 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
975         } else {
976                 get_page(virt_to_page(pudp));
977         }
978
979         kvm_set_pud(pudp, *new_pudp);
980         return 0;
981 }
982
983 /*
984  * stage2_get_leaf_entry - walk the stage2 VM page tables and return
985  * true if a valid and present leaf-entry is found. A pointer to the
986  * leaf-entry is returned in the appropriate level variable - pudpp,
987  * pmdpp, ptepp.
988  */
989 static bool stage2_get_leaf_entry(struct kvm_s2_mmu *mmu, phys_addr_t addr,
990                                   pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp)
991 {
992         struct kvm *kvm = mmu->kvm;
993         pud_t *pudp;
994         pmd_t *pmdp;
995         pte_t *ptep;
996
997         *pudpp = NULL;
998         *pmdpp = NULL;
999         *ptepp = NULL;
1000
1001         pudp = stage2_get_pud(mmu, NULL, addr);
1002         if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp))
1003                 return false;
1004
1005         if (stage2_pud_huge(kvm, *pudp)) {
1006                 *pudpp = pudp;
1007                 return true;
1008         }
1009
1010         pmdp = stage2_pmd_offset(kvm, pudp, addr);
1011         if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp))
1012                 return false;
1013
1014         if (pmd_thp_or_huge(*pmdp)) {
1015                 *pmdpp = pmdp;
1016                 return true;
1017         }
1018
1019         ptep = pte_offset_kernel(pmdp, addr);
1020         if (!ptep || pte_none(*ptep) || !pte_present(*ptep))
1021                 return false;
1022
1023         *ptepp = ptep;
1024         return true;
1025 }
1026
1027 static bool stage2_is_exec(struct kvm_s2_mmu *mmu, phys_addr_t addr, unsigned long sz)
1028 {
1029         pud_t *pudp;
1030         pmd_t *pmdp;
1031         pte_t *ptep;
1032         bool found;
1033
1034         found = stage2_get_leaf_entry(mmu, addr, &pudp, &pmdp, &ptep);
1035         if (!found)
1036                 return false;
1037
1038         if (pudp)
1039                 return sz <= PUD_SIZE && kvm_s2pud_exec(pudp);
1040         else if (pmdp)
1041                 return sz <= PMD_SIZE && kvm_s2pmd_exec(pmdp);
1042         else
1043                 return sz == PAGE_SIZE && kvm_s2pte_exec(ptep);
1044 }
1045
1046 static int stage2_set_pte(struct kvm_s2_mmu *mmu,
1047                           struct kvm_mmu_memory_cache *cache,
1048                           phys_addr_t addr, const pte_t *new_pte,
1049                           unsigned long flags)
1050 {
1051         struct kvm *kvm = mmu->kvm;
1052         pud_t *pud;
1053         pmd_t *pmd;
1054         pte_t *pte, old_pte;
1055         bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
1056         bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;
1057
1058         VM_BUG_ON(logging_active && !cache);
1059
1060         /* Create stage-2 page table mapping - Levels 0 and 1 */
1061         pud = stage2_get_pud(mmu, cache, addr);
1062         if (!pud) {
1063                 /*
1064                  * Ignore calls from kvm_set_spte_hva for unallocated
1065                  * address ranges.
1066                  */
1067                 return 0;
1068         }
1069
1070         /*
1071          * While dirty page logging - dissolve huge PUD, then continue
1072          * on to allocate page.
1073          */
1074         if (logging_active)
1075                 stage2_dissolve_pud(mmu, addr, pud);
1076
1077         if (stage2_pud_none(kvm, *pud)) {
1078                 if (!cache)
1079                         return 0; /* ignore calls from kvm_set_spte_hva */
1080                 pmd = kvm_mmu_memory_cache_alloc(cache);
1081                 stage2_pud_populate(kvm, pud, pmd);
1082                 get_page(virt_to_page(pud));
1083         }
1084
1085         pmd = stage2_pmd_offset(kvm, pud, addr);
1086         if (!pmd) {
1087                 /*
1088                  * Ignore calls from kvm_set_spte_hva for unallocated
1089                  * address ranges.
1090                  */
1091                 return 0;
1092         }
1093
1094         /*
1095          * While dirty page logging - dissolve huge PMD, then continue on to
1096          * allocate page.
1097          */
1098         if (logging_active)
1099                 stage2_dissolve_pmd(mmu, addr, pmd);
1100
1101         /* Create stage-2 page mappings - Level 2 */
1102         if (pmd_none(*pmd)) {
1103                 if (!cache)
1104                         return 0; /* ignore calls from kvm_set_spte_hva */
1105                 pte = kvm_mmu_memory_cache_alloc(cache);
1106                 kvm_pmd_populate(pmd, pte);
1107                 get_page(virt_to_page(pmd));
1108         }
1109
1110         pte = pte_offset_kernel(pmd, addr);
1111
1112         if (iomap && pte_present(*pte))
1113                 return -EFAULT;
1114
1115         /* Create 2nd stage page table mapping - Level 3 */
1116         old_pte = *pte;
1117         if (pte_present(old_pte)) {
1118                 /* Skip page table update if there is no change */
1119                 if (pte_val(old_pte) == pte_val(*new_pte))
1120                         return 0;
1121
1122                 kvm_set_pte(pte, __pte(0));
1123                 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PTE_LEVEL);
1124         } else {
1125                 get_page(virt_to_page(pte));
1126         }
1127
1128         kvm_set_pte(pte, *new_pte);
1129         return 0;
1130 }
1131
1132 #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
1133 static int stage2_ptep_test_and_clear_young(pte_t *pte)
1134 {
1135         if (pte_young(*pte)) {
1136                 *pte = pte_mkold(*pte);
1137                 return 1;
1138         }
1139         return 0;
1140 }
1141 #else
1142 static int stage2_ptep_test_and_clear_young(pte_t *pte)
1143 {
1144         return __ptep_test_and_clear_young(pte);
1145 }
1146 #endif
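/*
 * Illustrative sketch, not part of mmu.c: the test-and-clear-young operation
 * above, reduced to a plain 64-bit descriptor. EX_AF stands in for the
 * hardware Access Flag; its bit position here is only an assumption for the
 * example.
 */
#include <stdbool.h>
#include <stdint.h>

#define EX_AF	(1ULL << 10)

static bool ex_test_and_clear_young(uint64_t *desc)
{
	if (!(*desc & EX_AF))
		return false;

	*desc &= ~EX_AF;	/* mark the entry "old" again */
	return true;
}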
1147
1148 static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
1149 {
1150         return stage2_ptep_test_and_clear_young((pte_t *)pmd);
1151 }
1152
1153 static int stage2_pudp_test_and_clear_young(pud_t *pud)
1154 {
1155         return stage2_ptep_test_and_clear_young((pte_t *)pud);
1156 }
1157
1158 /**
1159  * kvm_phys_addr_ioremap - map a device range to guest IPA
1160  *
1161  * @kvm:        The KVM pointer
1162  * @guest_ipa:  The IPA at which to insert the mapping
1163  * @pa:         The physical address of the device
1164  * @size:       The size of the mapping
1165  */
1166 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
1167                           phys_addr_t pa, unsigned long size, bool writable)
1168 {
1169         phys_addr_t addr;
1170         int ret = 0;
1171         struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, };
1172         struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
1173         enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
1174                                      KVM_PGTABLE_PROT_R |
1175                                      (writable ? KVM_PGTABLE_PROT_W : 0);
1176
1177         size += offset_in_page(guest_ipa);
1178         guest_ipa &= PAGE_MASK;
1179
1180         for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
1181                 ret = kvm_mmu_topup_memory_cache(&cache,
1182                                                  kvm_mmu_cache_min_pages(kvm));
1183                 if (ret)
1184                         break;
1185
1186                 spin_lock(&kvm->mmu_lock);
1187                 ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
1188                                              &cache);
1189                 spin_unlock(&kvm->mmu_lock);
1190                 if (ret)
1191                         break;
1192
1193                 pa += PAGE_SIZE;
1194         }
1195
1196         kvm_mmu_free_memory_cache(&cache);
1197         return ret;
1198 }
1199
1200 /**
1201  * stage2_wp_ptes - write protect PMD range
1202  * @pmd:        pointer to pmd entry
1203  * @addr:       range start address
1204  * @end:        range end address
1205  */
1206 static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
1207 {
1208         pte_t *pte;
1209
1210         pte = pte_offset_kernel(pmd, addr);
1211         do {
1212                 if (!pte_none(*pte)) {
1213                         if (!kvm_s2pte_readonly(pte))
1214                                 kvm_set_s2pte_readonly(pte);
1215                 }
1216         } while (pte++, addr += PAGE_SIZE, addr != end);
1217 }
1218
1219 /**
1220  * stage2_wp_pmds - write protect PUD range
1221  * @mmu:        stage-2 MMU structure of the VM
1222  * @pud:        pointer to pud entry
1223  * @addr:       range start address
1224  * @end:        range end address
1225  */
1226 static void stage2_wp_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
1227                            phys_addr_t addr, phys_addr_t end)
1228 {
1229         struct kvm *kvm = mmu->kvm;
1230         pmd_t *pmd;
1231         phys_addr_t next;
1232
1233         pmd = stage2_pmd_offset(kvm, pud, addr);
1234
1235         do {
1236                 next = stage2_pmd_addr_end(kvm, addr, end);
1237                 if (!pmd_none(*pmd)) {
1238                         if (pmd_thp_or_huge(*pmd)) {
1239                                 if (!kvm_s2pmd_readonly(pmd))
1240                                         kvm_set_s2pmd_readonly(pmd);
1241                         } else {
1242                                 stage2_wp_ptes(pmd, addr, next);
1243                         }
1244                 }
1245         } while (pmd++, addr = next, addr != end);
1246 }
1247
1248 /**
1249  * stage2_wp_puds - write protect P4D range
1250  * @p4d:        pointer to p4d entry
1251  * @addr:       range start address
1252  * @end:        range end address
1253  */
1254 static void  stage2_wp_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
1255                             phys_addr_t addr, phys_addr_t end)
1256 {
1257         struct kvm *kvm = mmu->kvm;
1258         pud_t *pud;
1259         phys_addr_t next;
1260
1261         pud = stage2_pud_offset(kvm, p4d, addr);
1262         do {
1263                 next = stage2_pud_addr_end(kvm, addr, end);
1264                 if (!stage2_pud_none(kvm, *pud)) {
1265                         if (stage2_pud_huge(kvm, *pud)) {
1266                                 if (!kvm_s2pud_readonly(pud))
1267                                         kvm_set_s2pud_readonly(pud);
1268                         } else {
1269                                 stage2_wp_pmds(mmu, pud, addr, next);
1270                         }
1271                 }
1272         } while (pud++, addr = next, addr != end);
1273 }
1274
1275 /**
1276  * stage2_wp_p4ds - write protect PGD range
1277  * @pgd:        pointer to pgd entry
1278  * @addr:       range start address
1279  * @end:        range end address
1280  */
1281 static void  stage2_wp_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
1282                             phys_addr_t addr, phys_addr_t end)
1283 {
1284         struct kvm *kvm = mmu->kvm;
1285         p4d_t *p4d;
1286         phys_addr_t next;
1287
1288         p4d = stage2_p4d_offset(kvm, pgd, addr);
1289         do {
1290                 next = stage2_p4d_addr_end(kvm, addr, end);
1291                 if (!stage2_p4d_none(kvm, *p4d))
1292                         stage2_wp_puds(mmu, p4d, addr, next);
1293         } while (p4d++, addr = next, addr != end);
1294 }
1295
1296 /**
1297  * stage2_wp_range() - write protect stage2 memory region range
1298  * @mmu:        The KVM stage-2 MMU pointer
1299  * @addr:       Start address of range
1300  * @end:        End address of range
1301  */
1302 static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
1303 {
1304         struct kvm *kvm = mmu->kvm;
1305         pgd_t *pgd;
1306         phys_addr_t next;
1307
1308         pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
1309         do {
1310                 /*
1311                  * Release kvm_mmu_lock periodically if the memory region is
1312                  * large. Otherwise, we may see kernel panics with
1313                  * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR,
1314                  * CONFIG_LOCKDEP. Additionally, holding the lock too long
1315                  * will also starve other vCPUs. We also have to make sure
1316                  * that the page tables are not freed while the lock is
1317                  * released.
1318                  */
1319                 cond_resched_lock(&kvm->mmu_lock);
1320                 if (!READ_ONCE(mmu->pgd))
1321                         break;
1322                 next = stage2_pgd_addr_end(kvm, addr, end);
1323                 if (stage2_pgd_present(kvm, *pgd))
1324                         stage2_wp_p4ds(mmu, pgd, addr, next);
1325         } while (pgd++, addr = next, addr != end);
1326 }
1327
1328 /**
1329  * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
1330  * @kvm:        The KVM pointer
1331  * @slot:       The memory slot to write protect
1332  *
1333  * Called to start logging dirty pages after memory region
1334  * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns
1335  * all present PUD, PMD and PTEs are write protected in the memory region.
1336  * Afterwards, the dirty page log can be read.
1337  *
1338  * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
1339  * serializing operations for VM memory regions.
1340  */
1341 void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
1342 {
1343         struct kvm_memslots *slots = kvm_memslots(kvm);
1344         struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
1345         phys_addr_t start, end;
1346
1347         if (WARN_ON_ONCE(!memslot))
1348                 return;
1349
1350         start = memslot->base_gfn << PAGE_SHIFT;
1351         end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
1352
1353         spin_lock(&kvm->mmu_lock);
1354         stage2_wp_range(&kvm->arch.mmu, start, end);
1355         spin_unlock(&kvm->mmu_lock);
1356         kvm_flush_remote_tlbs(kvm);
1357 }
1358
1359 /**
1360  * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
1361  * @kvm:        The KVM pointer
1362  * @slot:       The memory slot associated with mask
1363  * @gfn_offset: The gfn offset in memory slot
1364  * @mask:       The mask of dirty pages at offset 'gfn_offset' in this memory
1365  *              slot to be write protected
1366  *
1367  * Walks the bits set in @mask and write protects the associated PTEs. The
1368  * caller must acquire kvm_mmu_lock.
1369  */
1370 static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1371                 struct kvm_memory_slot *slot,
1372                 gfn_t gfn_offset, unsigned long mask)
1373 {
1374         phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
1375         phys_addr_t start = (base_gfn +  __ffs(mask)) << PAGE_SHIFT;
1376         phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
1377
1378         stage2_wp_range(&kvm->arch.mmu, start, end);
1379 }
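/*
 * Illustrative sketch, not part of mmu.c: turning one word of the dirty
 * bitmap into an IPA range, as kvm_mmu_write_protect_pt_masked() above does.
 * GCC builtins stand in for the kernel's __ffs()/__fls(); @mask must be
 * non-zero.
 */
#include <stdint.h>

#define EX_PAGE_SHIFT	12	/* assumed 4K pages */

static void ex_mask_to_range(uint64_t base_gfn, uint64_t mask,
			     uint64_t *start, uint64_t *end)
{
	unsigned int first = __builtin_ctzll(mask);		/* ~ __ffs() */
	unsigned int last = 63 - __builtin_clzll(mask);		/* ~ __fls() */

	*start = (base_gfn + first) << EX_PAGE_SHIFT;
	*end = (base_gfn + last + 1) << EX_PAGE_SHIFT;
}
/*
 * e.g. base_gfn = 0x100 and mask = 0xc (gfns 0x102 and 0x103) give the IPA
 * range [0x102000, 0x104000).
 */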
1380
1381 /*
1382  * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
1383  * dirty pages.
1384  *
1385  * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
1386  * enable dirty logging for them.
1387  */
1388 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1389                 struct kvm_memory_slot *slot,
1390                 gfn_t gfn_offset, unsigned long mask)
1391 {
1392         kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1393 }
1394
1395 static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
1396 {
1397         __clean_dcache_guest_page(pfn, size);
1398 }
1399
1400 static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
1401 {
1402         __invalidate_icache_guest_page(pfn, size);
1403 }
1404
1405 static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
1406 {
1407         send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
1408 }
1409
1410 static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
1411                                                unsigned long hva,
1412                                                unsigned long map_size)
1413 {
1414         gpa_t gpa_start;
1415         hva_t uaddr_start, uaddr_end;
1416         size_t size;
1417
1418         /* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */
1419         if (map_size == PAGE_SIZE)
1420                 return true;
1421
1422         size = memslot->npages * PAGE_SIZE;
1423
1424         gpa_start = memslot->base_gfn << PAGE_SHIFT;
1425
1426         uaddr_start = memslot->userspace_addr;
1427         uaddr_end = uaddr_start + size;
1428
1429         /*
1430          * Pages belonging to memslots that don't have the same alignment
1431          * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
1432          * PMD/PUD entries, because we'll end up mapping the wrong pages.
1433          *
1434          * Consider a layout like the following:
1435          *
1436          *    memslot->userspace_addr:
1437          *    +-----+--------------------+--------------------+---+
1438          *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
1439          *    +-----+--------------------+--------------------+---+
1440          *
1441          *    memslot->base_gfn << PAGE_SHIFT:
1442          *      +---+--------------------+--------------------+-----+
1443          *      |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
1444          *      +---+--------------------+--------------------+-----+
1445          *
1446          * If we create those stage-2 blocks, we'll end up with this incorrect
1447          * mapping:
1448          *   d -> f
1449          *   e -> g
1450          *   f -> h
1451          */
1452         if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
1453                 return false;
1454
1455         /*
1456          * Next, let's make sure we're not trying to map anything not covered
1457          * by the memslot. This means we have to prohibit block size mappings
1458          * for the beginning and end of a non-block aligned and non-block sized
1459          * memory slot (illustrated by the head and tail parts of the
1460          * userspace view above containing pages 'abcde' and 'xyz',
1461          * respectively).
1462          *
1463          * Note that it doesn't matter if we do the check using the
1464          * userspace_addr or the base_gfn, as both are equally aligned (per
1465          * the check above) and equally sized.
1466          */
1467         return (hva & ~(map_size - 1)) >= uaddr_start &&
1468                (hva & ~(map_size - 1)) + map_size <= uaddr_end;
1469 }
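/*
 * Illustrative sketch, not part of mmu.c: the two conditions checked by
 * fault_supports_stage2_huge_mapping() above, in isolation. A 2MB block size
 * (4K pages) is assumed for the worked example below.
 */
#include <stdbool.h>
#include <stdint.h>

static bool ex_supports_block(uint64_t gpa_start, uint64_t uaddr_start,
			      uint64_t uaddr_end, uint64_t hva,
			      uint64_t map_size)
{
	/* userspace VA and IPA must share the same offset within a block */
	if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
		return false;

	/* the whole block around @hva must be covered by the memslot */
	return (hva & ~(map_size - 1)) >= uaddr_start &&
	       (hva & ~(map_size - 1)) + map_size <= uaddr_end;
}
/*
 * e.g. uaddr_start = 0x40200000 with gpa_start = 0x80200000 keeps both sides
 * at the same 2MB offset, so block mappings are allowed away from the slot's
 * edges; shifting either address by 4K makes the first check fail.
 */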
1470
1471 /*
1472  * Check if the given hva is backed by a transparent huge page (THP) and
1473  * whether it can be mapped using block mapping in stage2. If so, adjust
1474  * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently
1475  * supported. This will need to be updated to support other THP sizes.
1476  *
1477  * Returns the size of the mapping.
1478  */
1479 static unsigned long
1480 transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
1481                             unsigned long hva, kvm_pfn_t *pfnp,
1482                             phys_addr_t *ipap)
1483 {
1484         kvm_pfn_t pfn = *pfnp;
1485
1486         /*
1487          * Make sure the adjustment is done only for THP pages. Also make
1488          * sure that the HVA and IPA are sufficiently aligned and that the
1489          * block map is contained within the memslot.
1490          */
1491         if (kvm_is_transparent_hugepage(pfn) &&
1492             fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
1493                 /*
1494                  * The address we faulted on is backed by a transparent huge
1495                  * page.  However, because we map the compound huge page and
1496                  * not the individual tail page, we need to transfer the
1497                  * refcount to the head page.  We have to be careful that the
1498                  * THP doesn't start to split while we are adjusting the
1499                  * refcounts.
1500                  *
1501                  * We are sure this doesn't happen, because mmu_notifier_retry
1502                  * was successful and we are holding the mmu_lock, so if this
1503                  * THP is trying to split, it will be blocked in the mmu
1504                  * notifier before touching any of the pages, specifically
1505                  * before being able to call __split_huge_page_refcount().
1506                  *
1507                  * We can therefore safely transfer the refcount from PG_tail
1508                  * to PG_head and switch the pfn from a tail page to the head
1509                  * page accordingly.
1510                  */
1511                 *ipap &= PMD_MASK;
1512                 kvm_release_pfn_clean(pfn);
1513                 pfn &= ~(PTRS_PER_PMD - 1);
1514                 kvm_get_pfn(pfn);
1515                 *pfnp = pfn;
1516
1517                 return PMD_SIZE;
1518         }
1519
1520         /* Use page mapping if we cannot use block mapping. */
1521         return PAGE_SIZE;
1522 }
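/*
 * Illustrative sketch, not part of mmu.c: the PFN/IPA rounding performed by
 * transparent_hugepage_adjust() above, assuming 4K pages (PTRS_PER_PMD = 512,
 * PMD_SIZE = 2MB).
 */
#include <stdint.h>

#define EX_PTRS_PER_PMD	512ULL
#define EX_PMD_MASK	(~((2ULL << 20) - 1))

static void ex_thp_adjust(uint64_t *pfn, uint64_t *ipa)
{
	*ipa &= EX_PMD_MASK;			/* IPA of the 2MB block */
	*pfn &= ~(EX_PTRS_PER_PMD - 1);		/* head page of the THP */
}
/* e.g. pfn 0x12346 / ipa 0x80246000 become pfn 0x12200 / ipa 0x80200000. */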
1523
1524 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
1525                           struct kvm_memory_slot *memslot, unsigned long hva,
1526                           unsigned long fault_status)
1527 {
1528         int ret;
1529         bool write_fault, writable, force_pte = false;
1530         bool exec_fault, needs_exec;
1531         unsigned long mmu_seq;
1532         gfn_t gfn = fault_ipa >> PAGE_SHIFT;
1533         struct kvm *kvm = vcpu->kvm;
1534         struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
1535         struct vm_area_struct *vma;
1536         short vma_shift;
1537         kvm_pfn_t pfn;
1538         pgprot_t mem_type = PAGE_S2;
1539         bool logging_active = memslot_is_logging(memslot);
1540         unsigned long vma_pagesize, flags = 0;
1541         struct kvm_s2_mmu *mmu = vcpu->arch.hw_mmu;
1542
1543         write_fault = kvm_is_write_fault(vcpu);
1544         exec_fault = kvm_vcpu_trap_is_iabt(vcpu);
1545         VM_BUG_ON(write_fault && exec_fault);
1546
1547         if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
1548                 kvm_err("Unexpected L2 read permission error\n");
1549                 return -EFAULT;
1550         }
1551
1552         /* Let's check if we will get back a huge page backed by hugetlbfs */
1553         mmap_read_lock(current->mm);
1554         vma = find_vma_intersection(current->mm, hva, hva + 1);
1555         if (unlikely(!vma)) {
1556                 kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
1557                 mmap_read_unlock(current->mm);
1558                 return -EFAULT;
1559         }
1560
1561         if (is_vm_hugetlb_page(vma))
1562                 vma_shift = huge_page_shift(hstate_vma(vma));
1563         else
1564                 vma_shift = PAGE_SHIFT;
1565
1566         vma_pagesize = 1ULL << vma_shift;
1567         if (logging_active ||
1568             (vma->vm_flags & VM_PFNMAP) ||
1569             !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
1570                 force_pte = true;
1571                 vma_pagesize = PAGE_SIZE;
1572         }
1573
1574         /*
1575          * The stage2 tables have a minimum of 2 levels (for arm64, see
1576          * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can
1577          * use PMD_SIZE huge mappings (even when the PMD is folded into the
1578          * PGD). As for PUD huge mappings, we must make sure that we have
1579          * at least 3 levels, i.e. that the PMD is not folded.
1580          */
1581         if (vma_pagesize == PMD_SIZE ||
1582             (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm)))
1583                 gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
1584         mmap_read_unlock(current->mm);
1585
1586         /* We need minimum second+third level pages */
1587         ret = kvm_mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm));
1588         if (ret)
1589                 return ret;
1590
1591         mmu_seq = vcpu->kvm->mmu_notifier_seq;
1592         /*
1593          * Ensure the read of mmu_notifier_seq happens before we call
1594          * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
1595          * the page we just got a reference to getting unmapped before we have a
1596          * chance to grab the mmu_lock, which ensures that if the page gets
1597          * unmapped afterwards, the call to kvm_unmap_hva will take it away
1598          * from us again properly. This smp_rmb() interacts with the smp_wmb()
1599          * in kvm_mmu_notifier_invalidate_<page|range_end>.
1600          */
1601         smp_rmb();
1602
1603         pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
1604         if (pfn == KVM_PFN_ERR_HWPOISON) {
1605                 kvm_send_hwpoison_signal(hva, vma_shift);
1606                 return 0;
1607         }
1608         if (is_error_noslot_pfn(pfn))
1609                 return -EFAULT;
1610
1611         if (kvm_is_device_pfn(pfn)) {
1612                 mem_type = PAGE_S2_DEVICE;
1613                 flags |= KVM_S2PTE_FLAG_IS_IOMAP;
1614         } else if (logging_active) {
1615                 /*
1616                  * Pages in a memslot with logging enabled should not be
1617                  * mapped with huge pages (that would introduce churn and
1618                  * performance degradation), so force a pte mapping.
1619                  */
1620                 flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
1621
1622                 /*
1623                  * Only actually map the page as writable if this was a write
1624                  * fault.
1625                  */
1626                 if (!write_fault)
1627                         writable = false;
1628         }
1629
1630         if (exec_fault && is_iomap(flags))
1631                 return -ENOEXEC;
1632
1633         spin_lock(&kvm->mmu_lock);
1634         if (mmu_notifier_retry(kvm, mmu_seq))
1635                 goto out_unlock;
1636
1637         /*
1638          * If we are not forced to use page mapping, check if we are
1639          * backed by a THP and thus use block mapping if possible.
1640          */
1641         if (vma_pagesize == PAGE_SIZE && !force_pte)
1642                 vma_pagesize = transparent_hugepage_adjust(memslot, hva,
1643                                                            &pfn, &fault_ipa);
1644         if (writable)
1645                 kvm_set_pfn_dirty(pfn);
1646
1647         if (fault_status != FSC_PERM && !is_iomap(flags))
1648                 clean_dcache_guest_page(pfn, vma_pagesize);
1649
1650         if (exec_fault)
1651                 invalidate_icache_guest_page(pfn, vma_pagesize);
1652
1653         /*
1654          * If we took an execution fault we have made the
1655          * icache/dcache coherent above and should now let the s2
1656          * mapping be executable.
1657          *
1658          * Write faults (!exec_fault && FSC_PERM) are orthogonal to
1659          * execute permissions, and we preserve whatever we have.
1660          */
1661         needs_exec = exec_fault ||
1662                 (fault_status == FSC_PERM &&
1663                  stage2_is_exec(mmu, fault_ipa, vma_pagesize));
1664
1665         if (vma_pagesize == PUD_SIZE) {
1666                 pud_t new_pud = kvm_pfn_pud(pfn, mem_type);
1667
1668                 new_pud = kvm_pud_mkhuge(new_pud);
1669                 if (writable)
1670                         new_pud = kvm_s2pud_mkwrite(new_pud);
1671
1672                 if (needs_exec)
1673                         new_pud = kvm_s2pud_mkexec(new_pud);
1674
1675                 ret = stage2_set_pud_huge(mmu, memcache, fault_ipa, &new_pud);
1676         } else if (vma_pagesize == PMD_SIZE) {
1677                 pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type);
1678
1679                 new_pmd = kvm_pmd_mkhuge(new_pmd);
1680
1681                 if (writable)
1682                         new_pmd = kvm_s2pmd_mkwrite(new_pmd);
1683
1684                 if (needs_exec)
1685                         new_pmd = kvm_s2pmd_mkexec(new_pmd);
1686
1687                 ret = stage2_set_pmd_huge(mmu, memcache, fault_ipa, &new_pmd);
1688         } else {
1689                 pte_t new_pte = kvm_pfn_pte(pfn, mem_type);
1690
1691                 if (writable) {
1692                         new_pte = kvm_s2pte_mkwrite(new_pte);
1693                         mark_page_dirty(kvm, gfn);
1694                 }
1695
1696                 if (needs_exec)
1697                         new_pte = kvm_s2pte_mkexec(new_pte);
1698
1699                 ret = stage2_set_pte(mmu, memcache, fault_ipa, &new_pte, flags);
1700         }
1701
1702 out_unlock:
1703         spin_unlock(&kvm->mmu_lock);
1704         kvm_set_pfn_accessed(pfn);
1705         kvm_release_pfn_clean(pfn);
1706         return ret;
1707 }
1708
1709 /*
1710  * Resolve the access fault by making the page young again.
1711  * Note that because the faulting entry is guaranteed not to be
1712  * cached in the TLB, we don't need to invalidate anything.
1713  * Only the HW Access Flag updates are supported for Stage 2 (no DBM),
1714  * so there is no need for atomic (pte|pmd)_mkyoung operations.
1715  */
1716 static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
1717 {
1718         pud_t *pud;
1719         pmd_t *pmd;
1720         pte_t *pte;
1721         kvm_pfn_t pfn;
1722         bool pfn_valid = false;
1723
1724         trace_kvm_access_fault(fault_ipa);
1725
1726         spin_lock(&vcpu->kvm->mmu_lock);
1727
1728         if (!stage2_get_leaf_entry(vcpu->arch.hw_mmu, fault_ipa, &pud, &pmd, &pte))
1729                 goto out;
1730
1731         if (pud) {              /* HugeTLB */
1732                 *pud = kvm_s2pud_mkyoung(*pud);
1733                 pfn = kvm_pud_pfn(*pud);
1734                 pfn_valid = true;
1735         } else  if (pmd) {      /* THP, HugeTLB */
1736                 *pmd = pmd_mkyoung(*pmd);
1737                 pfn = pmd_pfn(*pmd);
1738                 pfn_valid = true;
1739         } else {
1740                 *pte = pte_mkyoung(*pte);       /* Just a page... */
1741                 pfn = pte_pfn(*pte);
1742                 pfn_valid = true;
1743         }
1744
1745 out:
1746         spin_unlock(&vcpu->kvm->mmu_lock);
1747         if (pfn_valid)
1748                 kvm_set_pfn_accessed(pfn);
1749 }
1750
1751 /**
1752  * kvm_handle_guest_abort - handles all 2nd stage aborts
1753  * @vcpu:       the VCPU pointer
1754  *
1755  * Any abort that gets to the host is almost guaranteed to be caused by a
1756  * missing second stage translation table entry, which can mean either that
1757  * the guest simply needs more memory and we must allocate an appropriate
1758  * page, or that the guest tried to access I/O memory, which is emulated by
1759  * user space. The distinction is based on the IPA causing the fault and
1760  * whether this memory region has been registered as standard RAM by user space.
1761  */
1762 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
1763 {
1764         unsigned long fault_status;
1765         phys_addr_t fault_ipa;
1766         struct kvm_memory_slot *memslot;
1767         unsigned long hva;
1768         bool is_iabt, write_fault, writable;
1769         gfn_t gfn;
1770         int ret, idx;
1771
1772         fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
1773
1774         fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
1775         is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
1776
1777         /* Synchronous External Abort? */
1778         if (kvm_vcpu_abt_issea(vcpu)) {
1779                 /*
1780                  * For RAS the host kernel may handle this abort.
1781                  * There is no need to pass the error into the guest.
1782                  */
1783                 if (kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_esr(vcpu)))
1784                         kvm_inject_vabt(vcpu);
1785
1786                 return 1;
1787         }
1788
1789         trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
1790                               kvm_vcpu_get_hfar(vcpu), fault_ipa);
1791
1792         /* Check that the stage-2 fault is a translation, permission or access fault */
1793         if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
1794             fault_status != FSC_ACCESS) {
1795                 kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
1796                         kvm_vcpu_trap_get_class(vcpu),
1797                         (unsigned long)kvm_vcpu_trap_get_fault(vcpu),
1798                         (unsigned long)kvm_vcpu_get_esr(vcpu));
1799                 return -EFAULT;
1800         }
1801
1802         idx = srcu_read_lock(&vcpu->kvm->srcu);
1803
1804         gfn = fault_ipa >> PAGE_SHIFT;
1805         memslot = gfn_to_memslot(vcpu->kvm, gfn);
1806         hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
1807         write_fault = kvm_is_write_fault(vcpu);
1808         if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
1809                 /*
1810                  * The guest has put either its instructions or its page-tables
1811                  * somewhere it shouldn't have. Userspace won't be able to do
1812                  * anything about this (there's no syndrome for a start), so
1813                  * re-inject the abort back into the guest.
1814                  */
1815                 if (is_iabt) {
1816                         ret = -ENOEXEC;
1817                         goto out;
1818                 }
1819
1820                 if (kvm_vcpu_dabt_iss1tw(vcpu)) {
1821                         kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
1822                         ret = 1;
1823                         goto out_unlock;
1824                 }
1825
1826                 /*
1827                  * Check for a cache maintenance operation. Since we
1828                  * ended up here, we know it is outside of any memory
1829                  * slot. But we can't find out if that is for a device,
1830                  * or if the guest is just being stupid. The only thing
1831                  * we know for sure is that this range cannot be cached.
1832                  *
1833                  * So let's assume that the guest is just being
1834                  * cautious, and skip the instruction.
1835                  */
1836                 if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) {
1837                         kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
1838                         ret = 1;
1839                         goto out_unlock;
1840                 }
1841
1842                 /*
1843                  * The IPA is reported as [MAX:12], so we need to
1844                  * complement it with the bottom 12 bits from the
1845                  * faulting VA. This is always 12 bits, irrespective
1846                  * of the page size.
1847                  */
1848                 fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
1849                 ret = io_mem_abort(vcpu, fault_ipa);
1850                 goto out_unlock;
1851         }
1852
1853         /* Userspace should not be able to register out-of-bounds IPAs */
1854         VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm));
1855
1856         if (fault_status == FSC_ACCESS) {
1857                 handle_access_fault(vcpu, fault_ipa);
1858                 ret = 1;
1859                 goto out_unlock;
1860         }
1861
1862         ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
1863         if (ret == 0)
1864                 ret = 1;
1865 out:
1866         if (ret == -ENOEXEC) {
1867                 kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
1868                 ret = 1;
1869         }
1870 out_unlock:
1871         srcu_read_unlock(&vcpu->kvm->srcu, idx);
1872         return ret;
1873 }
1874
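/*
 * Iterate over every memslot that overlaps the hva range [start, end) and
 * invoke @handler on the guest-physical range each intersection maps to.
 * The handlers' return values are OR'ed together and returned to the caller.
 */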
1875 static int handle_hva_to_gpa(struct kvm *kvm,
1876                              unsigned long start,
1877                              unsigned long end,
1878                              int (*handler)(struct kvm *kvm,
1879                                             gpa_t gpa, u64 size,
1880                                             void *data),
1881                              void *data)
1882 {
1883         struct kvm_memslots *slots;
1884         struct kvm_memory_slot *memslot;
1885         int ret = 0;
1886
1887         slots = kvm_memslots(kvm);
1888
1889         /* we only care about the pages that the guest sees */
1890         kvm_for_each_memslot(memslot, slots) {
1891                 unsigned long hva_start, hva_end;
1892                 gfn_t gpa;
1893
1894                 hva_start = max(start, memslot->userspace_addr);
1895                 hva_end = min(end, memslot->userspace_addr +
1896                                         (memslot->npages << PAGE_SHIFT));
1897                 if (hva_start >= hva_end)
1898                         continue;
1899
1900                 gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT;
1901                 ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data);
1902         }
1903
1904         return ret;
1905 }
1906
1907 static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
1908 {
1909         unsigned flags = *(unsigned *)data;
1910         bool may_block = flags & MMU_NOTIFIER_RANGE_BLOCKABLE;
1911
1912         __unmap_stage2_range(&kvm->arch.mmu, gpa, size, may_block);
1913         return 0;
1914 }
1915
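/*
 * MMU notifier callback: tear down the stage-2 mappings backing the hva
 * range [start, end). The underlying unmap walker may only block when
 * MMU_NOTIFIER_RANGE_BLOCKABLE is set in @flags.
 */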
1916 int kvm_unmap_hva_range(struct kvm *kvm,
1917                         unsigned long start, unsigned long end, unsigned flags)
1918 {
1919         if (!kvm->arch.mmu.pgd)
1920                 return 0;
1921
1922         trace_kvm_unmap_hva_range(start, end);
1923         handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, &flags);
1924         return 0;
1925 }
1926
1927 static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
1928 {
1929         kvm_pfn_t *pfn = (kvm_pfn_t *)data;
1930
1931         WARN_ON(size != PAGE_SIZE);
1932
1933         /*
1934          * The MMU notifiers will have unmapped a huge PMD before calling
1935          * ->change_pte() (which in turn calls kvm_set_spte_hva()) and
1936          * therefore we never need to clear out a huge PMD through this
1937          * calling path and a memcache is not required.
1938          * calling path, so a memcache is not required.
1939         kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, gpa, PAGE_SIZE,
1940                                __pfn_to_phys(*pfn), KVM_PGTABLE_PROT_R, NULL);
1941         return 0;
1942 }
1943
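/*
 * MMU notifier ->change_pte() callback: the host pte backing @hva has been
 * changed (typically by CoW), so clean the new page to the PoC and install
 * a read-only stage-2 mapping for it.
 */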
1944 int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
1945 {
1946         unsigned long end = hva + PAGE_SIZE;
1947         kvm_pfn_t pfn = pte_pfn(pte);
1948
1949         if (!kvm->arch.mmu.pgt)
1950                 return 0;
1951
1952         trace_kvm_set_spte_hva(hva);
1953
1954         /*
1955          * We've moved a page around, probably through CoW, so let's treat it
1956          * just like a translation fault and clean the cache to the PoC.
1957          */
1958         clean_dcache_guest_page(pfn, PAGE_SIZE);
1959         handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pfn);
1960         return 0;
1961 }
1962
1963 static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
1964 {
1965         pud_t *pud;
1966         pmd_t *pmd;
1967         pte_t *pte;
1968
1969         WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
1970         if (!stage2_get_leaf_entry(&kvm->arch.mmu, gpa, &pud, &pmd, &pte))
1971                 return 0;
1972
1973         if (pud)
1974                 return stage2_pudp_test_and_clear_young(pud);
1975         else if (pmd)
1976                 return stage2_pmdp_test_and_clear_young(pmd);
1977         else
1978                 return stage2_ptep_test_and_clear_young(pte);
1979 }
1980
1981 static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
1982 {
1983         pud_t *pud;
1984         pmd_t *pmd;
1985         pte_t *pte;
1986
1987         WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
1988         if (!stage2_get_leaf_entry(&kvm->arch.mmu, gpa, &pud, &pmd, &pte))
1989                 return 0;
1990
1991         if (pud)
1992                 return kvm_s2pud_young(*pud);
1993         else if (pmd)
1994                 return pmd_young(*pmd);
1995         else
1996                 return pte_young(*pte);
1997 }
1998
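/*
 * MMU notifier aging callback: test and clear the stage-2 access flag for
 * the mappings backing [start, end), returning non-zero if any of them
 * was young.
 */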
1999 int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
2000 {
2001         if (!kvm->arch.mmu.pgd)
2002                 return 0;
2003         trace_kvm_age_hva(start, end);
2004         return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
2005 }
2006
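/*
 * MMU notifier aging callback: report whether the stage-2 mapping backing
 * @hva is young, without clearing the access flag.
 */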
2007 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
2008 {
2009         if (!kvm->arch.mmu.pgd)
2010                 return 0;
2011         trace_kvm_test_age_hva(hva);
2012         return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE,
2013                                  kvm_test_age_hva_handler, NULL);
2014 }
2015
2016 phys_addr_t kvm_mmu_get_httbr(void)
2017 {
2018         return __pa(hyp_pgtable->pgd);
2019 }
2020
2021 phys_addr_t kvm_get_idmap_vector(void)
2022 {
2023         return hyp_idmap_vector;
2024 }
2025
2026 static int kvm_map_idmap_text(void)
2027 {
2028         unsigned long size = hyp_idmap_end - hyp_idmap_start;
2029         int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start,
2030                                         PAGE_HYP_EXEC);
2031         if (err)
2032                 kvm_err("Failed to idmap %lx-%lx\n",
2033                         hyp_idmap_start, hyp_idmap_end);
2034
2035         return err;
2036 }
2037
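/*
 * One-time initialisation of the hyp MMU state: compute the idmap text
 * range and the init vector, check that the idmap does not collide with
 * the hyp VA range, then allocate and initialise the hyp page-table and
 * map the idmap text into it with executable permissions.
 */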
2038 int kvm_mmu_init(void)
2039 {
2040         int err;
2041         u32 hyp_va_bits;
2042
2043         hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
2044         hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
2045         hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end);
2046         hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
2047         hyp_idmap_vector = __pa_symbol(__kvm_hyp_init);
2048
2049         /*
2050          * We rely on the linker script to ensure at build time that the HYP
2051          * init code does not cross a page boundary.
2052          */
2053         BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
2054
2055         hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
2056         kvm_debug("Using %u-bit virtual addresses at EL2\n", hyp_va_bits);
2057         kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
2058         kvm_debug("HYP VA range: %lx:%lx\n",
2059                   kern_hyp_va(PAGE_OFFSET),
2060                   kern_hyp_va((unsigned long)high_memory - 1));
2061
2062         if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
2063             hyp_idmap_start <  kern_hyp_va((unsigned long)high_memory - 1) &&
2064             hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
2065                 /*
2066                  * The idmap page intersects with the hyp VA space,
2067                  * so it is not safe to continue further.
2068                  */
2069                 kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
2070                 err = -EINVAL;
2071                 goto out;
2072         }
2073
2074         hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL);
2075         if (!hyp_pgtable) {
2076                 kvm_err("Hyp mode page-table not allocated\n");
2077                 err = -ENOMEM;
2078                 goto out;
2079         }
2080
2081         err = kvm_pgtable_hyp_init(hyp_pgtable, hyp_va_bits);
2082         if (err)
2083                 goto out_free_pgtable;
2084
2085         err = kvm_map_idmap_text();
2086         if (err)
2087                 goto out_destroy_pgtable;
2088
2089         io_map_base = hyp_idmap_start;
2090         return 0;
2091
2092 out_destroy_pgtable:
2093         kvm_pgtable_hyp_destroy(hyp_pgtable);
2094 out_free_pgtable:
2095         kfree(hyp_pgtable);
2096         hyp_pgtable = NULL;
2097 out:
2098         return err;
2099 }
2100
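/*
 * Called after a memslot update has been committed. If dirty logging is
 * enabled on the slot (and the manual-protect/initial-all-set mode is not
 * in use), write-protect the whole region so that dirty pages are tracked
 * from now on.
 */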
2101 void kvm_arch_commit_memory_region(struct kvm *kvm,
2102                                    const struct kvm_userspace_memory_region *mem,
2103                                    struct kvm_memory_slot *old,
2104                                    const struct kvm_memory_slot *new,
2105                                    enum kvm_mr_change change)
2106 {
2107         /*
2108          * At this point the memslot has been committed and there is an
2109          * allocated dirty_bitmap[]; dirty pages will be tracked while the
2110          * memory slot is write protected.
2111          */
2112         if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
2113                 /*
2114                  * With initial-all-set, we don't need to write-protect
2115                  * any pages because they're all reported as dirty.
2116                  * Huge pages and normal pages will be write-protected gradually.
2117                  */
2118                 if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) {
2119                         kvm_mmu_wp_memory_region(kvm, mem->slot);
2120                 }
2121         }
2122 }
2123
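/*
 * Validate a memslot update before it is committed: reject regions that
 * extend beyond the guest IPA space, and eagerly map any VM_PFNMAP (device)
 * VMAs into stage 2, since dirty logging is not supported for them. If the
 * eager mapping fails, the stage-2 range created so far is torn down.
 */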
2124 int kvm_arch_prepare_memory_region(struct kvm *kvm,
2125                                    struct kvm_memory_slot *memslot,
2126                                    const struct kvm_userspace_memory_region *mem,
2127                                    enum kvm_mr_change change)
2128 {
2129         hva_t hva = mem->userspace_addr;
2130         hva_t reg_end = hva + mem->memory_size;
2131         bool writable = !(mem->flags & KVM_MEM_READONLY);
2132         int ret = 0;
2133
2134         if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
2135                         change != KVM_MR_FLAGS_ONLY)
2136                 return 0;
2137
2138         /*
2139          * Prevent userspace from creating a memory region outside of the
2140          * IPA space addressable by the KVM guest.
2141          */
2142         if (memslot->base_gfn + memslot->npages >=
2143             (kvm_phys_size(kvm) >> PAGE_SHIFT))
2144                 return -EFAULT;
2145
2146         mmap_read_lock(current->mm);
2147         /*
2148          * A memory region could potentially cover multiple VMAs, and any holes
2149          * between them, so iterate over all of them to find out if we can map
2150          * any of them right now.
2151          *
2152          *     +--------------------------------------------+
2153          * +---------------+----------------+   +----------------+
2154          * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
2155          * +---------------+----------------+   +----------------+
2156          *     |               memory region                |
2157          *     +--------------------------------------------+
2158          */
2159         do {
2160                 struct vm_area_struct *vma = find_vma(current->mm, hva);
2161                 hva_t vm_start, vm_end;
2162
2163                 if (!vma || vma->vm_start >= reg_end)
2164                         break;
2165
2166                 /*
2167                  * Take the intersection of this VMA with the memory region
2168                  */
2169                 vm_start = max(hva, vma->vm_start);
2170                 vm_end = min(reg_end, vma->vm_end);
2171
2172                 if (vma->vm_flags & VM_PFNMAP) {
2173                         gpa_t gpa = mem->guest_phys_addr +
2174                                     (vm_start - mem->userspace_addr);
2175                         phys_addr_t pa;
2176
2177                         pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
2178                         pa += vm_start - vma->vm_start;
2179
2180                         /* IO region dirty page logging not allowed */
2181                         if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) {
2182                                 ret = -EINVAL;
2183                                 goto out;
2184                         }
2185
2186                         ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
2187                                                     vm_end - vm_start,
2188                                                     writable);
2189                         if (ret)
2190                                 break;
2191                 }
2192                 hva = vm_end;
2193         } while (hva < reg_end);
2194
2195         if (change == KVM_MR_FLAGS_ONLY)
2196                 goto out;
2197
2198         spin_lock(&kvm->mmu_lock);
2199         if (ret)
2200                 unmap_stage2_range(&kvm->arch.mmu, mem->guest_phys_addr, mem->memory_size);
2201         else
2202                 stage2_flush_memslot(kvm, memslot);
2203         spin_unlock(&kvm->mmu_lock);
2204 out:
2205         mmap_read_unlock(current->mm);
2206         return ret;
2207 }
2208
2209 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
2210 {
2211 }
2212
2213 void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
2214 {
2215 }
2216
2217 void kvm_arch_flush_shadow_all(struct kvm *kvm)
2218 {
2219         kvm_free_stage2_pgd(&kvm->arch.mmu);
2220 }
2221
2222 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
2223                                    struct kvm_memory_slot *slot)
2224 {
2225         gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
2226         phys_addr_t size = slot->npages << PAGE_SHIFT;
2227
2228         spin_lock(&kvm->mmu_lock);
2229         unmap_stage2_range(&kvm->arch.mmu, gpa, size);
2230         spin_unlock(&kvm->mmu_lock);
2231 }
2232
2233 /*
2234  * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
2235  *
2236  * Main problems:
2237  * - S/W ops are local to a CPU (not broadcast)
2238  * - We have line migration behind our back (speculation)
2239  * - System caches don't support S/W at all (damn!)
2240  *
2241  * In the face of the above, the best we can do is to try and convert
2242  * S/W ops to VA ops. Because the guest is not allowed to infer the
2243  * S/W to PA mapping, it can only use S/W to nuke the whole cache,
2244  * which is a rather good thing for us.
2245  *
2246  * Also, it is only used when turning caches on/off ("The expected
2247  * usage of the cache maintenance instructions that operate by set/way
2248  * is associated with the cache maintenance instructions associated
2249  * with the powerdown and powerup of caches, if this is required by
2250  * the implementation.").
2251  *
2252  * We use the following policy:
2253  *
2254  * - If we trap a S/W operation, we enable VM trapping to detect
2255  *   caches being turned on/off, and do a full clean.
2256  *
2257  * - We flush the caches when they are turned both on and off.
2258  *
2259  * - Once the caches are enabled, we stop trapping VM ops.
2260  */
2261 void kvm_set_way_flush(struct kvm_vcpu *vcpu)
2262 {
2263         unsigned long hcr = *vcpu_hcr(vcpu);
2264
2265         /*
2266          * If this is the first time we do a S/W operation
2267          * (i.e. HCR_TVM not set) flush the whole memory, and set the
2268          * VM trapping.
2269          *
2270          * Otherwise, rely on the VM trapping to wait for the MMU +
2271          * Caches to be turned off. At that point, we'll be able to
2272          * clean the caches again.
2273          */
2274         if (!(hcr & HCR_TVM)) {
2275                 trace_kvm_set_way_flush(*vcpu_pc(vcpu),
2276                                         vcpu_has_cache_enabled(vcpu));
2277                 stage2_flush_vm(vcpu->kvm);
2278                 *vcpu_hcr(vcpu) = hcr | HCR_TVM;
2279         }
2280 }
2281
2282 void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
2283 {
2284         bool now_enabled = vcpu_has_cache_enabled(vcpu);
2285
2286         /*
2287          * If switching the MMU+caches on, we need to invalidate the caches.
2288          * If switching them off, we need to clean the caches.
2289          * Clean + invalidate always does the trick.
2290          */
2291         if (now_enabled != was_enabled)
2292                 stage2_flush_vm(vcpu->kvm);
2293
2294         /* Caches are now on, stop trapping VM ops (until a S/W op) */
2295         if (now_enabled)
2296                 *vcpu_hcr(vcpu) &= ~HCR_TVM;
2297
2298         trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
2299 }