KVM: arm64: Convert memslot cache-flushing code to generic page-table API
arch/arm64/kvm/mmu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) 2012 - Virtual Open Systems and Columbia University
4  * Author: Christoffer Dall <c.dall@virtualopensystems.com>
5  */
6
7 #include <linux/mman.h>
8 #include <linux/kvm_host.h>
9 #include <linux/io.h>
10 #include <linux/hugetlb.h>
11 #include <linux/sched/signal.h>
12 #include <trace/events/kvm.h>
13 #include <asm/pgalloc.h>
14 #include <asm/cacheflush.h>
15 #include <asm/kvm_arm.h>
16 #include <asm/kvm_mmu.h>
17 #include <asm/kvm_pgtable.h>
18 #include <asm/kvm_ras.h>
19 #include <asm/kvm_asm.h>
20 #include <asm/kvm_emulate.h>
21 #include <asm/virt.h>
22
23 #include "trace.h"
24
25 static struct kvm_pgtable *hyp_pgtable;
26 static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
27
28 static unsigned long hyp_idmap_start;
29 static unsigned long hyp_idmap_end;
30 static phys_addr_t hyp_idmap_vector;
31
32 static unsigned long io_map_base;
33
34 #define KVM_S2PTE_FLAG_IS_IOMAP         (1UL << 0)
35 #define KVM_S2_FLAG_LOGGING_ACTIVE      (1UL << 1)
36
37 static bool is_iomap(unsigned long flags)
38 {
39         return flags & KVM_S2PTE_FLAG_IS_IOMAP;
40 }
41
42 /*
43  * Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
44  * we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
45  * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too
46  * long will also starve other vCPUs. We must also make sure that the page
47  * tables are not freed while the lock is released.
48  */
49 static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr,
50                               phys_addr_t end,
51                               int (*fn)(struct kvm_pgtable *, u64, u64),
52                               bool resched)
53 {
54         int ret;
55         u64 next;
56
57         do {
58                 struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
59                 if (!pgt)
60                         return -EINVAL;
61
62                 next = stage2_pgd_addr_end(kvm, addr, end);
63                 ret = fn(pgt, addr, next - addr);
64                 if (ret)
65                         break;
66
67                 if (resched && next != end)
68                         cond_resched_lock(&kvm->mmu_lock);
69         } while (addr = next, addr != end);
70
71         return ret;
72 }
73
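/*
 * Convenience wrapper: apply @fn over [addr, end) one stage-2 PGD-sized
 * chunk at a time, offering to reschedule (and briefly drop kvm->mmu_lock)
 * between chunks, e.g.
 * stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_flush).
 */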
74 #define stage2_apply_range_resched(kvm, addr, end, fn)                  \
75         stage2_apply_range(kvm, addr, end, fn, true)
76
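/*
 * Dirty logging is only considered active for a memslot that has a dirty
 * bitmap and is not read-only: a read-only slot cannot be written to, so
 * there is nothing to log for it.
 */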
77 static bool memslot_is_logging(struct kvm_memory_slot *memslot)
78 {
79         return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
80 }
81
82 /**
83  * kvm_flush_remote_tlbs() - flush all VM TLB entries
84  * @kvm:        pointer to kvm structure.
85  *
86  * Interface to HYP function to flush all VM TLB entries
87  */
88 void kvm_flush_remote_tlbs(struct kvm *kvm)
89 {
90         kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
91 }
92
93 static void kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa,
94                                    int level)
95 {
96         kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ipa, level);
97 }
98
99 /*
100  * D-Cache management functions. They take the page table entries by
101  * value, as they are flushing the cache using the kernel mapping (or
102  * kmap on 32bit).
103  */
104 static void kvm_flush_dcache_pte(pte_t pte)
105 {
106         __kvm_flush_dcache_pte(pte);
107 }
108
109 static void kvm_flush_dcache_pmd(pmd_t pmd)
110 {
111         __kvm_flush_dcache_pmd(pmd);
112 }
113
114 static void kvm_flush_dcache_pud(pud_t pud)
115 {
116         __kvm_flush_dcache_pud(pud);
117 }
118
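/*
 * A pfn without a struct page backing it (!pfn_valid()) is assumed to be
 * device memory: it is mapped with device attributes at stage 2 and is
 * exempt from the D-cache maintenance performed for normal RAM.
 */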
119 static bool kvm_is_device_pfn(unsigned long pfn)
120 {
121         return !pfn_valid(pfn);
122 }
123
124 /**
125  * stage2_dissolve_pmd() - clear and flush huge PMD entry
126  * @mmu:        pointer to mmu structure to operate on
127  * @addr:       IPA
128  * @pmd:        pmd pointer for IPA
129  *
130  * Clears the huge PMD entry and flushes the 1st and 2nd stage TLBs for @addr.
131  */
132 static void stage2_dissolve_pmd(struct kvm_s2_mmu *mmu, phys_addr_t addr, pmd_t *pmd)
133 {
134         if (!pmd_thp_or_huge(*pmd))
135                 return;
136
137         pmd_clear(pmd);
138         kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
139         put_page(virt_to_page(pmd));
140 }
141
142 /**
143  * stage2_dissolve_pud() - clear and flush huge PUD entry
144  * @mmu:        pointer to mmu structure to operate on
145  * @addr:       IPA
146  * @pudp:       pud pointer for IPA
147  *
148  * Clears the huge PUD entry and flushes the 1st and 2nd stage TLBs for @addr.
149  */
150 static void stage2_dissolve_pud(struct kvm_s2_mmu *mmu, phys_addr_t addr, pud_t *pudp)
151 {
152         struct kvm *kvm = mmu->kvm;
153
154         if (!stage2_pud_huge(kvm, *pudp))
155                 return;
156
157         stage2_pud_clear(kvm, pudp);
158         kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
159         put_page(virt_to_page(pudp));
160 }
161
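/*
 * The clear_stage2_*_entry() helpers tear down a table entry once the table
 * below it is empty: clear the entry, invalidate the TLB for the range it
 * covered, free the lower-level table and drop the reference on the page
 * holding the cleared entry.
 */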
162 static void clear_stage2_pgd_entry(struct kvm_s2_mmu *mmu, pgd_t *pgd, phys_addr_t addr)
163 {
164         struct kvm *kvm = mmu->kvm;
165         p4d_t *p4d_table __maybe_unused = stage2_p4d_offset(kvm, pgd, 0UL);
166         stage2_pgd_clear(kvm, pgd);
167         kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
168         stage2_p4d_free(kvm, p4d_table);
169         put_page(virt_to_page(pgd));
170 }
171
172 static void clear_stage2_p4d_entry(struct kvm_s2_mmu *mmu, p4d_t *p4d, phys_addr_t addr)
173 {
174         struct kvm *kvm = mmu->kvm;
175         pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, p4d, 0);
176         stage2_p4d_clear(kvm, p4d);
177         kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
178         stage2_pud_free(kvm, pud_table);
179         put_page(virt_to_page(p4d));
180 }
181
182 static void clear_stage2_pud_entry(struct kvm_s2_mmu *mmu, pud_t *pud, phys_addr_t addr)
183 {
184         struct kvm *kvm = mmu->kvm;
185         pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0);
186
187         VM_BUG_ON(stage2_pud_huge(kvm, *pud));
188         stage2_pud_clear(kvm, pud);
189         kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
190         stage2_pmd_free(kvm, pmd_table);
191         put_page(virt_to_page(pud));
192 }
193
194 static void clear_stage2_pmd_entry(struct kvm_s2_mmu *mmu, pmd_t *pmd, phys_addr_t addr)
195 {
196         pte_t *pte_table = pte_offset_kernel(pmd, 0);
197         VM_BUG_ON(pmd_thp_or_huge(*pmd));
198         pmd_clear(pmd);
199         kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
200         free_page((unsigned long)pte_table);
201         put_page(virt_to_page(pmd));
202 }
203
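/*
 * Page-table descriptors are updated with a single-copy-atomic WRITE_ONCE()
 * followed by a DSB in the inner-shareable domain, ensuring the new entry is
 * visible to the table walker before any subsequent TLB maintenance.
 */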
204 static inline void kvm_set_pte(pte_t *ptep, pte_t new_pte)
205 {
206         WRITE_ONCE(*ptep, new_pte);
207         dsb(ishst);
208 }
209
210 static inline void kvm_set_pmd(pmd_t *pmdp, pmd_t new_pmd)
211 {
212         WRITE_ONCE(*pmdp, new_pmd);
213         dsb(ishst);
214 }
215
216 static inline void kvm_pmd_populate(pmd_t *pmdp, pte_t *ptep)
217 {
218         kvm_set_pmd(pmdp, kvm_mk_pmd(ptep));
219 }
220
221 static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp)
222 {
223         WRITE_ONCE(*pudp, kvm_mk_pud(pmdp));
224         dsb(ishst);
225 }
226
227 static inline void kvm_p4d_populate(p4d_t *p4dp, pud_t *pudp)
228 {
229         WRITE_ONCE(*p4dp, kvm_mk_p4d(pudp));
230         dsb(ishst);
231 }
232
233 static inline void kvm_pgd_populate(pgd_t *pgdp, p4d_t *p4dp)
234 {
235 #ifndef __PAGETABLE_P4D_FOLDED
236         WRITE_ONCE(*pgdp, kvm_mk_pgd(p4dp));
237         dsb(ishst);
238 #endif
239 }
240
241 /*
242  * Unmapping vs dcache management:
243  *
244  * If a guest maps certain memory pages as uncached, all writes will
245  * bypass the data cache and go directly to RAM.  However, the CPUs
246  * can still speculate reads (not writes) and fill cache lines with
247  * data.
248  *
249  * Those cache lines will be *clean* cache lines though, so a
250  * clean+invalidate operation is equivalent to an invalidate
251  * operation, because no cache lines are marked dirty.
252  *
253  * Those clean cache lines could be filled prior to an uncached write
254  * by the guest, and the cache coherent IO subsystem would therefore
255  * end up writing old data to disk.
256  *
257  * This is why right after unmapping a page/section and invalidating
258  * the corresponding TLBs, we flush to make sure the IO subsystem will
259  * never hit in the cache.
260  *
261  * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
262  * we then fully enforce cacheability of RAM, no matter what the guest
263  * does.
264  */
265 static void unmap_stage2_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd,
266                        phys_addr_t addr, phys_addr_t end)
267 {
268         phys_addr_t start_addr = addr;
269         pte_t *pte, *start_pte;
270
271         start_pte = pte = pte_offset_kernel(pmd, addr);
272         do {
273                 if (!pte_none(*pte)) {
274                         pte_t old_pte = *pte;
275
276                         kvm_set_pte(pte, __pte(0));
277                         kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PTE_LEVEL);
278
279                         /* No need to invalidate the cache for device mappings */
280                         if (!kvm_is_device_pfn(pte_pfn(old_pte)))
281                                 kvm_flush_dcache_pte(old_pte);
282
283                         put_page(virt_to_page(pte));
284                 }
285         } while (pte++, addr += PAGE_SIZE, addr != end);
286
287         if (stage2_pte_table_empty(mmu->kvm, start_pte))
288                 clear_stage2_pmd_entry(mmu, pmd, start_addr);
289 }
290
291 static void unmap_stage2_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
292                        phys_addr_t addr, phys_addr_t end)
293 {
294         struct kvm *kvm = mmu->kvm;
295         phys_addr_t next, start_addr = addr;
296         pmd_t *pmd, *start_pmd;
297
298         start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr);
299         do {
300                 next = stage2_pmd_addr_end(kvm, addr, end);
301                 if (!pmd_none(*pmd)) {
302                         if (pmd_thp_or_huge(*pmd)) {
303                                 pmd_t old_pmd = *pmd;
304
305                                 pmd_clear(pmd);
306                                 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
307
308                                 kvm_flush_dcache_pmd(old_pmd);
309
310                                 put_page(virt_to_page(pmd));
311                         } else {
312                                 unmap_stage2_ptes(mmu, pmd, addr, next);
313                         }
314                 }
315         } while (pmd++, addr = next, addr != end);
316
317         if (stage2_pmd_table_empty(kvm, start_pmd))
318                 clear_stage2_pud_entry(mmu, pud, start_addr);
319 }
320
321 static void unmap_stage2_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
322                        phys_addr_t addr, phys_addr_t end)
323 {
324         struct kvm *kvm = mmu->kvm;
325         phys_addr_t next, start_addr = addr;
326         pud_t *pud, *start_pud;
327
328         start_pud = pud = stage2_pud_offset(kvm, p4d, addr);
329         do {
330                 next = stage2_pud_addr_end(kvm, addr, end);
331                 if (!stage2_pud_none(kvm, *pud)) {
332                         if (stage2_pud_huge(kvm, *pud)) {
333                                 pud_t old_pud = *pud;
334
335                                 stage2_pud_clear(kvm, pud);
336                                 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
337                                 kvm_flush_dcache_pud(old_pud);
338                                 put_page(virt_to_page(pud));
339                         } else {
340                                 unmap_stage2_pmds(mmu, pud, addr, next);
341                         }
342                 }
343         } while (pud++, addr = next, addr != end);
344
345         if (stage2_pud_table_empty(kvm, start_pud))
346                 clear_stage2_p4d_entry(mmu, p4d, start_addr);
347 }
348
349 static void unmap_stage2_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
350                        phys_addr_t addr, phys_addr_t end)
351 {
352         struct kvm *kvm = mmu->kvm;
353         phys_addr_t next, start_addr = addr;
354         p4d_t *p4d, *start_p4d;
355
356         start_p4d = p4d = stage2_p4d_offset(kvm, pgd, addr);
357         do {
358                 next = stage2_p4d_addr_end(kvm, addr, end);
359                 if (!stage2_p4d_none(kvm, *p4d))
360                         unmap_stage2_puds(mmu, p4d, addr, next);
361         } while (p4d++, addr = next, addr != end);
362
363         if (stage2_p4d_table_empty(kvm, start_p4d))
364                 clear_stage2_pgd_entry(mmu, pgd, start_addr);
365 }
366
367 /**
368  * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
369  * @mmu:   The KVM stage-2 MMU pointer
370  * @start: The intermediate physical base address of the range to unmap
371  * @size:  The size of the area to unmap
372  *
373  * Clear a range of stage-2 mappings, lowering the various ref-counts.  Must
374  * be called while holding mmu_lock (unless for freeing the stage2 pgd before
375  * destroying the VM), otherwise another faulting VCPU may come in and mess
376  * with things behind our backs.
377  */
378 static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size,
379                                  bool may_block)
380 {
381         struct kvm *kvm = mmu->kvm;
382         phys_addr_t end = start + size;
383
384         assert_spin_locked(&kvm->mmu_lock);
385         WARN_ON(size & ~PAGE_MASK);
386         WARN_ON(stage2_apply_range(kvm, start, end, kvm_pgtable_stage2_unmap,
387                                    may_block));
388 }
389
390 static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
391 {
392         __unmap_stage2_range(mmu, start, size, true);
393 }
394
395 static void stage2_flush_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd,
396                               phys_addr_t addr, phys_addr_t end)
397 {
398         pte_t *pte;
399
400         pte = pte_offset_kernel(pmd, addr);
401         do {
402                 if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte)))
403                         kvm_flush_dcache_pte(*pte);
404         } while (pte++, addr += PAGE_SIZE, addr != end);
405 }
406
407 static void stage2_flush_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
408                               phys_addr_t addr, phys_addr_t end)
409 {
410         struct kvm *kvm = mmu->kvm;
411         pmd_t *pmd;
412         phys_addr_t next;
413
414         pmd = stage2_pmd_offset(kvm, pud, addr);
415         do {
416                 next = stage2_pmd_addr_end(kvm, addr, end);
417                 if (!pmd_none(*pmd)) {
418                         if (pmd_thp_or_huge(*pmd))
419                                 kvm_flush_dcache_pmd(*pmd);
420                         else
421                                 stage2_flush_ptes(mmu, pmd, addr, next);
422                 }
423         } while (pmd++, addr = next, addr != end);
424 }
425
426 static void stage2_flush_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
427                               phys_addr_t addr, phys_addr_t end)
428 {
429         struct kvm *kvm = mmu->kvm;
430         pud_t *pud;
431         phys_addr_t next;
432
433         pud = stage2_pud_offset(kvm, p4d, addr);
434         do {
435                 next = stage2_pud_addr_end(kvm, addr, end);
436                 if (!stage2_pud_none(kvm, *pud)) {
437                         if (stage2_pud_huge(kvm, *pud))
438                                 kvm_flush_dcache_pud(*pud);
439                         else
440                                 stage2_flush_pmds(mmu, pud, addr, next);
441                 }
442         } while (pud++, addr = next, addr != end);
443 }
444
445 static void stage2_flush_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
446                               phys_addr_t addr, phys_addr_t end)
447 {
448         struct kvm *kvm = mmu->kvm;
449         p4d_t *p4d;
450         phys_addr_t next;
451
452         p4d = stage2_p4d_offset(kvm, pgd, addr);
453         do {
454                 next = stage2_p4d_addr_end(kvm, addr, end);
455                 if (!stage2_p4d_none(kvm, *p4d))
456                         stage2_flush_puds(mmu, p4d, addr, next);
457         } while (p4d++, addr = next, addr != end);
458 }
459
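/*
 * Flush the data cache for everything currently mapped at stage 2 within
 * the memslot. This now goes through the generic page-table API
 * (kvm_pgtable_stage2_flush) rather than the stage2_flush_p4ds()/puds()/
 * pmds()/ptes() walkers above.
 */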
460 static void stage2_flush_memslot(struct kvm *kvm,
461                                  struct kvm_memory_slot *memslot)
462 {
463         phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
464         phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
465
466         stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_flush);
467 }
468
469 /**
470  * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
471  * @kvm: The struct kvm pointer
472  *
473  * Go through the stage 2 page tables and invalidate any cache lines
474  * backing memory already mapped to the VM.
475  */
476 static void stage2_flush_vm(struct kvm *kvm)
477 {
478         struct kvm_memslots *slots;
479         struct kvm_memory_slot *memslot;
480         int idx;
481
482         idx = srcu_read_lock(&kvm->srcu);
483         spin_lock(&kvm->mmu_lock);
484
485         slots = kvm_memslots(kvm);
486         kvm_for_each_memslot(memslot, slots)
487                 stage2_flush_memslot(kvm, memslot);
488
489         spin_unlock(&kvm->mmu_lock);
490         srcu_read_unlock(&kvm->srcu, idx);
491 }
492
493 /**
494  * free_hyp_pgds - free Hyp-mode page tables
495  */
496 void free_hyp_pgds(void)
497 {
498         mutex_lock(&kvm_hyp_pgd_mutex);
499         if (hyp_pgtable) {
500                 kvm_pgtable_hyp_destroy(hyp_pgtable);
501                 kfree(hyp_pgtable);
502         }
503         mutex_unlock(&kvm_hyp_pgd_mutex);
504 }
505
506 static int __create_hyp_mappings(unsigned long start, unsigned long size,
507                                  unsigned long phys, enum kvm_pgtable_prot prot)
508 {
509         int err;
510
511         mutex_lock(&kvm_hyp_pgd_mutex);
512         err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot);
513         mutex_unlock(&kvm_hyp_pgd_mutex);
514
515         return err;
516 }
517
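/*
 * Translate a kernel virtual address (linear map or vmalloc) into the
 * physical address backing it.
 */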
518 static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
519 {
520         if (!is_vmalloc_addr(kaddr)) {
521                 BUG_ON(!virt_addr_valid(kaddr));
522                 return __pa(kaddr);
523         } else {
524                 return page_to_phys(vmalloc_to_page(kaddr)) +
525                        offset_in_page(kaddr);
526         }
527 }
528
529 /**
530  * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
531  * @from:       The virtual kernel start address of the range
532  * @to:         The virtual kernel end address of the range (exclusive)
533  * @prot:       The protection to be applied to this range
534  *
535  * The same virtual address as the kernel virtual address is also used
536  * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
537  * physical pages.
538  */
539 int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
540 {
541         phys_addr_t phys_addr;
542         unsigned long virt_addr;
543         unsigned long start = kern_hyp_va((unsigned long)from);
544         unsigned long end = kern_hyp_va((unsigned long)to);
545
546         if (is_kernel_in_hyp_mode())
547                 return 0;
548
549         start = start & PAGE_MASK;
550         end = PAGE_ALIGN(end);
551
552         for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
553                 int err;
554
555                 phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
556                 err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr,
557                                             prot);
558                 if (err)
559                         return err;
560         }
561
562         return 0;
563 }
564
565 static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
566                                         unsigned long *haddr,
567                                         enum kvm_pgtable_prot prot)
568 {
569         unsigned long base;
570         int ret = 0;
571
572         mutex_lock(&kvm_hyp_pgd_mutex);
573
574         /*
575          * This assumes that we have enough space below the idmap
576          * page to allocate our VAs. If not, the check below will
577          * kick in. A potential alternative would be to detect that
578          * overflow and switch to an allocation above the idmap.
579          *
580          * The allocated size is always a multiple of PAGE_SIZE.
581          */
582         size = PAGE_ALIGN(size + offset_in_page(phys_addr));
583         base = io_map_base - size;
584
585         /*
586          * Verify that BIT(VA_BITS - 1) hasn't been flipped by
587          * allocating the new area, as it would indicate we've
588          * overflowed the idmap/IO address range.
589          */
590         if ((base ^ io_map_base) & BIT(VA_BITS - 1))
591                 ret = -ENOMEM;
592         else
593                 io_map_base = base;
594
595         mutex_unlock(&kvm_hyp_pgd_mutex);
596
597         if (ret)
598                 goto out;
599
600         ret = __create_hyp_mappings(base, size, phys_addr, prot);
601         if (ret)
602                 goto out;
603
604         *haddr = base + offset_in_page(phys_addr);
605 out:
606         return ret;
607 }
608
609 /**
610  * create_hyp_io_mappings - Map IO into both kernel and HYP
611  * @phys_addr:  The physical start address which gets mapped
612  * @size:       Size of the region being mapped
613  * @kaddr:      Kernel VA for this mapping
614  * @haddr:      HYP VA for this mapping
615  */
616 int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
617                            void __iomem **kaddr,
618                            void __iomem **haddr)
619 {
620         unsigned long addr;
621         int ret;
622
623         *kaddr = ioremap(phys_addr, size);
624         if (!*kaddr)
625                 return -ENOMEM;
626
627         if (is_kernel_in_hyp_mode()) {
628                 *haddr = *kaddr;
629                 return 0;
630         }
631
632         ret = __create_hyp_private_mapping(phys_addr, size,
633                                            &addr, PAGE_HYP_DEVICE);
634         if (ret) {
635                 iounmap(*kaddr);
636                 *kaddr = NULL;
637                 *haddr = NULL;
638                 return ret;
639         }
640
641         *haddr = (void __iomem *)addr;
642         return 0;
643 }
644
645 /**
646  * create_hyp_exec_mappings - Map an executable range into HYP
647  * @phys_addr:  The physical start address which gets mapped
648  * @size:       Size of the region being mapped
649  * @haddr:      HYP VA for this mapping
650  */
651 int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
652                              void **haddr)
653 {
654         unsigned long addr;
655         int ret;
656
657         BUG_ON(is_kernel_in_hyp_mode());
658
659         ret = __create_hyp_private_mapping(phys_addr, size,
660                                            &addr, PAGE_HYP_EXEC);
661         if (ret) {
662                 *haddr = NULL;
663                 return ret;
664         }
665
666         *haddr = (void *)addr;
667         return 0;
668 }
669
670 /**
671  * kvm_init_stage2_mmu - Initialise a S2 MMU structure
672  * @kvm:        The pointer to the KVM structure
673  * @mmu:        The pointer to the s2 MMU structure
674  *
675  * Allocates only the stage-2 HW PGD level table(s).
676  * Note we don't need locking here as this is only called when the VM is
677  * created, which can only be done once.
678  */
679 int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
680 {
681         int cpu, err;
682         struct kvm_pgtable *pgt;
683
684         if (mmu->pgt != NULL) {
685                 kvm_err("kvm_arch already initialized?\n");
686                 return -EINVAL;
687         }
688
689         pgt = kzalloc(sizeof(*pgt), GFP_KERNEL);
690         if (!pgt)
691                 return -ENOMEM;
692
693         err = kvm_pgtable_stage2_init(pgt, kvm);
694         if (err)
695                 goto out_free_pgtable;
696
697         mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
698         if (!mmu->last_vcpu_ran) {
699                 err = -ENOMEM;
700                 goto out_destroy_pgtable;
701         }
702
703         for_each_possible_cpu(cpu)
704                 *per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;
705
706         mmu->kvm = kvm;
707         mmu->pgt = pgt;
708         mmu->pgd_phys = __pa(pgt->pgd);
709         mmu->pgd = (void *)pgt->pgd;
710         mmu->vmid.vmid_gen = 0;
711         return 0;
712
713 out_destroy_pgtable:
714         kvm_pgtable_stage2_destroy(pgt);
715 out_free_pgtable:
716         kfree(pgt);
717         return err;
718 }
719
720 static void stage2_unmap_memslot(struct kvm *kvm,
721                                  struct kvm_memory_slot *memslot)
722 {
723         hva_t hva = memslot->userspace_addr;
724         phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
725         phys_addr_t size = PAGE_SIZE * memslot->npages;
726         hva_t reg_end = hva + size;
727
728         /*
729          * A memory region could potentially cover multiple VMAs, and any holes
730          * between them, so iterate over all of them to find out if we should
731          * unmap any of them.
732          *
733          *     +--------------------------------------------+
734          * +---------------+----------------+   +----------------+
735          * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
736          * +---------------+----------------+   +----------------+
737          *     |               memory region                |
738          *     +--------------------------------------------+
739          */
740         do {
741                 struct vm_area_struct *vma = find_vma(current->mm, hva);
742                 hva_t vm_start, vm_end;
743
744                 if (!vma || vma->vm_start >= reg_end)
745                         break;
746
747                 /*
748                  * Take the intersection of this VMA with the memory region
749                  */
750                 vm_start = max(hva, vma->vm_start);
751                 vm_end = min(reg_end, vma->vm_end);
752
753                 if (!(vma->vm_flags & VM_PFNMAP)) {
754                         gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
755                         unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start);
756                 }
757                 hva = vm_end;
758         } while (hva < reg_end);
759 }
760
761 /**
762  * stage2_unmap_vm - Unmap Stage-2 RAM mappings
763  * @kvm: The struct kvm pointer
764  *
765  * Go through the memregions and unmap any regular RAM
766  * backing memory already mapped to the VM.
767  */
768 void stage2_unmap_vm(struct kvm *kvm)
769 {
770         struct kvm_memslots *slots;
771         struct kvm_memory_slot *memslot;
772         int idx;
773
774         idx = srcu_read_lock(&kvm->srcu);
775         mmap_read_lock(current->mm);
776         spin_lock(&kvm->mmu_lock);
777
778         slots = kvm_memslots(kvm);
779         kvm_for_each_memslot(memslot, slots)
780                 stage2_unmap_memslot(kvm, memslot);
781
782         spin_unlock(&kvm->mmu_lock);
783         mmap_read_unlock(current->mm);
784         srcu_read_unlock(&kvm->srcu, idx);
785 }
786
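/*
 * Tear down a stage-2 MMU: detach the page-table and per-CPU state under
 * the mmu_lock so that concurrent users see a NULL pgt, then destroy and
 * free the page tables once the lock has been dropped.
 */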
787 void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
788 {
789         struct kvm *kvm = mmu->kvm;
790         struct kvm_pgtable *pgt = NULL;
791
792         spin_lock(&kvm->mmu_lock);
793         pgt = mmu->pgt;
794         if (pgt) {
795                 mmu->pgd = NULL;
796                 mmu->pgd_phys = 0;
797                 mmu->pgt = NULL;
798                 free_percpu(mmu->last_vcpu_ran);
799         }
800         spin_unlock(&kvm->mmu_lock);
801
802         if (pgt) {
803                 kvm_pgtable_stage2_destroy(pgt);
804                 kfree(pgt);
805         }
806 }
807
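/*
 * The stage2_get_*() helpers walk the stage-2 tables towards @addr,
 * allocating any missing intermediate table from @cache. If @cache is
 * NULL (e.g. on MMU notifier paths that must not allocate), they return
 * NULL instead of populating a missing level.
 */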
808 static p4d_t *stage2_get_p4d(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
809                              phys_addr_t addr)
810 {
811         struct kvm *kvm = mmu->kvm;
812         pgd_t *pgd;
813         p4d_t *p4d;
814
815         pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
816         if (stage2_pgd_none(kvm, *pgd)) {
817                 if (!cache)
818                         return NULL;
819                 p4d = kvm_mmu_memory_cache_alloc(cache);
820                 stage2_pgd_populate(kvm, pgd, p4d);
821                 get_page(virt_to_page(pgd));
822         }
823
824         return stage2_p4d_offset(kvm, pgd, addr);
825 }
826
827 static pud_t *stage2_get_pud(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
828                              phys_addr_t addr)
829 {
830         struct kvm *kvm = mmu->kvm;
831         p4d_t *p4d;
832         pud_t *pud;
833
834         p4d = stage2_get_p4d(mmu, cache, addr);
835         if (stage2_p4d_none(kvm, *p4d)) {
836                 if (!cache)
837                         return NULL;
838                 pud = kvm_mmu_memory_cache_alloc(cache);
839                 stage2_p4d_populate(kvm, p4d, pud);
840                 get_page(virt_to_page(p4d));
841         }
842
843         return stage2_pud_offset(kvm, p4d, addr);
844 }
845
846 static pmd_t *stage2_get_pmd(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
847                              phys_addr_t addr)
848 {
849         struct kvm *kvm = mmu->kvm;
850         pud_t *pud;
851         pmd_t *pmd;
852
853         pud = stage2_get_pud(mmu, cache, addr);
854         if (!pud || stage2_pud_huge(kvm, *pud))
855                 return NULL;
856
857         if (stage2_pud_none(kvm, *pud)) {
858                 if (!cache)
859                         return NULL;
860                 pmd = kvm_mmu_memory_cache_alloc(cache);
861                 stage2_pud_populate(kvm, pud, pmd);
862                 get_page(virt_to_page(pud));
863         }
864
865         return stage2_pmd_offset(kvm, pud, addr);
866 }
867
868 static int stage2_set_pmd_huge(struct kvm_s2_mmu *mmu,
869                                struct kvm_mmu_memory_cache *cache,
870                                phys_addr_t addr, const pmd_t *new_pmd)
871 {
872         pmd_t *pmd, old_pmd;
873
874 retry:
875         pmd = stage2_get_pmd(mmu, cache, addr);
876         VM_BUG_ON(!pmd);
877
878         old_pmd = *pmd;
879         /*
880          * Multiple vcpus faulting on the same PMD entry can
881          * lead to them sequentially updating the PMD with the
882          * same value. Following the break-before-make
883          * (pmd_clear() followed by tlb_flush()) process can
884          * hinder forward progress due to refaults generated
885          * on missing translations.
886          *
887          * Skip updating the page table if the entry is
888          * unchanged.
889          */
890         if (pmd_val(old_pmd) == pmd_val(*new_pmd))
891                 return 0;
892
893         if (pmd_present(old_pmd)) {
894                 /*
895                  * If we already have PTE level mapping for this block,
896                  * we must unmap it to avoid inconsistent TLB state and
897                  * leaking the table page. We could end up in this situation
898                  * if the memory slot was marked for dirty logging and was
899                  * reverted, leaving PTE level mappings for the pages accessed
900                  * during the period. So, unmap the PTE level mapping for this
901                  * block and retry, as we could have released the upper level
902                  * table in the process.
903                  *
904          * Normal THP split/merge follows mmu_notifier callbacks and is
905          * handled accordingly.
906                  */
907                 if (!pmd_thp_or_huge(old_pmd)) {
908                         unmap_stage2_range(mmu, addr & S2_PMD_MASK, S2_PMD_SIZE);
909                         goto retry;
910                 }
911                 /*
912                  * Mapping in huge pages should only happen through a
913                  * fault.  If a page is merged into a transparent huge
914                  * page, the individual subpages of that huge page
915                  * should be unmapped through MMU notifiers before we
916                  * get here.
917                  *
918                  * Merging of CompoundPages is not supported; they
919          * should instead be split first, unmapped, merged,
920                  * and mapped back in on-demand.
921                  */
922                 WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
923                 pmd_clear(pmd);
924                 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
925         } else {
926                 get_page(virt_to_page(pmd));
927         }
928
929         kvm_set_pmd(pmd, *new_pmd);
930         return 0;
931 }
932
933 static int stage2_set_pud_huge(struct kvm_s2_mmu *mmu,
934                                struct kvm_mmu_memory_cache *cache,
935                                phys_addr_t addr, const pud_t *new_pudp)
936 {
937         struct kvm *kvm = mmu->kvm;
938         pud_t *pudp, old_pud;
939
940 retry:
941         pudp = stage2_get_pud(mmu, cache, addr);
942         VM_BUG_ON(!pudp);
943
944         old_pud = *pudp;
945
946         /*
947          * A large number of vcpus faulting on the same stage 2 entry
948          * can lead to a refault due to the stage2_pud_clear()/tlb_flush().
949          * Skip updating the page tables if there is no change.
950          */
951         if (pud_val(old_pud) == pud_val(*new_pudp))
952                 return 0;
953
954         if (stage2_pud_present(kvm, old_pud)) {
955                 /*
956                  * If we already have table level mapping for this block, unmap
957                  * the range for this block and retry.
958                  */
959                 if (!stage2_pud_huge(kvm, old_pud)) {
960                         unmap_stage2_range(mmu, addr & S2_PUD_MASK, S2_PUD_SIZE);
961                         goto retry;
962                 }
963
964                 WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp));
965                 stage2_pud_clear(kvm, pudp);
966                 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
967         } else {
968                 get_page(virt_to_page(pudp));
969         }
970
971         kvm_set_pud(pudp, *new_pudp);
972         return 0;
973 }
974
975 /*
976  * stage2_get_leaf_entry - walk the stage2 VM page tables and return
977  * true if a valid and present leaf-entry is found. A pointer to the
978  * leaf-entry is returned in the appropriate level variable - pudpp,
979  * pmdpp, ptepp.
980  */
981 static bool stage2_get_leaf_entry(struct kvm_s2_mmu *mmu, phys_addr_t addr,
982                                   pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp)
983 {
984         struct kvm *kvm = mmu->kvm;
985         pud_t *pudp;
986         pmd_t *pmdp;
987         pte_t *ptep;
988
989         *pudpp = NULL;
990         *pmdpp = NULL;
991         *ptepp = NULL;
992
993         pudp = stage2_get_pud(mmu, NULL, addr);
994         if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp))
995                 return false;
996
997         if (stage2_pud_huge(kvm, *pudp)) {
998                 *pudpp = pudp;
999                 return true;
1000         }
1001
1002         pmdp = stage2_pmd_offset(kvm, pudp, addr);
1003         if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp))
1004                 return false;
1005
1006         if (pmd_thp_or_huge(*pmdp)) {
1007                 *pmdpp = pmdp;
1008                 return true;
1009         }
1010
1011         ptep = pte_offset_kernel(pmdp, addr);
1012         if (!ptep || pte_none(*ptep) || !pte_present(*ptep))
1013                 return false;
1014
1015         *ptepp = ptep;
1016         return true;
1017 }
1018
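/*
 * Return true if @addr is already mapped executable at stage 2 by a leaf
 * entry large enough to cover a mapping of size @sz.
 */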
1019 static bool stage2_is_exec(struct kvm_s2_mmu *mmu, phys_addr_t addr, unsigned long sz)
1020 {
1021         pud_t *pudp;
1022         pmd_t *pmdp;
1023         pte_t *ptep;
1024         bool found;
1025
1026         found = stage2_get_leaf_entry(mmu, addr, &pudp, &pmdp, &ptep);
1027         if (!found)
1028                 return false;
1029
1030         if (pudp)
1031                 return sz <= PUD_SIZE && kvm_s2pud_exec(pudp);
1032         else if (pmdp)
1033                 return sz <= PMD_SIZE && kvm_s2pmd_exec(pmdp);
1034         else
1035                 return sz == PAGE_SIZE && kvm_s2pte_exec(ptep);
1036 }
1037
1038 static int stage2_set_pte(struct kvm_s2_mmu *mmu,
1039                           struct kvm_mmu_memory_cache *cache,
1040                           phys_addr_t addr, const pte_t *new_pte,
1041                           unsigned long flags)
1042 {
1043         struct kvm *kvm = mmu->kvm;
1044         pud_t *pud;
1045         pmd_t *pmd;
1046         pte_t *pte, old_pte;
1047         bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
1048         bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;
1049
1050         VM_BUG_ON(logging_active && !cache);
1051
1052         /* Create stage-2 page table mapping - Levels 0 and 1 */
1053         pud = stage2_get_pud(mmu, cache, addr);
1054         if (!pud) {
1055                 /*
1056                  * Ignore calls from kvm_set_spte_hva for unallocated
1057                  * address ranges.
1058                  */
1059                 return 0;
1060         }
1061
1062         /*
1063          * While dirty page logging - dissolve huge PUD, then continue
1064          * on to allocate page.
1065          */
1066         if (logging_active)
1067                 stage2_dissolve_pud(mmu, addr, pud);
1068
1069         if (stage2_pud_none(kvm, *pud)) {
1070                 if (!cache)
1071                         return 0; /* ignore calls from kvm_set_spte_hva */
1072                 pmd = kvm_mmu_memory_cache_alloc(cache);
1073                 stage2_pud_populate(kvm, pud, pmd);
1074                 get_page(virt_to_page(pud));
1075         }
1076
1077         pmd = stage2_pmd_offset(kvm, pud, addr);
1078         if (!pmd) {
1079                 /*
1080                  * Ignore calls from kvm_set_spte_hva for unallocated
1081                  * address ranges.
1082                  */
1083                 return 0;
1084         }
1085
1086         /*
1087          * While dirty page logging - dissolve huge PMD, then continue on to
1088          * allocate page.
1089          */
1090         if (logging_active)
1091                 stage2_dissolve_pmd(mmu, addr, pmd);
1092
1093         /* Create stage-2 page mappings - Level 2 */
1094         if (pmd_none(*pmd)) {
1095                 if (!cache)
1096                         return 0; /* ignore calls from kvm_set_spte_hva */
1097                 pte = kvm_mmu_memory_cache_alloc(cache);
1098                 kvm_pmd_populate(pmd, pte);
1099                 get_page(virt_to_page(pmd));
1100         }
1101
1102         pte = pte_offset_kernel(pmd, addr);
1103
1104         if (iomap && pte_present(*pte))
1105                 return -EFAULT;
1106
1107         /* Create 2nd stage page table mapping - Level 3 */
1108         old_pte = *pte;
1109         if (pte_present(old_pte)) {
1110                 /* Skip page table update if there is no change */
1111                 if (pte_val(old_pte) == pte_val(*new_pte))
1112                         return 0;
1113
1114                 kvm_set_pte(pte, __pte(0));
1115                 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PTE_LEVEL);
1116         } else {
1117                 get_page(virt_to_page(pte));
1118         }
1119
1120         kvm_set_pte(pte, *new_pte);
1121         return 0;
1122 }
1123
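/*
 * Test-and-clear the access flag on a stage-2 descriptor, using the
 * architecture's atomic helper when it provides one and falling back to
 * a plain read-modify-write otherwise.
 */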
1124 #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
1125 static int stage2_ptep_test_and_clear_young(pte_t *pte)
1126 {
1127         if (pte_young(*pte)) {
1128                 *pte = pte_mkold(*pte);
1129                 return 1;
1130         }
1131         return 0;
1132 }
1133 #else
1134 static int stage2_ptep_test_and_clear_young(pte_t *pte)
1135 {
1136         return __ptep_test_and_clear_young(pte);
1137 }
1138 #endif
1139
1140 static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
1141 {
1142         return stage2_ptep_test_and_clear_young((pte_t *)pmd);
1143 }
1144
1145 static int stage2_pudp_test_and_clear_young(pud_t *pud)
1146 {
1147         return stage2_ptep_test_and_clear_young((pte_t *)pud);
1148 }
1149
1150 /**
1151  * kvm_phys_addr_ioremap - map a device range to guest IPA
1152  *
1153  * @kvm:        The KVM pointer
1154  * @guest_ipa:  The IPA at which to insert the mapping
1155  * @pa:         The physical address of the device
1156  * @size:       The size of the mapping
 * @writable:   Whether or not to create a writable mapping
1157  */
1158 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
1159                           phys_addr_t pa, unsigned long size, bool writable)
1160 {
1161         phys_addr_t addr;
1162         int ret = 0;
1163         struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, };
1164         struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
1165         enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
1166                                      KVM_PGTABLE_PROT_R |
1167                                      (writable ? KVM_PGTABLE_PROT_W : 0);
1168
1169         size += offset_in_page(guest_ipa);
1170         guest_ipa &= PAGE_MASK;
1171
1172         for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
1173                 ret = kvm_mmu_topup_memory_cache(&cache,
1174                                                  kvm_mmu_cache_min_pages(kvm));
1175                 if (ret)
1176                         break;
1177
1178                 spin_lock(&kvm->mmu_lock);
1179                 ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
1180                                              &cache);
1181                 spin_unlock(&kvm->mmu_lock);
1182                 if (ret)
1183                         break;
1184
1185                 pa += PAGE_SIZE;
1186         }
1187
1188         kvm_mmu_free_memory_cache(&cache);
1189         return ret;
1190 }
1191
1192 /**
1193  * stage2_wp_ptes - write protect PMD range
1194  * @pmd:        pointer to pmd entry
1195  * @addr:       range start address
1196  * @end:        range end address
1197  */
1198 static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
1199 {
1200         pte_t *pte;
1201
1202         pte = pte_offset_kernel(pmd, addr);
1203         do {
1204                 if (!pte_none(*pte)) {
1205                         if (!kvm_s2pte_readonly(pte))
1206                                 kvm_set_s2pte_readonly(pte);
1207                 }
1208         } while (pte++, addr += PAGE_SIZE, addr != end);
1209 }
1210
1211 /**
1212  * stage2_wp_pmds - write protect PUD range
1213  * @mmu:        The KVM stage-2 MMU pointer
1214  * @pud:        pointer to pud entry
1215  * @addr:       range start address
1216  * @end:        range end address
1217  */
1218 static void stage2_wp_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
1219                            phys_addr_t addr, phys_addr_t end)
1220 {
1221         struct kvm *kvm = mmu->kvm;
1222         pmd_t *pmd;
1223         phys_addr_t next;
1224
1225         pmd = stage2_pmd_offset(kvm, pud, addr);
1226
1227         do {
1228                 next = stage2_pmd_addr_end(kvm, addr, end);
1229                 if (!pmd_none(*pmd)) {
1230                         if (pmd_thp_or_huge(*pmd)) {
1231                                 if (!kvm_s2pmd_readonly(pmd))
1232                                         kvm_set_s2pmd_readonly(pmd);
1233                         } else {
1234                                 stage2_wp_ptes(pmd, addr, next);
1235                         }
1236                 }
1237         } while (pmd++, addr = next, addr != end);
1238 }
1239
1240 /**
1241  * stage2_wp_puds - write protect P4D range
1242  * @p4d:        pointer to p4d entry
1243  * @addr:       range start address
1244  * @end:        range end address
1245  */
1246 static void  stage2_wp_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
1247                             phys_addr_t addr, phys_addr_t end)
1248 {
1249         struct kvm *kvm = mmu->kvm;
1250         pud_t *pud;
1251         phys_addr_t next;
1252
1253         pud = stage2_pud_offset(kvm, p4d, addr);
1254         do {
1255                 next = stage2_pud_addr_end(kvm, addr, end);
1256                 if (!stage2_pud_none(kvm, *pud)) {
1257                         if (stage2_pud_huge(kvm, *pud)) {
1258                                 if (!kvm_s2pud_readonly(pud))
1259                                         kvm_set_s2pud_readonly(pud);
1260                         } else {
1261                                 stage2_wp_pmds(mmu, pud, addr, next);
1262                         }
1263                 }
1264         } while (pud++, addr = next, addr != end);
1265 }
1266
1267 /**
1268  * stage2_wp_p4ds - write protect PGD range
1269  * @pgd:        pointer to pgd entry
1270  * @addr:       range start address
1271  * @end:        range end address
1272  */
1273 static void  stage2_wp_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
1274                             phys_addr_t addr, phys_addr_t end)
1275 {
1276         struct kvm *kvm = mmu->kvm;
1277         p4d_t *p4d;
1278         phys_addr_t next;
1279
1280         p4d = stage2_p4d_offset(kvm, pgd, addr);
1281         do {
1282                 next = stage2_p4d_addr_end(kvm, addr, end);
1283                 if (!stage2_p4d_none(kvm, *p4d))
1284                         stage2_wp_puds(mmu, p4d, addr, next);
1285         } while (p4d++, addr = next, addr != end);
1286 }
1287
1288 /**
1289  * stage2_wp_range() - write protect stage2 memory region range
1290  * @mmu:        The KVM stage-2 MMU pointer
1291  * @addr:       Start address of range
1292  * @end:        End address of range
1293  */
1294 static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
1295 {
1296         struct kvm *kvm = mmu->kvm;
1297         stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_wrprotect);
1298 }
1299
1300 /**
1301  * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
1302  * @kvm:        The KVM pointer
1303  * @slot:       The memory slot to write protect
1304  *
1305  * Called to start logging dirty pages after the memory region
1306  * KVM_MEM_LOG_DIRTY_PAGES operation is invoked. After this function returns,
1307  * all present PUD, PMD and PTE entries in the memory region are write
1308  * protected. Afterwards, the dirty page log can be read.
1309  *
1310  * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
1311  * serializing operations for VM memory regions.
1312  */
1313 void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
1314 {
1315         struct kvm_memslots *slots = kvm_memslots(kvm);
1316         struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
1317         phys_addr_t start, end;
1318
1319         if (WARN_ON_ONCE(!memslot))
1320                 return;
1321
1322         start = memslot->base_gfn << PAGE_SHIFT;
1323         end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
1324
1325         spin_lock(&kvm->mmu_lock);
1326         stage2_wp_range(&kvm->arch.mmu, start, end);
1327         spin_unlock(&kvm->mmu_lock);
1328         kvm_flush_remote_tlbs(kvm);
1329 }
1330
1331 /**
1332  * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
1333  * @kvm:        The KVM pointer
1334  * @slot:       The memory slot associated with mask
1335  * @gfn_offset: The gfn offset in memory slot
1336  * @mask:       The mask of dirty pages at offset 'gfn_offset' in this memory
1337  *              slot to be write protected
1338  *
1339  * Walks the bits set in @mask and write protects the associated PTEs. Caller must
1340  * acquire kvm_mmu_lock.
1341  */
1342 static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1343                 struct kvm_memory_slot *slot,
1344                 gfn_t gfn_offset, unsigned long mask)
1345 {
1346         phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
1347         phys_addr_t start = (base_gfn +  __ffs(mask)) << PAGE_SHIFT;
1348         phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
1349
1350         stage2_wp_range(&kvm->arch.mmu, start, end);
1351 }
1352
1353 /*
1354  * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
1355  * dirty pages.
1356  *
1357  * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
1358  * enable dirty logging for them.
1359  */
1360 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1361                 struct kvm_memory_slot *slot,
1362                 gfn_t gfn_offset, unsigned long mask)
1363 {
1364         kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1365 }
1366
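/*
 * Cache maintenance for pages about to be mapped into the guest: the
 * D-cache is cleaned so the guest observes up-to-date memory even while
 * running with caches disabled, and the I-cache is invalidated before an
 * executable mapping is created.
 */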
1367 static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
1368 {
1369         __clean_dcache_guest_page(pfn, size);
1370 }
1371
1372 static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
1373 {
1374         __invalidate_icache_guest_page(pfn, size);
1375 }
1376
1377 static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
1378 {
1379         send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
1380 }
1381
1382 static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
1383                                                unsigned long hva,
1384                                                unsigned long map_size)
1385 {
1386         gpa_t gpa_start;
1387         hva_t uaddr_start, uaddr_end;
1388         size_t size;
1389
1390         /* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */
1391         if (map_size == PAGE_SIZE)
1392                 return true;
1393
1394         size = memslot->npages * PAGE_SIZE;
1395
1396         gpa_start = memslot->base_gfn << PAGE_SHIFT;
1397
1398         uaddr_start = memslot->userspace_addr;
1399         uaddr_end = uaddr_start + size;
1400
1401         /*
1402          * Pages belonging to memslots that don't have the same alignment
1403          * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
1404          * PMD/PUD entries, because we'll end up mapping the wrong pages.
1405          *
1406          * Consider a layout like the following:
1407          *
1408          *    memslot->userspace_addr:
1409          *    +-----+--------------------+--------------------+---+
1410          *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
1411          *    +-----+--------------------+--------------------+---+
1412          *
1413          *    memslot->base_gfn << PAGE_SHIFT:
1414          *      +---+--------------------+--------------------+-----+
1415          *      |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
1416          *      +---+--------------------+--------------------+-----+
1417          *
1418          * If we create those stage-2 blocks, we'll end up with this incorrect
1419          * mapping:
1420          *   d -> f
1421          *   e -> g
1422          *   f -> h
1423          */
1424         if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
1425                 return false;
1426
1427         /*
1428          * Next, let's make sure we're not trying to map anything not covered
1429          * by the memslot. This means we have to prohibit block size mappings
1430          * for the beginning and end of a non-block aligned and non-block sized
1431          * memory slot (illustrated by the head and tail parts of the
1432          * userspace view above containing pages 'abcde' and 'xyz',
1433          * respectively).
1434          *
1435          * Note that it doesn't matter if we do the check using the
1436          * userspace_addr or the base_gfn, as both are equally aligned (per
1437          * the check above) and equally sized.
1438          */
1439         return (hva & ~(map_size - 1)) >= uaddr_start &&
1440                (hva & ~(map_size - 1)) + map_size <= uaddr_end;
1441 }
1442
1443 /*
1444  * Check if the given hva is backed by a transparent huge page (THP) and
1445  * whether it can be mapped using block mapping in stage2. If so, adjust
1446  * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently
1447  * supported. This will need to be updated to support other THP sizes.
1448  *
1449  * Returns the size of the mapping.
1450  */
1451 static unsigned long
1452 transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
1453                             unsigned long hva, kvm_pfn_t *pfnp,
1454                             phys_addr_t *ipap)
1455 {
1456         kvm_pfn_t pfn = *pfnp;
1457
1458         /*
1459          * Make sure the adjustment is done only for THP pages. Also make
1460          * sure that the HVA and IPA are sufficiently aligned and that the
1461          * block map is contained within the memslot.
1462          */
1463         if (kvm_is_transparent_hugepage(pfn) &&
1464             fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
1465                 /*
1466                  * The address we faulted on is backed by a transparent huge
1467                  * page.  However, because we map the compound huge page and
1468                  * not the individual tail page, we need to transfer the
1469                  * refcount to the head page.  We have to be careful that the
1470                  * THP doesn't start to split while we are adjusting the
1471                  * refcounts.
1472                  *
1473                  * We are sure this doesn't happen, because mmu_notifier_retry
1474                  * was successful and we are holding the mmu_lock, so if this
1475                  * THP is trying to split, it will be blocked in the mmu
1476                  * notifier before touching any of the pages, specifically
1477                  * before being able to call __split_huge_page_refcount().
1478                  *
1479                  * We can therefore safely transfer the refcount from PG_tail
1480                  * to PG_head and switch the pfn from a tail page to the head
1481                  * page accordingly.
1482                  */
1483                 *ipap &= PMD_MASK;
1484                 kvm_release_pfn_clean(pfn);
1485                 pfn &= ~(PTRS_PER_PMD - 1);
1486                 kvm_get_pfn(pfn);
1487                 *pfnp = pfn;
1488
1489                 return PMD_SIZE;
1490         }
1491
1492         /* Use page mapping if we cannot use block mapping. */
1493         return PAGE_SIZE;
1494 }
1495
1496 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
1497                           struct kvm_memory_slot *memslot, unsigned long hva,
1498                           unsigned long fault_status)
1499 {
1500         int ret;
1501         bool write_fault, writable, force_pte = false;
1502         bool exec_fault, needs_exec;
1503         unsigned long mmu_seq;
1504         gfn_t gfn = fault_ipa >> PAGE_SHIFT;
1505         struct kvm *kvm = vcpu->kvm;
1506         struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
1507         struct vm_area_struct *vma;
1508         short vma_shift;
1509         kvm_pfn_t pfn;
1510         pgprot_t mem_type = PAGE_S2;
1511         bool logging_active = memslot_is_logging(memslot);
1512         unsigned long vma_pagesize, flags = 0;
1513         struct kvm_s2_mmu *mmu = vcpu->arch.hw_mmu;
1514
1515         write_fault = kvm_is_write_fault(vcpu);
1516         exec_fault = kvm_vcpu_trap_is_iabt(vcpu);
1517         VM_BUG_ON(write_fault && exec_fault);
1518
1519         if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
1520                 kvm_err("Unexpected L2 read permission error\n");
1521                 return -EFAULT;
1522         }
1523
1524         /* Let's check if we will get back a huge page backed by hugetlbfs */
1525         mmap_read_lock(current->mm);
1526         vma = find_vma_intersection(current->mm, hva, hva + 1);
1527         if (unlikely(!vma)) {
1528                 kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
1529                 mmap_read_unlock(current->mm);
1530                 return -EFAULT;
1531         }
1532
1533         if (is_vm_hugetlb_page(vma))
1534                 vma_shift = huge_page_shift(hstate_vma(vma));
1535         else
1536                 vma_shift = PAGE_SHIFT;
1537
1538         vma_pagesize = 1ULL << vma_shift;
1539         if (logging_active ||
1540             (vma->vm_flags & VM_PFNMAP) ||
1541             !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
1542                 force_pte = true;
1543                 vma_pagesize = PAGE_SIZE;
1544         }
1545
1546         /*
1547          * The stage2 page tables have a minimum of 2 levels (for arm64 see
1548          * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can
1549          * use PMD_SIZE huge mappings (even when the PMD is folded into the PGD).
1550          * As for PUD huge mappings, we must make sure that we have at least
1551          * 3 levels, i.e., that the PMD is not folded.
1552          */
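        /*
         * When a huge mapping is possible, gfn is realigned below to the
         * first page of the huge-page-sized block containing fault_ipa, so
         * that gfn_to_pfn_prot() returns the pfn of the start of the block
         * rather than that of the faulting page within it.
         */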
1553         if (vma_pagesize == PMD_SIZE ||
1554             (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm)))
1555                 gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
1556         mmap_read_unlock(current->mm);
1557
1558         /* We need a minimum of second+third level page-table pages */
1559         ret = kvm_mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm));
1560         if (ret)
1561                 return ret;
1562
1563         mmu_seq = vcpu->kvm->mmu_notifier_seq;
1564         /*
1565          * Ensure the read of mmu_notifier_seq happens before we call
1566          * gfn_to_pfn_prot() (which calls get_user_pages()), so that we don't
1567          * risk the page we just got a reference to getting unmapped before we
1568          * have a chance to grab the mmu_lock, which ensures that if the page
1569          * gets unmapped afterwards, the call to kvm_unmap_hva will take it
1570          * away from us again properly. This smp_rmb() pairs with the smp_wmb()
1571          * in kvm_mmu_notifier_invalidate_<page|range_end>.
1572          */
1573         smp_rmb();
1574
1575         pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
1576         if (pfn == KVM_PFN_ERR_HWPOISON) {
1577                 kvm_send_hwpoison_signal(hva, vma_shift);
1578                 return 0;
1579         }
1580         if (is_error_noslot_pfn(pfn))
1581                 return -EFAULT;
1582
1583         if (kvm_is_device_pfn(pfn)) {
1584                 mem_type = PAGE_S2_DEVICE;
1585                 flags |= KVM_S2PTE_FLAG_IS_IOMAP;
1586         } else if (logging_active) {
1587                 /*
1588                  * Faults on pages in a memslot with logging enabled
1589                  * should not be mapped with huge pages (it introduces churn
1590                  * and performance degradation), so force a pte mapping.
1591                  */
1592                 flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
1593
1594                 /*
1595                  * Only actually map the page as writable if this was a write
1596                  * fault.
1597                  */
1598                 if (!write_fault)
1599                         writable = false;
1600         }
1601
1602         if (exec_fault && is_iomap(flags))
1603                 return -ENOEXEC;
1604
1605         spin_lock(&kvm->mmu_lock);
1606         if (mmu_notifier_retry(kvm, mmu_seq))
1607                 goto out_unlock;
1608
1609         /*
1610          * If we are not forced to use page mapping, check if we are
1611          * backed by a THP and thus use block mapping if possible.
1612          */
1613         if (vma_pagesize == PAGE_SIZE && !force_pte)
1614                 vma_pagesize = transparent_hugepage_adjust(memslot, hva,
1615                                                            &pfn, &fault_ipa);
1616         if (writable)
1617                 kvm_set_pfn_dirty(pfn);
1618
1619         if (fault_status != FSC_PERM && !is_iomap(flags))
1620                 clean_dcache_guest_page(pfn, vma_pagesize);
1621
1622         if (exec_fault)
1623                 invalidate_icache_guest_page(pfn, vma_pagesize);
1624
1625         /*
1626          * If we took an execution fault we have made the
1627          * icache/dcache coherent above and should now let the s2
1628          * mapping be executable.
1629          *
1630          * Write faults (!exec_fault && FSC_PERM) are orthogonal to
1631          * execute permissions, and we preserve whatever we have.
1632          */
1633         needs_exec = exec_fault ||
1634                 (fault_status == FSC_PERM &&
1635                  stage2_is_exec(mmu, fault_ipa, vma_pagesize));
1636
1637         if (vma_pagesize == PUD_SIZE) {
1638                 pud_t new_pud = kvm_pfn_pud(pfn, mem_type);
1639
1640                 new_pud = kvm_pud_mkhuge(new_pud);
1641                 if (writable)
1642                         new_pud = kvm_s2pud_mkwrite(new_pud);
1643
1644                 if (needs_exec)
1645                         new_pud = kvm_s2pud_mkexec(new_pud);
1646
1647                 ret = stage2_set_pud_huge(mmu, memcache, fault_ipa, &new_pud);
1648         } else if (vma_pagesize == PMD_SIZE) {
1649                 pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type);
1650
1651                 new_pmd = kvm_pmd_mkhuge(new_pmd);
1652
1653                 if (writable)
1654                         new_pmd = kvm_s2pmd_mkwrite(new_pmd);
1655
1656                 if (needs_exec)
1657                         new_pmd = kvm_s2pmd_mkexec(new_pmd);
1658
1659                 ret = stage2_set_pmd_huge(mmu, memcache, fault_ipa, &new_pmd);
1660         } else {
1661                 pte_t new_pte = kvm_pfn_pte(pfn, mem_type);
1662
1663                 if (writable) {
1664                         new_pte = kvm_s2pte_mkwrite(new_pte);
1665                         mark_page_dirty(kvm, gfn);
1666                 }
1667
1668                 if (needs_exec)
1669                         new_pte = kvm_s2pte_mkexec(new_pte);
1670
1671                 ret = stage2_set_pte(mmu, memcache, fault_ipa, &new_pte, flags);
1672         }
1673
1674 out_unlock:
1675         spin_unlock(&kvm->mmu_lock);
1676         kvm_set_pfn_accessed(pfn);
1677         kvm_release_pfn_clean(pfn);
1678         return ret;
1679 }
1680
1681 /* Resolve the access fault by making the page young again. */
1682 static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
1683 {
1684         pte_t pte;
1685         kvm_pte_t kpte;
1686         struct kvm_s2_mmu *mmu;
1687
1688         trace_kvm_access_fault(fault_ipa);
1689
1690         spin_lock(&vcpu->kvm->mmu_lock);
1691         mmu = vcpu->arch.hw_mmu;
1692         kpte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa);
1693         spin_unlock(&vcpu->kvm->mmu_lock);
1694
1695         pte = __pte(kpte);
1696         if (pte_valid(pte))
1697                 kvm_set_pfn_accessed(pte_pfn(pte));
1698 }
1699
1700 /**
1701  * kvm_handle_guest_abort - handles all 2nd stage aborts
1702  * @vcpu:       the VCPU pointer
1703  *
1704  * Any abort that gets to the host is almost guaranteed to be caused by a
1705  * missing second stage translation table entry, which can mean either that the
1706  * guest simply needs more memory and we must allocate an appropriate page, or
1707  * that the guest tried to access I/O memory, which is emulated by user space.
1708  * The distinction is based on the IPA causing the fault and whether this
1709  * memory region has been registered as standard RAM by user space.
1710  */
1711 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
1712 {
1713         unsigned long fault_status;
1714         phys_addr_t fault_ipa;
1715         struct kvm_memory_slot *memslot;
1716         unsigned long hva;
1717         bool is_iabt, write_fault, writable;
1718         gfn_t gfn;
1719         int ret, idx;
1720
1721         fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
1722
1723         fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
1724         is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
1725
1726         /* Synchronous External Abort? */
1727         if (kvm_vcpu_abt_issea(vcpu)) {
1728                 /*
1729                  * For RAS the host kernel may handle this abort.
1730                  * There is no need to pass the error into the guest.
1731                  */
1732                 if (kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_esr(vcpu)))
1733                         kvm_inject_vabt(vcpu);
1734
1735                 return 1;
1736         }
1737
1738         trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
1739                               kvm_vcpu_get_hfar(vcpu), fault_ipa);
1740
1741         /* Check that the stage-2 fault is a translation, permission or access fault */
1742         if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
1743             fault_status != FSC_ACCESS) {
1744                 kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
1745                         kvm_vcpu_trap_get_class(vcpu),
1746                         (unsigned long)kvm_vcpu_trap_get_fault(vcpu),
1747                         (unsigned long)kvm_vcpu_get_esr(vcpu));
1748                 return -EFAULT;
1749         }
1750
1751         idx = srcu_read_lock(&vcpu->kvm->srcu);
1752
1753         gfn = fault_ipa >> PAGE_SHIFT;
1754         memslot = gfn_to_memslot(vcpu->kvm, gfn);
1755         hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
1756         write_fault = kvm_is_write_fault(vcpu);
1757         if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
1758                 /*
1759                  * The guest has put either its instructions or its page-tables
1760                  * somewhere it shouldn't have. Userspace won't be able to do
1761                  * anything about this (there's no syndrome for a start), so
1762                  * re-inject the abort back into the guest.
1763                  */
1764                 if (is_iabt) {
1765                         ret = -ENOEXEC;
1766                         goto out;
1767                 }
1768
1769                 if (kvm_vcpu_dabt_iss1tw(vcpu)) {
1770                         kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
1771                         ret = 1;
1772                         goto out_unlock;
1773                 }
1774
1775                 /*
1776                  * Check for a cache maintenance operation. Since we
1777                  * ended-up here, we know it is outside of any memory
1778                  * slot. But we can't find out if that is for a device,
1779                  * or if the guest is just being stupid. The only thing
1780                  * we know for sure is that this range cannot be cached.
1781                  *
1782                  * So let's assume that the guest is just being
1783                  * cautious, and skip the instruction.
1784                  */
1785                 if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) {
1786                         kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
1787                         ret = 1;
1788                         goto out_unlock;
1789                 }
1790
1791                 /*
1792                  * The IPA is reported as [MAX:12], so we need to
1793                  * complement it with the bottom 12 bits from the
1794                  * faulting VA. This is always 12 bits, irrespective
1795                  * of the page size.
1796                  */
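                /*
                 * Worked example (illustrative values only): a reported IPA
                 * of 0x8001000 combined with HFAR low bits of 0xa34 yields
                 * fault_ipa == 0x8001a34, i.e. 0x8001000 | (0xa34 & 0xfff).
                 */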
1797                 fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
1798                 ret = io_mem_abort(vcpu, fault_ipa);
1799                 goto out_unlock;
1800         }
1801
1802         /* Userspace should not be able to register out-of-bounds IPAs */
1803         VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm));
1804
1805         if (fault_status == FSC_ACCESS) {
1806                 handle_access_fault(vcpu, fault_ipa);
1807                 ret = 1;
1808                 goto out_unlock;
1809         }
1810
1811         ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
1812         if (ret == 0)
1813                 ret = 1;
1814 out:
1815         if (ret == -ENOEXEC) {
1816                 kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
1817                 ret = 1;
1818         }
1819 out_unlock:
1820         srcu_read_unlock(&vcpu->kvm->srcu, idx);
1821         return ret;
1822 }
1823
1824 static int handle_hva_to_gpa(struct kvm *kvm,
1825                              unsigned long start,
1826                              unsigned long end,
1827                              int (*handler)(struct kvm *kvm,
1828                                             gpa_t gpa, u64 size,
1829                                             void *data),
1830                              void *data)
1831 {
1832         struct kvm_memslots *slots;
1833         struct kvm_memory_slot *memslot;
1834         int ret = 0;
1835
1836         slots = kvm_memslots(kvm);
1837
1838         /* we only care about the pages that the guest sees */
1839         kvm_for_each_memslot(memslot, slots) {
1840                 unsigned long hva_start, hva_end;
1841                 gfn_t gpa;
1842
1843                 hva_start = max(start, memslot->userspace_addr);
1844                 hva_end = min(end, memslot->userspace_addr +
1845                                         (memslot->npages << PAGE_SHIFT));
1846                 if (hva_start >= hva_end)
1847                         continue;
1848
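                /*
                 * Translate the start of the overlap back into guest physical
                 * space: hva_to_gfn_memslot() converts the offset of hva_start
                 * within the memslot into a guest frame number, and the shift
                 * turns that into the base GPA handed to the handler.
                 */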
1849                 gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT;
1850                 ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data);
1851         }
1852
1853         return ret;
1854 }
1855
1856 static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
1857 {
1858         unsigned flags = *(unsigned *)data;
1859         bool may_block = flags & MMU_NOTIFIER_RANGE_BLOCKABLE;
1860
1861         __unmap_stage2_range(&kvm->arch.mmu, gpa, size, may_block);
1862         return 0;
1863 }
1864
1865 int kvm_unmap_hva_range(struct kvm *kvm,
1866                         unsigned long start, unsigned long end, unsigned flags)
1867 {
1868         if (!kvm->arch.mmu.pgd)
1869                 return 0;
1870
1871         trace_kvm_unmap_hva_range(start, end);
1872         handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, &flags);
1873         return 0;
1874 }
1875
1876 static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
1877 {
1878         kvm_pfn_t *pfn = (kvm_pfn_t *)data;
1879
1880         WARN_ON(size != PAGE_SIZE);
1881
1882         /*
1883          * The MMU notifiers will have unmapped a huge PMD before calling
1884          * ->change_pte() (which in turn calls kvm_set_spte_hva()) and
1885          * therefore we never need to clear out a huge PMD through this
1886          * calling path and a memcache is not required.
1887          */
1888         kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, gpa, PAGE_SIZE,
1889                                __pfn_to_phys(*pfn), KVM_PGTABLE_PROT_R, NULL);
1890         return 0;
1891 }
1892
1893 int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
1894 {
1895         unsigned long end = hva + PAGE_SIZE;
1896         kvm_pfn_t pfn = pte_pfn(pte);
1897
1898         if (!kvm->arch.mmu.pgt)
1899                 return 0;
1900
1901         trace_kvm_set_spte_hva(hva);
1902
1903         /*
1904          * We've moved a page around, probably through CoW, so let's treat it
1905          * just like a translation fault and clean the cache to the PoC.
1906          */
1907         clean_dcache_guest_page(pfn, PAGE_SIZE);
1908         handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pfn);
1909         return 0;
1910 }
1911
1912 static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
1913 {
1914         pte_t pte;
1915         kvm_pte_t kpte;
1916
1917         WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
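        /*
         * kvm_pgtable_stage2_mkold() clears the access flag and hands back
         * the previous stage-2 PTE as a raw kvm_pte_t; wrapping it in
         * __pte() lets us reuse the generic pte_valid()/pte_young() helpers
         * to report whether the mapping was young.
         */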
1918         kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt, gpa);
1919         pte = __pte(kpte);
1920         return pte_valid(pte) && pte_young(pte);
1921 }
1922
1923 static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
1924 {
1925         WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
1926         return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt, gpa);
1927 }
1928
1929 int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
1930 {
1931         if (!kvm->arch.mmu.pgd)
1932                 return 0;
1933         trace_kvm_age_hva(start, end);
1934         return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
1935 }
1936
1937 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
1938 {
1939         if (!kvm->arch.mmu.pgd)
1940                 return 0;
1941         trace_kvm_test_age_hva(hva);
1942         return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE,
1943                                  kvm_test_age_hva_handler, NULL);
1944 }
1945
1946 phys_addr_t kvm_mmu_get_httbr(void)
1947 {
1948         return __pa(hyp_pgtable->pgd);
1949 }
1950
1951 phys_addr_t kvm_get_idmap_vector(void)
1952 {
1953         return hyp_idmap_vector;
1954 }
1955
1956 static int kvm_map_idmap_text(void)
1957 {
1958         unsigned long size = hyp_idmap_end - hyp_idmap_start;
1959         int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start,
1960                                         PAGE_HYP_EXEC);
1961         if (err)
1962                 kvm_err("Failed to idmap %lx-%lx\n",
1963                         hyp_idmap_start, hyp_idmap_end);
1964
1965         return err;
1966 }
1967
1968 int kvm_mmu_init(void)
1969 {
1970         int err;
1971         u32 hyp_va_bits;
1972
1973         hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
1974         hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
1975         hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end);
1976         hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
1977         hyp_idmap_vector = __pa_symbol(__kvm_hyp_init);
1978
1979         /*
1980          * We rely on the linker script to ensure at build time that the HYP
1981          * init code does not cross a page boundary.
1982          */
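        /*
         * The check below relies on hyp_idmap_start and (hyp_idmap_end - 1)
         * sharing all of their page-number bits when they live in the same
         * page: XOR leaves only the differing bits, so anything surviving
         * the PAGE_MASK test means the init code straddles a page boundary.
         */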
1983         BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
1984
1985         hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
1986         kvm_debug("Using %u-bit virtual addresses at EL2\n", hyp_va_bits);
1987         kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
1988         kvm_debug("HYP VA range: %lx:%lx\n",
1989                   kern_hyp_va(PAGE_OFFSET),
1990                   kern_hyp_va((unsigned long)high_memory - 1));
1991
1992         if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
1993             hyp_idmap_start <  kern_hyp_va((unsigned long)high_memory - 1) &&
1994             hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
1995                 /*
1996                  * The idmap page intersects with the HYP VA space;
1997                  * it is not safe to continue further.
1998                  */
1999                 kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
2000                 err = -EINVAL;
2001                 goto out;
2002         }
2003
2004         hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL);
2005         if (!hyp_pgtable) {
2006                 kvm_err("Hyp mode page-table not allocated\n");
2007                 err = -ENOMEM;
2008                 goto out;
2009         }
2010
2011         err = kvm_pgtable_hyp_init(hyp_pgtable, hyp_va_bits);
2012         if (err)
2013                 goto out_free_pgtable;
2014
2015         err = kvm_map_idmap_text();
2016         if (err)
2017                 goto out_destroy_pgtable;
2018
2019         io_map_base = hyp_idmap_start;
2020         return 0;
2021
2022 out_destroy_pgtable:
2023         kvm_pgtable_hyp_destroy(hyp_pgtable);
2024 out_free_pgtable:
2025         kfree(hyp_pgtable);
2026         hyp_pgtable = NULL;
2027 out:
2028         return err;
2029 }
2030
2031 void kvm_arch_commit_memory_region(struct kvm *kvm,
2032                                    const struct kvm_userspace_memory_region *mem,
2033                                    struct kvm_memory_slot *old,
2034                                    const struct kvm_memory_slot *new,
2035                                    enum kvm_mr_change change)
2036 {
2037         /*
2038          * At this point the memslot has been committed and there is an
2039          * allocated dirty_bitmap[]; dirty pages will be tracked while the
2040          * memory slot is write protected.
2041          */
2042         if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
2043                 /*
2044                  * If we're using initial-all-set, we don't need to write
2045                  * protect any pages because they're all reported as dirty.
2046                  * Huge pages and normal pages will be write protected gradually.
2047                  */
2048                 if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) {
2049                         kvm_mmu_wp_memory_region(kvm, mem->slot);
2050                 }
2051         }
2052 }
2053
2054 int kvm_arch_prepare_memory_region(struct kvm *kvm,
2055                                    struct kvm_memory_slot *memslot,
2056                                    const struct kvm_userspace_memory_region *mem,
2057                                    enum kvm_mr_change change)
2058 {
2059         hva_t hva = mem->userspace_addr;
2060         hva_t reg_end = hva + mem->memory_size;
2061         bool writable = !(mem->flags & KVM_MEM_READONLY);
2062         int ret = 0;
2063
2064         if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
2065                         change != KVM_MR_FLAGS_ONLY)
2066                 return 0;
2067
2068         /*
2069          * Prevent userspace from creating a memory region outside of the IPA
2070          * space addressable by the KVM guest.
2071          */
2072         if (memslot->base_gfn + memslot->npages >=
2073             (kvm_phys_size(kvm) >> PAGE_SHIFT))
2074                 return -EFAULT;
2075
2076         mmap_read_lock(current->mm);
2077         /*
2078          * A memory region could potentially cover multiple VMAs, and any holes
2079          * between them, so iterate over all of them to find out if we can map
2080          * any of them right now.
2081          *
2082          *     +--------------------------------------------+
2083          * +---------------+----------------+   +----------------+
2084          * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
2085          * +---------------+----------------+   +----------------+
2086          *     |               memory region                |
2087          *     +--------------------------------------------+
2088          */
2089         do {
2090                 struct vm_area_struct *vma = find_vma(current->mm, hva);
2091                 hva_t vm_start, vm_end;
2092
2093                 if (!vma || vma->vm_start >= reg_end)
2094                         break;
2095
2096                 /*
2097                  * Take the intersection of this VMA with the memory region
2098                  */
2099                 vm_start = max(hva, vma->vm_start);
2100                 vm_end = min(reg_end, vma->vm_end);
2101
2102                 if (vma->vm_flags & VM_PFNMAP) {
2103                         gpa_t gpa = mem->guest_phys_addr +
2104                                     (vm_start - mem->userspace_addr);
2105                         phys_addr_t pa;
2106
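                        /*
                         * This assumes the VM_PFNMAP provider encodes the
                         * base physical frame of the mapping in vm_pgoff
                         * (as /dev/mem-style and VFIO mmap handlers do), so
                         * the PA backing vm_start is that frame shifted up
                         * plus the offset of vm_start into the VMA.
                         */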
2107                         pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
2108                         pa += vm_start - vma->vm_start;
2109
2110                         /* IO region dirty page logging not allowed */
2111                         if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) {
2112                                 ret = -EINVAL;
2113                                 goto out;
2114                         }
2115
2116                         ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
2117                                                     vm_end - vm_start,
2118                                                     writable);
2119                         if (ret)
2120                                 break;
2121                 }
2122                 hva = vm_end;
2123         } while (hva < reg_end);
2124
2125         if (change == KVM_MR_FLAGS_ONLY)
2126                 goto out;
2127
2128         spin_lock(&kvm->mmu_lock);
2129         if (ret)
2130                 unmap_stage2_range(&kvm->arch.mmu, mem->guest_phys_addr, mem->memory_size);
2131         else
2132                 stage2_flush_memslot(kvm, memslot);
2133         spin_unlock(&kvm->mmu_lock);
2134 out:
2135         mmap_read_unlock(current->mm);
2136         return ret;
2137 }
2138
2139 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
2140 {
2141 }
2142
2143 void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
2144 {
2145 }
2146
2147 void kvm_arch_flush_shadow_all(struct kvm *kvm)
2148 {
2149         kvm_free_stage2_pgd(&kvm->arch.mmu);
2150 }
2151
2152 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
2153                                    struct kvm_memory_slot *slot)
2154 {
2155         gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
2156         phys_addr_t size = slot->npages << PAGE_SHIFT;
2157
2158         spin_lock(&kvm->mmu_lock);
2159         unmap_stage2_range(&kvm->arch.mmu, gpa, size);
2160         spin_unlock(&kvm->mmu_lock);
2161 }
2162
2163 /*
2164  * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
2165  *
2166  * Main problems:
2167  * - S/W ops are local to a CPU (not broadcast)
2168  * - We have line migration behind our back (speculation)
2169  * - System caches don't support S/W at all (damn!)
2170  *
2171  * In the face of the above, the best we can do is to try and convert
2172  * S/W ops to VA ops. Because the guest is not allowed to infer the
2173  * S/W to PA mapping, it can only use S/W to nuke the whole cache,
2174  * which is a rather good thing for us.
2175  *
2176  * Also, it is only used when turning caches on/off ("The expected
2177  * usage of the cache maintenance instructions that operate by set/way
2178  * is associated with the cache maintenance instructions associated
2179  * with the powerdown and powerup of caches, if this is required by
2180  * the implementation.").
2181  *
2182  * We use the following policy:
2183  *
2184  * - If we trap a S/W operation, we enable VM trapping to detect
2185  *   caches being turned on/off, and do a full clean.
2186  *
2187  * - We flush the caches both when they are turned on and when they are turned off.
2188  *
2189  * - Once the caches are enabled, we stop trapping VM ops.
2190  */
2191 void kvm_set_way_flush(struct kvm_vcpu *vcpu)
2192 {
2193         unsigned long hcr = *vcpu_hcr(vcpu);
2194
2195         /*
2196          * If this is the first time we do a S/W operation
2197          * (i.e. HCR_TVM not set), flush the whole memory and enable
2198          * VM trapping.
2199          *
2200          * Otherwise, rely on the VM trapping to wait for the MMU +
2201          * Caches to be turned off. At that point, we'll be able to
2202          * clean the caches again.
2203          */
2204         if (!(hcr & HCR_TVM)) {
2205                 trace_kvm_set_way_flush(*vcpu_pc(vcpu),
2206                                         vcpu_has_cache_enabled(vcpu));
2207                 stage2_flush_vm(vcpu->kvm);
2208                 *vcpu_hcr(vcpu) = hcr | HCR_TVM;
2209         }
2210 }
2211
2212 void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
2213 {
2214         bool now_enabled = vcpu_has_cache_enabled(vcpu);
2215
2216         /*
2217          * If switching the MMU+caches on, we need to invalidate the caches.
2218          * If switching them off, we need to clean the caches.
2219          * Clean + invalidate always does the trick.
2220          */
2221         if (now_enabled != was_enabled)
2222                 stage2_flush_vm(vcpu->kvm);
2223
2224         /* Caches are now on, stop trapping VM ops (until a S/W op) */
2225         if (now_enabled)
2226                 *vcpu_hcr(vcpu) &= ~HCR_TVM;
2227
2228         trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
2229 }