1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
4 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
7 #include <linux/mman.h>
8 #include <linux/kvm_host.h>
10 #include <linux/hugetlb.h>
11 #include <linux/sched/signal.h>
12 #include <trace/events/kvm.h>
13 #include <asm/pgalloc.h>
14 #include <asm/cacheflush.h>
15 #include <asm/kvm_arm.h>
16 #include <asm/kvm_mmu.h>
17 #include <asm/kvm_pgtable.h>
18 #include <asm/kvm_ras.h>
19 #include <asm/kvm_asm.h>
20 #include <asm/kvm_emulate.h>
25 static struct kvm_pgtable *hyp_pgtable;
26 static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
28 static unsigned long hyp_idmap_start;
29 static unsigned long hyp_idmap_end;
30 static phys_addr_t hyp_idmap_vector;
32 static unsigned long io_map_base;
34 #define KVM_S2PTE_FLAG_IS_IOMAP (1UL << 0)
35 #define KVM_S2_FLAG_LOGGING_ACTIVE (1UL << 1)
37 static bool is_iomap(unsigned long flags)
39 return flags & KVM_S2PTE_FLAG_IS_IOMAP;
43 * Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
44 * we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
45 * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too
46 * long will also starve other vCPUs. We also have to make sure that the page
47 * tables are not freed while we release the lock.
49 static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr,
50 phys_addr_t end,
51 int (*fn)(struct kvm_pgtable *, u64, u64),
52 bool resched)
58 struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
59 if (!pgt)
60 return -EINVAL;
62 next = stage2_pgd_addr_end(kvm, addr, end);
63 ret = fn(pgt, addr, next - addr);
64 if (ret)
65 break;
67 if (resched && next != end)
68 cond_resched_lock(&kvm->mmu_lock);
69 } while (addr = next, addr != end);
71 return ret;
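/*
 * For example, on a typical 4K-page host unmapping a large memslot,
 * stage2_pgd_addr_end() caps each fn() call at one stage-2 PGD entry,
 * so a multi-GB range is unmapped in chunks, with a cond_resched_lock()
 * between chunks rather than one long mmu_lock critical section.
 */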
74 static bool memslot_is_logging(struct kvm_memory_slot *memslot)
76 return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
80 * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
81 * @kvm: pointer to kvm structure.
83 * Interface to HYP function to flush all VM TLB entries
85 void kvm_flush_remote_tlbs(struct kvm *kvm)
87 kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
90 static void kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa,
93 kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ipa, level);
97 * D-Cache management functions. They take the page table entries by
98 * value, as they are flushing the cache using the kernel mapping (or
99 * kmap on 32bit).
101 static void kvm_flush_dcache_pte(pte_t pte)
103 __kvm_flush_dcache_pte(pte);
106 static void kvm_flush_dcache_pmd(pmd_t pmd)
108 __kvm_flush_dcache_pmd(pmd);
111 static void kvm_flush_dcache_pud(pud_t pud)
113 __kvm_flush_dcache_pud(pud);
116 static bool kvm_is_device_pfn(unsigned long pfn)
118 return !pfn_valid(pfn);
122 * stage2_dissolve_pmd() - clear and flush huge PMD entry
123 * @mmu: pointer to mmu structure to operate on
124 * @addr: IPA
125 * @pmd: pmd pointer for IPA
127 * Function clears a PMD entry and flushes @addr's 1st and 2nd stage TLB entries.
129 static void stage2_dissolve_pmd(struct kvm_s2_mmu *mmu, phys_addr_t addr, pmd_t *pmd)
131 if (!pmd_thp_or_huge(*pmd))
132 return;
134 pmd_clear(pmd);
135 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
136 put_page(virt_to_page(pmd));
140 * stage2_dissolve_pud() - clear and flush huge PUD entry
141 * @mmu: pointer to mmu structure to operate on
142 * @addr: IPA
143 * @pud: pud pointer for IPA
145 * Function clears a PUD entry and flushes @addr's 1st and 2nd stage TLB entries.
147 static void stage2_dissolve_pud(struct kvm_s2_mmu *mmu, phys_addr_t addr, pud_t *pudp)
149 struct kvm *kvm = mmu->kvm;
151 if (!stage2_pud_huge(kvm, *pudp))
152 return;
154 stage2_pud_clear(kvm, pudp);
155 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
156 put_page(virt_to_page(pudp));
159 static void clear_stage2_pgd_entry(struct kvm_s2_mmu *mmu, pgd_t *pgd, phys_addr_t addr)
161 struct kvm *kvm = mmu->kvm;
162 p4d_t *p4d_table __maybe_unused = stage2_p4d_offset(kvm, pgd, 0UL);
163 stage2_pgd_clear(kvm, pgd);
164 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
165 stage2_p4d_free(kvm, p4d_table);
166 put_page(virt_to_page(pgd));
169 static void clear_stage2_p4d_entry(struct kvm_s2_mmu *mmu, p4d_t *p4d, phys_addr_t addr)
171 struct kvm *kvm = mmu->kvm;
172 pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, p4d, 0);
173 stage2_p4d_clear(kvm, p4d);
174 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
175 stage2_pud_free(kvm, pud_table);
176 put_page(virt_to_page(p4d));
179 static void clear_stage2_pud_entry(struct kvm_s2_mmu *mmu, pud_t *pud, phys_addr_t addr)
181 struct kvm *kvm = mmu->kvm;
182 pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0);
184 VM_BUG_ON(stage2_pud_huge(kvm, *pud));
185 stage2_pud_clear(kvm, pud);
186 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
187 stage2_pmd_free(kvm, pmd_table);
188 put_page(virt_to_page(pud));
191 static void clear_stage2_pmd_entry(struct kvm_s2_mmu *mmu, pmd_t *pmd, phys_addr_t addr)
193 pte_t *pte_table = pte_offset_kernel(pmd, 0);
194 VM_BUG_ON(pmd_thp_or_huge(*pmd));
195 pmd_clear(pmd);
196 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
197 free_page((unsigned long)pte_table);
198 put_page(virt_to_page(pmd));
201 static inline void kvm_set_pte(pte_t *ptep, pte_t new_pte)
203 WRITE_ONCE(*ptep, new_pte);
207 static inline void kvm_set_pmd(pmd_t *pmdp, pmd_t new_pmd)
209 WRITE_ONCE(*pmdp, new_pmd);
213 static inline void kvm_pmd_populate(pmd_t *pmdp, pte_t *ptep)
215 kvm_set_pmd(pmdp, kvm_mk_pmd(ptep));
218 static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp)
220 WRITE_ONCE(*pudp, kvm_mk_pud(pmdp));
224 static inline void kvm_p4d_populate(p4d_t *p4dp, pud_t *pudp)
226 WRITE_ONCE(*p4dp, kvm_mk_p4d(pudp));
230 static inline void kvm_pgd_populate(pgd_t *pgdp, p4d_t *p4dp)
232 #ifndef __PAGETABLE_P4D_FOLDED
233 WRITE_ONCE(*pgdp, kvm_mk_pgd(p4dp));
234 #endif
239 * Unmapping vs dcache management:
241 * If a guest maps certain memory pages as uncached, all writes will
242 * bypass the data cache and go directly to RAM. However, the CPUs
243 * can still speculate reads (not writes) and fill cache lines with
244 * data.
246 * Those cache lines will be *clean* cache lines though, so a
247 * clean+invalidate operation is equivalent to an invalidate
248 * operation, because no cache lines are marked dirty.
250 * Those clean cache lines could be filled prior to an uncached write
251 * by the guest, and the cache coherent IO subsystem would therefore
252 * end up writing old data to disk.
254 * This is why right after unmapping a page/section and invalidating
255 * the corresponding TLBs, we flush to make sure the IO subsystem will
256 * never hit in the cache.
258 * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
259 * we then fully enforce cacheability of RAM, no matter what the guest
260 * does.
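/*
 * Concrete scenario this guards against: the guest writes a buffer
 * through an uncached mapping, but a speculative read has already
 * allocated a clean line for that page. A cache-coherent I/O master
 * could then be served the stale clean line instead of the RAM
 * contents. Flushing (clean+invalidate, effectively just invalidate)
 * right after the unmap guarantees the I/O subsystem misses in the cache.
 */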
262 static void unmap_stage2_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd,
263 phys_addr_t addr, phys_addr_t end)
265 phys_addr_t start_addr = addr;
266 pte_t *pte, *start_pte;
268 start_pte = pte = pte_offset_kernel(pmd, addr);
270 if (!pte_none(*pte)) {
271 pte_t old_pte = *pte;
273 kvm_set_pte(pte, __pte(0));
274 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PTE_LEVEL);
276 /* No need to invalidate the cache for device mappings */
277 if (!kvm_is_device_pfn(pte_pfn(old_pte)))
278 kvm_flush_dcache_pte(old_pte);
280 put_page(virt_to_page(pte));
282 } while (pte++, addr += PAGE_SIZE, addr != end);
284 if (stage2_pte_table_empty(mmu->kvm, start_pte))
285 clear_stage2_pmd_entry(mmu, pmd, start_addr);
288 static void unmap_stage2_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
289 phys_addr_t addr, phys_addr_t end)
291 struct kvm *kvm = mmu->kvm;
292 phys_addr_t next, start_addr = addr;
293 pmd_t *pmd, *start_pmd;
295 start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr);
297 next = stage2_pmd_addr_end(kvm, addr, end);
298 if (!pmd_none(*pmd)) {
299 if (pmd_thp_or_huge(*pmd)) {
300 pmd_t old_pmd = *pmd;
302 pmd_clear(pmd);
303 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
305 kvm_flush_dcache_pmd(old_pmd);
307 put_page(virt_to_page(pmd));
309 unmap_stage2_ptes(mmu, pmd, addr, next);
312 } while (pmd++, addr = next, addr != end);
314 if (stage2_pmd_table_empty(kvm, start_pmd))
315 clear_stage2_pud_entry(mmu, pud, start_addr);
318 static void unmap_stage2_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
319 phys_addr_t addr, phys_addr_t end)
321 struct kvm *kvm = mmu->kvm;
322 phys_addr_t next, start_addr = addr;
323 pud_t *pud, *start_pud;
325 start_pud = pud = stage2_pud_offset(kvm, p4d, addr);
327 next = stage2_pud_addr_end(kvm, addr, end);
328 if (!stage2_pud_none(kvm, *pud)) {
329 if (stage2_pud_huge(kvm, *pud)) {
330 pud_t old_pud = *pud;
332 stage2_pud_clear(kvm, pud);
333 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
334 kvm_flush_dcache_pud(old_pud);
335 put_page(virt_to_page(pud));
337 unmap_stage2_pmds(mmu, pud, addr, next);
340 } while (pud++, addr = next, addr != end);
342 if (stage2_pud_table_empty(kvm, start_pud))
343 clear_stage2_p4d_entry(mmu, p4d, start_addr);
346 static void unmap_stage2_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
347 phys_addr_t addr, phys_addr_t end)
349 struct kvm *kvm = mmu->kvm;
350 phys_addr_t next, start_addr = addr;
351 p4d_t *p4d, *start_p4d;
353 start_p4d = p4d = stage2_p4d_offset(kvm, pgd, addr);
355 next = stage2_p4d_addr_end(kvm, addr, end);
356 if (!stage2_p4d_none(kvm, *p4d))
357 unmap_stage2_puds(mmu, p4d, addr, next);
358 } while (p4d++, addr = next, addr != end);
360 if (stage2_p4d_table_empty(kvm, start_p4d))
361 clear_stage2_pgd_entry(mmu, pgd, start_addr);
365 * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
366 * @mmu: The KVM stage-2 MMU pointer
367 * @start: The intermediate physical base address of the range to unmap
368 * @size: The size of the area to unmap
370 * Clear a range of stage-2 mappings, lowering the various ref-counts. Must
371 * be called while holding mmu_lock (unless for freeing the stage2 pgd before
372 * destroying the VM), otherwise another faulting VCPU may come in and mess
373 * with things behind our backs.
375 static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size,
376 bool may_block)
378 struct kvm *kvm = mmu->kvm;
379 phys_addr_t end = start + size;
381 assert_spin_locked(&kvm->mmu_lock);
382 WARN_ON(size & ~PAGE_MASK);
383 WARN_ON(stage2_apply_range(kvm, start, end, kvm_pgtable_stage2_unmap,
384 may_block));
387 static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
389 __unmap_stage2_range(mmu, start, size, true);
392 static void stage2_flush_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd,
393 phys_addr_t addr, phys_addr_t end)
397 pte = pte_offset_kernel(pmd, addr);
399 if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte)))
400 kvm_flush_dcache_pte(*pte);
401 } while (pte++, addr += PAGE_SIZE, addr != end);
404 static void stage2_flush_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
405 phys_addr_t addr, phys_addr_t end)
407 struct kvm *kvm = mmu->kvm;
411 pmd = stage2_pmd_offset(kvm, pud, addr);
413 next = stage2_pmd_addr_end(kvm, addr, end);
414 if (!pmd_none(*pmd)) {
415 if (pmd_thp_or_huge(*pmd))
416 kvm_flush_dcache_pmd(*pmd);
418 stage2_flush_ptes(mmu, pmd, addr, next);
420 } while (pmd++, addr = next, addr != end);
423 static void stage2_flush_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
424 phys_addr_t addr, phys_addr_t end)
426 struct kvm *kvm = mmu->kvm;
430 pud = stage2_pud_offset(kvm, p4d, addr);
432 next = stage2_pud_addr_end(kvm, addr, end);
433 if (!stage2_pud_none(kvm, *pud)) {
434 if (stage2_pud_huge(kvm, *pud))
435 kvm_flush_dcache_pud(*pud);
437 stage2_flush_pmds(mmu, pud, addr, next);
439 } while (pud++, addr = next, addr != end);
442 static void stage2_flush_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
443 phys_addr_t addr, phys_addr_t end)
445 struct kvm *kvm = mmu->kvm;
449 p4d = stage2_p4d_offset(kvm, pgd, addr);
451 next = stage2_p4d_addr_end(kvm, addr, end);
452 if (!stage2_p4d_none(kvm, *p4d))
453 stage2_flush_puds(mmu, p4d, addr, next);
454 } while (p4d++, addr = next, addr != end);
457 static void stage2_flush_memslot(struct kvm *kvm,
458 struct kvm_memory_slot *memslot)
460 struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
461 phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
462 phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
466 pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
468 next = stage2_pgd_addr_end(kvm, addr, end);
469 if (!stage2_pgd_none(kvm, *pgd))
470 stage2_flush_p4ds(mmu, pgd, addr, next);
472 if (next != end)
473 cond_resched_lock(&kvm->mmu_lock);
474 } while (pgd++, addr = next, addr != end);
478 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
479 * @kvm: The struct kvm pointer
481 * Go through the stage 2 page tables and invalidate any cache lines
482 * backing memory already mapped to the VM.
484 static void stage2_flush_vm(struct kvm *kvm)
486 struct kvm_memslots *slots;
487 struct kvm_memory_slot *memslot;
490 idx = srcu_read_lock(&kvm->srcu);
491 spin_lock(&kvm->mmu_lock);
493 slots = kvm_memslots(kvm);
494 kvm_for_each_memslot(memslot, slots)
495 stage2_flush_memslot(kvm, memslot);
497 spin_unlock(&kvm->mmu_lock);
498 srcu_read_unlock(&kvm->srcu, idx);
502 * free_hyp_pgds - free Hyp-mode page tables
504 void free_hyp_pgds(void)
506 mutex_lock(&kvm_hyp_pgd_mutex);
507 if (hyp_pgtable) {
508 kvm_pgtable_hyp_destroy(hyp_pgtable);
509 kfree(hyp_pgtable);
510 }
511 mutex_unlock(&kvm_hyp_pgd_mutex);
514 static int __create_hyp_mappings(unsigned long start, unsigned long size,
515 unsigned long phys, enum kvm_pgtable_prot prot)
519 mutex_lock(&kvm_hyp_pgd_mutex);
520 err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot);
521 mutex_unlock(&kvm_hyp_pgd_mutex);
526 static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
528 if (!is_vmalloc_addr(kaddr)) {
529 BUG_ON(!virt_addr_valid(kaddr));
530 return __pa(kaddr);
531 } else {
532 return page_to_phys(vmalloc_to_page(kaddr)) +
533 offset_in_page(kaddr);
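/*
 * For example, a kmalloc()'d buffer lives in the kernel linear map, so
 * __pa() is a simple constant-offset translation. A vmalloc()'d or
 * module address has no linear mapping, so the backing page must be
 * looked up through the vmalloc page tables with vmalloc_to_page(),
 * with the sub-page offset re-applied by hand.
 */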
538 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
539 * @from: The virtual kernel start address of the range
540 * @to: The virtual kernel end address of the range (exclusive)
541 * @prot: The protection to be applied to this range
543 * The same virtual address as the kernel virtual address is also used
544 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
545 * physical pages.
547 int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
549 phys_addr_t phys_addr;
550 unsigned long virt_addr;
551 unsigned long start = kern_hyp_va((unsigned long)from);
552 unsigned long end = kern_hyp_va((unsigned long)to);
554 if (is_kernel_in_hyp_mode())
555 return 0;
557 start = start & PAGE_MASK;
558 end = PAGE_ALIGN(end);
560 for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
563 phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
564 err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr,
565 prot);
566 if (err)
567 return err;
570 return 0;
573 static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
574 unsigned long *haddr,
575 enum kvm_pgtable_prot prot)
580 mutex_lock(&kvm_hyp_pgd_mutex);
583 * This assumes that we have enough space below the idmap
584 * page to allocate our VAs. If not, the check below will
585 * kick in. A potential alternative would be to detect that
586 * overflow and switch to an allocation above the idmap.
588 * The allocated size is always a multiple of PAGE_SIZE.
590 size = PAGE_ALIGN(size + offset_in_page(phys_addr));
591 base = io_map_base - size;
594 * Verify that BIT(VA_BITS - 1) hasn't been flipped by
595 * allocating the new area, as it would indicate we've
596 * overflowed the idmap/IO address range.
598 if ((base ^ io_map_base) & BIT(VA_BITS - 1))
599 ret = -ENOMEM;
600 else
601 io_map_base = base;
603 mutex_unlock(&kvm_hyp_pgd_mutex);
605 if (ret)
606 goto out;
608 ret = __create_hyp_mappings(base, size, phys_addr, prot);
612 *haddr = base + offset_in_page(phys_addr);
613 out:
614 return ret;
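/*
 * Example of the allocator above, with made-up addresses: if
 * io_map_base is 0x1800000000 and a request covers 0x1800 bytes
 * starting at a PA with offset 0x800, PAGE_ALIGN(0x1800 + 0x800)
 * reserves two pages, base becomes 0x17ffffe000, and the returned HYP
 * VA is base + 0x800 so the mapping preserves the sub-page offset.
 * The BIT(VA_BITS - 1) test catches the downward-growing region
 * wrapping out of the idmap/IO half of the HYP VA space.
 */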
618 * create_hyp_io_mappings - Map IO into both kernel and HYP
619 * @phys_addr: The physical start address which gets mapped
620 * @size: Size of the region being mapped
621 * @kaddr: Kernel VA for this mapping
622 * @haddr: HYP VA for this mapping
624 int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
625 void __iomem **kaddr,
626 void __iomem **haddr)
631 *kaddr = ioremap(phys_addr, size);
632 if (!*kaddr)
633 return -ENOMEM;
635 if (is_kernel_in_hyp_mode()) {
636 *haddr = (void __iomem *)*kaddr;
637 return 0;
638 }
640 ret = __create_hyp_private_mapping(phys_addr, size,
641 &addr, PAGE_HYP_DEVICE);
642 if (ret) {
643 iounmap(*kaddr);
644 *kaddr = NULL;
645 *haddr = NULL;
646 return ret;
647 }
649 *haddr = (void __iomem *)addr;
650 return 0;
654 * create_hyp_exec_mappings - Map an executable range into HYP
655 * @phys_addr: The physical start address which gets mapped
656 * @size: Size of the region being mapped
657 * @haddr: HYP VA for this mapping
659 int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
665 BUG_ON(is_kernel_in_hyp_mode());
667 ret = __create_hyp_private_mapping(phys_addr, size,
668 &addr, PAGE_HYP_EXEC);
669 if (ret) {
670 *haddr = NULL;
671 return ret;
672 }
674 *haddr = (void *)addr;
675 return 0;
679 * kvm_init_stage2_mmu - Initialise a S2 MMU structure
680 * @kvm: The pointer to the KVM structure
681 * @mmu: The pointer to the s2 MMU structure
683 * Allocates only the stage-2 HW PGD level table(s).
684 * Note we don't need locking here as this is only called when the VM is
685 * created, which can only be done once.
687 int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
690 struct kvm_pgtable *pgt;
692 if (mmu->pgt != NULL) {
693 kvm_err("kvm_arch already initialized?\n");
694 return -EINVAL;
697 pgt = kzalloc(sizeof(*pgt), GFP_KERNEL);
698 if (!pgt)
699 return -ENOMEM;
701 err = kvm_pgtable_stage2_init(pgt, kvm);
702 if (err)
703 goto out_free_pgtable;
705 mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
706 if (!mmu->last_vcpu_ran) {
707 err = -ENOMEM;
708 goto out_destroy_pgtable;
711 for_each_possible_cpu(cpu)
712 *per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;
714 mmu->kvm = kvm;
715 mmu->pgt = pgt;
716 mmu->pgd_phys = __pa(pgt->pgd);
717 mmu->pgd = (void *)pgt->pgd;
718 mmu->vmid.vmid_gen = 0;
719 return 0;
721 out_destroy_pgtable:
722 kvm_pgtable_stage2_destroy(pgt);
723 out_free_pgtable:
724 kfree(pgt);
725 return err;
728 static void stage2_unmap_memslot(struct kvm *kvm,
729 struct kvm_memory_slot *memslot)
731 hva_t hva = memslot->userspace_addr;
732 phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
733 phys_addr_t size = PAGE_SIZE * memslot->npages;
734 hva_t reg_end = hva + size;
737 * A memory region could potentially cover multiple VMAs, and any holes
738 * between them, so iterate over all of them to find out if we should
739 * unmap any of them.
741 * +--------------------------------------------+
742 * +---------------+----------------+ +----------------+
743 * | : VMA 1 | VMA 2 | | VMA 3 : |
744 * +---------------+----------------+ +----------------+
745 *     |               memory region                |
746 *     +--------------------------------------------+
749 struct vm_area_struct *vma = find_vma(current->mm, hva);
750 hva_t vm_start, vm_end;
752 if (!vma || vma->vm_start >= reg_end)
753 break;
756 * Take the intersection of this VMA with the memory region
758 vm_start = max(hva, vma->vm_start);
759 vm_end = min(reg_end, vma->vm_end);
761 if (!(vma->vm_flags & VM_PFNMAP)) {
762 gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
763 unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start);
765 hva = vm_end;
766 } while (hva < reg_end);
770 * stage2_unmap_vm - Unmap Stage-2 RAM mappings
771 * @kvm: The struct kvm pointer
773 * Go through the memregions and unmap any regular RAM
774 * backing memory already mapped to the VM.
776 void stage2_unmap_vm(struct kvm *kvm)
778 struct kvm_memslots *slots;
779 struct kvm_memory_slot *memslot;
782 idx = srcu_read_lock(&kvm->srcu);
783 mmap_read_lock(current->mm);
784 spin_lock(&kvm->mmu_lock);
786 slots = kvm_memslots(kvm);
787 kvm_for_each_memslot(memslot, slots)
788 stage2_unmap_memslot(kvm, memslot);
790 spin_unlock(&kvm->mmu_lock);
791 mmap_read_unlock(current->mm);
792 srcu_read_unlock(&kvm->srcu, idx);
795 void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
797 struct kvm *kvm = mmu->kvm;
798 struct kvm_pgtable *pgt = NULL;
800 spin_lock(&kvm->mmu_lock);
801 pgt = mmu->pgt;
802 if (pgt) {
803 mmu->pgd = NULL;
804 mmu->pgd_phys = 0;
805 mmu->pgt = NULL;
806 free_percpu(mmu->last_vcpu_ran);
808 spin_unlock(&kvm->mmu_lock);
810 if (pgt) {
811 kvm_pgtable_stage2_destroy(pgt);
812 kfree(pgt);
816 static p4d_t *stage2_get_p4d(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
819 struct kvm *kvm = mmu->kvm;
823 pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
824 if (stage2_pgd_none(kvm, *pgd)) {
825 if (!cache)
826 return NULL;
827 p4d = kvm_mmu_memory_cache_alloc(cache);
828 stage2_pgd_populate(kvm, pgd, p4d);
829 get_page(virt_to_page(pgd));
832 return stage2_p4d_offset(kvm, pgd, addr);
835 static pud_t *stage2_get_pud(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
838 struct kvm *kvm = mmu->kvm;
842 p4d = stage2_get_p4d(mmu, cache, addr);
843 if (stage2_p4d_none(kvm, *p4d)) {
844 if (!cache)
845 return NULL;
846 pud = kvm_mmu_memory_cache_alloc(cache);
847 stage2_p4d_populate(kvm, p4d, pud);
848 get_page(virt_to_page(p4d));
851 return stage2_pud_offset(kvm, p4d, addr);
854 static pmd_t *stage2_get_pmd(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
857 struct kvm *kvm = mmu->kvm;
861 pud = stage2_get_pud(mmu, cache, addr);
862 if (!pud || stage2_pud_huge(kvm, *pud))
863 return NULL;
865 if (stage2_pud_none(kvm, *pud)) {
866 if (!cache)
867 return NULL;
868 pmd = kvm_mmu_memory_cache_alloc(cache);
869 stage2_pud_populate(kvm, pud, pmd);
870 get_page(virt_to_page(pud));
873 return stage2_pmd_offset(kvm, pud, addr);
876 static int stage2_set_pmd_huge(struct kvm_s2_mmu *mmu,
877 struct kvm_mmu_memory_cache *cache,
878 phys_addr_t addr, const pmd_t *new_pmd)
882 retry:
883 pmd = stage2_get_pmd(mmu, cache, addr);
884 VM_BUG_ON(!pmd);
885 old_pmd = *pmd;
888 * Multiple vcpus faulting on the same PMD entry can
889 * lead to them sequentially updating the PMD with the
890 * same value. Following the break-before-make
891 * (pmd_clear() followed by tlb_flush()) process can
892 * hinder forward progress due to refaults generated
893 * on missing translations.
895 * Skip updating the page table if the entry is
896 * unchanged.
898 if (pmd_val(old_pmd) == pmd_val(*new_pmd))
899 return 0;
901 if (pmd_present(old_pmd)) {
903 * If we already have PTE level mapping for this block,
904 * we must unmap it to avoid inconsistent TLB state and
905 * leaking the table page. We could end up in this situation
906 * if the memory slot was marked for dirty logging and was
907 * reverted, leaving PTE level mappings for the pages accessed
908 * during the period. So, unmap the PTE level mapping for this
909 * block and retry, as we could have released the upper level
910 * table in the process.
912 * Normal THP split/merge follows mmu_notifier callbacks and
913 * gets handled accordingly.
915 if (!pmd_thp_or_huge(old_pmd)) {
916 unmap_stage2_range(mmu, addr & S2_PMD_MASK, S2_PMD_SIZE);
917 goto retry;
920 * Mapping in huge pages should only happen through a
921 * fault. If a page is merged into a transparent huge
922 * page, the individual subpages of that huge page
923 * should be unmapped through MMU notifiers before we
924 * get here.
926 * Merging of CompoundPages is not supported; they
927 * should be split first, unmapped, merged,
928 * and mapped back in on-demand.
930 WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
931 pmd_clear(pmd);
932 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
933 } else {
934 get_page(virt_to_page(pmd));
937 kvm_set_pmd(pmd, *new_pmd);
938 return 0;
941 static int stage2_set_pud_huge(struct kvm_s2_mmu *mmu,
942 struct kvm_mmu_memory_cache *cache,
943 phys_addr_t addr, const pud_t *new_pudp)
945 struct kvm *kvm = mmu->kvm;
946 pud_t *pudp, old_pud;
948 retry:
949 pudp = stage2_get_pud(mmu, cache, addr);
950 VM_BUG_ON(!pudp);
952 old_pud = *pudp;
955 * A large number of vcpus faulting on the same stage 2 entry,
956 * can lead to a refault due to the stage2_pud_clear()/tlb_flush().
957 * Skip updating the page tables if there is no change.
959 if (pud_val(old_pud) == pud_val(*new_pudp))
960 return 0;
962 if (stage2_pud_present(kvm, old_pud)) {
964 * If we already have table level mapping for this block, unmap
965 * the range for this block and retry.
967 if (!stage2_pud_huge(kvm, old_pud)) {
968 unmap_stage2_range(mmu, addr & S2_PUD_MASK, S2_PUD_SIZE);
969 goto retry;
972 WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp));
973 stage2_pud_clear(kvm, pudp);
974 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
975 } else {
976 get_page(virt_to_page(pudp));
979 kvm_set_pud(pudp, *new_pudp);
980 return 0;
984 * stage2_get_leaf_entry - walk the stage2 VM page tables and return
985 * true if a valid and present leaf-entry is found. A pointer to the
986 * leaf-entry is returned in the appropriate level variable - pudpp,
989 static bool stage2_get_leaf_entry(struct kvm_s2_mmu *mmu, phys_addr_t addr,
990 pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp)
992 struct kvm *kvm = mmu->kvm;
997 *pudpp = NULL;
998 *pmdpp = NULL;
999 *ptepp = NULL;
1001 pudp = stage2_get_pud(mmu, NULL, addr);
1002 if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp))
1003 return false;
1005 if (stage2_pud_huge(kvm, *pudp)) {
1006 *pudpp = pudp;
1007 return true;
1008 }
1010 pmdp = stage2_pmd_offset(kvm, pudp, addr);
1011 if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp))
1012 return false;
1014 if (pmd_thp_or_huge(*pmdp)) {
1015 *pmdpp = pmdp;
1016 return true;
1017 }
1019 ptep = pte_offset_kernel(pmdp, addr);
1020 if (!ptep || pte_none(*ptep) || !pte_present(*ptep))
1021 return false;
1023 *ptepp = ptep;
1024 return true;
1027 static bool stage2_is_exec(struct kvm_s2_mmu *mmu, phys_addr_t addr, unsigned long sz)
1034 found = stage2_get_leaf_entry(mmu, addr, &pudp, &pmdp, &ptep);
1035 if (!found)
1036 return false;
1038 if (pudp)
1039 return sz <= PUD_SIZE && kvm_s2pud_exec(pudp);
1040 else if (pmdp)
1041 return sz <= PMD_SIZE && kvm_s2pmd_exec(pmdp);
1042 else
1043 return sz == PAGE_SIZE && kvm_s2pte_exec(ptep);
1046 static int stage2_set_pte(struct kvm_s2_mmu *mmu,
1047 struct kvm_mmu_memory_cache *cache,
1048 phys_addr_t addr, const pte_t *new_pte,
1049 unsigned long flags)
1051 struct kvm *kvm = mmu->kvm;
1054 pte_t *pte, old_pte;
1055 bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
1056 bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;
1058 VM_BUG_ON(logging_active && !cache);
1060 /* Create stage-2 page table mapping - Levels 0 and 1 */
1061 pud = stage2_get_pud(mmu, cache, addr);
1062 if (!pud) {
1064 * Ignore calls from kvm_set_spte_hva for unallocated
1065 * address ranges.
1067 return 0;
1068 }
1071 * While dirty page logging - dissolve huge PUD, then continue
1072 * on to allocate page.
1074 if (logging_active)
1075 stage2_dissolve_pud(mmu, addr, pud);
1077 if (stage2_pud_none(kvm, *pud)) {
1078 if (!cache)
1079 return 0; /* ignore calls from kvm_set_spte_hva */
1080 pmd = kvm_mmu_memory_cache_alloc(cache);
1081 stage2_pud_populate(kvm, pud, pmd);
1082 get_page(virt_to_page(pud));
1085 pmd = stage2_pmd_offset(kvm, pud, addr);
1086 if (!pmd) {
1088 * Ignore calls from kvm_set_spte_hva for unallocated
1089 * address ranges.
1091 return 0;
1092 }
1095 * While dirty page logging - dissolve huge PMD, then continue on to
1096 * allocate page.
1098 if (logging_active)
1099 stage2_dissolve_pmd(mmu, addr, pmd);
1101 /* Create stage-2 page mappings - Level 2 */
1102 if (pmd_none(*pmd)) {
1103 if (!cache)
1104 return 0; /* ignore calls from kvm_set_spte_hva */
1105 pte = kvm_mmu_memory_cache_alloc(cache);
1106 kvm_pmd_populate(pmd, pte);
1107 get_page(virt_to_page(pmd));
1110 pte = pte_offset_kernel(pmd, addr);
1112 if (iomap && pte_present(*pte))
1113 return -EFAULT;
1115 /* Create 2nd stage page table mapping - Level 3 */
1116 old_pte = *pte;
1117 if (pte_present(old_pte)) {
1118 /* Skip page table update if there is no change */
1119 if (pte_val(old_pte) == pte_val(*new_pte))
1120 return 0;
1122 kvm_set_pte(pte, __pte(0));
1123 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PTE_LEVEL);
1124 } else {
1125 get_page(virt_to_page(pte));
1128 kvm_set_pte(pte, *new_pte);
1129 return 0;
1132 #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
1133 static int stage2_ptep_test_and_clear_young(pte_t *pte)
1135 if (pte_young(*pte)) {
1136 *pte = pte_mkold(*pte);
1137 return 1;
1139 return 0;
1141 #else
1142 static int stage2_ptep_test_and_clear_young(pte_t *pte)
1144 return __ptep_test_and_clear_young(pte);
1146 #endif
1148 static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
1150 return stage2_ptep_test_and_clear_young((pte_t *)pmd);
1153 static int stage2_pudp_test_and_clear_young(pud_t *pud)
1155 return stage2_ptep_test_and_clear_young((pte_t *)pud);
1159 * kvm_phys_addr_ioremap - map a device range to guest IPA
1161 * @kvm: The KVM pointer
1162 * @guest_ipa: The IPA at which to insert the mapping
1163 * @pa: The physical address of the device
1164 * @size: The size of the mapping
1165 * @writable: Whether or not to create a writable mapping
1166 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
1167 phys_addr_t pa, unsigned long size, bool writable)
1171 struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, };
1172 struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
1173 enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
1174 KVM_PGTABLE_PROT_R |
1175 (writable ? KVM_PGTABLE_PROT_W : 0);
1177 size += offset_in_page(guest_ipa);
1178 guest_ipa &= PAGE_MASK;
1180 for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
1181 ret = kvm_mmu_topup_memory_cache(&cache,
1182 kvm_mmu_cache_min_pages(kvm));
1183 if (ret)
1184 break;
1186 spin_lock(&kvm->mmu_lock);
1187 ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
1188 &cache);
1189 spin_unlock(&kvm->mmu_lock);
1190 if (ret)
1191 break;
1193 pa += PAGE_SIZE;
1196 kvm_mmu_free_memory_cache(&cache);
1197 return ret;
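/*
 * Hypothetical usage sketch of kvm_phys_addr_ioremap() above: mapping
 * a two-page MMIO region whose PA starts mid-page, e.g.
 *
 *	kvm_phys_addr_ioremap(kvm, 0x10000800, 0x9000800, 0x1000, true);
 *
 * folds the 0x800 offset into the size, installs two PAGE_SIZE device
 * mappings, and tops up the local memory cache before each iteration
 * so the page-table walker never has to allocate under mmu_lock.
 */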
1201 * stage2_wp_ptes - write protect PMD range
1202 * @pmd: pointer to pmd entry
1203 * @addr: range start address
1204 * @end: range end address
1206 static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
1210 pte = pte_offset_kernel(pmd, addr);
1212 if (!pte_none(*pte)) {
1213 if (!kvm_s2pte_readonly(pte))
1214 kvm_set_s2pte_readonly(pte);
1216 } while (pte++, addr += PAGE_SIZE, addr != end);
1220 * stage2_wp_pmds - write protect PUD range
1221 * @mmu: the stage-2 MMU for the VM
1222 * @pud: pointer to pud entry
1223 * @addr: range start address
1224 * @end: range end address
1226 static void stage2_wp_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
1227 phys_addr_t addr, phys_addr_t end)
1229 struct kvm *kvm = mmu->kvm;
1233 pmd = stage2_pmd_offset(kvm, pud, addr);
1236 next = stage2_pmd_addr_end(kvm, addr, end);
1237 if (!pmd_none(*pmd)) {
1238 if (pmd_thp_or_huge(*pmd)) {
1239 if (!kvm_s2pmd_readonly(pmd))
1240 kvm_set_s2pmd_readonly(pmd);
1242 stage2_wp_ptes(pmd, addr, next);
1245 } while (pmd++, addr = next, addr != end);
1249 * stage2_wp_puds - write protect P4D range
1250 * @p4d: pointer to p4d entry
1251 * @addr: range start address
1252 * @end: range end address
1254 static void stage2_wp_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
1255 phys_addr_t addr, phys_addr_t end)
1257 struct kvm *kvm = mmu->kvm;
1261 pud = stage2_pud_offset(kvm, p4d, addr);
1263 next = stage2_pud_addr_end(kvm, addr, end);
1264 if (!stage2_pud_none(kvm, *pud)) {
1265 if (stage2_pud_huge(kvm, *pud)) {
1266 if (!kvm_s2pud_readonly(pud))
1267 kvm_set_s2pud_readonly(pud);
1269 stage2_wp_pmds(mmu, pud, addr, next);
1272 } while (pud++, addr = next, addr != end);
1276 * stage2_wp_p4ds - write protect PGD range
1277 * @pgd: pointer to pgd entry
1278 * @addr: range start address
1279 * @end: range end address
1281 static void stage2_wp_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
1282 phys_addr_t addr, phys_addr_t end)
1284 struct kvm *kvm = mmu->kvm;
1288 p4d = stage2_p4d_offset(kvm, pgd, addr);
1290 next = stage2_p4d_addr_end(kvm, addr, end);
1291 if (!stage2_p4d_none(kvm, *p4d))
1292 stage2_wp_puds(mmu, p4d, addr, next);
1293 } while (p4d++, addr = next, addr != end);
1297 * stage2_wp_range() - write protect stage2 memory region range
1298 * @mmu: The stage-2 MMU pointer
1299 * @addr: Start address of range
1300 * @end: End address of range
1302 static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
1304 struct kvm *kvm = mmu->kvm;
1308 pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
1311 * Release kvm_mmu_lock periodically if the memory region is
1312 * large. Otherwise, we may see kernel panics with
1313 * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR,
1314 * CONFIG_LOCKDEP. Additionally, holding the lock too long
1315 * will also starve other vCPUs. We also have to make sure
1316 * that the page tables are not freed while we release
1317 * the lock.
1319 cond_resched_lock(&kvm->mmu_lock);
1320 if (!READ_ONCE(mmu->pgd))
1321 break;
1322 next = stage2_pgd_addr_end(kvm, addr, end);
1323 if (stage2_pgd_present(kvm, *pgd))
1324 stage2_wp_p4ds(mmu, pgd, addr, next);
1325 } while (pgd++, addr = next, addr != end);
1329 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
1330 * @kvm: The KVM pointer
1331 * @slot: The memory slot to write protect
1333 * Called to start logging dirty pages after memory region
1334 * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns,
1335 * all present PUD, PMD and PTEs are write protected in the memory region.
1336 * Afterwards, reads of the dirty page log can be performed.
1338 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
1339 * serializing operations for VM memory regions.
1341 void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
1343 struct kvm_memslots *slots = kvm_memslots(kvm);
1344 struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
1345 phys_addr_t start, end;
1347 if (WARN_ON_ONCE(!memslot))
1348 return;
1350 start = memslot->base_gfn << PAGE_SHIFT;
1351 end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
1353 spin_lock(&kvm->mmu_lock);
1354 stage2_wp_range(&kvm->arch.mmu, start, end);
1355 spin_unlock(&kvm->mmu_lock);
1356 kvm_flush_remote_tlbs(kvm);
1360 * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
1361 * @kvm: The KVM pointer
1362 * @slot: The memory slot associated with mask
1363 * @gfn_offset: The gfn offset in memory slot
1364 * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory
1365 * slot to be write protected
1367 * Walks the bits set in @mask and write protects the associated PTEs.
1368 * Caller must acquire kvm_mmu_lock.
1370 static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1371 struct kvm_memory_slot *slot,
1372 gfn_t gfn_offset, unsigned long mask)
1374 phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
1375 phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
1376 phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
1378 stage2_wp_range(&kvm->arch.mmu, start, end);
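/*
 * Worked example: with gfn_offset == 0x400 and mask == 0b0110, only
 * bits 1 and 2 are set, so __ffs(mask) == 1 and __fls(mask) == 2, and
 * the call
 *
 *	kvm_mmu_write_protect_pt_masked(kvm, slot, 0x400, 0x6);
 *
 * write protects exactly the two pages backing gfns
 * slot->base_gfn + 0x401 and slot->base_gfn + 0x402.
 */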
1382 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
1383 * dirty pages.
1385 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
1386 * enable dirty logging for them.
1388 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1389 struct kvm_memory_slot *slot,
1390 gfn_t gfn_offset, unsigned long mask)
1392 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1395 static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
1397 __clean_dcache_guest_page(pfn, size);
1400 static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
1402 __invalidate_icache_guest_page(pfn, size);
1405 static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
1407 send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
1410 static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
1412 unsigned long map_size)
1415 hva_t uaddr_start, uaddr_end;
1418 /* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */
1419 if (map_size == PAGE_SIZE)
1420 return true;
1422 size = memslot->npages * PAGE_SIZE;
1424 gpa_start = memslot->base_gfn << PAGE_SHIFT;
1426 uaddr_start = memslot->userspace_addr;
1427 uaddr_end = uaddr_start + size;
1430 * Pages belonging to memslots that don't have the same alignment
1431 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
1432 * PMD/PUD entries, because we'll end up mapping the wrong pages.
1434 * Consider a layout like the following:
1436 * memslot->userspace_addr:
1437 * +-----+--------------------+--------------------+---+
1438 * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz|
1439 * +-----+--------------------+--------------------+---+
1441 * memslot->base_gfn << PAGE_SHIFT:
1442 * +---+--------------------+--------------------+-----+
1443 * |abc|def Stage-2 block | Stage-2 block |tvxyz|
1444 * +---+--------------------+--------------------+-----+
1446 * If we create those stage-2 blocks, we'll end up with this incorrect
1447 * mapping:
1448 *   d -> f
1449 *   e -> g
1450 *   f -> h
1452 if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
1453 return false;
1456 * Next, let's make sure we're not trying to map anything not covered
1457 * by the memslot. This means we have to prohibit block size mappings
1458 * for the beginning and end of a non-block aligned and non-block sized
1459 * memory slot (illustrated by the head and tail parts of the
1460 * userspace view above containing pages 'abcde' and 'xyz',
1461 * respectively).
1463 * Note that it doesn't matter if we do the check using the
1464 * userspace_addr or the base_gfn, as both are equally aligned (per
1465 * the check above) and equally sized.
1467 return (hva & ~(map_size - 1)) >= uaddr_start &&
1468 (hva & ~(map_size - 1)) + map_size <= uaddr_end;
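/*
 * Worked example, assuming 4K pages and map_size == PMD_SIZE (2M): a
 * memslot with userspace_addr == 0x40200000 (2M aligned) but
 * base_gfn << PAGE_SHIFT == 0x80100000 (only 1M aligned) fails the
 * offset check above, since any 2M stage-2 block would cover a
 * different 2M of userspace than of IPA space. When both sides share
 * the same offset within 2M, only the unaligned head and tail of the
 * slot are refused block mappings.
 */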
1472 * Check if the given hva is backed by a transparent huge page (THP) and
1473 * whether it can be mapped using block mapping in stage2. If so, adjust
1474 * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently
1475 * supported. This will need to be updated to support other THP sizes.
1477 * Returns the size of the mapping.
1479 static unsigned long
1480 transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
1481 unsigned long hva, kvm_pfn_t *pfnp,
1482 phys_addr_t *ipap)
1484 kvm_pfn_t pfn = *pfnp;
1487 * Make sure the adjustment is done only for THP pages. Also make
1488 * sure that the HVA and IPA are sufficiently aligned and that the
1489 * block map is contained within the memslot.
1491 if (kvm_is_transparent_hugepage(pfn) &&
1492 fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
1494 * The address we faulted on is backed by a transparent huge
1495 * page. However, because we map the compound huge page and
1496 * not the individual tail page, we need to transfer the
1497 * refcount to the head page. We have to be careful that the
1498 * THP doesn't start to split while we are adjusting the
1501 * We are sure this doesn't happen, because mmu_notifier_retry
1502 * was successful and we are holding the mmu_lock, so if this
1503 * THP is trying to split, it will be blocked in the mmu
1504 * notifier before touching any of the pages, specifically
1505 * before being able to call __split_huge_page_refcount().
1507 * We can therefore safely transfer the refcount from PG_tail
1508 * to PG_head and switch the pfn from a tail page to the head
1509 * page accordingly.
1511 *ipap &= PMD_MASK;
1512 kvm_release_pfn_clean(pfn);
1513 pfn &= ~(PTRS_PER_PMD - 1);
1514 kvm_get_pfn(pfn);
1515 *pfnp = pfn;
1517 return PMD_SIZE;
1520 /* Use page mapping if we cannot use block mapping. */
1521 return PAGE_SIZE;
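/*
 * Worked example, assuming 4K pages (PTRS_PER_PMD == 512): a fault on
 * pfn 0x1234 inside a THP yields 0x1234 & ~511 == 0x1200, the pfn of
 * the compound head page, and *ipap is rounded down with PMD_MASK to
 * match, so a single stage-2 PMD block can map the whole 2M region.
 */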
1524 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
1525 struct kvm_memory_slot *memslot, unsigned long hva,
1526 unsigned long fault_status)
1529 bool write_fault, writable, force_pte = false;
1530 bool exec_fault, needs_exec;
1531 unsigned long mmu_seq;
1532 gfn_t gfn = fault_ipa >> PAGE_SHIFT;
1533 struct kvm *kvm = vcpu->kvm;
1534 struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
1535 struct vm_area_struct *vma;
1538 pgprot_t mem_type = PAGE_S2;
1539 bool logging_active = memslot_is_logging(memslot);
1540 unsigned long vma_pagesize, flags = 0;
1541 struct kvm_s2_mmu *mmu = vcpu->arch.hw_mmu;
1543 write_fault = kvm_is_write_fault(vcpu);
1544 exec_fault = kvm_vcpu_trap_is_iabt(vcpu);
1545 VM_BUG_ON(write_fault && exec_fault);
1547 if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
1548 kvm_err("Unexpected L2 read permission error\n");
1549 return -EFAULT;
1552 /* Let's check if we will get back a huge page backed by hugetlbfs */
1553 mmap_read_lock(current->mm);
1554 vma = find_vma_intersection(current->mm, hva, hva + 1);
1555 if (unlikely(!vma)) {
1556 kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
1557 mmap_read_unlock(current->mm);
1558 return -EFAULT;
1561 if (is_vm_hugetlb_page(vma))
1562 vma_shift = huge_page_shift(hstate_vma(vma));
1563 else
1564 vma_shift = PAGE_SHIFT;
1566 vma_pagesize = 1ULL << vma_shift;
1567 if (logging_active ||
1568 (vma->vm_flags & VM_PFNMAP) ||
1569 !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
1570 force_pte = true;
1571 vma_pagesize = PAGE_SIZE;
1575 * The stage2 has a minimum of 2 level table (For arm64 see
1576 * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can
1577 * use PMD_SIZE huge mappings (even when the PMD is folded into PGD).
1578 * As for PUD huge maps, we must make sure that we have at least
1579 * 3 levels, i.e, PMD is not folded.
1581 if (vma_pagesize == PMD_SIZE ||
1582 (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm)))
1583 gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
1584 mmap_read_unlock(current->mm);
1586 /* We need minimum second+third level pages */
1587 ret = kvm_mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm));
1588 if (ret)
1589 return ret;
1591 mmu_seq = vcpu->kvm->mmu_notifier_seq;
1593 * Ensure the read of mmu_notifier_seq happens before we call
1594 * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
1595 * the page we just got a reference to gets unmapped before we have a
1596 * chance to grab the mmu_lock, which ensure that if the page gets
1597 * unmapped afterwards, the call to kvm_unmap_hva will take it away
1598 * from us again properly. This smp_rmb() interacts with the smp_wmb()
1599 * in kvm_mmu_notifier_invalidate_<page|range_end>.
1601 smp_rmb();
1603 pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
1604 if (pfn == KVM_PFN_ERR_HWPOISON) {
1605 kvm_send_hwpoison_signal(hva, vma_shift);
1606 return 0;
1608 if (is_error_noslot_pfn(pfn))
1609 return -EFAULT;
1611 if (kvm_is_device_pfn(pfn)) {
1612 mem_type = PAGE_S2_DEVICE;
1613 flags |= KVM_S2PTE_FLAG_IS_IOMAP;
1614 } else if (logging_active) {
1616 * Faults on pages in a memslot with logging enabled
1617 * should not be mapped with huge pages (it introduces churn
1618 * and performance degradation), so force a pte mapping.
1620 flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
1623 * Only actually map the page as writable if this was a write
1624 * fault.
1626 writable = write_fault;
1627 }
1630 if (exec_fault && is_iomap(flags))
1631 return -ENOEXEC;
1633 spin_lock(&kvm->mmu_lock);
1634 if (mmu_notifier_retry(kvm, mmu_seq))
1635 goto out_unlock;
1638 * If we are not forced to use page mapping, check if we are
1639 * backed by a THP and thus use block mapping if possible.
1641 if (vma_pagesize == PAGE_SIZE && !force_pte)
1642 vma_pagesize = transparent_hugepage_adjust(memslot, hva,
1643 &pfn, &fault_ipa);
1644 if (writable)
1645 kvm_set_pfn_dirty(pfn);
1647 if (fault_status != FSC_PERM && !is_iomap(flags))
1648 clean_dcache_guest_page(pfn, vma_pagesize);
1650 if (exec_fault)
1651 invalidate_icache_guest_page(pfn, vma_pagesize);
1654 * If we took an execution fault we have made the
1655 * icache/dcache coherent above and should now let the s2
1656 * mapping be executable.
1658 * Write faults (!exec_fault && FSC_PERM) are orthogonal to
1659 * execute permissions, and we preserve whatever we have.
1661 needs_exec = exec_fault ||
1662 (fault_status == FSC_PERM &&
1663 stage2_is_exec(mmu, fault_ipa, vma_pagesize));
1665 if (vma_pagesize == PUD_SIZE) {
1666 pud_t new_pud = kvm_pfn_pud(pfn, mem_type);
1668 new_pud = kvm_pud_mkhuge(new_pud);
1669 if (writable)
1670 new_pud = kvm_s2pud_mkwrite(new_pud);
1672 if (needs_exec)
1673 new_pud = kvm_s2pud_mkexec(new_pud);
1675 ret = stage2_set_pud_huge(mmu, memcache, fault_ipa, &new_pud);
1676 } else if (vma_pagesize == PMD_SIZE) {
1677 pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type);
1679 new_pmd = kvm_pmd_mkhuge(new_pmd);
1681 if (writable)
1682 new_pmd = kvm_s2pmd_mkwrite(new_pmd);
1684 if (needs_exec)
1685 new_pmd = kvm_s2pmd_mkexec(new_pmd);
1687 ret = stage2_set_pmd_huge(mmu, memcache, fault_ipa, &new_pmd);
1689 pte_t new_pte = kvm_pfn_pte(pfn, mem_type);
1691 if (writable) {
1692 new_pte = kvm_s2pte_mkwrite(new_pte);
1693 mark_page_dirty(kvm, gfn);
1694 }
1696 if (needs_exec)
1697 new_pte = kvm_s2pte_mkexec(new_pte);
1699 ret = stage2_set_pte(mmu, memcache, fault_ipa, &new_pte, flags);
1702 out_unlock:
1703 spin_unlock(&kvm->mmu_lock);
1704 kvm_set_pfn_accessed(pfn);
1705 kvm_release_pfn_clean(pfn);
1707 return ret;
1710 * Resolve the access fault by making the page young again.
1711 * Note that because the faulting entry is guaranteed not to be
1712 * cached in the TLB, we don't need to invalidate anything.
1713 * Only the HW Access Flag updates are supported for Stage 2 (no DBM),
1714 * so there is no need for atomic (pte|pmd)_mkyoung operations.
1716 static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
1722 bool pfn_valid = false;
1724 trace_kvm_access_fault(fault_ipa);
1726 spin_lock(&vcpu->kvm->mmu_lock);
1728 if (!stage2_get_leaf_entry(vcpu->arch.hw_mmu, fault_ipa, &pud, &pmd, &pte))
1729 goto out;
1731 if (pud) { /* HugeTLB */
1732 *pud = kvm_s2pud_mkyoung(*pud);
1733 pfn = kvm_pud_pfn(*pud);
1734 pfn_valid = true;
1735 } else if (pmd) { /* THP, HugeTLB */
1736 *pmd = pmd_mkyoung(*pmd);
1737 pfn = pmd_pfn(*pmd);
1738 pfn_valid = true;
1739 } else {
1740 *pte = pte_mkyoung(*pte); /* Just a page... */
1741 pfn = pte_pfn(*pte);
1742 pfn_valid = true;
1745 out:
1746 spin_unlock(&vcpu->kvm->mmu_lock);
1747 if (pfn_valid)
1748 kvm_set_pfn_accessed(pfn);
1752 * kvm_handle_guest_abort - handles all 2nd stage aborts
1753 * @vcpu: the VCPU pointer
1755 * Any abort that gets to the host is almost guaranteed to be caused by a
1756 * missing second stage translation table entry, which can mean that either the
1757 * guest simply needs more memory and we must allocate an appropriate page, or
1758 * that the guest tried to access I/O memory, which is emulated by user
1759 * space. The distinction is based on the IPA causing the fault and whether this
1760 * memory region has been registered as standard RAM by user space.
1762 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
1764 unsigned long fault_status;
1765 phys_addr_t fault_ipa;
1766 struct kvm_memory_slot *memslot;
1768 bool is_iabt, write_fault, writable;
1772 fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
1774 fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
1775 is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
1777 /* Synchronous External Abort? */
1778 if (kvm_vcpu_abt_issea(vcpu)) {
1780 * For RAS the host kernel may handle this abort.
1781 * There is no need to pass the error into the guest.
1783 if (kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_esr(vcpu)))
1784 kvm_inject_vabt(vcpu);
1786 return 1;
1789 trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
1790 kvm_vcpu_get_hfar(vcpu), fault_ipa);
1792 /* Check the stage-2 fault is trans. fault or write fault */
1793 if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
1794 fault_status != FSC_ACCESS) {
1795 kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
1796 kvm_vcpu_trap_get_class(vcpu),
1797 (unsigned long)kvm_vcpu_trap_get_fault(vcpu),
1798 (unsigned long)kvm_vcpu_get_esr(vcpu));
1799 return -EFAULT;
1802 idx = srcu_read_lock(&vcpu->kvm->srcu);
1804 gfn = fault_ipa >> PAGE_SHIFT;
1805 memslot = gfn_to_memslot(vcpu->kvm, gfn);
1806 hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
1807 write_fault = kvm_is_write_fault(vcpu);
1808 if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
1810 * The guest has put either its instructions or its page-tables
1811 * somewhere it shouldn't have. Userspace won't be able to do
1812 * anything about this (there's no syndrome for a start), so
1813 * re-inject the abort back into the guest.
1815 if (is_iabt) {
1816 ret = -ENOEXEC;
1817 goto out;
1818 }
1820 if (kvm_vcpu_dabt_iss1tw(vcpu)) {
1821 kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
1822 ret = 1;
1823 goto out_unlock;
1827 * Check for a cache maintenance operation. Since we
1828 * ended-up here, we know it is outside of any memory
1829 * slot. But we can't find out if that is for a device,
1830 * or if the guest is just being stupid. The only thing
1831 * we know for sure is that this range cannot be cached.
1833 * So let's assume that the guest is just being
1834 * cautious, and skip the instruction.
1836 if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) {
1837 kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
1838 ret = 1;
1839 goto out_unlock;
1843 * The IPA is reported as [MAX:12], so we need to
1844 * complement it with the bottom 12 bits from the
1845 * faulting VA. This is always 12 bits, irrespective
1846 * of the page size.
1848 fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
1849 ret = io_mem_abort(vcpu, fault_ipa);
1850 goto out_unlock;
1853 /* Userspace should not be able to register out-of-bounds IPAs */
1854 VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm));
1856 if (fault_status == FSC_ACCESS) {
1857 handle_access_fault(vcpu, fault_ipa);
1858 ret = 1;
1859 goto out_unlock;
1862 ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
1863 if (ret == 0)
1864 ret = 1;
1865 out:
1866 if (ret == -ENOEXEC) {
1867 kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
1868 ret = 1;
1869 }
1870 out_unlock:
1871 srcu_read_unlock(&vcpu->kvm->srcu, idx);
1872 return ret;
1875 static int handle_hva_to_gpa(struct kvm *kvm,
1876 unsigned long start,
1877 unsigned long end,
1878 int (*handler)(struct kvm *kvm,
1879 gpa_t gpa, u64 size,
1880 void *data),
1881 void *data)
1883 struct kvm_memslots *slots;
1884 struct kvm_memory_slot *memslot;
1887 slots = kvm_memslots(kvm);
1889 /* we only care about the pages that the guest sees */
1890 kvm_for_each_memslot(memslot, slots) {
1891 unsigned long hva_start, hva_end;
1894 hva_start = max(start, memslot->userspace_addr);
1895 hva_end = min(end, memslot->userspace_addr +
1896 (memslot->npages << PAGE_SHIFT));
1897 if (hva_start >= hva_end)
1898 continue;
1900 gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT;
1901 ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data);
1904 return ret;
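/*
 * Worked example: for a memslot with userspace_addr 0x7f0000000000,
 * base_gfn 0x80000 and npages 16, a notifier range starting at hva
 * 0x7f0000002000 maps to gfn 0x80002, so the handler above is invoked
 * on gpa 0x80002000 for the length of the overlap with the slot.
 */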
1907 static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
1909 unsigned flags = *(unsigned *)data;
1910 bool may_block = flags & MMU_NOTIFIER_RANGE_BLOCKABLE;
1912 __unmap_stage2_range(&kvm->arch.mmu, gpa, size, may_block);
1913 return 0;
1916 int kvm_unmap_hva_range(struct kvm *kvm,
1917 unsigned long start, unsigned long end, unsigned flags)
1919 if (!kvm->arch.mmu.pgd)
1920 return 0;
1922 trace_kvm_unmap_hva_range(start, end);
1923 handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, &flags);
1924 return 0;
1927 static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
1929 kvm_pfn_t *pfn = (kvm_pfn_t *)data;
1931 WARN_ON(size != PAGE_SIZE);
1934 * The MMU notifiers will have unmapped a huge PMD before calling
1935 * ->change_pte() (which in turn calls kvm_set_spte_hva()) and
1936 * therefore we never need to clear out a huge PMD through this
1937 * calling path and a memcache is not required.
1939 kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, gpa, PAGE_SIZE,
1940 __pfn_to_phys(*pfn), KVM_PGTABLE_PROT_R, NULL);
1942 return 0;
1944 int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
1946 unsigned long end = hva + PAGE_SIZE;
1947 kvm_pfn_t pfn = pte_pfn(pte);
1949 if (!kvm->arch.mmu.pgt)
1950 return 0;
1952 trace_kvm_set_spte_hva(hva);
1955 * We've moved a page around, probably through CoW, so let's treat it
1956 * just like a translation fault and clean the cache to the PoC.
1958 clean_dcache_guest_page(pfn, PAGE_SIZE);
1959 handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pfn);
1961 return 0;
1963 static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
1969 WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
1970 if (!stage2_get_leaf_entry(&kvm->arch.mmu, gpa, &pud, &pmd, &pte))
1971 return 0;
1973 if (pud)
1974 return stage2_pudp_test_and_clear_young(pud);
1975 else if (pmd)
1976 return stage2_pmdp_test_and_clear_young(pmd);
1977 else
1978 return stage2_ptep_test_and_clear_young(pte);
1981 static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
1987 WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
1988 if (!stage2_get_leaf_entry(&kvm->arch.mmu, gpa, &pud, &pmd, &pte))
1989 return 0;
1991 if (pud)
1992 return kvm_s2pud_young(*pud);
1993 else if (pmd)
1994 return pmd_young(*pmd);
1995 else
1996 return pte_young(*pte);
1999 int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
2001 if (!kvm->arch.mmu.pgd)
2002 return 0;
2003 trace_kvm_age_hva(start, end);
2004 return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
2007 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
2009 if (!kvm->arch.mmu.pgd)
2010 return 0;
2011 trace_kvm_test_age_hva(hva);
2012 return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE,
2013 kvm_test_age_hva_handler, NULL);
2016 phys_addr_t kvm_mmu_get_httbr(void)
2018 return __pa(hyp_pgtable->pgd);
2021 phys_addr_t kvm_get_idmap_vector(void)
2023 return hyp_idmap_vector;
2026 static int kvm_map_idmap_text(void)
2028 unsigned long size = hyp_idmap_end - hyp_idmap_start;
2029 int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start,
2030 PAGE_HYP_EXEC);
2031 if (err)
2032 kvm_err("Failed to idmap %lx-%lx\n",
2033 hyp_idmap_start, hyp_idmap_end);
2035 return err;
2038 int kvm_mmu_init(void)
2043 hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
2044 hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
2045 hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end);
2046 hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
2047 hyp_idmap_vector = __pa_symbol(__kvm_hyp_init);
2050 * We rely on the linker script to ensure at build time that the HYP
2051 * init code does not cross a page boundary.
2053 BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
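/*
 * A worked example of the check above, assuming 4K pages: with
 * hyp_idmap_start = 0x40001800 and hyp_idmap_end = 0x40002100,
 * 0x40001800 ^ 0x400020ff = 0x38ff, which has bits set inside
 * PAGE_MASK, so the first and last byte of the idmap text sit in
 * different pages and the BUG_ON() fires. Addresses within a single
 * page only ever differ in bits below PAGE_SHIFT.
 */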
2055 hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
2056 kvm_debug("Using %u-bit virtual addresses at EL2\n", hyp_va_bits);
2057 kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
2058 kvm_debug("HYP VA range: %lx:%lx\n",
2059 kern_hyp_va(PAGE_OFFSET),
2060 kern_hyp_va((unsigned long)high_memory - 1));
2062 if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
2063 hyp_idmap_start < kern_hyp_va((unsigned long)high_memory - 1) &&
2064 hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
2066 * The idmap page is intersecting with the VA space,
2067 * it is not safe to continue further.
2069 kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
2070 err = -EINVAL;
2071 goto out;
2074 hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL);
2075 if (!hyp_pgtable) {
2076 kvm_err("Hyp mode page-table not allocated\n");
2077 err = -ENOMEM;
2078 goto out;
2081 err = kvm_pgtable_hyp_init(hyp_pgtable, hyp_va_bits);
2083 goto out_free_pgtable;
2085 err = kvm_map_idmap_text();
2087 goto out_destroy_pgtable;
2089 io_map_base = hyp_idmap_start;
2090 return 0;
2092 out_destroy_pgtable:
2093 kvm_pgtable_hyp_destroy(hyp_pgtable);
2094 out_free_pgtable:
2095 kfree(hyp_pgtable);
2096 hyp_pgtable = NULL;
2097 out:
2098 return err;
2101 void kvm_arch_commit_memory_region(struct kvm *kvm,
2102 const struct kvm_userspace_memory_region *mem,
2103 struct kvm_memory_slot *old,
2104 const struct kvm_memory_slot *new,
2105 enum kvm_mr_change change)
2108 * At this point memslot has been committed and there is an
2109 * allocated dirty_bitmap[], dirty pages will be tracked while the
2110 * memory slot is write protected.
2112 if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
2114 * If we're with initial-all-set, we don't need to write
2115 * protect any pages because they're all reported as dirty.
2116 * Huge pages and normal pages will be write protect gradually.
2118 if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) {
2119 kvm_mmu_wp_memory_region(kvm, mem->slot);
2124 int kvm_arch_prepare_memory_region(struct kvm *kvm,
2125 struct kvm_memory_slot *memslot,
2126 const struct kvm_userspace_memory_region *mem,
2127 enum kvm_mr_change change)
2129 hva_t hva = mem->userspace_addr;
2130 hva_t reg_end = hva + mem->memory_size;
2131 bool writable = !(mem->flags & KVM_MEM_READONLY);
2134 if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
2135 change != KVM_MR_FLAGS_ONLY)
2136 return 0;
2139 * Prevent userspace from creating a memory region outside of the IPA
2140 * space addressable by the KVM guest.
2142 if (memslot->base_gfn + memslot->npages >=
2143 (kvm_phys_size(kvm) >> PAGE_SHIFT))
2144 return -EFAULT;
2146 mmap_read_lock(current->mm);
2148 * A memory region could potentially cover multiple VMAs, and any holes
2149 * between them, so iterate over all of them to find out if we can map
2150 * any of them right now.
2152 * +--------------------------------------------+
2153 * +---------------+----------------+ +----------------+
2154 * | : VMA 1 | VMA 2 | | VMA 3 : |
2155 * +---------------+----------------+ +----------------+
2156 *     |               memory region                |
2157 *     +--------------------------------------------+
2160 struct vm_area_struct *vma = find_vma(current->mm, hva);
2161 hva_t vm_start, vm_end;
2163 if (!vma || vma->vm_start >= reg_end)
2164 break;
2167 * Take the intersection of this VMA with the memory region
2169 vm_start = max(hva, vma->vm_start);
2170 vm_end = min(reg_end, vma->vm_end);
2172 if (vma->vm_flags & VM_PFNMAP) {
2173 gpa_t gpa = mem->guest_phys_addr +
2174 (vm_start - mem->userspace_addr);
2177 pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
2178 pa += vm_start - vma->vm_start;
2180 /* IO region dirty page logging not allowed */
2181 if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) {
2182 ret = -EINVAL;
2183 goto out;
2184 }
2186 ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
2187 vm_end - vm_start,
2188 writable);
2189 if (ret)
2190 break;
2192 hva = vm_end;
2193 } while (hva < reg_end);
2195 if (change == KVM_MR_FLAGS_ONLY)
2196 goto out;
2198 spin_lock(&kvm->mmu_lock);
2199 if (ret)
2200 unmap_stage2_range(&kvm->arch.mmu, mem->guest_phys_addr, mem->memory_size);
2201 else
2202 stage2_flush_memslot(kvm, memslot);
2203 spin_unlock(&kvm->mmu_lock);
2204 out:
2205 mmap_read_unlock(current->mm);
2206 return ret;
2209 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
2213 void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
2217 void kvm_arch_flush_shadow_all(struct kvm *kvm)
2219 kvm_free_stage2_pgd(&kvm->arch.mmu);
2222 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
2223 struct kvm_memory_slot *slot)
2225 gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
2226 phys_addr_t size = slot->npages << PAGE_SHIFT;
2228 spin_lock(&kvm->mmu_lock);
2229 unmap_stage2_range(&kvm->arch.mmu, gpa, size);
2230 spin_unlock(&kvm->mmu_lock);
2234 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
2236 * Main problems:
2237 * - S/W ops are local to a CPU (not broadcast)
2238 * - We have line migration behind our back (speculation)
2239 * - System caches don't support S/W at all (damn!)
2241 * In the face of the above, the best we can do is to try and convert
2242 * S/W ops to VA ops. Because the guest is not allowed to infer the
2243 * S/W to PA mapping, it can only use S/W to nuke the whole cache,
2244 * which is a rather good thing for us.
2246 * Also, it is only used when turning caches on/off ("The expected
2247 * usage of the cache maintenance instructions that operate by set/way
2248 * is associated with the cache maintenance instructions associated
2249 * with the powerdown and powerup of caches, if this is required by
2250 * the implementation.").
2252 * We use the following policy:
2254 * - If we trap a S/W operation, we enable VM trapping to detect
2255 * caches being turned on/off, and do a full clean.
2257 * - We flush the caches on both caches being turned on and off.
2259 * - Once the caches are enabled, we stop trapping VM ops.
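/*
 * Putting the policy together, a typical guest sequence looks like:
 * a first DC CISW traps into kvm_set_way_flush(), which flushes the
 * whole VM and sets HCR_TVM; the guest then turns its MMU/caches off,
 * and the trapped SCTLR write reaches kvm_toggle_cache(), which
 * flushes again; once the guest re-enables its caches, HCR_TVM is
 * cleared and VM register traps stop until the next S/W operation.
 */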
2261 void kvm_set_way_flush(struct kvm_vcpu *vcpu)
2263 unsigned long hcr = *vcpu_hcr(vcpu);
2266 * If this is the first time we do a S/W operation
2267 * (i.e. HCR_TVM not set) flush the whole memory, and set the
2268 * VM trapping.
2270 * Otherwise, rely on the VM trapping to wait for the MMU +
2271 * Caches to be turned off. At that point, we'll be able to
2272 * clean the caches again.
2274 if (!(hcr & HCR_TVM)) {
2275 trace_kvm_set_way_flush(*vcpu_pc(vcpu),
2276 vcpu_has_cache_enabled(vcpu));
2277 stage2_flush_vm(vcpu->kvm);
2278 *vcpu_hcr(vcpu) = hcr | HCR_TVM;
2282 void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
2284 bool now_enabled = vcpu_has_cache_enabled(vcpu);
2287 * If switching the MMU+caches on, need to invalidate the caches.
2288 * If switching it off, need to clean the caches.
2289 * Clean + invalidate does the trick always.
2291 if (now_enabled != was_enabled)
2292 stage2_flush_vm(vcpu->kvm);
2294 /* Caches are now on, stop trapping VM ops (until a S/W op) */
2295 if (now_enabled)
2296 *vcpu_hcr(vcpu) &= ~HCR_TVM;
2298 trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);