arch/arm64/kvm/mmu.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Copyright (C) 2012 - Virtual Open Systems and Columbia University
   4  * Author: Christoffer Dall <c.dall@virtualopensystems.com>
   5  */
   6
   7 #include <linux/mman.h>
   8 #include <linux/kvm_host.h>
   9 #include <linux/io.h>
  10 #include <linux/hugetlb.h>
  11 #include <linux/sched/signal.h>
  12 #include <trace/events/kvm.h>
  13 #include <asm/pgalloc.h>
  14 #include <asm/cacheflush.h>
  15 #include <asm/kvm_arm.h>
  16 #include <asm/kvm_mmu.h>
  17 #include <asm/kvm_pgtable.h>
  18 #include <asm/kvm_ras.h>
  19 #include <asm/kvm_asm.h>
  20 #include <asm/kvm_emulate.h>
  21 #include <asm/virt.h>
  22
  23 #include "trace.h"
  24
  25 static struct kvm_pgtable *hyp_pgtable;
  26 static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
  27
  28 static unsigned long hyp_idmap_start;
  29 static unsigned long hyp_idmap_end;
  30 static phys_addr_t hyp_idmap_vector;
  31
  32 static unsigned long io_map_base;
  33
  34 #define KVM_S2PTE_FLAG_IS_IOMAP         (1UL << 0)
  35 #define KVM_S2_FLAG_LOGGING_ACTIVE      (1UL << 1)
  36
  37 static bool is_iomap(unsigned long flags)
  38 {
  39         return flags & KVM_S2PTE_FLAG_IS_IOMAP;
  40 }
  41
  42 /*
  43  * Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
  44  * we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
  45  * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too
  46  * long will also starve other vCPUs. We have to also make sure that the page
  47  * tables are not freed while we released the lock.
  48  */
  49 static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr,
  50                               phys_addr_t end,
  51                               int (*fn)(struct kvm_pgtable *, u64, u64),
  52                               bool resched)
  53 {
  54         int ret;
  55         u64 next;
  56
  57         do {
  58                 struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
  59                 if (!pgt)
  60                         return -EINVAL;
  61
  62                 next = stage2_pgd_addr_end(kvm, addr, end);
  63                 ret = fn(pgt, addr, next - addr);
  64                 if (ret)
  65                         break;
  66
  67                 if (resched && next != end)
  68                         cond_resched_lock(&kvm->mmu_lock);
  69         } while (addr = next, addr != end);
  70
  71         return ret;
  72 }
  73
  74 #define stage2_apply_range_resched(kvm, addr, end, fn)                  \
  75         stage2_apply_range(kvm, addr, end, fn, true)
  76
  77 static bool memslot_is_logging(struct kvm_memory_slot *memslot)
  78 {
  79         return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
  80 }
  81
  82 /**
  83  * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
  84  * @kvm:        pointer to kvm structure.
  85  *
  86  * Interface to HYP function to flush all VM TLB entries
  87  */
  88 void kvm_flush_remote_tlbs(struct kvm *kvm)
  89 {
  90         kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
  91 }
  92
  93 static void kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa,
  94                                    int level)
  95 {
  96         kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ipa, level);
  97 }
  98
  99 /*
 100  * D-Cache management functions. They take the page table entries by
 101  * value, as they are flushing the cache using the kernel mapping (or
 102  * kmap on 32bit).
 103  */
 104 static void kvm_flush_dcache_pte(pte_t pte)
 105 {
 106         __kvm_flush_dcache_pte(pte);
 107 }
 108
 109 static void kvm_flush_dcache_pmd(pmd_t pmd)
 110 {
 111         __kvm_flush_dcache_pmd(pmd);
 112 }
 113
 114 static void kvm_flush_dcache_pud(pud_t pud)
 115 {
 116         __kvm_flush_dcache_pud(pud);
 117 }
 118
 119 static bool kvm_is_device_pfn(unsigned long pfn)
 120 {
 121         return !pfn_valid(pfn);
 122 }
 123
 124 /**
 125  * stage2_dissolve_pmd() - clear and flush huge PMD entry
 126  * @mmu:        pointer to mmu structure to operate on
 127  * @addr:       IPA
 128  * @pmd:        pmd pointer for IPA
 129  *
 130  * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs.
 131  */
 132 static void stage2_dissolve_pmd(struct kvm_s2_mmu *mmu, phys_addr_t addr, pmd_t *pmd)
 133 {
 134         if (!pmd_thp_or_huge(*pmd))
 135                 return;
 136
 137         pmd_clear(pmd);
 138         kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
 139         put_page(virt_to_page(pmd));
 140 }
 141
 142 /**
 143  * stage2_dissolve_pud() - clear and flush huge PUD entry
 144  * @mmu:        pointer to mmu structure to operate on
 145  * @addr:       IPA
 146  * @pud:        pud pointer for IPA
 147  *
 148  * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs.
 149  */
 150 static void stage2_dissolve_pud(struct kvm_s2_mmu *mmu, phys_addr_t addr, pud_t *pudp)
 151 {
 152         struct kvm *kvm = mmu->kvm;
 153
 154         if (!stage2_pud_huge(kvm, *pudp))
 155                 return;
 156
 157         stage2_pud_clear(kvm, pudp);
 158         kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
 159         put_page(virt_to_page(pudp));
 160 }
 161
 162 static void clear_stage2_pgd_entry(struct kvm_s2_mmu *mmu, pgd_t *pgd, phys_addr_t addr)
 163 {
 164         struct kvm *kvm = mmu->kvm;
 165         p4d_t *p4d_table __maybe_unused = stage2_p4d_offset(kvm, pgd, 0UL);
 166         stage2_pgd_clear(kvm, pgd);
 167         kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
 168         stage2_p4d_free(kvm, p4d_table);
 169         put_page(virt_to_page(pgd));
 170 }
 171
 172 static void clear_stage2_p4d_entry(struct kvm_s2_mmu *mmu, p4d_t *p4d, phys_addr_t addr)
 173 {
 174         struct kvm *kvm = mmu->kvm;
 175         pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, p4d, 0);
 176         stage2_p4d_clear(kvm, p4d);
 177         kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
 178         stage2_pud_free(kvm, pud_table);
 179         put_page(virt_to_page(p4d));
 180 }
 181
 182 static void clear_stage2_pud_entry(struct kvm_s2_mmu *mmu, pud_t *pud, phys_addr_t addr)
 183 {
 184         struct kvm *kvm = mmu->kvm;
 185         pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0);
 186
 187         VM_BUG_ON(stage2_pud_huge(kvm, *pud));
 188         stage2_pud_clear(kvm, pud);
 189         kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
 190         stage2_pmd_free(kvm, pmd_table);
 191         put_page(virt_to_page(pud));
 192 }
 193
 194 static void clear_stage2_pmd_entry(struct kvm_s2_mmu *mmu, pmd_t *pmd, phys_addr_t addr)
 195 {
 196         pte_t *pte_table = pte_offset_kernel(pmd, 0);
 197         VM_BUG_ON(pmd_thp_or_huge(*pmd));
 198         pmd_clear(pmd);
 199         kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
 200         free_page((unsigned long)pte_table);
 201         put_page(virt_to_page(pmd));
 202 }
 203
 204 static inline void kvm_set_pte(pte_t *ptep, pte_t new_pte)
 205 {
 206         WRITE_ONCE(*ptep, new_pte);
 207         dsb(ishst);
 208 }
 209
 210 static inline void kvm_set_pmd(pmd_t *pmdp, pmd_t new_pmd)
 211 {
 212         WRITE_ONCE(*pmdp, new_pmd);
 213         dsb(ishst);
 214 }
 215
 216 static inline void kvm_pmd_populate(pmd_t *pmdp, pte_t *ptep)
 217 {
 218         kvm_set_pmd(pmdp, kvm_mk_pmd(ptep));
 219 }
 220
 221 static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp)
 222 {
 223         WRITE_ONCE(*pudp, kvm_mk_pud(pmdp));
 224         dsb(ishst);
 225 }
 226
 227 static inline void kvm_p4d_populate(p4d_t *p4dp, pud_t *pudp)
 228 {
 229         WRITE_ONCE(*p4dp, kvm_mk_p4d(pudp));
 230         dsb(ishst);
 231 }
 232
 233 static inline void kvm_pgd_populate(pgd_t *pgdp, p4d_t *p4dp)
 234 {
 235 #ifndef __PAGETABLE_P4D_FOLDED
 236         WRITE_ONCE(*pgdp, kvm_mk_pgd(p4dp));
 237         dsb(ishst);
 238 #endif
 239 }
 240
 241 /*
 242  * Unmapping vs dcache management:
 243  *
 244  * If a guest maps certain memory pages as uncached, all writes will
 245  * bypass the data cache and go directly to RAM.  However, the CPUs
 246  * can still speculate reads (not writes) and fill cache lines with
 247  * data.
 248  *
 249  * Those cache lines will be *clean* cache lines though, so a
 250  * clean+invalidate operation is equivalent to an invalidate
 251  * operation, because no cache lines are marked dirty.
 252  *
 253  * Those clean cache lines could be filled prior to an uncached write
 254  * by the guest, and the cache coherent IO subsystem would therefore
 255  * end up writing old data to disk.
 256  *
 257  * This is why right after unmapping a page/section and invalidating
 258  * the corresponding TLBs, we flush to make sure the IO subsystem will
 259  * never hit in the cache.
 260  *
 261  * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
 262  * we then fully enforce cacheability of RAM, no matter what the guest
 263  * does.
 264  */
 265 static void unmap_stage2_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd,
 266                        phys_addr_t addr, phys_addr_t end)
 267 {
 268         phys_addr_t start_addr = addr;
 269         pte_t *pte, *start_pte;
 270
 271         start_pte = pte = pte_offset_kernel(pmd, addr);
 272         do {
 273                 if (!pte_none(*pte)) {
 274                         pte_t old_pte = *pte;
 275
 276                         kvm_set_pte(pte, __pte(0));
 277                         kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PTE_LEVEL);
 278
 279                         /* No need to invalidate the cache for device mappings */
 280                         if (!kvm_is_device_pfn(pte_pfn(old_pte)))
 281                                 kvm_flush_dcache_pte(old_pte);
 282
 283                         put_page(virt_to_page(pte));
 284                 }
 285         } while (pte++, addr += PAGE_SIZE, addr != end);
 286
 287         if (stage2_pte_table_empty(mmu->kvm, start_pte))
 288                 clear_stage2_pmd_entry(mmu, pmd, start_addr);
 289 }
 290
 291 static void unmap_stage2_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
 292                        phys_addr_t addr, phys_addr_t end)
 293 {
 294         struct kvm *kvm = mmu->kvm;
 295         phys_addr_t next, start_addr = addr;
 296         pmd_t *pmd, *start_pmd;
 297
 298         start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr);
 299         do {
 300                 next = stage2_pmd_addr_end(kvm, addr, end);
 301                 if (!pmd_none(*pmd)) {
 302                         if (pmd_thp_or_huge(*pmd)) {
 303                                 pmd_t old_pmd = *pmd;
 304
 305                                 pmd_clear(pmd);
 306                                 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
 307
 308                                 kvm_flush_dcache_pmd(old_pmd);
 309
 310                                 put_page(virt_to_page(pmd));
 311                         } else {
 312                                 unmap_stage2_ptes(mmu, pmd, addr, next);
 313                         }
 314                 }
 315         } while (pmd++, addr = next, addr != end);
 316
 317         if (stage2_pmd_table_empty(kvm, start_pmd))
 318                 clear_stage2_pud_entry(mmu, pud, start_addr);
 319 }
 320
 321 static void unmap_stage2_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
 322                        phys_addr_t addr, phys_addr_t end)
 323 {
 324         struct kvm *kvm = mmu->kvm;
 325         phys_addr_t next, start_addr = addr;
 326         pud_t *pud, *start_pud;
 327
 328         start_pud = pud = stage2_pud_offset(kvm, p4d, addr);
 329         do {
 330                 next = stage2_pud_addr_end(kvm, addr, end);
 331                 if (!stage2_pud_none(kvm, *pud)) {
 332                         if (stage2_pud_huge(kvm, *pud)) {
 333                                 pud_t old_pud = *pud;
 334
 335                                 stage2_pud_clear(kvm, pud);
 336                                 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
 337                                 kvm_flush_dcache_pud(old_pud);
 338                                 put_page(virt_to_page(pud));
 339                         } else {
 340                                 unmap_stage2_pmds(mmu, pud, addr, next);
 341                         }
 342                 }
 343         } while (pud++, addr = next, addr != end);
 344
 345         if (stage2_pud_table_empty(kvm, start_pud))
 346                 clear_stage2_p4d_entry(mmu, p4d, start_addr);
 347 }
 348
 349 static void unmap_stage2_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
 350                        phys_addr_t addr, phys_addr_t end)
 351 {
 352         struct kvm *kvm = mmu->kvm;
 353         phys_addr_t next, start_addr = addr;
 354         p4d_t *p4d, *start_p4d;
 355
 356         start_p4d = p4d = stage2_p4d_offset(kvm, pgd, addr);
 357         do {
 358                 next = stage2_p4d_addr_end(kvm, addr, end);
 359                 if (!stage2_p4d_none(kvm, *p4d))
 360                         unmap_stage2_puds(mmu, p4d, addr, next);
 361         } while (p4d++, addr = next, addr != end);
 362
 363         if (stage2_p4d_table_empty(kvm, start_p4d))
 364                 clear_stage2_pgd_entry(mmu, pgd, start_addr);
 365 }
 366
 367 /**
 368  * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
 369  * @kvm:   The VM pointer
 370  * @start: The intermediate physical base address of the range to unmap
 371  * @size:  The size of the area to unmap
 372  *
 373  * Clear a range of stage-2 mappings, lowering the various ref-counts.  Must
 374  * be called while holding mmu_lock (unless for freeing the stage2 pgd before
 375  * destroying the VM), otherwise another faulting VCPU may come in and mess
 376  * with things behind our backs.
 377  */
 378 static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size,
 379                                  bool may_block)
 380 {
 381         struct kvm *kvm = mmu->kvm;
 382         phys_addr_t end = start + size;
 383
 384         assert_spin_locked(&kvm->mmu_lock);
 385         WARN_ON(size & ~PAGE_MASK);
 386         WARN_ON(stage2_apply_range(kvm, start, end, kvm_pgtable_stage2_unmap,
 387                                    may_block));
 388 }
 389
 390 static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
 391 {
 392         __unmap_stage2_range(mmu, start, size, true);
 393 }
 394
 395 static void stage2_flush_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd,
 396                               phys_addr_t addr, phys_addr_t end)
 397 {
 398         pte_t *pte;
 399
 400         pte = pte_offset_kernel(pmd, addr);
 401         do {
 402                 if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte)))
 403                         kvm_flush_dcache_pte(*pte);
 404         } while (pte++, addr += PAGE_SIZE, addr != end);
 405 }
 406
 407 static void stage2_flush_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
 408                               phys_addr_t addr, phys_addr_t end)
 409 {
 410         struct kvm *kvm = mmu->kvm;
 411         pmd_t *pmd;
 412         phys_addr_t next;
 413
 414         pmd = stage2_pmd_offset(kvm, pud, addr);
 415         do {
 416                 next = stage2_pmd_addr_end(kvm, addr, end);
 417                 if (!pmd_none(*pmd)) {
 418                         if (pmd_thp_or_huge(*pmd))
 419                                 kvm_flush_dcache_pmd(*pmd);
 420                         else
 421                                 stage2_flush_ptes(mmu, pmd, addr, next);
 422                 }
 423         } while (pmd++, addr = next, addr != end);
 424 }
 425
 426 static void stage2_flush_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
 427                               phys_addr_t addr, phys_addr_t end)
 428 {
 429         struct kvm *kvm = mmu->kvm;
 430         pud_t *pud;
 431         phys_addr_t next;
 432
 433         pud = stage2_pud_offset(kvm, p4d, addr);
 434         do {
 435                 next = stage2_pud_addr_end(kvm, addr, end);
 436                 if (!stage2_pud_none(kvm, *pud)) {
 437                         if (stage2_pud_huge(kvm, *pud))
 438                                 kvm_flush_dcache_pud(*pud);
 439                         else
 440                                 stage2_flush_pmds(mmu, pud, addr, next);
 441                 }
 442         } while (pud++, addr = next, addr != end);
 443 }
 444
 445 static void stage2_flush_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
 446                               phys_addr_t addr, phys_addr_t end)
 447 {
 448         struct kvm *kvm = mmu->kvm;
 449         p4d_t *p4d;
 450         phys_addr_t next;
 451
 452         p4d = stage2_p4d_offset(kvm, pgd, addr);
 453         do {
 454                 next = stage2_p4d_addr_end(kvm, addr, end);
 455                 if (!stage2_p4d_none(kvm, *p4d))
 456                         stage2_flush_puds(mmu, p4d, addr, next);
 457         } while (p4d++, addr = next, addr != end);
 458 }
 459
 460 static void stage2_flush_memslot(struct kvm *kvm,
 461                                  struct kvm_memory_slot *memslot)
 462 {
 463         phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
 464         phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
 465
 466         stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_flush);
 467 }
 468
 469 /**
 470  * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
 471  * @kvm: The struct kvm pointer
 472  *
 473  * Go through the stage 2 page tables and invalidate any cache lines
 474  * backing memory already mapped to the VM.
 475  */
 476 static void stage2_flush_vm(struct kvm *kvm)
 477 {
 478         struct kvm_memslots *slots;
 479         struct kvm_memory_slot *memslot;
 480         int idx;
 481
 482         idx = srcu_read_lock(&kvm->srcu);
 483         spin_lock(&kvm->mmu_lock);
 484
 485         slots = kvm_memslots(kvm);
 486         kvm_for_each_memslot(memslot, slots)
 487                 stage2_flush_memslot(kvm, memslot);
 488
 489         spin_unlock(&kvm->mmu_lock);
 490         srcu_read_unlock(&kvm->srcu, idx);
 491 }
 492
 493 /**
 494  * free_hyp_pgds - free Hyp-mode page tables
 495  */
 496 void free_hyp_pgds(void)
 497 {
 498         mutex_lock(&kvm_hyp_pgd_mutex);
 499         if (hyp_pgtable) {
 500                 kvm_pgtable_hyp_destroy(hyp_pgtable);
 501                 kfree(hyp_pgtable);
 502         }
 503         mutex_unlock(&kvm_hyp_pgd_mutex);
 504 }
 505
 506 static int __create_hyp_mappings(unsigned long start, unsigned long size,
 507                                  unsigned long phys, enum kvm_pgtable_prot prot)
 508 {
 509         int err;
 510
 511         mutex_lock(&kvm_hyp_pgd_mutex);
 512         err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot);
 513         mutex_unlock(&kvm_hyp_pgd_mutex);
 514
 515         return err;
 516 }
 517
 518 static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
 519 {
 520         if (!is_vmalloc_addr(kaddr)) {
 521                 BUG_ON(!virt_addr_valid(kaddr));
 522                 return __pa(kaddr);
 523         } else {
 524                 return page_to_phys(vmalloc_to_page(kaddr)) +
 525                        offset_in_page(kaddr);
 526         }
 527 }
 528
 529 /**
 530  * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
 531  * @from:       The virtual kernel start address of the range
 532  * @to:         The virtual kernel end address of the range (exclusive)
 533  * @prot:       The protection to be applied to this range
 534  *
 535  * The same virtual address as the kernel virtual address is also used
 536  * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
 537  * physical pages.
 538  */
 539 int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
 540 {
 541         phys_addr_t phys_addr;
 542         unsigned long virt_addr;
 543         unsigned long start = kern_hyp_va((unsigned long)from);
 544         unsigned long end = kern_hyp_va((unsigned long)to);
 545
 546         if (is_kernel_in_hyp_mode())
 547                 return 0;
 548
 549         start = start & PAGE_MASK;
 550         end = PAGE_ALIGN(end);
 551
 552         for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
 553                 int err;
 554
 555                 phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
 556                 err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr,
 557                                             prot);
 558                 if (err)
 559                         return err;
 560         }
 561
 562         return 0;
 563 }
 564
 565 static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
 566                                         unsigned long *haddr,
 567                                         enum kvm_pgtable_prot prot)
 568 {
 569         unsigned long base;
 570         int ret = 0;
 571
 572         mutex_lock(&kvm_hyp_pgd_mutex);
 573
 574         /*
 575          * This assumes that we have enough space below the idmap
 576          * page to allocate our VAs. If not, the check below will
 577          * kick. A potential alternative would be to detect that
 578          * overflow and switch to an allocation above the idmap.
 579          *
 580          * The allocated size is always a multiple of PAGE_SIZE.
 581          */
 582         size = PAGE_ALIGN(size + offset_in_page(phys_addr));
 583         base = io_map_base - size;
 584
 585         /*
 586          * Verify that BIT(VA_BITS - 1) hasn't been flipped by
 587          * allocating the new area, as it would indicate we've
 588          * overflowed the idmap/IO address range.
 589          */
 590         if ((base ^ io_map_base) & BIT(VA_BITS - 1))
 591                 ret = -ENOMEM;
 592         else
 593                 io_map_base = base;
 594
 595         mutex_unlock(&kvm_hyp_pgd_mutex);
 596
 597         if (ret)
 598                 goto out;
 599
 600         ret = __create_hyp_mappings(base, size, phys_addr, prot);
 601         if (ret)
 602                 goto out;
 603
 604         *haddr = base + offset_in_page(phys_addr);
 605 out:
 606         return ret;
 607 }
 608
 609 /**
 610  * create_hyp_io_mappings - Map IO into both kernel and HYP
 611  * @phys_addr:  The physical start address which gets mapped
 612  * @size:       Size of the region being mapped
 613  * @kaddr:      Kernel VA for this mapping
 614  * @haddr:      HYP VA for this mapping
 615  */
 616 int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
 617                            void __iomem **kaddr,
 618                            void __iomem **haddr)
 619 {
 620         unsigned long addr;
 621         int ret;
 622
 623         *kaddr = ioremap(phys_addr, size);
 624         if (!*kaddr)
 625                 return -ENOMEM;
 626
 627         if (is_kernel_in_hyp_mode()) {
 628                 *haddr = *kaddr;
 629                 return 0;
 630         }
 631
 632         ret = __create_hyp_private_mapping(phys_addr, size,
 633                                            &addr, PAGE_HYP_DEVICE);
 634         if (ret) {
 635                 iounmap(*kaddr);
 636                 *kaddr = NULL;
 637                 *haddr = NULL;
 638                 return ret;
 639         }
 640
 641         *haddr = (void __iomem *)addr;
 642         return 0;
 643 }
 644
 645 /**
 646  * create_hyp_exec_mappings - Map an executable range into HYP
 647  * @phys_addr:  The physical start address which gets mapped
 648  * @size:       Size of the region being mapped
 649  * @haddr:      HYP VA for this mapping
 650  */
 651 int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
 652                              void **haddr)
 653 {
 654         unsigned long addr;
 655         int ret;
 656
 657         BUG_ON(is_kernel_in_hyp_mode());
 658
 659         ret = __create_hyp_private_mapping(phys_addr, size,
 660                                            &addr, PAGE_HYP_EXEC);
 661         if (ret) {
 662                 *haddr = NULL;
 663                 return ret;
 664         }
 665
 666         *haddr = (void *)addr;
 667         return 0;
 668 }
 669
 670 /**
 671  * kvm_init_stage2_mmu - Initialise a S2 MMU strucrure
 672  * @kvm:        The pointer to the KVM structure
 673  * @mmu:        The pointer to the s2 MMU structure
 674  *
 675  * Allocates only the stage-2 HW PGD level table(s).
 676  * Note we don't need locking here as this is only called when the VM is
 677  * created, which can only be done once.
 678  */
 679 int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
 680 {
 681         int cpu, err;
 682         struct kvm_pgtable *pgt;
 683
 684         if (mmu->pgt != NULL) {
 685                 kvm_err("kvm_arch already initialized?\n");
 686                 return -EINVAL;
 687         }
 688
 689         pgt = kzalloc(sizeof(*pgt), GFP_KERNEL);
 690         if (!pgt)
 691                 return -ENOMEM;
 692
 693         err = kvm_pgtable_stage2_init(pgt, kvm);
 694         if (err)
 695                 goto out_free_pgtable;
 696
 697         mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
 698         if (!mmu->last_vcpu_ran) {
 699                 err = -ENOMEM;
 700                 goto out_destroy_pgtable;
 701         }
 702
 703         for_each_possible_cpu(cpu)
 704                 *per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;
 705
 706         mmu->kvm = kvm;
 707         mmu->pgt = pgt;
 708         mmu->pgd_phys = __pa(pgt->pgd);
 709         mmu->pgd = (void *)pgt->pgd;
 710         mmu->vmid.vmid_gen = 0;
 711         return 0;
 712
 713 out_destroy_pgtable:
 714         kvm_pgtable_stage2_destroy(pgt);
 715 out_free_pgtable:
 716         kfree(pgt);
 717         return err;
 718 }
 719
 720 static void stage2_unmap_memslot(struct kvm *kvm,
 721                                  struct kvm_memory_slot *memslot)
 722 {
 723         hva_t hva = memslot->userspace_addr;
 724         phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
 725         phys_addr_t size = PAGE_SIZE * memslot->npages;
 726         hva_t reg_end = hva + size;
 727
 728         /*
 729          * A memory region could potentially cover multiple VMAs, and any holes
 730          * between them, so iterate over all of them to find out if we should
 731          * unmap any of them.
 732          *
 733          *     +--------------------------------------------+
 734          * +---------------+----------------+   +----------------+
 735          * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
 736          * +---------------+----------------+   +----------------+
 737          *     |               memory region                |
 738          *     +--------------------------------------------+
 739          */
 740         do {
 741                 struct vm_area_struct *vma = find_vma(current->mm, hva);
 742                 hva_t vm_start, vm_end;
 743
 744                 if (!vma || vma->vm_start >= reg_end)
 745                         break;
 746
 747                 /*
 748                  * Take the intersection of this VMA with the memory region
 749                  */
 750                 vm_start = max(hva, vma->vm_start);
 751                 vm_end = min(reg_end, vma->vm_end);
 752
 753                 if (!(vma->vm_flags & VM_PFNMAP)) {
 754                         gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
 755                         unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start);
 756                 }
 757                 hva = vm_end;
 758         } while (hva < reg_end);
 759 }
 760
 761 /**
 762  * stage2_unmap_vm - Unmap Stage-2 RAM mappings
 763  * @kvm: The struct kvm pointer
 764  *
 765  * Go through the memregions and unmap any regular RAM
 766  * backing memory already mapped to the VM.
 767  */
 768 void stage2_unmap_vm(struct kvm *kvm)
 769 {
 770         struct kvm_memslots *slots;
 771         struct kvm_memory_slot *memslot;
 772         int idx;
 773
 774         idx = srcu_read_lock(&kvm->srcu);
 775         mmap_read_lock(current->mm);
 776         spin_lock(&kvm->mmu_lock);
 777
 778         slots = kvm_memslots(kvm);
 779         kvm_for_each_memslot(memslot, slots)
 780                 stage2_unmap_memslot(kvm, memslot);
 781
 782         spin_unlock(&kvm->mmu_lock);
 783         mmap_read_unlock(current->mm);
 784         srcu_read_unlock(&kvm->srcu, idx);
 785 }
 786
 787 void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
 788 {
 789         struct kvm *kvm = mmu->kvm;
 790         struct kvm_pgtable *pgt = NULL;
 791
 792         spin_lock(&kvm->mmu_lock);
 793         pgt = mmu->pgt;
 794         if (pgt) {
 795                 mmu->pgd = NULL;
 796                 mmu->pgd_phys = 0;
 797                 mmu->pgt = NULL;
 798                 free_percpu(mmu->last_vcpu_ran);
 799         }
 800         spin_unlock(&kvm->mmu_lock);
 801
 802         if (pgt) {
 803                 kvm_pgtable_stage2_destroy(pgt);
 804                 kfree(pgt);
 805         }
 806 }
 807
 808 static p4d_t *stage2_get_p4d(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
 809                              phys_addr_t addr)
 810 {
 811         struct kvm *kvm = mmu->kvm;
 812         pgd_t *pgd;
 813         p4d_t *p4d;
 814
 815         pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
 816         if (stage2_pgd_none(kvm, *pgd)) {
 817                 if (!cache)
 818                         return NULL;
 819                 p4d = kvm_mmu_memory_cache_alloc(cache);
 820                 stage2_pgd_populate(kvm, pgd, p4d);
 821                 get_page(virt_to_page(pgd));
 822         }
 823
 824         return stage2_p4d_offset(kvm, pgd, addr);
 825 }
 826
 827 static pud_t *stage2_get_pud(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
 828                              phys_addr_t addr)
 829 {
 830         struct kvm *kvm = mmu->kvm;
 831         p4d_t *p4d;
 832         pud_t *pud;
 833
 834         p4d = stage2_get_p4d(mmu, cache, addr);
 835         if (stage2_p4d_none(kvm, *p4d)) {
 836                 if (!cache)
 837                         return NULL;
 838                 pud = kvm_mmu_memory_cache_alloc(cache);
 839                 stage2_p4d_populate(kvm, p4d, pud);
 840                 get_page(virt_to_page(p4d));
 841         }
 842
 843         return stage2_pud_offset(kvm, p4d, addr);
 844 }
 845
 846 static pmd_t *stage2_get_pmd(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
 847                              phys_addr_t addr)
 848 {
 849         struct kvm *kvm = mmu->kvm;
 850         pud_t *pud;
 851         pmd_t *pmd;
 852
 853         pud = stage2_get_pud(mmu, cache, addr);
 854         if (!pud || stage2_pud_huge(kvm, *pud))
 855                 return NULL;
 856
 857         if (stage2_pud_none(kvm, *pud)) {
 858                 if (!cache)
 859                         return NULL;
 860                 pmd = kvm_mmu_memory_cache_alloc(cache);
 861                 stage2_pud_populate(kvm, pud, pmd);
 862                 get_page(virt_to_page(pud));
 863         }
 864
 865         return stage2_pmd_offset(kvm, pud, addr);
 866 }
 867
 868 static int stage2_set_pmd_huge(struct kvm_s2_mmu *mmu,
 869                                struct kvm_mmu_memory_cache *cache,
 870                                phys_addr_t addr, const pmd_t *new_pmd)
 871 {
 872         pmd_t *pmd, old_pmd;
 873
 874 retry:
 875         pmd = stage2_get_pmd(mmu, cache, addr);
 876         VM_BUG_ON(!pmd);
 877
 878         old_pmd = *pmd;
 879         /*
 880          * Multiple vcpus faulting on the same PMD entry, can
 881          * lead to them sequentially updating the PMD with the
 882          * same value. Following the break-before-make
 883          * (pmd_clear() followed by tlb_flush()) process can
 884          * hinder forward progress due to refaults generated
 885          * on missing translations.
 886          *
 887          * Skip updating the page table if the entry is
 888          * unchanged.
 889          */
 890         if (pmd_val(old_pmd) == pmd_val(*new_pmd))
 891                 return 0;
 892
 893         if (pmd_present(old_pmd)) {
 894                 /*
 895                  * If we already have PTE level mapping for this block,
 896                  * we must unmap it to avoid inconsistent TLB state and
 897                  * leaking the table page. We could end up in this situation
 898                  * if the memory slot was marked for dirty logging and was
 899                  * reverted, leaving PTE level mappings for the pages accessed
 900                  * during the period. So, unmap the PTE level mapping for this
 901                  * block and retry, as we could have released the upper level
 902                  * table in the process.
 903                  *
 904                  * Normal THP split/merge follows mmu_notifier callbacks and do
 905                  * get handled accordingly.
 906                  */
 907                 if (!pmd_thp_or_huge(old_pmd)) {
 908                         unmap_stage2_range(mmu, addr & S2_PMD_MASK, S2_PMD_SIZE);
 909                         goto retry;
 910                 }
 911                 /*
 912                  * Mapping in huge pages should only happen through a
 913                  * fault.  If a page is merged into a transparent huge
 914                  * page, the individual subpages of that huge page
 915                  * should be unmapped through MMU notifiers before we
 916                  * get here.
 917                  *
 918                  * Merging of CompoundPages is not supported; they
 919                  * should become splitting first, unmapped, merged,
 920                  * and mapped back in on-demand.
 921                  */
 922                 WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
 923                 pmd_clear(pmd);
 924                 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
 925         } else {
 926                 get_page(virt_to_page(pmd));
 927         }
 928
 929         kvm_set_pmd(pmd, *new_pmd);
 930         return 0;
 931 }
 932
 933 static int stage2_set_pud_huge(struct kvm_s2_mmu *mmu,
 934                                struct kvm_mmu_memory_cache *cache,
 935                                phys_addr_t addr, const pud_t *new_pudp)
 936 {
 937         struct kvm *kvm = mmu->kvm;
 938         pud_t *pudp, old_pud;
 939
 940 retry:
 941         pudp = stage2_get_pud(mmu, cache, addr);
 942         VM_BUG_ON(!pudp);
 943
 944         old_pud = *pudp;
 945
 946         /*
 947          * A large number of vcpus faulting on the same stage 2 entry,
 948          * can lead to a refault due to the stage2_pud_clear()/tlb_flush().
 949          * Skip updating the page tables if there is no change.
 950          */
 951         if (pud_val(old_pud) == pud_val(*new_pudp))
 952                 return 0;
 953
 954         if (stage2_pud_present(kvm, old_pud)) {
 955                 /*
 956                  * If we already have table level mapping for this block, unmap
 957                  * the range for this block and retry.
 958                  */
 959                 if (!stage2_pud_huge(kvm, old_pud)) {
 960                         unmap_stage2_range(mmu, addr & S2_PUD_MASK, S2_PUD_SIZE);
 961                         goto retry;
 962                 }
 963
 964                 WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp));
 965                 stage2_pud_clear(kvm, pudp);
 966                 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
 967         } else {
 968                 get_page(virt_to_page(pudp));
 969         }
 970
 971         kvm_set_pud(pudp, *new_pudp);
 972         return 0;
 973 }
 974
 975 /*
 976  * stage2_get_leaf_entry - walk the stage2 VM page tables and return
 977  * true if a valid and present leaf-entry is found. A pointer to the
 978  * leaf-entry is returned in the appropriate level variable - pudpp,
 979  * pmdpp, ptepp.
 980  */
 981 static bool stage2_get_leaf_entry(struct kvm_s2_mmu *mmu, phys_addr_t addr,
 982                                   pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp)
 983 {
 984         struct kvm *kvm = mmu->kvm;
 985         pud_t *pudp;
 986         pmd_t *pmdp;
 987         pte_t *ptep;
 988
 989         *pudpp = NULL;
 990         *pmdpp = NULL;
 991         *ptepp = NULL;
 992
 993         pudp = stage2_get_pud(mmu, NULL, addr);
 994         if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp))
 995                 return false;
 996
 997         if (stage2_pud_huge(kvm, *pudp)) {
 998                 *pudpp = pudp;
 999                 return true;
1000         }
1001
1002         pmdp = stage2_pmd_offset(kvm, pudp, addr);
1003         if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp))
1004                 return false;
1005
1006         if (pmd_thp_or_huge(*pmdp)) {
1007                 *pmdpp = pmdp;
1008                 return true;
1009         }
1010
1011         ptep = pte_offset_kernel(pmdp, addr);
1012         if (!ptep || pte_none(*ptep) || !pte_present(*ptep))
1013                 return false;
1014
1015         *ptepp = ptep;
1016         return true;
1017 }
1018
1019 static bool stage2_is_exec(struct kvm_s2_mmu *mmu, phys_addr_t addr, unsigned long sz)
1020 {
1021         pud_t *pudp;
1022         pmd_t *pmdp;
1023         pte_t *ptep;
1024         bool found;
1025
1026         found = stage2_get_leaf_entry(mmu, addr, &pudp, &pmdp, &ptep);
1027         if (!found)
1028                 return false;
1029
1030         if (pudp)
1031                 return sz <= PUD_SIZE && kvm_s2pud_exec(pudp);
1032         else if (pmdp)
1033                 return sz <= PMD_SIZE && kvm_s2pmd_exec(pmdp);
1034         else
1035                 return sz == PAGE_SIZE && kvm_s2pte_exec(ptep);
1036 }
1037
1038 static int stage2_set_pte(struct kvm_s2_mmu *mmu,
1039                           struct kvm_mmu_memory_cache *cache,
1040                           phys_addr_t addr, const pte_t *new_pte,
1041                           unsigned long flags)
1042 {
1043         struct kvm *kvm = mmu->kvm;
1044         pud_t *pud;
1045         pmd_t *pmd;
1046         pte_t *pte, old_pte;
1047         bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
1048         bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;
1049
1050         VM_BUG_ON(logging_active && !cache);
1051
1052         /* Create stage-2 page table mapping - Levels 0 and 1 */
1053         pud = stage2_get_pud(mmu, cache, addr);
1054         if (!pud) {
1055                 /*
1056                  * Ignore calls from kvm_set_spte_hva for unallocated
1057                  * address ranges.
1058                  */
1059                 return 0;
1060         }
1061
1062         /*
1063          * While dirty page logging - dissolve huge PUD, then continue
1064          * on to allocate page.
1065          */
1066         if (logging_active)
1067                 stage2_dissolve_pud(mmu, addr, pud);
1068
1069         if (stage2_pud_none(kvm, *pud)) {
1070                 if (!cache)
1071                         return 0; /* ignore calls from kvm_set_spte_hva */
1072                 pmd = kvm_mmu_memory_cache_alloc(cache);
1073                 stage2_pud_populate(kvm, pud, pmd);
1074                 get_page(virt_to_page(pud));
1075         }
1076
1077         pmd = stage2_pmd_offset(kvm, pud, addr);
1078         if (!pmd) {
1079                 /*
1080                  * Ignore calls from kvm_set_spte_hva for unallocated
1081                  * address ranges.
1082                  */
1083                 return 0;
1084         }
1085
1086         /*
1087          * While dirty page logging - dissolve huge PMD, then continue on to
1088          * allocate page.
1089          */
1090         if (logging_active)
1091                 stage2_dissolve_pmd(mmu, addr, pmd);
1092
1093         /* Create stage-2 page mappings - Level 2 */
1094         if (pmd_none(*pmd)) {
1095                 if (!cache)
1096                         return 0; /* ignore calls from kvm_set_spte_hva */
1097                 pte = kvm_mmu_memory_cache_alloc(cache);
1098                 kvm_pmd_populate(pmd, pte);
1099                 get_page(virt_to_page(pmd));
1100         }
1101
1102         pte = pte_offset_kernel(pmd, addr);
1103
1104         if (iomap && pte_present(*pte))
1105                 return -EFAULT;
1106
1107         /* Create 2nd stage page table mapping - Level 3 */
1108         old_pte = *pte;
1109         if (pte_present(old_pte)) {
1110                 /* Skip page table update if there is no change */
1111                 if (pte_val(old_pte) == pte_val(*new_pte))
1112                         return 0;
1113
1114                 kvm_set_pte(pte, __pte(0));
1115                 kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PTE_LEVEL);
1116         } else {
1117                 get_page(virt_to_page(pte));
1118         }
1119
1120         kvm_set_pte(pte, *new_pte);
1121         return 0;
1122 }
1123
1124 #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
1125 static int stage2_ptep_test_and_clear_young(pte_t *pte)
1126 {
1127         if (pte_young(*pte)) {
1128                 *pte = pte_mkold(*pte);
1129                 return 1;
1130         }
1131         return 0;
1132 }
1133 #else
1134 static int stage2_ptep_test_and_clear_young(pte_t *pte)
1135 {
1136         return __ptep_test_and_clear_young(pte);
1137 }
1138 #endif
1139
1140 static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
1141 {
1142         return stage2_ptep_test_and_clear_young((pte_t *)pmd);
1143 }
1144
1145 static int stage2_pudp_test_and_clear_young(pud_t *pud)
1146 {
1147         return stage2_ptep_test_and_clear_young((pte_t *)pud);
1148 }
1149
1150 /**
1151  * kvm_phys_addr_ioremap - map a device range to guest IPA
1152  *
1153  * @kvm:        The KVM pointer
1154  * @guest_ipa:  The IPA at which to insert the mapping
1155  * @pa:         The physical address of the device
1156  * @size:       The size of the mapping
1157  */
1158 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
1159                           phys_addr_t pa, unsigned long size, bool writable)
1160 {
1161         phys_addr_t addr;
1162         int ret = 0;
1163         struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, };
1164         struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
1165         enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
1166                                      KVM_PGTABLE_PROT_R |
1167                                      (writable ? KVM_PGTABLE_PROT_W : 0);
1168
1169         size += offset_in_page(guest_ipa);
1170         guest_ipa &= PAGE_MASK;
1171
1172         for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
1173                 ret = kvm_mmu_topup_memory_cache(&cache,
1174                                                  kvm_mmu_cache_min_pages(kvm));
1175                 if (ret)
1176                         break;
1177
1178                 spin_lock(&kvm->mmu_lock);
1179                 ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
1180                                              &cache);
1181                 spin_unlock(&kvm->mmu_lock);
1182                 if (ret)
1183                         break;
1184
1185                 pa += PAGE_SIZE;
1186         }
1187
1188         kvm_mmu_free_memory_cache(&cache);
1189         return ret;
1190 }
1191
1192 /**
1193  * stage2_wp_ptes - write protect PMD range
1194  * @pmd:        pointer to pmd entry
1195  * @addr:       range start address
1196  * @end:        range end address
1197  */
1198 static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
1199 {
1200         pte_t *pte;
1201
1202         pte = pte_offset_kernel(pmd, addr);
1203         do {
1204                 if (!pte_none(*pte)) {
1205                         if (!kvm_s2pte_readonly(pte))
1206                                 kvm_set_s2pte_readonly(pte);
1207                 }
1208         } while (pte++, addr += PAGE_SIZE, addr != end);
1209 }
1210
1211 /**
1212  * stage2_wp_pmds - write protect PUD range
1213  * kvm:         kvm instance for the VM
1214  * @pud:        pointer to pud entry
1215  * @addr:       range start address
1216  * @end:        range end address
1217  */
1218 static void stage2_wp_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
1219                            phys_addr_t addr, phys_addr_t end)
1220 {
1221         struct kvm *kvm = mmu->kvm;
1222         pmd_t *pmd;
1223         phys_addr_t next;
1224
1225         pmd = stage2_pmd_offset(kvm, pud, addr);
1226
1227         do {
1228                 next = stage2_pmd_addr_end(kvm, addr, end);
1229                 if (!pmd_none(*pmd)) {
1230                         if (pmd_thp_or_huge(*pmd)) {
1231                                 if (!kvm_s2pmd_readonly(pmd))
1232                                         kvm_set_s2pmd_readonly(pmd);
1233                         } else {
1234                                 stage2_wp_ptes(pmd, addr, next);
1235                         }
1236                 }
1237         } while (pmd++, addr = next, addr != end);
1238 }
1239
1240 /**
1241  * stage2_wp_puds - write protect P4D range
1242  * @p4d:        pointer to p4d entry
1243  * @addr:       range start address
1244  * @end:        range end address
1245  */
1246 static void  stage2_wp_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
1247                             phys_addr_t addr, phys_addr_t end)
1248 {
1249         struct kvm *kvm = mmu->kvm;
1250         pud_t *pud;
1251         phys_addr_t next;
1252
1253         pud = stage2_pud_offset(kvm, p4d, addr);
1254         do {
1255                 next = stage2_pud_addr_end(kvm, addr, end);
1256                 if (!stage2_pud_none(kvm, *pud)) {
1257                         if (stage2_pud_huge(kvm, *pud)) {
1258                                 if (!kvm_s2pud_readonly(pud))
1259                                         kvm_set_s2pud_readonly(pud);
1260                         } else {
1261                                 stage2_wp_pmds(mmu, pud, addr, next);
1262                         }
1263                 }
1264         } while (pud++, addr = next, addr != end);
1265 }
1266
1267 /**
1268  * stage2_wp_p4ds - write protect PGD range
1269  * @pgd:        pointer to pgd entry
1270  * @addr:       range start address
1271  * @end:        range end address
1272  */
1273 static void  stage2_wp_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
1274                             phys_addr_t addr, phys_addr_t end)
1275 {
1276         struct kvm *kvm = mmu->kvm;
1277         p4d_t *p4d;
1278         phys_addr_t next;
1279
1280         p4d = stage2_p4d_offset(kvm, pgd, addr);
1281         do {
1282                 next = stage2_p4d_addr_end(kvm, addr, end);
1283                 if (!stage2_p4d_none(kvm, *p4d))
1284                         stage2_wp_puds(mmu, p4d, addr, next);
1285         } while (p4d++, addr = next, addr != end);
1286 }
1287
1288 /**
1289  * stage2_wp_range() - write protect stage2 memory region range
1290  * @kvm:        The KVM pointer
1291  * @addr:       Start address of range
1292  * @end:        End address of range
1293  */
1294 static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
1295 {
1296         struct kvm *kvm = mmu->kvm;
1297         stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_wrprotect);
1298 }
1299
1300 /**
1301  * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
1302  * @kvm:        The KVM pointer
1303  * @slot:       The memory slot to write protect
1304  *
1305  * Called to start logging dirty pages after memory region
1306  * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns
1307  * all present PUD, PMD and PTEs are write protected in the memory region.
1308  * Afterwards read of dirty page log can be called.
1309  *
1310  * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
1311  * serializing operations for VM memory regions.
1312  */
1313 void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
1314 {
1315         struct kvm_memslots *slots = kvm_memslots(kvm);
1316         struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
1317         phys_addr_t start, end;
1318
1319         if (WARN_ON_ONCE(!memslot))
1320                 return;
1321
1322         start = memslot->base_gfn << PAGE_SHIFT;
1323         end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
1324
1325         spin_lock(&kvm->mmu_lock);
1326         stage2_wp_range(&kvm->arch.mmu, start, end);
1327         spin_unlock(&kvm->mmu_lock);
1328         kvm_flush_remote_tlbs(kvm);
1329 }
1330
1331 /**
1332  * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
1333  * @kvm:        The KVM pointer
1334  * @slot:       The memory slot associated with mask
1335  * @gfn_offset: The gfn offset in memory slot
1336  * @mask:       The mask of dirty pages at offset 'gfn_offset' in this memory
1337  *              slot to be write protected
1338  *
1339  * Walks bits set in mask write protects the associated pte's. Caller must
1340  * acquire kvm_mmu_lock.
1341  */
1342 static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1343                 struct kvm_memory_slot *slot,
1344                 gfn_t gfn_offset, unsigned long mask)
1345 {
1346         phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
1347         phys_addr_t start = (base_gfn +  __ffs(mask)) << PAGE_SHIFT;
1348         phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
1349
1350         stage2_wp_range(&kvm->arch.mmu, start, end);
1351 }
1352
1353 /*
1354  * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
1355  * dirty pages.
1356  *
1357  * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
1358  * enable dirty logging for them.
1359  */
1360 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1361                 struct kvm_memory_slot *slot,
1362                 gfn_t gfn_offset, unsigned long mask)
1363 {
1364         kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1365 }
1366
1367 static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
1368 {
1369         __clean_dcache_guest_page(pfn, size);
1370 }
1371
1372 static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
1373 {
1374         __invalidate_icache_guest_page(pfn, size);
1375 }
1376
1377 static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
1378 {
1379         send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
1380 }
1381
1382 static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
1383                                                unsigned long hva,
1384                                                unsigned long map_size)
1385 {
1386         gpa_t gpa_start;
1387         hva_t uaddr_start, uaddr_end;
1388         size_t size;
1389
1390         /* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */
1391         if (map_size == PAGE_SIZE)
1392                 return true;
1393
1394         size = memslot->npages * PAGE_SIZE;
1395
1396         gpa_start = memslot->base_gfn << PAGE_SHIFT;
1397
1398         uaddr_start = memslot->userspace_addr;
1399         uaddr_end = uaddr_start + size;
1400
1401         /*
1402          * Pages belonging to memslots that don't have the same alignment
1403          * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
1404          * PMD/PUD entries, because we'll end up mapping the wrong pages.
1405          *
1406          * Consider a layout like the following:
1407          *
1408          *    memslot->userspace_addr:
1409          *    +-----+--------------------+--------------------+---+
1410          *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
1411          *    +-----+--------------------+--------------------+---+
1412          *
1413          *    memslot->base_gfn << PAGE_SHIFT:
1414          *      +---+--------------------+--------------------+-----+
1415          *      |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
1416          *      +---+--------------------+--------------------+-----+
1417          *
1418          * If we create those stage-2 blocks, we'll end up with this incorrect
1419          * mapping:
1420          *   d -> f
1421          *   e -> g
1422          *   f -> h
1423          */
1424         if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
1425                 return false;
1426
1427         /*
1428          * Next, let's make sure we're not trying to map anything not covered
1429          * by the memslot. This means we have to prohibit block size mappings
1430          * for the beginning and end of a non-block aligned and non-block sized
1431          * memory slot (illustrated by the head and tail parts of the
1432          * userspace view above containing pages 'abcde' and 'xyz',
1433          * respectively).
1434          *
1435          * Note that it doesn't matter if we do the check using the
1436          * userspace_addr or the base_gfn, as both are equally aligned (per
1437          * the check above) and equally sized.
1438          */
1439         return (hva & ~(map_size - 1)) >= uaddr_start &&
1440                (hva & ~(map_size - 1)) + map_size <= uaddr_end;
1441 }
1442
1443 /*
1444  * Check if the given hva is backed by a transparent huge page (THP) and
1445  * whether it can be mapped using block mapping in stage2. If so, adjust
1446  * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently
1447  * supported. This will need to be updated to support other THP sizes.
1448  *
1449  * Returns the size of the mapping.
1450  */
1451 static unsigned long
1452 transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
1453                             unsigned long hva, kvm_pfn_t *pfnp,
1454                             phys_addr_t *ipap)
1455 {
1456         kvm_pfn_t pfn = *pfnp;
1457
1458         /*
1459          * Make sure the adjustment is done only for THP pages. Also make
1460          * sure that the HVA and IPA are sufficiently aligned and that the
1461          * block map is contained within the memslot.
1462          */
1463         if (kvm_is_transparent_hugepage(pfn) &&
1464             fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
1465                 /*
1466                  * The address we faulted on is backed by a transparent huge
1467                  * page.  However, because we map the compound huge page and
1468                  * not the individual tail page, we need to transfer the
1469                  * refcount to the head page.  We have to be careful that the
1470                  * THP doesn't start to split while we are adjusting the
1471                  * refcounts.
1472                  *
1473                  * We are sure this doesn't happen, because mmu_notifier_retry
1474                  * was successful and we are holding the mmu_lock, so if this
1475                  * THP is trying to split, it will be blocked in the mmu
1476                  * notifier before touching any of the pages, specifically
1477                  * before being able to call __split_huge_page_refcount().
1478                  *
1479                  * We can therefore safely transfer the refcount from PG_tail
1480                  * to PG_head and switch the pfn from a tail page to the head
1481                  * page accordingly.
1482                  */
1483                 *ipap &= PMD_MASK;
1484                 kvm_release_pfn_clean(pfn);
1485                 pfn &= ~(PTRS_PER_PMD - 1);
1486                 kvm_get_pfn(pfn);
1487                 *pfnp = pfn;
1488
1489                 return PMD_SIZE;
1490         }
1491
1492         /* Use page mapping if we cannot use block mapping. */
1493         return PAGE_SIZE;
1494 }
1495
1496 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
1497                           struct kvm_memory_slot *memslot, unsigned long hva,
1498                           unsigned long fault_status)
1499 {
1500         int ret;
1501         bool write_fault, writable, force_pte = false;
1502         bool exec_fault, needs_exec;
1503         unsigned long mmu_seq;
1504         gfn_t gfn = fault_ipa >> PAGE_SHIFT;
1505         struct kvm *kvm = vcpu->kvm;
1506         struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
1507         struct vm_area_struct *vma;
1508         short vma_shift;
1509         kvm_pfn_t pfn;
1510         pgprot_t mem_type = PAGE_S2;
1511         bool logging_active = memslot_is_logging(memslot);
1512         unsigned long vma_pagesize, flags = 0;
1513         struct kvm_s2_mmu *mmu = vcpu->arch.hw_mmu;
1514
1515         write_fault = kvm_is_write_fault(vcpu);
1516         exec_fault = kvm_vcpu_trap_is_iabt(vcpu);
1517         VM_BUG_ON(write_fault && exec_fault);
1518
1519         if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
1520                 kvm_err("Unexpected L2 read permission error\n");
1521                 return -EFAULT;
1522         }
1523
1524         /* Let's check if we will get back a huge page backed by hugetlbfs */
1525         mmap_read_lock(current->mm);
1526         vma = find_vma_intersection(current->mm, hva, hva + 1);
1527         if (unlikely(!vma)) {
1528                 kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
1529                 mmap_read_unlock(current->mm);
1530                 return -EFAULT;
1531         }
1532
1533         if (is_vm_hugetlb_page(vma))
1534                 vma_shift = huge_page_shift(hstate_vma(vma));
1535         else
1536                 vma_shift = PAGE_SHIFT;
1537
1538         vma_pagesize = 1ULL << vma_shift;
1539         if (logging_active ||
1540             (vma->vm_flags & VM_PFNMAP) ||
1541             !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
1542                 force_pte = true;
1543                 vma_pagesize = PAGE_SIZE;
1544         }
1545
1546         /*
1547          * The stage2 has a minimum of 2 level table (For arm64 see
1548          * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can
1549          * use PMD_SIZE huge mappings (even when the PMD is folded into PGD).
1550          * As for PUD huge maps, we must make sure that we have at least
1551          * 3 levels, i.e, PMD is not folded.
1552          */
1553         if (vma_pagesize == PMD_SIZE ||
1554             (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm)))
1555                 gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
1556         mmap_read_unlock(current->mm);
1557
1558         /* We need minimum second+third level pages */
1559         ret = kvm_mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm));
1560         if (ret)
1561                 return ret;
1562
1563         mmu_seq = vcpu->kvm->mmu_notifier_seq;
1564         /*
1565          * Ensure the read of mmu_notifier_seq happens before we call
1566          * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
1567          * the page we just got a reference to gets unmapped before we have a
1568          * chance to grab the mmu_lock, which ensure that if the page gets
1569          * unmapped afterwards, the call to kvm_unmap_hva will take it away
1570          * from us again properly. This smp_rmb() interacts with the smp_wmb()
1571          * in kvm_mmu_notifier_invalidate_<page|range_end>.
1572          */
1573         smp_rmb();
1574
1575         pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
1576         if (pfn == KVM_PFN_ERR_HWPOISON) {
1577                 kvm_send_hwpoison_signal(hva, vma_shift);
1578                 return 0;
1579         }
1580         if (is_error_noslot_pfn(pfn))
1581                 return -EFAULT;
1582
1583         if (kvm_is_device_pfn(pfn)) {
1584                 mem_type = PAGE_S2_DEVICE;
1585                 flags |= KVM_S2PTE_FLAG_IS_IOMAP;
1586         } else if (logging_active) {
1587                 /*
1588                  * Faults on pages in a memslot with logging enabled
1589                  * should not be mapped with huge pages (it introduces churn
1590                  * and performance degradation), so force a pte mapping.
1591                  */
1592                 flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
1593
1594                 /*
1595                  * Only actually map the page as writable if this was a write
1596                  * fault.
1597                  */
1598                 if (!write_fault)
1599                         writable = false;
1600         }
1601
1602         if (exec_fault && is_iomap(flags))
1603                 return -ENOEXEC;
1604
1605         spin_lock(&kvm->mmu_lock);
1606         if (mmu_notifier_retry(kvm, mmu_seq))
1607                 goto out_unlock;
1608
1609         /*
1610          * If we are not forced to use page mapping, check if we are
1611          * backed by a THP and thus use block mapping if possible.
1612          */
1613         if (vma_pagesize == PAGE_SIZE && !force_pte)
1614                 vma_pagesize = transparent_hugepage_adjust(memslot, hva,
1615                                                            &pfn, &fault_ipa);
1616         if (writable)
1617                 kvm_set_pfn_dirty(pfn);
1618
1619         if (fault_status != FSC_PERM && !is_iomap(flags))
1620                 clean_dcache_guest_page(pfn, vma_pagesize);
1621
1622         if (exec_fault)
1623                 invalidate_icache_guest_page(pfn, vma_pagesize);
1624
1625         /*
1626          * If we took an execution fault we have made the
1627          * icache/dcache coherent above and should now let the s2
1628          * mapping be executable.
1629          *
1630          * Write faults (!exec_fault && FSC_PERM) are orthogonal to
1631          * execute permissions, and we preserve whatever we have.
1632          */
1633         needs_exec = exec_fault ||
1634                 (fault_status == FSC_PERM &&
1635                  stage2_is_exec(mmu, fault_ipa, vma_pagesize));
1636
1637         if (vma_pagesize == PUD_SIZE) {
1638                 pud_t new_pud = kvm_pfn_pud(pfn, mem_type);
1639
1640                 new_pud = kvm_pud_mkhuge(new_pud);
1641                 if (writable)
1642                         new_pud = kvm_s2pud_mkwrite(new_pud);
1643
1644                 if (needs_exec)
1645                         new_pud = kvm_s2pud_mkexec(new_pud);
1646
1647                 ret = stage2_set_pud_huge(mmu, memcache, fault_ipa, &new_pud);
1648         } else if (vma_pagesize == PMD_SIZE) {
1649                 pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type);
1650
1651                 new_pmd = kvm_pmd_mkhuge(new_pmd);
1652
1653                 if (writable)
1654                         new_pmd = kvm_s2pmd_mkwrite(new_pmd);
1655
1656                 if (needs_exec)
1657                         new_pmd = kvm_s2pmd_mkexec(new_pmd);
1658
1659                 ret = stage2_set_pmd_huge(mmu, memcache, fault_ipa, &new_pmd);
1660         } else {
1661                 pte_t new_pte = kvm_pfn_pte(pfn, mem_type);
1662
1663                 if (writable) {
1664                         new_pte = kvm_s2pte_mkwrite(new_pte);
1665                         mark_page_dirty(kvm, gfn);
1666                 }
1667
1668                 if (needs_exec)
1669                         new_pte = kvm_s2pte_mkexec(new_pte);
1670
1671                 ret = stage2_set_pte(mmu, memcache, fault_ipa, &new_pte, flags);
1672         }
1673
1674 out_unlock:
1675         spin_unlock(&kvm->mmu_lock);
1676         kvm_set_pfn_accessed(pfn);
1677         kvm_release_pfn_clean(pfn);
1678         return ret;
1679 }
1680
1681 /* Resolve the access fault by making the page young again. */
1682 static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
1683 {
1684         pte_t pte;
1685         kvm_pte_t kpte;
1686         struct kvm_s2_mmu *mmu;
1687
1688         trace_kvm_access_fault(fault_ipa);
1689
1690         spin_lock(&vcpu->kvm->mmu_lock);
1691         mmu = vcpu->arch.hw_mmu;
1692         kpte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa);
1693         spin_unlock(&vcpu->kvm->mmu_lock);
1694
1695         pte = __pte(kpte);
1696         if (pte_valid(pte))
1697                 kvm_set_pfn_accessed(pte_pfn(pte));
1698 }
1699
1700 /**
1701  * kvm_handle_guest_abort - handles all 2nd stage aborts
1702  * @vcpu:       the VCPU pointer
1703  *
1704  * Any abort that gets to the host is almost guaranteed to be caused by a
1705  * missing second stage translation table entry, which can mean that either the
1706  * guest simply needs more memory and we must allocate an appropriate page or it
1707  * can mean that the guest tried to access I/O memory, which is emulated by user
1708  * space. The distinction is based on the IPA causing the fault and whether this
1709  * memory region has been registered as standard RAM by user space.
1710  */
1711 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
1712 {
1713         unsigned long fault_status;
1714         phys_addr_t fault_ipa;
1715         struct kvm_memory_slot *memslot;
1716         unsigned long hva;
1717         bool is_iabt, write_fault, writable;
1718         gfn_t gfn;
1719         int ret, idx;
1720
1721         fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
1722
1723         fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
1724         is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
1725
1726         /* Synchronous External Abort? */
1727         if (kvm_vcpu_abt_issea(vcpu)) {
1728                 /*
1729                  * For RAS the host kernel may handle this abort.
1730                  * There is no need to pass the error into the guest.
1731                  */
1732                 if (kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_esr(vcpu)))
1733                         kvm_inject_vabt(vcpu);
1734
1735                 return 1;
1736         }
1737
1738         trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
1739                               kvm_vcpu_get_hfar(vcpu), fault_ipa);
1740
1741         /* Check the stage-2 fault is trans. fault or write fault */
1742         if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
1743             fault_status != FSC_ACCESS) {
1744                 kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
1745                         kvm_vcpu_trap_get_class(vcpu),
1746                         (unsigned long)kvm_vcpu_trap_get_fault(vcpu),
1747                         (unsigned long)kvm_vcpu_get_esr(vcpu));
1748                 return -EFAULT;
1749         }
1750
1751         idx = srcu_read_lock(&vcpu->kvm->srcu);
1752
1753         gfn = fault_ipa >> PAGE_SHIFT;
1754         memslot = gfn_to_memslot(vcpu->kvm, gfn);
1755         hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
1756         write_fault = kvm_is_write_fault(vcpu);
1757         if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
1758                 /*
1759                  * The guest has put either its instructions or its page-tables
1760                  * somewhere it shouldn't have. Userspace won't be able to do
1761                  * anything about this (there's no syndrome for a start), so
1762                  * re-inject the abort back into the guest.
1763                  */
1764                 if (is_iabt) {
1765                         ret = -ENOEXEC;
1766                         goto out;
1767                 }
1768
1769                 if (kvm_vcpu_dabt_iss1tw(vcpu)) {
1770                         kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
1771                         ret = 1;
1772                         goto out_unlock;
1773                 }
1774
1775                 /*
1776                  * Check for a cache maintenance operation. Since we
1777                  * ended-up here, we know it is outside of any memory
1778                  * slot. But we can't find out if that is for a device,
1779                  * or if the guest is just being stupid. The only thing
1780                  * we know for sure is that this range cannot be cached.
1781                  *
1782                  * So let's assume that the guest is just being
1783                  * cautious, and skip the instruction.
1784                  */
1785                 if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) {
1786                         kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
1787                         ret = 1;
1788                         goto out_unlock;
1789                 }
1790
1791                 /*
1792                  * The IPA is reported as [MAX:12], so we need to
1793                  * complement it with the bottom 12 bits from the
1794                  * faulting VA. This is always 12 bits, irrespective
1795                  * of the page size.
1796                  */
1797                 fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
1798                 ret = io_mem_abort(vcpu, fault_ipa);
1799                 goto out_unlock;
1800         }
1801
1802         /* Userspace should not be able to register out-of-bounds IPAs */
1803         VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm));
1804
1805         if (fault_status == FSC_ACCESS) {
1806                 handle_access_fault(vcpu, fault_ipa);
1807                 ret = 1;
1808                 goto out_unlock;
1809         }
1810
1811         ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
1812         if (ret == 0)
1813                 ret = 1;
1814 out:
1815         if (ret == -ENOEXEC) {
1816                 kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
1817                 ret = 1;
1818         }
1819 out_unlock:
1820         srcu_read_unlock(&vcpu->kvm->srcu, idx);
1821         return ret;
1822 }
1823
1824 static int handle_hva_to_gpa(struct kvm *kvm,
1825                              unsigned long start,
1826                              unsigned long end,
1827                              int (*handler)(struct kvm *kvm,
1828                                             gpa_t gpa, u64 size,
1829                                             void *data),
1830                              void *data)
1831 {
1832         struct kvm_memslots *slots;
1833         struct kvm_memory_slot *memslot;
1834         int ret = 0;
1835
1836         slots = kvm_memslots(kvm);
1837
1838         /* we only care about the pages that the guest sees */
1839         kvm_for_each_memslot(memslot, slots) {
1840                 unsigned long hva_start, hva_end;
1841                 gfn_t gpa;
1842
1843                 hva_start = max(start, memslot->userspace_addr);
1844                 hva_end = min(end, memslot->userspace_addr +
1845                                         (memslot->npages << PAGE_SHIFT));
1846                 if (hva_start >= hva_end)
1847                         continue;
1848
1849                 gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT;
1850                 ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data);
1851         }
1852
1853         return ret;
1854 }
1855
1856 static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
1857 {
1858         unsigned flags = *(unsigned *)data;
1859         bool may_block = flags & MMU_NOTIFIER_RANGE_BLOCKABLE;
1860
1861         __unmap_stage2_range(&kvm->arch.mmu, gpa, size, may_block);
1862         return 0;
1863 }
1864
1865 int kvm_unmap_hva_range(struct kvm *kvm,
1866                         unsigned long start, unsigned long end, unsigned flags)
1867 {
1868         if (!kvm->arch.mmu.pgd)
1869                 return 0;
1870
1871         trace_kvm_unmap_hva_range(start, end);
1872         handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, &flags);
1873         return 0;
1874 }
1875
1876 static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
1877 {
1878         kvm_pfn_t *pfn = (kvm_pfn_t *)data;
1879
1880         WARN_ON(size != PAGE_SIZE);
1881
1882         /*
1883          * The MMU notifiers will have unmapped a huge PMD before calling
1884          * ->change_pte() (which in turn calls kvm_set_spte_hva()) and
1885          * therefore we never need to clear out a huge PMD through this
1886          * calling path and a memcache is not required.
1887          */
1888         kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, gpa, PAGE_SIZE,
1889                                __pfn_to_phys(*pfn), KVM_PGTABLE_PROT_R, NULL);
1890         return 0;
1891 }
1892
1893 int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
1894 {
1895         unsigned long end = hva + PAGE_SIZE;
1896         kvm_pfn_t pfn = pte_pfn(pte);
1897
1898         if (!kvm->arch.mmu.pgt)
1899                 return 0;
1900
1901         trace_kvm_set_spte_hva(hva);
1902
1903         /*
1904          * We've moved a page around, probably through CoW, so let's treat it
1905          * just like a translation fault and clean the cache to the PoC.
1906          */
1907         clean_dcache_guest_page(pfn, PAGE_SIZE);
1908         handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pfn);
1909         return 0;
1910 }
1911
1912 static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
1913 {
1914         pte_t pte;
1915         kvm_pte_t kpte;
1916
1917         WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
1918         kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt, gpa);
1919         pte = __pte(kpte);
1920         return pte_valid(pte) && pte_young(pte);
1921 }
1922
1923 static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
1924 {
1925         WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
1926         return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt, gpa);
1927 }
1928
1929 int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
1930 {
1931         if (!kvm->arch.mmu.pgd)
1932                 return 0;
1933         trace_kvm_age_hva(start, end);
1934         return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
1935 }
1936
1937 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
1938 {
1939         if (!kvm->arch.mmu.pgd)
1940                 return 0;
1941         trace_kvm_test_age_hva(hva);
1942         return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE,
1943                                  kvm_test_age_hva_handler, NULL);
1944 }
1945
1946 phys_addr_t kvm_mmu_get_httbr(void)
1947 {
1948         return __pa(hyp_pgtable->pgd);
1949 }
1950
1951 phys_addr_t kvm_get_idmap_vector(void)
1952 {
1953         return hyp_idmap_vector;
1954 }
1955
1956 static int kvm_map_idmap_text(void)
1957 {
1958         unsigned long size = hyp_idmap_end - hyp_idmap_start;
1959         int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start,
1960                                         PAGE_HYP_EXEC);
1961         if (err)
1962                 kvm_err("Failed to idmap %lx-%lx\n",
1963                         hyp_idmap_start, hyp_idmap_end);
1964
1965         return err;
1966 }
1967
1968 int kvm_mmu_init(void)
1969 {
1970         int err;
1971         u32 hyp_va_bits;
1972
1973         hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
1974         hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
1975         hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end);
1976         hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
1977         hyp_idmap_vector = __pa_symbol(__kvm_hyp_init);
1978
1979         /*
1980          * We rely on the linker script to ensure at build time that the HYP
1981          * init code does not cross a page boundary.
1982          */
1983         BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
1984
1985         hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
1986         kvm_debug("Using %u-bit virtual addresses at EL2\n", hyp_va_bits);
1987         kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
1988         kvm_debug("HYP VA range: %lx:%lx\n",
1989                   kern_hyp_va(PAGE_OFFSET),
1990                   kern_hyp_va((unsigned long)high_memory - 1));
1991
1992         if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
1993             hyp_idmap_start <  kern_hyp_va((unsigned long)high_memory - 1) &&
1994             hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
1995                 /*
1996                  * The idmap page is intersecting with the VA space,
1997                  * it is not safe to continue further.
1998                  */
1999                 kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
2000                 err = -EINVAL;
2001                 goto out;
2002         }
2003
2004         hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL);
2005         if (!hyp_pgtable) {
2006                 kvm_err("Hyp mode page-table not allocated\n");
2007                 err = -ENOMEM;
2008                 goto out;
2009         }
2010
2011         err = kvm_pgtable_hyp_init(hyp_pgtable, hyp_va_bits);
2012         if (err)
2013                 goto out_free_pgtable;
2014
2015         err = kvm_map_idmap_text();
2016         if (err)
2017                 goto out_destroy_pgtable;
2018
2019         io_map_base = hyp_idmap_start;
2020         return 0;
2021
2022 out_destroy_pgtable:
2023         kvm_pgtable_hyp_destroy(hyp_pgtable);
2024 out_free_pgtable:
2025         kfree(hyp_pgtable);
2026         hyp_pgtable = NULL;
2027 out:
2028         return err;
2029 }
2030
2031 void kvm_arch_commit_memory_region(struct kvm *kvm,
2032                                    const struct kvm_userspace_memory_region *mem,
2033                                    struct kvm_memory_slot *old,
2034                                    const struct kvm_memory_slot *new,
2035                                    enum kvm_mr_change change)
2036 {
2037         /*
2038          * At this point memslot has been committed and there is an
2039          * allocated dirty_bitmap[], dirty pages will be tracked while the
2040          * memory slot is write protected.
2041          */
2042         if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
2043                 /*
2044                  * If we're with initial-all-set, we don't need to write
2045                  * protect any pages because they're all reported as dirty.
2046                  * Huge pages and normal pages will be write protect gradually.
2047                  */
2048                 if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) {
2049                         kvm_mmu_wp_memory_region(kvm, mem->slot);
2050                 }
2051         }
2052 }
2053
2054 int kvm_arch_prepare_memory_region(struct kvm *kvm,
2055                                    struct kvm_memory_slot *memslot,
2056                                    const struct kvm_userspace_memory_region *mem,
2057                                    enum kvm_mr_change change)
2058 {
2059         hva_t hva = mem->userspace_addr;
2060         hva_t reg_end = hva + mem->memory_size;
2061         bool writable = !(mem->flags & KVM_MEM_READONLY);
2062         int ret = 0;
2063
2064         if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
2065                         change != KVM_MR_FLAGS_ONLY)
2066                 return 0;
2067
2068         /*
2069          * Prevent userspace from creating a memory region outside of the IPA
2070          * space addressable by the KVM guest IPA space.
2071          */
2072         if (memslot->base_gfn + memslot->npages >=
2073             (kvm_phys_size(kvm) >> PAGE_SHIFT))
2074                 return -EFAULT;
2075
2076         mmap_read_lock(current->mm);
2077         /*
2078          * A memory region could potentially cover multiple VMAs, and any holes
2079          * between them, so iterate over all of them to find out if we can map
2080          * any of them right now.
2081          *
2082          *     +--------------------------------------------+
2083          * +---------------+----------------+   +----------------+
2084          * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
2085          * +---------------+----------------+   +----------------+
2086          *     |               memory region                |
2087          *     +--------------------------------------------+
2088          */
2089         do {
2090                 struct vm_area_struct *vma = find_vma(current->mm, hva);
2091                 hva_t vm_start, vm_end;
2092
2093                 if (!vma || vma->vm_start >= reg_end)
2094                         break;
2095
2096                 /*
2097                  * Take the intersection of this VMA with the memory region
2098                  */
2099                 vm_start = max(hva, vma->vm_start);
2100                 vm_end = min(reg_end, vma->vm_end);
2101
2102                 if (vma->vm_flags & VM_PFNMAP) {
2103                         gpa_t gpa = mem->guest_phys_addr +
2104                                     (vm_start - mem->userspace_addr);
2105                         phys_addr_t pa;
2106
2107                         pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
2108                         pa += vm_start - vma->vm_start;
2109
2110                         /* IO region dirty page logging not allowed */
2111                         if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) {
2112                                 ret = -EINVAL;
2113                                 goto out;
2114                         }
2115
2116                         ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
2117                                                     vm_end - vm_start,
2118                                                     writable);
2119                         if (ret)
2120                                 break;
2121                 }
2122                 hva = vm_end;
2123         } while (hva < reg_end);
2124
2125         if (change == KVM_MR_FLAGS_ONLY)
2126                 goto out;
2127
2128         spin_lock(&kvm->mmu_lock);
2129         if (ret)
2130                 unmap_stage2_range(&kvm->arch.mmu, mem->guest_phys_addr, mem->memory_size);
2131         else
2132                 stage2_flush_memslot(kvm, memslot);
2133         spin_unlock(&kvm->mmu_lock);
2134 out:
2135         mmap_read_unlock(current->mm);
2136         return ret;
2137 }
2138
2139 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
2140 {
2141 }
2142
2143 void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
2144 {
2145 }
2146
2147 void kvm_arch_flush_shadow_all(struct kvm *kvm)
2148 {
2149         kvm_free_stage2_pgd(&kvm->arch.mmu);
2150 }
2151
2152 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
2153                                    struct kvm_memory_slot *slot)
2154 {
2155         gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
2156         phys_addr_t size = slot->npages << PAGE_SHIFT;
2157
2158         spin_lock(&kvm->mmu_lock);
2159         unmap_stage2_range(&kvm->arch.mmu, gpa, size);
2160         spin_unlock(&kvm->mmu_lock);
2161 }
2162
2163 /*
2164  * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
2165  *
2166  * Main problems:
2167  * - S/W ops are local to a CPU (not broadcast)
2168  * - We have line migration behind our back (speculation)
2169  * - System caches don't support S/W at all (damn!)
2170  *
2171  * In the face of the above, the best we can do is to try and convert
2172  * S/W ops to VA ops. Because the guest is not allowed to infer the
2173  * S/W to PA mapping, it can only use S/W to nuke the whole cache,
2174  * which is a rather good thing for us.
2175  *
2176  * Also, it is only used when turning caches on/off ("The expected
2177  * usage of the cache maintenance instructions that operate by set/way
2178  * is associated with the cache maintenance instructions associated
2179  * with the powerdown and powerup of caches, if this is required by
2180  * the implementation.").
2181  *
2182  * We use the following policy:
2183  *
2184  * - If we trap a S/W operation, we enable VM trapping to detect
2185  *   caches being turned on/off, and do a full clean.
2186  *
2187  * - We flush the caches on both caches being turned on and off.
2188  *
2189  * - Once the caches are enabled, we stop trapping VM ops.
2190  */
2191 void kvm_set_way_flush(struct kvm_vcpu *vcpu)
2192 {
2193         unsigned long hcr = *vcpu_hcr(vcpu);
2194
2195         /*
2196          * If this is the first time we do a S/W operation
2197          * (i.e. HCR_TVM not set) flush the whole memory, and set the
2198          * VM trapping.
2199          *
2200          * Otherwise, rely on the VM trapping to wait for the MMU +
2201          * Caches to be turned off. At that point, we'll be able to
2202          * clean the caches again.
2203          */
2204         if (!(hcr & HCR_TVM)) {
2205                 trace_kvm_set_way_flush(*vcpu_pc(vcpu),
2206                                         vcpu_has_cache_enabled(vcpu));
2207                 stage2_flush_vm(vcpu->kvm);
2208                 *vcpu_hcr(vcpu) = hcr | HCR_TVM;
2209         }
2210 }
2211
2212 void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
2213 {
2214         bool now_enabled = vcpu_has_cache_enabled(vcpu);
2215
2216         /*
2217          * If switching the MMU+caches on, need to invalidate the caches.
2218          * If switching it off, need to clean the caches.
2219          * Clean + invalidate does the trick always.
2220          */
2221         if (now_enabled != was_enabled)
2222                 stage2_flush_vm(vcpu->kvm);
2223
2224         /* Caches are now on, stop trapping VM ops (until a S/W op) */
2225         if (now_enabled)
2226                 *vcpu_hcr(vcpu) &= ~HCR_TVM;
2227
2228         trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
2229 }