// SPDX-License-Identifier: GPL-2.0

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

static bool __read_mostly tdp_mmu_enabled = false;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);

static bool is_tdp_mmu_enabled(void)
{
#ifdef CONFIG_X86_64
        return tdp_enabled && READ_ONCE(tdp_mmu_enabled);
#else
        return false;
#endif /* CONFIG_X86_64 */
}
/* Initializes the TDP MMU for the VM, if enabled. */
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
        if (!is_tdp_mmu_enabled())
                return;

        /* This should not be changed for the lifetime of the VM. */
        kvm->arch.tdp_mmu_enabled = true;

        INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
        INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
}
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
        if (!kvm->arch.tdp_mmu_enabled)
                return;

        WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
}
#define for_each_tdp_mmu_root(_kvm, _root)                          \
        list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
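/*
 * A sketch of the pattern the functions below follow when walking roots
 * (callers must hold kvm->mmu_lock throughout the walk):
 *
 *      struct kvm_mmu_page *root;
 *
 *      for_each_tdp_mmu_root(kvm, root) {
 *              kvm_mmu_get_root(kvm, root);
 *              ... work on root, possibly yielding the MMU lock ...
 *              kvm_mmu_put_root(kvm, root);
 *      }
 */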
bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa)
{
        struct kvm_mmu_page *sp;

        sp = to_shadow_page(hpa);

        return sp->tdp_mmu_page && sp->root_count;
}

static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                          gfn_t start, gfn_t end, bool can_yield);
void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
        gfn_t max_gfn = 1ULL << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT);

        lockdep_assert_held(&kvm->mmu_lock);

        WARN_ON(root->root_count);
        WARN_ON(!root->tdp_mmu_page);

        list_del(&root->link);

        zap_gfn_range(kvm, root, 0, max_gfn, false);

        free_page((unsigned long)root->spt);
        kmem_cache_free(mmu_page_header_cache, root);
}
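/*
 * Compute the role for a TDP MMU page at the given level. TDP MMU pages
 * are always direct-mapped, use 8-byte PTEs, and grant full access.
 */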
static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
                                                   int level)
{
        union kvm_mmu_page_role role;

        role = vcpu->arch.mmu->mmu_role.base;
        role.level = level;
        role.direct = true;
        role.gpte_is_8_bytes = true;
        role.access = ACC_ALL;

        return role;
}
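/*
 * Allocate a TDP MMU page from the vCPU's memory caches and stash a
 * back-pointer to the struct kvm_mmu_page in the backing page's private
 * field so that sptep_to_sp() can recover it later.
 */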
static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
                                               int level)
{
        struct kvm_mmu_page *sp;

        sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
        sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
        set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

        sp->role.word = page_role_for_level(vcpu, level).word;
        sp->gfn = gfn;
        sp->tdp_mmu_page = true;

        return sp;
}
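/*
 * Return a root for the vCPU's current role, reusing a root shared with
 * another vCPU if one with a matching role exists, otherwise allocating
 * a fresh one.
 */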
static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
{
        union kvm_mmu_page_role role;
        struct kvm *kvm = vcpu->kvm;
        struct kvm_mmu_page *root;

        role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);

        spin_lock(&kvm->mmu_lock);

        /* Check for an existing root before allocating a new one. */
        for_each_tdp_mmu_root(kvm, root) {
                if (root->role.word == role.word) {
                        kvm_mmu_get_root(kvm, root);
                        spin_unlock(&kvm->mmu_lock);
                        return root;
                }
        }

        root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
        root->root_count = 1;

        list_add(&root->link, &kvm->arch.tdp_mmu_roots);

        spin_unlock(&kvm->mmu_lock);

        return root;
}
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
        struct kvm_mmu_page *root;

        root = get_tdp_mmu_vcpu_root(vcpu);
        if (!root)
                return INVALID_PAGE;

        return __pa(root->spt);
}
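/*
 * A sketch of how the common MMU load path (in mmu.c, not shown here) is
 * presumably expected to consume the root returned above:
 *
 *      if (vcpu->kvm->arch.tdp_mmu_enabled)
 *              vcpu->arch.mmu->root_hpa = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
 *
 * is_tdp_mmu_root() above checks for exactly that case.
 */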
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
                                u64 old_spte, u64 new_spte, int level);
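/* SMM has its own paging structures, tracked as address space 1. */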
static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
{
        return sp->role.smm ? 1 : 0;
}
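/*
 * Propagate accessed state to the underlying struct page when a present
 * leaf SPTE loses its accessed bit, is zapped, or changes PFN.
 */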
static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
{
        bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

        if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
                return;

        if (is_accessed_spte(old_spte) &&
            (!is_accessed_spte(new_spte) || pfn_changed))
                kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}
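/*
 * Record the GFN in the dirty log when a 4K SPTE becomes writable, since
 * the guest can then dirty the page without taking further faults.
 */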
static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
                                          u64 old_spte, u64 new_spte, int level)
{
        bool pfn_changed;
        struct kvm_memory_slot *slot;

        if (level > PG_LEVEL_4K)
                return;

        pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

        if ((!is_writable_pte(old_spte) || pfn_changed) &&
            is_writable_pte(new_spte)) {
                slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
                mark_page_dirty_in_slot(slot, gfn);
        }
}
/**
 * handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 *
 * Handle bookkeeping that might result from the modification of a SPTE.
 * This function must be called for all TDP SPTE modifications.
 */
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
                                  u64 old_spte, u64 new_spte, int level)
{
        bool was_present = is_shadow_present_pte(old_spte);
        bool is_present = is_shadow_present_pte(new_spte);
        bool was_leaf = was_present && is_last_spte(old_spte, level);
        bool is_leaf = is_present && is_last_spte(new_spte, level);
        bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
        u64 *pt;
        struct kvm_mmu_page *sp;
        u64 old_child_spte;
        int i;

        WARN_ON(level > PT64_ROOT_MAX_LEVEL);
        WARN_ON(level < PG_LEVEL_4K);
        WARN_ON(gfn % KVM_PAGES_PER_HPAGE(level));

        /*
         * If this warning were to trigger it would indicate that there was a
         * missing MMU notifier or a race with some notifier handler.
         * A present, leaf SPTE should never be directly replaced with another
         * present leaf SPTE pointing to a different PFN. A notifier handler
         * should be zapping the SPTE before the main MM's page table is
         * changed, or the SPTE should be zeroed, and the TLBs flushed by the
         * thread before replacement.
         */
        if (was_leaf && is_leaf && pfn_changed) {
                pr_err("Invalid SPTE change: cannot replace a present leaf\n"
                       "SPTE with another present leaf SPTE mapping a\n"
                       "different PFN!\n"
                       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
                       as_id, gfn, old_spte, new_spte, level);

                /*
                 * Crash the host to prevent error propagation and guest data
                 * corruption.
                 */
                BUG();
        }

        if (old_spte == new_spte)
                return;

        /*
         * The only times a SPTE should be changed from a non-present to
         * non-present state is when an MMIO entry is installed/modified/
         * removed. In that case, there is nothing to do here.
         */
        if (!was_present && !is_present) {
                /*
                 * If this change does not involve a MMIO SPTE, it is
                 * unexpected. Log the change, though it should not impact the
                 * guest since both the former and current SPTEs are nonpresent.
                 */
                if (WARN_ON(!is_mmio_spte(old_spte) && !is_mmio_spte(new_spte)))
                        pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
                               "should not be replaced with another,\n"
                               "different nonpresent SPTE, unless one or both\n"
                               "are MMIO SPTEs.\n"
                               "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
                               as_id, gfn, old_spte, new_spte, level);
                return;
        }

        if (was_leaf && is_dirty_spte(old_spte) &&
            (!is_dirty_spte(new_spte) || pfn_changed))
                kvm_set_pfn_dirty(spte_to_pfn(old_spte));

        /*
         * Recursively handle child PTs if the change removed a subtree from
         * the paging structure.
         */
        if (was_present && !was_leaf && (pfn_changed || !is_present)) {
                pt = spte_to_child_pt(old_spte, level);
                sp = sptep_to_sp(pt);

                list_del(&sp->link);

                for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
                        old_child_spte = READ_ONCE(*(pt + i));
                        WRITE_ONCE(*(pt + i), 0);
                        handle_changed_spte(kvm, as_id,
                                gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)),
                                old_child_spte, 0, level - 1);
                }

                kvm_flush_remote_tlbs_with_address(kvm, gfn,
                                                   KVM_PAGES_PER_HPAGE(level));

                free_page((unsigned long)pt);
                kmem_cache_free(mmu_page_header_cache, sp);
        }
}
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
                                u64 old_spte, u64 new_spte, int level)
{
        __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level);
        handle_changed_spte_acc_track(old_spte, new_spte, level);
        handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
                                      new_spte, level);
}
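/*
 * __tdp_mmu_set_spte - write the given SPTE and handle the bookkeeping.
 * The record_acc_track and record_dirty_log flags let callers that handle
 * access tracking or dirty logging themselves suppress that handling; the
 * three wrappers below cover the common combinations.
 */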
static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
                                      u64 new_spte, bool record_acc_track,
                                      bool record_dirty_log)
{
        u64 *root_pt = tdp_iter_root_pt(iter);
        struct kvm_mmu_page *root = sptep_to_sp(root_pt);
        int as_id = kvm_mmu_page_as_id(root);

        WRITE_ONCE(*iter->sptep, new_spte);

        __handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
                              iter->level);
        if (record_acc_track)
                handle_changed_spte_acc_track(iter->old_spte, new_spte,
                                              iter->level);
        if (record_dirty_log)
                handle_changed_spte_dirty_log(kvm, as_id, iter->gfn,
                                              iter->old_spte, new_spte,
                                              iter->level);
}
static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
                                    u64 new_spte)
{
        __tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
}
static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
                                                 struct tdp_iter *iter,
                                                 u64 new_spte)
{
        __tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
}
static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
                                                 struct tdp_iter *iter,
                                                 u64 new_spte)
{
        __tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
}
#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
        for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)

#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)         \
        tdp_root_for_each_pte(_iter, _root, _start, _end)              \
                if (!is_shadow_present_pte(_iter.old_spte) ||          \
                    !is_last_spte(_iter.old_spte, _iter.level))        \
                        continue;                                      \
                else

#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)         \
        for_each_tdp_pte(_iter, __va(_mmu->root_hpa),           \
                         _mmu->shadow_root_level, _start, _end)
/*
 * Flush the TLB if the process should drop kvm->mmu_lock.
 * Return whether the caller still needs to flush the tlb.
 */
static bool tdp_mmu_iter_flush_cond_resched(struct kvm *kvm, struct tdp_iter *iter)
{
        if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
                kvm_flush_remote_tlbs(kvm);
                cond_resched_lock(&kvm->mmu_lock);
                tdp_iter_refresh_walk(iter);
                return false;
        } else {
                return true;
        }
}
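/*
 * As above, but without flushing the TLB; for callers that have not
 * cleared any SPTEs requiring a flush before the lock can be dropped.
 */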
static void tdp_mmu_iter_cond_resched(struct kvm *kvm, struct tdp_iter *iter)
{
        if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
                cond_resched_lock(&kvm->mmu_lock);
                tdp_iter_refresh_walk(iter);
        }
}
/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
 */
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                          gfn_t start, gfn_t end, bool can_yield)
{
        struct tdp_iter iter;
        bool flush_needed = false;

        tdp_root_for_each_pte(iter, root, start, end) {
                if (!is_shadow_present_pte(iter.old_spte))
                        continue;

                /*
                 * If this is a non-last-level SPTE that covers a larger range
                 * than should be zapped, continue, and zap the mappings at a
                 * lower level.
                 */
                if ((iter.gfn < start ||
                     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
                    !is_last_spte(iter.old_spte, iter.level))
                        continue;

                tdp_mmu_set_spte(kvm, &iter, 0);

                if (can_yield)
                        flush_needed = tdp_mmu_iter_flush_cond_resched(kvm, &iter);
                else
                        flush_needed = true;
        }
        return flush_needed;
}
/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 */
bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
{
        struct kvm_mmu_page *root;
        bool flush = false;

        for_each_tdp_mmu_root(kvm, root) {
                /*
                 * Take a reference on the root so that it cannot be freed if
                 * this thread releases the MMU lock and yields in this loop.
                 */
                kvm_mmu_get_root(kvm, root);

                flush |= zap_gfn_range(kvm, root, start, end, true);

                kvm_mmu_put_root(kvm, root);
        }

        return flush;
}
void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
        gfn_t max_gfn = 1ULL << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT);
        bool flush;

        flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
        if (flush)
                kvm_flush_remote_tlbs(kvm);
}
/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
                                           int map_writable,
                                           struct tdp_iter *iter,
                                           kvm_pfn_t pfn, bool prefault)
{
        u64 new_spte;
        int ret = 0;
        int make_spte_ret = 0;

        if (unlikely(is_noslot_pfn(pfn))) {
                new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
                trace_mark_mmio_spte(iter->sptep, iter->gfn, new_spte);
        } else
                make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
                                          pfn, iter->old_spte, prefault, true,
                                          map_writable, !shadow_accessed_mask,
                                          &new_spte);

        if (new_spte == iter->old_spte)
                ret = RET_PF_SPURIOUS;
        else
                tdp_mmu_set_spte(vcpu->kvm, iter, new_spte);

        /*
         * If the page fault was caused by a write but the page is write
         * protected, emulation is needed. If the emulation was skipped,
         * the vCPU would have the same fault again.
         */
        if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
                if (write)
                        ret = RET_PF_EMULATE;
                kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
        }

        /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
        if (unlikely(is_mmio_spte(new_spte)))
                ret = RET_PF_EMULATE;

        trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep);
        if (!prefault)
                vcpu->stat.pf_fixed++;

        return ret;
}
/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
                    int map_writable, int max_level, kvm_pfn_t pfn,
                    bool prefault)
{
        bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
        bool write = error_code & PFERR_WRITE_MASK;
        bool exec = error_code & PFERR_FETCH_MASK;
        bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
        struct kvm_mmu *mmu = vcpu->arch.mmu;
        struct tdp_iter iter;
        struct kvm_mmu_page *sp;
        u64 *child_pt;
        u64 new_spte;
        int ret;
        gfn_t gfn = gpa >> PAGE_SHIFT;
        int level;
        int req_level;

        if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
                return RET_PF_RETRY;
        if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
                return RET_PF_RETRY;

        level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
                                        huge_page_disallowed, &req_level);

        trace_kvm_mmu_spte_requested(gpa, level, pfn);
        tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
                if (nx_huge_page_workaround_enabled)
                        disallowed_hugepage_adjust(iter.old_spte, gfn,
                                                   iter.level, &pfn, &level);

                if (iter.level == level)
                        break;

                /*
                 * If there is an SPTE mapping a large page at a higher level
                 * than the target, that SPTE must be cleared and replaced
                 * with a non-leaf SPTE.
                 */
                if (is_shadow_present_pte(iter.old_spte) &&
                    is_large_pte(iter.old_spte)) {
                        tdp_mmu_set_spte(vcpu->kvm, &iter, 0);

                        kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn,
                                        KVM_PAGES_PER_HPAGE(iter.level));

                        /*
                         * The iter must explicitly re-read the spte here
                         * because the new value informs the !present
                         * path below.
                         */
                        iter.old_spte = READ_ONCE(*iter.sptep);
                }

                if (!is_shadow_present_pte(iter.old_spte)) {
                        sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
                        list_add(&sp->link, &vcpu->kvm->arch.tdp_mmu_pages);
                        child_pt = sp->spt;
                        clear_page(child_pt);
                        new_spte = make_nonleaf_spte(child_pt,
                                                     !shadow_accessed_mask);

                        trace_kvm_mmu_get_page(sp, true);
                        tdp_mmu_set_spte(vcpu->kvm, &iter, new_spte);
                }
        }

        if (WARN_ON(iter.level != level))
                return RET_PF_RETRY;

        ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
                                              pfn, prefault);

        return ret;
}
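/*
 * Dispatch an MMU notifier operation over the TDP MMU: for each root, and
 * for each memslot in the root's address space that overlaps the HVA range,
 * invoke the handler on the corresponding GFN range. The handlers' return
 * values are OR'd together.
 */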
static int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, unsigned long start,
                unsigned long end, unsigned long data,
                int (*handler)(struct kvm *kvm, struct kvm_memory_slot *slot,
                               struct kvm_mmu_page *root, gfn_t start,
                               gfn_t end, unsigned long data))
{
        struct kvm_memslots *slots;
        struct kvm_memory_slot *memslot;
        struct kvm_mmu_page *root;
        int ret = 0;
        int as_id;

        for_each_tdp_mmu_root(kvm, root) {
                /*
                 * Take a reference on the root so that it cannot be freed if
                 * this thread releases the MMU lock and yields in this loop.
                 */
                kvm_mmu_get_root(kvm, root);

                as_id = kvm_mmu_page_as_id(root);
                slots = __kvm_memslots(kvm, as_id);
                kvm_for_each_memslot(memslot, slots) {
                        unsigned long hva_start, hva_end;
                        gfn_t gfn_start, gfn_end;

                        hva_start = max(start, memslot->userspace_addr);
                        hva_end = min(end, memslot->userspace_addr +
                                      (memslot->npages << PAGE_SHIFT));
                        if (hva_start >= hva_end)
                                continue;
                        /*
                         * {gfn(page) | page intersects with [hva_start, hva_end)} =
                         * {gfn_start, gfn_start+1, ..., gfn_end-1}.
                         */
                        gfn_start = hva_to_gfn_memslot(hva_start, memslot);
                        gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);

                        ret |= handler(kvm, memslot, root, gfn_start,
                                       gfn_end, data);
                }

                kvm_mmu_put_root(kvm, root);
        }

        return ret;
}
static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
                                     struct kvm_memory_slot *slot,
                                     struct kvm_mmu_page *root, gfn_t start,
                                     gfn_t end, unsigned long unused)
{
        return zap_gfn_range(kvm, root, start, end, false);
}

int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
                              unsigned long end)
{
        return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
                                            zap_gfn_range_hva_wrapper);
}
/*
 * Mark the SPTEs in the range of GFNs [start, end) unaccessed and return
 * non-zero if any of the GFNs in the range have been accessed.
 */
static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
                         struct kvm_mmu_page *root, gfn_t start, gfn_t end,
                         unsigned long unused)
{
        struct tdp_iter iter;
        int young = 0;
        u64 new_spte = 0;

        tdp_root_for_each_leaf_pte(iter, root, start, end) {
                /*
                 * If we have a non-accessed entry we don't need to change the
                 * pte.
                 */
                if (!is_accessed_spte(iter.old_spte))
                        continue;

                new_spte = iter.old_spte;

                if (spte_ad_enabled(new_spte)) {
                        clear_bit((ffs(shadow_accessed_mask) - 1),
                                  (unsigned long *)&new_spte);
                } else {
                        /*
                         * Capture the dirty status of the page, so that it
                         * doesn't get lost when the SPTE is marked for access
                         * tracking.
                         */
                        if (is_writable_pte(new_spte))
                                kvm_set_pfn_dirty(spte_to_pfn(new_spte));

                        new_spte = mark_spte_for_access_track(new_spte);
                }
                new_spte &= ~shadow_dirty_mask;

                tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte);
                young = 1;
        }

        return young;
}
int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
                              unsigned long end)
{
        return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
                                            age_gfn_range);
}
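/*
 * Check whether the SPTE mapping the given GFN has been accessed, without
 * clearing its accessed state.
 */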
static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
                        struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
                        unsigned long unused2)
{
        struct tdp_iter iter;

        tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1)
                if (is_accessed_spte(iter.old_spte))
                        return 1;

        return 0;
}

int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva)
{
        return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0,
                                            test_age_gfn);
}
/*
 * Handle the changed_pte MMU notifier for the TDP MMU.
 * data is a pointer to the new pte_t mapping the HVA specified by the MMU
 * notifier.
 * Returns non-zero if a flush is needed before releasing the MMU lock.
 */
static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
                        struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
                        unsigned long data)
{
        struct tdp_iter iter;
        pte_t *ptep = (pte_t *)data;
        kvm_pfn_t new_pfn;
        u64 new_spte;
        int need_flush = 0;

        WARN_ON(pte_huge(*ptep));

        new_pfn = pte_pfn(*ptep);

        tdp_root_for_each_pte(iter, root, gfn, gfn + 1) {
                if (iter.level != PG_LEVEL_4K)
                        continue;

                if (!is_shadow_present_pte(iter.old_spte))
                        break;

                tdp_mmu_set_spte(kvm, &iter, 0);

                kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1);

                if (!pte_write(*ptep)) {
                        new_spte = kvm_mmu_changed_pte_notifier_make_spte(
                                        iter.old_spte, new_pfn);

                        tdp_mmu_set_spte(kvm, &iter, new_spte);
                }

                need_flush = 1;
        }

        if (need_flush)
                kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);

        return 0;
}

int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
                             pte_t *host_ptep)
{
        return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1,
                                            (unsigned long)host_ptep,
                                            set_tdp_spte);
}
/*
 * Remove write access from all the SPTEs at or above min_level that map GFNs
 * [start, end). Returns true if an SPTE has been changed and the TLBs need
 * to be flushed.
 */
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                             gfn_t start, gfn_t end, int min_level)
{
        struct tdp_iter iter;
        u64 new_spte;
        bool spte_set = false;

        BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

        for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
                                   min_level, start, end) {
                if (!is_shadow_present_pte(iter.old_spte) ||
                    !is_last_spte(iter.old_spte, iter.level))
                        continue;

                new_spte = iter.old_spte & ~PT_WRITABLE_MASK;

                tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
                spte_set = true;

                tdp_mmu_iter_cond_resched(kvm, &iter);
        }
        return spte_set;
}
/*
 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
 * only affect leaf SPTEs down to min_level.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
                             int min_level)
{
        struct kvm_mmu_page *root;
        int root_as_id;
        bool spte_set = false;

        for_each_tdp_mmu_root(kvm, root) {
                root_as_id = kvm_mmu_page_as_id(root);
                if (root_as_id != slot->as_id)
                        continue;

                /*
                 * Take a reference on the root so that it cannot be freed if
                 * this thread releases the MMU lock and yields in this loop.
                 */
                kvm_mmu_get_root(kvm, root);

                spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
                                slot->base_gfn + slot->npages, min_level);

                kvm_mmu_put_root(kvm, root);
        }

        return spte_set;
}
/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                                  gfn_t start, gfn_t end)
{
        struct tdp_iter iter;
        u64 new_spte;
        bool spte_set = false;

        tdp_root_for_each_leaf_pte(iter, root, start, end) {
                if (spte_ad_need_write_protect(iter.old_spte)) {
                        if (is_writable_pte(iter.old_spte))
                                new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
                        else
                                continue;
                } else {
                        if (iter.old_spte & shadow_dirty_mask)
                                new_spte = iter.old_spte & ~shadow_dirty_mask;
                        else
                                continue;
                }

                tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
                spte_set = true;

                tdp_mmu_iter_cond_resched(kvm, &iter);
        }
        return spte_set;
}
/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
        struct kvm_mmu_page *root;
        int root_as_id;
        bool spte_set = false;

        for_each_tdp_mmu_root(kvm, root) {
                root_as_id = kvm_mmu_page_as_id(root);
                if (root_as_id != slot->as_id)
                        continue;

                /*
                 * Take a reference on the root so that it cannot be freed if
                 * this thread releases the MMU lock and yields in this loop.
                 */
                kvm_mmu_get_root(kvm, root);

                spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
                                slot->base_gfn + slot->npages);

                kvm_mmu_put_root(kvm, root);
        }

        return spte_set;
}
/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
                                  gfn_t gfn, unsigned long mask, bool wrprot)
{
        struct tdp_iter iter;
        u64 new_spte;

        tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
                                   gfn + BITS_PER_LONG) {
                if (!mask)
                        break;

                if (iter.level > PG_LEVEL_4K ||
                    !(mask & (1UL << (iter.gfn - gfn))))
                        continue;

                if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
                        if (is_writable_pte(iter.old_spte))
                                new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
                        else
                                continue;
                } else {
                        if (iter.old_spte & shadow_dirty_mask)
                                new_spte = iter.old_spte & ~shadow_dirty_mask;
                        else
                                continue;
                }

                tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);

                mask &= ~(1UL << (iter.gfn - gfn));
        }
}
/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
                                       struct kvm_memory_slot *slot,
                                       gfn_t gfn, unsigned long mask,
                                       bool wrprot)
{
        struct kvm_mmu_page *root;
        int root_as_id;

        lockdep_assert_held(&kvm->mmu_lock);
        for_each_tdp_mmu_root(kvm, root) {
                root_as_id = kvm_mmu_page_as_id(root);
                if (root_as_id != slot->as_id)
                        continue;

                clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
        }
}
/*
 * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is
 * only used for PML, and so will involve setting the dirty bit on each SPTE.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                                gfn_t start, gfn_t end)
{
        struct tdp_iter iter;
        u64 new_spte;
        bool spte_set = false;

        tdp_root_for_each_pte(iter, root, start, end) {
                if (!is_shadow_present_pte(iter.old_spte))
                        continue;

                new_spte = iter.old_spte | shadow_dirty_mask;

                tdp_mmu_set_spte(kvm, &iter, new_spte);
                spte_set = true;

                tdp_mmu_iter_cond_resched(kvm, &iter);
        }

        return spte_set;
}
/*
 * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is
 * only used for PML, and so will involve setting the dirty bit on each SPTE.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot)
{
        struct kvm_mmu_page *root;
        int root_as_id;
        bool spte_set = false;

        for_each_tdp_mmu_root(kvm, root) {
                root_as_id = kvm_mmu_page_as_id(root);
                if (root_as_id != slot->as_id)
                        continue;

                /*
                 * Take a reference on the root so that it cannot be freed if
                 * this thread releases the MMU lock and yields in this loop.
                 */
                kvm_mmu_get_root(kvm, root);

                spte_set |= set_dirty_gfn_range(kvm, root, slot->base_gfn,
                                slot->base_gfn + slot->npages);

                kvm_mmu_put_root(kvm, root);
        }

        return spte_set;
}
/*
 * Clear non-leaf entries (and free associated page tables) which could
 * be replaced by large mappings, for GFNs within the slot.
 */
static void zap_collapsible_spte_range(struct kvm *kvm,
                                       struct kvm_mmu_page *root,
                                       gfn_t start, gfn_t end)
{
        struct tdp_iter iter;
        kvm_pfn_t pfn;
        bool spte_set = false;

        tdp_root_for_each_pte(iter, root, start, end) {
                if (!is_shadow_present_pte(iter.old_spte) ||
                    is_last_spte(iter.old_spte, iter.level))
                        continue;

                pfn = spte_to_pfn(iter.old_spte);
                if (kvm_is_reserved_pfn(pfn) ||
                    !PageTransCompoundMap(pfn_to_page(pfn)))
                        continue;

                tdp_mmu_set_spte(kvm, &iter, 0);

                spte_set = tdp_mmu_iter_flush_cond_resched(kvm, &iter);
        }

        if (spte_set)
                kvm_flush_remote_tlbs(kvm);
}
/*
 * Clear non-leaf entries (and free associated page tables) which could
 * be replaced by large mappings, for GFNs within the slot.
 */
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
                                       const struct kvm_memory_slot *slot)
{
        struct kvm_mmu_page *root;
        int root_as_id;

        for_each_tdp_mmu_root(kvm, root) {
                root_as_id = kvm_mmu_page_as_id(root);
                if (root_as_id != slot->as_id)
                        continue;

                /*
                 * Take a reference on the root so that it cannot be freed if
                 * this thread releases the MMU lock and yields in this loop.
                 */
                kvm_mmu_get_root(kvm, root);

                zap_collapsible_spte_range(kvm, root, slot->base_gfn,
                                           slot->base_gfn + slot->npages);

                kvm_mmu_put_root(kvm, root);
        }
}