// SPDX-License-Identifier: GPL-2.0

#include "mmu.h"
#include "mmu_internal.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"
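/*
 * Whether the TDP MMU may be used for a VM; both this flag and hardware
 * support for TDP (tdp_enabled) must be set for is_tdp_mmu_enabled() to
 * return true.
 */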
static bool __read_mostly tdp_mmu_enabled = false;

static bool is_tdp_mmu_enabled(void)
{
#ifdef CONFIG_X86_64
	return tdp_enabled && READ_ONCE(tdp_mmu_enabled);
#else
	return false;
#endif /* CONFIG_X86_64 */
}
/* Initializes the TDP MMU for the VM, if enabled. */
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	if (!is_tdp_mmu_enabled())
		return;

	/* This should not be changed for the lifetime of the VM. */
	kvm->arch.tdp_mmu_enabled = true;

	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
}
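/*
 * Tears down the VM's TDP MMU state. All roots are expected to have been
 * freed (and removed from tdp_mmu_roots) before this is called.
 */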
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	if (!kvm->arch.tdp_mmu_enabled)
		return;

	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
}
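/* Iterate over each root page in the VM's list of TDP MMU roots. */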
#define for_each_tdp_mmu_root(_kvm, _root)				\
	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
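/*
 * Returns true if the page table at the given host physical address is an
 * in-use TDP MMU root.
 */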
bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa)
{
	struct kvm_mmu_page *sp;

	sp = to_shadow_page(hpa);

	return sp->tdp_mmu_page && sp->root_count;
}
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end);
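/*
 * Frees a TDP MMU root whose reference count has dropped to zero: the root is
 * removed from the VM's list, the paging structure below it is zapped, and
 * its backing pages are freed. Must be called with the MMU lock held.
 */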
void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
	gfn_t max_gfn = 1ULL << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT);

	lockdep_assert_held(&kvm->mmu_lock);

	WARN_ON(root->root_count);
	WARN_ON(!root->tdp_mmu_page);

	list_del(&root->link);

	zap_gfn_range(kvm, root, 0, max_gfn);

	free_page((unsigned long)root->spt);
	kmem_cache_free(mmu_page_header_cache, root);
}
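/*
 * Computes the page role for a TDP MMU page at the given level, derived from
 * the vCPU's current base MMU role.
 */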
static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
						   int level)
{
	union kvm_mmu_page_role role;

	role = vcpu->arch.mmu->mmu_role.base;
	role.level = level;
	role.direct = true;
	role.gpte_is_8_bytes = true;
	role.access = ACC_ALL;

	return role;
}
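/*
 * Allocates a TDP MMU page table page from the vCPU's memory caches and links
 * the backing struct page back to its kvm_mmu_page via page_private.
 */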
static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
					       int level)
{
	struct kvm_mmu_page *sp;

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	sp->role.word = page_role_for_level(vcpu, level).word;
	sp->gfn = gfn;
	sp->tdp_mmu_page = true;

	return sp;
}
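/*
 * Returns a TDP MMU root for the vCPU's current role, reusing an existing
 * root and taking a reference on it if one matches, or allocating a new one
 * otherwise.
 */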
static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
{
	union kvm_mmu_page_role role;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_page *root;

	role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);

	spin_lock(&kvm->mmu_lock);

	/* Check for an existing root before allocating a new one. */
	for_each_tdp_mmu_root(kvm, root) {
		if (root->role.word == role.word) {
			kvm_mmu_get_root(kvm, root);
			spin_unlock(&kvm->mmu_lock);
			return root;
		}
	}

	root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
	root->root_count = 1;

	list_add(&root->link, &kvm->arch.tdp_mmu_roots);

	spin_unlock(&kvm->mmu_lock);

	return root;
}
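/* Returns the host physical address of the vCPU's TDP MMU root page table. */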
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *root;

	root = get_tdp_mmu_vcpu_root(vcpu);

	return __pa(root->spt);
}
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level);
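/* Returns the address space ID (1 for SMM, 0 otherwise) of a shadow page. */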
static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
{
	return sp->role.smm ? 1 : 0;
}
/**
 * handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 *
 * Handle bookkeeping that might result from the modification of a SPTE.
 * This function must be called for all TDP SPTE modifications.
 */
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				  u64 old_spte, u64 new_spte, int level)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
	u64 *pt;
	u64 old_child_spte;
	int i;

	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON(level < PG_LEVEL_4K);
	WARN_ON(gfn % KVM_PAGES_PER_HPAGE(level));
	/*
	 * If this warning were to trigger it would indicate that there was a
	 * missing MMU notifier or a race with some notifier handler.
	 * A present, leaf SPTE should never be directly replaced with another
	 * present leaf SPTE pointing to a different PFN. A notifier handler
	 * should be zapping the SPTE before the main MM's page table is
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
	 * thread before replacement.
	 */
	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		/*
		 * Crash the host to prevent error propagation and guest data
		 * corruption.
		 */
		BUG();
	}

	if (old_spte == new_spte)
		return;
	/*
	 * The only times a SPTE should be changed from a non-present to
	 * non-present state is when an MMIO entry is installed/modified/
	 * removed. In that case, there is nothing to do here.
	 */
	if (!was_present && !is_present) {
		/*
		 * If this change does not involve a MMIO SPTE, it is
		 * unexpected. Log the change, though it should not impact the
		 * guest since both the former and current SPTEs are nonpresent.
		 */
		if (WARN_ON(!is_mmio_spte(old_spte) && !is_mmio_spte(new_spte)))
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
			       "should not be replaced with another,\n"
			       "different nonpresent SPTE, unless one or both\n"
			       "are MMIO SPTEs.\n"
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
			       as_id, gfn, old_spte, new_spte, level);
		return;
	}
	if (was_leaf && is_dirty_spte(old_spte) &&
	    (!is_dirty_spte(new_spte) || pfn_changed))
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
	/*
	 * Recursively handle child PTs if the change removed a subtree from
	 * the paging structure.
	 */
	if (was_present && !was_leaf && (pfn_changed || !is_present)) {
		pt = spte_to_child_pt(old_spte, level);

		for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
			old_child_spte = READ_ONCE(*(pt + i));
			WRITE_ONCE(*(pt + i), 0);
			handle_changed_spte(kvm, as_id,
				gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)),
				old_child_spte, 0, level - 1);
		}

		kvm_flush_remote_tlbs_with_address(kvm, gfn,
						   KVM_PAGES_PER_HPAGE(level));

		free_page((unsigned long)pt);
	}
}
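/*
 * Thin wrapper around __handle_changed_spte(); presumably kept separate so
 * that additional bookkeeping (e.g. access/dirty tracking) can later be
 * layered on top of the core SPTE change handling.
 */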
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level)
{
	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level);
}
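/*
 * Installs new_spte at the iterator's current position and performs the
 * bookkeeping required for the change. Callers are expected to hold
 * kvm->mmu_lock.
 */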
static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				    u64 new_spte)
{
	u64 *root_pt = tdp_iter_root_pt(iter);
	struct kvm_mmu_page *root = sptep_to_sp(root_pt);
	int as_id = kvm_mmu_page_as_id(root);

	*iter->sptep = new_spte;

	handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
			    iter->level);
}
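/*
 * Iterate over every SPTE mapping the GFN range [_start, _end) in the paging
 * structure rooted at _root.
 */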
#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
	for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)
/*
 * Flush the TLB if the process should drop kvm->mmu_lock.
 * Return whether the caller still needs to flush the tlb.
 */
static bool tdp_mmu_iter_flush_cond_resched(struct kvm *kvm, struct tdp_iter *iter)
{
	if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
		kvm_flush_remote_tlbs(kvm);
		cond_resched_lock(&kvm->mmu_lock);
		tdp_iter_refresh_walk(iter);
		return false;
	} else {
		return true;
	}
}
/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 */
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end)
{
	struct tdp_iter iter;
	bool flush_needed = false;

	tdp_root_for_each_pte(iter, root, start, end) {
		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		/*
		 * If this is a non-last-level SPTE that covers a larger range
		 * than should be zapped, continue, and zap the mappings at a
		 * lower level.
		 */
		if ((iter.gfn < start ||
		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		tdp_mmu_set_spte(kvm, &iter, 0);

		flush_needed = tdp_mmu_iter_flush_cond_resched(kvm, &iter);
	}
	return flush_needed;
}
/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 */
bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
{
	struct kvm_mmu_page *root;
	bool flush = false;

	for_each_tdp_mmu_root(kvm, root) {
		/*
		 * Take a reference on the root so that it cannot be freed if
		 * this thread releases the MMU lock and yields in this loop.
		 */
		kvm_mmu_get_root(kvm, root);

		flush |= zap_gfn_range(kvm, root, start, end);

		kvm_mmu_put_root(kvm, root);
	}

	return flush;
}
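/*
 * Zaps every GFN mapped by the TDP MMU and flushes the TLBs if any SPTEs were
 * cleared.
 */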
void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	gfn_t max_gfn = 1ULL << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT);
	bool flush;

	flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
	if (flush)
		kvm_flush_remote_tlbs(kvm);
}