1 // SPDX-License-Identifier: GPL-2.0-only
3 * Kernel-based Virtual Machine driver for Linux
7 * Copyright (C) 2006 Qumranet, Inc.
8 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
11 * Yaniv Kamay <yaniv@qumranet.com>
12 * Avi Kivity <avi@qumranet.com>
15 #define pr_fmt(fmt) "SVM: " fmt
17 #include <linux/kvm_host.h>
21 #include "kvm_cache_regs.h"
26 #include <linux/module.h>
27 #include <linux/mod_devicetable.h>
28 #include <linux/kernel.h>
29 #include <linux/vmalloc.h>
30 #include <linux/highmem.h>
31 #include <linux/sched.h>
32 #include <linux/trace_events.h>
33 #include <linux/slab.h>
34 #include <linux/amd-iommu.h>
35 #include <linux/hashtable.h>
36 #include <linux/frame.h>
37 #include <linux/psp-sev.h>
38 #include <linux/file.h>
39 #include <linux/pagemap.h>
40 #include <linux/swap.h>
41 #include <linux/rwsem.h>
44 #include <asm/perf_event.h>
45 #include <asm/tlbflush.h>
47 #include <asm/debugreg.h>
48 #include <asm/kvm_para.h>
49 #include <asm/irq_remapping.h>
50 #include <asm/spec-ctrl.h>
51 #include <asm/cpu_device_id.h>
53 #include <asm/virtext.h>
58 #define __ex(x) __kvm_handle_fault_on_reboot(x)
60 MODULE_AUTHOR("Qumranet");
61 MODULE_LICENSE("GPL");
64 static const struct x86_cpu_id svm_cpu_id[] = {
65 X86_MATCH_FEATURE(X86_FEATURE_SVM, NULL),
68 MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
71 #define IOPM_ALLOC_ORDER 2
72 #define MSRPM_ALLOC_ORDER 1
74 #define SEG_TYPE_LDT 2
75 #define SEG_TYPE_BUSY_TSS16 3
77 #define SVM_FEATURE_LBRV (1 << 1)
78 #define SVM_FEATURE_SVML (1 << 2)
79 #define SVM_FEATURE_TSC_RATE (1 << 4)
80 #define SVM_FEATURE_VMCB_CLEAN (1 << 5)
81 #define SVM_FEATURE_FLUSH_ASID (1 << 6)
82 #define SVM_FEATURE_DECODE_ASSIST (1 << 7)
83 #define SVM_FEATURE_PAUSE_FILTER (1 << 10)
85 #define SVM_AVIC_DOORBELL 0xc001011b
87 #define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
89 #define TSC_RATIO_RSVD 0xffffff0000000000ULL
90 #define TSC_RATIO_MIN 0x0000000000000001ULL
91 #define TSC_RATIO_MAX 0x000000ffffffffffULL
93 #define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF)
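/*
 * Added note (not in the original source): the mask above keeps bits 51:12
 * of a host physical address, i.e. a 4KB-aligned address below the 52-bit
 * architectural limit, which is the form the AVIC backing page, logical
 * table and physical table pointers in the VMCB expect.
 */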
96 * 0xff is broadcast, so the max index allowed for physical APIC ID
97 * table is 0xfe. APIC IDs above 0xff are reserved.
99 #define AVIC_MAX_PHYSICAL_ID_COUNT 255
101 #define AVIC_UNACCEL_ACCESS_WRITE_MASK 1
102 #define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0
103 #define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF
105 /* AVIC GATAG is encoded using VM and VCPU IDs */
106 #define AVIC_VCPU_ID_BITS 8
107 #define AVIC_VCPU_ID_MASK ((1 << AVIC_VCPU_ID_BITS) - 1)
109 #define AVIC_VM_ID_BITS 24
110 #define AVIC_VM_ID_NR (1 << AVIC_VM_ID_BITS)
111 #define AVIC_VM_ID_MASK ((1 << AVIC_VM_ID_BITS) - 1)
113 #define AVIC_GATAG(x, y) (((x & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \
114 (y & AVIC_VCPU_ID_MASK))
115 #define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK)
116 #define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK)
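/*
 * Illustrative example (not in the original source): with the field widths
 * above, a VM with vm_id 0x12345 and a vCPU with vcpu_id 0x07 yields
 * AVIC_GATAG(0x12345, 0x07) = (0x12345 << 8) | 0x07 = 0x1234507, and the
 * two decode macros recover 0x12345 and 0x07 from that tag.
 */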
118 static bool erratum_383_found __read_mostly;
120 u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
123 * Set osvw_len to a higher value when updated Revision Guides
124 * are published and we know what the new status bits are.
126 static uint64_t osvw_len = 4, osvw_status;
129 * This is a wrapper of struct amd_iommu_ir_data.
131 struct amd_svm_iommu_ir {
132 struct list_head node; /* Used by SVM for per-vcpu ir_list */
133 void *data; /* Storing pointer to struct amd_ir_data */
136 #define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF)
137 #define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31
138 #define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31)
140 #define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK (0xFFULL)
141 #define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12)
142 #define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62)
143 #define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63)
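/*
 * Layout note (added for clarity, derived from the masks above): each
 * 64-bit physical ID table entry packs the host physical APIC ID in bits
 * 7:0, the host physical address of the vAPIC backing page in bits 51:12,
 * the is-running flag in bit 62 and the valid flag in bit 63.
 */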
145 static DEFINE_PER_CPU(u64, current_tsc_ratio);
146 #define TSC_RATIO_DEFAULT 0x0100000000ULL
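/*
 * Added note: MSR_AMD64_TSC_RATIO is an 8.32 fixed-point value (matching
 * the 32 fractional bits advertised via kvm_tsc_scaling_ratio_frac_bits in
 * svm_hardware_setup() below), so 0x0100000000ULL is a ratio of exactly 1.0.
 */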
148 static const struct svm_direct_access_msrs {
149 u32 index; /* Index of the MSR */
150 bool always; /* True if intercept is always on */
151 } direct_access_msrs[] = {
152 { .index = MSR_STAR, .always = true },
153 { .index = MSR_IA32_SYSENTER_CS, .always = true },
155 { .index = MSR_GS_BASE, .always = true },
156 { .index = MSR_FS_BASE, .always = true },
157 { .index = MSR_KERNEL_GS_BASE, .always = true },
158 { .index = MSR_LSTAR, .always = true },
159 { .index = MSR_CSTAR, .always = true },
160 { .index = MSR_SYSCALL_MASK, .always = true },
162 { .index = MSR_IA32_SPEC_CTRL, .always = false },
163 { .index = MSR_IA32_PRED_CMD, .always = false },
164 { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
165 { .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
166 { .index = MSR_IA32_LASTINTFROMIP, .always = false },
167 { .index = MSR_IA32_LASTINTTOIP, .always = false },
168 { .index = MSR_INVALID, .always = false },
171 /* enable NPT for AMD64 and X86 with PAE */
172 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
173 bool npt_enabled = true;
179 * These two parameters are used to configure the controls for Pause-Loop Exiting:
180 * pause_filter_count: On processors that support Pause filtering (indicated
181 * by CPUID Fn8000_000A_EDX), the VMCB provides a 16 bit pause filter
182 * count value. On VMRUN this value is loaded into an internal counter.
183 * Each time a pause instruction is executed, this counter is decremented
184 * until it reaches zero at which time a #VMEXIT is generated if pause
185 * intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause
186 * Intercept Filtering for more details.
187 * This also indicates whether PLE logic is enabled.
189 * pause_filter_thresh: In addition, some processor families support advanced
190 * pause filtering (indicated by CPUID Fn8000_000A_EDX), which places an upper
191 * bound on the amount of time a guest is allowed to execute in a pause loop.
192 * In this mode, a 16-bit pause filter threshold field is added in the
193 * VMCB. The threshold value is a cycle count that is used to reset the
194 * pause counter. As with simple pause filtering, VMRUN loads the pause
195 * count value from VMCB into an internal counter. Then, on each pause
196 * instruction the hardware checks the elapsed number of cycles since
197 * the most recent pause instruction against the pause filter threshold.
198 * If the elapsed cycle count is greater than the pause filter threshold,
199 * then the internal pause count is reloaded from the VMCB and execution
200 * continues. If the elapsed cycle count is less than the pause filter
201 * threshold, then the internal pause count is decremented. If the count
202 * value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
203 * triggered. If advanced pause filtering is supported and pause filter
204 * threshold field is set to zero, the filter will operate in the simpler, count-only mode.
208 static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
209 module_param(pause_filter_thresh, ushort, 0444);
211 static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
212 module_param(pause_filter_count, ushort, 0444);
214 /* Default doubles per-vcpu window every exit. */
215 static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
216 module_param(pause_filter_count_grow, ushort, 0444);
218 /* Default resets per-vcpu window every exit to pause_filter_count. */
219 static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
220 module_param(pause_filter_count_shrink, ushort, 0444);
222 /* Default is to compute the maximum so we can never overflow. */
223 static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
224 module_param(pause_filter_count_max, ushort, 0444);
226 /* allow nested paging (virtualized MMU) for all guests */
227 static int npt = true;
228 module_param(npt, int, S_IRUGO);
230 /* allow nested virtualization in KVM/SVM */
231 static int nested = true;
232 module_param(nested, int, S_IRUGO);
234 /* enable / disable AVIC */
236 #ifdef CONFIG_X86_LOCAL_APIC
237 module_param(avic, int, S_IRUGO);
240 /* enable/disable Next RIP Save */
241 static int nrips = true;
242 module_param(nrips, int, 0444);
244 /* enable/disable Virtual VMLOAD VMSAVE */
245 static int vls = true;
246 module_param(vls, int, 0444);
248 /* enable/disable Virtual GIF */
249 static int vgif = true;
250 module_param(vgif, int, 0444);
252 /* enable/disable SEV support */
253 static int sev = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT);
254 module_param(sev, int, 0444);
256 static bool __read_mostly dump_invalid_vmcb = 0;
257 module_param(dump_invalid_vmcb, bool, 0644);
259 static u8 rsm_ins_bytes[] = "\x0f\xaa";
261 static void svm_complete_interrupts(struct vcpu_svm *svm);
262 static void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate);
263 static inline void avic_post_state_restore(struct kvm_vcpu *vcpu);
265 #define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
267 static int sev_flush_asids(void);
268 static DECLARE_RWSEM(sev_deactivate_lock);
269 static DEFINE_MUTEX(sev_bitmap_lock);
270 static unsigned int max_sev_asid;
271 static unsigned int min_sev_asid;
272 static unsigned long *sev_asid_bitmap;
273 static unsigned long *sev_reclaim_asid_bitmap;
274 #define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT)
277 struct list_head list;
278 unsigned long npages;
285 static inline struct kvm_svm *to_kvm_svm(struct kvm *kvm)
287 return container_of(kvm, struct kvm_svm, kvm);
290 static inline bool svm_sev_enabled(void)
292 return IS_ENABLED(CONFIG_KVM_AMD_SEV) ? max_sev_asid : 0;
295 static inline bool sev_guest(struct kvm *kvm)
297 #ifdef CONFIG_KVM_AMD_SEV
298 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
306 static inline int sev_get_asid(struct kvm *kvm)
308 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
313 static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data)
315 svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK;
316 mark_dirty(svm->vmcb, VMCB_AVIC);
319 static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu)
321 struct vcpu_svm *svm = to_svm(vcpu);
322 u64 *entry = svm->avic_physical_id_cache;
327 return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
330 static unsigned long iopm_base;
332 struct kvm_ldttss_desc {
335 unsigned base1:8, type:5, dpl:2, p:1;
336 unsigned limit1:4, zero0:3, g:1, base2:8;
339 } __attribute__((packed));
341 struct svm_cpu_data {
348 struct kvm_ldttss_desc *tss_desc;
350 struct page *save_area;
351 struct vmcb *current_vmcb;
353 /* index = sev_asid, value = vmcb pointer */
354 struct vmcb **sev_vmcbs;
357 static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
359 static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
361 #define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
362 #define MSRS_RANGE_SIZE 2048
363 #define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
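/*
 * Worked example (illustrative, not part of the original code): each MSR
 * takes two bits in the permission bitmap (one read, one write), i.e. four
 * MSRs per byte.  MSR_STAR (0xc0000081) falls in the second range, so
 * svm_msrpm_offset() below computes 0x81 / 4 + 1 * MSRS_RANGE_SIZE =
 * 0x20 + 0x800 = 0x820 bytes and returns the u32 offset 0x820 / 4 = 0x208.
 */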
365 u32 svm_msrpm_offset(u32 msr)
370 for (i = 0; i < NUM_MSR_MAPS; i++) {
371 if (msr < msrpm_ranges[i] ||
372 msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
375 offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
376 offset += (i * MSRS_RANGE_SIZE); /* add range offset */
378 /* Now we have the u8 offset - but need the u32 offset */
382 /* MSR not in any range */
386 #define MAX_INST_SIZE 15
388 static inline void clgi(void)
390 asm volatile (__ex("clgi"));
393 static inline void stgi(void)
395 asm volatile (__ex("stgi"));
398 static inline void invlpga(unsigned long addr, u32 asid)
400 asm volatile (__ex("invlpga %1, %0") : : "c"(asid), "a"(addr));
403 static int get_npt_level(struct kvm_vcpu *vcpu)
406 return PT64_ROOT_4LEVEL;
408 return PT32E_ROOT_LEVEL;
412 void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
414 vcpu->arch.efer = efer;
417 /* Shadow paging assumes NX to be available. */
420 if (!(efer & EFER_LMA))
424 to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
425 mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
428 static int is_external_interrupt(u32 info)
430 info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
431 return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
434 static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
436 struct vcpu_svm *svm = to_svm(vcpu);
439 if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
440 ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
444 static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
446 struct vcpu_svm *svm = to_svm(vcpu);
449 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
451 svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
455 static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
457 struct vcpu_svm *svm = to_svm(vcpu);
459 if (nrips && svm->vmcb->control.next_rip != 0) {
460 WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
461 svm->next_rip = svm->vmcb->control.next_rip;
464 if (!svm->next_rip) {
465 if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
468 if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
469 pr_err("%s: ip 0x%lx next 0x%llx\n",
470 __func__, kvm_rip_read(vcpu), svm->next_rip);
471 kvm_rip_write(vcpu, svm->next_rip);
473 svm_set_interrupt_shadow(vcpu, 0);
478 static void svm_queue_exception(struct kvm_vcpu *vcpu)
480 struct vcpu_svm *svm = to_svm(vcpu);
481 unsigned nr = vcpu->arch.exception.nr;
482 bool has_error_code = vcpu->arch.exception.has_error_code;
483 bool reinject = vcpu->arch.exception.injected;
484 u32 error_code = vcpu->arch.exception.error_code;
487 * If we are within a nested VM we'd better #VMEXIT and let the guest
488 * handle the exception
491 nested_svm_check_exception(svm, nr, has_error_code, error_code))
494 kvm_deliver_exception_payload(&svm->vcpu);
496 if (nr == BP_VECTOR && !nrips) {
497 unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
500 * For guest debugging where we have to reinject #BP if some
501 * INT3 is guest-owned:
502 * Emulate nRIP by moving RIP forward. Will fail if injection
503 * raises a fault that is not intercepted. Still better than
504 * failing in all cases.
506 (void)skip_emulated_instruction(&svm->vcpu);
507 rip = kvm_rip_read(&svm->vcpu);
508 svm->int3_rip = rip + svm->vmcb->save.cs.base;
509 svm->int3_injected = rip - old_rip;
512 svm->vmcb->control.event_inj = nr
514 | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
515 | SVM_EVTINJ_TYPE_EXEPT;
516 svm->vmcb->control.event_inj_err = error_code;
519 static void svm_init_erratum_383(void)
525 if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
528 /* Use _safe variants to not break nested virtualization */
529 val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
535 low = lower_32_bits(val);
536 high = upper_32_bits(val);
538 native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
540 erratum_383_found = true;
543 static void svm_init_osvw(struct kvm_vcpu *vcpu)
546 * Guests should see errata 400 and 415 as fixed (assuming that
547 * HLT and IO instructions are intercepted).
549 vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
550 vcpu->arch.osvw.status = osvw_status & ~(6ULL);
553 * By increasing VCPU's osvw.length to 3 we are telling the guest that
554 * all osvw.status bits inside that length, including bit 0 (which is
555 * reserved for erratum 298), are valid. However, if host processor's
556 * osvw_len is 0 then osvw_status[0] carries no information. We need to
557 * be conservative here and therefore we tell the guest that erratum 298
558 * is present (because we really don't know).
560 if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
561 vcpu->arch.osvw.status |= 1;
564 static int has_svm(void)
568 if (!cpu_has_svm(&msg)) {
569 printk(KERN_INFO "has_svm: %s\n", msg);
576 static void svm_hardware_disable(void)
578 /* Make sure we clean up behind us */
579 if (static_cpu_has(X86_FEATURE_TSCRATEMSR))
580 wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
584 amd_pmu_disable_virt();
587 static int svm_hardware_enable(void)
590 struct svm_cpu_data *sd;
592 struct desc_struct *gdt;
593 int me = raw_smp_processor_id();
595 rdmsrl(MSR_EFER, efer);
596 if (efer & EFER_SVME)
600 pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
603 sd = per_cpu(svm_data, me);
605 pr_err("%s: svm_data is NULL on %d\n", __func__, me);
609 sd->asid_generation = 1;
610 sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
611 sd->next_asid = sd->max_asid + 1;
612 sd->min_asid = max_sev_asid + 1;
614 gdt = get_current_gdt_rw();
615 sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
617 wrmsrl(MSR_EFER, efer | EFER_SVME);
619 wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
621 if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
622 wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
623 __this_cpu_write(current_tsc_ratio, TSC_RATIO_DEFAULT);
630 * Note that it is possible to have a system with mixed processor
631 * revisions and therefore different OSVW bits. If bits are not the same
632 * on different processors then choose the worst case (i.e. if erratum
633 * is present on one processor and not on another then assume that the
634 * erratum is present everywhere).
636 if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
637 uint64_t len, status = 0;
640 len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
642 status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
646 osvw_status = osvw_len = 0;
650 osvw_status |= status;
651 osvw_status &= (1ULL << osvw_len) - 1;
654 osvw_status = osvw_len = 0;
656 svm_init_erratum_383();
658 amd_pmu_enable_virt();
663 static void svm_cpu_uninit(int cpu)
665 struct svm_cpu_data *sd = per_cpu(svm_data, raw_smp_processor_id());
670 per_cpu(svm_data, raw_smp_processor_id()) = NULL;
671 kfree(sd->sev_vmcbs);
672 __free_page(sd->save_area);
676 static int svm_cpu_init(int cpu)
678 struct svm_cpu_data *sd;
680 sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
684 sd->save_area = alloc_page(GFP_KERNEL);
688 if (svm_sev_enabled()) {
689 sd->sev_vmcbs = kmalloc_array(max_sev_asid + 1,
696 per_cpu(svm_data, cpu) = sd;
701 __free_page(sd->save_area);
708 static bool valid_msr_intercept(u32 index)
712 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
713 if (direct_access_msrs[i].index == index)
719 static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr)
726 msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
729 offset = svm_msrpm_offset(msr);
730 bit_write = 2 * (msr & 0x0f) + 1;
733 BUG_ON(offset == MSR_INVALID);
735 return !!test_bit(bit_write, &tmp);
738 static void set_msr_interception(u32 *msrpm, unsigned msr,
741 u8 bit_read, bit_write;
746 * If this warning triggers, extend the direct_access_msrs list at the
747 * beginning of the file
749 WARN_ON(!valid_msr_intercept(msr));
751 offset = svm_msrpm_offset(msr);
752 bit_read = 2 * (msr & 0x0f);
753 bit_write = 2 * (msr & 0x0f) + 1;
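	/*
	 * Clarifying note (added): svm_msrpm_offset() returns an index into an
	 * array of u32s, and each u32 covers 16 MSRs at two bits apiece, so the
	 * low four bits of the MSR number select the bit pair within that u32.
	 */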
756 BUG_ON(offset == MSR_INVALID);
758 read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp);
759 write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
764 static void svm_vcpu_init_msrpm(u32 *msrpm)
768 memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
770 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
771 if (!direct_access_msrs[i].always)
774 set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);
778 static void add_msr_offset(u32 offset)
782 for (i = 0; i < MSRPM_OFFSETS; ++i) {
784 /* Offset already in list? */
785 if (msrpm_offsets[i] == offset)
788 /* Slot used by another offset? */
789 if (msrpm_offsets[i] != MSR_INVALID)
792 /* Add offset to list */
793 msrpm_offsets[i] = offset;
799 * If this BUG triggers, the msrpm_offsets table has overflowed. Just
800 * increase MSRPM_OFFSETS in this case.
805 static void init_msrpm_offsets(void)
809 memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
811 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
814 offset = svm_msrpm_offset(direct_access_msrs[i].index);
815 BUG_ON(offset == MSR_INVALID);
817 add_msr_offset(offset);
821 static void svm_enable_lbrv(struct vcpu_svm *svm)
823 u32 *msrpm = svm->msrpm;
825 svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
826 set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
827 set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
828 set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
829 set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
832 static void svm_disable_lbrv(struct vcpu_svm *svm)
834 u32 *msrpm = svm->msrpm;
836 svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
837 set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
838 set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
839 set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
840 set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
843 void disable_nmi_singlestep(struct vcpu_svm *svm)
845 svm->nmi_singlestep = false;
847 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
848 /* Clear our flags if they were not set by the guest */
849 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
850 svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
851 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
852 svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
857 * This hash table is used to map VM_ID to a struct kvm_svm,
858 * when handling AMD IOMMU GALOG notification to schedule in a particular vCPU.
861 #define SVM_VM_DATA_HASH_BITS 8
862 static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
863 static u32 next_vm_id = 0;
864 static bool next_vm_id_wrapped = 0;
865 static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
868 * This function is called from the IOMMU driver to notify
869 * SVM to schedule in a particular vCPU of a particular VM.
871 static int avic_ga_log_notifier(u32 ga_tag)
874 struct kvm_svm *kvm_svm;
875 struct kvm_vcpu *vcpu = NULL;
876 u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
877 u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
879 pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
880 trace_kvm_avic_ga_log(vm_id, vcpu_id);
882 spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
883 hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
884 if (kvm_svm->avic_vm_id != vm_id)
886 vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id);
889 spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
892 * At this point, the IOMMU should have already set the pending
893 * bit in the vAPIC backing page. So, we just need to schedule in the vCPU.
897 kvm_vcpu_wake_up(vcpu);
902 static __init int sev_hardware_setup(void)
904 struct sev_user_data_status *status;
907 /* Maximum number of encrypted guests supported simultaneously */
908 max_sev_asid = cpuid_ecx(0x8000001F);
913 /* Minimum ASID value that should be used for SEV guest */
914 min_sev_asid = cpuid_edx(0x8000001F);
916 /* Initialize SEV ASID bitmaps */
917 sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL);
918 if (!sev_asid_bitmap)
921 sev_reclaim_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL);
922 if (!sev_reclaim_asid_bitmap)
925 status = kmalloc(sizeof(*status), GFP_KERNEL);
930 * Check SEV platform status.
932 * PLATFORM_STATUS can be called in any state. If we fail to query the
933 * PLATFORM status, then either the PSP firmware does not support the SEV
934 * feature or the SEV firmware is dead.
936 rc = sev_platform_status(status, NULL);
940 pr_info("SEV supported\n");
947 static void grow_ple_window(struct kvm_vcpu *vcpu)
949 struct vcpu_svm *svm = to_svm(vcpu);
950 struct vmcb_control_area *control = &svm->vmcb->control;
951 int old = control->pause_filter_count;
953 control->pause_filter_count = __grow_ple_window(old,
955 pause_filter_count_grow,
956 pause_filter_count_max);
958 if (control->pause_filter_count != old) {
959 mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
960 trace_kvm_ple_window_update(vcpu->vcpu_id,
961 control->pause_filter_count, old);
965 static void shrink_ple_window(struct kvm_vcpu *vcpu)
967 struct vcpu_svm *svm = to_svm(vcpu);
968 struct vmcb_control_area *control = &svm->vmcb->control;
969 int old = control->pause_filter_count;
971 control->pause_filter_count =
972 __shrink_ple_window(old,
974 pause_filter_count_shrink,
976 if (control->pause_filter_count != old) {
977 mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
978 trace_kvm_ple_window_update(vcpu->vcpu_id,
979 control->pause_filter_count, old);
984 * The default MMIO mask is a single bit (excluding the present bit),
985 * which could conflict with the memory encryption bit. Check for
986 * memory encryption support and override the default MMIO mask if
987 * memory encryption is enabled.
989 static __init void svm_adjust_mmio_mask(void)
991 unsigned int enc_bit, mask_bit;
994 /* If there is no memory encryption support, use existing mask */
995 if (cpuid_eax(0x80000000) < 0x8000001f)
998 /* If memory encryption is not enabled, use existing mask */
999 rdmsrl(MSR_K8_SYSCFG, msr);
1000 if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
1003 enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
1004 mask_bit = boot_cpu_data.x86_phys_bits;
1006 /* Increment the mask bit if it is the same as the encryption bit */
1007 if (enc_bit == mask_bit)
1011 * If the mask bit location is below 52, then some bits above the
1012 * physical addressing limit will always be reserved, so use the
1013 * rsvd_bits() function to generate the mask. This mask, along with
1014 * the present bit, will be used to generate a page fault with
1017 * If the mask bit location is 52 (or above), then clear the mask.
1019 mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
1021 kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
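	/*
	 * Illustrative example (hypothetical CPU, not from the original code):
	 * with 48 physical address bits and the encryption bit at position 47,
	 * enc_bit != mask_bit, so mask_bit stays 48 and the MMIO mask becomes
	 * rsvd_bits(48, 51) | PT_PRESENT_MASK, a combination that is always
	 * reserved and therefore reliably faults for MMIO emulation.
	 */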
1024 static void svm_hardware_teardown(void)
1028 if (svm_sev_enabled()) {
1029 bitmap_free(sev_asid_bitmap);
1030 bitmap_free(sev_reclaim_asid_bitmap);
1035 for_each_possible_cpu(cpu)
1036 svm_cpu_uninit(cpu);
1038 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
1042 static __init void svm_set_cpu_caps(void)
1048 /* CPUID 0x80000001 and 0x8000000A (SVM features) */
1050 kvm_cpu_cap_set(X86_FEATURE_SVM);
1053 kvm_cpu_cap_set(X86_FEATURE_NRIPS);
1056 kvm_cpu_cap_set(X86_FEATURE_NPT);
1059 /* CPUID 0x80000008 */
1060 if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
1061 boot_cpu_has(X86_FEATURE_AMD_SSBD))
1062 kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
1065 static __init int svm_hardware_setup(void)
1068 struct page *iopm_pages;
1072 iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
1077 iopm_va = page_address(iopm_pages);
1078 memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
1079 iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
1081 init_msrpm_offsets();
1083 supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
1085 if (boot_cpu_has(X86_FEATURE_NX))
1086 kvm_enable_efer_bits(EFER_NX);
1088 if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
1089 kvm_enable_efer_bits(EFER_FFXSR);
1091 if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
1092 kvm_has_tsc_control = true;
1093 kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
1094 kvm_tsc_scaling_ratio_frac_bits = 32;
1097 /* Check for pause filtering support */
1098 if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
1099 pause_filter_count = 0;
1100 pause_filter_thresh = 0;
1101 } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
1102 pause_filter_thresh = 0;
1106 printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
1107 kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
1111 if (boot_cpu_has(X86_FEATURE_SEV) &&
1112 IS_ENABLED(CONFIG_KVM_AMD_SEV)) {
1113 r = sev_hardware_setup();
1121 svm_adjust_mmio_mask();
1123 for_each_possible_cpu(cpu) {
1124 r = svm_cpu_init(cpu);
1129 if (!boot_cpu_has(X86_FEATURE_NPT))
1130 npt_enabled = false;
1132 if (npt_enabled && !npt)
1133 npt_enabled = false;
1135 kvm_configure_mmu(npt_enabled, PT_PDPE_LEVEL);
1136 pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
1139 if (!boot_cpu_has(X86_FEATURE_NRIPS))
1145 !boot_cpu_has(X86_FEATURE_AVIC) ||
1146 !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) {
1149 pr_info("AVIC enabled\n");
1151 amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
1157 !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
1158 !IS_ENABLED(CONFIG_X86_64)) {
1161 pr_info("Virtual VMLOAD VMSAVE supported\n");
1166 if (!boot_cpu_has(X86_FEATURE_VGIF))
1169 pr_info("Virtual GIF supported\n");
1177 svm_hardware_teardown();
1181 static void init_seg(struct vmcb_seg *seg)
1184 seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
1185 SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
1186 seg->limit = 0xffff;
1190 static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
1193 seg->attrib = SVM_SELECTOR_P_MASK | type;
1194 seg->limit = 0xffff;
1198 static u64 svm_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
1200 struct vcpu_svm *svm = to_svm(vcpu);
1202 if (is_guest_mode(vcpu))
1203 return svm->nested.hsave->control.tsc_offset;
1205 return vcpu->arch.tsc_offset;
1208 static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1210 struct vcpu_svm *svm = to_svm(vcpu);
1211 u64 g_tsc_offset = 0;
1213 if (is_guest_mode(vcpu)) {
1214 /* Write L1's TSC offset. */
1215 g_tsc_offset = svm->vmcb->control.tsc_offset -
1216 svm->nested.hsave->control.tsc_offset;
1217 svm->nested.hsave->control.tsc_offset = offset;
1220 trace_kvm_write_tsc_offset(vcpu->vcpu_id,
1221 svm->vmcb->control.tsc_offset - g_tsc_offset,
1224 svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
1226 mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1227 return svm->vmcb->control.tsc_offset;
1230 static void avic_init_vmcb(struct vcpu_svm *svm)
1232 struct vmcb *vmcb = svm->vmcb;
1233 struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
1234 phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
1235 phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page));
1236 phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page));
1238 vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
1239 vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
1240 vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
1241 vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT;
1242 if (kvm_apicv_activated(svm->vcpu.kvm))
1243 vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
1245 vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
1248 static void init_vmcb(struct vcpu_svm *svm)
1250 struct vmcb_control_area *control = &svm->vmcb->control;
1251 struct vmcb_save_area *save = &svm->vmcb->save;
1253 svm->vcpu.arch.hflags = 0;
1255 set_cr_intercept(svm, INTERCEPT_CR0_READ);
1256 set_cr_intercept(svm, INTERCEPT_CR3_READ);
1257 set_cr_intercept(svm, INTERCEPT_CR4_READ);
1258 set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1259 set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
1260 set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
1261 if (!kvm_vcpu_apicv_active(&svm->vcpu))
1262 set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
1264 set_dr_intercepts(svm);
1266 set_exception_intercept(svm, PF_VECTOR);
1267 set_exception_intercept(svm, UD_VECTOR);
1268 set_exception_intercept(svm, MC_VECTOR);
1269 set_exception_intercept(svm, AC_VECTOR);
1270 set_exception_intercept(svm, DB_VECTOR);
1272 * Guest access to VMware backdoor ports could legitimately
1273 * trigger #GP because of TSS I/O permission bitmap.
1274 * We intercept those #GP and allow access to them anyway
1277 if (enable_vmware_backdoor)
1278 set_exception_intercept(svm, GP_VECTOR);
1280 set_intercept(svm, INTERCEPT_INTR);
1281 set_intercept(svm, INTERCEPT_NMI);
1282 set_intercept(svm, INTERCEPT_SMI);
1283 set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
1284 set_intercept(svm, INTERCEPT_RDPMC);
1285 set_intercept(svm, INTERCEPT_CPUID);
1286 set_intercept(svm, INTERCEPT_INVD);
1287 set_intercept(svm, INTERCEPT_INVLPG);
1288 set_intercept(svm, INTERCEPT_INVLPGA);
1289 set_intercept(svm, INTERCEPT_IOIO_PROT);
1290 set_intercept(svm, INTERCEPT_MSR_PROT);
1291 set_intercept(svm, INTERCEPT_TASK_SWITCH);
1292 set_intercept(svm, INTERCEPT_SHUTDOWN);
1293 set_intercept(svm, INTERCEPT_VMRUN);
1294 set_intercept(svm, INTERCEPT_VMMCALL);
1295 set_intercept(svm, INTERCEPT_VMLOAD);
1296 set_intercept(svm, INTERCEPT_VMSAVE);
1297 set_intercept(svm, INTERCEPT_STGI);
1298 set_intercept(svm, INTERCEPT_CLGI);
1299 set_intercept(svm, INTERCEPT_SKINIT);
1300 set_intercept(svm, INTERCEPT_WBINVD);
1301 set_intercept(svm, INTERCEPT_XSETBV);
1302 set_intercept(svm, INTERCEPT_RDPRU);
1303 set_intercept(svm, INTERCEPT_RSM);
1305 if (!kvm_mwait_in_guest(svm->vcpu.kvm)) {
1306 set_intercept(svm, INTERCEPT_MONITOR);
1307 set_intercept(svm, INTERCEPT_MWAIT);
1310 if (!kvm_hlt_in_guest(svm->vcpu.kvm))
1311 set_intercept(svm, INTERCEPT_HLT);
1313 control->iopm_base_pa = __sme_set(iopm_base);
1314 control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
1315 control->int_ctl = V_INTR_MASKING_MASK;
1317 init_seg(&save->es);
1318 init_seg(&save->ss);
1319 init_seg(&save->ds);
1320 init_seg(&save->fs);
1321 init_seg(&save->gs);
1323 save->cs.selector = 0xf000;
1324 save->cs.base = 0xffff0000;
1325 /* Executable/Readable Code Segment */
1326 save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
1327 SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
1328 save->cs.limit = 0xffff;
1330 save->gdtr.limit = 0xffff;
1331 save->idtr.limit = 0xffff;
1333 init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
1334 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
1336 svm_set_efer(&svm->vcpu, 0);
1337 save->dr6 = 0xffff0ff0;
1338 kvm_set_rflags(&svm->vcpu, 2);
1339 save->rip = 0x0000fff0;
1340 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
1343 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
1344 * It also updates the guest-visible cr0 value.
1346 svm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
1347 kvm_mmu_reset_context(&svm->vcpu);
1349 save->cr4 = X86_CR4_PAE;
1353 /* Setup VMCB for Nested Paging */
1354 control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
1355 clr_intercept(svm, INTERCEPT_INVLPG);
1356 clr_exception_intercept(svm, PF_VECTOR);
1357 clr_cr_intercept(svm, INTERCEPT_CR3_READ);
1358 clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
1359 save->g_pat = svm->vcpu.arch.pat;
1363 svm->asid_generation = 0;
1365 svm->nested.vmcb = 0;
1366 svm->vcpu.arch.hflags = 0;
1368 if (pause_filter_count) {
1369 control->pause_filter_count = pause_filter_count;
1370 if (pause_filter_thresh)
1371 control->pause_filter_thresh = pause_filter_thresh;
1372 set_intercept(svm, INTERCEPT_PAUSE);
1374 clr_intercept(svm, INTERCEPT_PAUSE);
1377 if (kvm_vcpu_apicv_active(&svm->vcpu))
1378 avic_init_vmcb(svm);
1381 * If hardware supports Virtual VMLOAD VMSAVE then enable it
1382 * in VMCB and clear intercepts to avoid #VMEXIT.
1385 clr_intercept(svm, INTERCEPT_VMLOAD);
1386 clr_intercept(svm, INTERCEPT_VMSAVE);
1387 svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
1391 clr_intercept(svm, INTERCEPT_STGI);
1392 clr_intercept(svm, INTERCEPT_CLGI);
1393 svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
1396 if (sev_guest(svm->vcpu.kvm)) {
1397 svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
1398 clr_exception_intercept(svm, UD_VECTOR);
1401 mark_all_dirty(svm->vmcb);
1407 static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
1410 u64 *avic_physical_id_table;
1411 struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
1413 if (index >= AVIC_MAX_PHYSICAL_ID_COUNT)
1416 avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page);
1418 return &avic_physical_id_table[index];
1423 * AVIC hardware walks the nested page table to check permissions,
1424 * but does not use the SPA address specified in the leaf page
1425 * table entry since it uses the address in the AVIC_BACKING_PAGE pointer
1426 * field of the VMCB. Therefore, we set up the
1427 * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here.
1429 static int avic_update_access_page(struct kvm *kvm, bool activate)
1433 mutex_lock(&kvm->slots_lock);
1435 * During kvm_destroy_vm(), kvm_pit_set_reinject() could trigger
1436 * APICv mode change, which updates the APIC_ACCESS_PAGE_PRIVATE_MEMSLOT
1437 * memory region. So, we need to ensure that kvm->mm == current->mm.
1439 if ((kvm->arch.apic_access_page_done == activate) ||
1440 (kvm->mm != current->mm))
1443 ret = __x86_set_memory_region(kvm,
1444 APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
1445 APIC_DEFAULT_PHYS_BASE,
1446 activate ? PAGE_SIZE : 0);
1450 kvm->arch.apic_access_page_done = activate;
1452 mutex_unlock(&kvm->slots_lock);
1456 static int avic_init_backing_page(struct kvm_vcpu *vcpu)
1458 u64 *entry, new_entry;
1459 int id = vcpu->vcpu_id;
1460 struct vcpu_svm *svm = to_svm(vcpu);
1462 if (id >= AVIC_MAX_PHYSICAL_ID_COUNT)
1465 if (!svm->vcpu.arch.apic->regs)
1468 if (kvm_apicv_activated(vcpu->kvm)) {
1471 ret = avic_update_access_page(vcpu->kvm, true);
1476 svm->avic_backing_page = virt_to_page(svm->vcpu.arch.apic->regs);
1478 /* Set the AVIC backing page address in the physical APIC ID table */
1479 entry = avic_get_physical_id_entry(vcpu, id);
1483 new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
1484 AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
1485 AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
1486 WRITE_ONCE(*entry, new_entry);
1488 svm->avic_physical_id_cache = entry;
1493 static void sev_asid_free(int asid)
1495 struct svm_cpu_data *sd;
1498 mutex_lock(&sev_bitmap_lock);
1501 __set_bit(pos, sev_reclaim_asid_bitmap);
1503 for_each_possible_cpu(cpu) {
1504 sd = per_cpu(svm_data, cpu);
1505 sd->sev_vmcbs[pos] = NULL;
1508 mutex_unlock(&sev_bitmap_lock);
1511 static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
1513 struct sev_data_decommission *decommission;
1514 struct sev_data_deactivate *data;
1519 data = kzalloc(sizeof(*data), GFP_KERNEL);
1523 /* deactivate handle */
1524 data->handle = handle;
1526 /* Guard DEACTIVATE against WBINVD/DF_FLUSH used in ASID recycling */
1527 down_read(&sev_deactivate_lock);
1528 sev_guest_deactivate(data, NULL);
1529 up_read(&sev_deactivate_lock);
1533 decommission = kzalloc(sizeof(*decommission), GFP_KERNEL);
1537 /* decommission handle */
1538 decommission->handle = handle;
1539 sev_guest_decommission(decommission, NULL);
1541 kfree(decommission);
1544 static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
1545 unsigned long ulen, unsigned long *n,
1548 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
1549 unsigned long npages, npinned, size;
1550 unsigned long locked, lock_limit;
1551 struct page **pages;
1552 unsigned long first, last;
1554 if (ulen == 0 || uaddr + ulen < uaddr)
1557 /* Calculate number of pages. */
1558 first = (uaddr & PAGE_MASK) >> PAGE_SHIFT;
1559 last = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT;
1560 npages = (last - first + 1);
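	/*
	 * Worked example (illustrative): pinning ulen = 5 bytes at
	 * uaddr = 0x1ffe straddles a page boundary, giving first = 0x1,
	 * last = 0x2 and hence npages = 2.
	 */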
1562 locked = sev->pages_locked + npages;
1563 lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
1564 if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
1565 pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n", locked, lock_limit);
1569 /* Avoid using vmalloc for smaller buffers. */
1570 size = npages * sizeof(struct page *);
1571 if (size > PAGE_SIZE)
1572 pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO,
1575 pages = kmalloc(size, GFP_KERNEL_ACCOUNT);
1580 /* Pin the user virtual address. */
1581 npinned = get_user_pages_fast(uaddr, npages, FOLL_WRITE, pages);
1582 if (npinned != npages) {
1583 pr_err("SEV: Failure locking %lu pages.\n", npages);
1588 sev->pages_locked = locked;
1594 release_pages(pages, npinned);
1600 static void sev_unpin_memory(struct kvm *kvm, struct page **pages,
1601 unsigned long npages)
1603 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
1605 release_pages(pages, npages);
1607 sev->pages_locked -= npages;
1610 static void sev_clflush_pages(struct page *pages[], unsigned long npages)
1612 uint8_t *page_virtual;
1615 if (npages == 0 || pages == NULL)
1618 for (i = 0; i < npages; i++) {
1619 page_virtual = kmap_atomic(pages[i]);
1620 clflush_cache_range(page_virtual, PAGE_SIZE);
1621 kunmap_atomic(page_virtual);
1625 static void __unregister_enc_region_locked(struct kvm *kvm,
1626 struct enc_region *region)
1628 sev_unpin_memory(kvm, region->pages, region->npages);
1629 list_del(&region->list);
1633 static void sev_vm_destroy(struct kvm *kvm)
1635 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
1636 struct list_head *head = &sev->regions_list;
1637 struct list_head *pos, *q;
1639 if (!sev_guest(kvm))
1642 mutex_lock(&kvm->lock);
1645 * Ensure that all guest tagged cache entries are flushed before
1646 * releasing the pages back to the system for use. CLFLUSH will
1647 * not do this, so issue a WBINVD.
1649 wbinvd_on_all_cpus();
1652 * If userspace was terminated before unregistering the memory regions,
1653 * then let's unpin all the registered memory.
1655 if (!list_empty(head)) {
1656 list_for_each_safe(pos, q, head) {
1657 __unregister_enc_region_locked(kvm,
1658 list_entry(pos, struct enc_region, list));
1662 mutex_unlock(&kvm->lock);
1664 sev_unbind_asid(kvm, sev->handle);
1665 sev_asid_free(sev->asid);
1668 static void avic_vm_destroy(struct kvm *kvm)
1670 unsigned long flags;
1671 struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
1676 if (kvm_svm->avic_logical_id_table_page)
1677 __free_page(kvm_svm->avic_logical_id_table_page);
1678 if (kvm_svm->avic_physical_id_table_page)
1679 __free_page(kvm_svm->avic_physical_id_table_page);
1681 spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
1682 hash_del(&kvm_svm->hnode);
1683 spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
1686 static void svm_vm_destroy(struct kvm *kvm)
1688 avic_vm_destroy(kvm);
1689 sev_vm_destroy(kvm);
1692 static int avic_vm_init(struct kvm *kvm)
1694 unsigned long flags;
1696 struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
1698 struct page *p_page;
1699 struct page *l_page;
1705 /* Allocating physical APIC ID table (4KB) */
1706 p_page = alloc_page(GFP_KERNEL_ACCOUNT);
1710 kvm_svm->avic_physical_id_table_page = p_page;
1711 clear_page(page_address(p_page));
1713 /* Allocating logical APIC ID table (4KB) */
1714 l_page = alloc_page(GFP_KERNEL_ACCOUNT);
1718 kvm_svm->avic_logical_id_table_page = l_page;
1719 clear_page(page_address(l_page));
1721 spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
1723 vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
1724 if (vm_id == 0) { /* id is 1-based, zero is not okay */
1725 next_vm_id_wrapped = 1;
1728 /* Is it still in use? Only possible if wrapped at least once */
1729 if (next_vm_id_wrapped) {
1730 hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) {
1731 if (k2->avic_vm_id == vm_id)
1735 kvm_svm->avic_vm_id = vm_id;
1736 hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id);
1737 spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
1742 avic_vm_destroy(kvm);
1746 static int svm_vm_init(struct kvm *kvm)
1749 int ret = avic_vm_init(kvm);
1754 kvm_apicv_init(kvm, avic);
1759 avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
1762 unsigned long flags;
1763 struct amd_svm_iommu_ir *ir;
1764 struct vcpu_svm *svm = to_svm(vcpu);
1766 if (!kvm_arch_has_assigned_device(vcpu->kvm))
1770 * Here, we go through the per-vcpu ir_list to update all existing
1771 * interrupt remapping table entries targeting this vcpu.
1773 spin_lock_irqsave(&svm->ir_list_lock, flags);
1775 if (list_empty(&svm->ir_list))
1778 list_for_each_entry(ir, &svm->ir_list, node) {
1779 ret = amd_iommu_update_ga(cpu, r, ir->data);
1784 spin_unlock_irqrestore(&svm->ir_list_lock, flags);
1788 static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1791 /* ID = 0xff (broadcast), ID > 0xff (reserved) */
1792 int h_physical_id = kvm_cpu_get_apicid(cpu);
1793 struct vcpu_svm *svm = to_svm(vcpu);
1795 if (!kvm_vcpu_apicv_active(vcpu))
1799 * Since the host physical APIC id is 8 bits,
1800 * we can support host APIC IDs up to 255.
1802 if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
1805 entry = READ_ONCE(*(svm->avic_physical_id_cache));
1806 WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
1808 entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
1809 entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
1811 entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
1812 if (svm->avic_is_running)
1813 entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
1815 WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
1816 avic_update_iommu_vcpu_affinity(vcpu, h_physical_id,
1817 svm->avic_is_running);
1820 static void avic_vcpu_put(struct kvm_vcpu *vcpu)
1823 struct vcpu_svm *svm = to_svm(vcpu);
1825 if (!kvm_vcpu_apicv_active(vcpu))
1828 entry = READ_ONCE(*(svm->avic_physical_id_cache));
1829 if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
1830 avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
1832 entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
1833 WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
1837 * This function is called during VCPU halt/unhalt.
1839 static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
1841 struct vcpu_svm *svm = to_svm(vcpu);
1843 svm->avic_is_running = is_run;
1845 avic_vcpu_load(vcpu, vcpu->cpu);
1847 avic_vcpu_put(vcpu);
1850 static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
1852 struct vcpu_svm *svm = to_svm(vcpu);
1857 svm->virt_spec_ctrl = 0;
1860 svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
1861 MSR_IA32_APICBASE_ENABLE;
1862 if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
1863 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
1867 kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, false);
1868 kvm_rdx_write(vcpu, eax);
1870 if (kvm_vcpu_apicv_active(vcpu) && !init_event)
1871 avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
1874 static int avic_init_vcpu(struct vcpu_svm *svm)
1877 struct kvm_vcpu *vcpu = &svm->vcpu;
1879 if (!avic || !irqchip_in_kernel(vcpu->kvm))
1882 ret = avic_init_backing_page(&svm->vcpu);
1886 INIT_LIST_HEAD(&svm->ir_list);
1887 spin_lock_init(&svm->ir_list_lock);
1888 svm->dfr_reg = APIC_DFR_FLAT;
1893 static int svm_create_vcpu(struct kvm_vcpu *vcpu)
1895 struct vcpu_svm *svm;
1897 struct page *msrpm_pages;
1898 struct page *hsave_page;
1899 struct page *nested_msrpm_pages;
1902 BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
1906 page = alloc_page(GFP_KERNEL_ACCOUNT);
1910 msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
1914 nested_msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
1915 if (!nested_msrpm_pages)
1918 hsave_page = alloc_page(GFP_KERNEL_ACCOUNT);
1922 err = avic_init_vcpu(svm);
1926 /* We initialize this flag to true to make sure that the is_running
1927 * bit gets set the first time the vcpu is loaded.
1929 if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm))
1930 svm->avic_is_running = true;
1932 svm->nested.hsave = page_address(hsave_page);
1934 svm->msrpm = page_address(msrpm_pages);
1935 svm_vcpu_init_msrpm(svm->msrpm);
1937 svm->nested.msrpm = page_address(nested_msrpm_pages);
1938 svm_vcpu_init_msrpm(svm->nested.msrpm);
1940 svm->vmcb = page_address(page);
1941 clear_page(svm->vmcb);
1942 svm->vmcb_pa = __sme_set(page_to_pfn(page) << PAGE_SHIFT);
1943 svm->asid_generation = 0;
1946 svm_init_osvw(vcpu);
1947 vcpu->arch.microcode_version = 0x01000065;
1952 __free_page(hsave_page);
1954 __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
1956 __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
1963 static void svm_clear_current_vmcb(struct vmcb *vmcb)
1967 for_each_online_cpu(i)
1968 cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL);
1971 static void svm_free_vcpu(struct kvm_vcpu *vcpu)
1973 struct vcpu_svm *svm = to_svm(vcpu);
1976 * The vmcb page can be recycled, causing a false negative in
1977 * svm_vcpu_load(). So, ensure that no logical CPU has this
1978 * vmcb page recorded as its current vmcb.
1980 svm_clear_current_vmcb(svm->vmcb);
1982 __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
1983 __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
1984 __free_page(virt_to_page(svm->nested.hsave));
1985 __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
1988 static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1990 struct vcpu_svm *svm = to_svm(vcpu);
1991 struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
1994 if (unlikely(cpu != vcpu->cpu)) {
1995 svm->asid_generation = 0;
1996 mark_all_dirty(svm->vmcb);
1999 #ifdef CONFIG_X86_64
2000 rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
2002 savesegment(fs, svm->host.fs);
2003 savesegment(gs, svm->host.gs);
2004 svm->host.ldt = kvm_read_ldt();
2006 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
2007 rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
2009 if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
2010 u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio;
2011 if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) {
2012 __this_cpu_write(current_tsc_ratio, tsc_ratio);
2013 wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio);
2016 /* This assumes that the kernel never uses MSR_TSC_AUX */
2017 if (static_cpu_has(X86_FEATURE_RDTSCP))
2018 wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
2020 if (sd->current_vmcb != svm->vmcb) {
2021 sd->current_vmcb = svm->vmcb;
2022 indirect_branch_prediction_barrier();
2024 avic_vcpu_load(vcpu, cpu);
2027 static void svm_vcpu_put(struct kvm_vcpu *vcpu)
2029 struct vcpu_svm *svm = to_svm(vcpu);
2032 avic_vcpu_put(vcpu);
2034 ++vcpu->stat.host_state_reload;
2035 kvm_load_ldt(svm->host.ldt);
2036 #ifdef CONFIG_X86_64
2037 loadsegment(fs, svm->host.fs);
2038 wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gsbase);
2039 load_gs_index(svm->host.gs);
2041 #ifdef CONFIG_X86_32_LAZY_GS
2042 loadsegment(gs, svm->host.gs);
2045 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
2046 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
2049 static void svm_vcpu_blocking(struct kvm_vcpu *vcpu)
2051 avic_set_running(vcpu, false);
2054 static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu)
2056 if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu))
2057 kvm_vcpu_update_apicv(vcpu);
2058 avic_set_running(vcpu, true);
2061 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
2063 struct vcpu_svm *svm = to_svm(vcpu);
2064 unsigned long rflags = svm->vmcb->save.rflags;
2066 if (svm->nmi_singlestep) {
2067 /* Hide our flags if they were not set by the guest */
2068 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
2069 rflags &= ~X86_EFLAGS_TF;
2070 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
2071 rflags &= ~X86_EFLAGS_RF;
2076 static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
2078 if (to_svm(vcpu)->nmi_singlestep)
2079 rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
2082 * Any change of EFLAGS.VM is accompanied by a reload of SS
2083 * (caused by either a task switch or an inter-privilege IRET),
2084 * so we do not need to update the CPL here.
2086 to_svm(vcpu)->vmcb->save.rflags = rflags;
2089 static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
2092 case VCPU_EXREG_PDPTR:
2093 BUG_ON(!npt_enabled);
2094 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
2101 static inline void svm_enable_vintr(struct vcpu_svm *svm)
2103 struct vmcb_control_area *control;
2105 /* The following fields are ignored when AVIC is enabled */
2106 WARN_ON(kvm_vcpu_apicv_active(&svm->vcpu));
2109 * This is just a dummy VINTR to actually cause a vmexit to happen.
2110 * Actual injection of virtual interrupts happens through EVENTINJ.
2112 control = &svm->vmcb->control;
2113 control->int_vector = 0x0;
2114 control->int_ctl &= ~V_INTR_PRIO_MASK;
2115 control->int_ctl |= V_IRQ_MASK |
2116 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
2117 mark_dirty(svm->vmcb, VMCB_INTR);
2120 static void svm_set_vintr(struct vcpu_svm *svm)
2122 set_intercept(svm, INTERCEPT_VINTR);
2123 if (is_intercept(svm, INTERCEPT_VINTR))
2124 svm_enable_vintr(svm);
2127 static void svm_clear_vintr(struct vcpu_svm *svm)
2129 clr_intercept(svm, INTERCEPT_VINTR);
2131 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
2132 mark_dirty(svm->vmcb, VMCB_INTR);
2135 static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
2137 struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
2140 case VCPU_SREG_CS: return &save->cs;
2141 case VCPU_SREG_DS: return &save->ds;
2142 case VCPU_SREG_ES: return &save->es;
2143 case VCPU_SREG_FS: return &save->fs;
2144 case VCPU_SREG_GS: return &save->gs;
2145 case VCPU_SREG_SS: return &save->ss;
2146 case VCPU_SREG_TR: return &save->tr;
2147 case VCPU_SREG_LDTR: return &save->ldtr;
2153 static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
2155 struct vmcb_seg *s = svm_seg(vcpu, seg);
2160 static void svm_get_segment(struct kvm_vcpu *vcpu,
2161 struct kvm_segment *var, int seg)
2163 struct vmcb_seg *s = svm_seg(vcpu, seg);
2165 var->base = s->base;
2166 var->limit = s->limit;
2167 var->selector = s->selector;
2168 var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
2169 var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
2170 var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
2171 var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
2172 var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
2173 var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
2174 var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
2177 * AMD CPUs circa 2014 track the G bit for all segments except CS.
2178 * However, the SVM spec states that the G bit is not observed by the
2179 * CPU, and some VMware virtual CPUs drop the G bit for all segments.
2180 * So let's synthesize a legal G bit for all segments, this helps
2181 * running KVM nested. It also helps cross-vendor migration, because
2182 * Intel's vmentry has a check on the 'G' bit.
2184 var->g = s->limit > 0xfffff;
2187 * AMD's VMCB does not have an explicit unusable field, so emulate it
2188 * for cross vendor migration purposes by "not present"
2190 var->unusable = !var->present;
2195 * Work around a bug where the busy flag in the tr selector isn't exposed.
2205 * The accessed bit must always be set in the segment
2206 * descriptor cache, although it can be cleared in the
2207 * descriptor, the cached bit always remains at 1. Since
2208 * Intel has a check on this, set it here to support
2209 * cross-vendor migration.
2216 * On AMD CPUs sometimes the DB bit in the segment
2217 * descriptor is left as 1, although the whole segment has
2218 * been made unusable. Clear it here to pass an Intel VMX
2219 * entry check when cross vendor migrating.
2223 /* This is symmetric with svm_set_segment() */
2224 var->dpl = to_svm(vcpu)->vmcb->save.cpl;
2229 static int svm_get_cpl(struct kvm_vcpu *vcpu)
2231 struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
2236 static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
2238 struct vcpu_svm *svm = to_svm(vcpu);
2240 dt->size = svm->vmcb->save.idtr.limit;
2241 dt->address = svm->vmcb->save.idtr.base;
2244 static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
2246 struct vcpu_svm *svm = to_svm(vcpu);
2248 svm->vmcb->save.idtr.limit = dt->size;
2249 svm->vmcb->save.idtr.base = dt->address;
2250 mark_dirty(svm->vmcb, VMCB_DT);
2253 static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
2255 struct vcpu_svm *svm = to_svm(vcpu);
2257 dt->size = svm->vmcb->save.gdtr.limit;
2258 dt->address = svm->vmcb->save.gdtr.base;
2261 static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
2263 struct vcpu_svm *svm = to_svm(vcpu);
2265 svm->vmcb->save.gdtr.limit = dt->size;
2266 svm->vmcb->save.gdtr.base = dt->address;
2267 mark_dirty(svm->vmcb, VMCB_DT);
2270 static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
2274 static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
2278 static void update_cr0_intercept(struct vcpu_svm *svm)
2280 ulong gcr0 = svm->vcpu.arch.cr0;
2281 u64 *hcr0 = &svm->vmcb->save.cr0;
2283 *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
2284 | (gcr0 & SVM_CR0_SELECTIVE_MASK);
2286 mark_dirty(svm->vmcb, VMCB_CR);
2288 if (gcr0 == *hcr0) {
2289 clr_cr_intercept(svm, INTERCEPT_CR0_READ);
2290 clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
2292 set_cr_intercept(svm, INTERCEPT_CR0_READ);
2293 set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
2297 void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
2299 struct vcpu_svm *svm = to_svm(vcpu);
2301 #ifdef CONFIG_X86_64
2302 if (vcpu->arch.efer & EFER_LME) {
2303 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
2304 vcpu->arch.efer |= EFER_LMA;
2305 svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
2308 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
2309 vcpu->arch.efer &= ~EFER_LMA;
2310 svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
2314 vcpu->arch.cr0 = cr0;
2317 cr0 |= X86_CR0_PG | X86_CR0_WP;
2320 * re-enable caching here because the QEMU bios
2321 * does not do it - this results in some delay at reboot.
2324 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
2325 cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
2326 svm->vmcb->save.cr0 = cr0;
2327 mark_dirty(svm->vmcb, VMCB_CR);
2328 update_cr0_intercept(svm);
2331 int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
2333 unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
2334 unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
2336 if (cr4 & X86_CR4_VMXE)
2339 if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
2340 svm_flush_tlb(vcpu, true);
2342 vcpu->arch.cr4 = cr4;
2345 cr4 |= host_cr4_mce;
2346 to_svm(vcpu)->vmcb->save.cr4 = cr4;
2347 mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
2351 static void svm_set_segment(struct kvm_vcpu *vcpu,
2352 struct kvm_segment *var, int seg)
2354 struct vcpu_svm *svm = to_svm(vcpu);
2355 struct vmcb_seg *s = svm_seg(vcpu, seg);
2357 s->base = var->base;
2358 s->limit = var->limit;
2359 s->selector = var->selector;
2360 s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
2361 s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
2362 s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
2363 s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
2364 s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
2365 s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
2366 s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
2367 s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
2370 * This is always accurate, except if SYSRET returned to a segment
2371 * with SS.DPL != 3. Intel does not have this quirk, and always
2372 * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
2373 * would entail passing the CPL to userspace and back.
2375 if (seg == VCPU_SREG_SS)
2376 /* This is symmetric with svm_get_segment() */
2377 svm->vmcb->save.cpl = (var->dpl & 3);
2379 mark_dirty(svm->vmcb, VMCB_SEG);
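/*
 * Intercept #BP only while userspace is using software breakpoints to debug
 * the guest; otherwise the guest handles its own INT3s.
 */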
2382 static void update_bp_intercept(struct kvm_vcpu *vcpu)
2384 struct vcpu_svm *svm = to_svm(vcpu);
2386 clr_exception_intercept(svm, BP_VECTOR);
2388 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
2389 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
2390 set_exception_intercept(svm, BP_VECTOR);
2392 vcpu->guest_debug = 0;
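/*
 * Hand out the next ASID for this vCPU. When the per-CPU ASID space runs
 * out, bump the generation, wrap back to min_asid and ask the next VMRUN to
 * flush the TLB for all ASIDs.
 */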
2395 static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
2397 if (sd->next_asid > sd->max_asid) {
2398 ++sd->asid_generation;
2399 sd->next_asid = sd->min_asid;
2400 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
2403 svm->asid_generation = sd->asid_generation;
2404 svm->vmcb->control.asid = sd->next_asid++;
2406 mark_dirty(svm->vmcb, VMCB_ASID);
2409 static u64 svm_get_dr6(struct kvm_vcpu *vcpu)
2411 return to_svm(vcpu)->vmcb->save.dr6;
2414 static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value)
2416 struct vcpu_svm *svm = to_svm(vcpu);
2418 svm->vmcb->save.dr6 = value;
2419 mark_dirty(svm->vmcb, VMCB_DR);
2422 static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
2424 struct vcpu_svm *svm = to_svm(vcpu);
2426 get_debugreg(vcpu->arch.db[0], 0);
2427 get_debugreg(vcpu->arch.db[1], 1);
2428 get_debugreg(vcpu->arch.db[2], 2);
2429 get_debugreg(vcpu->arch.db[3], 3);
2430 vcpu->arch.dr6 = svm_get_dr6(vcpu);
2431 vcpu->arch.dr7 = svm->vmcb->save.dr7;
2433 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
2434 set_dr_intercepts(svm);
2437 static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
2439 struct vcpu_svm *svm = to_svm(vcpu);
2441 svm->vmcb->save.dr7 = value;
2442 mark_dirty(svm->vmcb, VMCB_DR);
2445 static int pf_interception(struct vcpu_svm *svm)
2447 u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
2448 u64 error_code = svm->vmcb->control.exit_info_1;
2450 return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address,
2451 static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
2452 svm->vmcb->control.insn_bytes : NULL,
2453 svm->vmcb->control.insn_len);
2456 static int npf_interception(struct vcpu_svm *svm)
2458 u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
2459 u64 error_code = svm->vmcb->control.exit_info_1;
2461 trace_kvm_page_fault(fault_address, error_code);
2462 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
2463 static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
2464 svm->vmcb->control.insn_bytes : NULL,
2465 svm->vmcb->control.insn_len);
2468 static int db_interception(struct vcpu_svm *svm)
2470 struct kvm_run *kvm_run = svm->vcpu.run;
2471 struct kvm_vcpu *vcpu = &svm->vcpu;
2473 if (!(svm->vcpu.guest_debug &
2474 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
2475 !svm->nmi_singlestep) {
2476 kvm_queue_exception(&svm->vcpu, DB_VECTOR);
2480 if (svm->nmi_singlestep) {
2481 disable_nmi_singlestep(svm);
2482 /* Make sure we check for pending NMIs upon entry */
2483 kvm_make_request(KVM_REQ_EVENT, vcpu);
2486 if (svm->vcpu.guest_debug &
2487 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
2488 kvm_run->exit_reason = KVM_EXIT_DEBUG;
2489 kvm_run->debug.arch.pc =
2490 svm->vmcb->save.cs.base + svm->vmcb->save.rip;
2491 kvm_run->debug.arch.exception = DB_VECTOR;
2498 static int bp_interception(struct vcpu_svm *svm)
2500 struct kvm_run *kvm_run = svm->vcpu.run;
2502 kvm_run->exit_reason = KVM_EXIT_DEBUG;
2503 kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
2504 kvm_run->debug.arch.exception = BP_VECTOR;
2508 static int ud_interception(struct vcpu_svm *svm)
2510 return handle_ud(&svm->vcpu);
2513 static int ac_interception(struct vcpu_svm *svm)
2515 kvm_queue_exception_e(&svm->vcpu, AC_VECTOR, 0);
2519 static int gp_interception(struct vcpu_svm *svm)
2521 struct kvm_vcpu *vcpu = &svm->vcpu;
2522 u32 error_code = svm->vmcb->control.exit_info_1;
2524 WARN_ON_ONCE(!enable_vmware_backdoor);
2527 * VMware backdoor emulation on #GP interception only handles IN{S},
2528 * OUT{S}, and RDPMC, none of which generate a non-zero error code.
2531 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
2534 return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
2537 static bool is_erratum_383(void)
2542 if (!erratum_383_found)
2545 value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
2549 /* Bit 62 may or may not be set for this mce */
2550 value &= ~(1ULL << 62);
2552 if (value != 0xb600000000010015ULL)
2555 /* Clear MCi_STATUS registers */
2556 for (i = 0; i < 6; ++i)
2557 native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
2559 value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
2563 value &= ~(1ULL << 2);
2564 low = lower_32_bits(value);
2565 high = upper_32_bits(value);
2567 native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
2570 /* Flush tlb to evict multi-match entries */
2576 static void svm_handle_mce(struct vcpu_svm *svm)
2578 if (is_erratum_383()) {
2580 * Erratum 383 triggered. Guest state is corrupt so kill the guest.
2583 pr_err("KVM: Guest triggered AMD Erratum 383\n");
2585 kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);
2591 * On an #MC intercept the MCE handler is not called automatically in
2592 * the host. So do it by hand here.
2596 /* not sure if we ever come back to this point */
2601 static int mc_interception(struct vcpu_svm *svm)
2606 static int shutdown_interception(struct vcpu_svm *svm)
2608 struct kvm_run *kvm_run = svm->vcpu.run;
2611 * VMCB is undefined after a SHUTDOWN intercept
2612 * so reinitialize it.
2614 clear_page(svm->vmcb);
2617 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
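/*
 * exit_info_1 packs the decoded IN/OUT: direction, string flag, operand size
 * and port number. String I/O goes through the instruction emulator; plain
 * IN/OUT takes the fast path via kvm_fast_pio().
 */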
2621 static int io_interception(struct vcpu_svm *svm)
2623 struct kvm_vcpu *vcpu = &svm->vcpu;
2624 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
2625 int size, in, string;
2628 ++svm->vcpu.stat.io_exits;
2629 string = (io_info & SVM_IOIO_STR_MASK) != 0;
2630 in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
2632 return kvm_emulate_instruction(vcpu, 0);
2634 port = io_info >> 16;
2635 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
2636 svm->next_rip = svm->vmcb->control.exit_info_2;
2638 return kvm_fast_pio(&svm->vcpu, size, port, in);
2641 static int nmi_interception(struct vcpu_svm *svm)
2646 static int intr_interception(struct vcpu_svm *svm)
2648 ++svm->vcpu.stat.irq_exits;
2652 static int nop_on_interception(struct vcpu_svm *svm)
2657 static int halt_interception(struct vcpu_svm *svm)
2659 return kvm_emulate_halt(&svm->vcpu);
2662 static int vmmcall_interception(struct vcpu_svm *svm)
2664 return kvm_emulate_hypercall(&svm->vcpu);
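/*
 * VMLOAD and VMSAVE operate on the VMCB whose physical address is in RAX:
 * map that guest page and copy the additional state to or from our VMCB.
 */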
2667 static int vmload_interception(struct vcpu_svm *svm)
2669 struct vmcb *nested_vmcb;
2670 struct kvm_host_map map;
2673 if (nested_svm_check_permissions(svm))
2676 ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
2679 kvm_inject_gp(&svm->vcpu, 0);
2683 nested_vmcb = map.hva;
2685 ret = kvm_skip_emulated_instruction(&svm->vcpu);
2687 nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
2688 kvm_vcpu_unmap(&svm->vcpu, &map, true);
2693 static int vmsave_interception(struct vcpu_svm *svm)
2695 struct vmcb *nested_vmcb;
2696 struct kvm_host_map map;
2699 if (nested_svm_check_permissions(svm))
2702 ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
2705 kvm_inject_gp(&svm->vcpu, 0);
2709 nested_vmcb = map.hva;
2711 ret = kvm_skip_emulated_instruction(&svm->vcpu);
2713 nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
2714 kvm_vcpu_unmap(&svm->vcpu, &map, true);
2719 static int vmrun_interception(struct vcpu_svm *svm)
2721 if (nested_svm_check_permissions(svm))
2724 return nested_svm_vmrun(svm);
2727 static int stgi_interception(struct vcpu_svm *svm)
2731 if (nested_svm_check_permissions(svm))
2735 * If VGIF is enabled, the STGI intercept is only added to
2736 * detect the opening of the SMI/NMI window; remove it now.
2738 if (vgif_enabled(svm))
2739 clr_intercept(svm, INTERCEPT_STGI);
2741 ret = kvm_skip_emulated_instruction(&svm->vcpu);
2742 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2749 static int clgi_interception(struct vcpu_svm *svm)
2753 if (nested_svm_check_permissions(svm))
2756 ret = kvm_skip_emulated_instruction(&svm->vcpu);
2760 /* After a CLGI no interrupts should come */
2761 if (!kvm_vcpu_apicv_active(&svm->vcpu))
2762 svm_clear_vintr(svm);
2767 static int invlpga_interception(struct vcpu_svm *svm)
2769 struct kvm_vcpu *vcpu = &svm->vcpu;
2771 trace_kvm_invlpga(svm->vmcb->save.rip, kvm_rcx_read(&svm->vcpu),
2772 kvm_rax_read(&svm->vcpu));
2774 /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
2775 kvm_mmu_invlpg(vcpu, kvm_rax_read(&svm->vcpu));
2777 return kvm_skip_emulated_instruction(&svm->vcpu);
2780 static int skinit_interception(struct vcpu_svm *svm)
2782 trace_kvm_skinit(svm->vmcb->save.rip, kvm_rax_read(&svm->vcpu));
2784 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2788 static int wbinvd_interception(struct vcpu_svm *svm)
2790 return kvm_emulate_wbinvd(&svm->vcpu);
2793 static int xsetbv_interception(struct vcpu_svm *svm)
2795 u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
2796 u32 index = kvm_rcx_read(&svm->vcpu);
2798 if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
2799 return kvm_skip_emulated_instruction(&svm->vcpu);
2805 static int rdpru_interception(struct vcpu_svm *svm)
2807 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2811 static int task_switch_interception(struct vcpu_svm *svm)
2815 int int_type = svm->vmcb->control.exit_int_info &
2816 SVM_EXITINTINFO_TYPE_MASK;
2817 int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
2819 uint32_t type = svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
2821 uint32_t idt_v = svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
2822 bool has_error_code = false;
2825 tss_selector = (u16)svm->vmcb->control.exit_info_1;
2827 if (svm->vmcb->control.exit_info_2 &
2828 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
2829 reason = TASK_SWITCH_IRET;
2830 else if (svm->vmcb->control.exit_info_2 &
2831 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
2832 reason = TASK_SWITCH_JMP;
2834 reason = TASK_SWITCH_GATE;
2836 reason = TASK_SWITCH_CALL;
2838 if (reason == TASK_SWITCH_GATE) {
2840 case SVM_EXITINTINFO_TYPE_NMI:
2841 svm->vcpu.arch.nmi_injected = false;
2843 case SVM_EXITINTINFO_TYPE_EXEPT:
2844 if (svm->vmcb->control.exit_info_2 &
2845 (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
2846 has_error_code = true;
2848 error_code = (u32)svm->vmcb->control.exit_info_2;
2850 kvm_clear_exception_queue(&svm->vcpu);
2852 case SVM_EXITINTINFO_TYPE_INTR:
2853 kvm_clear_interrupt_queue(&svm->vcpu);
2860 if (reason != TASK_SWITCH_GATE ||
2861 int_type == SVM_EXITINTINFO_TYPE_SOFT ||
2862 (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
2863 (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
2864 if (!skip_emulated_instruction(&svm->vcpu))
2868 if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
2871 return kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
2872 has_error_code, error_code);
2875 static int cpuid_interception(struct vcpu_svm *svm)
2877 return kvm_emulate_cpuid(&svm->vcpu);
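/*
 * An IRET intercept means the guest is leaving its NMI handler: drop the
 * intercept, remember RIP so svm_complete_interrupts() can tell whether the
 * IRET actually retired, and re-evaluate pending events.
 */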
2880 static int iret_interception(struct vcpu_svm *svm)
2882 ++svm->vcpu.stat.nmi_window_exits;
2883 clr_intercept(svm, INTERCEPT_IRET);
2884 svm->vcpu.arch.hflags |= HF_IRET_MASK;
2885 svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
2886 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2890 static int invlpg_interception(struct vcpu_svm *svm)
2892 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2893 return kvm_emulate_instruction(&svm->vcpu, 0);
2895 kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
2896 return kvm_skip_emulated_instruction(&svm->vcpu);
2899 static int emulate_on_interception(struct vcpu_svm *svm)
2901 return kvm_emulate_instruction(&svm->vcpu, 0);
2904 static int rsm_interception(struct vcpu_svm *svm)
2906 return kvm_emulate_instruction_from_buffer(&svm->vcpu, rsm_ins_bytes, 2);
2909 static int rdpmc_interception(struct vcpu_svm *svm)
2914 return emulate_on_interception(svm);
2916 err = kvm_rdpmc(&svm->vcpu);
2917 return kvm_complete_insn_gp(&svm->vcpu, err);
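/*
 * If L1 intercepts selective CR0 writes, convert this CR0 write into an
 * SVM_EXIT_CR0_SEL_WRITE exit and let nested_svm_exit_handled() decide
 * whether it is reflected to the L1 hypervisor.
 */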
2920 static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
2923 unsigned long cr0 = svm->vcpu.arch.cr0;
2927 intercept = svm->nested.intercept;
2929 if (!is_guest_mode(&svm->vcpu) ||
2930 (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))))
2933 cr0 &= ~SVM_CR0_SELECTIVE_MASK;
2934 val &= ~SVM_CR0_SELECTIVE_MASK;
2937 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
2938 ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
2944 #define CR_VALID (1ULL << 63)
2946 static int cr_interception(struct vcpu_svm *svm)
2952 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2953 return emulate_on_interception(svm);
2955 if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
2956 return emulate_on_interception(svm);
2958 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2959 if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
2960 cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
2962 cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
2965 if (cr >= 16) { /* mov to cr */
2967 val = kvm_register_read(&svm->vcpu, reg);
2970 if (!check_selective_cr0_intercepted(svm, val))
2971 err = kvm_set_cr0(&svm->vcpu, val);
2977 err = kvm_set_cr3(&svm->vcpu, val);
2980 err = kvm_set_cr4(&svm->vcpu, val);
2983 err = kvm_set_cr8(&svm->vcpu, val);
2986 WARN(1, "unhandled write to CR%d", cr);
2987 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2990 } else { /* mov from cr */
2993 val = kvm_read_cr0(&svm->vcpu);
2996 val = svm->vcpu.arch.cr2;
2999 val = kvm_read_cr3(&svm->vcpu);
3002 val = kvm_read_cr4(&svm->vcpu);
3005 val = kvm_get_cr8(&svm->vcpu);
3008 WARN(1, "unhandled read from CR%d", cr);
3009 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
3012 kvm_register_write(&svm->vcpu, reg, val);
3014 return kvm_complete_insn_gp(&svm->vcpu, err);
3017 static int dr_interception(struct vcpu_svm *svm)
3022 if (svm->vcpu.guest_debug == 0) {
3024 * No more DR vmexits; force a reload of the debug registers
3025 * and reenter on this instruction. The next vmexit will
3026 * retrieve the full state of the debug registers.
3028 clr_dr_intercepts(svm);
3029 svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
3033 if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
3034 return emulate_on_interception(svm);
3036 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
3037 dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
3039 if (dr >= 16) { /* mov to DRn */
3040 if (!kvm_require_dr(&svm->vcpu, dr - 16))
3042 val = kvm_register_read(&svm->vcpu, reg);
3043 kvm_set_dr(&svm->vcpu, dr - 16, val);
3045 if (!kvm_require_dr(&svm->vcpu, dr))
3047 kvm_get_dr(&svm->vcpu, dr, &val);
3048 kvm_register_write(&svm->vcpu, reg, val);
3051 return kvm_skip_emulated_instruction(&svm->vcpu);
3054 static int cr8_write_interception(struct vcpu_svm *svm)
3056 struct kvm_run *kvm_run = svm->vcpu.run;
3059 u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
3060 /* instruction emulation calls kvm_set_cr8() */
3061 r = cr_interception(svm);
3062 if (lapic_in_kernel(&svm->vcpu))
3064 if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
3066 kvm_run->exit_reason = KVM_EXIT_SET_TPR;
3070 static int svm_get_msr_feature(struct kvm_msr_entry *msr)
3074 switch (msr->index) {
3075 case MSR_F10H_DECFG:
3076 if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
3077 msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE;
3086 static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3088 struct vcpu_svm *svm = to_svm(vcpu);
3090 switch (msr_info->index) {
3092 msr_info->data = svm->vmcb->save.star;
3094 #ifdef CONFIG_X86_64
3096 msr_info->data = svm->vmcb->save.lstar;
3099 msr_info->data = svm->vmcb->save.cstar;
3101 case MSR_KERNEL_GS_BASE:
3102 msr_info->data = svm->vmcb->save.kernel_gs_base;
3104 case MSR_SYSCALL_MASK:
3105 msr_info->data = svm->vmcb->save.sfmask;
3108 case MSR_IA32_SYSENTER_CS:
3109 msr_info->data = svm->vmcb->save.sysenter_cs;
3111 case MSR_IA32_SYSENTER_EIP:
3112 msr_info->data = svm->sysenter_eip;
3114 case MSR_IA32_SYSENTER_ESP:
3115 msr_info->data = svm->sysenter_esp;
3118 if (!boot_cpu_has(X86_FEATURE_RDTSCP))
3120 msr_info->data = svm->tsc_aux;
3123 * Nobody will change the following 5 values in the VMCB so we can
3124 * safely return them on rdmsr. They will always be 0 until LBRV is implemented.
3127 case MSR_IA32_DEBUGCTLMSR:
3128 msr_info->data = svm->vmcb->save.dbgctl;
3130 case MSR_IA32_LASTBRANCHFROMIP:
3131 msr_info->data = svm->vmcb->save.br_from;
3133 case MSR_IA32_LASTBRANCHTOIP:
3134 msr_info->data = svm->vmcb->save.br_to;
3136 case MSR_IA32_LASTINTFROMIP:
3137 msr_info->data = svm->vmcb->save.last_excp_from;
3139 case MSR_IA32_LASTINTTOIP:
3140 msr_info->data = svm->vmcb->save.last_excp_to;
3142 case MSR_VM_HSAVE_PA:
3143 msr_info->data = svm->nested.hsave_msr;
3146 msr_info->data = svm->nested.vm_cr_msr;
3148 case MSR_IA32_SPEC_CTRL:
3149 if (!msr_info->host_initiated &&
3150 !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
3151 !guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) &&
3152 !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) &&
3153 !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
3156 msr_info->data = svm->spec_ctrl;
3158 case MSR_AMD64_VIRT_SPEC_CTRL:
3159 if (!msr_info->host_initiated &&
3160 !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
3163 msr_info->data = svm->virt_spec_ctrl;
3165 case MSR_F15H_IC_CFG: {
3169 family = guest_cpuid_family(vcpu);
3170 model = guest_cpuid_model(vcpu);
3172 if (family < 0 || model < 0)
3173 return kvm_get_msr_common(vcpu, msr_info);
3177 if (family == 0x15 &&
3178 (model >= 0x2 && model < 0x20))
3179 msr_info->data = 0x1E;
3182 case MSR_F10H_DECFG:
3183 msr_info->data = svm->msr_decfg;
3186 return kvm_get_msr_common(vcpu, msr_info);
3191 static int rdmsr_interception(struct vcpu_svm *svm)
3193 return kvm_emulate_rdmsr(&svm->vcpu);
3196 static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
3198 struct vcpu_svm *svm = to_svm(vcpu);
3199 int svm_dis, chg_mask;
3201 if (data & ~SVM_VM_CR_VALID_MASK)
3204 chg_mask = SVM_VM_CR_VALID_MASK;
3206 if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
3207 chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
3209 svm->nested.vm_cr_msr &= ~chg_mask;
3210 svm->nested.vm_cr_msr |= (data & chg_mask);
3212 svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
3214 /* check for svm_disable while efer.svme is set */
3215 if (svm_dis && (vcpu->arch.efer & EFER_SVME))
3221 static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
3223 struct vcpu_svm *svm = to_svm(vcpu);
3225 u32 ecx = msr->index;
3226 u64 data = msr->data;
3228 case MSR_IA32_CR_PAT:
3229 if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
3231 vcpu->arch.pat = data;
3232 svm->vmcb->save.g_pat = data;
3233 mark_dirty(svm->vmcb, VMCB_NPT);
3235 case MSR_IA32_SPEC_CTRL:
3236 if (!msr->host_initiated &&
3237 !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
3238 !guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) &&
3239 !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) &&
3240 !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
3243 if (data & ~kvm_spec_ctrl_valid_bits(vcpu))
3246 svm->spec_ctrl = data;
3252 * When it's written (to non-zero) for the first time, pass it through.
3256 * The handling of the MSR bitmap for L2 guests is done in
3257 * nested_svm_vmrun_msrpm.
3258 * We update the L1 MSR bit as well since it will end up
3259 * touching the MSR anyway now.
3261 set_msr_interception(svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
3263 case MSR_IA32_PRED_CMD:
3264 if (!msr->host_initiated &&
3265 !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB))
3268 if (data & ~PRED_CMD_IBPB)
3270 if (!boot_cpu_has(X86_FEATURE_AMD_IBPB))
3275 wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
3276 set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
3278 case MSR_AMD64_VIRT_SPEC_CTRL:
3279 if (!msr->host_initiated &&
3280 !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
3283 if (data & ~SPEC_CTRL_SSBD)
3286 svm->virt_spec_ctrl = data;
3289 svm->vmcb->save.star = data;
3291 #ifdef CONFIG_X86_64
3293 svm->vmcb->save.lstar = data;
3296 svm->vmcb->save.cstar = data;
3298 case MSR_KERNEL_GS_BASE:
3299 svm->vmcb->save.kernel_gs_base = data;
3301 case MSR_SYSCALL_MASK:
3302 svm->vmcb->save.sfmask = data;
3305 case MSR_IA32_SYSENTER_CS:
3306 svm->vmcb->save.sysenter_cs = data;
3308 case MSR_IA32_SYSENTER_EIP:
3309 svm->sysenter_eip = data;
3310 svm->vmcb->save.sysenter_eip = data;
3312 case MSR_IA32_SYSENTER_ESP:
3313 svm->sysenter_esp = data;
3314 svm->vmcb->save.sysenter_esp = data;
3317 if (!boot_cpu_has(X86_FEATURE_RDTSCP))
3321 * This is rare, so we update the MSR here instead of using
3322 * direct_access_msrs. Doing that would require a rdmsr in svm_vcpu_put.
3325 svm->tsc_aux = data;
3326 wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
3328 case MSR_IA32_DEBUGCTLMSR:
3329 if (!boot_cpu_has(X86_FEATURE_LBRV)) {
3330 vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
3334 if (data & DEBUGCTL_RESERVED_BITS)
3337 svm->vmcb->save.dbgctl = data;
3338 mark_dirty(svm->vmcb, VMCB_LBR);
3339 if (data & (1ULL<<0))
3340 svm_enable_lbrv(svm);
3342 svm_disable_lbrv(svm);
3344 case MSR_VM_HSAVE_PA:
3345 svm->nested.hsave_msr = data;
3348 return svm_set_vm_cr(vcpu, data);
3350 vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
3352 case MSR_F10H_DECFG: {
3353 struct kvm_msr_entry msr_entry;
3355 msr_entry.index = msr->index;
3356 if (svm_get_msr_feature(&msr_entry))
3359 /* Check the supported bits */
3360 if (data & ~msr_entry.data)
3363 /* Don't allow the guest to change a bit, #GP */
3364 if (!msr->host_initiated && (data ^ msr_entry.data))
3367 svm->msr_decfg = data;
3370 case MSR_IA32_APICBASE:
3371 if (kvm_vcpu_apicv_active(vcpu))
3372 avic_update_vapic_bar(to_svm(vcpu), data);
3375 return kvm_set_msr_common(vcpu, msr);
3380 static int wrmsr_interception(struct vcpu_svm *svm)
3382 return kvm_emulate_wrmsr(&svm->vcpu);
3385 static int msr_interception(struct vcpu_svm *svm)
3387 if (svm->vmcb->control.exit_info_1)
3388 return wrmsr_interception(svm);
3390 return rdmsr_interception(svm);
3393 static int interrupt_window_interception(struct vcpu_svm *svm)
3395 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3396 svm_clear_vintr(svm);
3399 * For AVIC, the only reason to end up here is ExtINTs.
3400 * In this case AVIC was temporarily disabled for
3401 * requesting the IRQ window and we have to re-enable it.
3403 svm_toggle_avic_for_irq_window(&svm->vcpu, true);
3405 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
3406 mark_dirty(svm->vmcb, VMCB_INTR);
3407 ++svm->vcpu.stat.irq_window_exits;
3411 static int pause_interception(struct vcpu_svm *svm)
3413 struct kvm_vcpu *vcpu = &svm->vcpu;
3414 bool in_kernel = (svm_get_cpl(vcpu) == 0);
3416 if (pause_filter_thresh)
3417 grow_ple_window(vcpu);
3419 kvm_vcpu_on_spin(vcpu, in_kernel);
3423 static int nop_interception(struct vcpu_svm *svm)
3425 return kvm_skip_emulated_instruction(&(svm->vcpu));
3428 static int monitor_interception(struct vcpu_svm *svm)
3430 printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
3431 return nop_interception(svm);
3434 static int mwait_interception(struct vcpu_svm *svm)
3436 printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
3437 return nop_interception(svm);
3440 enum avic_ipi_failure_cause {
3441 AVIC_IPI_FAILURE_INVALID_INT_TYPE,
3442 AVIC_IPI_FAILURE_TARGET_NOT_RUNNING,
3443 AVIC_IPI_FAILURE_INVALID_TARGET,
3444 AVIC_IPI_FAILURE_INVALID_BACKING_PAGE,
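/*
 * AVIC raises an incomplete-IPI #VMEXIT when it could not deliver an IPI on
 * its own; the ICR value and failure cause arrive in exit_info_1/2 and the
 * delivery is finished in software below.
 */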
3447 static int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
3449 u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
3450 u32 icrl = svm->vmcb->control.exit_info_1;
3451 u32 id = svm->vmcb->control.exit_info_2 >> 32;
3452 u32 index = svm->vmcb->control.exit_info_2 & 0xFF;
3453 struct kvm_lapic *apic = svm->vcpu.arch.apic;
3455 trace_kvm_avic_incomplete_ipi(svm->vcpu.vcpu_id, icrh, icrl, id, index);
3458 case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
3460 * AVIC hardware handles the generation of
3461 * IPIs when the specified Message Type is Fixed
3462 * (also known as fixed delivery mode) and
3463 * the Trigger Mode is edge-triggered. The hardware
3464 * also supports self and broadcast delivery modes
3465 * specified via the Destination Shorthand(DSH)
3466 * field of the ICRL. Logical and physical APIC ID
3467 * formats are supported. All other IPI types cause
3468 * a #VMEXIT, which needs to be emulated.
3470 kvm_lapic_reg_write(apic, APIC_ICR2, icrh);
3471 kvm_lapic_reg_write(apic, APIC_ICR, icrl);
3473 case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: {
3475 struct kvm_vcpu *vcpu;
3476 struct kvm *kvm = svm->vcpu.kvm;
3477 struct kvm_lapic *apic = svm->vcpu.arch.apic;
3480 * At this point, we expect that the AVIC HW has already
3481 * set the appropriate IRR bits on the valid target
3482 * vcpus. So, we just need to kick the appropriate vcpu.
3484 kvm_for_each_vcpu(i, vcpu, kvm) {
3485 bool m = kvm_apic_match_dest(vcpu, apic,
3486 icrl & APIC_SHORT_MASK,
3487 GET_APIC_DEST_FIELD(icrh),
3488 icrl & APIC_DEST_MASK);
3490 if (m && !avic_vcpu_is_running(vcpu))
3491 kvm_vcpu_wake_up(vcpu);
3495 case AVIC_IPI_FAILURE_INVALID_TARGET:
3496 WARN_ONCE(1, "Invalid IPI target: index=%u, vcpu=%d, icr=%#0x:%#0x\n",
3497 index, svm->vcpu.vcpu_id, icrh, icrl);
3499 case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
3500 WARN_ONCE(1, "Invalid backing page\n");
3503 pr_err("Unknown IPI interception\n");
3509 static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
3511 struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
3513 u32 *logical_apic_id_table;
3514 int dlid = GET_APIC_LOGICAL_ID(ldr);
3519 if (flat) { /* flat */
3520 index = ffs(dlid) - 1;
3523 } else { /* cluster */
3524 int cluster = (dlid & 0xf0) >> 4;
3525 int apic = ffs(dlid & 0x0f) - 1;
3527 if ((apic < 0) || (apic > 7) ||
3530 index = (cluster << 2) + apic;
3533 logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page);
3535 return &logical_apic_id_table[index];
3538 static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
3541 u32 *entry, new_entry;
3543 flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
3544 entry = avic_get_logical_id_entry(vcpu, ldr, flat);
3548 new_entry = READ_ONCE(*entry);
3549 new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
3550 new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
3551 new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
3552 WRITE_ONCE(*entry, new_entry);
3557 static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
3559 struct vcpu_svm *svm = to_svm(vcpu);
3560 bool flat = svm->dfr_reg == APIC_DFR_FLAT;
3561 u32 *entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat);
3564 clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry);
3567 static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
3570 struct vcpu_svm *svm = to_svm(vcpu);
3571 u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
3572 u32 id = kvm_xapic_id(vcpu->arch.apic);
3574 if (ldr == svm->ldr_reg)
3577 avic_invalidate_logical_id_entry(vcpu);
3580 ret = avic_ldr_write(vcpu, id, ldr);
3588 static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu)
3591 struct vcpu_svm *svm = to_svm(vcpu);
3592 u32 id = kvm_xapic_id(vcpu->arch.apic);
3594 if (vcpu->vcpu_id == id)
3597 old = avic_get_physical_id_entry(vcpu, vcpu->vcpu_id);
3598 new = avic_get_physical_id_entry(vcpu, id);
3602 /* We need to move physical_id_entry to new offset */
3605 to_svm(vcpu)->avic_physical_id_cache = new;
3608 * Also update the guest physical APIC ID in the logical
3609 * APIC ID table entry if the LDR has already been set up.
3612 avic_handle_ldr_update(vcpu);
3617 static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
3619 struct vcpu_svm *svm = to_svm(vcpu);
3620 u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);
3622 if (svm->dfr_reg == dfr)
3625 avic_invalidate_logical_id_entry(vcpu);
3629 static int avic_unaccel_trap_write(struct vcpu_svm *svm)
3631 struct kvm_lapic *apic = svm->vcpu.arch.apic;
3632 u32 offset = svm->vmcb->control.exit_info_1 &
3633 AVIC_UNACCEL_ACCESS_OFFSET_MASK;
3637 if (avic_handle_apic_id_update(&svm->vcpu))
3641 if (avic_handle_ldr_update(&svm->vcpu))
3645 avic_handle_dfr_update(&svm->vcpu);
3651 kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset));
3656 static bool is_avic_unaccelerated_access_trap(u32 offset)
3685 static int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
3688 u32 offset = svm->vmcb->control.exit_info_1 &
3689 AVIC_UNACCEL_ACCESS_OFFSET_MASK;
3690 u32 vector = svm->vmcb->control.exit_info_2 &
3691 AVIC_UNACCEL_ACCESS_VECTOR_MASK;
3692 bool write = (svm->vmcb->control.exit_info_1 >> 32) &
3693 AVIC_UNACCEL_ACCESS_WRITE_MASK;
3694 bool trap = is_avic_unaccelerated_access_trap(offset);
3696 trace_kvm_avic_unaccelerated_access(svm->vcpu.vcpu_id, offset,
3697 trap, write, vector);
3700 WARN_ONCE(!write, "svm: Handling trap read.\n");
3701 ret = avic_unaccel_trap_write(svm);
3703 /* Handling Fault */
3704 ret = kvm_emulate_instruction(&svm->vcpu, 0);
3710 static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
3711 [SVM_EXIT_READ_CR0] = cr_interception,
3712 [SVM_EXIT_READ_CR3] = cr_interception,
3713 [SVM_EXIT_READ_CR4] = cr_interception,
3714 [SVM_EXIT_READ_CR8] = cr_interception,
3715 [SVM_EXIT_CR0_SEL_WRITE] = cr_interception,
3716 [SVM_EXIT_WRITE_CR0] = cr_interception,
3717 [SVM_EXIT_WRITE_CR3] = cr_interception,
3718 [SVM_EXIT_WRITE_CR4] = cr_interception,
3719 [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
3720 [SVM_EXIT_READ_DR0] = dr_interception,
3721 [SVM_EXIT_READ_DR1] = dr_interception,
3722 [SVM_EXIT_READ_DR2] = dr_interception,
3723 [SVM_EXIT_READ_DR3] = dr_interception,
3724 [SVM_EXIT_READ_DR4] = dr_interception,
3725 [SVM_EXIT_READ_DR5] = dr_interception,
3726 [SVM_EXIT_READ_DR6] = dr_interception,
3727 [SVM_EXIT_READ_DR7] = dr_interception,
3728 [SVM_EXIT_WRITE_DR0] = dr_interception,
3729 [SVM_EXIT_WRITE_DR1] = dr_interception,
3730 [SVM_EXIT_WRITE_DR2] = dr_interception,
3731 [SVM_EXIT_WRITE_DR3] = dr_interception,
3732 [SVM_EXIT_WRITE_DR4] = dr_interception,
3733 [SVM_EXIT_WRITE_DR5] = dr_interception,
3734 [SVM_EXIT_WRITE_DR6] = dr_interception,
3735 [SVM_EXIT_WRITE_DR7] = dr_interception,
3736 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
3737 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
3738 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
3739 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
3740 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
3741 [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception,
3742 [SVM_EXIT_EXCP_BASE + GP_VECTOR] = gp_interception,
3743 [SVM_EXIT_INTR] = intr_interception,
3744 [SVM_EXIT_NMI] = nmi_interception,
3745 [SVM_EXIT_SMI] = nop_on_interception,
3746 [SVM_EXIT_INIT] = nop_on_interception,
3747 [SVM_EXIT_VINTR] = interrupt_window_interception,
3748 [SVM_EXIT_RDPMC] = rdpmc_interception,
3749 [SVM_EXIT_CPUID] = cpuid_interception,
3750 [SVM_EXIT_IRET] = iret_interception,
3751 [SVM_EXIT_INVD] = emulate_on_interception,
3752 [SVM_EXIT_PAUSE] = pause_interception,
3753 [SVM_EXIT_HLT] = halt_interception,
3754 [SVM_EXIT_INVLPG] = invlpg_interception,
3755 [SVM_EXIT_INVLPGA] = invlpga_interception,
3756 [SVM_EXIT_IOIO] = io_interception,
3757 [SVM_EXIT_MSR] = msr_interception,
3758 [SVM_EXIT_TASK_SWITCH] = task_switch_interception,
3759 [SVM_EXIT_SHUTDOWN] = shutdown_interception,
3760 [SVM_EXIT_VMRUN] = vmrun_interception,
3761 [SVM_EXIT_VMMCALL] = vmmcall_interception,
3762 [SVM_EXIT_VMLOAD] = vmload_interception,
3763 [SVM_EXIT_VMSAVE] = vmsave_interception,
3764 [SVM_EXIT_STGI] = stgi_interception,
3765 [SVM_EXIT_CLGI] = clgi_interception,
3766 [SVM_EXIT_SKINIT] = skinit_interception,
3767 [SVM_EXIT_WBINVD] = wbinvd_interception,
3768 [SVM_EXIT_MONITOR] = monitor_interception,
3769 [SVM_EXIT_MWAIT] = mwait_interception,
3770 [SVM_EXIT_XSETBV] = xsetbv_interception,
3771 [SVM_EXIT_RDPRU] = rdpru_interception,
3772 [SVM_EXIT_NPF] = npf_interception,
3773 [SVM_EXIT_RSM] = rsm_interception,
3774 [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception,
3775 [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception,
3778 static void dump_vmcb(struct kvm_vcpu *vcpu)
3780 struct vcpu_svm *svm = to_svm(vcpu);
3781 struct vmcb_control_area *control = &svm->vmcb->control;
3782 struct vmcb_save_area *save = &svm->vmcb->save;
3784 if (!dump_invalid_vmcb) {
3785 pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
3789 pr_err("VMCB Control Area:\n");
3790 pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff);
3791 pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16);
3792 pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff);
3793 pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16);
3794 pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions);
3795 pr_err("%-20s%016llx\n", "intercepts:", control->intercept);
3796 pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
3797 pr_err("%-20s%d\n", "pause filter threshold:",
3798 control->pause_filter_thresh);
3799 pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
3800 pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
3801 pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
3802 pr_err("%-20s%d\n", "asid:", control->asid);
3803 pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
3804 pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
3805 pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
3806 pr_err("%-20s%08x\n", "int_state:", control->int_state);
3807 pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
3808 pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
3809 pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
3810 pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
3811 pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
3812 pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
3813 pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
3814 pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
3815 pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
3816 pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
3817 pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext);
3818 pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
3819 pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
3820 pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
3821 pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
3822 pr_err("VMCB State Save Area:\n");
3823 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3825 save->es.selector, save->es.attrib,
3826 save->es.limit, save->es.base);
3827 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3829 save->cs.selector, save->cs.attrib,
3830 save->cs.limit, save->cs.base);
3831 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3833 save->ss.selector, save->ss.attrib,
3834 save->ss.limit, save->ss.base);
3835 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3837 save->ds.selector, save->ds.attrib,
3838 save->ds.limit, save->ds.base);
3839 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3841 save->fs.selector, save->fs.attrib,
3842 save->fs.limit, save->fs.base);
3843 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3845 save->gs.selector, save->gs.attrib,
3846 save->gs.limit, save->gs.base);
3847 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3849 save->gdtr.selector, save->gdtr.attrib,
3850 save->gdtr.limit, save->gdtr.base);
3851 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3853 save->ldtr.selector, save->ldtr.attrib,
3854 save->ldtr.limit, save->ldtr.base);
3855 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3857 save->idtr.selector, save->idtr.attrib,
3858 save->idtr.limit, save->idtr.base);
3859 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3861 save->tr.selector, save->tr.attrib,
3862 save->tr.limit, save->tr.base);
3863 pr_err("cpl: %d efer: %016llx\n",
3864 save->cpl, save->efer);
3865 pr_err("%-15s %016llx %-13s %016llx\n",
3866 "cr0:", save->cr0, "cr2:", save->cr2);
3867 pr_err("%-15s %016llx %-13s %016llx\n",
3868 "cr3:", save->cr3, "cr4:", save->cr4);
3869 pr_err("%-15s %016llx %-13s %016llx\n",
3870 "dr6:", save->dr6, "dr7:", save->dr7);
3871 pr_err("%-15s %016llx %-13s %016llx\n",
3872 "rip:", save->rip, "rflags:", save->rflags);
3873 pr_err("%-15s %016llx %-13s %016llx\n",
3874 "rsp:", save->rsp, "rax:", save->rax);
3875 pr_err("%-15s %016llx %-13s %016llx\n",
3876 "star:", save->star, "lstar:", save->lstar);
3877 pr_err("%-15s %016llx %-13s %016llx\n",
3878 "cstar:", save->cstar, "sfmask:", save->sfmask);
3879 pr_err("%-15s %016llx %-13s %016llx\n",
3880 "kernel_gs_base:", save->kernel_gs_base,
3881 "sysenter_cs:", save->sysenter_cs);
3882 pr_err("%-15s %016llx %-13s %016llx\n",
3883 "sysenter_esp:", save->sysenter_esp,
3884 "sysenter_eip:", save->sysenter_eip);
3885 pr_err("%-15s %016llx %-13s %016llx\n",
3886 "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
3887 pr_err("%-15s %016llx %-13s %016llx\n",
3888 "br_from:", save->br_from, "br_to:", save->br_to);
3889 pr_err("%-15s %016llx %-13s %016llx\n",
3890 "excp_from:", save->last_excp_from,
3891 "excp_to:", save->last_excp_to);
3894 static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
3896 struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
3898 *info1 = control->exit_info_1;
3899 *info2 = control->exit_info_2;
3902 static int handle_exit(struct kvm_vcpu *vcpu,
3903 enum exit_fastpath_completion exit_fastpath)
3905 struct vcpu_svm *svm = to_svm(vcpu);
3906 struct kvm_run *kvm_run = vcpu->run;
3907 u32 exit_code = svm->vmcb->control.exit_code;
3909 trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
3911 if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
3912 vcpu->arch.cr0 = svm->vmcb->save.cr0;
3914 vcpu->arch.cr3 = svm->vmcb->save.cr3;
3916 if (unlikely(svm->nested.exit_required)) {
3917 nested_svm_vmexit(svm);
3918 svm->nested.exit_required = false;
3923 if (is_guest_mode(vcpu)) {
3926 trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
3927 svm->vmcb->control.exit_info_1,
3928 svm->vmcb->control.exit_info_2,
3929 svm->vmcb->control.exit_int_info,
3930 svm->vmcb->control.exit_int_info_err,
3933 vmexit = nested_svm_exit_special(svm);
3935 if (vmexit == NESTED_EXIT_CONTINUE)
3936 vmexit = nested_svm_exit_handled(svm);
3938 if (vmexit == NESTED_EXIT_DONE)
3942 svm_complete_interrupts(svm);
3944 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
3945 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3946 kvm_run->fail_entry.hardware_entry_failure_reason
3947 = svm->vmcb->control.exit_code;
3952 if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
3953 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
3954 exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
3955 exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
3956 printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
3958 __func__, svm->vmcb->control.exit_int_info,
3961 if (exit_fastpath == EXIT_FASTPATH_SKIP_EMUL_INS) {
3962 kvm_skip_emulated_instruction(vcpu);
3964 } else if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
3965 || !svm_exit_handlers[exit_code]) {
3966 vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%x\n", exit_code);
3968 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3969 vcpu->run->internal.suberror =
3970 KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
3971 vcpu->run->internal.ndata = 1;
3972 vcpu->run->internal.data[0] = exit_code;
3976 #ifdef CONFIG_RETPOLINE
3977 if (exit_code == SVM_EXIT_MSR)
3978 return msr_interception(svm);
3979 else if (exit_code == SVM_EXIT_VINTR)
3980 return interrupt_window_interception(svm);
3981 else if (exit_code == SVM_EXIT_INTR)
3982 return intr_interception(svm);
3983 else if (exit_code == SVM_EXIT_HLT)
3984 return halt_interception(svm);
3985 else if (exit_code == SVM_EXIT_NPF)
3986 return npf_interception(svm);
3988 return svm_exit_handlers[exit_code](svm);
3991 static void reload_tss(struct kvm_vcpu *vcpu)
3993 int cpu = raw_smp_processor_id();
3995 struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
3996 sd->tss_desc->type = 9; /* available 32/64-bit TSS */
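/*
 * SEV guests use an ASID assigned at guest creation. Re-assert it here and
 * flush this ASID's TLB entries whenever the VMCB or the host CPU running it
 * has changed since the last VMRUN.
 */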
4000 static void pre_sev_run(struct vcpu_svm *svm, int cpu)
4002 struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
4003 int asid = sev_get_asid(svm->vcpu.kvm);
4005 /* Assign the ASID allocated for this SEV guest */
4006 svm->vmcb->control.asid = asid;
4011 * 1) when a different VMCB for the same ASID is to be run on the same host CPU, or
4012 * 2) this VMCB was executed on a different host CPU in previous VMRUNs.
4014 if (sd->sev_vmcbs[asid] == svm->vmcb &&
4015 svm->last_cpu == cpu)
4018 svm->last_cpu = cpu;
4019 sd->sev_vmcbs[asid] = svm->vmcb;
4020 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
4021 mark_dirty(svm->vmcb, VMCB_ASID);
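/*
 * Per-VMRUN housekeeping: SEV guests get their fixed ASID via pre_sev_run(),
 * everyone else gets a fresh ASID whenever the per-CPU ASID generation has
 * moved on.
 */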
4024 static void pre_svm_run(struct vcpu_svm *svm)
4026 int cpu = raw_smp_processor_id();
4028 struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
4030 if (sev_guest(svm->vcpu.kvm))
4031 return pre_sev_run(svm, cpu);
4033 /* FIXME: handle wraparound of asid_generation */
4034 if (svm->asid_generation != sd->asid_generation)
4038 static void svm_inject_nmi(struct kvm_vcpu *vcpu)
4040 struct vcpu_svm *svm = to_svm(vcpu);
4042 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
4043 vcpu->arch.hflags |= HF_NMI_MASK;
4044 set_intercept(svm, INTERCEPT_IRET);
4045 ++vcpu->stat.nmi_injections;
4048 static void svm_set_irq(struct kvm_vcpu *vcpu)
4050 struct vcpu_svm *svm = to_svm(vcpu);
4052 BUG_ON(!(gif_set(svm)));
4054 trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
4055 ++vcpu->stat.irq_injections;
4057 svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
4058 SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
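/*
 * Intercept CR8 writes only while a pending interrupt is being held back by
 * the guest's TPR, so KVM notices when the guest lowers the TPR.
 */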
4061 static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
4063 struct vcpu_svm *svm = to_svm(vcpu);
4065 if (svm_nested_virtualize_tpr(vcpu))
4068 clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
4074 set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
4077 static void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
4082 static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
4086 static void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
4090 static void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate)
4092 if (!avic || !lapic_in_kernel(vcpu))
4095 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
4096 kvm_request_apicv_update(vcpu->kvm, activate,
4097 APICV_INHIBIT_REASON_IRQWIN);
4098 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
4101 static int svm_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
4104 unsigned long flags;
4105 struct amd_svm_iommu_ir *ir;
4106 struct vcpu_svm *svm = to_svm(vcpu);
4108 if (!kvm_arch_has_assigned_device(vcpu->kvm))
4112 * Here, we go through the per-vcpu ir_list to update all existing
4113 * interrupt remapping table entries targeting this vcpu.
4115 spin_lock_irqsave(&svm->ir_list_lock, flags);
4117 if (list_empty(&svm->ir_list))
4120 list_for_each_entry(ir, &svm->ir_list, node) {
4122 ret = amd_iommu_activate_guest_mode(ir->data);
4124 ret = amd_iommu_deactivate_guest_mode(ir->data);
4129 spin_unlock_irqrestore(&svm->ir_list_lock, flags);
4133 static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
4135 struct vcpu_svm *svm = to_svm(vcpu);
4136 struct vmcb *vmcb = svm->vmcb;
4137 bool activated = kvm_vcpu_apicv_active(vcpu);
4144 * While AVIC is temporarily deactivated, the guest could update the
4145 * APIC ID, DFR and LDR registers, which would not be trapped
4146 * by avic_unaccelerated_access_interception(). In this case,
4147 * we need to check and update the AVIC logical APIC ID table
4148 * accordingly before re-activating.
4150 avic_post_state_restore(vcpu);
4151 vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
4153 vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
4155 mark_dirty(vmcb, VMCB_AVIC);
4157 svm_set_pi_irte_mode(vcpu, activated);
4160 static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
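/*
 * Deliver an interrupt through AVIC: set the vector in the vAPIC backing
 * page's IRR; if the target vCPU is running on another physical CPU, ring
 * its doorbell, and if it is not running, wake it up.
 */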
4165 static int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
4167 if (!vcpu->arch.apicv_active)
4170 kvm_lapic_set_irr(vec, vcpu->arch.apic);
4171 smp_mb__after_atomic();
4173 if (avic_vcpu_is_running(vcpu)) {
4174 int cpuid = vcpu->cpu;
4176 if (cpuid != get_cpu())
4177 wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpuid));
4180 kvm_vcpu_wake_up(vcpu);
4185 static bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
4190 static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
4192 unsigned long flags;
4193 struct amd_svm_iommu_ir *cur;
4195 spin_lock_irqsave(&svm->ir_list_lock, flags);
4196 list_for_each_entry(cur, &svm->ir_list, node) {
4197 if (cur->data != pi->ir_data)
4199 list_del(&cur->node);
4203 spin_unlock_irqrestore(&svm->ir_list_lock, flags);
4206 static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
4209 unsigned long flags;
4210 struct amd_svm_iommu_ir *ir;
4213 * In some cases, the existing irte is updated and re-set,
4214 * so we need to check here if it's already been added to the ir_list.
4217 if (pi->ir_data && (pi->prev_ga_tag != 0)) {
4218 struct kvm *kvm = svm->vcpu.kvm;
4219 u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag);
4220 struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
4221 struct vcpu_svm *prev_svm;
4228 prev_svm = to_svm(prev_vcpu);
4229 svm_ir_list_del(prev_svm, pi);
4233 * Allocate a new amd_iommu_pi_data, which will be
4234 * added to the per-vcpu ir_list.
4236 ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT);
4241 ir->data = pi->ir_data;
4243 spin_lock_irqsave(&svm->ir_list_lock, flags);
4244 list_add(&ir->node, &svm->ir_list);
4245 spin_unlock_irqrestore(&svm->ir_list_lock, flags);
4252 * The HW cannot support posting multicast/broadcast
4253 * interrupts to a vCPU. So, we still use legacy interrupt
4254 * remapping for these kinds of interrupts.
4256 * For lowest-priority interrupts, we only support
4257 * those with a single CPU as the destination, e.g. the user
4258 * configures the interrupts via /proc/irq or uses
4259 * irqbalance to make the interrupts single-CPU.
4262 get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
4263 struct vcpu_data *vcpu_info, struct vcpu_svm **svm)
4265 struct kvm_lapic_irq irq;
4266 struct kvm_vcpu *vcpu = NULL;
4268 kvm_set_msi_irq(kvm, e, &irq);
4270 if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
4271 !kvm_irq_is_postable(&irq)) {
4272 pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
4273 __func__, irq.vector);
4277 pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
4279 *svm = to_svm(vcpu);
4280 vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page));
4281 vcpu_info->vector = irq.vector;
4287 * svm_update_pi_irte - set IRTE for Posted-Interrupts
4290 * @host_irq: host irq of the interrupt
4291 * @guest_irq: gsi of the interrupt
4292 * @set: set or unset PI
4293 * returns 0 on success, < 0 on failure
4295 static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
4296 uint32_t guest_irq, bool set)
4298 struct kvm_kernel_irq_routing_entry *e;
4299 struct kvm_irq_routing_table *irq_rt;
4300 int idx, ret = -EINVAL;
4302 if (!kvm_arch_has_assigned_device(kvm) ||
4303 !irq_remapping_cap(IRQ_POSTING_CAP))
4306 pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
4307 __func__, host_irq, guest_irq, set);
4309 idx = srcu_read_lock(&kvm->irq_srcu);
4310 irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
4311 WARN_ON(guest_irq >= irq_rt->nr_rt_entries);
4313 hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
4314 struct vcpu_data vcpu_info;
4315 struct vcpu_svm *svm = NULL;
4317 if (e->type != KVM_IRQ_ROUTING_MSI)
4321 * Here, we fall back to legacy interrupt remapping in the following cases:
4322 * 1. The interrupt cannot be targeted to a specific vcpu.
4323 * 2. The posted interrupt is being unset.
4324 * 3. APIC virtualization is disabled for the vcpu.
4325 * 4. The IRQ has an incompatible delivery mode (SMI, INIT, etc.)
4327 if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
4328 kvm_vcpu_apicv_active(&svm->vcpu)) {
4329 struct amd_iommu_pi_data pi;
4331 /* Try to enable guest_mode in IRTE */
4332 pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
4334 pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
4336 pi.is_guest_mode = true;
4337 pi.vcpu_data = &vcpu_info;
4338 ret = irq_set_vcpu_affinity(host_irq, &pi);
4341 * Here, we have successfully set up vcpu affinity in
4342 * IOMMU guest mode. Now, we need to store the posted
4343 * interrupt information in a per-vcpu ir_list so that
4344 * we can reference it directly when we update vcpu
4345 * scheduling information in the IOMMU irte.
4347 if (!ret && pi.is_guest_mode)
4348 svm_ir_list_add(svm, &pi);
4350 /* Use legacy mode in IRTE */
4351 struct amd_iommu_pi_data pi;
4354 * Here, pi is used to:
4355 * - Tell IOMMU to use legacy mode for this interrupt.
4356 * - Retrieve ga_tag of prior interrupt remapping data.
4358 pi.is_guest_mode = false;
4359 ret = irq_set_vcpu_affinity(host_irq, &pi);
4362 * Check if the posted interrupt was previously
4363 * set up with guest_mode by checking if the ga_tag
4364 * was cached. If so, we need to clean up the per-vcpu ir_list.
4367 if (!ret && pi.prev_ga_tag) {
4368 int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
4369 struct kvm_vcpu *vcpu;
4371 vcpu = kvm_get_vcpu_by_id(kvm, id);
4373 svm_ir_list_del(to_svm(vcpu), &pi);
4378 trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id,
4379 e->gsi, vcpu_info.vector,
4380 vcpu_info.pi_desc_addr, set);
4384 pr_err("%s: failed to update PI IRTE\n", __func__);
4391 srcu_read_unlock(&kvm->irq_srcu, idx);
4395 static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
4397 struct vcpu_svm *svm = to_svm(vcpu);
4398 struct vmcb *vmcb = svm->vmcb;
4400 ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
4401 !(svm->vcpu.arch.hflags & HF_NMI_MASK);
4402 ret = ret && gif_set(svm) && nested_svm_nmi(svm);
4407 static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
4409 struct vcpu_svm *svm = to_svm(vcpu);
4411 return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
4414 static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
4416 struct vcpu_svm *svm = to_svm(vcpu);
4419 svm->vcpu.arch.hflags |= HF_NMI_MASK;
4420 set_intercept(svm, INTERCEPT_IRET);
4422 svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
4423 clr_intercept(svm, INTERCEPT_IRET);
4427 static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
4429 struct vcpu_svm *svm = to_svm(vcpu);
4430 struct vmcb *vmcb = svm->vmcb;
4432 if (!gif_set(svm) ||
4433 (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
4436 if (is_guest_mode(vcpu) && (svm->vcpu.arch.hflags & HF_VINTR_MASK))
4437 return !!(svm->vcpu.arch.hflags & HF_HIF_MASK);
4439 return !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF);
4442 static void enable_irq_window(struct kvm_vcpu *vcpu)
4444 struct vcpu_svm *svm = to_svm(vcpu);
4447 * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
4448 * 1, because that's a separate STGI/VMRUN intercept. The next time we
4449 * get that intercept, this function will be called again though and
4450 * we'll get the vintr intercept. However, if the vGIF feature is
4451 * enabled, the STGI interception will not occur. Enable the irq
4452 * window under the assumption that the hardware will set the GIF.
4454 if (vgif_enabled(svm) || gif_set(svm)) {
4456 * An IRQ window is not needed when AVIC is enabled,
4457 * unless we have a pending ExtINT, since it cannot be injected
4458 * via AVIC. In that case, we need to temporarily disable AVIC
4459 * and fall back to injecting the IRQ via V_IRQ.
4461 svm_toggle_avic_for_irq_window(vcpu, false);
4466 static void enable_nmi_window(struct kvm_vcpu *vcpu)
4468 struct vcpu_svm *svm = to_svm(vcpu);
4470 if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
4472 return; /* IRET will cause a vm exit */
4474 if (!gif_set(svm)) {
4475 if (vgif_enabled(svm))
4476 set_intercept(svm, INTERCEPT_STGI);
4477 return; /* STGI will cause a vm exit */
4480 if (svm->nested.exit_required)
4481 return; /* we're not going to run the guest yet */
4484 * Something prevents the NMI from being injected. Single step over the
4485 * possible problem (IRET or exception injection or interrupt shadow).
4487 svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
4488 svm->nmi_singlestep = true;
4489 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
4492 static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
4497 static int svm_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
4502 void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
4504 struct vcpu_svm *svm = to_svm(vcpu);
4506 if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
4507 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
4509 svm->asid_generation--;
4512 static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
4514 struct vcpu_svm *svm = to_svm(vcpu);
4516 invlpga(gva, svm->vmcb->control.asid);
4519 static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
4523 static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
4525 struct vcpu_svm *svm = to_svm(vcpu);
4527 if (svm_nested_virtualize_tpr(vcpu))
4530 if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
4531 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
4532 kvm_set_cr8(vcpu, cr8);
4536 static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
4538 struct vcpu_svm *svm = to_svm(vcpu);
4541 if (svm_nested_virtualize_tpr(vcpu) ||
4542 kvm_vcpu_apicv_active(vcpu))
4545 cr8 = kvm_get_cr8(vcpu);
4546 svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
4547 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
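/*
 * After a #VMEXIT, exit_int_info tells us whether an event was being
 * delivered when the exit happened; requeue it (NMI, exception or interrupt)
 * so it is reinjected on the next entry, rewinding RIP for a soft INT3 we
 * emulated ourselves.
 */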
4550 static void svm_complete_interrupts(struct vcpu_svm *svm)
4554 u32 exitintinfo = svm->vmcb->control.exit_int_info;
4555 unsigned int3_injected = svm->int3_injected;
4557 svm->int3_injected = 0;
4560 * If we've made progress since setting HF_IRET_MASK, we've
4561 * executed an IRET and can allow NMI injection.
4563 if ((svm->vcpu.arch.hflags & HF_IRET_MASK)
4564 && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) {
4565 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
4566 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
4569 svm->vcpu.arch.nmi_injected = false;
4570 kvm_clear_exception_queue(&svm->vcpu);
4571 kvm_clear_interrupt_queue(&svm->vcpu);
4573 if (!(exitintinfo & SVM_EXITINTINFO_VALID))
4576 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
4578 vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
4579 type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
4582 case SVM_EXITINTINFO_TYPE_NMI:
4583 svm->vcpu.arch.nmi_injected = true;
4585 case SVM_EXITINTINFO_TYPE_EXEPT:
4587 * In case of software exceptions, do not reinject the vector,
4588 * but re-execute the instruction instead. Rewind RIP first
4589 * if we emulated INT3 before.
4591 if (kvm_exception_is_soft(vector)) {
4592 if (vector == BP_VECTOR && int3_injected &&
4593 kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
4594 kvm_rip_write(&svm->vcpu,
4595 kvm_rip_read(&svm->vcpu) -
4599 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
4600 u32 err = svm->vmcb->control.exit_int_info_err;
4601 kvm_requeue_exception_e(&svm->vcpu, vector, err);
4604 kvm_requeue_exception(&svm->vcpu, vector);
4606 case SVM_EXITINTINFO_TYPE_INTR:
4607 kvm_queue_interrupt(&svm->vcpu, vector, false);
4614 static void svm_cancel_injection(struct kvm_vcpu *vcpu)
4616 struct vcpu_svm *svm = to_svm(vcpu);
4617 struct vmcb_control_area *control = &svm->vmcb->control;
4619 control->exit_int_info = control->event_inj;
4620 control->exit_int_info_err = control->event_inj_err;
4621 control->event_inj = 0;
4622 svm_complete_interrupts(svm);
4625 static void svm_vcpu_run(struct kvm_vcpu *vcpu)
4627 struct vcpu_svm *svm = to_svm(vcpu);
4629 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
4630 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
4631 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
4634 * A vmexit emulation is required before the vcpu can be executed
4635 * again.
4637 if (unlikely(svm->nested.exit_required))
4641 * Disable singlestep if we're injecting an interrupt/exception.
4642 * We don't want our modified rflags to be pushed on the stack where
4643 * we might not be able to easily reset them if we disable NMI
4644 * singlestep later.
4646 if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
4648 * Event injection happens before external interrupts cause a
4649 * vmexit and interrupts are disabled here, so smp_send_reschedule
4650 * is enough to force an immediate vmexit.
4652 disable_nmi_singlestep(svm);
4653 smp_send_reschedule(vcpu->cpu);
4658 sync_lapic_to_cr8(vcpu);
4660 svm->vmcb->save.cr2 = vcpu->arch.cr2;
4663 kvm_load_guest_xsave_state(vcpu);
4665 if (lapic_in_kernel(vcpu) &&
4666 vcpu->arch.apic->lapic_timer.timer_advance_ns)
4667 kvm_wait_lapic_expire(vcpu);
4670 * If this vCPU has touched SPEC_CTRL, restore the guest's value if
4671 * it's non-zero. Since vmentry is serialising on affected CPUs, there
4672 * is no need to worry about the conditional branch over the wrmsr
4673 * being speculatively taken.
4675 x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
4680 "push %%" _ASM_BP "; \n\t"
4681 "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t"
4682 "mov %c[rcx](%[svm]), %%" _ASM_CX " \n\t"
4683 "mov %c[rdx](%[svm]), %%" _ASM_DX " \n\t"
4684 "mov %c[rsi](%[svm]), %%" _ASM_SI " \n\t"
4685 "mov %c[rdi](%[svm]), %%" _ASM_DI " \n\t"
4686 "mov %c[rbp](%[svm]), %%" _ASM_BP " \n\t"
4687 #ifdef CONFIG_X86_64
4688 "mov %c[r8](%[svm]), %%r8 \n\t"
4689 "mov %c[r9](%[svm]), %%r9 \n\t"
4690 "mov %c[r10](%[svm]), %%r10 \n\t"
4691 "mov %c[r11](%[svm]), %%r11 \n\t"
4692 "mov %c[r12](%[svm]), %%r12 \n\t"
4693 "mov %c[r13](%[svm]), %%r13 \n\t"
4694 "mov %c[r14](%[svm]), %%r14 \n\t"
4695 "mov %c[r15](%[svm]), %%r15 \n\t"
4698 /* Enter guest mode */
4699 "push %%" _ASM_AX " \n\t"
4700 "mov %c[vmcb](%[svm]), %%" _ASM_AX " \n\t"
4701 __ex("vmload %%" _ASM_AX) "\n\t"
4702 __ex("vmrun %%" _ASM_AX) "\n\t"
4703 __ex("vmsave %%" _ASM_AX) "\n\t"
4704 "pop %%" _ASM_AX " \n\t"
4706 /* Save guest registers, load host registers */
4707 "mov %%" _ASM_BX ", %c[rbx](%[svm]) \n\t"
4708 "mov %%" _ASM_CX ", %c[rcx](%[svm]) \n\t"
4709 "mov %%" _ASM_DX ", %c[rdx](%[svm]) \n\t"
4710 "mov %%" _ASM_SI ", %c[rsi](%[svm]) \n\t"
4711 "mov %%" _ASM_DI ", %c[rdi](%[svm]) \n\t"
4712 "mov %%" _ASM_BP ", %c[rbp](%[svm]) \n\t"
4713 #ifdef CONFIG_X86_64
4714 "mov %%r8, %c[r8](%[svm]) \n\t"
4715 "mov %%r9, %c[r9](%[svm]) \n\t"
4716 "mov %%r10, %c[r10](%[svm]) \n\t"
4717 "mov %%r11, %c[r11](%[svm]) \n\t"
4718 "mov %%r12, %c[r12](%[svm]) \n\t"
4719 "mov %%r13, %c[r13](%[svm]) \n\t"
4720 "mov %%r14, %c[r14](%[svm]) \n\t"
4721 "mov %%r15, %c[r15](%[svm]) \n\t"
4723 * Clear host registers marked as clobbered to prevent
4724 * speculative use.
4726 "xor %%r8d, %%r8d \n\t"
4727 "xor %%r9d, %%r9d \n\t"
4728 "xor %%r10d, %%r10d \n\t"
4729 "xor %%r11d, %%r11d \n\t"
4730 "xor %%r12d, %%r12d \n\t"
4731 "xor %%r13d, %%r13d \n\t"
4732 "xor %%r14d, %%r14d \n\t"
4733 "xor %%r15d, %%r15d \n\t"
4735 "xor %%ebx, %%ebx \n\t"
4736 "xor %%ecx, %%ecx \n\t"
4737 "xor %%edx, %%edx \n\t"
4738 "xor %%esi, %%esi \n\t"
4739 "xor %%edi, %%edi \n\t"
4743 [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
4744 [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])),
4745 [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])),
4746 [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])),
4747 [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])),
4748 [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])),
4749 [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP]))
4750 #ifdef CONFIG_X86_64
4751 , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])),
4752 [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])),
4753 [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])),
4754 [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])),
4755 [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])),
4756 [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])),
4757 [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])),
4758 [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
4761 #ifdef CONFIG_X86_64
4762 , "rbx", "rcx", "rdx", "rsi", "rdi"
4763 , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
4765 , "ebx", "ecx", "edx", "esi", "edi"
4769 /* Eliminate branch target predictions from guest mode */
4772 #ifdef CONFIG_X86_64
4773 wrmsrl(MSR_GS_BASE, svm->host.gs_base);
4775 loadsegment(fs, svm->host.fs);
4776 #ifndef CONFIG_X86_32_LAZY_GS
4777 loadsegment(gs, svm->host.gs);
4782 * We do not use IBRS in the kernel. If this vCPU has used the
4783 * SPEC_CTRL MSR it may have left it on; save the value and
4784 * turn it off. This is much more efficient than blindly adding
4785 * it to the atomic save/restore list. Especially as the former
4786 * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
4788 * For non-nested case:
4789 * If the L01 MSR bitmap does not intercept the MSR, then we need to
4790 * save it.
4792 * For nested case:
4793 * If the L02 MSR bitmap does not intercept the MSR, then we need to
4794 * save it.
4796 if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
4797 svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
4801 local_irq_disable();
4803 x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);
4805 vcpu->arch.cr2 = svm->vmcb->save.cr2;
4806 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
4807 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
4808 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
4810 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
4811 kvm_before_interrupt(&svm->vcpu);
4813 kvm_load_host_xsave_state(vcpu);
4816 /* Any pending NMI will happen here */
4818 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
4819 kvm_after_interrupt(&svm->vcpu);
4821 sync_cr8_to_lapic(vcpu);
4825 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
4827 /* If the exit was due to a #PF, check for async PF */
4828 if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
4829 svm->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
4832 vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
4833 vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
4837 * We need to handle MC intercepts here before the vcpu has a chance to
4838 * change the physical cpu
4840 if (unlikely(svm->vmcb->control.exit_code ==
4841 SVM_EXIT_EXCP_BASE + MC_VECTOR))
4842 svm_handle_mce(svm);
4844 mark_all_clean(svm->vmcb);
4846 STACK_FRAME_NON_STANDARD(svm_vcpu_run);
4848 static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root)
4850 struct vcpu_svm *svm = to_svm(vcpu);
4851 bool update_guest_cr3 = true;
4854 cr3 = __sme_set(root);
4856 svm->vmcb->control.nested_cr3 = cr3;
4857 mark_dirty(svm->vmcb, VMCB_NPT);
4859 /* Loading L2's CR3 is handled by enter_svm_guest_mode. */
4860 if (is_guest_mode(vcpu))
4861 update_guest_cr3 = false;
4862 else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
4863 cr3 = vcpu->arch.cr3;
4864 else /* CR3 is already up-to-date. */
4865 update_guest_cr3 = false;
4868 if (update_guest_cr3) {
4869 svm->vmcb->save.cr3 = cr3;
4870 mark_dirty(svm->vmcb, VMCB_CR);
4874 static int is_disabled(void)
4878 rdmsrl(MSR_VM_CR, vm_cr);
4879 if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
4886 svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
4889 * Patch in the VMMCALL instruction:
4891 hypercall[0] = 0x0f;
4892 hypercall[1] = 0x01;
4893 hypercall[2] = 0xd9;
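/* 0F 01 D9 is the encoding of VMMCALL. */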
4896 static int __init svm_check_processor_compat(void)
4901 static bool svm_cpu_has_accelerated_tpr(void)
4906 static bool svm_has_emulated_msr(int index)
4909 case MSR_IA32_MCG_EXT_CTL:
4910 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
4919 static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
4924 static void svm_cpuid_update(struct kvm_vcpu *vcpu)
4926 struct vcpu_svm *svm = to_svm(vcpu);
4928 vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
4929 boot_cpu_has(X86_FEATURE_XSAVE) &&
4930 boot_cpu_has(X86_FEATURE_XSAVES);
4932 /* Update nrips enabled cache */
4933 svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
4934 guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS);
4936 if (!kvm_vcpu_apicv_active(vcpu))
4940 * AVIC does not work with an x2APIC mode guest. If the X2APIC feature
4941 * is exposed to the guest, disable AVIC.
4943 if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
4944 kvm_request_apicv_update(vcpu->kvm, false,
4945 APICV_INHIBIT_REASON_X2APIC);
4948 * Currently, AVIC does not work with nested virtualization.
4949 * So, we disable AVIC when cpuid for SVM is set in the L1 guest.
4951 if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM))
4952 kvm_request_apicv_update(vcpu->kvm, false,
4953 APICV_INHIBIT_REASON_NESTED);
4956 static bool svm_has_wbinvd_exit(void)
4961 #define PRE_EX(exit) { .exit_code = (exit), \
4962 .stage = X86_ICPT_PRE_EXCEPT, }
4963 #define POST_EX(exit) { .exit_code = (exit), \
4964 .stage = X86_ICPT_POST_EXCEPT, }
4965 #define POST_MEM(exit) { .exit_code = (exit), \
4966 .stage = X86_ICPT_POST_MEMACCESS, }
4968 static const struct __x86_intercept {
4970 enum x86_intercept_stage stage;
4971 } x86_intercept_map[] = {
4972 [x86_intercept_cr_read] = POST_EX(SVM_EXIT_READ_CR0),
4973 [x86_intercept_cr_write] = POST_EX(SVM_EXIT_WRITE_CR0),
4974 [x86_intercept_clts] = POST_EX(SVM_EXIT_WRITE_CR0),
4975 [x86_intercept_lmsw] = POST_EX(SVM_EXIT_WRITE_CR0),
4976 [x86_intercept_smsw] = POST_EX(SVM_EXIT_READ_CR0),
4977 [x86_intercept_dr_read] = POST_EX(SVM_EXIT_READ_DR0),
4978 [x86_intercept_dr_write] = POST_EX(SVM_EXIT_WRITE_DR0),
4979 [x86_intercept_sldt] = POST_EX(SVM_EXIT_LDTR_READ),
4980 [x86_intercept_str] = POST_EX(SVM_EXIT_TR_READ),
4981 [x86_intercept_lldt] = POST_EX(SVM_EXIT_LDTR_WRITE),
4982 [x86_intercept_ltr] = POST_EX(SVM_EXIT_TR_WRITE),
4983 [x86_intercept_sgdt] = POST_EX(SVM_EXIT_GDTR_READ),
4984 [x86_intercept_sidt] = POST_EX(SVM_EXIT_IDTR_READ),
4985 [x86_intercept_lgdt] = POST_EX(SVM_EXIT_GDTR_WRITE),
4986 [x86_intercept_lidt] = POST_EX(SVM_EXIT_IDTR_WRITE),
4987 [x86_intercept_vmrun] = POST_EX(SVM_EXIT_VMRUN),
4988 [x86_intercept_vmmcall] = POST_EX(SVM_EXIT_VMMCALL),
4989 [x86_intercept_vmload] = POST_EX(SVM_EXIT_VMLOAD),
4990 [x86_intercept_vmsave] = POST_EX(SVM_EXIT_VMSAVE),
4991 [x86_intercept_stgi] = POST_EX(SVM_EXIT_STGI),
4992 [x86_intercept_clgi] = POST_EX(SVM_EXIT_CLGI),
4993 [x86_intercept_skinit] = POST_EX(SVM_EXIT_SKINIT),
4994 [x86_intercept_invlpga] = POST_EX(SVM_EXIT_INVLPGA),
4995 [x86_intercept_rdtscp] = POST_EX(SVM_EXIT_RDTSCP),
4996 [x86_intercept_monitor] = POST_MEM(SVM_EXIT_MONITOR),
4997 [x86_intercept_mwait] = POST_EX(SVM_EXIT_MWAIT),
4998 [x86_intercept_invlpg] = POST_EX(SVM_EXIT_INVLPG),
4999 [x86_intercept_invd] = POST_EX(SVM_EXIT_INVD),
5000 [x86_intercept_wbinvd] = POST_EX(SVM_EXIT_WBINVD),
5001 [x86_intercept_wrmsr] = POST_EX(SVM_EXIT_MSR),
5002 [x86_intercept_rdtsc] = POST_EX(SVM_EXIT_RDTSC),
5003 [x86_intercept_rdmsr] = POST_EX(SVM_EXIT_MSR),
5004 [x86_intercept_rdpmc] = POST_EX(SVM_EXIT_RDPMC),
5005 [x86_intercept_cpuid] = PRE_EX(SVM_EXIT_CPUID),
5006 [x86_intercept_rsm] = PRE_EX(SVM_EXIT_RSM),
5007 [x86_intercept_pause] = PRE_EX(SVM_EXIT_PAUSE),
5008 [x86_intercept_pushf] = PRE_EX(SVM_EXIT_PUSHF),
5009 [x86_intercept_popf] = PRE_EX(SVM_EXIT_POPF),
5010 [x86_intercept_intn] = PRE_EX(SVM_EXIT_SWINT),
5011 [x86_intercept_iret] = PRE_EX(SVM_EXIT_IRET),
5012 [x86_intercept_icebp] = PRE_EX(SVM_EXIT_ICEBP),
5013 [x86_intercept_hlt] = POST_EX(SVM_EXIT_HLT),
5014 [x86_intercept_in] = POST_EX(SVM_EXIT_IOIO),
5015 [x86_intercept_ins] = POST_EX(SVM_EXIT_IOIO),
5016 [x86_intercept_out] = POST_EX(SVM_EXIT_IOIO),
5017 [x86_intercept_outs] = POST_EX(SVM_EXIT_IOIO),
5018 [x86_intercept_xsetbv] = PRE_EX(SVM_EXIT_XSETBV),
5025 static int svm_check_intercept(struct kvm_vcpu *vcpu,
5026 struct x86_instruction_info *info,
5027 enum x86_intercept_stage stage,
5028 struct x86_exception *exception)
5030 struct vcpu_svm *svm = to_svm(vcpu);
5031 int vmexit, ret = X86EMUL_CONTINUE;
5032 struct __x86_intercept icpt_info;
5033 struct vmcb *vmcb = svm->vmcb;
5035 if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
5038 icpt_info = x86_intercept_map[info->intercept];
5040 if (stage != icpt_info.stage)
5043 switch (icpt_info.exit_code) {
5044 case SVM_EXIT_READ_CR0:
5045 if (info->intercept == x86_intercept_cr_read)
5046 icpt_info.exit_code += info->modrm_reg;
5048 case SVM_EXIT_WRITE_CR0: {
5049 unsigned long cr0, val;
5052 if (info->intercept == x86_intercept_cr_write)
5053 icpt_info.exit_code += info->modrm_reg;
5055 if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
5056 info->intercept == x86_intercept_clts)
5059 intercept = svm->nested.intercept;
5061 if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))
5064 cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
5065 val = info->src_val & ~SVM_CR0_SELECTIVE_MASK;
5067 if (info->intercept == x86_intercept_lmsw) {
5070 /* lmsw can't clear PE - catch this here */
5071 if (cr0 & X86_CR0_PE)
5076 icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
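/*
 * Only a change to CR0 bits outside TS/MP (the selective mask) is reported
 * to the nested hypervisor as SVM_EXIT_CR0_SEL_WRITE.
 */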
5080 case SVM_EXIT_READ_DR0:
5081 case SVM_EXIT_WRITE_DR0:
5082 icpt_info.exit_code += info->modrm_reg;
5085 if (info->intercept == x86_intercept_wrmsr)
5086 vmcb->control.exit_info_1 = 1;
5088 vmcb->control.exit_info_1 = 0;
5090 case SVM_EXIT_PAUSE:
5092 * We only get this intercept for NOP, but PAUSE is
5093 * REP NOP; check for the REP prefix here.
5095 if (info->rep_prefix != REPE_PREFIX)
5098 case SVM_EXIT_IOIO: {
5102 if (info->intercept == x86_intercept_in ||
5103 info->intercept == x86_intercept_ins) {
5104 exit_info = ((info->src_val & 0xffff) << 16) |
5106 bytes = info->dst_bytes;
5108 exit_info = (info->dst_val & 0xffff) << 16;
5109 bytes = info->src_bytes;
5112 if (info->intercept == x86_intercept_outs ||
5113 info->intercept == x86_intercept_ins)
5114 exit_info |= SVM_IOIO_STR_MASK;
5116 if (info->rep_prefix)
5117 exit_info |= SVM_IOIO_REP_MASK;
5119 bytes = min(bytes, 4u);
5121 exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
5123 exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
5125 vmcb->control.exit_info_1 = exit_info;
5126 vmcb->control.exit_info_2 = info->next_rip;
5134 /* TODO: Advertise NRIPS to guest hypervisor unconditionally */
5135 if (static_cpu_has(X86_FEATURE_NRIPS))
5136 vmcb->control.next_rip = info->next_rip;
5137 vmcb->control.exit_code = icpt_info.exit_code;
5138 vmexit = nested_svm_exit_handled(svm);
5140 ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
5147 static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu,
5148 enum exit_fastpath_completion *exit_fastpath)
5150 if (!is_guest_mode(vcpu) &&
5151 to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
5152 to_svm(vcpu)->vmcb->control.exit_info_1)
5153 *exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu);
5156 static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
5158 if (pause_filter_thresh)
5159 shrink_ple_window(vcpu);
5162 static inline void avic_post_state_restore(struct kvm_vcpu *vcpu)
5164 if (avic_handle_apic_id_update(vcpu) != 0)
5166 avic_handle_dfr_update(vcpu);
5167 avic_handle_ldr_update(vcpu);
5170 static void svm_setup_mce(struct kvm_vcpu *vcpu)
5172 /* [63:9] are reserved. */
5173 vcpu->arch.mcg_cap &= 0x1ff;
5176 static int svm_smi_allowed(struct kvm_vcpu *vcpu)
5178 struct vcpu_svm *svm = to_svm(vcpu);
5180 /* Per APM Vol.2 15.22.2 "Response to SMI" */
5184 if (is_guest_mode(&svm->vcpu) &&
5185 svm->nested.intercept & (1ULL << INTERCEPT_SMI)) {
5186 /* TODO: Might need to set exit_info_1 and exit_info_2 here */
5187 svm->vmcb->control.exit_code = SVM_EXIT_SMI;
5188 svm->nested.exit_required = true;
5195 static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
5197 struct vcpu_svm *svm = to_svm(vcpu);
5200 if (is_guest_mode(vcpu)) {
5201 /* FED8h - SVM Guest */
5202 put_smstate(u64, smstate, 0x7ed8, 1);
5203 /* FEE0h - SVM Guest VMCB Physical Address */
5204 put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb);
5206 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
5207 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
5208 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
5210 ret = nested_svm_vmexit(svm);
5217 static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
5219 struct vcpu_svm *svm = to_svm(vcpu);
5220 struct vmcb *nested_vmcb;
5221 struct kvm_host_map map;
5225 guest = GET_SMSTATE(u64, smstate, 0x7ed8);
5226 vmcb = GET_SMSTATE(u64, smstate, 0x7ee0);
5229 if (kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb), &map) == -EINVAL)
5231 nested_vmcb = map.hva;
5232 enter_svm_guest_mode(svm, vmcb, nested_vmcb, &map);
5237 static int enable_smi_window(struct kvm_vcpu *vcpu)
5239 struct vcpu_svm *svm = to_svm(vcpu);
5241 if (!gif_set(svm)) {
5242 if (vgif_enabled(svm))
5243 set_intercept(svm, INTERCEPT_STGI);
5244 /* STGI will cause a vm exit */
5250 static int sev_flush_asids(void)
5255 * DEACTIVATE will clear the WBINVD indicator causing DF_FLUSH to fail,
5256 * so it must be guarded.
5258 down_write(&sev_deactivate_lock);
5260 wbinvd_on_all_cpus();
5261 ret = sev_guest_df_flush(&error);
5263 up_write(&sev_deactivate_lock);
5266 pr_err("SEV: DF_FLUSH failed, ret=%d, error=%#x\n", ret, error);
5271 /* Must be called with the sev_bitmap_lock held */
5272 static bool __sev_recycle_asids(void)
5276 /* Check if there are any ASIDs to reclaim before performing a flush */
5277 pos = find_next_bit(sev_reclaim_asid_bitmap,
5278 max_sev_asid, min_sev_asid - 1);
5279 if (pos >= max_sev_asid)
5282 if (sev_flush_asids())
5285 bitmap_xor(sev_asid_bitmap, sev_asid_bitmap, sev_reclaim_asid_bitmap,
5287 bitmap_zero(sev_reclaim_asid_bitmap, max_sev_asid);
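/*
 * Reclaimed ASIDs are still marked in-use in sev_asid_bitmap, so the XOR
 * above clears exactly those bits, returning them to the free pool before
 * the reclaim bitmap is reset.
 */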
5292 static int sev_asid_new(void)
5297 mutex_lock(&sev_bitmap_lock);
5300 * An SEV-enabled guest must use an ASID from min_sev_asid to max_sev_asid.
5303 pos = find_next_zero_bit(sev_asid_bitmap, max_sev_asid, min_sev_asid - 1);
5304 if (pos >= max_sev_asid) {
5305 if (retry && __sev_recycle_asids()) {
5309 mutex_unlock(&sev_bitmap_lock);
5313 __set_bit(pos, sev_asid_bitmap);
5315 mutex_unlock(&sev_bitmap_lock);
5320 static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
5322 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
5326 if (unlikely(sev->active))
5329 asid = sev_asid_new();
5333 ret = sev_platform_init(&argp->error);
5339 INIT_LIST_HEAD(&sev->regions_list);
5344 sev_asid_free(asid);
5348 static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
5350 struct sev_data_activate *data;
5351 int asid = sev_get_asid(kvm);
5354 data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
5358 /* activate ASID on the given handle */
5359 data->handle = handle;
5361 ret = sev_guest_activate(data, error);
5367 static int __sev_issue_cmd(int fd, int id, void *data, int *error)
5376 ret = sev_issue_cmd_external_user(f.file, id, data, error);
5382 static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error)
5384 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
5386 return __sev_issue_cmd(sev->fd, id, data, error);
5389 static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
5391 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
5392 struct sev_data_launch_start *start;
5393 struct kvm_sev_launch_start params;
5394 void *dh_blob, *session_blob;
5395 int *error = &argp->error;
5398 if (!sev_guest(kvm))
5401 if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params)))
5404 start = kzalloc(sizeof(*start), GFP_KERNEL_ACCOUNT);
5409 if (params.dh_uaddr) {
5410 dh_blob = psp_copy_user_blob(params.dh_uaddr, params.dh_len);
5411 if (IS_ERR(dh_blob)) {
5412 ret = PTR_ERR(dh_blob);
5416 start->dh_cert_address = __sme_set(__pa(dh_blob));
5417 start->dh_cert_len = params.dh_len;
5420 session_blob = NULL;
5421 if (params.session_uaddr) {
5422 session_blob = psp_copy_user_blob(params.session_uaddr, params.session_len);
5423 if (IS_ERR(session_blob)) {
5424 ret = PTR_ERR(session_blob);
5428 start->session_address = __sme_set(__pa(session_blob));
5429 start->session_len = params.session_len;
5432 start->handle = params.handle;
5433 start->policy = params.policy;
5435 /* create memory encryption context */
5436 ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, start, error);
5438 goto e_free_session;
5440 /* Bind ASID to this guest */
5441 ret = sev_bind_asid(kvm, start->handle, error);
5443 goto e_free_session;
5445 /* return handle to userspace */
5446 params.handle = start->handle;
5447 if (copy_to_user((void __user *)(uintptr_t)argp->data, ¶ms, sizeof(params))) {
5448 sev_unbind_asid(kvm, start->handle);
5450 goto e_free_session;
5453 sev->handle = start->handle;
5454 sev->fd = argp->sev_fd;
5457 kfree(session_blob);
5465 static unsigned long get_num_contig_pages(unsigned long idx,
5466 struct page **inpages, unsigned long npages)
5468 unsigned long paddr, next_paddr;
5469 unsigned long i = idx + 1, pages = 1;
5471 /* find the number of contiguous pages starting from idx */
5472 paddr = __sme_page_pa(inpages[idx]);
5473 while (i < npages) {
5474 next_paddr = __sme_page_pa(inpages[i++]);
5475 if ((paddr + PAGE_SIZE) == next_paddr) {
5486 static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
5488 unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i;
5489 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
5490 struct kvm_sev_launch_update_data params;
5491 struct sev_data_launch_update_data *data;
5492 struct page **inpages;
5495 if (!sev_guest(kvm))
5498 if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params)))
5501 data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
5505 vaddr = params.uaddr;
5507 vaddr_end = vaddr + size;
5509 /* Lock the user memory. */
5510 inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1);
5517 * The LAUNCH_UPDATE command will perform in-place encryption of the
5518 * memory content (i.e. it will write the same memory region with C=1).
5519 * It's possible that the cache may contain the data with C=0, i.e.,
5520 * unencrypted, so invalidate it first.
5522 sev_clflush_pages(inpages, npages);
5524 for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) {
5528 * If the user buffer is not page-aligned, calculate the offset
5529 * within the page.
5531 offset = vaddr & (PAGE_SIZE - 1);
5533 /* Calculate the number of pages that can be encrypted in one go. */
5534 pages = get_num_contig_pages(i, inpages, npages);
5536 len = min_t(size_t, ((pages * PAGE_SIZE) - offset), size);
5538 data->handle = sev->handle;
5540 data->address = __sme_page_pa(inpages[i]) + offset;
5541 ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, data, &argp->error);
5546 next_vaddr = vaddr + len;
5550 /* content of memory is updated, mark pages dirty */
5551 for (i = 0; i < npages; i++) {
5552 set_page_dirty_lock(inpages[i]);
5553 mark_page_accessed(inpages[i]);
5555 /* unlock the user pages */
5556 sev_unpin_memory(kvm, inpages, npages);
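/*
 * Illustrative sketch (hypothetical helper, not part of this file): the
 * chunking arithmetic used by the encryption loop above. Given a run of
 * physically contiguous pages and the offset of the user buffer into the
 * first page, the amount that can be encrypted by one LAUNCH_UPDATE_DATA
 * command is bounded by both the run length and the bytes remaining.
 */
static inline unsigned long sev_update_chunk_len(unsigned long contig_pages,
						 unsigned long offset,
						 unsigned long remaining)
{
	unsigned long run_bytes = contig_pages * PAGE_SIZE - offset;

	/* Equivalent to the min_t(size_t, ...) in the loop above. */
	return run_bytes < remaining ? run_bytes : remaining;
}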
5562 static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
5564 void __user *measure = (void __user *)(uintptr_t)argp->data;
5565 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
5566 struct sev_data_launch_measure *data;
5567 struct kvm_sev_launch_measure params;
5568 void __user *p = NULL;
5572 if (!sev_guest(kvm))
5575 if (copy_from_user(¶ms, measure, sizeof(params)))
5578 data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
5582 /* User wants to query the blob length */
5586 p = (void __user *)(uintptr_t)params.uaddr;
5588 if (params.len > SEV_FW_BLOB_MAX_SIZE) {
5594 blob = kmalloc(params.len, GFP_KERNEL);
5598 data->address = __psp_pa(blob);
5599 data->len = params.len;
5603 data->handle = sev->handle;
5604 ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, data, &argp->error);
5607 * If we query the session length, FW responded with expected data.
5616 if (copy_to_user(p, blob, params.len))
5621 params.len = data->len;
5622 if (copy_to_user(measure, ¶ms, sizeof(params)))
5631 static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
5633 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
5634 struct sev_data_launch_finish *data;
5637 if (!sev_guest(kvm))
5640 data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
5644 data->handle = sev->handle;
5645 ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, data, &argp->error);
5651 static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
5653 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
5654 struct kvm_sev_guest_status params;
5655 struct sev_data_guest_status *data;
5658 if (!sev_guest(kvm))
5661 data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
5665 data->handle = sev->handle;
5666 ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, data, &argp->error);
5670 params.policy = data->policy;
5671 params.state = data->state;
5672 params.handle = data->handle;
5674 if (copy_to_user((void __user *)(uintptr_t)argp->data, ¶ms, sizeof(params)))
5681 static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src,
5682 unsigned long dst, int size,
5683 int *error, bool enc)
5685 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
5686 struct sev_data_dbg *data;
5689 data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
5693 data->handle = sev->handle;
5694 data->dst_addr = dst;
5695 data->src_addr = src;
5698 ret = sev_issue_cmd(kvm,
5699 enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT,
5705 static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr,
5706 unsigned long dst_paddr, int sz, int *err)
5711 * It's safe to read more than we are asked; the caller should ensure
5712 * that the destination has enough space.
5714 src_paddr = round_down(src_paddr, 16);
5715 offset = src_paddr & 15;
5716 sz = round_up(sz + offset, 16);
5718 return __sev_issue_dbg_cmd(kvm, src_paddr, dst_paddr, sz, err, false);
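/*
 * Illustrative sketch (hypothetical helper, not part of this file): the
 * alignment fixup performed above. The SEV firmware works on 16-byte
 * aligned regions, so an unaligned request is widened to the enclosing
 * aligned window and the caller picks its data out at 'offset'.
 */
struct sev_dbg_window {
	unsigned long paddr;	/* round_down(requested paddr, 16) */
	int size;		/* round_up(requested size + offset, 16) */
	int offset;		/* where the requested data starts in the window */
};

static inline struct sev_dbg_window sev_dbg_align_window(unsigned long paddr,
							 int size)
{
	struct sev_dbg_window w;

	w.offset = paddr & 15;
	w.paddr  = paddr & ~15UL;
	w.size   = ALIGN(size + w.offset, 16);
	return w;
}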
5721 static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
5722 unsigned long __user dst_uaddr,
5723 unsigned long dst_paddr,
5726 struct page *tpage = NULL;
5729 /* if the inputs are not 16-byte aligned then use an intermediate buffer */
5730 if (!IS_ALIGNED(dst_paddr, 16) ||
5731 !IS_ALIGNED(paddr, 16) ||
5732 !IS_ALIGNED(size, 16)) {
5733 tpage = (void *)alloc_page(GFP_KERNEL);
5737 dst_paddr = __sme_page_pa(tpage);
5740 ret = __sev_dbg_decrypt(kvm, paddr, dst_paddr, size, err);
5745 offset = paddr & 15;
5746 if (copy_to_user((void __user *)(uintptr_t)dst_uaddr,
5747 page_address(tpage) + offset, size))
5758 static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
5759 unsigned long __user vaddr,
5760 unsigned long dst_paddr,
5761 unsigned long __user dst_vaddr,
5762 int size, int *error)
5764 struct page *src_tpage = NULL;
5765 struct page *dst_tpage = NULL;
5766 int ret, len = size;
5768 /* If source buffer is not aligned then use an intermediate buffer */
5769 if (!IS_ALIGNED(vaddr, 16)) {
5770 src_tpage = alloc_page(GFP_KERNEL);
5774 if (copy_from_user(page_address(src_tpage),
5775 (void __user *)(uintptr_t)vaddr, size)) {
5776 __free_page(src_tpage);
5780 paddr = __sme_page_pa(src_tpage);
5784 * If destination buffer or length is not aligned then do read-modify-write:
5785 * - decrypt destination in an intermediate buffer
5786 * - copy the source buffer in an intermediate buffer
5787 * - use the intermediate buffer as source buffer
5789 if (!IS_ALIGNED(dst_vaddr, 16) || !IS_ALIGNED(size, 16)) {
5792 dst_tpage = alloc_page(GFP_KERNEL);
5798 ret = __sev_dbg_decrypt(kvm, dst_paddr,
5799 __sme_page_pa(dst_tpage), size, error);
5804 * If the source is a kernel buffer then use memcpy(); otherwise use
5805 * copy_from_user().
5807 dst_offset = dst_paddr & 15;
5810 memcpy(page_address(dst_tpage) + dst_offset,
5811 page_address(src_tpage), size);
5813 if (copy_from_user(page_address(dst_tpage) + dst_offset,
5814 (void __user *)(uintptr_t)vaddr, size)) {
5820 paddr = __sme_page_pa(dst_tpage);
5821 dst_paddr = round_down(dst_paddr, 16);
5822 len = round_up(size, 16);
5825 ret = __sev_issue_dbg_cmd(kvm, paddr, dst_paddr, len, error, true);
5829 __free_page(src_tpage);
5831 __free_page(dst_tpage);
5835 static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
5837 unsigned long vaddr, vaddr_end, next_vaddr;
5838 unsigned long dst_vaddr;
5839 struct page **src_p, **dst_p;
5840 struct kvm_sev_dbg debug;
5845 if (!sev_guest(kvm))
5848 if (copy_from_user(&debug, (void __user *)(uintptr_t)argp->data, sizeof(debug)))
5851 if (!debug.len || debug.src_uaddr + debug.len < debug.src_uaddr)
5853 if (!debug.dst_uaddr)
5856 vaddr = debug.src_uaddr;
5858 vaddr_end = vaddr + size;
5859 dst_vaddr = debug.dst_uaddr;
5861 for (; vaddr < vaddr_end; vaddr = next_vaddr) {
5862 int len, s_off, d_off;
5864 /* lock userspace source and destination page */
5865 src_p = sev_pin_memory(kvm, vaddr & PAGE_MASK, PAGE_SIZE, &n, 0);
5869 dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, 1);
5871 sev_unpin_memory(kvm, src_p, n);
5876 * The DBG_{DE,EN}CRYPT commands will perform {dec,en}cryption of the
5877 * memory content (i.e. it will write the same memory region with C=1).
5878 * It's possible that the cache may contain the data with C=0, i.e.,
5879 * unencrypted, so invalidate it first.
5881 sev_clflush_pages(src_p, 1);
5882 sev_clflush_pages(dst_p, 1);
5885 * Since the user buffer may not be page-aligned, calculate the
5886 * offset within the page.
5888 s_off = vaddr & ~PAGE_MASK;
5889 d_off = dst_vaddr & ~PAGE_MASK;
5890 len = min_t(size_t, (PAGE_SIZE - s_off), size);
5893 ret = __sev_dbg_decrypt_user(kvm,
5894 __sme_page_pa(src_p[0]) + s_off,
5896 __sme_page_pa(dst_p[0]) + d_off,
5899 ret = __sev_dbg_encrypt_user(kvm,
5900 __sme_page_pa(src_p[0]) + s_off,
5902 __sme_page_pa(dst_p[0]) + d_off,
5906 sev_unpin_memory(kvm, src_p, n);
5907 sev_unpin_memory(kvm, dst_p, n);
5912 next_vaddr = vaddr + len;
5913 dst_vaddr = dst_vaddr + len;
5920 static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
5922 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
5923 struct sev_data_launch_secret *data;
5924 struct kvm_sev_launch_secret params;
5925 struct page **pages;
5930 if (!sev_guest(kvm))
5933 if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params)))
5936 pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, 1);
5941 * The secret must be copied into a contiguous memory region; let's verify
5942 * that the userspace memory pages are contiguous before we issue the command.
5944 if (get_num_contig_pages(0, pages, n) != n) {
5946 goto e_unpin_memory;
5950 data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
5952 goto e_unpin_memory;
5954 offset = params.guest_uaddr & (PAGE_SIZE - 1);
5955 data->guest_address = __sme_page_pa(pages[0]) + offset;
5956 data->guest_len = params.guest_len;
5958 blob = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
5960 ret = PTR_ERR(blob);
5964 data->trans_address = __psp_pa(blob);
5965 data->trans_len = params.trans_len;
5967 hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
5972 data->hdr_address = __psp_pa(hdr);
5973 data->hdr_len = params.hdr_len;
5975 data->handle = sev->handle;
5976 ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, data, &argp->error);
5985 sev_unpin_memory(kvm, pages, n);
5989 static int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
5991 struct kvm_sev_cmd sev_cmd;
5994 if (!svm_sev_enabled())
6000 if (copy_from_user(&sev_cmd, argp, sizeof(struct kvm_sev_cmd)))
6003 mutex_lock(&kvm->lock);
6005 switch (sev_cmd.id) {
6007 r = sev_guest_init(kvm, &sev_cmd);
6009 case KVM_SEV_LAUNCH_START:
6010 r = sev_launch_start(kvm, &sev_cmd);
6012 case KVM_SEV_LAUNCH_UPDATE_DATA:
6013 r = sev_launch_update_data(kvm, &sev_cmd);
6015 case KVM_SEV_LAUNCH_MEASURE:
6016 r = sev_launch_measure(kvm, &sev_cmd);
6018 case KVM_SEV_LAUNCH_FINISH:
6019 r = sev_launch_finish(kvm, &sev_cmd);
6021 case KVM_SEV_GUEST_STATUS:
6022 r = sev_guest_status(kvm, &sev_cmd);
6024 case KVM_SEV_DBG_DECRYPT:
6025 r = sev_dbg_crypt(kvm, &sev_cmd, true);
6027 case KVM_SEV_DBG_ENCRYPT:
6028 r = sev_dbg_crypt(kvm, &sev_cmd, false);
6030 case KVM_SEV_LAUNCH_SECRET:
6031 r = sev_launch_secret(kvm, &sev_cmd);
6038 if (copy_to_user(argp, &sev_cmd, sizeof(struct kvm_sev_cmd)))
6042 mutex_unlock(&kvm->lock);
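/*
 * Illustrative userspace sketch (assumes the uapi definitions from
 * <linux/kvm.h>; hypothetical helper, error handling elided): the SEV
 * commands dispatched above are issued against a VM file descriptor via
 * the KVM_MEMORY_ENCRYPT_OP ioctl.
 */
#if 0	/* userspace illustration only */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int sev_vm_cmd(int vm_fd, int sev_fd, __u32 id, void *data)
{
	struct kvm_sev_cmd cmd = {
		.id = id,
		.data = (__u64)(unsigned long)data,
		.sev_fd = sev_fd,	/* fd of /dev/sev */
	};

	return ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
}

/* e.g. sev_vm_cmd(vm_fd, sev_fd, KVM_SEV_INIT, NULL); */
#endif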
6046 static int svm_register_enc_region(struct kvm *kvm,
6047 struct kvm_enc_region *range)
6049 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
6050 struct enc_region *region;
6053 if (!sev_guest(kvm))
6056 if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
6059 region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT);
6063 region->pages = sev_pin_memory(kvm, range->addr, range->size, ®ion->npages, 1);
6064 if (!region->pages) {
6070 * The guest may change the memory encryption attribute from C=0 -> C=1
6071 * or vice versa for this memory range. Let's make sure caches are
6072 * flushed to ensure that guest data gets written into memory with the
6073 * correct C-bit.
6075 sev_clflush_pages(region->pages, region->npages);
6077 region->uaddr = range->addr;
6078 region->size = range->size;
6080 mutex_lock(&kvm->lock);
6081 list_add_tail(®ion->list, &sev->regions_list);
6082 mutex_unlock(&kvm->lock);
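/*
 * Illustrative userspace sketch (assumes <linux/kvm.h>; hypothetical
 * helper, error handling elided): guest RAM that may hold encrypted data
 * is pinned by registering it through KVM_MEMORY_ENCRYPT_REG_REGION,
 * which lands in the handler above.
 */
#if 0	/* userspace illustration only */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int sev_register_ram(int vm_fd, void *hva, __u64 size)
{
	struct kvm_enc_region region = {
		.addr = (__u64)(unsigned long)hva,
		.size = size,
	};

	return ioctl(vm_fd, KVM_MEMORY_ENCRYPT_REG_REGION, &region);
}
#endif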
6091 static struct enc_region *
6092 find_enc_region(struct kvm *kvm, struct kvm_enc_region *range)
6094 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
6095 struct list_head *head = &sev->regions_list;
6096 struct enc_region *i;
6098 list_for_each_entry(i, head, list) {
6099 if (i->uaddr == range->addr &&
6100 i->size == range->size)
6108 static int svm_unregister_enc_region(struct kvm *kvm,
6109 struct kvm_enc_region *range)
6111 struct enc_region *region;
6114 mutex_lock(&kvm->lock);
6116 if (!sev_guest(kvm)) {
6121 region = find_enc_region(kvm, range);
6128 * Ensure that all guest tagged cache entries are flushed before
6129 * releasing the pages back to the system for use. CLFLUSH will
6130 * not do this, so issue a WBINVD.
6132 wbinvd_on_all_cpus();
6134 __unregister_enc_region_locked(kvm, region);
6136 mutex_unlock(&kvm->lock);
6140 mutex_unlock(&kvm->lock);
6144 static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
6146 unsigned long cr4 = kvm_read_cr4(vcpu);
6147 bool smep = cr4 & X86_CR4_SMEP;
6148 bool smap = cr4 & X86_CR4_SMAP;
6149 bool is_user = svm_get_cpl(vcpu) == 3;
6152 * Detect and work around Erratum 1096 Fam_17h_00_0Fh.
6155 * When the CPU raises #NPF on a guest data access and vCPU CR4.SMAP=1, it is
6156 * possible that CPU microcode implementing DecodeAssist will fail
6157 * to read the bytes of the instruction which caused the #NPF. In this case,
6158 * the GuestIntrBytes field of the VMCB on a VMEXIT will incorrectly
6159 * return 0 instead of the correct guest instruction bytes.
6161 * This happens because CPU microcode reading the instruction bytes
6162 * uses a special opcode which attempts to read data using CPL=0
6163 * privileges. The microcode reads CS:RIP and if it hits a SMAP
6164 * fault, it gives up and returns no instruction bytes.
6167 * We reach here when the CPU supports DecodeAssist, raised #NPF and
6168 * returned 0 in the GuestIntrBytes field of the VMCB.
6169 * First, the erratum can only be triggered when vCPU CR4.SMAP=1.
6170 * Second, if vCPU CR4.SMEP=1, the erratum can only be triggered
6171 * when vCPU CPL==3 (because otherwise the guest would have taken
6172 * a SMEP fault instead of #NPF).
6173 * Otherwise, with vCPU CR4.SMEP=0, the erratum can be triggered at any CPL.
6174 * As most guests that enable SMAP also enable SMEP, use the above
6175 * logic to minimize false positives when detecting the erratum
6176 * while still preserving correctness in all cases.
6179 * To determine what instruction the guest was executing, the hypervisor
6180 * will have to decode the instruction at the instruction pointer.
6182 * In a non-SEV guest, the hypervisor can read guest memory to decode
6183 * the instruction when insn_len is zero, so we return true to
6184 * indicate that decoding is possible.
6186 * But in an SEV guest, guest memory is encrypted with a guest-specific
6187 * key and the hypervisor cannot decode the instruction, so the erratum
6188 * cannot be worked around. Print an error and request that the
6189 * guest be killed.
6191 if (smap && (!smep || is_user)) {
6192 if (!sev_guest(vcpu->kvm))
6195 pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n");
6196 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
6202 static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
6204 struct vcpu_svm *svm = to_svm(vcpu);
6207 * TODO: the last condition below latches INIT signals on the vCPU when
6208 * the vCPU is in guest mode and vmcb12 defines an intercept on INIT.
6209 * To properly emulate the INIT intercept, SVM should implement
6210 * kvm_x86_ops.check_nested_events() and call nested_svm_vmexit()
6211 * there if an INIT signal is pending.
6213 return !gif_set(svm) ||
6214 (svm->vmcb->control.intercept & (1ULL << INTERCEPT_INIT));
6217 static bool svm_check_apicv_inhibit_reasons(ulong bit)
6219 ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
6220 BIT(APICV_INHIBIT_REASON_HYPERV) |
6221 BIT(APICV_INHIBIT_REASON_NESTED) |
6222 BIT(APICV_INHIBIT_REASON_IRQWIN) |
6223 BIT(APICV_INHIBIT_REASON_PIT_REINJ) |
6224 BIT(APICV_INHIBIT_REASON_X2APIC);
6226 return supported & BIT(bit);
6229 static void svm_pre_update_apicv_exec_ctrl(struct kvm *kvm, bool activate)
6231 avic_update_access_page(kvm, activate);
6234 static struct kvm_x86_ops svm_x86_ops __initdata = {
6235 .hardware_unsetup = svm_hardware_teardown,
6236 .hardware_enable = svm_hardware_enable,
6237 .hardware_disable = svm_hardware_disable,
6238 .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
6239 .has_emulated_msr = svm_has_emulated_msr,
6241 .vcpu_create = svm_create_vcpu,
6242 .vcpu_free = svm_free_vcpu,
6243 .vcpu_reset = svm_vcpu_reset,
6245 .vm_size = sizeof(struct kvm_svm),
6246 .vm_init = svm_vm_init,
6247 .vm_destroy = svm_vm_destroy,
6249 .prepare_guest_switch = svm_prepare_guest_switch,
6250 .vcpu_load = svm_vcpu_load,
6251 .vcpu_put = svm_vcpu_put,
6252 .vcpu_blocking = svm_vcpu_blocking,
6253 .vcpu_unblocking = svm_vcpu_unblocking,
6255 .update_bp_intercept = update_bp_intercept,
6256 .get_msr_feature = svm_get_msr_feature,
6257 .get_msr = svm_get_msr,
6258 .set_msr = svm_set_msr,
6259 .get_segment_base = svm_get_segment_base,
6260 .get_segment = svm_get_segment,
6261 .set_segment = svm_set_segment,
6262 .get_cpl = svm_get_cpl,
6263 .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
6264 .decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
6265 .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
6266 .set_cr0 = svm_set_cr0,
6267 .set_cr4 = svm_set_cr4,
6268 .set_efer = svm_set_efer,
6269 .get_idt = svm_get_idt,
6270 .set_idt = svm_set_idt,
6271 .get_gdt = svm_get_gdt,
6272 .set_gdt = svm_set_gdt,
6273 .get_dr6 = svm_get_dr6,
6274 .set_dr6 = svm_set_dr6,
6275 .set_dr7 = svm_set_dr7,
6276 .sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
6277 .cache_reg = svm_cache_reg,
6278 .get_rflags = svm_get_rflags,
6279 .set_rflags = svm_set_rflags,
6281 .tlb_flush = svm_flush_tlb,
6282 .tlb_flush_gva = svm_flush_tlb_gva,
6284 .run = svm_vcpu_run,
6285 .handle_exit = handle_exit,
6286 .skip_emulated_instruction = skip_emulated_instruction,
6287 .update_emulated_instruction = NULL,
6288 .set_interrupt_shadow = svm_set_interrupt_shadow,
6289 .get_interrupt_shadow = svm_get_interrupt_shadow,
6290 .patch_hypercall = svm_patch_hypercall,
6291 .set_irq = svm_set_irq,
6292 .set_nmi = svm_inject_nmi,
6293 .queue_exception = svm_queue_exception,
6294 .cancel_injection = svm_cancel_injection,
6295 .interrupt_allowed = svm_interrupt_allowed,
6296 .nmi_allowed = svm_nmi_allowed,
6297 .get_nmi_mask = svm_get_nmi_mask,
6298 .set_nmi_mask = svm_set_nmi_mask,
6299 .enable_nmi_window = enable_nmi_window,
6300 .enable_irq_window = enable_irq_window,
6301 .update_cr8_intercept = update_cr8_intercept,
6302 .set_virtual_apic_mode = svm_set_virtual_apic_mode,
6303 .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
6304 .check_apicv_inhibit_reasons = svm_check_apicv_inhibit_reasons,
6305 .pre_update_apicv_exec_ctrl = svm_pre_update_apicv_exec_ctrl,
6306 .load_eoi_exitmap = svm_load_eoi_exitmap,
6307 .hwapic_irr_update = svm_hwapic_irr_update,
6308 .hwapic_isr_update = svm_hwapic_isr_update,
6309 .sync_pir_to_irr = kvm_lapic_find_highest_irr,
6310 .apicv_post_state_restore = avic_post_state_restore,
6312 .set_tss_addr = svm_set_tss_addr,
6313 .set_identity_map_addr = svm_set_identity_map_addr,
6314 .get_tdp_level = get_npt_level,
6315 .get_mt_mask = svm_get_mt_mask,
6317 .get_exit_info = svm_get_exit_info,
6319 .cpuid_update = svm_cpuid_update,
6321 .has_wbinvd_exit = svm_has_wbinvd_exit,
6323 .read_l1_tsc_offset = svm_read_l1_tsc_offset,
6324 .write_l1_tsc_offset = svm_write_l1_tsc_offset,
6326 .load_mmu_pgd = svm_load_mmu_pgd,
6328 .check_intercept = svm_check_intercept,
6329 .handle_exit_irqoff = svm_handle_exit_irqoff,
6331 .request_immediate_exit = __kvm_request_immediate_exit,
6333 .sched_in = svm_sched_in,
6335 .pmu_ops = &amd_pmu_ops,
6336 .deliver_posted_interrupt = svm_deliver_avic_intr,
6337 .dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt,
6338 .update_pi_irte = svm_update_pi_irte,
6339 .setup_mce = svm_setup_mce,
6341 .smi_allowed = svm_smi_allowed,
6342 .pre_enter_smm = svm_pre_enter_smm,
6343 .pre_leave_smm = svm_pre_leave_smm,
6344 .enable_smi_window = enable_smi_window,
6346 .mem_enc_op = svm_mem_enc_op,
6347 .mem_enc_reg_region = svm_register_enc_region,
6348 .mem_enc_unreg_region = svm_unregister_enc_region,
6350 .nested_enable_evmcs = NULL,
6351 .nested_get_evmcs_version = NULL,
6353 .need_emulation_on_page_fault = svm_need_emulation_on_page_fault,
6355 .apic_init_signal_blocked = svm_apic_init_signal_blocked,
6357 .check_nested_events = svm_check_nested_events,
6360 static struct kvm_x86_init_ops svm_init_ops __initdata = {
6361 .cpu_has_kvm_support = has_svm,
6362 .disabled_by_bios = is_disabled,
6363 .hardware_setup = svm_hardware_setup,
6364 .check_processor_compatibility = svm_check_processor_compat,
6366 .runtime_ops = &svm_x86_ops,
6369 static int __init svm_init(void)
6371 return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm),
6372 __alignof__(struct vcpu_svm), THIS_MODULE);
6375 static void __exit svm_exit(void)
6380 module_init(svm_init)
6381 module_exit(svm_exit)