KVM: SVM: Add support for CR0 write traps for an SEV-ES guest
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d39d6cf..efa70e3 100644
@@ -71,6 +71,7 @@
 #include <asm/irq_remapping.h>
 #include <asm/mshyperv.h>
 #include <asm/hypervisor.h>
+#include <asm/tlbflush.h>
 #include <asm/intel_pt.h>
 #include <asm/emulate_prefix.h>
 #include <clocksource/hyperv_timer.h>
@@ -161,24 +162,29 @@ module_param(force_emulation_prefix, bool, S_IRUGO);
 int __read_mostly pi_inject_timer = -1;
 module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
 
-#define KVM_NR_SHARED_MSRS 16
+/*
+ * Restoring the host value for MSRs that are only consumed when running in
+ * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
+ * returns to userspace, i.e. the kernel can run with the guest's value.
+ */
+#define KVM_MAX_NR_USER_RETURN_MSRS 16
 
-struct kvm_shared_msrs_global {
+struct kvm_user_return_msrs_global {
        int nr;
-       u32 msrs[KVM_NR_SHARED_MSRS];
+       u32 msrs[KVM_MAX_NR_USER_RETURN_MSRS];
 };
 
-struct kvm_shared_msrs {
+struct kvm_user_return_msrs {
        struct user_return_notifier urn;
        bool registered;
-       struct kvm_shared_msr_values {
+       struct kvm_user_return_msr_values {
                u64 host;
                u64 curr;
-       } values[KVM_NR_SHARED_MSRS];
+       } values[KVM_MAX_NR_USER_RETURN_MSRS];
 };
 
-static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
-static struct kvm_shared_msrs __percpu *shared_msrs;
+static struct kvm_user_return_msrs_global __read_mostly user_return_msrs_global;
+static struct kvm_user_return_msrs __percpu *user_return_msrs;
 
 #define KVM_SUPPORTED_XCR0     (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
                                | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
@@ -188,7 +194,7 @@ static struct kvm_shared_msrs __percpu *shared_msrs;
 u64 __read_mostly host_efer;
 EXPORT_SYMBOL_GPL(host_efer);
 
-bool __read_mostly allow_smaller_maxphyaddr;
+bool __read_mostly allow_smaller_maxphyaddr = 0;
 EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
 
 static u64 __read_mostly host_xss;
@@ -249,24 +255,23 @@ static struct kmem_cache *x86_emulator_cache;
 
 /*
  * When called, it means the previous get/set msr reached an invalid msr.
- * Return 0 if we want to ignore/silent this failed msr access, or 1 if we want
- * to fail the caller.
+ * Return true if we want to ignore/silence this failed msr access.
  */
-static int kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr,
-                                u64 data, bool write)
+static bool kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr,
+                                 u64 data, bool write)
 {
        const char *op = write ? "wrmsr" : "rdmsr";
 
        if (ignore_msrs) {
                if (report_ignored_msrs)
-                       vcpu_unimpl(vcpu, "ignored %s: 0x%x data 0x%llx\n",
-                                   op, msr, data);
+                       kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n",
+                                     op, msr, data);
                /* Mask the error */
-               return 0;
+               return true;
        } else {
-               vcpu_debug_ratelimited(vcpu, "unhandled %s: 0x%x data 0x%llx\n",
-                                      op, msr, data);
-               return 1;
+               kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n",
+                                     op, msr, data);
+               return false;
        }
 }
 
@@ -293,9 +298,9 @@ static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
 static void kvm_on_user_return(struct user_return_notifier *urn)
 {
        unsigned slot;
-       struct kvm_shared_msrs *locals
-               = container_of(urn, struct kvm_shared_msrs, urn);
-       struct kvm_shared_msr_values *values;
+       struct kvm_user_return_msrs *msrs
+               = container_of(urn, struct kvm_user_return_msrs, urn);
+       struct kvm_user_return_msr_values *values;
        unsigned long flags;
 
        /*
@@ -303,73 +308,73 @@ static void kvm_on_user_return(struct user_return_notifier *urn)
         * interrupted and executed through kvm_arch_hardware_disable()
         */
        local_irq_save(flags);
-       if (locals->registered) {
-               locals->registered = false;
+       if (msrs->registered) {
+               msrs->registered = false;
                user_return_notifier_unregister(urn);
        }
        local_irq_restore(flags);
-       for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
-               values = &locals->values[slot];
+       for (slot = 0; slot < user_return_msrs_global.nr; ++slot) {
+               values = &msrs->values[slot];
                if (values->host != values->curr) {
-                       wrmsrl(shared_msrs_global.msrs[slot], values->host);
+                       wrmsrl(user_return_msrs_global.msrs[slot], values->host);
                        values->curr = values->host;
                }
        }
 }
 
-void kvm_define_shared_msr(unsigned slot, u32 msr)
+void kvm_define_user_return_msr(unsigned slot, u32 msr)
 {
-       BUG_ON(slot >= KVM_NR_SHARED_MSRS);
-       shared_msrs_global.msrs[slot] = msr;
-       if (slot >= shared_msrs_global.nr)
-               shared_msrs_global.nr = slot + 1;
+       BUG_ON(slot >= KVM_MAX_NR_USER_RETURN_MSRS);
+       user_return_msrs_global.msrs[slot] = msr;
+       if (slot >= user_return_msrs_global.nr)
+               user_return_msrs_global.nr = slot + 1;
 }
-EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
+EXPORT_SYMBOL_GPL(kvm_define_user_return_msr);
 
-static void kvm_shared_msr_cpu_online(void)
+static void kvm_user_return_msr_cpu_online(void)
 {
        unsigned int cpu = smp_processor_id();
-       struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
+       struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
        u64 value;
        int i;
 
-       for (i = 0; i < shared_msrs_global.nr; ++i) {
-               rdmsrl_safe(shared_msrs_global.msrs[i], &value);
-               smsr->values[i].host = value;
-               smsr->values[i].curr = value;
+       for (i = 0; i < user_return_msrs_global.nr; ++i) {
+               rdmsrl_safe(user_return_msrs_global.msrs[i], &value);
+               msrs->values[i].host = value;
+               msrs->values[i].curr = value;
        }
 }
 
-int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
+int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
 {
        unsigned int cpu = smp_processor_id();
-       struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
+       struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
        int err;
 
-       value = (value & mask) | (smsr->values[slot].host & ~mask);
-       if (value == smsr->values[slot].curr)
+       value = (value & mask) | (msrs->values[slot].host & ~mask);
+       if (value == msrs->values[slot].curr)
                return 0;
-       err = wrmsrl_safe(shared_msrs_global.msrs[slot], value);
+       err = wrmsrl_safe(user_return_msrs_global.msrs[slot], value);
        if (err)
                return 1;
 
-       smsr->values[slot].curr = value;
-       if (!smsr->registered) {
-               smsr->urn.on_user_return = kvm_on_user_return;
-               user_return_notifier_register(&smsr->urn);
-               smsr->registered = true;
+       msrs->values[slot].curr = value;
+       if (!msrs->registered) {
+               msrs->urn.on_user_return = kvm_on_user_return;
+               user_return_notifier_register(&msrs->urn);
+               msrs->registered = true;
        }
        return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
+EXPORT_SYMBOL_GPL(kvm_set_user_return_msr);
 
 static void drop_user_return_notifiers(void)
 {
        unsigned int cpu = smp_processor_id();
-       struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
+       struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
 
-       if (smsr->registered)
-               kvm_on_user_return(&smsr->urn);
+       if (msrs->registered)
+               kvm_on_user_return(&msrs->urn);
 }
 
 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
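The rename from "shared MSRs" to "user return MSRs" keeps the API shape intact: a vendor module reserves a slot during hardware setup with kvm_define_user_return_msr(), loads the guest value with kvm_set_user_return_msr() before entering the guest, and kvm_on_user_return() lazily restores the host value when the CPU next returns to userspace. A minimal sketch of a caller, assuming a hypothetical vendor module (the vendor_* names and the choice of MSR_TSC_AUX are illustrative, not taken from this patch):

    #include <linux/kvm_host.h>
    #include <asm/msr-index.h>

    /* Illustrative slot numbering; real vendor code keeps its own table. */
    enum { VENDOR_URET_TSC_AUX, VENDOR_NR_URET_MSRS };

    static void vendor_setup_user_return_msrs(void)
    {
            /* Reserve a user-return slot once, at hardware setup time. */
            kvm_define_user_return_msr(VENDOR_URET_TSC_AUX, MSR_TSC_AUX);
    }

    static void vendor_prepare_switch_to_guest(u64 guest_tsc_aux)
    {
            /*
             * Load the guest value (subject to the mask) and register the
             * user-return notifier; the host value is written back only
             * when this CPU actually returns to userspace.
             */
            kvm_set_user_return_msr(VENDOR_URET_TSC_AUX, guest_tsc_aux, -1ull);
    }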
@@ -799,11 +804,29 @@ bool pdptrs_changed(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(pdptrs_changed);
 
+void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
+{
+       unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
+
+       if ((cr0 ^ old_cr0) & X86_CR0_PG) {
+               kvm_clear_async_pf_completion_queue(vcpu);
+               kvm_async_pf_hash_reset(vcpu);
+       }
+
+       if ((cr0 ^ old_cr0) & update_bits)
+               kvm_mmu_reset_context(vcpu);
+
+       if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
+           kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
+           !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
+               kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
+}
+EXPORT_SYMBOL_GPL(kvm_post_set_cr0);
+
 int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
        unsigned long old_cr0 = kvm_read_cr0(vcpu);
        unsigned long pdptr_bits = X86_CR0_CD | X86_CR0_NW | X86_CR0_PG;
-       unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
 
        cr0 |= X86_CR0_ET;
 
@@ -842,18 +865,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 
        kvm_x86_ops.set_cr0(vcpu, cr0);
 
-       if ((cr0 ^ old_cr0) & X86_CR0_PG) {
-               kvm_clear_async_pf_completion_queue(vcpu);
-               kvm_async_pf_hash_reset(vcpu);
-       }
-
-       if ((cr0 ^ old_cr0) & update_bits)
-               kvm_mmu_reset_context(vcpu);
-
-       if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
-           kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
-           !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
-               kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
+       kvm_post_set_cr0(vcpu, old_cr0, cr0);
 
        return 0;
 }
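Factoring the side effects out into kvm_post_set_cr0() (and exporting it) is what enables the SEV-ES CR0 write traps named in the subject: the SVM trap handler can update CR0 straight from the trap's exit info and then replay the common side effects, without funnelling the write through kvm_set_cr0(). A rough sketch of such a caller, with a hypothetical vendor_handle_cr0_write_trap(); the real handler lives in svm.c and is not part of this hunk:

    #include <linux/kvm_host.h>

    static int vendor_handle_cr0_write_trap(struct kvm_vcpu *vcpu, unsigned long new_cr0)
    {
            unsigned long old_cr0 = kvm_read_cr0(vcpu);

            /*
             * Hardware has already accepted the write; vendor code would
             * propagate new_cr0 into its control block here (elided), then
             * replay async-PF reset, MMU reset and CD/NW handling via the
             * new helper.
             */
            kvm_post_set_cr0(vcpu, old_cr0, new_cr0);

            return 1;
    }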
@@ -959,25 +971,26 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
 }
 EXPORT_SYMBOL_GPL(kvm_set_xcr);
 
-int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
        if (cr4 & cr4_reserved_bits)
-               return -EINVAL;
+               return false;
 
        if (cr4 & vcpu->arch.cr4_guest_rsvd_bits)
-               return -EINVAL;
+               return false;
 
-       return 0;
+       return kvm_x86_ops.is_valid_cr4(vcpu, cr4);
 }
-EXPORT_SYMBOL_GPL(kvm_valid_cr4);
+EXPORT_SYMBOL_GPL(kvm_is_valid_cr4);
 
 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
        unsigned long old_cr4 = kvm_read_cr4(vcpu);
        unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
                                   X86_CR4_SMEP;
+       unsigned long mmu_role_bits = pdptr_bits | X86_CR4_SMAP | X86_CR4_PKE;
 
-       if (kvm_valid_cr4(vcpu, cr4))
+       if (!kvm_is_valid_cr4(vcpu, cr4))
                return 1;
 
        if (is_long_mode(vcpu)) {
@@ -1000,16 +1013,12 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
                        return 1;
        }
 
-       if (kvm_x86_ops.set_cr4(vcpu, cr4))
-               return 1;
+       kvm_x86_ops.set_cr4(vcpu, cr4);
 
-       if (((cr4 ^ old_cr4) & pdptr_bits) ||
+       if (((cr4 ^ old_cr4) & mmu_role_bits) ||
            (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
                kvm_mmu_reset_context(vcpu);
 
-       if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
-               kvm_update_cpuid_runtime(vcpu);
-
        return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr4);
@@ -1035,7 +1044,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
        }
 
        if (is_long_mode(vcpu) &&
-           (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63)))
+           (cr3 & vcpu->arch.cr3_lm_rsvd_bits))
                return 1;
        else if (is_pae_paging(vcpu) &&
                 !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
@@ -1409,7 +1418,8 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
        if (r == KVM_MSR_RET_INVALID) {
                /* Unconditionally clear the output for simplicity */
                *data = 0;
-               r = kvm_msr_ignored_check(vcpu, index, 0, false);
+               if (kvm_msr_ignored_check(vcpu, index, 0, false))
+                       r = 0;
        }
 
        if (r)
@@ -1451,6 +1461,7 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
        u64 old_efer = vcpu->arch.efer;
        u64 efer = msr_info->data;
+       int r;
 
        if (efer & efer_reserved_bits)
                return 1;
@@ -1467,7 +1478,11 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        efer &= ~EFER_LMA;
        efer |= vcpu->arch.efer & EFER_LMA;
 
-       kvm_x86_ops.set_efer(vcpu, efer);
+       r = kvm_x86_ops.set_efer(vcpu, efer);
+       if (r) {
+               WARN_ON(r > 0);
+               return r;
+       }
 
        /* Update reserved bits */
        if ((efer ^ old_efer) & EFER_NX)
@@ -1482,6 +1497,40 @@ void kvm_enable_efer_bits(u64 mask)
 }
 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
 
+bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type)
+{
+       struct kvm *kvm = vcpu->kvm;
+       struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges;
+       u32 count = kvm->arch.msr_filter.count;
+       u32 i;
+       bool r = kvm->arch.msr_filter.default_allow;
+       int idx;
+
+       /* MSR filtering not set up or x2APIC enabled, allow everything */
+       if (!count || (index >= 0x800 && index <= 0x8ff))
+               return true;
+
+       /* Prevent collision with set_msr_filter */
+       idx = srcu_read_lock(&kvm->srcu);
+
+       for (i = 0; i < count; i++) {
+               u32 start = ranges[i].base;
+               u32 end = start + ranges[i].nmsrs;
+               u32 flags = ranges[i].flags;
+               unsigned long *bitmap = ranges[i].bitmap;
+
+               if ((index >= start) && (index < end) && (flags & type)) {
+                       r = !!test_bit(index - start, bitmap);
+                       break;
+               }
+       }
+
+       srcu_read_unlock(&kvm->srcu, idx);
+
+       return r;
+}
+EXPORT_SYMBOL_GPL(kvm_msr_allowed);
+
 /*
  * Write @data into the MSR specified by @index.  Select MSR specific fault
  * checks are bypassed if @host_initiated is %true.
@@ -1493,6 +1542,9 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
 {
        struct msr_data msr;
 
+       if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE))
+               return KVM_MSR_RET_FILTERED;
+
        switch (index) {
        case MSR_FS_BASE:
        case MSR_GS_BASE:
@@ -1532,7 +1584,8 @@ static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
        int ret = __kvm_set_msr(vcpu, index, data, host_initiated);
 
        if (ret == KVM_MSR_RET_INVALID)
-               ret = kvm_msr_ignored_check(vcpu, index, data, true);
+               if (kvm_msr_ignored_check(vcpu, index, data, true))
+                       ret = 0;
 
        return ret;
 }
@@ -1549,6 +1602,9 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
        struct msr_data msr;
        int ret;
 
+       if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
+               return KVM_MSR_RET_FILTERED;
+
        msr.index = index;
        msr.host_initiated = host_initiated;
 
@@ -1566,7 +1622,8 @@ static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
        if (ret == KVM_MSR_RET_INVALID) {
                /* Unconditionally clear *data for simplicity */
                *data = 0;
-               ret = kvm_msr_ignored_check(vcpu, index, 0, false);
+               if (kvm_msr_ignored_check(vcpu, index, 0, false))
+                       ret = 0;
        }
 
        return ret;
@@ -1584,22 +1641,92 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
 }
 EXPORT_SYMBOL_GPL(kvm_set_msr);
 
+static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
+{
+       int err = vcpu->run->msr.error;
+       if (!err) {
+               kvm_rax_write(vcpu, (u32)vcpu->run->msr.data);
+               kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32);
+       }
+
+       return kvm_x86_ops.complete_emulated_msr(vcpu, err);
+}
+
+static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu)
+{
+       return kvm_x86_ops.complete_emulated_msr(vcpu, vcpu->run->msr.error);
+}
+
+static u64 kvm_msr_reason(int r)
+{
+       switch (r) {
+       case KVM_MSR_RET_INVALID:
+               return KVM_MSR_EXIT_REASON_UNKNOWN;
+       case KVM_MSR_RET_FILTERED:
+               return KVM_MSR_EXIT_REASON_FILTER;
+       default:
+               return KVM_MSR_EXIT_REASON_INVAL;
+       }
+}
+
+static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index,
+                             u32 exit_reason, u64 data,
+                             int (*completion)(struct kvm_vcpu *vcpu),
+                             int r)
+{
+       u64 msr_reason = kvm_msr_reason(r);
+
+       /* Check if the user wanted to know about this MSR fault */
+       if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason))
+               return 0;
+
+       vcpu->run->exit_reason = exit_reason;
+       vcpu->run->msr.error = 0;
+       memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad));
+       vcpu->run->msr.reason = msr_reason;
+       vcpu->run->msr.index = index;
+       vcpu->run->msr.data = data;
+       vcpu->arch.complete_userspace_io = completion;
+
+       return 1;
+}
+
+static int kvm_get_msr_user_space(struct kvm_vcpu *vcpu, u32 index, int r)
+{
+       return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_RDMSR, 0,
+                                  complete_emulated_rdmsr, r);
+}
+
+static int kvm_set_msr_user_space(struct kvm_vcpu *vcpu, u32 index, u64 data, int r)
+{
+       return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_WRMSR, data,
+                                  complete_emulated_wrmsr, r);
+}
+
 int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
 {
        u32 ecx = kvm_rcx_read(vcpu);
        u64 data;
+       int r;
 
-       if (kvm_get_msr(vcpu, ecx, &data)) {
-               trace_kvm_msr_read_ex(ecx);
-               kvm_inject_gp(vcpu, 0);
-               return 1;
+       r = kvm_get_msr(vcpu, ecx, &data);
+
+       /* MSR read failed? See if we should ask user space */
+       if (r && kvm_get_msr_user_space(vcpu, ecx, r)) {
+               /* Bounce to user space */
+               return 0;
        }
 
-       trace_kvm_msr_read(ecx, data);
+       if (!r) {
+               trace_kvm_msr_read(ecx, data);
 
-       kvm_rax_write(vcpu, data & -1u);
-       kvm_rdx_write(vcpu, (data >> 32) & -1u);
-       return kvm_skip_emulated_instruction(vcpu);
+               kvm_rax_write(vcpu, data & -1u);
+               kvm_rdx_write(vcpu, (data >> 32) & -1u);
+       } else {
+               trace_kvm_msr_read_ex(ecx);
+       }
+
+       return kvm_x86_ops.complete_emulated_msr(vcpu, r);
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr);
 
@@ -1607,15 +1734,25 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
 {
        u32 ecx = kvm_rcx_read(vcpu);
        u64 data = kvm_read_edx_eax(vcpu);
+       int r;
+
+       r = kvm_set_msr(vcpu, ecx, data);
 
-       if (kvm_set_msr(vcpu, ecx, data)) {
+       /* MSR write failed? See if we should ask user space */
+       if (r && kvm_set_msr_user_space(vcpu, ecx, data, r))
+               /* Bounce to user space */
+               return 0;
+
+       /* Signal all other negative errors to userspace */
+       if (r < 0)
+               return r;
+
+       if (!r)
+               trace_kvm_msr_write(ecx, data);
+       else
                trace_kvm_msr_write_ex(ecx, data);
-               kvm_inject_gp(vcpu, 0);
-               return 1;
-       }
 
-       trace_kvm_msr_write(ecx, data);
-       return kvm_skip_emulated_instruction(vcpu);
+       return kvm_x86_ops.complete_emulated_msr(vcpu, r);
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
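These exits are only taken when userspace has opted in via KVM_ENABLE_CAP(KVM_CAP_X86_USER_SPACE_MSR) with args[0] set to a mask of KVM_MSR_EXIT_REASON_* bits (see the kvm_vm_ioctl_enable_cap() hunk further down); the VMM then completes the access through the kvm_run->msr block. A minimal userspace sketch of the run-loop side, where emulate_rdmsr()/emulate_wrmsr() stand in for VMM-specific logic:

    #include <linux/kvm.h>
    #include <stdint.h>

    /* VMM-specific MSR emulation; placeholders, not part of the KVM API. */
    int emulate_rdmsr(uint32_t index, uint64_t *data);
    int emulate_wrmsr(uint32_t index, uint64_t data);

    /* Called when ioctl(vcpu_fd, KVM_RUN, 0) returns with one of these exits. */
    static void handle_msr_exit(struct kvm_run *run)
    {
            uint64_t data = 0;

            switch (run->exit_reason) {
            case KVM_EXIT_X86_RDMSR:
                    /* run->msr.reason says why KVM punted (unknown/filtered/invalid). */
                    run->msr.error = emulate_rdmsr(run->msr.index, &data);
                    run->msr.data = data;
                    break;
            case KVM_EXIT_X86_WRMSR:
                    run->msr.error = emulate_wrmsr(run->msr.index, run->msr.data);
                    break;
            }
            /* A non-zero msr.error makes KVM inject #GP on the next KVM_RUN. */
    }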
 
@@ -1774,12 +1911,6 @@ static s64 get_kvmclock_base_ns(void)
 }
 #endif
 
-void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
-{
-       kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
-       kvm_vcpu_kick(vcpu);
-}
-
 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 {
        int version;
@@ -1787,6 +1918,8 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
        struct pvclock_wall_clock wc;
        u64 wall_nsec;
 
+       kvm->arch.wall_clock = wall_clock;
+
        if (!wall_clock)
                return;
 
@@ -1819,6 +1952,34 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
        kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
 }
 
+static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,
+                                 bool old_msr, bool host_initiated)
+{
+       struct kvm_arch *ka = &vcpu->kvm->arch;
+
+       if (vcpu->vcpu_id == 0 && !host_initiated) {
+               if (ka->boot_vcpu_runs_old_kvmclock != old_msr)
+                       kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+
+               ka->boot_vcpu_runs_old_kvmclock = old_msr;
+       }
+
+       vcpu->arch.time = system_time;
+       kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
+
+       /* we verify if the enable bit is set... */
+       vcpu->arch.pv_time_enabled = false;
+       if (!(system_time & 1))
+               return;
+
+       if (!kvm_gfn_to_hva_cache_init(vcpu->kvm,
+                                      &vcpu->arch.pv_time, system_time & ~1ULL,
+                                      sizeof(struct pvclock_vcpu_time_info)))
+               vcpu->arch.pv_time_enabled = true;
+
+       return;
+}
+
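The "enable bit" checked above is the pvclock convention: the guest writes the guest-physical address of its pvclock_vcpu_time_info structure with bit 0 set to turn the clock on, and writes 0 to turn it off. The guest side looks roughly like this (a sketch modeled on the in-tree kvmclock registration, not part of this patch):

    #include <asm/kvm_para.h>       /* MSR_KVM_SYSTEM_TIME_NEW */
    #include <asm/msr.h>
    #include <asm/page.h>
    #include <asm/pvclock-abi.h>

    static struct pvclock_vcpu_time_info vcpu_time;

    static void kvmclock_register(void)
    {
            /* Bit 0 = enable; the remaining bits carry the structure's GPA. */
            wrmsrl(MSR_KVM_SYSTEM_TIME_NEW, __pa(&vcpu_time) | 1);
    }

    static void kvmclock_unregister(void)
    {
            /* Writing 0 clears the enable bit and stops updates. */
            wrmsrl(MSR_KVM_SYSTEM_TIME_NEW, 0);
    }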
 static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
 {
        do_shl32_div32(dividend, divisor);
@@ -1978,12 +2139,6 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
 #endif
 }
 
-static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
-{
-       u64 curr_offset = vcpu->arch.l1_tsc_offset;
-       vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
-}
-
 /*
  * Multiply tsc by a fixed point number represented by ratio.
  *
@@ -2045,14 +2200,13 @@ static inline bool kvm_check_tsc_unstable(void)
        return check_tsc_unstable();
 }
 
-void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
+static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
 {
        struct kvm *kvm = vcpu->kvm;
        u64 offset, ns, elapsed;
        unsigned long flags;
        bool matched;
        bool already_matched;
-       u64 data = msr->data;
        bool synchronizing = false;
 
        raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
@@ -2061,7 +2215,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
        elapsed = ns - kvm->arch.last_tsc_nsec;
 
        if (vcpu->arch.virtual_tsc_khz) {
-               if (data == 0 && msr->host_initiated) {
+               if (data == 0) {
                        /*
                         * detection of vcpu initialization -- need to sync
                         * with other vCPUs. This particularly helps to keep
@@ -2131,9 +2285,6 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
        vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
        vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
 
-       if (!msr->host_initiated && guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST))
-               update_ia32_tsc_adjust_msr(vcpu, offset);
-
        kvm_vcpu_write_tsc_offset(vcpu, offset);
        raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
 
@@ -2148,8 +2299,6 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
        spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
 }
 
-EXPORT_SYMBOL_GPL(kvm_write_tsc);
-
 static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
                                           s64 adjustment)
 {
@@ -2695,24 +2844,19 @@ static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
        u32 page_num = data & ~PAGE_MASK;
        u64 page_addr = data & PAGE_MASK;
        u8 *page;
-       int r;
 
-       r = -E2BIG;
        if (page_num >= blob_size)
-               goto out;
-       r = -ENOMEM;
+               return 1;
+
        page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
-       if (IS_ERR(page)) {
-               r = PTR_ERR(page);
-               goto out;
+       if (IS_ERR(page))
+               return PTR_ERR(page);
+
+       if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) {
+               kfree(page);
+               return 1;
        }
-       if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE))
-               goto out_free;
-       r = 0;
-out_free:
-       kfree(page);
-out:
-       return r;
+       return 0;
 }
 
 static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu)
@@ -2730,9 +2874,17 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
        if (data & 0x30)
                return 1;
 
-       if (!lapic_in_kernel(vcpu))
+       if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_VMEXIT) &&
+           (data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT))
+               return 1;
+
+       if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT) &&
+           (data & KVM_ASYNC_PF_DELIVERY_AS_INT))
                return 1;
 
+       if (!lapic_in_kernel(vcpu))
+               return data ? 1 : 0;
+
        vcpu->arch.apf.msr_en_val = data;
 
        if (!kvm_pv_async_pf_enabled(vcpu)) {
@@ -2807,10 +2959,12 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
         * Doing a TLB flush here, on the guest's behalf, can avoid
         * expensive IPIs.
         */
-       trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
-               st->preempted & KVM_VCPU_FLUSH_TLB);
-       if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
-               kvm_vcpu_flush_tlb_guest(vcpu);
+       if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
+               trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
+                                      st->preempted & KVM_VCPU_FLUSH_TLB);
+               if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
+                       kvm_vcpu_flush_tlb_guest(vcpu);
+       }
 
        vcpu->arch.st.preempted = 0;
 
@@ -2902,9 +3056,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        /* Values other than LBR and BTF are vendor-specific,
                           thus reserved and should throw a #GP */
                        return 1;
-               }
-               vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
-                           __func__, data);
+               } else if (report_ignored_msrs)
+                       vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
+                                   __func__, data);
                break;
        case 0x200 ... 0x2ff:
                return kvm_mtrr_set_msr(vcpu, msr, data);
@@ -2944,7 +3098,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                vcpu->arch.msr_ia32_power_ctl = data;
                break;
        case MSR_IA32_TSC:
-               kvm_write_tsc(vcpu, msr_info);
+               if (msr_info->host_initiated) {
+                       kvm_synchronize_tsc(vcpu, data);
+               } else {
+                       u64 adj = kvm_compute_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
+                       adjust_tsc_offset_guest(vcpu, adj);
+                       vcpu->arch.ia32_tsc_adjust_msr += adj;
+               }
                break;
        case MSR_IA32_XSS:
                if (!msr_info->host_initiated &&
@@ -2965,53 +3125,54 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                vcpu->arch.smi_count = data;
                break;
        case MSR_KVM_WALL_CLOCK_NEW:
+               if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
+                       return 1;
+
+               kvm_write_wall_clock(vcpu->kvm, data);
+               break;
        case MSR_KVM_WALL_CLOCK:
-               vcpu->kvm->arch.wall_clock = data;
+               if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
+                       return 1;
+
                kvm_write_wall_clock(vcpu->kvm, data);
                break;
        case MSR_KVM_SYSTEM_TIME_NEW:
-       case MSR_KVM_SYSTEM_TIME: {
-               struct kvm_arch *ka = &vcpu->kvm->arch;
-
-               if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) {
-                       bool tmp = (msr == MSR_KVM_SYSTEM_TIME);
-
-                       if (ka->boot_vcpu_runs_old_kvmclock != tmp)
-                               kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
-
-                       ka->boot_vcpu_runs_old_kvmclock = tmp;
-               }
-
-               vcpu->arch.time = data;
-               kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
-
-               /* we verify if the enable bit is set... */
-               vcpu->arch.pv_time_enabled = false;
-               if (!(data & 1))
-                       break;
+               if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
+                       return 1;
 
-               if (!kvm_gfn_to_hva_cache_init(vcpu->kvm,
-                    &vcpu->arch.pv_time, data & ~1ULL,
-                    sizeof(struct pvclock_vcpu_time_info)))
-                       vcpu->arch.pv_time_enabled = true;
+               kvm_write_system_time(vcpu, data, false, msr_info->host_initiated);
+               break;
+       case MSR_KVM_SYSTEM_TIME:
+               if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
+                       return 1;
 
+               kvm_write_system_time(vcpu, data, true,  msr_info->host_initiated);
                break;
-       }
        case MSR_KVM_ASYNC_PF_EN:
+               if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
+                       return 1;
+
                if (kvm_pv_enable_async_pf(vcpu, data))
                        return 1;
                break;
        case MSR_KVM_ASYNC_PF_INT:
+               if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
+                       return 1;
+
                if (kvm_pv_enable_async_pf_int(vcpu, data))
                        return 1;
                break;
        case MSR_KVM_ASYNC_PF_ACK:
+               if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
+                       return 1;
                if (data & 0x1) {
                        vcpu->arch.apf.pageready_pending = false;
                        kvm_check_async_pf_completion(vcpu);
                }
                break;
        case MSR_KVM_STEAL_TIME:
+               if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
+                       return 1;
 
                if (unlikely(!sched_info_on()))
                        return 1;
@@ -3028,11 +3189,17 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 
                break;
        case MSR_KVM_PV_EOI_EN:
+               if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
+                       return 1;
+
                if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8)))
                        return 1;
                break;
 
        case MSR_KVM_POLL_CONTROL:
+               if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
+                       return 1;
+
                /* only enable bit supported */
                if (data & (-1ULL << 1))
                        return 1;
@@ -3221,9 +3388,22 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_IA32_POWER_CTL:
                msr_info->data = vcpu->arch.msr_ia32_power_ctl;
                break;
-       case MSR_IA32_TSC:
-               msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset;
+       case MSR_IA32_TSC: {
+               /*
+                * Intel SDM states that MSR_IA32_TSC read adds the TSC offset
+                * even when not intercepted. AMD manual doesn't explicitly
+                * state this but appears to behave the same.
+                *
+                * On userspace reads and writes, however, we unconditionally
+                * return L1's TSC value to ensure backwards-compatible
+                * behavior for migration.
+                */
+               u64 tsc_offset = msr_info->host_initiated ? vcpu->arch.l1_tsc_offset :
+                                                           vcpu->arch.tsc_offset;
+
+               msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + tsc_offset;
                break;
+       }
        case MSR_MTRRcap:
        case 0x200 ... 0x2ff:
                return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
@@ -3276,29 +3456,63 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                msr_info->data = vcpu->arch.efer;
                break;
        case MSR_KVM_WALL_CLOCK:
+               if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
+                       return 1;
+
+               msr_info->data = vcpu->kvm->arch.wall_clock;
+               break;
        case MSR_KVM_WALL_CLOCK_NEW:
+               if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
+                       return 1;
+
                msr_info->data = vcpu->kvm->arch.wall_clock;
                break;
        case MSR_KVM_SYSTEM_TIME:
+               if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
+                       return 1;
+
+               msr_info->data = vcpu->arch.time;
+               break;
        case MSR_KVM_SYSTEM_TIME_NEW:
+               if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
+                       return 1;
+
                msr_info->data = vcpu->arch.time;
                break;
        case MSR_KVM_ASYNC_PF_EN:
+               if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
+                       return 1;
+
                msr_info->data = vcpu->arch.apf.msr_en_val;
                break;
        case MSR_KVM_ASYNC_PF_INT:
+               if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
+                       return 1;
+
                msr_info->data = vcpu->arch.apf.msr_int_val;
                break;
        case MSR_KVM_ASYNC_PF_ACK:
+               if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
+                       return 1;
+
                msr_info->data = 0;
                break;
        case MSR_KVM_STEAL_TIME:
+               if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
+                       return 1;
+
                msr_info->data = vcpu->arch.st.msr_val;
                break;
        case MSR_KVM_PV_EOI_EN:
+               if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
+                       return 1;
+
                msr_info->data = vcpu->arch.pv_eoi.msr_val;
                break;
        case MSR_KVM_POLL_CONTROL:
+               if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
+                       return 1;
+
                msr_info->data = vcpu->arch.msr_kvm_poll_control;
                break;
        case MSR_IA32_P5_MC_ADDR:
@@ -3455,6 +3669,27 @@ static inline bool kvm_can_mwait_in_guest(void)
                boot_cpu_has(X86_FEATURE_ARAT);
 }
 
+static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu,
+                                           struct kvm_cpuid2 __user *cpuid_arg)
+{
+       struct kvm_cpuid2 cpuid;
+       int r;
+
+       r = -EFAULT;
+       if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
+               return r;
+
+       r = kvm_get_hv_cpuid(vcpu, &cpuid, cpuid_arg->entries);
+       if (r)
+               return r;
+
+       r = -EFAULT;
+       if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
+               return r;
+
+       return 0;
+}
+
 int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 {
        int r = 0;
@@ -3491,6 +3726,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_HYPERV_TLBFLUSH:
        case KVM_CAP_HYPERV_SEND_IPI:
        case KVM_CAP_HYPERV_CPUID:
+       case KVM_CAP_SYS_HYPERV_CPUID:
        case KVM_CAP_PCI_SEGMENT:
        case KVM_CAP_DEBUGREGS:
        case KVM_CAP_X86_ROBUST_SINGLESTEP:
@@ -3513,6 +3749,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_EXCEPTION_PAYLOAD:
        case KVM_CAP_SET_GUEST_DEBUG:
        case KVM_CAP_LAST_CPU:
+       case KVM_CAP_X86_USER_SPACE_MSR:
+       case KVM_CAP_X86_MSR_FILTER:
+       case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
                r = 1;
                break;
        case KVM_CAP_SYNC_REGS:
@@ -3578,6 +3817,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_SMALLER_MAXPHYADDR:
                r = (int) allow_smaller_maxphyaddr;
                break;
+       case KVM_CAP_STEAL_TIME:
+               r = sched_info_on();
+               break;
        default:
                break;
        }
@@ -3670,6 +3912,9 @@ long kvm_arch_dev_ioctl(struct file *filp,
        case KVM_GET_MSRS:
                r = msr_io(NULL, argp, do_get_msr_feature, 1);
                break;
+       case KVM_GET_SUPPORTED_HV_CPUID:
+               r = kvm_ioctl_get_supported_hv_cpuid(NULL, argp);
+               break;
        default:
                r = -EINVAL;
                break;
@@ -3768,7 +4013,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
        int idx;
 
-       if (vcpu->preempted)
+       if (vcpu->preempted && !vcpu->arch.guest_state_protected)
                vcpu->arch.preempted_in_kernel = !kvm_x86_ops.get_cpl(vcpu);
 
        /*
@@ -4380,6 +4625,13 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 
                return kvm_x86_ops.enable_direct_tlbflush(vcpu);
 
+       case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
+               vcpu->arch.pv_cpuid.enforce = cap->args[0];
+               if (vcpu->arch.pv_cpuid.enforce)
+                       kvm_update_pv_runtime(vcpu);
+
+               return 0;
+
        default:
                return -EINVAL;
        }
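KVM_CAP_ENFORCE_PV_FEATURE_CPUID is a per-vCPU capability, enabled through KVM_ENABLE_CAP on the vCPU fd; once set, the guest_pv_has() checks added throughout this patch gate the PV MSRs and hypercalls on the KVM_FEATURE_* bits actually present in the vCPU's CPUID instead of allowing everything. A sketch of the userspace side, assuming vcpu_fd is an open vCPU file descriptor:

    #include <linux/kvm.h>
    #include <string.h>
    #include <sys/ioctl.h>

    static int enable_pv_cpuid_enforcement(int vcpu_fd)
    {
            struct kvm_enable_cap cap;

            memset(&cap, 0, sizeof(cap));
            cap.cap = KVM_CAP_ENFORCE_PV_FEATURE_CPUID;
            cap.args[0] = 1;        /* non-zero enables enforcement */

            return ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);
    }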
@@ -4739,25 +4991,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                srcu_read_unlock(&vcpu->kvm->srcu, idx);
                break;
        }
-       case KVM_GET_SUPPORTED_HV_CPUID: {
-               struct kvm_cpuid2 __user *cpuid_arg = argp;
-               struct kvm_cpuid2 cpuid;
-
-               r = -EFAULT;
-               if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
-                       goto out;
-
-               r = kvm_vcpu_ioctl_get_hv_cpuid(vcpu, &cpuid,
-                                               cpuid_arg->entries);
-               if (r)
-                       goto out;
-
-               r = -EFAULT;
-               if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
-                       goto out;
-               r = 0;
+       case KVM_GET_SUPPORTED_HV_CPUID:
+               r = kvm_ioctl_get_supported_hv_cpuid(vcpu, argp);
                break;
-       }
        default:
                r = -EINVAL;
        }
@@ -5030,10 +5266,118 @@ split_irqchip_unlock:
                kvm->arch.exception_payload_enabled = cap->args[0];
                r = 0;
                break;
+       case KVM_CAP_X86_USER_SPACE_MSR:
+               kvm->arch.user_space_msr_mask = cap->args[0];
+               r = 0;
+               break;
        default:
                r = -EINVAL;
-               break;
+               break;
+       }
+       return r;
+}
+
+static void kvm_clear_msr_filter(struct kvm *kvm)
+{
+       u32 i;
+       u32 count = kvm->arch.msr_filter.count;
+       struct msr_bitmap_range ranges[16];
+
+       mutex_lock(&kvm->lock);
+       kvm->arch.msr_filter.count = 0;
+       memcpy(ranges, kvm->arch.msr_filter.ranges, count * sizeof(ranges[0]));
+       mutex_unlock(&kvm->lock);
+       synchronize_srcu(&kvm->srcu);
+
+       for (i = 0; i < count; i++)
+               kfree(ranges[i].bitmap);
+}
+
+static int kvm_add_msr_filter(struct kvm *kvm, struct kvm_msr_filter_range *user_range)
+{
+       struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges;
+       struct msr_bitmap_range range;
+       unsigned long *bitmap = NULL;
+       size_t bitmap_size;
+       int r;
+
+       if (!user_range->nmsrs)
+               return 0;
+
+       bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long);
+       if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE)
+               return -EINVAL;
+
+       bitmap = memdup_user((__user u8*)user_range->bitmap, bitmap_size);
+       if (IS_ERR(bitmap))
+               return PTR_ERR(bitmap);
+
+       range = (struct msr_bitmap_range) {
+               .flags = user_range->flags,
+               .base = user_range->base,
+               .nmsrs = user_range->nmsrs,
+               .bitmap = bitmap,
+       };
+
+       if (range.flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) {
+               r = -EINVAL;
+               goto err;
+       }
+
+       if (!range.flags) {
+               r = -EINVAL;
+               goto err;
+       }
+
+       /* Everything ok, add this range identifier to our global pool */
+       ranges[kvm->arch.msr_filter.count] = range;
+       /* Make sure we filled the array before we tell anyone to walk it */
+       smp_wmb();
+       kvm->arch.msr_filter.count++;
+
+       return 0;
+err:
+       kfree(bitmap);
+       return r;
+}
+
+static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp)
+{
+       struct kvm_msr_filter __user *user_msr_filter = argp;
+       struct kvm_msr_filter filter;
+       bool default_allow;
+       int r = 0;
+       bool empty = true;
+       u32 i;
+
+       if (copy_from_user(&filter, user_msr_filter, sizeof(filter)))
+               return -EFAULT;
+
+       for (i = 0; i < ARRAY_SIZE(filter.ranges); i++)
+               empty &= !filter.ranges[i].nmsrs;
+
+       default_allow = !(filter.flags & KVM_MSR_FILTER_DEFAULT_DENY);
+       if (empty && !default_allow)
+               return -EINVAL;
+
+       kvm_clear_msr_filter(kvm);
+
+       kvm->arch.msr_filter.default_allow = default_allow;
+
+       /*
+        * Protect from concurrent calls to this function that could trigger
+        * a TOCTOU violation on kvm->arch.msr_filter.count.
+        */
+       mutex_lock(&kvm->lock);
+       for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) {
+               r = kvm_add_msr_filter(kvm, &filter.ranges[i]);
+               if (r)
+                       break;
        }
+
+       kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED);
+       mutex_unlock(&kvm->lock);
+
        return r;
 }
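From userspace the filter is installed with the new KVM_X86_SET_MSR_FILTER VM ioctl: up to 16 ranges, each with a read/write flag mask and a bitmap in which a set bit allows the MSR at base + bit_number. A sketch that denies everything by default except guest reads of one MSR (the choice of MSR 0x1a0, IA32_MISC_ENABLE, is purely illustrative):

    #include <linux/kvm.h>
    #include <stdint.h>
    #include <string.h>
    #include <sys/ioctl.h>

    static int allow_only_one_msr_read(int vm_fd)
    {
            /* One bit per MSR in the range; bit 0 covers base + 0. */
            static uint8_t bitmap[1] = { 0x01 };
            struct kvm_msr_filter filter;

            memset(&filter, 0, sizeof(filter));
            filter.flags = KVM_MSR_FILTER_DEFAULT_DENY;

            filter.ranges[0].flags = KVM_MSR_FILTER_READ;
            filter.ranges[0].base  = 0x1a0;         /* IA32_MISC_ENABLE */
            filter.ranges[0].nmsrs = 1;
            filter.ranges[0].bitmap = bitmap;

            /*
             * Denied accesses either #GP in the guest or exit with
             * KVM_MSR_EXIT_REASON_FILTER, depending on the mask set via
             * KVM_CAP_X86_USER_SPACE_MSR.
             */
            return ioctl(vm_fd, KVM_X86_SET_MSR_FILTER, &filter);
    }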
 
@@ -5343,6 +5687,9 @@ set_pit2_out:
        case KVM_SET_PMU_EVENT_FILTER:
                r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp);
                break;
+       case KVM_X86_SET_MSR_FILTER:
+               r = kvm_vm_ioctl_set_msr_filter(kvm, argp);
+               break;
        default:
                r = -ENOTTY;
        }
@@ -5704,6 +6051,9 @@ int handle_ud(struct kvm_vcpu *vcpu)
        char sig[5]; /* ud2; .ascii "kvm" */
        struct x86_exception e;
 
+       if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, NULL, 0)))
+               return 1;
+
        if (force_emulation_prefix &&
            kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu),
                                sig, sizeof(sig), &e) == 0 &&
@@ -6359,13 +6709,33 @@ static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
 static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
                            u32 msr_index, u64 *pdata)
 {
-       return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
+       struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+       int r;
+
+       r = kvm_get_msr(vcpu, msr_index, pdata);
+
+       if (r && kvm_get_msr_user_space(vcpu, msr_index, r)) {
+               /* Bounce to user space */
+               return X86EMUL_IO_NEEDED;
+       }
+
+       return r;
 }
 
 static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
                            u32 msr_index, u64 data)
 {
-       return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data);
+       struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+       int r;
+
+       r = kvm_set_msr(vcpu, msr_index, data);
+
+       if (r && kvm_set_msr_user_space(vcpu, msr_index, data, r)) {
+               /* Bounce to user space */
+               return X86EMUL_IO_NEEDED;
+       }
+
+       return r;
 }
 
 static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt)
@@ -6909,7 +7279,10 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
        int r;
        struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
        bool writeback = true;
-       bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
+       bool write_fault_to_spt;
+
+       if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, insn, insn_len)))
+               return 1;
 
        vcpu->arch.l1tf_flush_l1d = true;
 
@@ -6917,6 +7290,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
         * Clear write_fault_to_shadow_pgtable here to ensure it is
         * never reused.
         */
+       write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
        vcpu->arch.write_fault_to_shadow_pgtable = false;
        kvm_clear_exception_queue(vcpu);
 
@@ -7511,9 +7885,9 @@ int kvm_arch_init(void *opaque)
                goto out_free_x86_fpu_cache;
        }
 
-       shared_msrs = alloc_percpu(struct kvm_shared_msrs);
-       if (!shared_msrs) {
-               printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
+       user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
+       if (!user_return_msrs) {
+               printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n");
                goto out_free_x86_emulator_cache;
        }
 
@@ -7546,7 +7920,7 @@ int kvm_arch_init(void *opaque)
        return 0;
 
 out_free_percpu:
-       free_percpu(shared_msrs);
+       free_percpu(user_return_msrs);
 out_free_x86_emulator_cache:
        kmem_cache_destroy(x86_emulator_cache);
 out_free_x86_fpu_cache:
@@ -7573,7 +7947,7 @@ void kvm_arch_exit(void)
 #endif
        kvm_x86_ops.hardware_enable = NULL;
        kvm_mmu_module_exit();
-       free_percpu(shared_msrs);
+       free_percpu(user_return_msrs);
        kmem_cache_destroy(x86_fpu_cache);
 }
 
@@ -7714,11 +8088,16 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
                goto out;
        }
 
+       ret = -KVM_ENOSYS;
+
        switch (nr) {
        case KVM_HC_VAPIC_POLL_IRQ:
                ret = 0;
                break;
        case KVM_HC_KICK_CPU:
+               if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT))
+                       break;
+
                kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
                kvm_sched_yield(vcpu->kvm, a1);
                ret = 0;
@@ -7729,9 +8108,15 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
                break;
 #endif
        case KVM_HC_SEND_IPI:
+               if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SEND_IPI))
+                       break;
+
                ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
                break;
        case KVM_HC_SCHED_YIELD:
+               if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD))
+                       break;
+
                kvm_sched_yield(vcpu->kvm, a0);
                ret = 0;
                break;
@@ -7771,7 +8156,14 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
 {
        struct kvm_run *kvm_run = vcpu->run;
 
-       kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
+       /*
+        * if_flag is obsolete and useless, so do not bother
+        * setting it for SEV-ES guests.  Userspace can just
+        * use kvm_run->ready_for_interrupt_injection.
+        */
+       kvm_run->if_flag = !vcpu->arch.guest_state_protected
+               && (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
+
        kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0;
        kvm_run->cr8 = kvm_get_cr8(vcpu);
        kvm_run->apic_base = kvm_get_apic_base(vcpu);
@@ -8361,9 +8753,18 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
        bool req_immediate_exit = false;
 
+       /* Forbid vmenter if vcpu dirty ring is soft-full */
+       if (unlikely(vcpu->kvm->dirty_ring_size &&
+                    kvm_dirty_ring_soft_full(&vcpu->dirty_ring))) {
+               vcpu->run->exit_reason = KVM_EXIT_DIRTY_RING_FULL;
+               trace_kvm_dirty_ring_exit(vcpu);
+               r = 0;
+               goto out;
+       }
+
        if (kvm_request_pending(vcpu)) {
-               if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) {
-                       if (unlikely(!kvm_x86_ops.nested_ops->get_vmcs12_pages(vcpu))) {
+               if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
+                       if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) {
                                r = 0;
                                goto out;
                        }
@@ -8470,6 +8871,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                        kvm_vcpu_update_apicv(vcpu);
                if (kvm_check_request(KVM_REQ_APF_READY, vcpu))
                        kvm_check_async_pf_completion(vcpu);
+               if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu))
+                       kvm_x86_ops.msr_filter_changed(vcpu);
        }
 
        if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
@@ -8545,7 +8948,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                kvm_x86_ops.request_immediate_exit(vcpu);
        }
 
-       trace_kvm_entry(vcpu->vcpu_id);
+       trace_kvm_entry(vcpu);
 
        fpregs_assert_state_consistent();
        if (test_thread_flag(TIF_NEED_FPU_LOAD))
@@ -9146,7 +9549,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
 }
 EXPORT_SYMBOL_GPL(kvm_task_switch);
 
-static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 {
        if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) {
                /*
@@ -9154,31 +9557,29 @@ static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
                 * 64-bit mode (though maybe in a 32-bit code segment).
                 * CR4.PAE and EFER.LMA must be set.
                 */
-               if (!(sregs->cr4 & X86_CR4_PAE)
-                   || !(sregs->efer & EFER_LMA))
-                       return -EINVAL;
+               if (!(sregs->cr4 & X86_CR4_PAE) || !(sregs->efer & EFER_LMA))
+                       return false;
        } else {
                /*
                 * Not in 64-bit mode: EFER.LMA is clear and the code
                 * segment cannot be 64-bit.
                 */
                if (sregs->efer & EFER_LMA || sregs->cs.l)
-                       return -EINVAL;
+                       return false;
        }
 
-       return kvm_valid_cr4(vcpu, sregs->cr4);
+       return kvm_is_valid_cr4(vcpu, sregs->cr4);
 }
 
 static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 {
        struct msr_data apic_base_msr;
        int mmu_reset_needed = 0;
-       int cpuid_update_needed = 0;
        int pending_vec, max_bits, idx;
        struct desc_ptr dt;
        int ret = -EINVAL;
 
-       if (kvm_valid_sregs(vcpu, sregs))
+       if (!kvm_is_valid_sregs(vcpu, sregs))
                goto out;
 
        apic_base_msr.data = sregs->apic_base;
@@ -9208,11 +9609,7 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
        vcpu->arch.cr0 = sregs->cr0;
 
        mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
-       cpuid_update_needed |= ((kvm_read_cr4(vcpu) ^ sregs->cr4) &
-                               (X86_CR4_OSXSAVE | X86_CR4_PKE));
        kvm_x86_ops.set_cr4(vcpu, sregs->cr4);
-       if (cpuid_update_needed)
-               kvm_update_cpuid_runtime(vcpu);
 
        idx = srcu_read_lock(&vcpu->kvm->srcu);
        if (is_pae_paging(vcpu)) {
@@ -9274,6 +9671,9 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
        unsigned long rflags;
        int i, r;
 
+       if (vcpu->arch.guest_state_protected)
+               return -EINVAL;
+
        vcpu_load(vcpu);
 
        if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
@@ -9559,7 +9959,6 @@ fail_mmu_destroy:
 
 void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 {
-       struct msr_data msr;
        struct kvm *kvm = vcpu->kvm;
 
        kvm_hv_vcpu_postcreate(vcpu);
@@ -9567,10 +9966,7 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
        if (mutex_lock_killable(&vcpu->mutex))
                return;
        vcpu_load(vcpu);
-       msr.data = 0x0;
-       msr.index = MSR_IA32_TSC;
-       msr.host_initiated = true;
-       kvm_write_tsc(vcpu, &msr);
+       kvm_synchronize_tsc(vcpu, 0);
        vcpu_put(vcpu);
 
        /* poll control enabled by default */
@@ -9607,6 +10003,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
        kvm_mmu_destroy(vcpu);
        srcu_read_unlock(&vcpu->kvm->srcu, idx);
        free_page((unsigned long)vcpu->arch.pio_data);
+       kvfree(vcpu->arch.cpuid_entries);
        if (!lapic_in_kernel(vcpu))
                static_key_slow_dec(&kvm_no_apic_vcpu);
 }
@@ -9704,7 +10101,7 @@ int kvm_arch_hardware_enable(void)
        u64 max_tsc = 0;
        bool stable, backwards_tsc = false;
 
-       kvm_shared_msr_cpu_online();
+       kvm_user_return_msr_cpu_online();
        ret = kvm_x86_ops.hardware_enable();
        if (ret != 0)
                return ret;
@@ -9963,7 +10360,32 @@ void kvm_arch_sync_events(struct kvm *kvm)
        kvm_free_pit(kvm);
 }
 
-int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
+#define  ERR_PTR_USR(e)  ((void __user *)ERR_PTR(e))
+
+/**
+ * __x86_set_memory_region: Setup KVM internal memory slot
+ *
+ * @kvm: the kvm pointer to the VM.
+ * @id: the slot ID to setup.
+ * @gpa: the GPA to install the slot (unused when @size == 0).
+ * @size: the size of the slot. Set to zero to uninstall a slot.
+ *
+ * This function helps to setup a KVM internal memory slot.  Specify
+ * @size > 0 to install a new slot, while @size == 0 to uninstall a
+ * slot.  The return code can be one of the following:
+ *
+ *   HVA:           on success (uninstall will return a bogus HVA)
+ *   -errno:        on error
+ *
+ * The caller should always use IS_ERR() to check the return value
+ * before use.  Note, the KVM internal memory slots are guaranteed to
+ * remain valid and unchanged until the VM is destroyed, i.e., the
+ * GPA->HVA translation will not change.  However, the HVA is a user
+ * address, i.e. its accessibility is not guaranteed, and must be
+ * accessed via __copy_{to,from}_user().
+ */
+void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
+                                     u32 size)
 {
        int i, r;
        unsigned long hva, old_npages;
@@ -9972,12 +10394,12 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
 
        /* Called with kvm->slots_lock held.  */
        if (WARN_ON(id >= KVM_MEM_SLOTS_NUM))
-               return -EINVAL;
+               return ERR_PTR_USR(-EINVAL);
 
        slot = id_to_memslot(slots, id);
        if (size) {
                if (slot && slot->npages)
-                       return -EEXIST;
+                       return ERR_PTR_USR(-EEXIST);
 
                /*
                 * MAP_SHARED to prevent internal slot pages from being moved
@@ -9986,7 +10408,7 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
                hva = vm_mmap(NULL, 0, size, PROT_READ | PROT_WRITE,
                              MAP_SHARED | MAP_ANONYMOUS, 0);
                if (IS_ERR((void *)hva))
-                       return PTR_ERR((void *)hva);
+                       return (void __user *)hva;
        } else {
                if (!slot || !slot->npages)
                        return 0;
@@ -10005,13 +10427,13 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
                m.memory_size = size;
                r = __kvm_set_memory_region(kvm, &m);
                if (r < 0)
-                       return r;
+                       return ERR_PTR_USR(r);
        }
 
        if (!size)
                vm_munmap(hva, old_npages * PAGE_SIZE);
 
-       return 0;
+       return (void __user *)hva;
 }
 EXPORT_SYMBOL_GPL(__x86_set_memory_region);
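With the return type changed from int to the slot's userspace address, callers receive the HVA directly and, as the comment above says, must check it with IS_ERR() before use. A minimal sketch of the calling pattern (the function name, slot id, GPA and size are placeholders):

    #include <linux/kvm_host.h>

    static int vendor_alloc_internal_slot(struct kvm *kvm, int id, gpa_t gpa, u32 size)
    {
            void __user *hva;

            mutex_lock(&kvm->slots_lock);
            hva = __x86_set_memory_region(kvm, id, gpa, size);
            mutex_unlock(&kvm->slots_lock);
            if (IS_ERR(hva))
                    return PTR_ERR(hva);

            /* hva is a user address: touch it only via __copy_{to,from}_user(). */
            return 0;
    }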
 
@@ -10022,6 +10444,8 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm)
 
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
+       u32 i;
+
        if (current->mm == kvm->mm) {
                /*
                 * Free memory regions allocated on behalf of userspace,
@@ -10038,6 +10462,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
        }
        if (kvm_x86_ops.vm_destroy)
                kvm_x86_ops.vm_destroy(kvm);
+       for (i = 0; i < kvm->arch.msr_filter.count; i++)
+               kfree(kvm->arch.msr_filter.ranges[i].bitmap);
        kvm_pic_destroy(kvm);
        kvm_ioapic_destroy(kvm);
        kvm_free_vcpus(kvm);
@@ -10364,6 +10790,10 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
 
 unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
 {
+       /* Can't read the RIP when guest state is protected; just return 0. */
+       if (vcpu->arch.guest_state_protected)
+               return 0;
+
        if (is_64_bit_mode(vcpu))
                return kvm_rip_read(vcpu);
        return (u32)(get_segment_base(vcpu, VCPU_SREG_CS) +
@@ -10768,6 +11198,284 @@ void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_c
 }
 EXPORT_SYMBOL_GPL(kvm_fixup_and_inject_pf_error);
 
+/*
+ * Handle a kvm_read/write_guest_virt*() result: either inject #PF or set up a
+ * KVM_EXIT_INTERNAL_ERROR exit for cases not currently handled by KVM.  Returns
+ * 1 if the guest can be resumed (#PF injected), 0 if an exit to userspace is
+ * needed.
+ */
+int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
+                             struct x86_exception *e)
+{
+       if (r == X86EMUL_PROPAGATE_FAULT) {
+               kvm_inject_emulated_page_fault(vcpu, e);
+               return 1;
+       }
+
+       /*
+        * If kvm_read/write_guest_virt*() failed with X86EMUL_IO_NEEDED while
+        * handling a VMX instruction, KVM could have handled the request
+        * correctly by exiting to userspace and performing the I/O.  However,
+        * there doesn't seem to be a real use case for such requests, so just
+        * return KVM_EXIT_INTERNAL_ERROR for now.
+        */
+       vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+       vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+       vcpu->run->internal.ndata = 0;
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_handle_memory_failure);
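
A hedged sketch of the intended calling pattern (the wrapper function below is
illustrative, not part of this patch): a failed guest-virtual access is passed
straight to kvm_handle_memory_failure(), and its return value becomes the
exit-handler result.

static int example_read_guest_operand(struct kvm_vcpu *vcpu, gva_t gva, u64 *val)
{
	struct x86_exception e;
	int r;

	r = kvm_read_guest_virt(vcpu, gva, val, sizeof(*val), &e);
	if (r != X86EMUL_CONTINUE)
		return kvm_handle_memory_failure(vcpu, r, &e);

	return 1;	/* Success: resume the guest. */
}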
+
+int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
+{
+       bool pcid_enabled;
+       struct x86_exception e;
+       unsigned i;
+       unsigned long roots_to_free = 0;
+       struct {
+               u64 pcid;
+               u64 gla;
+       } operand;
+       int r;
+
+       r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
+       if (r != X86EMUL_CONTINUE)
+               return kvm_handle_memory_failure(vcpu, r, &e);
+
+       if (operand.pcid >> 12 != 0) {
+               kvm_inject_gp(vcpu, 0);
+               return 1;
+       }
+
+       pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
+
+       switch (type) {
+       case INVPCID_TYPE_INDIV_ADDR:
+               if ((!pcid_enabled && (operand.pcid != 0)) ||
+                   is_noncanonical_address(operand.gla, vcpu)) {
+                       kvm_inject_gp(vcpu, 0);
+                       return 1;
+               }
+               kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
+               return kvm_skip_emulated_instruction(vcpu);
+
+       case INVPCID_TYPE_SINGLE_CTXT:
+               if (!pcid_enabled && (operand.pcid != 0)) {
+                       kvm_inject_gp(vcpu, 0);
+                       return 1;
+               }
+
+               if (kvm_get_active_pcid(vcpu) == operand.pcid) {
+                       kvm_mmu_sync_roots(vcpu);
+                       kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+               }
+
+               for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+                       if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd)
+                           == operand.pcid)
+                               roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+
+               kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
+               /*
+                * If neither the current cr3 nor any of the prev_roots use the
+                * given PCID, then nothing needs to be done here because a
+                * resync will happen anyway before switching to any other CR3.
+                */
+
+               return kvm_skip_emulated_instruction(vcpu);
+
+       case INVPCID_TYPE_ALL_NON_GLOBAL:
+               /*
+                * Currently, KVM doesn't mark global entries in the shadow
+                * page tables, so a non-global flush just degenerates to a
+                * global flush. If needed, we could optimize this later by
+                * keeping track of global entries in shadow page tables.
+                */
+
+               fallthrough;
+       case INVPCID_TYPE_ALL_INCL_GLOBAL:
+               kvm_mmu_unload(vcpu);
+               return kvm_skip_emulated_instruction(vcpu);
+
+       default:
+               BUG(); /* The callers have already checked that type <= 3 */
+       }
+}
+EXPORT_SYMBOL_GPL(kvm_handle_invpcid);
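
The BUG() in the default case assumes the vendor exit handlers validate the
INVPCID type before calling in. A simplified, hypothetical caller sketch (the
real VMX/SVM handlers also decode @operand_gva from the exit information):

static int example_handle_invpcid_exit(struct kvm_vcpu *vcpu,
				       unsigned long type, gva_t operand_gva)
{
	/* INVPCID types above 3 are reserved and #GP per the SDM. */
	if (type > 3) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	}

	return kvm_handle_invpcid(vcpu, type, operand_gva);
}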
+
+static int complete_sev_es_emulated_mmio(struct kvm_vcpu *vcpu)
+{
+       struct kvm_run *run = vcpu->run;
+       struct kvm_mmio_fragment *frag;
+       unsigned int len;
+
+       BUG_ON(!vcpu->mmio_needed);
+
+       /* Complete previous fragment */
+       frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
+       len = min(8u, frag->len);
+       if (!vcpu->mmio_is_write)
+               memcpy(frag->data, run->mmio.data, len);
+
+       if (frag->len <= 8) {
+               /* Switch to the next fragment. */
+               frag++;
+               vcpu->mmio_cur_fragment++;
+       } else {
+               /* Go forward to the next mmio piece. */
+               frag->data += len;
+               frag->gpa += len;
+               frag->len -= len;
+       }
+
+       if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
+               vcpu->mmio_needed = 0;
+
+               /*
+                * VMG change: at this point we're always done, the RIP has
+                * already been advanced.
+                */
+               return 1;
+       }
+
+       /* More MMIO is needed */
+       run->mmio.phys_addr = frag->gpa;
+       run->mmio.len = min(8u, frag->len);
+       run->mmio.is_write = vcpu->mmio_is_write;
+       if (run->mmio.is_write)
+               memcpy(run->mmio.data, frag->data, min(8u, frag->len));
+       run->exit_reason = KVM_EXIT_MMIO;
+
+       vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;
+
+       return 0;
+}
+
+int kvm_sev_es_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes,
+                         void *data)
+{
+       int handled;
+       struct kvm_mmio_fragment *frag;
+
+       if (!data)
+               return -EINVAL;
+
+       handled = write_emultor.read_write_mmio(vcpu, gpa, bytes, data);
+       if (handled == bytes)
+               return 1;
+
+       bytes -= handled;
+       gpa += handled;
+       data += handled;
+
+       /* TODO: Check if we need to increment the number of frags */
+       frag = vcpu->mmio_fragments;
+       vcpu->mmio_nr_fragments = 1;
+       frag->len = bytes;
+       frag->gpa = gpa;
+       frag->data = data;
+
+       vcpu->mmio_needed = 1;
+       vcpu->mmio_cur_fragment = 0;
+
+       vcpu->run->mmio.phys_addr = gpa;
+       vcpu->run->mmio.len = min(8u, frag->len);
+       vcpu->run->mmio.is_write = 1;
+       memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
+       vcpu->run->exit_reason = KVM_EXIT_MMIO;
+
+       vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_write);
+
+int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes,
+                        void *data)
+{
+       int handled;
+       struct kvm_mmio_fragment *frag;
+
+       if (!data)
+               return -EINVAL;
+
+       handled = read_emultor.read_write_mmio(vcpu, gpa, bytes, data);
+       if (handled == bytes)
+               return 1;
+
+       bytes -= handled;
+       gpa += handled;
+       data += handled;
+
+       /* TODO: Check if we need to increment the number of frags */
+       frag = vcpu->mmio_fragments;
+       vcpu->mmio_nr_fragments = 1;
+       frag->len = bytes;
+       frag->gpa = gpa;
+       frag->data = data;
+
+       vcpu->mmio_needed = 1;
+       vcpu->mmio_cur_fragment = 0;
+
+       vcpu->run->mmio.phys_addr = gpa;
+       vcpu->run->mmio.len = min(8u, frag->len);
+       vcpu->run->mmio.is_write = 0;
+       vcpu->run->exit_reason = KVM_EXIT_MMIO;
+
+       vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_read);
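
A hedged sketch of how these entry points are expected to be driven from the
SEV-ES VMGEXIT path (the wrapper and its parameters are placeholders; in
practice the GPA, length and buffer would come from the GHCB): a return of 1
means the access was handled in the kernel, 0 means a KVM_EXIT_MMIO round trip
to userspace is pending and will be finished by complete_sev_es_emulated_mmio().

static int example_vmgexit_mmio(struct kvm_vcpu *vcpu, bool write,
				gpa_t gpa, unsigned int bytes, void *buf)
{
	return write ? kvm_sev_es_mmio_write(vcpu, gpa, bytes, buf)
		     : kvm_sev_es_mmio_read(vcpu, gpa, bytes, buf);
}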
+
+static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu)
+{
+       memcpy(vcpu->arch.guest_ins_data, vcpu->arch.pio_data,
+              vcpu->arch.pio.count * vcpu->arch.pio.size);
+       vcpu->arch.pio.count = 0;
+
+       return 1;
+}
+
+static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
+                          unsigned int port, void *data,  unsigned int count)
+{
+       int ret;
+
+       ret = emulator_pio_out_emulated(vcpu->arch.emulate_ctxt, size, port,
+                                       data, count);
+       if (ret)
+               return ret;
+
+       vcpu->arch.pio.count = 0;
+
+       return 0;
+}
+
+static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size,
+                         unsigned int port, void *data, unsigned int count)
+{
+       int ret;
+
+       ret = emulator_pio_in_emulated(vcpu->arch.emulate_ctxt, size, port,
+                                      data, count);
+       if (ret) {
+               vcpu->arch.pio.count = 0;
+       } else {
+               vcpu->arch.guest_ins_data = data;
+               vcpu->arch.complete_userspace_io = complete_sev_es_emulated_ins;
+       }
+
+       return 0;
+}
+
+int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
+                        unsigned int port, void *data,  unsigned int count,
+                        int in)
+{
+       return in ? kvm_sev_es_ins(vcpu, size, port, data, count)
+                 : kvm_sev_es_outs(vcpu, size, port, data, count);
+}
+EXPORT_SYMBOL_GPL(kvm_sev_es_string_io);
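
Finally, a hedged sketch of the string I/O entry point (the wrapper, buffer and
count names are placeholders): for OUT the data is consumed immediately, while
for IN that needs a userspace exit the buffer is filled by
complete_sev_es_emulated_ins() once userspace completes the KVM_EXIT_IO.

static int example_vmgexit_ioio(struct kvm_vcpu *vcpu, unsigned int size,
				unsigned int port, int in,
				void *shared_buf, unsigned int count)
{
	return kvm_sev_es_string_io(vcpu, size, port, shared_buf, count, in);
}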
+
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
@@ -10790,3 +11498,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit);